xref: /illumos-gate/usr/src/uts/intel/io/mc-amd/mcamd_drv.c (revision 5279807d7e1818eac6f90ac640b7a89cdb37522d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/conf.h>
28 #include <sys/ddi.h>
29 #include <sys/ddifm.h>
30 #include <sys/sunddi.h>
31 #include <sys/sunndi.h>
32 #include <sys/stat.h>
33 #include <sys/modctl.h>
34 #include <sys/types.h>
35 #include <sys/cpuvar.h>
36 #include <sys/cmn_err.h>
37 #include <sys/kmem.h>
38 #include <sys/cred.h>
39 #include <sys/ksynch.h>
40 #include <sys/rwlock.h>
41 #include <sys/pghw.h>
42 #include <sys/open.h>
43 #include <sys/policy.h>
44 #include <sys/x86_archext.h>
45 #include <sys/cpu_module.h>
46 #include <qsort.h>
47 #include <sys/pci_cfgspace.h>
48 #include <sys/mc.h>
49 #include <sys/mc_amd.h>
50 #include <mcamd.h>
51 #include <mcamd_dimmcfg.h>
52 #include <mcamd_pcicfg.h>
53 #include <mcamd_api.h>
54 #include <sys/fm/cpu/AMD.h>
55 
/*
 * Set to prevent mc-amd from attaching.
 */
int mc_no_attach = 0;

/*
 * Of the 754/939/940 packages, only socket 940 supports quadrank registered
 * dimms.  Unfortunately, no memory-controller register indicates the
 * presence of quadrank dimm support or presence (i.e., in terms of number
 * of slots per cpu, and chip-select lines per slot).  The following may be
 * set in /etc/system to indicate the presence of quadrank support on a
 * motherboard.
 *
 * There is no need to set this for F(1207) and S1g1.
 */
int mc_quadranksupport = 0;

/*
 * mc_list heads the list of mc_t structures (linked through mc_next) that
 * mc_lookup_by_chipid walks; that walk and the snapshot routines below
 * assert that mc_lock is held.
 * NOTE(review): mc_last and mc_hold_attached are not referenced in the
 * portion of the file reviewed here - presumably the list tail and an
 * attach/detach policy tunable; confirm against the attach code.
 */
mc_t *mc_list, *mc_last;
krwlock_t mc_lock;
int mc_hold_attached = 1;

/*
 * Classic min/max macros.  Each argument may be evaluated twice, so do not
 * pass expressions with side effects.
 */
#define	MAX(m, n) ((m) >= (n) ? (m) : (n))
#define	MIN(m, n) ((m) <= (n) ? (m) : (n))

/*
 * The following tuneable is used to determine the DRAM scrubbing rate.
 * The values range from 0x00-0x16 as described in the BKDG.  Zero
 * disables DRAM scrubbing.  Values above zero indicate rates in descending
 * order.
 *
 * The default value below is used on several Sun systems.  In the future
 * this code should assign values dynamically based on memory sizing.
 */
uint32_t mc_scrub_rate_dram = 0xd;	/* 64B every 163.8 us; 1GB per 45 min */

/*
 * Policy selecting how the scrub rate tunable(s) above are combined with
 * whatever value the system was already programmed with.
 */
enum {
	MC_SCRUB_BIOSDEFAULT,	/* retain system default value */
	MC_SCRUB_FIXED,		/* assign mc_scrub_rate_* values */
	MC_SCRUB_MAX		/* assign max of system and tunables */
} mc_scrub_policy = MC_SCRUB_MAX;
95 
96 static void
97 mc_snapshot_destroy(mc_t *mc)
98 {
99 	ASSERT(RW_LOCK_HELD(&mc_lock));
100 
101 	if (mc->mc_snapshot == NULL)
102 		return;
103 
104 	kmem_free(mc->mc_snapshot, mc->mc_snapshotsz);
105 	mc->mc_snapshot = NULL;
106 	mc->mc_snapshotsz = 0;
107 	mc->mc_snapshotgen++;
108 }
109 
110 static int
111 mc_snapshot_update(mc_t *mc)
112 {
113 	ASSERT(RW_LOCK_HELD(&mc_lock));
114 
115 	if (mc->mc_snapshot != NULL)
116 		return (0);
117 
118 	if (nvlist_pack(mc->mc_nvl, &mc->mc_snapshot, &mc->mc_snapshotsz,
119 	    NV_ENCODE_XDR, KM_SLEEP) != 0)
120 		return (-1);
121 
122 	return (0);
123 }
124 
125 static mc_t *
126 mc_lookup_by_chipid(int chipid)
127 {
128 	mc_t *mc;
129 
130 	ASSERT(RW_LOCK_HELD(&mc_lock));
131 
132 	for (mc = mc_list; mc != NULL; mc = mc->mc_next) {
133 		if (mc->mc_props.mcp_num  == chipid)
134 			return (mc);
135 	}
136 
137 	return (NULL);
138 }
139 
140 /*
141  * Read config register pairs into the two arrays provided on the given
142  * handle and at offsets as follows:
143  *
144  *	Index	Array r1 offset			Array r2 offset
145  *	0	r1addr				r2addr
146  *	1	r1addr + incr			r2addr + incr
147  *	2	r1addr + 2 * incr		r2addr + 2 * incr
148  *	...
149  *	n - 1	r1addr + (n - 1) * incr		r2addr + (n - 1) * incr
150  *
151  * The number of registers to read into the r1 array is r1n; the number
152  * for the r2 array is r2n.
153  */
154 static void
155 mc_prop_read_pair(mc_pcicfg_hdl_t cfghdl, uint32_t *r1, off_t r1addr,
156     int r1n, uint32_t *r2, off_t r2addr, int r2n, off_t incr)
157 {
158 	int i;
159 
160 	for (i = 0; i < MAX(r1n, r2n); i++, r1addr += incr, r2addr += incr) {
161 		if (i < r1n)
162 			r1[i] = mc_pcicfg_get32(cfghdl, r1addr);
163 		if (i < r2n)
164 			r2[i] = mc_pcicfg_get32(cfghdl, r2addr);
165 	}
166 }
167 
168 /*ARGSUSED*/
169 static int
170 mc_nvl_add_socket_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
171 {
172 	uint32_t skt = *((uint32_t *)arg1);
173 	cmi_hdl_t *hdlp = (cmi_hdl_t *)arg2;
174 
175 	if (cmi_hdl_getsockettype(whdl) == skt) {
176 		cmi_hdl_hold(whdl);	/* short-term hold */
177 		*hdlp = whdl;
178 		return (CMI_HDL_WALK_DONE);
179 	} else {
180 		return (CMI_HDL_WALK_NEXT);
181 	}
182 }
183 
184 static void
185 mc_nvl_add_socket(nvlist_t *nvl, mc_t *mc)
186 {
187 	cmi_hdl_t hdl = NULL;
188 	const char *s;
189 
190 	cmi_hdl_walk(mc_nvl_add_socket_cb, (void *)&mc->mc_socket,
191 	    (void *)&hdl, NULL);
192 	if (hdl == NULL)
193 		s = "Unknown";  /* no cpu for this chipid found */
194 	else
195 		s = cmi_hdl_getsocketstr(hdl);
196 
197 	(void) nvlist_add_string(nvl, "socket", s);
198 
199 	if (hdl != NULL)
200 		cmi_hdl_rele(hdl);
201 }
202 
203 static uint32_t
204 mc_ecc_enabled(mc_t *mc)
205 {
206 	uint32_t rev = mc->mc_props.mcp_rev;
207 	union mcreg_nbcfg nbcfg;
208 
209 	MCREG_VAL32(&nbcfg) = mc->mc_cfgregs.mcr_nbcfg;
210 
211 	return (MC_REV_MATCH(rev, MC_F_REVS_BCDE) ?
212 	    MCREG_FIELD_F_preF(&nbcfg, EccEn) :
213 	    MCREG_FIELD_F_revFG(&nbcfg, EccEn));
214 }
215 
216 static uint32_t
217 mc_ck_enabled(mc_t *mc)
218 {
219 	uint32_t rev = mc->mc_props.mcp_rev;
220 	union mcreg_nbcfg nbcfg;
221 
222 	MCREG_VAL32(&nbcfg) = mc->mc_cfgregs.mcr_nbcfg;
223 
224 	return (MC_REV_MATCH(rev, MC_F_REVS_BCDE) ?
225 	    MCREG_FIELD_F_preF(&nbcfg, ChipKillEccEn) :
226 	    MCREG_FIELD_F_revFG(&nbcfg, ChipKillEccEn));
227 }
228 
229 static void
230 mc_nvl_add_ecctype(nvlist_t *nvl, mc_t *mc)
231 {
232 	(void) nvlist_add_string(nvl, "ecc-type", mc_ecc_enabled(mc) ?
233 	    (mc_ck_enabled(mc) ? "ChipKill 128/16" : "Normal 64/8") : "None");
234 }
235 
236 static void
237 mc_nvl_add_prop(nvlist_t *nvl, void *node, mcamd_propcode_t code, int reqval)
238 {
239 	int valfound;
240 	uint64_t value;
241 	const char *name = mcamd_get_propname(code);
242 
243 	valfound = mcamd_get_numprop(NULL, (mcamd_node_t *)node, code, &value);
244 
245 	ASSERT(name != NULL && valfound);
246 	if (name != NULL && valfound && (!reqval || value != MC_INVALNUM))
247 		(void) nvlist_add_uint64(nvl, name, value);
248 }
249 
250 static void
251 mc_nvl_add_cslist(nvlist_t *mcnvl, mc_t *mc)
252 {
253 	mc_cs_t *mccs = mc->mc_cslist;
254 	nvlist_t *cslist[MC_CHIP_NCS];
255 	int nelem, i;
256 
257 	for (nelem = 0; mccs != NULL; mccs = mccs->mccs_next, nelem++) {
258 		nvlist_t **csp = &cslist[nelem];
259 		char csname[MCDCFG_CSNAMELEN];
260 
261 		(void) nvlist_alloc(csp, NV_UNIQUE_NAME, KM_SLEEP);
262 		mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_NUM, 0);
263 		mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_BASE_ADDR, 0);
264 		mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_MASK, 0);
265 		mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_SIZE, 0);
266 
267 		/*
268 		 * It is possible for an mc_cs_t not to have associated
269 		 * DIMM info if mcdcfg_lookup failed.
270 		 */
271 		if (mccs->mccs_csl[0] != NULL) {
272 			mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_CSDIMM1, 1);
273 			mcdcfg_csname(mc->mc_socket, mccs->mccs_csl[0], csname,
274 			    sizeof (csname));
275 			(void) nvlist_add_string(*csp, "dimm1-csname", csname);
276 		}
277 
278 		if (mccs->mccs_csl[1] != NULL) {
279 			mc_nvl_add_prop(*csp, mccs, MCAMD_PROP_CSDIMM2, 1);
280 			mcdcfg_csname(mc->mc_socket, mccs->mccs_csl[1], csname,
281 			    sizeof (csname));
282 			(void) nvlist_add_string(*csp, "dimm2-csname", csname);
283 		}
284 	}
285 
286 	/* Add cslist nvlist array even if zero members */
287 	(void) nvlist_add_nvlist_array(mcnvl, "cslist", cslist, nelem);
288 	for (i = 0; i < nelem; i++)
289 		nvlist_free(cslist[i]);
290 }
291 
292 static void
293 mc_nvl_add_dimmlist(nvlist_t *mcnvl, mc_t *mc)
294 {
295 	nvlist_t *dimmlist[MC_CHIP_NDIMM];
296 	mc_dimm_t *mcd;
297 	int nelem, i;
298 
299 	for (nelem = 0, mcd = mc->mc_dimmlist; mcd != NULL;
300 	    mcd = mcd->mcd_next, nelem++) {
301 		nvlist_t **dimmp = &dimmlist[nelem];
302 		uint64_t csnums[MC_CHIP_DIMMRANKMAX];
303 		char csname[4][MCDCFG_CSNAMELEN];
304 		char *csnamep[4];
305 		int ncs = 0;
306 
307 		(void) nvlist_alloc(dimmp, NV_UNIQUE_NAME, KM_SLEEP);
308 
309 		mc_nvl_add_prop(*dimmp, mcd, MCAMD_PROP_NUM, 1);
310 		mc_nvl_add_prop(*dimmp, mcd, MCAMD_PROP_SIZE, 1);
311 
312 		for (i = 0; i < MC_CHIP_DIMMRANKMAX; i++) {
313 			if (mcd->mcd_cs[i] != NULL) {
314 				csnums[ncs] =
315 				    mcd->mcd_cs[i]->mccs_props.csp_num;
316 				mcdcfg_csname(mc->mc_socket, mcd->mcd_csl[i],
317 				    csname[ncs], MCDCFG_CSNAMELEN);
318 				csnamep[ncs] = csname[ncs];
319 				ncs++;
320 			}
321 		}
322 
323 		(void) nvlist_add_uint64_array(*dimmp, "csnums", csnums, ncs);
324 		(void) nvlist_add_string_array(*dimmp, "csnames", csnamep, ncs);
325 	}
326 
327 	/* Add dimmlist nvlist array even if zero members */
328 	(void) nvlist_add_nvlist_array(mcnvl, "dimmlist", dimmlist, nelem);
329 	for (i = 0; i < nelem; i++)
330 		nvlist_free(dimmlist[i]);
331 }
332 
333 static void
334 mc_nvl_add_htconfig(nvlist_t *mcnvl, mc_t *mc)
335 {
336 	mc_cfgregs_t *mcr = &mc->mc_cfgregs;
337 	union mcreg_htroute *htrp = (union mcreg_htroute *)&mcr->mcr_htroute[0];
338 	union mcreg_nodeid *nip = (union mcreg_nodeid *)&mcr->mcr_htnodeid;
339 	union mcreg_unitid *uip = (union mcreg_unitid *)&mcr->mcr_htunitid;
340 	int ndcnt = HT_COHERENTNODES(nip);
341 	uint32_t BCRte[MC_CHIP_MAXNODES];
342 	uint32_t RPRte[MC_CHIP_MAXNODES];
343 	uint32_t RQRte[MC_CHIP_MAXNODES];
344 	nvlist_t *nvl;
345 	int i;
346 
347 	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
348 
349 	(void) nvlist_add_uint32(nvl, "NodeId", MCREG_FIELD_CMN(nip, NodeId));
350 	(void) nvlist_add_uint32(nvl, "CoherentNodes", HT_COHERENTNODES(nip));
351 	(void) nvlist_add_uint32(nvl, "SbNode", MCREG_FIELD_CMN(nip, SbNode));
352 	(void) nvlist_add_uint32(nvl, "LkNode", MCREG_FIELD_CMN(nip, LkNode));
353 	(void) nvlist_add_uint32(nvl, "SystemCoreCount",
354 	    HT_SYSTEMCORECOUNT(nip));
355 
356 	(void) nvlist_add_uint32(nvl, "C0Unit", MCREG_FIELD_CMN(uip, C0Unit));
357 	(void) nvlist_add_uint32(nvl, "C1Unit", MCREG_FIELD_CMN(uip, C1Unit));
358 	(void) nvlist_add_uint32(nvl, "McUnit", MCREG_FIELD_CMN(uip, McUnit));
359 	(void) nvlist_add_uint32(nvl, "HbUnit", MCREG_FIELD_CMN(uip, HbUnit));
360 	(void) nvlist_add_uint32(nvl, "SbLink", MCREG_FIELD_CMN(uip, SbLink));
361 
362 	if (ndcnt <= MC_CHIP_MAXNODES) {
363 		for (i = 0; i < ndcnt; i++, htrp++) {
364 			BCRte[i] = MCREG_FIELD_CMN(htrp, BCRte);
365 			RPRte[i] = MCREG_FIELD_CMN(htrp, RPRte);
366 			RQRte[i] = MCREG_FIELD_CMN(htrp, RQRte);
367 		}
368 
369 		(void) nvlist_add_uint32_array(nvl, "BroadcastRoutes",
370 		    &BCRte[0], ndcnt);
371 		(void) nvlist_add_uint32_array(nvl, "ResponseRoutes",
372 		    &RPRte[0], ndcnt);
373 		(void) nvlist_add_uint32_array(nvl, "RequestRoutes",
374 		    &RQRte[0], ndcnt);
375 	}
376 
377 	(void) nvlist_add_nvlist(mcnvl, "htconfig", nvl);
378 	nvlist_free(nvl);
379 }
380 
381 static nvlist_t *
382 mc_nvl_create(mc_t *mc)
383 {
384 	nvlist_t *mcnvl;
385 
386 	(void) nvlist_alloc(&mcnvl, NV_UNIQUE_NAME, KM_SLEEP);
387 
388 	/*
389 	 * Since this nvlist is used in populating the topo tree changes
390 	 * made here may propogate through to changed property names etc
391 	 * in the topo tree.  Some properties in the topo tree will be
392 	 * contracted via ARC, so be careful what you change here.
393 	 */
394 	(void) nvlist_add_uint8(mcnvl, MC_NVLIST_VERSTR, MC_NVLIST_VERS1);
395 
396 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_NUM, 0);
397 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_REV, 0);
398 	(void) nvlist_add_string(mcnvl, "revname", mc->mc_revname);
399 	mc_nvl_add_socket(mcnvl, mc);
400 	mc_nvl_add_ecctype(mcnvl, mc);
401 
402 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_BASE_ADDR, 0);
403 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_LIM_ADDR, 0);
404 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_ILEN, 0);
405 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_ILSEL, 0);
406 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_CSINTLVFCTR, 0);
407 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_DRAMHOLE_SIZE, 0);
408 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_ACCESS_WIDTH, 0);
409 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_CSBANKMAPREG, 0);
410 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_BANKSWZL, 0);
411 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_MOD64MUX, 0);
412 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_SPARECS, 1);
413 	mc_nvl_add_prop(mcnvl, mc, MCAMD_PROP_BADCS, 1);
414 
415 	mc_nvl_add_cslist(mcnvl, mc);
416 	mc_nvl_add_dimmlist(mcnvl, mc);
417 	mc_nvl_add_htconfig(mcnvl, mc);
418 
419 	return (mcnvl);
420 }
421 
422 /*
423  * Link a dimm to its associated chip-selects and chip-select lines.
424  * Total the size of all ranks of this dimm.
425  */
426 static void
427 mc_dimm_csadd(mc_t *mc, mc_dimm_t *mcd, mc_cs_t *mccs, const mcdcfg_csl_t *csl)
428 {
429 	int factor = (mc->mc_props.mcp_accwidth == 128) ? 2 : 1;
430 	uint64_t sz = 0;
431 	int i;
432 
433 	/* Skip to first unused rank slot */
434 	for (i = 0; i < MC_CHIP_DIMMRANKMAX; i++) {
435 		if (mcd->mcd_cs[i] == NULL) {
436 			mcd->mcd_cs[i] = mccs;
437 			mcd->mcd_csl[i] = csl;
438 			sz += mccs->mccs_props.csp_size / factor;
439 			break;
440 		} else {
441 			sz += mcd->mcd_cs[i]->mccs_props.csp_size / factor;
442 		}
443 	}
444 
445 	ASSERT(i != MC_CHIP_DIMMRANKMAX);
446 
447 	mcd->mcd_size = sz;
448 }
449 
450 /*
451  * Create a dimm structure and call to link it to its associated chip-selects.
452  */
453 static mc_dimm_t *
454 mc_dimm_create(mc_t *mc, uint_t num)
455 {
456 	mc_dimm_t *mcd = kmem_zalloc(sizeof (mc_dimm_t), KM_SLEEP);
457 
458 	mcd->mcd_hdr.mch_type = MC_NT_DIMM;
459 	mcd->mcd_mc = mc;
460 	mcd->mcd_num = num;
461 
462 	return (mcd);
463 }
464 
465 /*
466  * The chip-select structure includes an array of dimms associated with
467  * that chip-select.  This function fills that array, and also builds
468  * the list of all dimms on this memory controller mc_dimmlist.  The
469  * caller has filled a structure with all there is to know about the
470  * associated dimm(s).
471  */
472 static void
473 mc_csdimms_create(mc_t *mc, mc_cs_t *mccs, mcdcfg_rslt_t *rsltp)
474 {
475 	mc_dimm_t *found[MC_CHIP_DIMMPERCS];
476 	mc_dimm_t *mcd;
477 	int nfound = 0;
478 	int i;
479 
480 	/*
481 	 * Has some other chip-select already created this dimm or dimms?
482 	 * If so then link to the dimm(s) from the mccs_dimm array,
483 	 * record their topo numbers in the csp_dimmnums array, and link
484 	 * the dimm(s) to the additional chip-select.
485 	 */
486 	for (mcd = mc->mc_dimmlist; mcd != NULL; mcd = mcd->mcd_next) {
487 		for (i = 0; i < rsltp->ndimm; i++) {
488 			if (mcd->mcd_num == rsltp->dimm[i].toponum)
489 				found[nfound++] = mcd;
490 		}
491 	}
492 	ASSERT(nfound == 0 || nfound == rsltp->ndimm);
493 
494 	for (i = 0; i < rsltp->ndimm; i++) {
495 		if (nfound == 0) {
496 			mcd = mc_dimm_create(mc, rsltp->dimm[i].toponum);
497 			if (mc->mc_dimmlist == NULL)
498 				mc->mc_dimmlist = mcd;
499 			else
500 				mc->mc_dimmlast->mcd_next = mcd;
501 			mc->mc_dimmlast = mcd;
502 		} else {
503 			mcd = found[i];
504 		}
505 
506 		mccs->mccs_dimm[i] = mcd;
507 		mccs->mccs_csl[i] = rsltp->dimm[i].cslp;
508 		mccs->mccs_props.csp_dimmnums[i] = mcd->mcd_num;
509 		mc_dimm_csadd(mc, mcd, mccs, rsltp->dimm[i].cslp);
510 
511 	}
512 
513 	/* The rank number is constant across all constituent dimm(s) */
514 	mccs->mccs_props.csp_dimmrank = rsltp->dimm[0].cslp->csl_rank;
515 }
516 
517 /*
518  * mc_dimmlist_create is called after we have discovered all enabled
519  * (and spare or testfailed on revs F and G) chip-selects on the
520  * given memory controller.  For each chip-select we must derive
521  * the associated dimms, remembering that a chip-select csbase/csmask
522  * pair may be associated with up to 2 chip-select lines (in 128 bit mode)
523  * and that any one dimm may be associated with 1, 2, or 4 chip-selects
524  * depending on whether it is single, dual or quadrank.
525  */
526 static void
527 mc_dimmlist_create(mc_t *mc)
528 {
529 	union mcreg_dramcfg_hi *drcfghip =
530 	    (union mcreg_dramcfg_hi *)(&mc->mc_cfgregs.mcr_dramcfghi);
531 	mc_props_t *mcp = &mc->mc_props;
532 	uint32_t rev = mcp->mcp_rev;
533 	mc_cs_t *mccs;
534 	int r4 = 0, s4 = 0;
535 
536 	/*
537 	 * Are we dealing with quadrank registered dimms?
538 	 *
539 	 * For socket 940 we can't tell and we'll assume we're not.
540 	 * This can be over-ridden by the admin in /etc/system by setting
541 	 * mc_quadranksupport nonzero.  A possible optimisation in systems
542 	 * that export an SMBIOS table would be to count the number of
543 	 * dimm slots per cpu - more than 4 would indicate no quadrank support
544 	 * and 4 or fewer would indicate that if we see any of the upper
545 	 * chip-selects enabled then a quadrank dimm is present.
546 	 *
547 	 * For socket F(1207) we can check a bit in the dram config high reg.
548 	 *
549 	 * Other socket types do not support registered dimms.
550 	 */
551 	if (mc->mc_socket == X86_SOCKET_940)
552 		r4 = mc_quadranksupport != 0;
553 	else if (mc->mc_socket == X86_SOCKET_F1207)
554 		r4 = MCREG_FIELD_F_revFG(drcfghip, FourRankRDimm);
555 
556 	/*
557 	 * Are we dealing with quadrank SO-DIMMs?  These are supported
558 	 * in AM2 and S1g1 packages only, but in all rev F/G cases we
559 	 * can detect their presence via a bit in the dram config high reg.
560 	 */
561 	if (MC_REV_MATCH(rev, MC_F_REVS_FG))
562 		s4 = MCREG_FIELD_F_revFG(drcfghip, FourRankSODimm);
563 
564 	for (mccs = mc->mc_cslist; mccs != NULL; mccs = mccs->mccs_next) {
565 		mcdcfg_rslt_t rslt;
566 
567 		/*
568 		 * If lookup fails we will not create dimm structures for
569 		 * this chip-select.  In the mc_cs_t we will have both
570 		 * csp_dimmnum members set to MC_INVALNUM and patounum
571 		 * code will see from those that we do not have dimm info
572 		 * for this chip-select.
573 		 */
574 		if (mcdcfg_lookup(rev, mcp->mcp_mod64mux, mcp->mcp_accwidth,
575 		    mccs->mccs_props.csp_num, mc->mc_socket,
576 		    r4, s4, &rslt) < 0)
577 			continue;
578 
579 		mc_csdimms_create(mc, mccs, &rslt);
580 	}
581 }
582 
583 static mc_cs_t *
584 mc_cs_create(mc_t *mc, uint_t num, uint64_t base, uint64_t mask, size_t sz,
585     int csbe, int spare, int testfail)
586 {
587 	mc_cs_t *mccs = kmem_zalloc(sizeof (mc_cs_t), KM_SLEEP);
588 	mccs_props_t *csp = &mccs->mccs_props;
589 	int i;
590 
591 	mccs->mccs_hdr.mch_type = MC_NT_CS;
592 	mccs->mccs_mc = mc;
593 	csp->csp_num = num;
594 	csp->csp_base = base;
595 	csp->csp_mask = mask;
596 	csp->csp_size = sz;
597 	csp->csp_csbe = csbe;
598 	csp->csp_spare = spare;
599 	csp->csp_testfail = testfail;
600 
601 	for (i = 0; i < MC_CHIP_DIMMPERCS; i++)
602 		csp->csp_dimmnums[i] = MC_INVALNUM;
603 
604 	if (spare)
605 		mc->mc_props.mcp_sparecs = num;
606 
607 	return (mccs);
608 }
609 
610 /*
611  * For any cs# of this mc marked TestFail generate an ereport with
612  * resource identifying the associated dimm(s).
613  */
614 static void
615 mc_report_testfails(mc_t *mc)
616 {
617 	mc_unum_t unum;
618 	mc_cs_t *mccs;
619 	int i;
620 
621 	for (mccs = mc->mc_cslist; mccs != NULL; mccs = mccs->mccs_next) {
622 		if (mccs->mccs_props.csp_testfail) {
623 			unum.unum_board = 0;
624 			unum.unum_chip = mc->mc_props.mcp_num;
625 			unum.unum_mc = 0;
626 			unum.unum_chan = MC_INVALNUM;
627 			unum.unum_cs = mccs->mccs_props.csp_num;
628 			unum.unum_rank = mccs->mccs_props.csp_dimmrank;
629 			unum.unum_offset = MCAMD_RC_INVALID_OFFSET;
630 			for (i = 0; i < MC_CHIP_DIMMPERCS; i++)
631 				unum.unum_dimms[i] = MC_INVALNUM;
632 
633 			mcamd_ereport_post(mc, FM_EREPORT_CPU_AMD_MC_TESTFAIL,
634 			    &unum,
635 			    FM_EREPORT_PAYLOAD_FLAGS_CPU_AMD_MC_TESTFAIL);
636 		}
637 	}
638 }
639 
640 /*
641  * Function 0 - HyperTransport Technology Configuration
642  */
643 static void
644 mc_mkprops_htcfg(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
645 {
646 	union mcreg_nodeid nodeid;
647 	off_t offset;
648 	int i;
649 
650 	mc->mc_cfgregs.mcr_htnodeid = MCREG_VAL32(&nodeid) =
651 	    mc_pcicfg_get32(cfghdl, MC_HT_REG_NODEID);
652 
653 	mc->mc_cfgregs.mcr_htunitid = mc_pcicfg_get32(cfghdl, MC_HT_REG_UNITID);
654 
655 	for (i = 0, offset = MC_HT_REG_RTBL_NODE_0;
656 	    i < HT_COHERENTNODES(&nodeid);
657 	    i++, offset += MC_HT_REG_RTBL_INCR)
658 		mc->mc_cfgregs.mcr_htroute[i] = mc_pcicfg_get32(cfghdl, offset);
659 }
660 
661 /*
662  * Function 1 Configuration - Address Map (see BKDG 3.4.4 DRAM Address Map)
663  *
664  * Read the Function 1 Address Map for each potential DRAM node.  The Base
665  * Address for a node gives the starting system address mapped at that node,
666  * and the limit gives the last valid address mapped at that node.  Regions for
667  * different nodes should not overlap, unless node-interleaving is enabled.
668  * The base register also indicates the node-interleaving settings (IntlvEn).
669  * The limit register includes IntlvSel which determines which 4K blocks will
670  * be routed to this node and the destination node ID for addresses that fall
671  * within the [base, limit] range - this must match the pair number.
672  */
673 static void
674 mc_mkprops_addrmap(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
675 {
676 	union mcreg_drambase basereg;
677 	union mcreg_dramlimit limreg;
678 	mc_props_t *mcp = &mc->mc_props;
679 	mc_cfgregs_t *mcr = &mc->mc_cfgregs;
680 	union mcreg_dramhole hole;
681 	int nodeid = mc->mc_props.mcp_num;
682 
683 	mcr->mcr_drambase = MCREG_VAL32(&basereg) = mc_pcicfg_get32(cfghdl,
684 	    MC_AM_REG_DRAMBASE_0 + nodeid * MC_AM_REG_DRAM_INCR);
685 
686 	mcr->mcr_dramlimit = MCREG_VAL32(&limreg) = mc_pcicfg_get32(cfghdl,
687 	    MC_AM_REG_DRAMLIM_0 + nodeid * MC_AM_REG_DRAM_INCR);
688 
689 	/*
690 	 * Derive some "cooked" properties for nodes that have a range of
691 	 * physical addresses that are read or write enabled and for which
692 	 * the DstNode matches the node we are attaching.
693 	 */
694 	if (MCREG_FIELD_CMN(&limreg, DRAMLimiti) != 0 &&
695 	    MCREG_FIELD_CMN(&limreg, DstNode) == nodeid &&
696 	    (MCREG_FIELD_CMN(&basereg, WE) || MCREG_FIELD_CMN(&basereg, RE))) {
697 		mcp->mcp_base = MC_DRAMBASE(&basereg);
698 		mcp->mcp_lim = MC_DRAMLIM(&limreg);
699 		mcp->mcp_ilen = MCREG_FIELD_CMN(&basereg, IntlvEn);
700 		mcp->mcp_ilsel = MCREG_FIELD_CMN(&limreg, IntlvSel);
701 	}
702 
703 	/*
704 	 * The Function 1 DRAM Hole Address Register tells us which node(s)
705 	 * own the DRAM space that is hoisted above 4GB, together with the
706 	 * hole base and offset for this node.  This was introduced in
707 	 * revision E.
708 	 */
709 	if (MC_REV_ATLEAST(mc->mc_props.mcp_rev, MC_F_REV_E)) {
710 		mcr->mcr_dramhole = MCREG_VAL32(&hole) =
711 		    mc_pcicfg_get32(cfghdl, MC_AM_REG_HOLEADDR);
712 
713 		if (MCREG_FIELD_CMN(&hole, DramHoleValid))
714 			mcp->mcp_dramhole_size = MC_DRAMHOLE_SIZE(&hole);
715 	}
716 }
717 
718 /*
719  * Read some function 3 parameters via PCI Mechanism 1 accesses (which
720  * will serialize any NB accesses).
721  */
722 static void
723 mc_getmiscctl(mc_t *mc)
724 {
725 	uint32_t rev = mc->mc_props.mcp_rev;
726 	union mcreg_nbcfg nbcfg;
727 	union mcreg_sparectl sparectl;
728 
729 	mc->mc_cfgregs.mcr_nbcfg = MCREG_VAL32(&nbcfg) =
730 	    mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_NBCFG);
731 
732 	if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
733 		mc->mc_cfgregs.mcr_sparectl = MCREG_VAL32(&sparectl) =
734 		    mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
735 		    MC_CTL_REG_SPARECTL);
736 
737 		if (MCREG_FIELD_F_revFG(&sparectl, SwapDone)) {
738 			mc->mc_props.mcp_badcs =
739 			    MCREG_FIELD_F_revFG(&sparectl, BadDramCs);
740 		}
741 	}
742 }
743 
744 static int
745 csbasecmp(mc_cs_t **csapp, mc_cs_t **csbpp)
746 {
747 	uint64_t basea = (*csapp)->mccs_props.csp_base;
748 	uint64_t baseb = (*csbpp)->mccs_props.csp_base;
749 
750 	if (basea == baseb)
751 		return (0);
752 	else if (basea < baseb)
753 		return (-1);
754 	else
755 		return (1);
756 }
757 
/*
 * The following are for use in simulating TestFail for a chip-select
 * without poking at the hardware (which tends to get upset if you do
 * since the BIOS needs to restart to map a failed cs out).  For internal
 * testing only!  Note that setting these does not give the full experience -
 * the selected chip-select *is* enabled and can give errors etc and the
 * patounum logic will get confused.
 *
 * The hook is armed only when both values are something other than -1.
 * mc_mkprops_dramctl then treats chip-select testfail_csnum of memory
 * controller testfail_mcnum as not enabled, not a spare, and marked
 * TestFail, and logs a note saying so.
 */
int testfail_mcnum = -1;
int testfail_csnum = -1;
768 
769 /*
770  * Function 2 configuration - DRAM Controller
771  */
772 static void
773 mc_mkprops_dramctl(mc_pcicfg_hdl_t cfghdl, mc_t *mc)
774 {
775 	union mcreg_csbase base[MC_CHIP_NCS];
776 	union mcreg_csmask mask[MC_CHIP_NCS];
777 	union mcreg_dramcfg_lo drcfg_lo;
778 	union mcreg_dramcfg_hi drcfg_hi;
779 	union mcreg_drammisc drmisc;
780 	union mcreg_bankaddrmap baddrmap;
781 	mc_props_t *mcp = &mc->mc_props;
782 	mc_cfgregs_t *mcr = &mc->mc_cfgregs;
783 	int maskdivisor;
784 	int wide = 0;
785 	uint32_t rev = mc->mc_props.mcp_rev;
786 	int i;
787 	mcamd_hdl_t hdl;
788 
789 	mcamd_mkhdl(&hdl);	/* to call into common code */
790 
791 	/*
792 	 * Read Function 2 DRAM Configuration High and Low registers.  The High
793 	 * part is mostly concerned with memory clocks etc and we'll not have
794 	 * any use for that.  The Low component tells us if ECC is enabled,
795 	 * if we're in 64- or 128-bit MC mode, how the upper chip-selects
796 	 * are mapped, which chip-select pairs are using x4 parts, etc.
797 	 */
798 	MCREG_VAL32(&drcfg_lo) = mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMCFGLO);
799 	MCREG_VAL32(&drcfg_hi) = mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMCFGHI);
800 	mcr->mcr_dramcfglo = MCREG_VAL32(&drcfg_lo);
801 	mcr->mcr_dramcfghi = MCREG_VAL32(&drcfg_hi);
802 
803 	/*
804 	 * Note the DRAM controller width.  The 64/128 bit is in a different
805 	 * bit position for revision F and G.
806 	 */
807 	if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
808 		wide = MCREG_FIELD_F_revFG(&drcfg_lo, Width128);
809 	} else {
810 		wide = MCREG_FIELD_F_preF(&drcfg_lo, Width128);
811 	}
812 	mcp->mcp_accwidth = wide ? 128 : 64;
813 
814 	/*
815 	 * Read Function 2 DRAM Controller Miscellaenous Regsiter for those
816 	 * revs that support it.  This include the Mod64Mux indication on
817 	 * these revs - for rev E it is in DRAM config low.
818 	 */
819 	if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
820 		mcr->mcr_drammisc = MCREG_VAL32(&drmisc) =
821 		    mc_pcicfg_get32(cfghdl, MC_DC_REG_DRAMMISC);
822 		mcp->mcp_mod64mux = MCREG_FIELD_F_revFG(&drmisc, Mod64Mux);
823 	} else if (MC_REV_MATCH(rev, MC_F_REV_E)) {
824 		mcp->mcp_mod64mux = MCREG_FIELD_F_preF(&drcfg_lo, Mod64BitMux);
825 	}
826 
827 	/*
828 	 * Read Function 2 DRAM Bank Address Mapping.  This encodes the
829 	 * type of DIMM module in use for each chip-select pair.
830 	 * Prior ro revision F it also tells us whether BankSwizzle mode
831 	 * is enabled - in rev F that has moved to dram config hi register.
832 	 */
833 	mcp->mcp_csbankmapreg = MCREG_VAL32(&baddrmap) =
834 	    mc_pcicfg_get32(cfghdl, MC_DC_REG_BANKADDRMAP);
835 
836 	/*
837 	 * Determine whether bank swizzle mode is active.  Bank swizzling was
838 	 * introduced as an option in rev E,  but the bit that indicates it
839 	 * is enabled has moved in revs F/G.
840 	 */
841 	if (MC_REV_MATCH(rev, MC_F_REV_E)) {
842 		mcp->mcp_bnkswzl =
843 		    MCREG_FIELD_F_preF(&baddrmap, BankSwizzleMode);
844 	} else if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
845 		mcp->mcp_bnkswzl = MCREG_FIELD_F_revFG(&drcfg_hi,
846 		    BankSwizzleMode);
847 	}
848 
849 	/*
850 	 * Read the DRAM CS Base and DRAM CS Mask registers.  Revisions prior
851 	 * to F have an equal number of base and mask registers; revision F
852 	 * has twice as many base registers as masks.
853 	 */
854 	maskdivisor = MC_REV_MATCH(rev, MC_F_REVS_FG) ? 2 : 1;
855 
856 	mc_prop_read_pair(cfghdl,
857 	    (uint32_t *)base, MC_DC_REG_CSBASE_0, MC_CHIP_NCS,
858 	    (uint32_t *)mask, MC_DC_REG_CSMASK_0, MC_CHIP_NCS / maskdivisor,
859 	    MC_DC_REG_CS_INCR);
860 
861 	/*
862 	 * Create a cs node for each enabled chip-select as well as
863 	 * any appointed online spare chip-selects and for any that have
864 	 * failed test.
865 	 */
866 	for (i = 0; i < MC_CHIP_NCS; i++) {
867 		mc_cs_t *mccs;
868 		uint64_t csbase, csmask;
869 		size_t sz;
870 		int csbe, spare, testfail;
871 
872 		if (MC_REV_MATCH(rev, MC_F_REVS_FG)) {
873 			csbe = MCREG_FIELD_F_revFG(&base[i], CSEnable);
874 			spare = MCREG_FIELD_F_revFG(&base[i], Spare);
875 			testfail = MCREG_FIELD_F_revFG(&base[i], TestFail);
876 		} else {
877 			csbe = MCREG_FIELD_F_preF(&base[i], CSEnable);
878 			spare = 0;
879 			testfail = 0;
880 		}
881 
882 		/* Testing hook */
883 		if (testfail_mcnum != -1 && testfail_csnum != -1 &&
884 		    mcp->mcp_num == testfail_mcnum && i == testfail_csnum) {
885 			csbe = spare = 0;
886 			testfail = 1;
887 			cmn_err(CE_NOTE, "Pretending MC %d CS %d failed test",
888 			    testfail_mcnum, testfail_csnum);
889 		}
890 
891 		/*
892 		 * If the chip-select is not enabled then skip it unless
893 		 * it is a designated online spare or is marked with TestFail.
894 		 */
895 		if (!csbe && !(spare || testfail))
896 			continue;
897 
898 		/*
899 		 * For an enabled or spare chip-select the Bank Address Mapping
900 		 * register will be valid as will the chip-select mask.  The
901 		 * base will not be valid but we'll read and store it anyway.
902 		 * We will not know whether the spare is already swapped in
903 		 * until MC function 3 attaches.
904 		 */
905 		if (csbe || spare) {
906 			if (mcamd_cs_size(&hdl, (mcamd_node_t *)mc, i, &sz) < 0)
907 				continue;
908 			csbase = MC_CSBASE(&base[i], rev);
909 			csmask = MC_CSMASK(&mask[i / maskdivisor], rev);
910 		} else {
911 			sz = 0;
912 			csbase = csmask = 0;
913 		}
914 
915 		mccs = mc_cs_create(mc, i, csbase, csmask, sz,
916 		    csbe, spare, testfail);
917 
918 		if (mc->mc_cslist == NULL)
919 			mc->mc_cslist = mccs;
920 		else
921 			mc->mc_cslast->mccs_next = mccs;
922 		mc->mc_cslast = mccs;
923 
924 		mccs->mccs_cfgregs.csr_csbase = MCREG_VAL32(&base[i]);
925 		mccs->mccs_cfgregs.csr_csmask =
926 		    MCREG_VAL32(&mask[i / maskdivisor]);
927 
928 		/*
929 		 * Check for cs bank interleaving - some bits clear in the
930 		 * lower mask.  All banks must/will have the same lomask bits
931 		 * if cs interleaving is active.
932 		 */
933 		if (csbe && !mcp->mcp_csintlvfctr) {
934 			int bitno, ibits = 0;
935 			for (bitno = MC_CSMASKLO_LOBIT(rev);
936 			    bitno <= MC_CSMASKLO_HIBIT(rev); bitno++) {
937 				if (!(csmask & (1 << bitno)))
938 					ibits++;
939 			}
940 			mcp->mcp_csintlvfctr = 1 << ibits;
941 		}
942 	}
943 
944 	/*
945 	 * If there is no chip-select interleave on this node determine
946 	 * whether the chip-select ranks are contiguous or if there
947 	 * is a hole.
948 	 */
949 	if (mcp->mcp_csintlvfctr == 1) {
950 		mc_cs_t *csp[MC_CHIP_NCS];
951 		mc_cs_t *mccs;
952 		int ncsbe = 0;
953 
954 		for (mccs = mc->mc_cslist; mccs != NULL;
955 		    mccs = mccs->mccs_next) {
956 			if (mccs->mccs_props.csp_csbe)
957 				csp[ncsbe++] = mccs;
958 		}
959 
960 		if (ncsbe != 0) {
961 			qsort((void *)csp, ncsbe, sizeof (mc_cs_t *),
962 			    (int (*)(const void *, const void *))csbasecmp);
963 
964 			for (i = 1; i < ncsbe; i++) {
965 				if (csp[i]->mccs_props.csp_base !=
966 				    csp[i - 1]->mccs_props.csp_base +
967 				    csp[i - 1]->mccs_props.csp_size)
968 					mc->mc_csdiscontig = 1;
969 			}
970 		}
971 	}
972 
973 
974 	/*
975 	 * Since we do not attach to MC function 3 go ahead and read some
976 	 * config parameters from it now.
977 	 */
978 	mc_getmiscctl(mc);
979 
980 	/*
981 	 * Now that we have discovered all enabled/spare/testfail chip-selects
982 	 * we divine the associated DIMM configuration.
983 	 */
984 	mc_dimmlist_create(mc);
985 }
986 
/*
 * One entry of the table that maps a device node's binding name to the
 * memory controller PCI function it represents.  mc_attach() compares a
 * node's binding name against bm_bindnm and, on a match, uses the other
 * members to publish the "model" property and to invoke the per-function
 * property builder.
 */
typedef struct mc_bind_map {
	const char *bm_bindnm;	 /* attachment binding name */
	enum mc_funcnum bm_func; /* PCI config space function number for bind */
	const char *bm_model;	 /* value for device node model property */
	/* optional callback given a config space handle for bm_func */
	void (*bm_mkprops)(mc_pcicfg_hdl_t, mc_t *);
} mc_bind_map_t;
993 
994 /*
995  * Do not attach to MC function 3 - agpgart already attaches to that.
996  * Function 3 may be a good candidate for a nexus driver to fan it out
997  * into virtual devices by functionality.  We will use pci_mech1_getl
998  * to retrieve the function 3 parameters we require.
999  */
1000 
1001 static const mc_bind_map_t mc_bind_map[] = {
1002 	{ MC_FUNC_HTCONFIG_BINDNM, MC_FUNC_HTCONFIG,
1003 	    "AMD Memory Controller (HT Configuration)", mc_mkprops_htcfg },
1004 	{ MC_FUNC_ADDRMAP_BINDNM, MC_FUNC_ADDRMAP,
1005 	    "AMD Memory Controller (Address Map)", mc_mkprops_addrmap },
1006 	{ MC_FUNC_DRAMCTL_BINDNM, MC_FUNC_DRAMCTL,
1007 	    "AMD Memory Controller (DRAM Controller & HT Trace)",
1008 	    mc_mkprops_dramctl },
1009 	NULL
1010 };
1011 
1012 /*ARGSUSED*/
1013 static int
1014 mc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
1015 {
1016 	if (otyp != OTYP_CHR)
1017 		return (EINVAL);
1018 
1019 	rw_enter(&mc_lock, RW_READER);
1020 	if (mc_lookup_by_chipid(getminor(*devp)) == NULL) {
1021 		rw_exit(&mc_lock);
1022 		return (EINVAL);
1023 	}
1024 	rw_exit(&mc_lock);
1025 
1026 	return (0);
1027 }
1028 
/*
 * close(9E) entry point.  mc_open() keeps no per-open state, so there is
 * nothing to release here; always returns 0.
 */
/*ARGSUSED*/
static int
mc_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	return (0);
}
1035 
/*
 * Enable swap from chip-select csnum to the spare chip-select on this
 * memory controller (if any).
 *
 * The caller must hold mc_lock as writer.  Returns 0 once the hardware
 * reports SwapDone, otherwise one of:
 *
 *	ENOTSUP	the MC revision does not match MC_F_REVS_FG
 *	ENODEV	no spare chip-select is configured, or the DRAM scrubber
 *		is not enabled
 *	EBUSY	a bad chip-select is already recorded for this MC, or the
 *		hardware already reports SwapDone
 *	EINVAL	csnum is the spare itself, or is not on this MC's cs list
 *	ETIME	SwapDone was not observed within the polling window
 *
 * On success the recorded bad chip-select, the cached spare control
 * register value and the swap timestamp in the mc_t are updated.
 */

int mc_swapdonetime = 30;	/* max number of seconds to wait for SwapDone */

static int
mc_onlinespare(mc_t *mc, int csnum)
{
	mc_props_t *mcp = &mc->mc_props;
	union mcreg_sparectl sparectl;
	union mcreg_scrubctl scrubctl;
	mc_cs_t *mccs;
	hrtime_t tmax;
	int i = 0;

	ASSERT(RW_WRITE_HELD(&mc_lock));

	if (!MC_REV_MATCH(mcp->mcp_rev, MC_F_REVS_FG))
		return (ENOTSUP);	/* MC rev does not offer online spare */
	else if (mcp->mcp_sparecs == MC_INVALNUM)
		return (ENODEV);	/* Supported, but no spare configured */
	else if (mcp->mcp_badcs != MC_INVALNUM)
		return (EBUSY);		/* Spare already swapped in */
	else if (csnum == mcp->mcp_sparecs)
		return (EINVAL);	/* Can't spare the spare! */

	/* The nominated chip-select must be one we know about on this MC. */
	for (mccs = mc->mc_cslist; mccs != NULL; mccs = mccs->mccs_next) {
		if (mccs->mccs_props.csp_num == csnum)
			break;
	}
	if (mccs == NULL)
		return (EINVAL);	/* nominated bad CS does not exist */

	/*
	 * If the DRAM Scrubber is not enabled then the swap cannot succeed.
	 */
	MCREG_VAL32(&scrubctl) = mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
	    MC_CTL_REG_SCRUBCTL);
	if (MCREG_FIELD_CMN(&scrubctl, DramScrub) == 0)
		return (ENODEV);	/* DRAM scrubber not enabled */

	/*
	 * Read Online Spare Control Register again, just in case our
	 * state does not reflect reality.
	 */
	MCREG_VAL32(&sparectl) = mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL,
	    MC_CTL_REG_SPARECTL);

	if (MCREG_FIELD_F_revFG(&sparectl, SwapDone))
		return (EBUSY);

	/* Write to the BadDramCs field */
	MCREG_FIELD_F_revFG(&sparectl, BadDramCs) = csnum;
	mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL,
	    MCREG_VAL32(&sparectl));

	/* And request that the swap to the spare start */
	MCREG_FIELD_F_revFG(&sparectl, SwapEn) = 1;
	mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL,
	    MCREG_VAL32(&sparectl));

	/*
	 * Poll for SwapDone - we have disabled notification by interrupt.
	 * Swap takes "several CPU cycles, depending on the DRAM speed, but
	 * is performed in the background" (Family 0Fh Bios Porting Guide).
	 * We're in a slow ioctl path so there is no harm in waiting around
	 * a bit - consumers of the ioctl must be aware that it may take
	 * a moment.  We will poll for up to mc_swapdonetime seconds,
	 * limiting that to 120s.
	 *
	 * The swap is performed by the DRAM scrubber (which must be enabled)
	 * whose scrub rate is accelerated for the duration of the swap.
	 * The maximum swap rate is 40.0ns per 64 bytes, so the maximum
	 * supported cs size of 16GB would take 10.7s at that max rate
	 * of 25000000 scrubs/second.
	 */
	tmax = gethrtime() + MIN(mc_swapdonetime, 120) * 1000000000ULL;
	do {
		/* Always wait before re-reading, including the first pass. */
		if (i++ < 20)
			delay(drv_usectohz(100000));	/* 0.1s for up to 2s */
		else
			delay(drv_usectohz(500000));	/* 0.5s */

		MCREG_VAL32(&sparectl) = mc_pcicfg_get32_nohdl(mc,
		    MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);
	} while (!MCREG_FIELD_F_revFG(&sparectl, SwapDone) &&
	    gethrtime() < tmax);

	if (!MCREG_FIELD_F_revFG(&sparectl, SwapDone))
		return (ETIME);		/* Operation timed out */

	/* Record the completed swap in our cached state. */
	mcp->mcp_badcs = csnum;
	mc->mc_cfgregs.mcr_sparectl = MCREG_VAL32(&sparectl);
	mc->mc_spareswaptime = gethrtime();

	return (0);
}
1135 
1136 /*ARGSUSED*/
1137 static int
1138 mc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1139 {
1140 	int rc = 0;
1141 	mc_t *mc;
1142 
1143 	if (cmd != MC_IOC_SNAPSHOT_INFO && cmd != MC_IOC_SNAPSHOT &&
1144 	    cmd != MC_IOC_ONLINESPARE_EN)
1145 		return (EINVAL);
1146 
1147 	rw_enter(&mc_lock, RW_READER);
1148 
1149 	if ((mc = mc_lookup_by_chipid(getminor(dev))) == NULL) {
1150 		rw_exit(&mc_lock);
1151 		return (EINVAL);
1152 	}
1153 
1154 	switch (cmd) {
1155 	case MC_IOC_SNAPSHOT_INFO: {
1156 		mc_snapshot_info_t mcs;
1157 
1158 		if (mc_snapshot_update(mc) < 0) {
1159 			rw_exit(&mc_lock);
1160 			return (EIO);
1161 		}
1162 
1163 		mcs.mcs_size = mc->mc_snapshotsz;
1164 		mcs.mcs_gen = mc->mc_snapshotgen;
1165 
1166 		if (ddi_copyout(&mcs, (void *)arg, sizeof (mc_snapshot_info_t),
1167 		    mode) < 0)
1168 			rc = EFAULT;
1169 		break;
1170 	}
1171 
1172 	case MC_IOC_SNAPSHOT:
1173 		if (mc_snapshot_update(mc) < 0) {
1174 			rw_exit(&mc_lock);
1175 			return (EIO);
1176 		}
1177 
1178 		if (ddi_copyout(mc->mc_snapshot, (void *)arg, mc->mc_snapshotsz,
1179 		    mode) < 0)
1180 			rc = EFAULT;
1181 		break;
1182 
1183 	case MC_IOC_ONLINESPARE_EN:
1184 		if (drv_priv(credp) != 0) {
1185 			rw_exit(&mc_lock);
1186 			return (EPERM);
1187 		}
1188 
1189 		if (!rw_tryupgrade(&mc_lock)) {
1190 			rw_exit(&mc_lock);
1191 			return (EAGAIN);
1192 		}
1193 
1194 		if ((rc = mc_onlinespare(mc, (int)arg)) == 0) {
1195 			mc_snapshot_destroy(mc);
1196 			nvlist_free(mc->mc_nvl);
1197 			mc->mc_nvl = mc_nvl_create(mc);
1198 		}
1199 
1200 		break;
1201 	}
1202 
1203 	rw_exit(&mc_lock);
1204 
1205 	return (rc);
1206 }
1207 
/*
 * Character/block entry points for the driver.  Only open, close and
 * ioctl are implemented by this file; property operations are handed to
 * ddi_prop_op and every other slot is filled with a stub.
 */
static struct cb_ops mc_cb_ops = {
	mc_open,
	mc_close,
	nodev,		/* not a block driver */
	nodev,		/* no print routine */
	nodev,		/* no dump routine */
	nodev,		/* no read routine */
	nodev,		/* no write routine */
	mc_ioctl,
	nodev,		/* no devmap routine */
	nodev,		/* no mmap routine */
	nodev,		/* no segmap routine */
	nochpoll,	/* no chpoll routine */
	ddi_prop_op,
	0,		/* not a STREAMS driver */
	D_NEW | D_MP,	/* safe for multi-thread/multi-processor */
};
1225 
1226 /*ARGSUSED*/
1227 static int
1228 mc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
1229 {
1230 	int rc = DDI_SUCCESS;
1231 	mc_t *mc;
1232 
1233 	if (infocmd != DDI_INFO_DEVT2DEVINFO &&
1234 	    infocmd != DDI_INFO_DEVT2INSTANCE) {
1235 		*result = NULL;
1236 		return (DDI_FAILURE);
1237 	}
1238 
1239 	rw_enter(&mc_lock, RW_READER);
1240 
1241 	if ((mc = mc_lookup_by_chipid(getminor((dev_t)arg))) == NULL ||
1242 	    mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_devi == NULL) {
1243 		rc = DDI_FAILURE;
1244 	} else if (infocmd == DDI_INFO_DEVT2DEVINFO) {
1245 		*result = mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_devi;
1246 	} else {
1247 		*result = (void *)(uintptr_t)
1248 		    mc->mc_funcs[MC_FUNC_DEVIMAP].mcf_instance;
1249 	}
1250 
1251 	rw_exit(&mc_lock);
1252 
1253 	return (rc);
1254 }
1255 
1256 /*ARGSUSED2*/
1257 static int
1258 mc_fm_handle(dev_info_t *dip, ddi_fm_error_t *fmerr, const void *arg)
1259 {
1260 	pci_ereport_post(dip, fmerr, NULL);
1261 	return (fmerr->fme_status);
1262 }
1263 
1264 static void
1265 mc_fm_init(dev_info_t *dip)
1266 {
1267 	int fmcap = DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE;
1268 	ddi_fm_init(dip, &fmcap, NULL);
1269 	pci_ereport_setup(dip);
1270 	ddi_fm_handler_register(dip, mc_fm_handle, NULL);
1271 }
1272 
1273 /*ARGSUSED*/
1274 static int
1275 mc_create_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
1276 {
1277 	chipid_t chipid = *((chipid_t *)arg1);
1278 	cmi_hdl_t *hdlp = (cmi_hdl_t *)arg2;
1279 
1280 	if (cmi_hdl_chipid(whdl) == chipid) {
1281 		cmi_hdl_hold(whdl);	/* short-term hold */
1282 		*hdlp = whdl;
1283 		return (CMI_HDL_WALK_DONE);
1284 	} else {
1285 		return (CMI_HDL_WALK_NEXT);
1286 	}
1287 }
1288 
1289 static mc_t *
1290 mc_create(chipid_t chipid)
1291 {
1292 	mc_t *mc;
1293 	cmi_hdl_t hdl = NULL;
1294 
1295 	ASSERT(RW_WRITE_HELD(&mc_lock));
1296 
1297 	/*
1298 	 * Find a handle for one of a chip's CPU.
1299 	 *
1300 	 * We can use one of the chip's CPUs since all cores
1301 	 * of a chip share the same revision and socket type.
1302 	 */
1303 	cmi_hdl_walk(mc_create_cb, (void *)&chipid, (void *)&hdl, NULL);
1304 	if (hdl == NULL)
1305 		return (NULL);	/* no cpu for this chipid found! */
1306 
1307 	mc = kmem_zalloc(sizeof (mc_t), KM_SLEEP);
1308 
1309 	mc->mc_hdr.mch_type = MC_NT_MC;
1310 	mc->mc_props.mcp_num = chipid;
1311 	mc->mc_props.mcp_sparecs = MC_INVALNUM;
1312 	mc->mc_props.mcp_badcs = MC_INVALNUM;
1313 
1314 	mc->mc_props.mcp_rev = cmi_hdl_chiprev(hdl);
1315 	mc->mc_revname = cmi_hdl_chiprevstr(hdl);
1316 	mc->mc_socket = cmi_hdl_getsockettype(hdl);
1317 
1318 	if (mc_list == NULL)
1319 		mc_list = mc;
1320 	if (mc_last != NULL)
1321 		mc_last->mc_next = mc;
1322 
1323 	mc->mc_next = NULL;
1324 	mc_last = mc;
1325 
1326 	cmi_hdl_rele(hdl);
1327 
1328 	return (mc);
1329 }
1330 
/*
 * Pick the faster of two scrub rates.  r1 is given directly; r2 is the
 * field of 'cfg' selected by 'mask' and moved down by 'shift'.  A rate of
 * zero means scrubbing is off, so if either rate is zero the other one is
 * returned (which may itself be zero).  Otherwise smaller values denote
 * faster scrubbing and the smaller of the two is returned.
 */
static uint32_t
mc_scrubber_max(uint32_t r1, uint32_t cfg, uint32_t mask, uint32_t shift)
{
	uint32_t r2 = (cfg & mask) >> shift;

	if (r1 == 0)
		return (r2);

	if (r2 == 0)
		return (r1);

	return (r1 < r2 ? r1 : r2);
}
1347 
1348 
/*
 * Enable the memory scrubber.  We must use the mc_pcicfg_{get32,put32}_nohdl
 * interfaces since we do not bind to function 3.
 *
 * The current scrub control and scrub address registers are read and
 * cached in mc->mc_cfgregs.  Under the MC_SCRUB_BIOSDEFAULT policy nothing
 * is written and the result simply reflects the rate already programmed.
 * Otherwise DRAM scrubbing is turned off, the scrub address registers are
 * programmed with this node's base address and scrubber redirect, a rate
 * is chosen according to mc_scrub_policy, and scrubbing is turned back on
 * unless one of the errata checks below vetoes it.
 *
 * Note that the rate finally chosen is stored back into the global
 * mc_scrub_rate_dram.
 *
 * Returns CMI_SUCCESS if DRAM scrubbing ends up enabled and
 * CMIERR_MC_NOMEMSCRUB if it does not.
 */
cmi_errno_t
mc_scrubber_enable(mc_t *mc)
{
	mc_props_t *mcp = &mc->mc_props;
	chipid_t chipid = (chipid_t)mcp->mcp_num;
	uint32_t rev = (uint32_t)mcp->mcp_rev;
	mc_cfgregs_t *mcr = &mc->mc_cfgregs;
	union mcreg_scrubctl scrubctl;
	union mcreg_dramscrublo dalo;
	union mcreg_dramscrubhi dahi;

	/* Read the three registers, caching the raw values as we go. */
	mcr->mcr_scrubctl = MCREG_VAL32(&scrubctl) =
	    mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL);

	mcr->mcr_scrubaddrlo = MCREG_VAL32(&dalo) =
	    mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_LO);

	mcr->mcr_scrubaddrhi = MCREG_VAL32(&dahi) =
	    mc_pcicfg_get32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_HI);

	/* BIOS default policy: leave the hardware alone and just report. */
	if (mc_scrub_policy == MC_SCRUB_BIOSDEFAULT)
		return (MCREG_FIELD_CMN(&scrubctl, DramScrub) !=
		    AMD_NB_SCRUBCTL_RATE_NONE ?
		    CMI_SUCCESS : CMIERR_MC_NOMEMSCRUB);

	/*
	 * Disable DRAM scrubbing while we fiddle.
	 */
	MCREG_FIELD_CMN(&scrubctl, DramScrub) = AMD_NB_SCRUBCTL_RATE_NONE;
	mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL,
	    MCREG_VAL32(&scrubctl));

	/*
	 * Setup DRAM Scrub Address Low and High registers for the
	 * base address of this node, and to select scrubber redirect.
	 */
	MCREG_FIELD_CMN(&dalo, ScrubReDirEn) = 1;
	MCREG_FIELD_CMN(&dalo, ScrubAddrLo) =
	    AMD_NB_SCRUBADDR_MKLO(mcp->mcp_base);

	MCREG_FIELD_CMN(&dahi, ScrubAddrHi) =
	    AMD_NB_SCRUBADDR_MKHI(mcp->mcp_base);

	mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_LO,
	    MCREG_VAL32(&dalo));
	mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBADDR_HI,
	    MCREG_VAL32(&dahi));

	/* Clamp the tunable to the largest encodable rate value. */
	if (mc_scrub_rate_dram > AMD_NB_SCRUBCTL_RATE_MAX) {
		cmn_err(CE_WARN, "mc_scrub_rate_dram is too large; "
		    "resetting to 0x%x\n", AMD_NB_SCRUBCTL_RATE_MAX);
		mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_MAX;
	}

	switch (mc_scrub_policy) {
	case MC_SCRUB_FIXED:
		/* Use the system value checked above */
		break;

	default:
		cmn_err(CE_WARN, "Unknown mc_scrub_policy value %d - "
		    "using default policy of MC_SCRUB_MAX", mc_scrub_policy);
		/*FALLTHRU*/

	case MC_SCRUB_MAX:
		/*
		 * Take the faster of the tunable and the rate that was in
		 * the register when we read it (before we disabled it).
		 */
		mc_scrub_rate_dram = mc_scrubber_max(mc_scrub_rate_dram,
		    mcr->mcr_scrubctl, AMD_NB_SCRUBCTL_DRAM_MASK,
		    AMD_NB_SCRUBCTL_DRAM_SHIFT);
		break;
	}

	/*
	 * OPTERON_ERRATUM_99:
	 * This erratum applies on revisions D and earlier.
	 * This erratum also applies on revisions E and later,
	 * if BIOS uses chip-select hoisting instead of DRAM hole
	 * mapping.
	 *
	 * Do not enable the dram scrubber if the chip-select ranges
	 * for the node are not contiguous.
	 */
	if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE &&
	    mc->mc_csdiscontig) {
		cmn_err(CE_CONT, "?Opteron DRAM scrubber disabled on revision "
		    "%s chip %d because DRAM hole is present on this node",
		    mc->mc_revname, chipid);
		mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_NONE;
	}

	/*
	 * OPTERON_ERRATUM_101:
	 * This erratum applies on revisions D and earlier.
	 *
	 * If the DRAM Base Address register's IntlvEn field indicates that
	 * node interleaving is enabled, we must disable the DRAM scrubber
	 * and return zero to indicate that Solaris should use s/w instead.
	 */
	if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE &&
	    mcp->mcp_ilen != 0 &&
	    !X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_E)) {
		cmn_err(CE_CONT, "?Opteron DRAM scrubber disabled on revision "
		    "%s chip %d because DRAM memory is node-interleaved",
		    mc->mc_revname, chipid);
		mc_scrub_rate_dram = AMD_NB_SCRUBCTL_RATE_NONE;
	}

	/* Only now, with everything else programmed, turn scrubbing on. */
	if (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE) {
		MCREG_FIELD_CMN(&scrubctl, DramScrub) = mc_scrub_rate_dram;
		mc_pcicfg_put32_nohdl(mc, MC_FUNC_MISCCTL, MC_CTL_REG_SCRUBCTL,
		    MCREG_VAL32(&scrubctl));
	}

	return (mc_scrub_rate_dram != AMD_NB_SCRUBCTL_RATE_NONE ?
	    CMI_SUCCESS : CMIERR_MC_NOMEMSCRUB);
}
1468 
1469 /*ARGSUSED*/
1470 static int
1471 mc_attach_cb(cmi_hdl_t whdl, void *arg1, void *arg2, void *arg3)
1472 {
1473 	mc_t *mc = (mc_t *)arg1;
1474 	mcamd_prop_t chipid = *((mcamd_prop_t *)arg2);
1475 
1476 	if (cmi_hdl_chipid(whdl) == chipid) {
1477 		mcamd_mc_register(whdl, mc);
1478 	}
1479 
1480 	return (CMI_HDL_WALK_NEXT);
1481 }
1482 
1483 static int mc_sw_scrub_disabled = 0;
1484 
1485 static int
1486 mc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1487 {
1488 	mc_pcicfg_hdl_t cfghdl;
1489 	const mc_bind_map_t *bm;
1490 	const char *bindnm;
1491 	char *unitstr = NULL;
1492 	enum mc_funcnum func;
1493 	long unitaddr;
1494 	int chipid, rc;
1495 	mc_t *mc;
1496 
1497 	/*
1498 	 * This driver has no hardware state, but does
1499 	 * claim to have a reg property, so it will be
1500 	 * called on suspend.  It is probably better to
1501 	 * make sure it doesn't get called on suspend,
1502 	 * but it is just as easy to make sure we just
1503 	 * return DDI_SUCCESS if called.
1504 	 */
1505 	if (cmd == DDI_RESUME)
1506 		return (DDI_SUCCESS);
1507 
1508 	if (cmd != DDI_ATTACH || mc_no_attach != 0)
1509 		return (DDI_FAILURE);
1510 
1511 	bindnm = ddi_binding_name(dip);
1512 	for (bm = mc_bind_map; bm->bm_bindnm != NULL; bm++) {
1513 		if (strcmp(bindnm, bm->bm_bindnm) == 0) {
1514 			func = bm->bm_func;
1515 			break;
1516 		}
1517 	}
1518 
1519 	if (bm->bm_bindnm == NULL)
1520 		return (DDI_FAILURE);
1521 
1522 	/*
1523 	 * We need the device number, which corresponds to the processor node
1524 	 * number plus 24.  The node number can then be used to associate this
1525 	 * memory controller device with a given processor chip.
1526 	 */
1527 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
1528 	    DDI_PROP_DONTPASS, "unit-address", &unitstr) != DDI_PROP_SUCCESS) {
1529 		cmn_err(CE_WARN, "failed to find unit-address for %s", bindnm);
1530 		return (DDI_FAILURE);
1531 	}
1532 
1533 	rc = ddi_strtol(unitstr, NULL, 16, &unitaddr);
1534 	ASSERT(rc == 0 && unitaddr >= MC_AMD_DEV_OFFSET);
1535 
1536 	if (rc != 0 || unitaddr < MC_AMD_DEV_OFFSET) {
1537 		cmn_err(CE_WARN, "failed to parse unit address %s for %s\n",
1538 		    unitstr, bindnm);
1539 		ddi_prop_free(unitstr);
1540 		return (DDI_FAILURE);
1541 	}
1542 	ddi_prop_free(unitstr);
1543 
1544 	chipid = unitaddr - MC_AMD_DEV_OFFSET;
1545 
1546 	rw_enter(&mc_lock, RW_WRITER);
1547 
1548 	for (mc = mc_list; mc != NULL; mc = mc->mc_next) {
1549 		if (mc->mc_props.mcp_num == chipid)
1550 			break;
1551 	}
1552 
1553 	/* Integrate this memory controller device into existing set */
1554 	if (mc == NULL) {
1555 		mc = mc_create(chipid);
1556 
1557 		if (mc == NULL) {
1558 			/*
1559 			 * We don't complain here because this is a legitimate
1560 			 * path for MP systems.  On those machines, we'll attach
1561 			 * before all CPUs have been initialized, and thus the
1562 			 * chip verification in mc_create will fail.  We'll be
1563 			 * reattached later for those CPUs.
1564 			 */
1565 			rw_exit(&mc_lock);
1566 			return (DDI_FAILURE);
1567 		}
1568 	} else {
1569 		mc_snapshot_destroy(mc);
1570 	}
1571 
1572 	/* Beyond this point, we're committed to creating this node */
1573 
1574 	mc_fm_init(dip);
1575 
1576 	ASSERT(mc->mc_funcs[func].mcf_devi == NULL);
1577 	mc->mc_funcs[func].mcf_devi = dip;
1578 	mc->mc_funcs[func].mcf_instance = ddi_get_instance(dip);
1579 
1580 	mc->mc_ref++;
1581 
1582 	/*
1583 	 * Add the common properties to this node, and then add any properties
1584 	 * that are specific to this node based upon its configuration space.
1585 	 */
1586 	(void) ddi_prop_update_string(DDI_DEV_T_NONE,
1587 	    dip, "model", (char *)bm->bm_model);
1588 
1589 	(void) ddi_prop_update_int(DDI_DEV_T_NONE,
1590 	    dip, "chip-id", mc->mc_props.mcp_num);
1591 
1592 	if (bm->bm_mkprops != NULL &&
1593 	    mc_pcicfg_setup(mc, bm->bm_func, &cfghdl) == DDI_SUCCESS) {
1594 		bm->bm_mkprops(cfghdl, mc);
1595 		mc_pcicfg_teardown(cfghdl);
1596 	}
1597 
1598 	/*
1599 	 * If this is the last node to be attached for this memory controller,
1600 	 * then create the minor node, enable scrubbers, and register with
1601 	 * cpu module(s) for this chip.
1602 	 */
1603 	if (func == MC_FUNC_DEVIMAP) {
1604 		mc_props_t *mcp = &mc->mc_props;
1605 		int dram_present = 0;
1606 
1607 		if (ddi_create_minor_node(dip, "mc-amd", S_IFCHR,
1608 		    mcp->mcp_num, "ddi_mem_ctrl",
1609 		    0) != DDI_SUCCESS) {
1610 			cmn_err(CE_WARN, "failed to create minor node for chip "
1611 			    "%d memory controller\n",
1612 			    (chipid_t)mcp->mcp_num);
1613 		}
1614 
1615 		/*
1616 		 * Register the memory controller for every CPU of this chip.
1617 		 *
1618 		 * If there is memory present on this node and ECC is enabled
1619 		 * attempt to enable h/w memory scrubbers for this node.
1620 		 * If we are successful in enabling *any* hardware scrubbers,
1621 		 * disable the software memory scrubber.
1622 		 */
1623 		cmi_hdl_walk(mc_attach_cb, (void *)mc, (void *)&mcp->mcp_num,
1624 		    NULL);
1625 
1626 		if (mcp->mcp_lim != mcp->mcp_base) {
1627 			/*
1628 			 * This node may map non-dram memory alone, so we
1629 			 * must check for an enabled chip-select to be
1630 			 * sure there is dram present.
1631 			 */
1632 			mc_cs_t *mccs;
1633 
1634 			for (mccs = mc->mc_cslist; mccs != NULL;
1635 			    mccs = mccs->mccs_next) {
1636 				if (mccs->mccs_props.csp_csbe) {
1637 					dram_present = 1;
1638 					break;
1639 				}
1640 			}
1641 		}
1642 
1643 		if (dram_present && !mc_ecc_enabled(mc)) {
1644 			/*
1645 			 * On a single chip system there is no point in
1646 			 * scrubbing if there is no ECC on the single node.
1647 			 * On a multichip system, necessarily Opteron using
1648 			 * registered ECC-capable DIMMs, if there is memory
1649 			 * present on a node but no ECC there then we'll assume
1650 			 * ECC is disabled for all nodes and we will not enable
1651 			 * the scrubber and wll also disable the software
1652 			 * memscrub thread.
1653 			 */
1654 			rc = 1;
1655 		} else if (!dram_present) {
1656 			/* No memory on this node - others decide memscrub */
1657 			rc = 0;
1658 		} else {
1659 			/*
1660 			 * There is memory on this node and ECC is enabled.
1661 			 * Call via the cpu module to enable memory scrubbing
1662 			 * on this node - we could call directly but then
1663 			 * we may overlap with a request to enable chip-cache
1664 			 * scrubbing.
1665 			 */
1666 			rc = mc_scrubber_enable(mc);
1667 		}
1668 
1669 		if (rc == CMI_SUCCESS && !mc_sw_scrub_disabled++)
1670 			cmi_mc_sw_memscrub_disable();
1671 
1672 		mc_report_testfails(mc);
1673 	}
1674 
1675 	/*
1676 	 * Update nvlist for as far as we have gotten in attach/init.
1677 	 */
1678 	nvlist_free(mc->mc_nvl);
1679 	mc->mc_nvl = mc_nvl_create(mc);
1680 
1681 	rw_exit(&mc_lock);
1682 	return (DDI_SUCCESS);
1683 }
1684 
1685 /*ARGSUSED*/
1686 static int
1687 mc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1688 {
1689 	/*
1690 	 * See the comment about suspend in
1691 	 * mc_attach().
1692 	 */
1693 	if (cmd == DDI_SUSPEND)
1694 		return (DDI_SUCCESS);
1695 	else
1696 		return (DDI_FAILURE);
1697 }
1698 
1699 
/*
 * Device operations for the driver: getinfo, attach and detach are
 * implemented in this file and the character entry points come from
 * mc_cb_ops.
 */
static struct dev_ops mc_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	mc_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	mc_attach,		/* devo_attach */
	mc_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&mc_cb_ops,		/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL,			/* devo_power */
	ddi_quiesce_not_needed,		/* devo_quiesce */
};
1714 
/*
 * Module linkage information for a device driver: the description string
 * for the module and its dev_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops,
	"Memory Controller for AMD processors",
	&mc_ops
};
1720 
/*
 * Linkage passed to mod_install(), mod_info() and mod_remove(); the list
 * of linkage structures holds only modldrv and is NULL-terminated.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
1726 
1727 int
1728 _init(void)
1729 {
1730 	/*
1731 	 * Refuse to load if there is no PCI config space support.
1732 	 */
1733 	if (pci_getl_func == NULL)
1734 		return (ENOTSUP);
1735 
1736 	rw_init(&mc_lock, NULL, RW_DRIVER, NULL);
1737 	return (mod_install(&modlinkage));
1738 }
1739 
/*
 * Module information entry point; simply defers to mod_info() with this
 * module's linkage.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
1745 
1746 int
1747 _fini(void)
1748 {
1749 	int rc;
1750 
1751 	if ((rc = mod_remove(&modlinkage)) != 0)
1752 		return (rc);
1753 
1754 	rw_destroy(&mc_lock);
1755 	return (0);
1756 }
1757