xref: /titanic_52/usr/src/uts/i86pc/cpu/authenticamd/authamd_main.c (revision 380789fc80376bd1573770361cb177a08c7e3524)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
/*
 * "Generic AMD" model-specific support.  If no more-specific support can
 * be found, or such a module declines to initialize, then for AuthenticAMD
 * cpus this module can have a crack at providing some AMD model-specific
 * support that at least goes beyond common MCA architectural features
 * if not down to the nitty-gritty level for a particular model.  We
 * are layered on top of a cpu module, likely cpu.generic, so there is no
 * need for us to perform common architecturally-accessible functions.
 */
38 
39 #include <sys/types.h>
40 #include <sys/cmn_err.h>
41 #include <sys/modctl.h>
42 #include <sys/cpu_module.h>
43 #include <sys/mca_x86.h>
44 #include <sys/pci_cfgspace.h>
45 #include <sys/x86_archext.h>
46 #include <sys/mc_amd.h>
47 #include <sys/fm/protocol.h>
48 #include <sys/fm/cpu/GENAMD.h>
49 #include <sys/nvpair.h>
50 #include <sys/controlregs.h>
51 #include <sys/pghw.h>
52 #include <sys/sunddi.h>
53 #include <sys/sysmacros.h>
54 #include <sys/cpu_module_ms_impl.h>
55 
56 #include "authamd.h"
57 
/*
 * Global off switch.  When nonzero, authamd_init returns ENOTSUP for
 * every handle, so this module provides no model-specific support.
 */
int authamd_ms_support_disable = 0;
59 
/*
 * Convenience groupings of X86_CHIPREV_* revision bits, as the names
 * indicate: family 0xf revisions B through E, family 0xf revisions F
 * and G, and family 0x10 revisions A and B.
 */
#define	AUTHAMD_F_REVS_BCDE \
	(X86_CHIPREV_AMD_F_REV_B | X86_CHIPREV_AMD_F_REV_C0 | \
	X86_CHIPREV_AMD_F_REV_CG | X86_CHIPREV_AMD_F_REV_D | \
	X86_CHIPREV_AMD_F_REV_E)

#define	AUTHAMD_F_REVS_FG \
	(X86_CHIPREV_AMD_F_REV_F | X86_CHIPREV_AMD_F_REV_G)

#define	AUTHAMD_10_REVS_AB \
	(X86_CHIPREV_AMD_10_REV_A | X86_CHIPREV_AMD_10_REV_B)
70 
/*
 * Bitmasks of support for various features.  Try to enable features
 * via inclusion in one of these bitmasks and check that at the
 * feature implementation - that way new family support may often
 * simply need to update these bitmasks.
 */
77 
/*
 * Families that this module will provide some model-specific
 * support for (if no more-specific module claims it first).
 * We try to support whole families rather than differentiate down
 * to revision.  Takes a family number, not a chip revision.
 */
#define	AUTHAMD_SUPPORTED(fam) \
	((fam) == AUTHAMD_FAMILY_6 || (fam) == AUTHAMD_FAMILY_F || \
	(fam) == AUTHAMD_FAMILY_10)

/*
 * The remaining macros take a chip revision (as stored in acs_rev) and
 * are built on X86_CHIPREV_ATLEAST, which is defined outside this file.
 */

/*
 * Families/revisions for which we can recognise main memory ECC errors.
 */
#define	AUTHAMD_MEMECC_RECOGNISED(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have an Online Spare Control Register
 */
#define	AUTHAMD_HAS_ONLINESPARECTL(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions for which we will perform NB MCA Config changes
 */
#define	AUTHAMD_DO_NBMCACFG(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have chip cache scrubbers.
 */
#define	AUTHAMD_HAS_CHIPSCRUB(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that have a NB misc register or registers -
 * evaluates to 0 if no support, otherwise the number of MC4_MISCj:
 * 1 when the family 0xf rev F test passes, else 3 when the family 0x10
 * rev A test passes.
 */
#define	AUTHAMD_NBMISC_NUM(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F)? 1 : \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A) ? 3 : 0))

/*
 * Families/revision for which we wish not to machine check for GART
 * table walk errors - bit 10 of NB CTL.  Used both to veto the enable
 * bit in authamd_bankctl_val and to gate GART error recognition in
 * authamd_disp_match.
 */
#define	AUTHAMD_NOGARTTBLWLK_MC(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))

/*
 * Families/revisions that are potentially L3 capable; actual capability
 * is read from the NB capabilities register in authamd_mca_init.
 */
#define	AUTHAMD_L3CAPABLE(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
137 
/*
 * We recognise main memory ECC errors for AUTHAMD_MEMECC_RECOGNISED
 * revisions as:
 *
 *	- being reported by the NB
 *	- being a compound bus/interconnect error (external to chip)
 *	- having LL of LG
 *	- having II of MEM (but could still be a master/target abort)
 *	- having CECC or UECC set
 *
 * We do not check the extended error code (first nibble of the
 * model-specific error code on AMD) since this has changed from
 * family 0xf to family 0x10 (ext code 0 now reserved on family 0x10).
 * Instead we use CECC/UECC to separate off the master/target
 * abort cases.
 *
 * We insist that the detector be the NorthBridge bank;  although
 * IC/DC can report some main memory errors, they do not capture
 * an address at sufficient resolution to be useful and the NB will
 * report most errors.
 *
 * Note that `status' is expanded several times, so the argument should
 * be free of side effects.  The result is nonzero/zero rather than a
 * strict 1/0 since the final term is a raw bitwise AND.
 */
#define	AUTHAMD_IS_MEMECCERR(bank, status) \
	((bank) == AMD_MCA_BANK_NB && \
	MCAX86_ERRCODE_ISBUS_INTERCONNECT(MCAX86_ERRCODE(status)) && \
	MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \
	MCAX86_ERRCODE_II(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_II_MEM && \
	((status) & (AMD_BANK_STAT_CECC | AMD_BANK_STAT_UECC)))
165 
/*
 * Dispositions for recognised main memory ECC errors.  The address of one
 * of these is returned by authamd_disp_match as the model-specific cookie
 * and later consumed by authamd_ereport_class (class names) and
 * authamd_ereport_add_logout (payload member flags).
 *
 * authamd_disp_match picks the plain "mem" pair when the extended error
 * code is 0 and the "ckmem" pair otherwise; CE/UE follows the CECC/UECC
 * status bits.
 */

/* Correctable memory error, extended error code 0 */
static authamd_error_disp_t authamd_memce_disp = {
	FM_EREPORT_CPU_GENAMD,
	FM_EREPORT_CPU_GENAMD_MEM_CE,
	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_CE
};

/* Uncorrectable memory error, extended error code 0 */
static authamd_error_disp_t authamd_memue_disp = {
	FM_EREPORT_CPU_GENAMD,
	FM_EREPORT_CPU_GENAMD_MEM_UE,
	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_UE
};

/* Correctable memory error, nonzero extended error code */
static authamd_error_disp_t authamd_ckmemce_disp = {
	FM_EREPORT_CPU_GENAMD,
	FM_EREPORT_CPU_GENAMD_CKMEM_CE,
	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_CE
};

/* Uncorrectable memory error, nonzero extended error code */
static authamd_error_disp_t authamd_ckmemue_disp = {
	FM_EREPORT_CPU_GENAMD,
	FM_EREPORT_CPU_GENAMD_CKMEM_UE,
	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_UE
};
189 
/*
 * We recognise GART walk errors as:
 *
 *	- being reported by the NB
 *	- being a compound TLB error
 *	- having LL of LG and TT of GEN
 *	- having UC set
 *	- possibly having PCC set (if source CPU)
 *
 * PCC is not tested by the macro.  `status' is expanded several times,
 * so the argument should be free of side effects.
 */
#define	AUTHAMD_IS_GARTERR(bank, status) \
	((bank) == AMD_MCA_BANK_NB && \
	MCAX86_ERRCODE_ISTLB(MCAX86_ERRCODE(status)) && \
	MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_LL_LG && \
	MCAX86_ERRCODE_TT(MCAX86_ERRCODE(status)) == MCAX86_ERRCODE_TT_GEN && \
	(status) & MSR_MC_STATUS_UC)
205 
/*
 * Disposition returned by authamd_disp_match for GART table walk errors.
 * authamd_error_action compares the returned cookie against the address
 * of this object to decide that the error should be ignored.
 */
static authamd_error_disp_t authamd_gart_disp = {
	FM_EREPORT_CPU_GENAMD,			/* use generic subclass */
	FM_EREPORT_CPU_GENADM_GARTTBLWLK,	/* use generic leafclass */
	0					/* no additional payload */
};
211 
212 
/*
 * Per-chip shared state, indexed by chipid.  A slot is NULL until the
 * first authamd_init for that chip installs a structure with
 * atomic_cas_ptr; nothing in this file frees an installed entry.
 */
static struct authamd_chipshared *authamd_shared[AUTHAMD_MAX_CHIPS];
214 
215 static int
216 authamd_chip_once(authamd_data_t *authamd, enum authamd_cfgonce_bitnum what)
217 {
218 	return (atomic_set_long_excl(&authamd->amd_shared->acs_cfgonce,
219 	    what) == 0 ?  B_TRUE : B_FALSE);
220 }
221 
222 static void
223 authamd_pcicfg_write(uint_t chipid, uint_t func, uint_t reg, uint32_t val)
224 {
225 	ASSERT(chipid + 24 <= 31);
226 	ASSERT((func & 7) == func);
227 	ASSERT((reg & 3) == 0 && reg < 256);
228 
229 	cmi_pci_putl(0, chipid + 24, func, reg, 0, val);
230 }
231 
232 static uint32_t
233 authamd_pcicfg_read(uint_t chipid, uint_t func, uint_t reg)
234 {
235 	ASSERT(chipid + 24 <= 31);
236 	ASSERT((func & 7) == func);
237 	ASSERT((reg & 3) == 0 && reg < 256);
238 
239 	return (cmi_pci_getl(0, chipid + 24, func, reg, 0, 0));
240 }
241 
242 void
243 authamd_bankstatus_prewrite(cmi_hdl_t hdl, authamd_data_t *authamd)
244 {
245 	uint64_t hwcr;
246 
247 	if (cmi_hdl_rdmsr(hdl, MSR_AMD_HWCR, &hwcr) != CMI_SUCCESS)
248 		return;
249 
250 	authamd->amd_hwcr = hwcr;
251 
252 	if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) {
253 		hwcr |= AMD_HWCR_MCI_STATUS_WREN;
254 		(void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr);
255 	}
256 }
257 
258 void
259 authamd_bankstatus_postwrite(cmi_hdl_t hdl, authamd_data_t *authamd)
260 {
261 	uint64_t hwcr = authamd->amd_hwcr;
262 
263 	if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) {
264 		hwcr &= ~AMD_HWCR_MCI_STATUS_WREN;
265 		(void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr);
266 	}
267 }
268 
/*
 * Read EccCnt repeatedly for all possible channel/chip-select combos:
 *
 *	- read sparectl register
 *	- if EccErrCntWrEn is set, clear that bit in the just-read value
 *	  and write it back to sparectl;  this *may* clobber the EccCnt
 *	  for the channel/chip-select combination currently selected, so
 *	  we leave this bit clear if we had to clear it
 *	- cycle through all channel/chip-select combinations writing each
 *	  combination to sparectl before reading the register back for
 *	  EccCnt for that combination;  since EccErrCntWrEn is clear
 *	  the writes to select what count to read will not themselves
 *	  zero any counts
 *
 * The counts are stored in msl->aal_eccerrcnt[chan][cs].
 *
 * Returns 1 if counts were read, or 0 (with aal_eccerrcnt zeroed) if the
 * chip revision has no Online Spare Control Register.
 */
static int
authamd_read_ecccnt(authamd_data_t *authamd, struct authamd_logout *msl)
{
	union mcreg_sparectl sparectl;
	uint_t chipid = authamd->amd_shared->acs_chipid;
	uint_t family = authamd->amd_shared->acs_family;
	uint32_t rev = authamd->amd_shared->acs_rev;
	int chan, cs;

	/*
	 * Check for feature support;  this macro will test down to the
	 * family revision number, whereafter we'll switch on family
	 * assuming that future revisions will use the same register
	 * format.
	 */
	if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) {
		bzero(&msl->aal_eccerrcnt, sizeof (msl->aal_eccerrcnt));
		return (0);
	}

	MCREG_VAL32(&sparectl) =
	    authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);

	/*
	 * Clear the write-enable bit in our local copy only; the first
	 * register write happens inside the loop below.
	 */
	switch (family) {
	case AUTHAMD_FAMILY_F:
		MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 0;
		break;

	case AUTHAMD_FAMILY_10:
		MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 0;
		break;
	}

	for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
		/* Select the DRAM channel in the local copy */
		switch (family) {
		case AUTHAMD_FAMILY_F:
			MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) =
			    chan;
			break;

		case AUTHAMD_FAMILY_10:
			MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) =
			    chan;
			break;
		}

		for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
			/* Select the chip-select in the local copy */
			switch (family) {
			case AUTHAMD_FAMILY_F:
				MCREG_FIELD_F_revFG(&sparectl,
				    EccErrCntDramCs) = cs;
				break;

			case AUTHAMD_FAMILY_10:
				MCREG_FIELD_10_revAB(&sparectl,
				    EccErrCntDramCs) = cs;
				break;
			}

			/*
			 * Write the selection, then read the register back
			 * to pick up the count for that selection.
			 */
			authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
			    MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));

			MCREG_VAL32(&sparectl) = authamd_pcicfg_read(chipid,
			    MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);

			switch (family) {
			case AUTHAMD_FAMILY_F:
				msl->aal_eccerrcnt[chan][cs] =
				    MCREG_FIELD_F_revFG(&sparectl, EccErrCnt);
				break;
			case AUTHAMD_FAMILY_10:
				msl->aal_eccerrcnt[chan][cs] =
				    MCREG_FIELD_10_revAB(&sparectl, EccErrCnt);
				break;
			}
		}
	}

	return (1);
}
363 
/*
 * Clear EccCnt for all possible channel/chip-select combos:
 *
 *	- set EccErrCntWrEn in sparectl
 *	- write 0 to EccCnt for all channel/chip-select combinations
 *
 * If requested (clrint) also disable the interrupts taken on counter
 * overflow and on swap done, in the same initial write.
 *
 * NOTE(review): earlier documentation listed a final "clear EccErrCntWrEn"
 * step, but the code below never performs it - the register is left with
 * the write-enable bit set.  authamd_read_ecccnt clears the bit in the
 * value it writes before reading counts.  Confirm whether a final
 * clearing write is wanted here.
 *
 * Does nothing on revisions without an Online Spare Control Register.
 */
static void
authamd_clear_ecccnt(authamd_data_t *authamd, boolean_t clrint)
{
	union mcreg_sparectl sparectl;
	uint_t chipid = authamd->amd_shared->acs_chipid;
	uint_t family = authamd->amd_shared->acs_family;
	uint32_t rev = authamd->amd_shared->acs_rev;
	int chan, cs;

	if (!AUTHAMD_HAS_ONLINESPARECTL(rev))
		return;

	MCREG_VAL32(&sparectl) =
	    authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);

	/* Enable count writes and optionally turn off the two interrupts */
	switch (family) {
	case AUTHAMD_FAMILY_F:
		MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 1;
		if (clrint) {
			MCREG_FIELD_F_revFG(&sparectl, EccErrInt) = 0;
			MCREG_FIELD_F_revFG(&sparectl, SwapDoneInt) = 0;
		}
		break;

	case AUTHAMD_FAMILY_10:
		MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 1;
		if (clrint) {
			MCREG_FIELD_10_revAB(&sparectl, EccErrInt) = 0;
			MCREG_FIELD_10_revAB(&sparectl, SwapDoneInt) = 0;
		}
		break;
	}

	authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
	    MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));

	for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
		/* Select the DRAM channel in the local copy */
		switch (family) {
		case AUTHAMD_FAMILY_F:
			MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) =
			    chan;
			break;

		case AUTHAMD_FAMILY_10:
			MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) =
			    chan;
			break;
		}

		for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
			/* Select the chip-select and a zero count */
			switch (family) {
			case AUTHAMD_FAMILY_F:
				MCREG_FIELD_F_revFG(&sparectl,
				    EccErrCntDramCs) = cs;
				MCREG_FIELD_F_revFG(&sparectl,
				    EccErrCnt) = 0;
				break;

			case AUTHAMD_FAMILY_10:
				MCREG_FIELD_10_revAB(&sparectl,
				    EccErrCntDramCs) = cs;
				MCREG_FIELD_10_revAB(&sparectl,
				    EccErrCnt) = 0;
				break;
			}

			/* One write per combination zeroes that count */
			authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
			    MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));
		}
	}
}
445 
446 /*
447  * cms_init entry point.
448  *
449  * This module provides broad model-specific support for AMD families
450  * 0x6, 0xf and 0x10.  Future families will have to be evaluated once their
451  * documentation is available.
452  */
453 int
454 authamd_init(cmi_hdl_t hdl, void **datap)
455 {
456 	uint_t chipid = cmi_hdl_chipid(hdl);
457 	struct authamd_chipshared *sp, *osp;
458 	uint_t family = cmi_hdl_family(hdl);
459 	authamd_data_t *authamd;
460 	uint64_t cap;
461 
462 	if (authamd_ms_support_disable || !AUTHAMD_SUPPORTED(family))
463 		return (ENOTSUP);
464 
465 	if (!(x86_feature & X86_MCA))
466 		return (ENOTSUP);
467 
468 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
469 		return (ENOTSUP);
470 
471 	if (!(cap & MCG_CAP_CTL_P))
472 		return (ENOTSUP);
473 
474 	authamd = *datap = kmem_zalloc(sizeof (authamd_data_t), KM_SLEEP);
475 	cmi_hdl_hold(hdl);	/* release in fini */
476 	authamd->amd_hdl = hdl;
477 
478 	if ((sp = authamd_shared[chipid]) == NULL) {
479 		sp = kmem_zalloc(sizeof (struct authamd_chipshared), KM_SLEEP);
480 		osp = atomic_cas_ptr(&authamd_shared[chipid], NULL, sp);
481 		if (osp != NULL) {
482 			kmem_free(sp, sizeof (struct authamd_chipshared));
483 			sp = osp;
484 		} else {
485 			sp->acs_chipid = chipid;
486 			sp->acs_family = family;
487 			sp->acs_rev = cmi_hdl_chiprev(hdl);
488 		}
489 	}
490 	authamd->amd_shared = sp;
491 
492 	return (0);
493 }
494 
495 /*
496  * cms_logout_size entry point.
497  */
498 /*ARGSUSED*/
499 size_t
500 authamd_logout_size(cmi_hdl_t hdl)
501 {
502 	return (sizeof (struct authamd_logout));
503 }
504 
505 /*
506  * cms_mcgctl_val entry point
507  *
508  * Instead of setting all bits to 1 we can set just those for the
509  * error detector banks known to exist.
510  */
511 /*ARGSUSED*/
512 uint64_t
513 authamd_mcgctl_val(cmi_hdl_t hdl, int nbanks, uint64_t proposed)
514 {
515 	return (nbanks < 64 ? (1ULL << nbanks) - 1 : proposed);
516 }
517 
518 /*
519  * cms_bankctl_skipinit entry point
520  *
521  * On K6 we do not initialize MC0_CTL since, reportedly, this bank (for DC)
522  * may produce spurious machine checks.
523  */
524 /*ARGSUSED*/
525 boolean_t
526 authamd_bankctl_skipinit(cmi_hdl_t hdl, int bank)
527 {
528 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
529 
530 	return (authamd->amd_shared->acs_family == AUTHAMD_FAMILY_6 &&
531 	    bank == 0 ?  B_TRUE : B_FALSE);
532 }
533 
534 /*
535  * cms_bankctl_val entry point
536  */
537 uint64_t
538 authamd_bankctl_val(cmi_hdl_t hdl, int bank, uint64_t proposed)
539 {
540 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
541 	uint32_t rev = authamd->amd_shared->acs_rev;
542 	uint64_t val = proposed;
543 
544 	/*
545 	 * The Intel MCA says we can write all 1's to enable #MC for
546 	 * all errors, and AMD docs say much the same.  But, depending
547 	 * perhaps on other config registers, taking machine checks
548 	 * for some errors such as GART TLB errors and master/target
549 	 * aborts may be bad - they set UC and sometime also PCC, but
550 	 * we should not always panic for these error types.
551 	 *
552 	 * Our cms_error_action entry point can suppress such panics,
553 	 * however we can also use the cms_bankctl_val entry point to
554 	 * veto enabling of some of the known villains in the first place.
555 	 */
556 	if (bank == AMD_MCA_BANK_NB && AUTHAMD_NOGARTTBLWLK_MC(rev))
557 		val &= ~AMD_NB_EN_GARTTBLWK;
558 
559 	return (val);
560 }
561 
/*
 * Bits to add to NB MCA config (after watchdog config).
 *
 * Note that authamd_mca_init clears AMD_NB_CFG_CPUECCERREN from this
 * variable itself when it configures a family 0x10 chip.
 */
uint32_t authamd_nb_mcacfg_add = AMD_NB_CFG_ADD_CMN;

/*
 * Bits to remove from NB MCA config (after watchdog config).  Removal is
 * applied before authamd_nb_mcacfg_add, so a bit in both ends up set.
 */
uint32_t authamd_nb_mcacfg_remove = AMD_NB_CFG_REMOVE_CMN;

/*
 * NB Watchdog policy, and rate we use if enabling.  As applied by
 * authamd_mca_init:
 *
 *	LEAVEALONE		watchdog bits are not touched
 *	DISABLE			rate fields cleared, disable bit set
 *	ENABLE_IF_DISABLED	if currently disabled, enable at our rate;
 *				if already enabled leave its rate intact
 *	ENABLE_FORCE_RATE	enable at our rate regardless
 *
 * An unrecognised value is reported and treated as ENABLE_IF_DISABLED.
 */
enum {
	AUTHAMD_NB_WDOG_LEAVEALONE,
	AUTHAMD_NB_WDOG_DISABLE,
	AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED,
	AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE
} authamd_nb_watchdog_policy = AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED;

/* Watchdog count/base selection OR-ed into NB config when we enable it */
uint32_t authamd_nb_mcacfg_wdog = AMD_NB_CFG_WDOGTMRCNTSEL_4095 |
    AMD_NB_CFG_WDOGTMRBASESEL_1MS;
584 
/*
 * Per-core cache scrubbing policy and rates.
 *
 * Rates are register encodings, not frequencies: a numerically smaller
 * nonzero value scrubs faster, so "higher of ours and BIOS rate" is
 * computed in authamd_scrubrate as the numeric minimum of the two nonzero
 * values.  If either value is 0 the other one is used.
 *
 * authamd_mca_init overwrites the three rate variables below with the
 * rates it actually programs.
 */
enum {
	AUTHAMD_SCRUB_BIOSDEFAULT,	/* leave as BIOS configured */
	AUTHAMD_SCRUB_FIXED,		/* assign our chosen rate */
	AUTHAMD_SCRUB_MAX		/* use higher of ours and BIOS rate */
} authamd_scrub_policy = AUTHAMD_SCRUB_MAX;

uint32_t authamd_scrub_rate_dcache = 0xf;	/* 64K per 0.67 seconds */
uint32_t authamd_scrub_rate_l2cache = 0xe;	/* 1MB per 5.3 seconds */
uint32_t authamd_scrub_rate_l3cache = 0xd;	/* 1MB per 2.7 seconds */
597 
598 static uint32_t
599 authamd_scrubrate(uint32_t osrate, uint32_t biosrate, const char *varnm)
600 {
601 	uint32_t rate;
602 
603 	if (osrate > AMD_NB_SCRUBCTL_RATE_MAX) {
604 		cmn_err(CE_WARN, "%s is too large, resetting to 0x%x\n",
605 		    varnm, AMD_NB_SCRUBCTL_RATE_MAX);
606 		osrate = AMD_NB_SCRUBCTL_RATE_MAX;
607 	}
608 
609 	switch (authamd_scrub_policy) {
610 	case AUTHAMD_SCRUB_FIXED:
611 		rate = osrate;
612 		break;
613 
614 	default:
615 		cmn_err(CE_WARN, "Unknown authamd_scrub_policy %d - "
616 		    "using default policy of AUTHAMD_SCRUB_MAX",
617 		    authamd_scrub_policy);
618 		/*FALLTHRU*/
619 
620 	case AUTHAMD_SCRUB_MAX:
621 		if (osrate != 0 && biosrate != 0)
622 			rate = MIN(osrate, biosrate);	/* small is fast */
623 		else
624 			rate = osrate ? osrate : biosrate;
625 	}
626 
627 	return (rate);
628 }
629 
/*
 * cms_mca_init entry point.
 *
 * Performs chip-wide MCA related configuration.  Each of the four steps
 * below is guarded by authamd_chip_once so that it is performed by only
 * one handle per chip:
 *
 *	1. clear the NB ECC error counts and their interrupts
 *	2. clear IntType in the NB misc (threshold) register(s)
 *	3. adjust the NB MCA Configuration register (watchdog, add/remove)
 *	4. program cache scrub rates
 *
 * Each step is also conditional on the chip revision supporting it.
 */
/*ARGSUSED*/
void
authamd_mca_init(cmi_hdl_t hdl, int nbanks)
{
	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
	uint32_t rev = authamd->amd_shared->acs_rev;
	uint_t chipid = authamd->amd_shared->acs_chipid;

	/*
	 * On chips with a NB online spare control register take control
	 * and clear ECC counts.
	 */
	if (AUTHAMD_HAS_ONLINESPARECTL(rev) &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_ONLNSPRCFG)) {
		authamd_clear_ecccnt(authamd, B_TRUE);
	}

	/*
	 * And since we are claiming the telemetry stop the BIOS receiving
	 * an SMI on NB threshold overflow.
	 */
	if (AUTHAMD_NBMISC_NUM(rev) &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBTHRESH)) {
		union mcmsr_nbmisc nbm;
		int i;

		/* Bracket the MSR writes with the HWCR write-enable dance */
		authamd_bankstatus_prewrite(hdl, authamd);

		for (i = 0; i < AUTHAMD_NBMISC_NUM(rev); i++) {
			if (cmi_hdl_rdmsr(hdl, MC_MSR_NB_MISC(i),
			    (uint64_t *)&nbm) != CMI_SUCCESS)
				continue;

			/*
			 * Only touch IntType if the register is valid and
			 * a counter is present, using the field layout
			 * for the matching family.
			 */
			if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) &&
			    MCMSR_FIELD_F_revFG(&nbm, mcmisc_Valid) &&
			    MCMSR_FIELD_F_revFG(&nbm, mcmisc_CntP)) {
				MCMSR_FIELD_F_revFG(&nbm, mcmisc_IntType) = 0;
			} else if (X86_CHIPREV_ATLEAST(rev,
			    X86_CHIPREV_AMD_10_REV_A) &&
			    MCMSR_FIELD_10_revAB(&nbm, mcmisc_Valid) &&
			    MCMSR_FIELD_10_revAB(&nbm, mcmisc_CntP)) {
				MCMSR_FIELD_10_revAB(&nbm, mcmisc_IntType) = 0;
			}

			/*
			 * The value is written back whether or not it was
			 * modified above.
			 */
			(void) cmi_hdl_wrmsr(hdl, MC_MSR_NB_MISC(i),
			    MCMSR_VAL(&nbm));
		}

		authamd_bankstatus_postwrite(hdl, authamd);
	}

	/*
	 * NB MCA Configuration Register.
	 */
	if (AUTHAMD_DO_NBMCACFG(rev) &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCACFG)) {
		uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
		    MC_CTL_REG_NBCFG);

		/*
		 * Watchdog handling.  The case order matters: an unknown
		 * policy falls into ENABLE_IF_DISABLED, which in turn
		 * falls into ENABLE_FORCE_RATE when the watchdog is
		 * currently disabled.
		 */
		switch (authamd_nb_watchdog_policy) {
		case AUTHAMD_NB_WDOG_LEAVEALONE:
			break;

		case AUTHAMD_NB_WDOG_DISABLE:
			val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK |
			    AMD_NB_CFG_WDOGTMRCNTSEL_MASK);
			val |= AMD_NB_CFG_WDOGTMRDIS;
			break;

		default:
			cmn_err(CE_NOTE, "authamd_nb_watchdog_policy=%d "
			    "unrecognised, using default policy",
			    authamd_nb_watchdog_policy);
			/*FALLTHRU*/

		case AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED:
			if (!(val & AMD_NB_CFG_WDOGTMRDIS))
				break;	/* if enabled leave rate intact */
			/*FALLTHRU*/

		case AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE:
			val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK |
			    AMD_NB_CFG_WDOGTMRCNTSEL_MASK |
			    AMD_NB_CFG_WDOGTMRDIS);
			val |= authamd_nb_mcacfg_wdog;
			break;
		}

		/*
		 * Bit 0 of the NB MCA Config register is reserved on family
		 * 0x10.
		 *
		 * NOTE(review): this edits the global tunable rather than a
		 * local copy, so the change persists for every chip that is
		 * configured afterwards.
		 */
		if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
			authamd_nb_mcacfg_add &= ~AMD_NB_CFG_CPUECCERREN;

		/* Remove first, then add: a bit in both masks ends up set */
		val &= ~authamd_nb_mcacfg_remove;
		val |= authamd_nb_mcacfg_add;

		authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_NBCFG,
		    val);
	}

	/*
	 * Cache scrubbing.  We can't enable DRAM scrubbing since
	 * we don't know the DRAM base for this node.
	 */
	if (AUTHAMD_HAS_CHIPSCRUB(rev) &&
	    authamd_scrub_policy != AUTHAMD_SCRUB_BIOSDEFAULT &&
	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_CACHESCRUB)) {
		uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
		    MC_CTL_REG_SCRUBCTL);
		int l3cap = 0;

		if (AUTHAMD_L3CAPABLE(rev)) {
			l3cap = (authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
			    MC_CTL_REG_NBCAP) & MC_NBCAP_L3CAPABLE) != 0;
		}

		/*
		 * NOTE(review): the chosen rates are stored back into the
		 * global tunables, so they become the "os" rate input when
		 * a later chip is configured; in particular the L3 rate
		 * becomes 0 after a chip without L3 capability.
		 */
		authamd_scrub_rate_dcache =
		    authamd_scrubrate(authamd_scrub_rate_dcache,
		    (val & AMD_NB_SCRUBCTL_DC_MASK) >> AMD_NB_SCRUBCTL_DC_SHIFT,
		    "authamd_scrub_rate_dcache");

		authamd_scrub_rate_l2cache =
		    authamd_scrubrate(authamd_scrub_rate_l2cache,
		    (val & AMD_NB_SCRUBCTL_L2_MASK) >> AMD_NB_SCRUBCTL_L2_SHIFT,
		    "authamd_scrub_rate_l2cache");

		authamd_scrub_rate_l3cache = l3cap ?
		    authamd_scrubrate(authamd_scrub_rate_l3cache,
		    (val & AMD_NB_SCRUBCTL_L3_MASK) >> AMD_NB_SCRUBCTL_L3_SHIFT,
		    "authamd_scrub_rate_l3cache") : 0;

		/* Rebuild the register, carrying over the DRAM scrub field */
		val = AMD_NB_MKSCRUBCTL(authamd_scrub_rate_l3cache,
		    authamd_scrub_rate_dcache, authamd_scrub_rate_l2cache,
		    val & AMD_NB_SCRUBCTL_DRAM_MASK);

		authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
		    MC_CTL_REG_SCRUBCTL, val);
	}

}
775 
776 /*
777  * cms_poll_ownermask entry point.
778  */
779 uint64_t
780 authamd_poll_ownermask(cmi_hdl_t hdl, hrtime_t pintvl)
781 {
782 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
783 	struct authamd_chipshared *acsp = authamd->amd_shared;
784 	hrtime_t now = gethrtime_waitfree();
785 	hrtime_t last = acsp->acs_poll_timestamp;
786 	int dopoll = 0;
787 
788 	if (now - last > 2 * pintvl || last == 0) {
789 		acsp->acs_pollowner = hdl;
790 		dopoll = 1;
791 	} else if (acsp->acs_pollowner == hdl) {
792 		dopoll = 1;
793 	}
794 
795 	if (dopoll)
796 		acsp->acs_poll_timestamp = now;
797 
798 	return (dopoll ? -1ULL : ~(1 << AMD_MCA_BANK_NB));
799 
800 }
801 
802 /*
803  * cms_bank_logout entry point.
804  */
805 /*ARGSUSED*/
806 void
807 authamd_bank_logout(cmi_hdl_t hdl, int bank, uint64_t status,
808     uint64_t addr, uint64_t misc, void *mslogout)
809 {
810 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
811 	struct authamd_logout *msl = mslogout;
812 	uint32_t rev = authamd->amd_shared->acs_rev;
813 
814 	if (msl == NULL)
815 		return;
816 
817 	/*
818 	 * For main memory ECC errors on revisions with an Online Spare
819 	 * Control Register grab the ECC counts by channel and chip-select
820 	 * and reset them to 0.
821 	 */
822 	if (AUTHAMD_MEMECC_RECOGNISED(rev) &&
823 	    AUTHAMD_IS_MEMECCERR(bank, status) &&
824 	    AUTHAMD_HAS_ONLINESPARECTL(rev)) {
825 		if (authamd_read_ecccnt(authamd, msl))
826 			authamd_clear_ecccnt(authamd, B_FALSE);
827 	}
828 }
829 
830 /*
831  * cms_error_action entry point
832  */
833 
/*
 * Test/debug knobs consulted by authamd_error_action.  If any of them
 * contributes a flag, authamd_error_action returns those flags
 * immediately without examining the error itself.
 */
int authamd_forgive_uc = 0;	/* For test/debug only */
int authamd_forgive_pcc = 0;	/* For test/debug only */
int authamd_fake_poison = 0;	/* For test/debug only */
837 
838 /*ARGSUSED*/
839 uint32_t
840 authamd_error_action(cmi_hdl_t hdl, int ismc, int bank,
841     uint64_t status, uint64_t addr, uint64_t misc, void *mslogout)
842 {
843 	authamd_error_disp_t *disp;
844 	uint32_t rv = 0;
845 
846 	if (authamd_forgive_uc)
847 		rv |= CMS_ERRSCOPE_CLEARED_UC;
848 
849 	if (authamd_forgive_pcc)
850 		rv |= CMS_ERRSCOPE_CURCONTEXT_OK;
851 
852 	if (authamd_fake_poison && status & MSR_MC_STATUS_UC)
853 		rv |= CMS_ERRSCOPE_POISONED;
854 
855 	if (rv)
856 		return (rv);
857 
858 	disp = authamd_disp_match(hdl, bank, status, addr, misc, mslogout);
859 
860 	if (disp == &authamd_gart_disp) {
861 		/*
862 		 * GART walk errors set UC and possibly PCC (if source CPU)
863 		 * but should not be regarded as terminal.
864 		 */
865 		return (CMS_ERRSCOPE_IGNORE_ERR);
866 	}
867 
868 	/*
869 	 * May also want to consider master abort and target abort.  These
870 	 * also set UC and PCC (if src CPU) but the requester gets -1
871 	 * and I believe the IO stuff in Solaris will handle that.
872 	 */
873 
874 	return (rv);
875 }
876 
877 /*
878  * cms_disp_match entry point
879  */
880 /*ARGSUSED*/
881 cms_cookie_t
882 authamd_disp_match(cmi_hdl_t hdl, int bank, uint64_t status,
883     uint64_t addr, uint64_t misc, void *mslogout)
884 {
885 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
886 	/* uint16_t errcode = MCAX86_ERRCODE(status); */
887 	uint16_t exterrcode = AMD_EXT_ERRCODE(status);
888 	uint32_t rev = authamd->amd_shared->acs_rev;
889 
890 	/*
891 	 * Recognise main memory ECC errors
892 	 */
893 	if (AUTHAMD_MEMECC_RECOGNISED(rev) &&
894 	    AUTHAMD_IS_MEMECCERR(bank, status)) {
895 		if (status & AMD_BANK_STAT_CECC) {
896 			return (exterrcode == 0 ? &authamd_memce_disp :
897 			    &authamd_ckmemce_disp);
898 		} else if (status & AMD_BANK_STAT_UECC) {
899 			return (exterrcode == 0 ? &authamd_memue_disp :
900 			    &authamd_ckmemue_disp);
901 		}
902 	}
903 
904 	/*
905 	 * Recognise GART walk errors
906 	 */
907 	if (AUTHAMD_NOGARTTBLWLK_MC(rev) && AUTHAMD_IS_GARTERR(bank, status))
908 		return (&authamd_gart_disp);
909 
910 	return (NULL);
911 }
912 
913 /*
914  * cms_ereport_class entry point
915  */
916 /*ARGSUSED*/
917 void
918 authamd_ereport_class(cmi_hdl_t hdl, cms_cookie_t mscookie,
919     const char **cpuclsp, const char **leafclsp)
920 {
921 	const authamd_error_disp_t *aed = mscookie;
922 
923 	if (aed == NULL)
924 		return;
925 
926 	if (aed->aad_subclass != NULL)
927 		*cpuclsp = aed->aad_subclass;
928 	if (aed->aad_leafclass != NULL)
929 		*leafclsp = aed->aad_leafclass;
930 }
931 
932 /*ARGSUSED*/
933 static void
934 authamd_ereport_add_resource(cmi_hdl_t hdl, authamd_data_t *authamd,
935     nvlist_t *ereport, nv_alloc_t *nva, void *mslogout)
936 {
937 	nvlist_t *elems[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS];
938 	uint8_t counts[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS];
939 	authamd_logout_t *msl;
940 	nvlist_t *nvl;
941 	int nelems = 0;
942 	int i, chan, cs;
943 
944 	if ((msl = mslogout) == NULL)
945 		return;
946 
947 	for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
948 		for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
949 			if (msl->aal_eccerrcnt[chan][cs] == 0)
950 				continue;
951 
952 			if ((nvl = fm_nvlist_create(nva)) == NULL)
953 				continue;
954 
955 			elems[nelems] = nvl;
956 			counts[nelems++] = msl->aal_eccerrcnt[chan][cs];
957 
958 			fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 5,
959 			    "motherboard", 0,
960 			    "chip", authamd->amd_shared->acs_chipid,
961 			    "memory-controller", 0,
962 			    "dram-channel", chan,
963 			    "chip-select", cs);
964 		}
965 	}
966 
967 	if (nelems == 0)
968 		return;
969 
970 	fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCE,
971 	    DATA_TYPE_NVLIST_ARRAY, nelems, elems,
972 	    NULL);
973 
974 	fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCECNT,
975 	    DATA_TYPE_UINT8_ARRAY, nelems, &counts[0],
976 	    NULL);
977 
978 	for (i = 0; i < nelems; i++)
979 		fm_nvlist_destroy(elems[i], nva ? FM_NVA_RETAIN : FM_NVA_FREE);
980 }
981 
982 /*
983  * cms_ereport_add_logout entry point
984  */
985 /*ARGSUSED*/
986 void
987 authamd_ereport_add_logout(cmi_hdl_t hdl, nvlist_t *ereport, nv_alloc_t *nva,
988     int bank, uint64_t status, uint64_t addr, uint64_t misc,
989     void *mslogout, cms_cookie_t mscookie)
990 {
991 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
992 	const authamd_error_disp_t *aed = mscookie;
993 	uint64_t members;
994 
995 	if (aed == NULL)
996 		return;
997 
998 	members = aed->aad_ereport_members;
999 
1000 	if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYND) {
1001 		fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_SYND,
1002 		    DATA_TYPE_UINT16, (uint16_t)AMD_BANK_SYND(status),
1003 		    NULL);
1004 
1005 		if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) {
1006 			fm_payload_set(ereport,
1007 			    FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE,
1008 			    DATA_TYPE_STRING, "E",
1009 			    NULL);
1010 		}
1011 	}
1012 
1013 	if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_CKSYND) {
1014 		fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_CKSYND,
1015 		    DATA_TYPE_UINT16, (uint16_t)AMD_NB_STAT_CKSYND(status),
1016 		    NULL);
1017 
1018 		if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) {
1019 			fm_payload_set(ereport,
1020 			    FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE,
1021 			    DATA_TYPE_STRING, "C",
1022 			    NULL);
1023 		}
1024 	}
1025 
1026 	if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_RESOURCE &&
1027 	    status & MSR_MC_STATUS_ADDRV) {
1028 		authamd_ereport_add_resource(hdl, authamd, ereport, nva,
1029 		    mslogout);
1030 	}
1031 }
1032 
1033 /*
1034  * cms_msrinject entry point
1035  */
1036 cms_errno_t
1037 authamd_msrinject(cmi_hdl_t hdl, uint_t msr, uint64_t val)
1038 {
1039 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
1040 	cms_errno_t rv = CMSERR_BADMSRWRITE;
1041 
1042 	authamd_bankstatus_prewrite(hdl, authamd);
1043 	if (cmi_hdl_wrmsr(hdl, msr, val) == CMI_SUCCESS)
1044 		rv = CMS_SUCCESS;
1045 	authamd_bankstatus_postwrite(hdl, authamd);
1046 
1047 	return (rv);
1048 }
1049 
/* Version of the cpu model-specific (cms) interface we implement */
cms_api_ver_t _cms_api_version = CMS_API_VERSION_0;

/*
 * Our cms operations vector.  Entries are positional; NULL marks an
 * entry point this module does not implement.  Note that cms_fini is
 * NULL, so nothing here undoes the allocations and handle hold made in
 * authamd_init.
 */
const cms_ops_t _cms_ops = {
	authamd_init,			/* cms_init */
	NULL,				/* cms_post_startup */
	NULL,				/* cms_post_mpstartup */
	authamd_logout_size,		/* cms_logout_size */
	authamd_mcgctl_val,		/* cms_mcgctl_val */
	authamd_bankctl_skipinit,	/* cms_bankctl_skipinit */
	authamd_bankctl_val,		/* cms_bankctl_val */
	NULL,				/* cms_bankstatus_skipinit */
	NULL,				/* cms_bankstatus_val */
	authamd_mca_init,		/* cms_mca_init */
	authamd_poll_ownermask,		/* cms_poll_ownermask */
	authamd_bank_logout,		/* cms_bank_logout */
	authamd_error_action,		/* cms_error_action */
	authamd_disp_match,		/* cms_disp_match */
	authamd_ereport_class,		/* cms_ereport_class */
	NULL,				/* cms_ereport_detector */
	NULL,				/* cms_ereport_includestack */
	authamd_ereport_add_logout,	/* cms_ereport_add_logout */
	authamd_msrinject,		/* cms_msrinject */
	NULL,				/* cms_fini */
};
1074 
/* Module linkage: a cpu module using mod_cpuops, with its description */
static struct modlcpu modlcpu = {
	&mod_cpuops,
	"Generic AMD model-specific MCA"
};

/* NULL-terminated list of linkage structures for mod_install et al */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlcpu,
	NULL
};
1085 
1086 int
1087 _init(void)
1088 {
1089 	return (mod_install(&modlinkage));
1090 }
1091 
1092 int
1093 _info(struct modinfo *modinfop)
1094 {
1095 	return (mod_info(&modlinkage, modinfop));
1096 }
1097 
1098 int
1099 _fini(void)
1100 {
1101 	return (mod_remove(&modlinkage));
1102 }
1103