xref: /titanic_50/usr/src/uts/i86pc/cpu/authenticamd/authamd_main.c (revision 65c8f1c0a342917e5c22dcf2b006e6307631ed67)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * "Generic AMD" model-specific support.  If no more-specific support can
31  * be found, or such a module declines to initialize, then for AuthenticAMD
32  * cpus this module can have a crack at providing some AMD model-specific
33  * support that at least goes beyond common MCA architectural features
34  * if not down to the nitty-gritty level for a particular model.  We
35  * are layered on top of a cpu module, likely cpu.generic, so there is no
36  * need for us to perform common architecturally-accessible functions.
37  */
38 
39 #include <sys/types.h>
40 #include <sys/cmn_err.h>
41 #include <sys/modctl.h>
42 #include <sys/cpu_module.h>
43 #include <sys/mca_x86.h>
44 #include <sys/pci_cfgspace.h>
45 #include <sys/x86_archext.h>
46 #include <sys/mc_amd.h>
47 #include <sys/fm/protocol.h>
48 #include <sys/fm/cpu/GENAMD.h>
49 #include <sys/nvpair.h>
50 #include <sys/controlregs.h>
51 #include <sys/pghw.h>
52 #include <sys/sunddi.h>
53 #include <sys/sysmacros.h>
54 #include <sys/cpu_module_ms_impl.h>
55 
56 #include "authamd.h"
57 
/*
 * When nonzero authamd_init declines (ENOTSUP) for every cpu, so this
 * module provides no model-specific support at all.
 */
int authamd_ms_support_disable = 0;
59 
/* Family 0xf revisions B, C0, CG, D and E. */
#define	AUTHAMD_F_REVS_BCDE \
	(X86_CHIPREV_AMD_F_REV_B | \
	X86_CHIPREV_AMD_F_REV_C0 | \
	X86_CHIPREV_AMD_F_REV_CG | \
	X86_CHIPREV_AMD_F_REV_D | \
	X86_CHIPREV_AMD_F_REV_E)

/* Family 0xf revisions F and G. */
#define	AUTHAMD_F_REVS_FG \
	(X86_CHIPREV_AMD_F_REV_F | \
	X86_CHIPREV_AMD_F_REV_G)

/* Family 0x10 revisions A and B. */
#define	AUTHAMD_10_REVS_AB \
	(X86_CHIPREV_AMD_10_REV_A | \
	X86_CHIPREV_AMD_10_REV_B)
70 
71 /*
72  * Bitmasks of support for various features.  Try to enable features
73  * via inclusion in one of these bitmasks and check that at the
74  * feature implementation - that way new family support may often
75  * simply need to update these bitmasks.
76  */
77 
/*
 * Families for which this module offers some model-specific support,
 * provided no more-specific module has claimed the cpu first.  Support
 * is decided per family; we try not to differentiate down to revision.
 */
#define	AUTHAMD_SUPPORTED(fam) \
	((fam) == AUTHAMD_FAMILY_6 || \
	(fam) == AUTHAMD_FAMILY_F || \
	(fam) == AUTHAMD_FAMILY_10)
87 
/*
 * True for chip revisions that include an on-chip NorthBridge:
 * family 0xf from revision B, and family 0x10 from revision A.
 */
#define	AUTHAMD_NBONCHIP(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
94 
/*
 * True for chip revisions on which we can recognise main memory ECC
 * errors: family 0xf from revision B, and family 0x10 from revision A.
 */
#define	AUTHAMD_MEMECC_RECOGNISED(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
101 
/*
 * True for chip revisions that have an Online Spare Control Register:
 * family 0xf from revision F, and family 0x10 from revision A.
 */
#define	AUTHAMD_HAS_ONLINESPARECTL(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) || \
	    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
108 
/*
 * True for chip revisions on which we will modify the NB MCA Config
 * register: family 0xf from revision B, and family 0x10 from revision A.
 */
#define	AUTHAMD_DO_NBMCACFG(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
115 
/*
 * True for chip revisions that have chip cache scrubbers:
 * family 0xf from revision B, and family 0x10 from revision A.
 */
#define	AUTHAMD_HAS_CHIPSCRUB(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
122 
/*
 * Number of NB misc registers (MC4_MISCj) for a chip revision:
 * 1 for family 0xf from revision F, 3 for family 0x10 from revision A,
 * and 0 (which also means "not supported") for anything else.
 */
#define	AUTHAMD_NBMISC_NUM(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) ? 1 : \
	    (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A) ? 3 : 0))
130 
/*
 * True for chip revisions on which we do not want to take a machine
 * check for GART table walk errors (bit 10 of NB CTL): family 0xf from
 * revision B, and family 0x10 from revision A.
 */
#define	AUTHAMD_NOGARTTBLWLK_MC(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B) || \
	    X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
138 
/*
 * True for chip revisions that are potentially L3 capable:
 * family 0x10 from revision A.
 */
#define	AUTHAMD_L3CAPABLE(rev) \
	(X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
144 
/*
 * Main memory ECC error recognition for AUTHAMD_MEMECC_RECOGNISED
 * revisions.  An error qualifies when all of the following hold:
 *
 *	- the reporting bank is the NB
 *	- the error code is a compound bus/interconnect error (external
 *	  to the chip)
 *	- LL is LG
 *	- II is MEM (which could still be a master/target abort)
 *	- CECC or UECC is set in the status
 *
 * The extended error code (first nibble of the model-specific error
 * code on AMD) is deliberately not checked, since its meaning changed
 * between family 0xf and family 0x10 (ext code 0 is reserved on family
 * 0x10).  CECC/UECC is used instead to separate off the master/target
 * abort cases.
 *
 * The detector must be the NorthBridge bank:  IC/DC can report some
 * main memory errors but do not capture an address at a useful
 * resolution, and the NB will report most errors anyway.
 */
#define	AUTHAMD_IS_MEMECCERR(bank, status) \
	((bank) == AMD_MCA_BANK_NB && \
	    MCAX86_ERRCODE_ISBUS_INTERCONNECT(MCAX86_ERRCODE(status)) && \
	    MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == \
	    MCAX86_ERRCODE_LL_LG && \
	    MCAX86_ERRCODE_II(MCAX86_ERRCODE(status)) == \
	    MCAX86_ERRCODE_II_MEM && \
	    ((status) & (AMD_BANK_STAT_CECC | AMD_BANK_STAT_UECC)))
172 
173 static authamd_error_disp_t authamd_memce_disp = {
174 	FM_EREPORT_CPU_GENAMD,
175 	FM_EREPORT_CPU_GENAMD_MEM_CE,
176 	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_CE
177 };
178 
179 static authamd_error_disp_t authamd_memue_disp = {
180 	FM_EREPORT_CPU_GENAMD,
181 	FM_EREPORT_CPU_GENAMD_MEM_UE,
182 	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_MEM_UE
183 };
184 
185 static authamd_error_disp_t authamd_ckmemce_disp = {
186 	FM_EREPORT_CPU_GENAMD,
187 	FM_EREPORT_CPU_GENAMD_CKMEM_CE,
188 	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_CE
189 };
190 
191 static authamd_error_disp_t authamd_ckmemue_disp = {
192 	FM_EREPORT_CPU_GENAMD,
193 	FM_EREPORT_CPU_GENAMD_CKMEM_UE,
194 	FM_EREPORT_GENAMD_PAYLOAD_FLAGS_CKMEM_UE
195 };
196 
/*
 * GART walk error recognition.  An error qualifies when:
 *
 *	- the reporting bank is the NB
 *	- the error code is a compound TLB error
 *	- LL is LG and TT is GEN
 *	- UC is set
 *
 * PCC may or may not be set (it is set if this is the source CPU), so
 * it is not tested.
 */
#define	AUTHAMD_IS_GARTERR(bank, status) \
	((bank) == AMD_MCA_BANK_NB && \
	    MCAX86_ERRCODE_ISTLB(MCAX86_ERRCODE(status)) && \
	    MCAX86_ERRCODE_LL(MCAX86_ERRCODE(status)) == \
	    MCAX86_ERRCODE_LL_LG && \
	    MCAX86_ERRCODE_TT(MCAX86_ERRCODE(status)) == \
	    MCAX86_ERRCODE_TT_GEN && \
	    ((status) & MSR_MC_STATUS_UC))
212 
213 static authamd_error_disp_t authamd_gart_disp = {
214 	FM_EREPORT_CPU_GENAMD,			/* use generic subclass */
215 	FM_EREPORT_CPU_GENADM_GARTTBLWLK,	/* use generic leafclass */
216 	0					/* no additional payload */
217 };
218 
219 
220 static struct authamd_chipshared *authamd_shared[AUTHAMD_MAX_CHIPS];
221 
222 static int
223 authamd_chip_once(authamd_data_t *authamd, enum authamd_cfgonce_bitnum what)
224 {
225 	return (atomic_set_long_excl(&authamd->amd_shared->acs_cfgonce,
226 	    what) == 0 ?  B_TRUE : B_FALSE);
227 }
228 
229 static void
230 authamd_pcicfg_write(uint_t chipid, uint_t func, uint_t reg, uint32_t val)
231 {
232 	ASSERT(chipid + 24 <= 31);
233 	ASSERT((func & 7) == func);
234 	ASSERT((reg & 3) == 0 && reg < 256);
235 
236 	cmi_pci_putl(0, chipid + 24, func, reg, 0, val);
237 }
238 
239 static uint32_t
240 authamd_pcicfg_read(uint_t chipid, uint_t func, uint_t reg)
241 {
242 	ASSERT(chipid + 24 <= 31);
243 	ASSERT((func & 7) == func);
244 	ASSERT((reg & 3) == 0 && reg < 256);
245 
246 	return (cmi_pci_getl(0, chipid + 24, func, reg, 0, 0));
247 }
248 
249 void
250 authamd_bankstatus_prewrite(cmi_hdl_t hdl, authamd_data_t *authamd)
251 {
252 	uint64_t hwcr;
253 
254 	if (cmi_hdl_rdmsr(hdl, MSR_AMD_HWCR, &hwcr) != CMI_SUCCESS)
255 		return;
256 
257 	authamd->amd_hwcr = hwcr;
258 
259 	if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) {
260 		hwcr |= AMD_HWCR_MCI_STATUS_WREN;
261 		(void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr);
262 	}
263 }
264 
265 void
266 authamd_bankstatus_postwrite(cmi_hdl_t hdl, authamd_data_t *authamd)
267 {
268 	uint64_t hwcr = authamd->amd_hwcr;
269 
270 	if (!(hwcr & AMD_HWCR_MCI_STATUS_WREN)) {
271 		hwcr &= ~AMD_HWCR_MCI_STATUS_WREN;
272 		(void) cmi_hdl_wrmsr(hdl, MSR_AMD_HWCR, hwcr);
273 	}
274 }
275 
276 /*
277  * Read EccCnt repeatedly for all possible channel/chip-select combos:
278  *
279  *	- read sparectl register
280  *	- if EccErrCntWrEn is set, clear that bit in the just-read value
281  *	  and write it back to sparectl;  this *may* clobber the EccCnt
282  *	  for the channel/chip-select combination currently selected, so
283  *	  we leave this bit clear if we had to clear it
284  *	- cycle through all channel/chip-select combinations writing each
285  *	  combination to sparectl before reading the register back for
286  *	  EccCnt for that combination;  since EccErrCntWrEn is clear
287  *	  the writes to select what count to read will not themselves
288  *	  zero any counts
289  */
290 static int
291 authamd_read_ecccnt(authamd_data_t *authamd, struct authamd_logout *msl)
292 {
293 	union mcreg_sparectl sparectl;
294 	uint_t chipid = authamd->amd_shared->acs_chipid;
295 	uint_t family = authamd->amd_shared->acs_family;
296 	uint32_t rev = authamd->amd_shared->acs_rev;
297 	int chan, cs;
298 
299 	/*
300 	 * Check for feature support;  this macro will test down to the
301 	 * family revision number, whereafter we'll switch on family
302 	 * assuming that future revisions will use the same register
303 	 * format.
304 	 */
305 	if (!AUTHAMD_HAS_ONLINESPARECTL(rev)) {
306 		bzero(&msl->aal_eccerrcnt, sizeof (msl->aal_eccerrcnt));
307 		return (0);
308 	}
309 
310 	MCREG_VAL32(&sparectl) =
311 	    authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);
312 
313 	switch (family) {
314 	case AUTHAMD_FAMILY_F:
315 		MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 0;
316 		break;
317 
318 	case AUTHAMD_FAMILY_10:
319 		MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 0;
320 		break;
321 	}
322 
323 	for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
324 		switch (family) {
325 		case AUTHAMD_FAMILY_F:
326 			MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) =
327 			    chan;
328 			break;
329 
330 		case AUTHAMD_FAMILY_10:
331 			MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) =
332 			    chan;
333 			break;
334 		}
335 
336 		for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
337 			switch (family) {
338 			case AUTHAMD_FAMILY_F:
339 				MCREG_FIELD_F_revFG(&sparectl,
340 				    EccErrCntDramCs) = cs;
341 				break;
342 
343 			case AUTHAMD_FAMILY_10:
344 				MCREG_FIELD_10_revAB(&sparectl,
345 				    EccErrCntDramCs) = cs;
346 				break;
347 			}
348 
349 			authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
350 			    MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));
351 
352 			MCREG_VAL32(&sparectl) = authamd_pcicfg_read(chipid,
353 			    MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);
354 
355 			switch (family) {
356 			case AUTHAMD_FAMILY_F:
357 				msl->aal_eccerrcnt[chan][cs] =
358 				    MCREG_FIELD_F_revFG(&sparectl, EccErrCnt);
359 				break;
360 			case AUTHAMD_FAMILY_10:
361 				msl->aal_eccerrcnt[chan][cs] =
362 				    MCREG_FIELD_10_revAB(&sparectl, EccErrCnt);
363 				break;
364 			}
365 		}
366 	}
367 
368 	return (1);
369 }
370 
371 /*
372  * Clear EccCnt for all possible channel/chip-select combos:
373  *
374  *	- set EccErrCntWrEn in sparectl, if necessary
375  *	- write 0 to EccCnt for all channel/chip-select combinations
376  *	- clear EccErrCntWrEn
377  *
378  * If requested also disable the interrupts taken on counter overflow
379  * and on swap done.
380  */
381 static void
382 authamd_clear_ecccnt(authamd_data_t *authamd, boolean_t clrint)
383 {
384 	union mcreg_sparectl sparectl;
385 	uint_t chipid = authamd->amd_shared->acs_chipid;
386 	uint_t family = authamd->amd_shared->acs_family;
387 	uint32_t rev = authamd->amd_shared->acs_rev;
388 	int chan, cs;
389 
390 	if (!AUTHAMD_HAS_ONLINESPARECTL(rev))
391 		return;
392 
393 	MCREG_VAL32(&sparectl) =
394 	    authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_SPARECTL);
395 
396 	switch (family) {
397 	case AUTHAMD_FAMILY_F:
398 		MCREG_FIELD_F_revFG(&sparectl, EccErrCntWrEn) = 1;
399 		if (clrint) {
400 			MCREG_FIELD_F_revFG(&sparectl, EccErrInt) = 0;
401 			MCREG_FIELD_F_revFG(&sparectl, SwapDoneInt) = 0;
402 		}
403 		break;
404 
405 	case AUTHAMD_FAMILY_10:
406 		MCREG_FIELD_10_revAB(&sparectl, EccErrCntWrEn) = 1;
407 		if (clrint) {
408 			MCREG_FIELD_10_revAB(&sparectl, EccErrInt) = 0;
409 			MCREG_FIELD_10_revAB(&sparectl, SwapDoneInt) = 0;
410 		}
411 		break;
412 	}
413 
414 	authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
415 	    MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));
416 
417 	for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
418 		switch (family) {
419 		case AUTHAMD_FAMILY_F:
420 			MCREG_FIELD_F_revFG(&sparectl, EccErrCntDramChan) =
421 			    chan;
422 			break;
423 
424 		case AUTHAMD_FAMILY_10:
425 			MCREG_FIELD_10_revAB(&sparectl, EccErrCntDramChan) =
426 			    chan;
427 			break;
428 		}
429 
430 		for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
431 			switch (family) {
432 			case AUTHAMD_FAMILY_F:
433 				MCREG_FIELD_F_revFG(&sparectl,
434 				    EccErrCntDramCs) = cs;
435 				MCREG_FIELD_F_revFG(&sparectl,
436 				    EccErrCnt) = 0;
437 				break;
438 
439 			case AUTHAMD_FAMILY_10:
440 				MCREG_FIELD_10_revAB(&sparectl,
441 				    EccErrCntDramCs) = cs;
442 				MCREG_FIELD_10_revAB(&sparectl,
443 				    EccErrCnt) = 0;
444 				break;
445 			}
446 
447 			authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
448 			    MC_CTL_REG_SPARECTL, MCREG_VAL32(&sparectl));
449 		}
450 	}
451 }
452 
453 /*
454  * cms_init entry point.
455  *
456  * This module provides broad model-specific support for AMD families
457  * 0x6, 0xf and 0x10.  Future families will have to be evaluated once their
458  * documentation is available.
459  */
460 int
461 authamd_init(cmi_hdl_t hdl, void **datap)
462 {
463 	uint_t chipid = cmi_hdl_chipid(hdl);
464 	struct authamd_chipshared *sp, *osp;
465 	uint_t family = cmi_hdl_family(hdl);
466 	authamd_data_t *authamd;
467 	uint64_t cap;
468 
469 	if (authamd_ms_support_disable || !AUTHAMD_SUPPORTED(family))
470 		return (ENOTSUP);
471 
472 	if (!(x86_feature & X86_MCA))
473 		return (ENOTSUP);
474 
475 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
476 		return (ENOTSUP);
477 
478 	if (!(cap & MCG_CAP_CTL_P))
479 		return (ENOTSUP);
480 
481 	authamd = *datap = kmem_zalloc(sizeof (authamd_data_t), KM_SLEEP);
482 	cmi_hdl_hold(hdl);	/* release in fini */
483 	authamd->amd_hdl = hdl;
484 
485 	if ((sp = authamd_shared[chipid]) == NULL) {
486 		sp = kmem_zalloc(sizeof (struct authamd_chipshared), KM_SLEEP);
487 		sp->acs_chipid = chipid;
488 		sp->acs_family = family;
489 		sp->acs_rev = cmi_hdl_chiprev(hdl);
490 		membar_producer();
491 
492 		osp = atomic_cas_ptr(&authamd_shared[chipid], NULL, sp);
493 		if (osp != NULL) {
494 			kmem_free(sp, sizeof (struct authamd_chipshared));
495 			sp = osp;
496 		}
497 	}
498 	authamd->amd_shared = sp;
499 
500 	return (0);
501 }
502 
503 /*
504  * cms_logout_size entry point.
505  */
506 /*ARGSUSED*/
507 size_t
508 authamd_logout_size(cmi_hdl_t hdl)
509 {
510 	return (sizeof (struct authamd_logout));
511 }
512 
513 /*
514  * cms_mcgctl_val entry point
515  *
516  * Instead of setting all bits to 1 we can set just those for the
517  * error detector banks known to exist.
518  */
519 /*ARGSUSED*/
520 uint64_t
521 authamd_mcgctl_val(cmi_hdl_t hdl, int nbanks, uint64_t proposed)
522 {
523 	return (nbanks < 64 ? (1ULL << nbanks) - 1 : proposed);
524 }
525 
526 /*
527  * cms_bankctl_skipinit entry point
528  *
529  * On K6 we do not initialize MC0_CTL since, reportedly, this bank (for DC)
530  * may produce spurious machine checks.
531  *
532  * Only allow a single core to setup the NorthBridge MCi_CTL register.
533  */
534 /*ARGSUSED*/
535 boolean_t
536 authamd_bankctl_skipinit(cmi_hdl_t hdl, int bank)
537 {
538 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
539 	uint32_t rev = authamd->amd_shared->acs_rev;
540 
541 	if (authamd->amd_shared->acs_family == AUTHAMD_FAMILY_6)
542 		return (bank == 0 ?  B_TRUE : B_FALSE);
543 
544 	if (AUTHAMD_NBONCHIP(rev) && bank == AMD_MCA_BANK_NB) {
545 		return (authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCA) ==
546 		    B_TRUE ? B_FALSE : B_TRUE);
547 	}
548 
549 	return (B_FALSE);
550 }
551 
552 /*
553  * cms_bankctl_val entry point
554  */
555 uint64_t
556 authamd_bankctl_val(cmi_hdl_t hdl, int bank, uint64_t proposed)
557 {
558 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
559 	uint32_t rev = authamd->amd_shared->acs_rev;
560 	uint64_t val = proposed;
561 
562 	/*
563 	 * The Intel MCA says we can write all 1's to enable #MC for
564 	 * all errors, and AMD docs say much the same.  But, depending
565 	 * perhaps on other config registers, taking machine checks
566 	 * for some errors such as GART TLB errors and master/target
567 	 * aborts may be bad - they set UC and sometime also PCC, but
568 	 * we should not always panic for these error types.
569 	 *
570 	 * Our cms_error_action entry point can suppress such panics,
571 	 * however we can also use the cms_bankctl_val entry point to
572 	 * veto enabling of some of the known villains in the first place.
573 	 */
574 	if (bank == AMD_MCA_BANK_NB && AUTHAMD_NOGARTTBLWLK_MC(rev))
575 		val &= ~AMD_NB_EN_GARTTBLWK;
576 
577 	return (val);
578 }
579 
580 /*
581  * Bits to add to NB MCA config (after watchdog config).
582  */
583 uint32_t authamd_nb_mcacfg_add = AMD_NB_CFG_ADD_CMN;
584 
585 /*
586  * Bits to remove from NB MCA config (after watchdog config)
587  */
588 uint32_t authamd_nb_mcacfg_remove = AMD_NB_CFG_REMOVE_CMN;
589 
590 /*
591  * NB Watchdog policy, and rate we use if enabling.
592  */
593 enum {
594 	AUTHAMD_NB_WDOG_LEAVEALONE,
595 	AUTHAMD_NB_WDOG_DISABLE,
596 	AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED,
597 	AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE
598 } authamd_nb_watchdog_policy = AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED;
599 
600 uint32_t authamd_nb_mcacfg_wdog = AMD_NB_CFG_WDOGTMRCNTSEL_4095 |
601     AMD_NB_CFG_WDOGTMRBASESEL_1MS;
602 
603 /*
604  * Per-core cache scrubbing policy and rates.
605  */
606 enum {
607 	AUTHAMD_SCRUB_BIOSDEFAULT,	/* leave as BIOS configured */
608 	AUTHAMD_SCRUB_FIXED,		/* assign our chosen rate */
609 	AUTHAMD_SCRUB_MAX		/* use higher of ours and BIOS rate */
610 } authamd_scrub_policy = AUTHAMD_SCRUB_MAX;
611 
612 uint32_t authamd_scrub_rate_dcache = 0xf;	/* 64K per 0.67 seconds */
613 uint32_t authamd_scrub_rate_l2cache = 0xe;	/* 1MB per 5.3 seconds */
614 uint32_t authamd_scrub_rate_l3cache = 0xd;	/* 1MB per 2.7 seconds */
615 
616 static uint32_t
617 authamd_scrubrate(uint32_t osrate, uint32_t biosrate, const char *varnm)
618 {
619 	uint32_t rate;
620 
621 	if (osrate > AMD_NB_SCRUBCTL_RATE_MAX) {
622 		cmn_err(CE_WARN, "%s is too large, resetting to 0x%x\n",
623 		    varnm, AMD_NB_SCRUBCTL_RATE_MAX);
624 		osrate = AMD_NB_SCRUBCTL_RATE_MAX;
625 	}
626 
627 	switch (authamd_scrub_policy) {
628 	case AUTHAMD_SCRUB_FIXED:
629 		rate = osrate;
630 		break;
631 
632 	default:
633 		cmn_err(CE_WARN, "Unknown authamd_scrub_policy %d - "
634 		    "using default policy of AUTHAMD_SCRUB_MAX",
635 		    authamd_scrub_policy);
636 		/*FALLTHRU*/
637 
638 	case AUTHAMD_SCRUB_MAX:
639 		if (osrate != 0 && biosrate != 0)
640 			rate = MIN(osrate, biosrate);	/* small is fast */
641 		else
642 			rate = osrate ? osrate : biosrate;
643 	}
644 
645 	return (rate);
646 }
647 
648 /*
649  * cms_mca_init entry point.
650  */
651 /*ARGSUSED*/
652 void
653 authamd_mca_init(cmi_hdl_t hdl, int nbanks)
654 {
655 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
656 	uint32_t rev = authamd->amd_shared->acs_rev;
657 	uint_t chipid = authamd->amd_shared->acs_chipid;
658 
659 	/*
660 	 * On chips with a NB online spare control register take control
661 	 * and clear ECC counts.
662 	 */
663 	if (AUTHAMD_HAS_ONLINESPARECTL(rev) &&
664 	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_ONLNSPRCFG)) {
665 		authamd_clear_ecccnt(authamd, B_TRUE);
666 	}
667 
668 	/*
669 	 * And since we are claiming the telemetry stop the BIOS receiving
670 	 * an SMI on NB threshold overflow.
671 	 */
672 	if (AUTHAMD_NBMISC_NUM(rev) &&
673 	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBTHRESH)) {
674 		union mcmsr_nbmisc nbm;
675 		int i;
676 
677 		authamd_bankstatus_prewrite(hdl, authamd);
678 
679 		for (i = 0; i < AUTHAMD_NBMISC_NUM(rev); i++) {
680 			if (cmi_hdl_rdmsr(hdl, MC_MSR_NB_MISC(i),
681 			    (uint64_t *)&nbm) != CMI_SUCCESS)
682 				continue;
683 
684 			if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F) &&
685 			    MCMSR_FIELD_F_revFG(&nbm, mcmisc_Valid) &&
686 			    MCMSR_FIELD_F_revFG(&nbm, mcmisc_CntP)) {
687 				MCMSR_FIELD_F_revFG(&nbm, mcmisc_IntType) = 0;
688 			} else if (X86_CHIPREV_ATLEAST(rev,
689 			    X86_CHIPREV_AMD_10_REV_A) &&
690 			    MCMSR_FIELD_10_revAB(&nbm, mcmisc_Valid) &&
691 			    MCMSR_FIELD_10_revAB(&nbm, mcmisc_CntP)) {
692 				MCMSR_FIELD_10_revAB(&nbm, mcmisc_IntType) = 0;
693 			}
694 
695 			(void) cmi_hdl_wrmsr(hdl, MC_MSR_NB_MISC(i),
696 			    MCMSR_VAL(&nbm));
697 		}
698 
699 		authamd_bankstatus_postwrite(hdl, authamd);
700 	}
701 
702 	/*
703 	 * NB MCA Configuration Register.
704 	 */
705 	if (AUTHAMD_DO_NBMCACFG(rev) &&
706 	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_NBMCACFG)) {
707 		uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
708 		    MC_CTL_REG_NBCFG);
709 
710 		switch (authamd_nb_watchdog_policy) {
711 		case AUTHAMD_NB_WDOG_LEAVEALONE:
712 			break;
713 
714 		case AUTHAMD_NB_WDOG_DISABLE:
715 			val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK |
716 			    AMD_NB_CFG_WDOGTMRCNTSEL_MASK);
717 			val |= AMD_NB_CFG_WDOGTMRDIS;
718 			break;
719 
720 		default:
721 			cmn_err(CE_NOTE, "authamd_nb_watchdog_policy=%d "
722 			    "unrecognised, using default policy",
723 			    authamd_nb_watchdog_policy);
724 			/*FALLTHRU*/
725 
726 		case AUTHAMD_NB_WDOG_ENABLE_IF_DISABLED:
727 			if (!(val & AMD_NB_CFG_WDOGTMRDIS))
728 				break;	/* if enabled leave rate intact */
729 			/*FALLTHRU*/
730 
731 		case AUTHAMD_NB_WDOG_ENABLE_FORCE_RATE:
732 			val &= ~(AMD_NB_CFG_WDOGTMRBASESEL_MASK |
733 			    AMD_NB_CFG_WDOGTMRCNTSEL_MASK |
734 			    AMD_NB_CFG_WDOGTMRDIS);
735 			val |= authamd_nb_mcacfg_wdog;
736 			break;
737 		}
738 
739 		/*
740 		 * Bit 0 of the NB MCA Config register is reserved on family
741 		 * 0x10.
742 		 */
743 		if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_10_REV_A))
744 			authamd_nb_mcacfg_add &= ~AMD_NB_CFG_CPUECCERREN;
745 
746 		val &= ~authamd_nb_mcacfg_remove;
747 		val |= authamd_nb_mcacfg_add;
748 
749 		authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL, MC_CTL_REG_NBCFG,
750 		    val);
751 	}
752 
753 	/*
754 	 * Cache scrubbing.  We can't enable DRAM scrubbing since
755 	 * we don't know the DRAM base for this node.
756 	 */
757 	if (AUTHAMD_HAS_CHIPSCRUB(rev) &&
758 	    authamd_scrub_policy != AUTHAMD_SCRUB_BIOSDEFAULT &&
759 	    authamd_chip_once(authamd, AUTHAMD_CFGONCE_CACHESCRUB)) {
760 		uint32_t val = authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
761 		    MC_CTL_REG_SCRUBCTL);
762 		int l3cap = 0;
763 
764 		if (AUTHAMD_L3CAPABLE(rev)) {
765 			l3cap = (authamd_pcicfg_read(chipid, MC_FUNC_MISCCTL,
766 			    MC_CTL_REG_NBCAP) & MC_NBCAP_L3CAPABLE) != 0;
767 		}
768 
769 		authamd_scrub_rate_dcache =
770 		    authamd_scrubrate(authamd_scrub_rate_dcache,
771 		    (val & AMD_NB_SCRUBCTL_DC_MASK) >> AMD_NB_SCRUBCTL_DC_SHIFT,
772 		    "authamd_scrub_rate_dcache");
773 
774 		authamd_scrub_rate_l2cache =
775 		    authamd_scrubrate(authamd_scrub_rate_l2cache,
776 		    (val & AMD_NB_SCRUBCTL_L2_MASK) >> AMD_NB_SCRUBCTL_L2_SHIFT,
777 		    "authamd_scrub_rate_l2cache");
778 
779 		authamd_scrub_rate_l3cache = l3cap ?
780 		    authamd_scrubrate(authamd_scrub_rate_l3cache,
781 		    (val & AMD_NB_SCRUBCTL_L3_MASK) >> AMD_NB_SCRUBCTL_L3_SHIFT,
782 		    "authamd_scrub_rate_l3cache") : 0;
783 
784 		val = AMD_NB_MKSCRUBCTL(authamd_scrub_rate_l3cache,
785 		    authamd_scrub_rate_dcache, authamd_scrub_rate_l2cache,
786 		    val & AMD_NB_SCRUBCTL_DRAM_MASK);
787 
788 		authamd_pcicfg_write(chipid, MC_FUNC_MISCCTL,
789 		    MC_CTL_REG_SCRUBCTL, val);
790 	}
791 
792 }
793 
794 /*
795  * cms_poll_ownermask entry point.
796  */
797 uint64_t
798 authamd_poll_ownermask(cmi_hdl_t hdl, hrtime_t pintvl)
799 {
800 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
801 	struct authamd_chipshared *acsp = authamd->amd_shared;
802 	hrtime_t now = gethrtime_waitfree();
803 	hrtime_t last = acsp->acs_poll_timestamp;
804 	int dopoll = 0;
805 
806 	if (now - last > 2 * pintvl || last == 0) {
807 		acsp->acs_pollowner = hdl;
808 		dopoll = 1;
809 	} else if (acsp->acs_pollowner == hdl) {
810 		dopoll = 1;
811 	}
812 
813 	if (dopoll)
814 		acsp->acs_poll_timestamp = now;
815 
816 	return (dopoll ? -1ULL : ~(1 << AMD_MCA_BANK_NB));
817 
818 }
819 
820 /*
821  * cms_bank_logout entry point.
822  */
823 /*ARGSUSED*/
824 void
825 authamd_bank_logout(cmi_hdl_t hdl, int bank, uint64_t status,
826     uint64_t addr, uint64_t misc, void *mslogout)
827 {
828 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
829 	struct authamd_logout *msl = mslogout;
830 	uint32_t rev = authamd->amd_shared->acs_rev;
831 
832 	if (msl == NULL)
833 		return;
834 
835 	/*
836 	 * For main memory ECC errors on revisions with an Online Spare
837 	 * Control Register grab the ECC counts by channel and chip-select
838 	 * and reset them to 0.
839 	 */
840 	if (AUTHAMD_MEMECC_RECOGNISED(rev) &&
841 	    AUTHAMD_IS_MEMECCERR(bank, status) &&
842 	    AUTHAMD_HAS_ONLINESPARECTL(rev)) {
843 		if (authamd_read_ecccnt(authamd, msl))
844 			authamd_clear_ecccnt(authamd, B_FALSE);
845 	}
846 }
847 
848 /*
849  * cms_error_action entry point
850  */
851 
852 int authamd_forgive_uc = 0;	/* For test/debug only */
853 int authamd_forgive_pcc = 0;	/* For test/debug only */
854 int authamd_fake_poison = 0;	/* For test/debug only */
855 
856 /*ARGSUSED*/
857 uint32_t
858 authamd_error_action(cmi_hdl_t hdl, int ismc, int bank,
859     uint64_t status, uint64_t addr, uint64_t misc, void *mslogout)
860 {
861 	authamd_error_disp_t *disp;
862 	uint32_t rv = 0;
863 
864 	if (authamd_forgive_uc)
865 		rv |= CMS_ERRSCOPE_CLEARED_UC;
866 
867 	if (authamd_forgive_pcc)
868 		rv |= CMS_ERRSCOPE_CURCONTEXT_OK;
869 
870 	if (authamd_fake_poison && status & MSR_MC_STATUS_UC)
871 		rv |= CMS_ERRSCOPE_POISONED;
872 
873 	if (rv)
874 		return (rv);
875 
876 	disp = authamd_disp_match(hdl, bank, status, addr, misc, mslogout);
877 
878 	if (disp == &authamd_gart_disp) {
879 		/*
880 		 * GART walk errors set UC and possibly PCC (if source CPU)
881 		 * but should not be regarded as terminal.
882 		 */
883 		return (CMS_ERRSCOPE_IGNORE_ERR);
884 	}
885 
886 	/*
887 	 * May also want to consider master abort and target abort.  These
888 	 * also set UC and PCC (if src CPU) but the requester gets -1
889 	 * and I believe the IO stuff in Solaris will handle that.
890 	 */
891 
892 	return (rv);
893 }
894 
895 /*
896  * cms_disp_match entry point
897  */
898 /*ARGSUSED*/
899 cms_cookie_t
900 authamd_disp_match(cmi_hdl_t hdl, int bank, uint64_t status,
901     uint64_t addr, uint64_t misc, void *mslogout)
902 {
903 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
904 	/* uint16_t errcode = MCAX86_ERRCODE(status); */
905 	uint16_t exterrcode = AMD_EXT_ERRCODE(status);
906 	uint32_t rev = authamd->amd_shared->acs_rev;
907 
908 	/*
909 	 * Recognise main memory ECC errors
910 	 */
911 	if (AUTHAMD_MEMECC_RECOGNISED(rev) &&
912 	    AUTHAMD_IS_MEMECCERR(bank, status)) {
913 		if (status & AMD_BANK_STAT_CECC) {
914 			return (exterrcode == 0 ? &authamd_memce_disp :
915 			    &authamd_ckmemce_disp);
916 		} else if (status & AMD_BANK_STAT_UECC) {
917 			return (exterrcode == 0 ? &authamd_memue_disp :
918 			    &authamd_ckmemue_disp);
919 		}
920 	}
921 
922 	/*
923 	 * Recognise GART walk errors
924 	 */
925 	if (AUTHAMD_NOGARTTBLWLK_MC(rev) && AUTHAMD_IS_GARTERR(bank, status))
926 		return (&authamd_gart_disp);
927 
928 	return (NULL);
929 }
930 
931 /*
932  * cms_ereport_class entry point
933  */
934 /*ARGSUSED*/
935 void
936 authamd_ereport_class(cmi_hdl_t hdl, cms_cookie_t mscookie,
937     const char **cpuclsp, const char **leafclsp)
938 {
939 	const authamd_error_disp_t *aed = mscookie;
940 
941 	if (aed == NULL)
942 		return;
943 
944 	if (aed->aad_subclass != NULL)
945 		*cpuclsp = aed->aad_subclass;
946 	if (aed->aad_leafclass != NULL)
947 		*leafclsp = aed->aad_leafclass;
948 }
949 
950 /*ARGSUSED*/
951 static void
952 authamd_ereport_add_resource(cmi_hdl_t hdl, authamd_data_t *authamd,
953     nvlist_t *ereport, nv_alloc_t *nva, void *mslogout)
954 {
955 	nvlist_t *elems[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS];
956 	uint8_t counts[AUTHAMD_DRAM_NCHANNEL * AUTHAMD_DRAM_NCS];
957 	authamd_logout_t *msl;
958 	nvlist_t *nvl;
959 	int nelems = 0;
960 	int i, chan, cs;
961 
962 	if ((msl = mslogout) == NULL)
963 		return;
964 
965 	for (chan = 0; chan < AUTHAMD_DRAM_NCHANNEL; chan++) {
966 		for (cs = 0; cs < AUTHAMD_DRAM_NCS; cs++) {
967 			if (msl->aal_eccerrcnt[chan][cs] == 0)
968 				continue;
969 
970 			if ((nvl = fm_nvlist_create(nva)) == NULL)
971 				continue;
972 
973 			elems[nelems] = nvl;
974 			counts[nelems++] = msl->aal_eccerrcnt[chan][cs];
975 
976 			fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 5,
977 			    "motherboard", 0,
978 			    "chip", authamd->amd_shared->acs_chipid,
979 			    "memory-controller", 0,
980 			    "dram-channel", chan,
981 			    "chip-select", cs);
982 		}
983 	}
984 
985 	if (nelems == 0)
986 		return;
987 
988 	fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCE,
989 	    DATA_TYPE_NVLIST_ARRAY, nelems, elems,
990 	    NULL);
991 
992 	fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_RESOURCECNT,
993 	    DATA_TYPE_UINT8_ARRAY, nelems, &counts[0],
994 	    NULL);
995 
996 	for (i = 0; i < nelems; i++)
997 		fm_nvlist_destroy(elems[i], nva ? FM_NVA_RETAIN : FM_NVA_FREE);
998 }
999 
1000 /*
1001  * cms_ereport_add_logout entry point
1002  */
1003 /*ARGSUSED*/
1004 void
1005 authamd_ereport_add_logout(cmi_hdl_t hdl, nvlist_t *ereport, nv_alloc_t *nva,
1006     int bank, uint64_t status, uint64_t addr, uint64_t misc,
1007     void *mslogout, cms_cookie_t mscookie)
1008 {
1009 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
1010 	const authamd_error_disp_t *aed = mscookie;
1011 	uint64_t members;
1012 
1013 	if (aed == NULL)
1014 		return;
1015 
1016 	members = aed->aad_ereport_members;
1017 
1018 	if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYND) {
1019 		fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_SYND,
1020 		    DATA_TYPE_UINT16, (uint16_t)AMD_BANK_SYND(status),
1021 		    NULL);
1022 
1023 		if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) {
1024 			fm_payload_set(ereport,
1025 			    FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE,
1026 			    DATA_TYPE_STRING, "E",
1027 			    NULL);
1028 		}
1029 	}
1030 
1031 	if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_CKSYND) {
1032 		fm_payload_set(ereport, FM_EREPORT_GENAMD_PAYLOAD_NAME_CKSYND,
1033 		    DATA_TYPE_UINT16, (uint16_t)AMD_NB_STAT_CKSYND(status),
1034 		    NULL);
1035 
1036 		if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_SYNDTYPE) {
1037 			fm_payload_set(ereport,
1038 			    FM_EREPORT_GENAMD_PAYLOAD_NAME_SYNDTYPE,
1039 			    DATA_TYPE_STRING, "C",
1040 			    NULL);
1041 		}
1042 	}
1043 
1044 	if (members & FM_EREPORT_GENAMD_PAYLOAD_FLAG_RESOURCE &&
1045 	    status & MSR_MC_STATUS_ADDRV) {
1046 		authamd_ereport_add_resource(hdl, authamd, ereport, nva,
1047 		    mslogout);
1048 	}
1049 }
1050 
1051 /*
1052  * cms_msrinject entry point
1053  */
1054 cms_errno_t
1055 authamd_msrinject(cmi_hdl_t hdl, uint_t msr, uint64_t val)
1056 {
1057 	authamd_data_t *authamd = cms_hdl_getcmsdata(hdl);
1058 	cms_errno_t rv = CMSERR_BADMSRWRITE;
1059 
1060 	authamd_bankstatus_prewrite(hdl, authamd);
1061 	if (cmi_hdl_wrmsr(hdl, msr, val) == CMI_SUCCESS)
1062 		rv = CMS_SUCCESS;
1063 	authamd_bankstatus_postwrite(hdl, authamd);
1064 
1065 	return (rv);
1066 }
1067 
1068 cms_api_ver_t _cms_api_version = CMS_API_VERSION_0;
1069 
1070 const cms_ops_t _cms_ops = {
1071 	authamd_init,			/* cms_init */
1072 	NULL,				/* cms_post_startup */
1073 	NULL,				/* cms_post_mpstartup */
1074 	authamd_logout_size,		/* cms_logout_size */
1075 	authamd_mcgctl_val,		/* cms_mcgctl_val */
1076 	authamd_bankctl_skipinit,	/* cms_bankctl_skipinit */
1077 	authamd_bankctl_val,		/* cms_bankctl_val */
1078 	NULL,				/* cms_bankstatus_skipinit */
1079 	NULL,				/* cms_bankstatus_val */
1080 	authamd_mca_init,		/* cms_mca_init */
1081 	authamd_poll_ownermask,		/* cms_poll_ownermask */
1082 	authamd_bank_logout,		/* cms_bank_logout */
1083 	authamd_error_action,		/* cms_error_action */
1084 	authamd_disp_match,		/* cms_disp_match */
1085 	authamd_ereport_class,		/* cms_ereport_class */
1086 	NULL,				/* cms_ereport_detector */
1087 	NULL,				/* cms_ereport_includestack */
1088 	authamd_ereport_add_logout,	/* cms_ereport_add_logout */
1089 	authamd_msrinject,		/* cms_msrinject */
1090 	NULL,				/* cms_fini */
1091 };
1092 
1093 static struct modlcpu modlcpu = {
1094 	&mod_cpuops,
1095 	"Generic AMD model-specific MCA"
1096 };
1097 
1098 static struct modlinkage modlinkage = {
1099 	MODREV_1,
1100 	(void *)&modlcpu,
1101 	NULL
1102 };
1103 
1104 int
1105 _init(void)
1106 {
1107 	return (mod_install(&modlinkage));
1108 }
1109 
1110 int
1111 _info(struct modinfo *modinfop)
1112 {
1113 	return (mod_info(&modlinkage, modinfop));
1114 }
1115 
1116 int
1117 _fini(void)
1118 {
1119 	return (mod_remove(&modlinkage));
1120 }
1121