xref: /illumos-gate/usr/src/cmd/fm/eversholt/files/i386/i86pc/gcpu_amd.esc (revision bea83d026ee1bd1b2a2419e1d0232f107a5d7d9b)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29/*
30 * Eversholt rules for generic AMD with on-chip memory-controller(s), as seen
31 * in AMD family 0xf and 0x10.
32 *
33 * In the absence of any model-specific support, any memory errors that
34 * are observed via MCA (typically through an on-chip memory-controller)
35 * will surface as ereport.cpu.generic-x86.bus_interconnect_memory[_uc]
36 * ereports and are diagnosed via generic rules in gcpu.esc.
37 *
38 * If full model-specific support is available, including full NorthBridge
39 * support, then memory ereports will surface in a more-specific subclass
40 * such as ereport.cpu.amd.mem_ce; these are diagnosed in amd64.esc.
41 *
42 * In the case where some "vendor generic" support is present, memory errors
43 * are reported as ereport.cpu.generic-x86.mem_{ce,ue} and include a
44 * syndrome and syndrome-type, and usually also a resource FMRI to identify
45 * the affected resource.  In the AMD case a resource FMRI is included for
46 * those chip versions that include an Online Spare Control register; this
47 * register provides counts of ECC errors seen per channel and chip-select
48 * on a NorthBridge node.  The resource FMRI has form
49 * 	hc:///motherboard/chip/memory-controller/dram-channel/chip-select
50 * in these cases.
51 */
52
53#pragma dictionary "GMCA"
54
55/*
56 * The number of pages that must be faulted on a chip-select for repeated
57 * correctable errors before we will consider one of the component dimms
58 * faulty.  CS_DIMMSB_THRESH_REACHED tests the per-chip-select page fault
 * counter (stat.cepgflt) for equality with this value.
59 */
60#define	CS_DIMMSB_THRESH	64
61
62/*
63 * The maximum number of pages we will diagnose as faulty on any one
64 * chip-select (must be at least CS_DIMMSB_THRESH).  If a chip-select
65 * has a fault that will affect zillions of pages this limit stops us
66 * diagnosing excessive numbers of page faults.
67 */
68#define	CS_PAGEFLT_MAX		(2 * CS_DIMMSB_THRESH)
69
70/*
71 * SERD parameters for individual page faults.  When more than PAGE_SB_COUNT
72 * correctable ereports are experienced on a single chip-select within
73 * PAGE_SB_TIME the engine will fire and we will fault the most recent
74 * page.  The same two values are used as N and T for both
 * serd.memory.generic-x86.page_ce and
 * serd.memory.generic-x86.dimm_ce_limitchk.
75 */
76#define	PAGE_SB_COUNT		3
77#define	PAGE_SB_TIME		24h
78
/* Declare the chip as a FRU. */
79fru chip;
80
/* Path of a chip-select below a chip's memory-controller and dram-channel. */
81#define	CSPATH	chip/memory-controller/dram-channel/chip-select
82
/* Declare both the cpu and the chip-select path as ASRUs. */
83asru chip/cpu;
84asru CSPATH;
85
86/*
87 * ADDR_VALID is true if the ereport payload includes IA32_MCi_ADDR.
88 */
89#define	ADDR_VALID (payloadprop_defined("IA32_MCi_ADDR"))
90
91/*
92 * CONTAINS_CS is true if the resource nvlist array exists and one of its
93 * members matches the chip-select path.  This is used to constrain
94 * propagations to those for which a resource element matches the
95 * chip-select path of the propagation.  This is necessary because the
96 * detector element of memory ereports is a cpu and not the chip-select itself.
97 */
98#define	CONTAINS_CS (payloadprop_contains("resource", asru(CSPATH)))
99
/*
 * SET_ADDR sets "asru-physaddr" from the IA32_MCi_ADDR member of the ereport
 * payload.  It is used in the page fault propagations so that the diagnosed
 * fault includes address info for rewrite-ASRU.
 */
100#define	SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))
101/* Generic memory ereports (correctable and uncorrectable), declared at chip/cpu. */
102event ereport.cpu.generic-x86.mem_ce@chip/cpu { within(1s) };
103event ereport.cpu.generic-x86.mem_ue@chip/cpu { within(1s) };
104
105/*
106 *	 ========= Propagations for correctable page faults ============
107 *	|								|
108 *	| Discard mem_ce with no resource in the ereport payload.	|
109 *	| Discard mem_ce with no address info - we can't fault the	|
110 *	| corresponding page without it.				|
111 *	|								|
112 *	| For a mem_ce ereport detected by a given chip/cpu (as per	|
113 *	| the payload detector info) whose resource payload member	|
114 *	| includes a chip/memory-controller/dram-channel/chip-select	|
115 *	| (CSPATH) for the same chip number, diagnose to an upset event	|
116 *	| associated with a per-CSPATH SERD engine as long as we are	|
117 *	| below the page fault limit for this CSPATH (defined below);	|
118 *	| if we are over that limit then discard the event since we	|
119 *	| will already have faulted a dimm and there is no point in	|
120 *	| continuing to diagnose endless page faults from a dimm with	|
121 *	| something like a pin failure.					|
122 *	|								|
123 *	| When the per-CSPATH SERD engine fires we fault the page	|
124 *	| containing the address included in the ereport that caused	|
125 *	| the trip, and increment a per-CSPATH counter to count page	|
126 *	| faults on that chip-select from repeated correctable errors.	|
127 *	|								|
128 *	| A mem_ue ereport produces an immediate page_ue fault.		|
129 *	|===============================================================|
130 */
131
132/*
 * Counter for page faults diagnosed on a chip-select.  It is named in the
 * count= property of both the page_ce and the page_ue fault events.
 */
133engine stat.cepgflt@CSPATH;
134
/* True once the counter is strictly greater than CS_PAGEFLT_MAX. */
135#define	CS_PGFLT_LIMIT_REACHED (count(stat.cepgflt@CSPATH) > CS_PAGEFLT_MAX)
136
137/*
 * Page fault event for repeated correctable errors.  Declared on the
 * chip-select path; each diagnosis increments stat.cepgflt@CSPATH.
 */
138event fault.memory.generic-x86.page_ce@CSPATH,
139    FITrate=1000,			/* meaningless */
140    message=0,				/* do not message individual pageflts */
141    ASRU=CSPATH,			/* the chip-select, before rewrite */
142    count=stat.cepgflt@CSPATH,		/* increment on pageflt diagnosis */
143    action=confcall("rewrite-ASRU");	/* identify page in chip-select */
144
145/* Upset to diagnose correctable ereports to; feeds the page_ce SERD engine */
146event upset.memory.generic-x86.page_ce@CSPATH,
147    engine=serd.memory.generic-x86.page_ce@CSPATH;
148
149/* Synthetic ereport generated when page_ce SERD engine trips */
150event ereport.memory.generic-x86.page_ce_trip@CSPATH { within(1s) };
151
152/*
 * SERD engine for each chip-select.  Uses the PAGE_SB_COUNT/PAGE_SB_TIME
 * parameters and generates the page_ce_trip ereport when it trips.
 */
153engine serd.memory.generic-x86.page_ce@CSPATH,
154    N=PAGE_SB_COUNT, T=PAGE_SB_TIME,
155    method=persistent,
156    trip=ereport.memory.generic-x86.page_ce_trip@CSPATH;
157
158/*
 * Upset to discard events to when we're over limit.  No SERD engine is
 * attached to this upset.
 */
159event upset.memory.generic-x86.overpgfltlimit@CSPATH;
160
161/*
162 * Discard mem_ce ereports with no resource or no address info by diagnosing
 * them to an upset that has no SERD engine attached.
163 */
164event upset.memory.generic-x86.discard@chip/cpu;
165prop upset.memory.generic-x86.discard@chip/cpu
166    { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
167    ereport.cpu.generic-x86.mem_ce@chip/cpu;
168
169/*
170 * For as long as we are below the page fault limit diagnose correctable ereport
171 * observations as upsets to feed the SERD engine.  The ereport must include
 * an address and a resource member matching this chip-select.
172 */
173prop upset.memory.generic-x86.page_ce@CSPATH
174    { ADDR_VALID && CONTAINS_CS && !CS_PGFLT_LIMIT_REACHED } (0)->
175    ereport.cpu.generic-x86.mem_ce@chip/cpu;
176
177/*
178 * Discard ereports if we are above the page fault limit on this chip-select.
 * The constraint is the complement of the one above with respect to
 * CS_PGFLT_LIMIT_REACHED.
179 */
180prop upset.memory.generic-x86.overpgfltlimit@CSPATH
181    { ADDR_VALID && CONTAINS_CS && CS_PGFLT_LIMIT_REACHED } (1)->
182    ereport.cpu.generic-x86.mem_ce@chip/cpu;
183
184/* Diagnose a page fault when the pagefault SERD engine trips */
185prop fault.memory.generic-x86.page_ce@CSPATH (1)->
186    ereport.memory.generic-x86.page_ce_trip@CSPATH;
187
188/*
 * Include address info in the page fault diagnosed, for rewrite-ASRU.
 * SET_ADDR is the last term of the constraint, after the ADDR_VALID and
 * CONTAINS_CS checks.
 */
189prop fault.memory.generic-x86.page_ce@CSPATH
190    { ADDR_VALID && CONTAINS_CS && SET_ADDR } (0)->
191    ereport.cpu.generic-x86.mem_ce@chip/cpu;
192
193/*
194 *	 ========= Propagations for correctable DIMM faults ============
195 *	|								|
196 *	| A dimm_ce fault is diagnosed when we have faulted an		|
197 *	| excessive number of page_ce faults on a chip-select - when	|
198 *	| the count reaches CS_DIMMSB_THRESH.				|
199 * 	|								|
200 *	| A dimm_ue fault is diagnosed on the first uncorrectable	|
201 *	| ereport from a chip-select.					|
202 *	|===============================================================|
203 */
204
205/* DIMM fault event for CE failures, declared on the chip-select path */
206event fault.memory.generic-x86.dimm_ce@CSPATH,
207    ASRU=CSPATH,
208    FITrate=1000,			/* meaningless */
209    action=confcall("rewrite-ASRU");	/* rewrite in "mem" FMRI scheme */
210
/*
 * True only while the page fault counter for the chip-select is exactly
 * equal to CS_DIMMSB_THRESH (an equality test, not >=).
 */
211#define	CS_DIMMSB_THRESH_REACHED \
212	(count(stat.cepgflt@CSPATH) == CS_DIMMSB_THRESH)
213
214/*
215 * This upset is diagnosed in parallel with upset.memory.generic-x86.page_ce
216 * on the CSPATH, and the associated SERD engine has the same parameters
217 * as serd.memory.generic-x86.page_ce@CSPATH so they fire at the same time.
218 * When this one fires we check whether we have reached the diagnosis
219 * threshold for a dimm_ce.
220 */
221event upset.memory.generic-x86.dimm_ce@CSPATH,
222    engine=serd.memory.generic-x86.dimm_ce_limitchk@CSPATH;
223
/* Synthetic ereport generated when the dimm_ce_limitchk SERD engine trips */
224event ereport.memory.generic-x86.dimm_ce_limitchk@CSPATH { within(1s) };
225
/* SERD engine for the limit check; same N and T as the page_ce engine */
226engine serd.memory.generic-x86.dimm_ce_limitchk@CSPATH,
227    N=PAGE_SB_COUNT, T=PAGE_SB_TIME,
228    method=persistent,
229    trip=ereport.memory.generic-x86.dimm_ce_limitchk@CSPATH;
230
/*
 * Diagnose a dimm_ce upset on a mem_ce ereport that has an address and a
 * resource member matching this chip-select.  Unlike the page_ce upset
 * there is no page fault limit test here.
 */
231prop upset.memory.generic-x86.dimm_ce@CSPATH
232    { ADDR_VALID && CONTAINS_CS } (0)->
233    ereport.cpu.generic-x86.mem_ce@chip/cpu;
234
/* On a limit check trip, diagnose a dimm_ce fault if at the threshold ... */
235prop fault.memory.generic-x86.dimm_ce@CSPATH
236    { CS_DIMMSB_THRESH_REACHED } (0)->
237    ereport.memory.generic-x86.dimm_ce_limitchk@CSPATH;
238
/* ... and otherwise diagnose the trip to an upset with no engine attached. */
239event upset.memory.generic-x86.discard2@CSPATH;
240prop upset.memory.generic-x86.discard2@CSPATH
241    { !CS_DIMMSB_THRESH_REACHED } (0)->
242    ereport.memory.generic-x86.dimm_ce_limitchk@CSPATH;
243
244/*
245 *	 ========= Propagations for uncorrectable page faults ==========
246 *	|								|
247 *	| A UE produces an immediate page fault.  But we also want a	|
248 *	| corresponding dimm fault and since we do not like multi-entry	|
249 *	| suspect lists we arrange two distinct fault management	|
250 *	| exercises by diagnosing a mem_ue to two upset events that	|
251 *	| feed instant-trip SERD engines.  Yuck.			|
252 *	|===============================================================|
253 */
254
255/* Page fault event for uncorrectable errors */
/*
 * Page fault for an uncorrectable error, declared on the chip-select path.
 * ASRU is given explicitly, as on the page_ce, dimm_ce and dimm_ue fault
 * events, since the rewrite-ASRU action operates on it.  Each diagnosis
 * increments the same stat.cepgflt@CSPATH counter that page_ce faults use.
 */
256event fault.memory.generic-x86.page_ue@CSPATH,
257    FITrate=1000,			/* meaningless */
258    message=0,				/* do not message individual pageflts */
    ASRU=CSPATH,
259    count=stat.cepgflt@CSPATH,		/* increment on pageflt diagnosis */
260    action=confcall("rewrite-ASRU");	/* identify page in chip-select */
261
262/* Upset for page fault; feeds the instant-trip page_ue SERD engine */
263event upset.memory.generic-x86.page_ue@CSPATH,
264    engine=serd.memory.generic-x86.page_ue@CSPATH;
265
266/* Synthetic ereport generated when the page_ue SERD engine trips */
267event ereport.memory.generic-x86.page_ue_trip@CSPATH { within(1s) };
268
269/* Instant-trip engine for page fault; generates page_ue_trip when it trips */
270engine serd.memory.generic-x86.page_ue@CSPATH,
271    N=0, T=1h,	/* trip on first upset */
272    method=persistent,
273    trip=ereport.memory.generic-x86.page_ue_trip@CSPATH;
274
275/* Discard mem_ue ereports with no resource or no address info */
276event upset.memory.generic-x86.discard3@CSPATH;
277prop upset.memory.generic-x86.discard3@CSPATH
278    { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
279    ereport.cpu.generic-x86.mem_ue@chip/cpu;
280
281/*
 * Diagnose a page_ue upset on a mem_ue event that has an address and a
 * resource member matching this chip-select.
 */
282prop upset.memory.generic-x86.page_ue@CSPATH
283    { ADDR_VALID && CONTAINS_CS } (0)->
284    ereport.cpu.generic-x86.mem_ue@chip/cpu;
285
286/* On the immediate SERD trip diagnose a page fault */
287prop fault.memory.generic-x86.page_ue@CSPATH (1)->
288    ereport.memory.generic-x86.page_ue_trip@CSPATH;
289
290/*
 * Include address info in the page fault diagnosed, for rewrite-ASRU.
 * SET_ADDR is the last term of the constraint, after the ADDR_VALID and
 * CONTAINS_CS checks.
 */
291prop fault.memory.generic-x86.page_ue@CSPATH
292    { ADDR_VALID && CONTAINS_CS && SET_ADDR } (0)->
293    ereport.cpu.generic-x86.mem_ue@chip/cpu;
294
295/*
296 *	 ========= Propagations for uncorrectable dimm faults ==========
297 *	|								|
298 *	| A UE produces an immediate dimm fault.  As explained in the	|
299 *	| page_ue block comment above we split the exercise in two in	|
300 *	| order to produce independent page_ue and dimm_ue diagnoses.	|
301 *	|===============================================================|
302 */
303
304/* Dimm fault for an uncorrectable error, declared on the chip-select path */
305event fault.memory.generic-x86.dimm_ue@CSPATH,
306    ASRU=CSPATH,
307    FITrate=1000,			/* meaningless */
308    action=confcall("rewrite-ASRU");	/* rewrite in "mem" FMRI scheme */
309
310/* Upset for dimm fault; feeds the instant-trip dimm_ue SERD engine */
311event upset.memory.generic-x86.dimm_ue@CSPATH,
312    engine=serd.memory.generic-x86.dimm_ue@CSPATH;
313
314/* Synthetic ereport generated when the dimm_ue SERD engine trips */
315event ereport.memory.generic-x86.dimm_ue_trip@CSPATH { within(1s) };
316
317/* Instant-trip engine for dimm fault; generates dimm_ue_trip when it trips */
318engine serd.memory.generic-x86.dimm_ue@CSPATH,
319    N=0, T=1h,	/* trip on first upset */
320    method=persistent,
321    trip=ereport.memory.generic-x86.dimm_ue_trip@CSPATH;
322
323/*
 * Diagnose a dimm_ue upset on a mem_ue event (in addition to page_ue upset).
 * Only CONTAINS_CS is required here; unlike the page_ue propagation there
 * is no ADDR_VALID test.
 */
324prop upset.memory.generic-x86.dimm_ue@CSPATH
325    { CONTAINS_CS } (0)->
326    ereport.cpu.generic-x86.mem_ue@chip/cpu;
327
328/* On the immediate SERD trip diagnose a dimm fault */
329prop fault.memory.generic-x86.dimm_ue@CSPATH (1)->
330    ereport.memory.generic-x86.dimm_ue_trip@CSPATH;
331
332/*
333 *	 ========= Propagations for GART Table Walk Errors =============
334 *	|								|
335 *	| These are usually due to software mis-programming of the GART	|
336 *	| TLB rather than from hardware errors.  It would be incorrect	|
337 *	| to fault and potentially offline a cpu in response to these	|
338 *	| so they have their own fault class to facilitate us ignoring	|
339 *	| them.								|
340 *	|===============================================================|
341 */
342
/*
 * GART table walk ereport and the upset it is diagnosed to.  The upset has
 * no SERD engine attached and no fault event is declared for this ereport,
 * so no cpu fault is diagnosed from it by these rules.
 */
343event ereport.cpu.generic-x86.gart_tbl_walk@chip/cpu { within(1s) };
344event upset.cpu.generic-x86.gart_tbl_walk@chip/cpu;
345
/* Unconditionally diagnose every gart_tbl_walk ereport to the upset */
346prop upset.cpu.generic-x86.gart_tbl_walk@chip/cpu (1)->
347    ereport.cpu.generic-x86.gart_tbl_walk@chip/cpu;
348