xref: /illumos-gate/usr/src/cmd/fm/eversholt/files/i386/i86pc/gcpu_amd.esc (revision bb0ade0978a02d3fe0b0165cd4725fdcb593fbfb)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29/*
30 * Eversholt rules for generic AMD with on-chip memory-controller(s), as seen
31 * in AMD family 0xf and 0x10.
32 *
33 * In the absence of any model-specific support, any memory errors that
34 * are observed via MCA (typically through an on-chip memory-controller)
35 * will surface as ereport.cpu.generic-x86.bus_interconnect_memory[_uc]
36 * ereports and are diagnosed via generic rules in gcpu.esc.
37 *
38 * If full model-specific support is available, including full NorthBridge
39 * support, then memory ereports will surface in a more-specific subclass
40 * such as ereport.cpu.amd.mem_ce; these are diagnosed in amd64.esc.
41 *
42 * In the case where some "vendor generic" support is present, memory errors
43 * are reported as ereport.cpu.generic-x86.mem_{ce,ue} and include a
44 * syndrome and syndrome-type, and usually also a resource FMRI to identify
45 * the affected resource.  In the AMD case a resource FMRI is included for
46 * those chip versions that include an Online Spare Control register; this
47 * register provides counts of ECC errors seen per channel and chip-select
48 * on a NorthBridge node.  The resource FMRI has form
49 * 	hc:///motherboard/chip/memory-controller/dram-channel/chip-select
50 * in these cases.
51 */
52
53#pragma dictionary "GMCA"
54
/*
 * Dimm fault threshold: once this many pages on a single chip-select have
 * been faulted for repeated correctable errors, one of the dimms making up
 * that chip-select is considered faulty.
 */
#define	CS_DIMMSB_THRESH	64
61
/*
 * Upper bound on the number of pages diagnosed as faulty on any single
 * chip-select.  This must be at least CS_DIMMSB_THRESH.  The cap stops a
 * chip-select fault that affects a very large number of pages from
 * producing an excessive number of individual page-fault diagnoses.
 */
#define	CS_PAGEFLT_MAX		(CS_DIMMSB_THRESH * 2)
69
/*
 * SERD parameters for individual page faults: PAGE_SB_COUNT is the event
 * count (N) and PAGE_SB_TIME the time window (T).  When more than
 * PAGE_SB_COUNT correctable ereports are seen on a single chip-select
 * within PAGE_SB_TIME, the engine fires and the most recent page is
 * faulted.
 */
#define	PAGE_SB_TIME		24h
#define	PAGE_SB_COUNT		3
78
/* Path of a chip-select; the rules in this file are written as "@CSPATH". */
#define	CSPATH		chip/memory-controller/dram-channel/chip-select
80
/*
 * ADDR_VALID holds when the ereport payload carries an IA32_MCi_ADDR member.
 */
#define	ADDR_VALID	(payloadprop_defined("IA32_MCi_ADDR"))
85
/*
 * CONTAINS_CS holds when the ereport has a "resource" nvlist array and one
 * of its members matches the chip-select path.  The detector of a memory
 * ereport is a cpu rather than the chip-select itself, so propagations use
 * this test to restrict themselves to ereports whose resource list
 * includes the chip-select path of the propagation.
 */
#define	CONTAINS_CS \
	(payloadprop_contains("resource", asru(CSPATH)))
94
/*
 * SET_ADDR calls setpayloadprop() to set "asru-physaddr" to the value of
 * the ereport's IA32_MCi_ADDR payload member.
 */
#define	SET_ADDR \
	(setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))
/*
 * Generic memory ereports, declared against chip/cpu: mem_ce drives the
 * correctable-error rules and mem_ue the uncorrectable-error rules.
 */
event ereport.cpu.generic-x86.mem_ce@chip/cpu
    { within(1s) };
event ereport.cpu.generic-x86.mem_ue@chip/cpu
    { within(1s) };
99
100/*
 *	 ========= Propagations for correctable memory faults ==========
102 *	|								|
103 *	| Discard mem_ce with no resource in the ereport payload.	|
104 *	| Discard mem_ce with no address info - we can't fault the	|
105 *	| corresponding page without it.				|
106 *	|								|
107 *	| For a mem_ce ereport detected by a given chip/cpu (as per	|
108 *	| the payload detector info) whose resource payload member	|
109 *	| includes a chip/memory-controller/dram-channel/chip-select	|
 *	| (CSPATH) for the same chip number, diagnose to a fault event	|
111 *	| associated with a per-CSPATH SERD engine as long as we are	|
112 *	| below the page fault limit for this CSPATH (defined below);	|
113 *	| if we are over that limit then discard the event since we	|
114 *	| will already have faulted a dimm and there is no point in	|
115 *	| continuing to diagnose endless page faults from a dimm with	|
116 *	| something like a pin failure.					|
117 *	|								|
118 *	| When the per-CSPATH SERD engine fires we fault the page	|
119 *	| containing the address included in the ereport that caused	|
120 *	| the trip, and increment a per-CSPATH counter to count page	|
121 *	| faults on that chip-select from repeated correctable errors.	|
122 *	|								|
 *	| A dimm_ce fault is diagnosed when we have faulted an		|
 *	| excessive number of page_ce faults on a chip-select - more	|
 *	| than CS_DIMMSB_THRESH.					|
126 *	|===============================================================|
127 */
128
/*
 * Tests on the per-chip-select page-fault counter stat.cepgflt@CSPATH:
 *   CS_PGFLT_LIMIT_REACHED   - the count is greater than CS_PAGEFLT_MAX
 *   CS_DIMMSB_THRESH_REACHED - the count is at least CS_DIMMSB_THRESH
 */
#define	CS_PGFLT_LIMIT_REACHED \
	(count(stat.cepgflt@CSPATH) > CS_PAGEFLT_MAX)
#define	CS_DIMMSB_THRESH_REACHED \
	(count(stat.cepgflt@CSPATH) >= CS_DIMMSB_THRESH)
132
/*
 * Per-chip-select counter of diagnosed page faults.
 */
engine stat.cepgflt@CSPATH;

/*
 * page_ce: page fault on a chip-select, diagnosed through a per-CSPATH
 * SERD engine (N=PAGE_SB_COUNT, T=PAGE_SB_TIME).  Individual page faults
 * are not messaged (message=0, response=0), and each page fault diagnosis
 * increments stat.cepgflt@CSPATH.
 */
engine serd.memory.generic-x86.page_ce@CSPATH,
    N=PAGE_SB_COUNT,
    T=PAGE_SB_TIME;

event fault.memory.generic-x86.page_ce@CSPATH,
    message=0,
    response=0,
    count=stat.cepgflt@CSPATH,
    engine=serd.memory.generic-x86.page_ce@CSPATH;
/*
 * dimm_ce: dimm fault for a chip-select, with its own per-CSPATH SERD
 * engine (N=PAGE_SB_COUNT, T=PAGE_SB_TIME).
 */
engine serd.memory.generic-x86.dimm_ce@CSPATH,
    N=PAGE_SB_COUNT,
    T=PAGE_SB_TIME;

event fault.memory.generic-x86.dimm_ce@CSPATH,
    engine=serd.memory.generic-x86.dimm_ce@CSPATH;
142
/*
 * page_ce propagates to a mem_ce ereport that carries an address, whose
 * resource list contains this chip-select, and for which the chip-select's
 * page-fault limit has not been exceeded.  SET_ADDR is the final term of
 * the && chain.
 */
prop fault.memory.generic-x86.page_ce@CSPATH {
	ADDR_VALID &&
	CONTAINS_CS &&
	!CS_PGFLT_LIMIT_REACHED &&
	SET_ADDR
    } (1)-> ereport.cpu.generic-x86.mem_ce@chip/cpu;
146
/*
 * dimm_ce propagates to a mem_ce ereport that carries an address and whose
 * resource list contains this chip-select, once the chip-select's
 * page-fault count has reached CS_DIMMSB_THRESH.
 */
prop fault.memory.generic-x86.dimm_ce@CSPATH {
	ADDR_VALID &&
	CONTAINS_CS &&
	CS_DIMMSB_THRESH_REACHED
    } (1)-> ereport.cpu.generic-x86.mem_ce@chip/cpu;
150
/*
 * Discard, by diagnosing to an upset, any mem_ce ereport that has no
 * "resource" payload member or no address.
 */
event upset.memory.generic-x86.discard@chip/cpu;

prop upset.memory.generic-x86.discard@chip/cpu {
	!payloadprop_defined("resource") ||
	!ADDR_VALID
    } (1)-> ereport.cpu.generic-x86.mem_ce@chip/cpu;
155
156/*
 *	 ========= Propagations for uncorrectable page faults ==========
 *	|								|
 *	| A UE produces an immediate page fault.			|
160 *	|===============================================================|
161 */
162
/*
 * Uncorrectable-error faults.  page_ue is not messaged individually
 * (message=0, response=0) and each diagnosis increments
 * stat.cepgflt@CSPATH; dimm_ue is declared with no properties.
 *
 * NOTE(review): page_ue counts into the same stat.cepgflt counter that the
 * correctable-error rules test against their thresholds - confirm that
 * sharing the counter is intended.
 */
event fault.memory.generic-x86.page_ue@CSPATH,
    message=0,
    response=0,
    count=stat.cepgflt@CSPATH;

event fault.memory.generic-x86.dimm_ue@CSPATH;
167
/*
 * page_ue propagates to a mem_ue ereport that carries an address and whose
 * resource list contains this chip-select.  SET_ADDR is the final term of
 * the && chain.
 */
prop fault.memory.generic-x86.page_ue@CSPATH {
	ADDR_VALID &&
	CONTAINS_CS &&
	SET_ADDR
    } (1)-> ereport.cpu.generic-x86.mem_ue@chip/cpu;
171
/*
 * dimm_ue propagates to a mem_ue ereport that carries an address and whose
 * resource list contains this chip-select.
 */
prop fault.memory.generic-x86.dimm_ue@CSPATH {
	ADDR_VALID &&
	CONTAINS_CS
    } (1)-> ereport.cpu.generic-x86.mem_ue@chip/cpu;
175
/*
 * Discard, by diagnosing to an upset, any mem_ue ereport that has no
 * "resource" payload member or no address.
 *
 * NOTE(review): this upset is declared at CSPATH, whereas the equivalent
 * mem_ce discard upset is declared at chip/cpu - confirm the difference
 * is intended.
 */
event upset.memory.generic-x86.discard3@CSPATH;

prop upset.memory.generic-x86.discard3@CSPATH {
	!payloadprop_defined("resource") ||
	!ADDR_VALID
    } (1)-> ereport.cpu.generic-x86.mem_ue@chip/cpu;
180
181/*
 *	 ========= Propagations for GART Table Walk Errors =============
183 *	|								|
184 *	| These are usually due to software mis-programming of the GART	|
185 *	| TLB rather than from hardware errors.  It would be incorrect	|
186 *	| to fault and potentially offline a cpu in response to these	|
187 *	| so they have their own fault class to facilitate us ignoring	|
188 *	| them.								|
189 *	|===============================================================|
190 */
191
/*
 * A gart_tbl_walk ereport is diagnosed to an upset of the same name at
 * chip/cpu, not to a fault.
 */
event ereport.cpu.generic-x86.gart_tbl_walk@chip/cpu
    { within(1s) };
event upset.cpu.generic-x86.gart_tbl_walk@chip/cpu;

prop upset.cpu.generic-x86.gart_tbl_walk@chip/cpu
    (1)-> ereport.cpu.generic-x86.gart_tbl_walk@chip/cpu;
197