/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

fru motherboard;
fru chip;
fru dimm;

asru chip/cpu;
asru dimm;
asru dimm/rank;
asru dram-channel;
asru chip/memory-controller/chip-select;

#define	MAX(x, y) ((x) >= (y) ? (x) : (y))
#define	MIN(x, y) ((x) <= (y) ? (x) : (y))

/*
 * GET_ADDR relies on the fact that variables have global scope across an FME.
 * Thus for each FME the assignment only occurs for the first invocation
 * but the comparison happens on each. Thus if the new address matches the
 * address of an existing open FME, then we return true running in the context
 * of that FME. If the new address doesn't match the address of any existing
 * open FME, then we return true in the context of a newly opened FME.
 */
#define GET_ADDR (defined($addr) ? ($addr == payloadprop("addr")) :	\
	($addr = payloadprop("addr")))
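
/*
 * For illustration, with hypothetical addresses: the first qualifying
 * ereport in an FME finds $addr undefined, so the assignment arm runs and
 * records, say, $addr = 0x12340000 for that FME.  A later ereport carrying
 * addr 0x12340000 satisfies the comparison arm in that FME's context and
 * joins it; one carrying addr 0x56780000 fails the comparison there and so
 * is evaluated, and succeeds, in the context of a newly opened FME.
 */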

#define	GET_OFFSET ($offset = payloadprop("resource[0].hc-specific.offset"))

/*
 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
 * we diagnose for page faults, to record the physical address of the faulting
 * page.  The "asru-" prefix is hooked in the "rewrite-ASRU" confcalls made on
 * diagnosis of associated faults when the libtopo mem scheme rewrites the
 * asru in "mem" scheme.
 */
#define	SET_ADDR (setpayloadprop("asru-physaddr", $addr))

#define	SET_OFFSET (setpayloadprop("asru-offset", $offset))

/*
 * RESOURCE_EXISTS is true if a member with name "resource" exists in the
 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
 */
#define	RESOURCE_EXISTS	(payloadprop_defined("resource"))

/*
 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
 * ereports) exists and one of its members matches the path for the
 * rank node.  Our memory propagations are of the form
 *
 * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/cpu"
 *
 * since cpus detect memory errors;  in eversholt such a propagation, where
 * the lhs path and rhs path do not match, expands to the cross-product of
 * all dimms, ranks and cpus on the same chip (since chip appears in the
 * path on both sides).  We use CONTAINS_RANK to constrain the propagation
 * such that it only happens if the payload resource matches the rank.
 */
#define	CONTAINS_RANK (payloadprop_contains("resource", \
	asru(chip/memory-controller/dimm/rank)))

/*
 * The following will tell us whether a syndrome that is known to be
 * correctable (from a mem_ce ereport) is single-bit or multi-bit.  For a
 * correctable ChipKill syndrome the number of bits set in the lowest
 * nibble indicates how many bits were in error.
 */

#define	CBITMASK(synd) ((synd) & 0xf)

#define	CKSINGLE(synd)							\
	((synd) == 0 ||							\
	(CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 ||		\
	CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))

#define	SINGLE_BIT_CE							\
	(payloadprop("syndrome-type") == "E" ||				\
	(payloadprop("syndrome-type") == "C" &&				\
	CKSINGLE(payloadprop("syndrome"))))

#define	MULTI_BIT_CE							\
	(payloadprop("syndrome-type") == "C" &&				\
	!CKSINGLE(payloadprop("syndrome")))
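
/*
 * Worked example, with hypothetical syndromes: a "C" (ChipKill) syndrome
 * of 0x824 has low nibble 0x4 (one bit set), so CKSINGLE is true and the
 * ereport counts as SINGLE_BIT_CE; a "C" syndrome of 0x823 has low nibble
 * 0x3 (two bits set), so it counts as MULTI_BIT_CE.  A syndrome-type of
 * "E" is always treated as single-bit.
 */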

/*
 * A single bit fault in a memory rank can cause:
 *
 *  - mem_ce : reported by nb
 *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
 *    ic do not record a syndrome; these errors will not be triggered in
 *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
 *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
 *    ECC mode (the NB corrects all ECC in that mode)
 *
 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
 * trips we diagnose a fault.memory.page_sb so that the response agent can
 * retire the page that caused the trip.  If the total number of pages
 * faulted in this way on a single rank exceeds a threshold we will
 * diagnose a fault.memory.dimm_sb against the containing DIMM.
 *
 * Multibit ChipKill-correctable errors are treated identically to
 * single-bit errors, but via separate SERD engines to allow distinct
 * parameters if desired.
 *
 * Uncorrectable errors produce an immediate page fault and corresponding
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is only required when
 * they are accompanied by a dimm fault.  As such we include message=0
 * on page faults.
 */

event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.nb.mem_ce@chip/cpu{within(5s)};

/*
 * If the address is not valid then no resource member will be included
 * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare.
 * Similarly, inf_sys_ecc1 events detected at the ic carry no syndrome and
 * therefore no resource information.  We will discard all such ereports.
 * An alternative may be to SERD them on a per-MC basis and trip if we see
 * too many such events.
 */

event upset.memory.discard1@chip/cpu;

/*								#PAGE#
 * Single-bit correctable errors are diagnosed as upsets and feed into per-rank
 * SERD engines which diagnose fault.memory.page_sb if they trip.
 *
 * Multi-bit correctable (via ChipKill) errors are diagnosed as upsets and feed
 * into additional per-rank SERD engines which diagnose fault.memory.page_ck
 * if they trip.
 *
 * The number of fault.memory.page_sb and fault.memory.page_ck faults
 * diagnosed is counted in stat engines for each type.  These are used in
 * deciding whether to declare a dimm faulty after repeated page faults.
 */

#define PAGE_FIT		1
#define PAGE_SB_COUNT		2
#define PAGE_SB_TIME		72h
#define	PAGE_CK_COUNT		2
#define	PAGE_CK_TIME		72h
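
/*
 * These parameters feed the per-rank SERD engines declared below.
 * Assuming the usual SERD trip rule of "more than N events within time T"
 * (consistent with the N=0 engines used later to trip immediately on
 * uncorrectable errors), N=2/T=72h means an engine trips on the third
 * qualifying correctable error seen against a rank inside 72 hours.
 */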

/*
 * The fraction of pages on a single rank that must be diagnosed as faulty
 * with single-correctable-unit faults before we will fault the rank.
 * Once we have faulted the rank we will continue to diagnose any further page
 * faults on the rank up to some maximum multiple of the threshold at which
 * we faulted the dimm.  This allows us to potentially contain some fairly
 * far-reaching but still limited-extent fault (such as a partial column
 * failure) without getting carried away and allowing a single faulty rank to
 * use up the entire system-imposed page retirement limit (which, once
 * reached, causes retirement requests to have no effect other than to fill
 * the fault manager cache and logs).
 *
 * This fraction is specified in basis points, where 100 basis points are
 * equivalent to 1 percent.  It is applied on a per-rank basis.
 *
 * The system imposes an absolute maximum on the number of pages it will
 * retire;  the current value is 10 basis points, or 0.1% of 'physmem'.  Note
 * that 'physmem' is reduced from installed memory pages by an amount
 * reflecting permanent kernel memory allocations.  This system page retire
 * limit bounds the maximum real response to page faults across all ranks
 * that fault manager response agents can effect, but it should not be confused
 * with any diagnosis threshold (i.e., the number of faulty pages we are
 * prepared to tolerate from a single rank before faulting the rank is
 * distinct from the total number of pages we are prepared to retire from use
 * in response to that and other faults).  It is, however, desirable to
 * arrange that the maximum number of pages we are prepared to fault from
 * any one rank is less than the system-wide quota.
 */
#define	PAGE_RETIRE_LIMIT_BPS	5		/* or 0.05%; ~ 131 pages/GB */

/*
 * A macro to manipulate the above fraction.  Given a size in bytes convert
 * this to pages (4K pagesize) and calculate the number of those pages
 * indicated by PAGE_RETIRE_LIMIT_BPS basis points.
 */
#define	_BPS_PGCNT(totalbytes) \
	((((totalbytes) / 4096) * PAGE_RETIRE_LIMIT_BPS) / 10000)

/*
 * The threshold number of single-correctable-unit page faults on a rank
 * at which we fault the rank.  We insist that this be at least 128 and
 * never more than 512.
 */
#define	RANK_THRESH MIN(512, MAX(128, \
	_BPS_PGCNT(confprop(asru(chip/memory-controller/dimm/rank), "size"))))
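
/*
 * Worked example: a 1GB rank is 1073741824 / 4096 = 262144 pages, of which
 * 5 basis points is (262144 * 5) / 10000 = 131 pages; 131 lies within the
 * [128, 512] clamp, so RANK_THRESH for such a rank is 131 faulted pages.
 * A 512MB rank yields 16, clamped up to 128; a 16GB rank yields 2097,
 * clamped down to 512.
 */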

/*
 * The maximum number of single-correctable-unit page faults we will diagnose
 * on a single rank (must be greater than RANK_THRESH).  We set
 * this at twice the rank fault threshold.
 */
#define	RANK_PGFLT_MAX (2 * RANK_THRESH)

engine stat.sbpgflt@chip/memory-controller/dimm/rank;
engine stat.ckpgflt@chip/memory-controller/dimm/rank;

event fault.memory.page_sb@chip/memory-controller/dimm/rank,
    FITrate=PAGE_FIT, ASRU=dimm/rank, message=0,
    count=stat.sbpgflt@chip/memory-controller/dimm/rank,
    action=confcall("rewrite-ASRU"); /* rewrite ASRU to identify page in rank */

#define	SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank))

event fault.memory.page_ck@chip/memory-controller/dimm/rank,
    FITrate=PAGE_FIT, ASRU=dimm/rank, message=0,
    count=stat.ckpgflt@chip/memory-controller/dimm/rank,
    action=confcall("rewrite-ASRU"); /* rewrite ASRU to identify page in rank */

#define	CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank))

#define	RANK_PGFLT_LIMIT_REACHED \
    (SB_PGFLTS + CK_PGFLTS > RANK_PGFLT_MAX)

event ereport.memory.page_sb_trip@chip/memory-controller/dimm/rank;
engine serd.memory.page_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME, method=persistent,
    trip=ereport.memory.page_sb_trip@chip/memory-controller/dimm/rank;
event upset.memory.page_sb@chip/memory-controller/dimm/rank,
    engine=serd.memory.page_sb@chip/memory-controller/dimm/rank;

event ereport.memory.page_ck_trip@chip/memory-controller/dimm/rank;
engine serd.memory.page_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME, method=persistent,
    trip=ereport.memory.page_ck_trip@chip/memory-controller/dimm/rank;
event upset.memory.page_ck@chip/memory-controller/dimm/rank,
    engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;

event upset.memory.overpgfltlimit@chip/memory-controller/dimm/rank;

/*
 * If we have not reached the per-rank limit on faulted pages then
 * continue to explain ereport observations as upsets which can lead
 * to page fault diagnoses if the serd engine trips.
 */
prop upset.memory.page_sb@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu
	{ CONTAINS_RANK && SINGLE_BIT_CE && !RANK_PGFLT_LIMIT_REACHED },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu
	{ CONTAINS_RANK && SINGLE_BIT_CE && !RANK_PGFLT_LIMIT_REACHED },
    ereport.cpu.amd.nb.mem_ce@chip/cpu
	{ CONTAINS_RANK && SINGLE_BIT_CE && !RANK_PGFLT_LIMIT_REACHED };

prop upset.memory.page_ck@chip/memory-controller/dimm/rank (0)->
    /* no dc.inf_sys_ecc1 or bu.s_ecc1 in ChipKill mode */
    ereport.cpu.amd.nb.mem_ce@chip/cpu
	{ CONTAINS_RANK && MULTI_BIT_CE && !RANK_PGFLT_LIMIT_REACHED };

/*
 * If we have reached the per-rank limit on faulted pages then diagnose
 * further observations on the rank to an engine-less upset (i.e., discard
 * them).
 */
prop upset.memory.overpgfltlimit@chip/memory-controller/dimm/rank (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu
	{ CONTAINS_RANK && RANK_PGFLT_LIMIT_REACHED },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu
	{ CONTAINS_RANK && RANK_PGFLT_LIMIT_REACHED },
    ereport.cpu.amd.nb.mem_ce@chip/cpu
	{ CONTAINS_RANK && RANK_PGFLT_LIMIT_REACHED };

prop fault.memory.page_sb@chip/memory-controller/dimm/rank (1)->
    ereport.memory.page_sb_trip@chip/memory-controller/dimm/rank;

prop fault.memory.page_ck@chip/memory-controller/dimm/rank (1)->
    ereport.memory.page_ck_trip@chip/memory-controller/dimm/rank;

prop fault.memory.page_sb@chip/memory-controller/dimm/rank
    { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu
	{ CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu
	{ CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.nb.mem_ce@chip/cpu
	{ CONTAINS_RANK && GET_ADDR && GET_OFFSET };

prop fault.memory.page_ck@chip/memory-controller/dimm/rank
    { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu
	{ CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu
	{ CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.nb.mem_ce@chip/cpu
	{ CONTAINS_RANK && GET_ADDR && GET_OFFSET };

/*
 * Discard memory ereports that do not indicate a resource.
 */
prop upset.memory.discard1@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.nb.mem_ce@chip/cpu { !RESOURCE_EXISTS };

/*								#DIMM_SCU#
 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of
 * page faults (diagnosed from repeated single-bit or multi-bit ChipKill
 * errors) from any one rank on that DIMM reaches a threshold.  A "correctable
 * unit" is a single bit in normal 64/8 ECC mode, or a single symbol in
 * ChipKill 128/16 mode (i.e., nibble-aligned nibble for the code used on
 * Opteron).
 *
 * We do not stop diagnosing further single-bit page faults once we have
 * declared a single-bit DIMM fault - we continue diagnosing them and
 * response agents can continue to retire those pages up to the system-imposed
 * retirement limit.
 *
 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and
 * fault.memory.dimm_ck.  Which one is diagnosed depends on whether we
 * have reached the threshold for a majority of single-bit page faults or
 * multi-bit page faults.
 *
 * Implementation: we maintain SERD engines parallel to the page_sb and
 * page_ck engines, which trip in unison.  On trip a distinct ereport is
 * generated, which we diagnose to a fault if the threshold has been
 * reached, or to a throwaway upset if not.
 */

#define DIMM_SB_FIT		2000
#define DIMM_CK_FIT		4000

event fault.memory.dimm_sb@chip/memory-controller/dimm/rank,
    FITrate=DIMM_SB_FIT, FRU=dimm, ASRU=dimm,
    action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */

event fault.memory.dimm_ck@chip/memory-controller/dimm/rank,
    FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm,
    action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */

event ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank
	{ within(5s) };
engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME, method=persistent,
    trip=ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank;
event upset.memory.dimm_sb@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank;

event ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank
	{ within(5s) };
engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME, method=persistent,
    trip=ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank;
event upset.memory.dimm_ck@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank;

event upset.memory.discard2@chip/memory-controller/dimm/rank;

prop upset.memory.dimm_sb@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu { CONTAINS_RANK && SINGLE_BIT_CE },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu { CONTAINS_RANK && SINGLE_BIT_CE },
    ereport.cpu.amd.nb.mem_ce@chip/cpu { CONTAINS_RANK && SINGLE_BIT_CE };

prop upset.memory.dimm_ck@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.nb.mem_ce@chip/cpu { CONTAINS_RANK && MULTI_BIT_CE };

/*
 * The following two propagations diagnose a fault.memory.dimm_sb when
 * either the dimm_sb or dimm_ck engine trips (for a new page fault)
 * and the total number of page faults (sb and ck) exceeds the threshold
 * value with the majority being from sb page faults.
 */
prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank (0)->
    ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank
    { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && SB_PGFLTS > RANK_THRESH / 2 };

prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank (0)->
    ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank
    { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && SB_PGFLTS > RANK_THRESH / 2 };

/*
 * The following two propagations diagnose a fault.memory.dimm_ck when
 * either the dimm_sb or dimm_ck engine trips (for a new page fault)
 * and the total number of page faults (sb and ck) exceeds the threshold
 * value with the majority being from ck page faults.
 */
prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank (0)->
    ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank
    { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && CK_PGFLTS > RANK_THRESH / 2 };

prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank (0)->
    ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank
    { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && CK_PGFLTS > RANK_THRESH / 2 };

prop upset.memory.discard2@chip/memory-controller/dimm/rank (1)->
    ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank,
    ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank;

/* 								#DIMM_UE#
 *								#PAGE_UE#
 * An uncorrectable multi-bit fault in a memory dimm can cause:
 *
 *  - mem_ue	   : reported by nb for an access from a remote cpu
 *  - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome
 *  - s_eccm	   : reported by bu
 *
 * Note we use a SERD engine here simply as a way of ensuring that we get
 * both dimm and page faults reported.
 *
 * Since on production systems we force HT Sync Flood on uncorrectable
 * memory errors (if not already set as such by the BIOS, as it should be)
 * we won't actually receive these ereports since the system will be reset.
 */

#define DIMM_UE_FIT		6000

event ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.nb.mem_ue@chip/cpu{within(5s)};

event fault.memory.dimm_ue@chip/memory-controller/dimm/rank,
    FITrate=DIMM_UE_FIT, FRU=dimm, ASRU=dimm,
    action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */

event fault.memory.page_ue@chip/memory-controller/dimm/rank,
    FITrate=PAGE_FIT, ASRU=dimm/rank, message=0,
    action=confcall("rewrite-ASRU"); /* rewrite ASRU to identify page in rank */

event ereport.memory.dimm_ue_trip@chip/memory-controller/dimm/rank;
engine serd.memory.dimm_ue@chip/memory-controller/dimm/rank,
    N=0, T=1h, method=persistent,
    trip=ereport.memory.dimm_ue_trip@chip/memory-controller/dimm/rank;
event upset.memory.dimm_ue@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_ue@chip/memory-controller/dimm/rank;

event ereport.memory.page_ue_trip@chip/memory-controller/dimm/rank;
engine serd.memory.page_ue@chip/memory-controller/dimm/rank,
    N=0, T=1h, method=persistent,
    trip=ereport.memory.page_ue_trip@chip/memory-controller/dimm/rank;
event upset.memory.page_ue@chip/memory-controller/dimm/rank,
    engine=serd.memory.page_ue@chip/memory-controller/dimm/rank;
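
/*
 * N=0 means these engines trip on the very first uncorrectable-error
 * ereport, giving the immediate page fault described above; as noted,
 * the SERD machinery is used here only so that one ereport stream can
 * produce both the dimm_ue and page_ue diagnoses, not to accumulate a
 * count over time.
 */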

event upset.memory.discard3@chip/cpu;

prop upset.memory.page_ue@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.bu.s_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.nb.mem_ue@chip/cpu { CONTAINS_RANK };

prop upset.memory.dimm_ue@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.bu.s_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.nb.mem_ue@chip/cpu { CONTAINS_RANK };

prop fault.memory.page_ue@chip/memory-controller/dimm/rank (1)->
    ereport.memory.page_ue_trip@chip/memory-controller/dimm/rank;

prop fault.memory.page_ue@chip/memory-controller/dimm/rank
    { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu
	{ CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu
	{ CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.bu.s_eccm@chip/cpu
	{ CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.nb.mem_ue@chip/cpu
	{ CONTAINS_RANK && GET_ADDR && GET_OFFSET };

prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank (1)->
    ereport.memory.dimm_ue_trip@chip/memory-controller/dimm/rank;

prop upset.memory.discard3@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.bu.s_eccm@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.nb.mem_ue@chip/cpu { !RESOURCE_EXISTS };

/*								#CSTESTFAIL#
 * If the BIOS fails a chip-select during POST, or perhaps after a
 * sync flood from an uncorrectable error, then on revision F and G it
 * should mark that chip-select as TestFail in the CS Base register.
 * When the memory-controller driver discovers all the MC configuration
 * it notes such failed chip-selects and creates topology nodes for the
 * chip-select and associated dimms and ranks, and produces an ereport for each
 * failed chip-select with detector set to the memory-controller node
 * and resource indicating the failed chip-select.
 */

event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller;

event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank,
    FITrate=1000, ASRU=dimm, FRU=dimm,
    action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */

event error.memory.cs_testfail@chip/memory-controller/chip-select;

#define	CONTAINS_CS (payloadprop_contains("resource", \
	asru(chip/memory-controller/chip-select)))

prop error.memory.cs_testfail@chip/memory-controller/chip-select ->
    ereport.cpu.amd.mc.cs_testfail@chip/memory-controller
    { CONTAINS_CS };

#define CSMATCH(s) \
	(confprop_defined(asru(chip/memory-controller/chip-select), s) && \
	confprop(asru(chip/memory-controller/chip-select), s) == \
	confprop(asru(chip/memory-controller/dimm/rank), "csname"))
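
/*
 * CSMATCH(s) is true if the chip-select node has a configuration property
 * named s (e.g., "dimm1-csname") whose value equals the "csname" property
 * of the rank being evaluated.  The propagation below uses this to expand
 * a cs_testfail error on a chip-select onto just those dimm/rank nodes
 * that the memory-controller driver associated with that chip-select.
 */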

prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank ->
    error.memory.cs_testfail@chip/memory-controller/chip-select
    { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname") };

/*								#ADDRPAR#
 * DRAM Command/Address Parity Errors.
 *
 *  - dramaddr_par : reported by the nb; the NB status register includes
 *    a bit indicating which dram controller channel (A or B) experienced
 *    the error.
 */

event ereport.cpu.amd.nb.dramaddr_par@chip/cpu;

event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel,
    FITrate=1000, ASRU=dram-channel;

#define GET_CHANNEL ($chan = (payloadprop("bank-status") >> 32 & 0x200) ? \
    1 : 0)
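
/*
 * (payloadprop("bank-status") >> 32 & 0x200) tests bit 41 of the 64-bit
 * NB status value carried in the ereport payload; per the comment above,
 * this is the bit identifying the errant dram controller channel.  $chan
 * becomes 1 if the bit is set and 0 otherwise, and the propagation below
 * diagnoses the fault against the dram-channel node whose instance number
 * y matches $chan.
 */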

prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)->
    ereport.cpu.amd.nb.dramaddr_par@chip/cpu { GET_CHANNEL && $chan == y };

/*
 * l2 cache data errors.
 */

#define L2CACHEDATA_FIT		1000
#define L2CACHEDATA_SB_COUNT	3
#define L2CACHEDATA_SB_TIME	12h

event fault.cpu.amd.l2cachedata@chip/cpu, FITrate=L2CACHEDATA_FIT,
	FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.l2cachedata_sb@chip/cpu;
event error.cpu.amd.l2cachedata_mb@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (1)->
    error.cpu.amd.l2cachedata_sb@chip/cpu,
    error.cpu.amd.l2cachedata_mb@chip/cpu;

/* 								#L2D_SINGLE#
 * A single bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_ecc1 : reported by ic on this cpu
 *  - inf_l2_ecc1 : reported by dc on this cpu
 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
 *
 * Single-bit errors are diagnosed to cache upsets.  SERD engines are used
 * to count upsets resulting from CEs.
 */

event ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2d_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.l2d_sb_trip@chip/cpu;

engine serd.cpu.amd.l2d_sb@chip/cpu,
    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2d_sb_trip@chip/cpu;

event upset.cpu.amd.l2d_sb@chip/cpu,
	engine=serd.cpu.amd.l2d_sb@chip/cpu;

prop upset.cpu.amd.l2d_sb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

prop error.cpu.amd.l2cachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.l2d_sb_trip@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

/* 								#L2D_MULTI#
 * A multi-bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_eccm : reported by ic on this cpu
 *  - inf_l2_eccm : reported by dc on this cpu
 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
 */

event ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu;
event ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu;
event ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

prop error.cpu.amd.l2cachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

/*
 * l2 cache main tag errors
 */

#define L2CACHETAG_FIT		1000
#define L2CACHETAG_SB_COUNT	3
#define L2CACHETAG_SB_TIME	12h

event fault.cpu.amd.l2cachetag@chip/cpu, FITrate=L2CACHETAG_FIT,
	FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.l2cachetag_sb@chip/cpu;
event error.cpu.amd.l2cachetag_mb@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (1)->
    error.cpu.amd.l2cachetag_sb@chip/cpu,
    error.cpu.amd.l2cachetag_mb@chip/cpu;

/* 								#L2T_SINGLE#
 * A single bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 *
 * Note that the bu.l2t_par ereport could be due to a single-bit or multi-bit
 * event.  If the l2t_sb_trip has already triggered it will be treated as
 * another CE, otherwise it will be treated as a UE event.
 */

event ereport.cpu.amd.bu.l2t_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/cpu;
event ereport.cpu.amd.l2t_sb_trip@chip/cpu;

engine serd.cpu.amd.l2t_sb@chip/cpu,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2t_sb_trip@chip/cpu;

event upset.cpu.amd.l2t_sb@chip/cpu,
	engine=serd.cpu.amd.l2t_sb@chip/cpu;

prop upset.cpu.amd.l2t_sb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop error.cpu.amd.l2cachetag_sb@chip/cpu (1)->
    ereport.cpu.amd.l2t_sb_trip@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* 								#L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/cpu;

prop error.cpu.amd.l2cachetag_mb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* 								#ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 */

#define ICACHEDATA_FIT		1000
#define ICACHEDATA_SB_COUNT	2
#define ICACHEDATA_SB_TIME	168h

event ereport.cpu.amd.ic.data_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_dp_trip@chip/cpu;

event fault.cpu.amd.icachedata@chip/cpu, FITrate=ICACHEDATA_FIT,
	FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachedata@chip/cpu,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_dp_trip@chip/cpu;

event upset.cpu.amd.icachedata@chip/cpu,
	engine=serd.cpu.amd.icachedata@chip/cpu;

prop upset.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic_dp_trip@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

/* 								#ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 */

#define ICACHETAG_FIT		1000
#define ICACHETAG_SB_COUNT	2
#define ICACHETAG_SB_TIME	168h

event ereport.cpu.amd.ic.tag_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_tp_trip@chip/cpu;

event fault.cpu.amd.icachetag@chip/cpu, FITrate=ICACHETAG_FIT,
	FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachetag@chip/cpu,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_tp_trip@chip/cpu;

event upset.cpu.amd.icachetag@chip/cpu,
	engine=serd.cpu.amd.icachetag@chip/cpu;

prop upset.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic_tp_trip@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (0)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

/* 								#ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 */

#define ICACHESTAG_FIT		1000

event ereport.cpu.amd.ic.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.icachestag@chip/cpu, FITrate=ICACHESTAG_FIT,
	FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.icachestag@chip/cpu (1)->
    ereport.cpu.amd.ic.stag_par@chip/cpu;

/* 								#ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 */

#define ICACHEL1TLB_FIT		1000
#define ICACHEL1TLB_SB_COUNT	2
#define ICACHEL1TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l1tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event fault.cpu.amd.l1itlb@chip/cpu, FITrate=ICACHEL1TLB_FIT,
	FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l1itlb@chip/cpu,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event upset.cpu.amd.l1itlb@chip/cpu,
	engine=serd.cpu.amd.l1itlb@chip/cpu;

prop upset.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

/* 								#ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 */

#define ICACHEL2TLB_FIT		1000
#define ICACHEL2TLB_SB_COUNT	2
#define ICACHEL2TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l2tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event fault.cpu.amd.l2itlb@chip/cpu, FITrate=ICACHEL2TLB_FIT,
	FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l2itlb@chip/cpu,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event upset.cpu.amd.l2itlb@chip/cpu,
	engine=serd.cpu.amd.l2itlb@chip/cpu;

prop upset.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

/*
 * dcache data errors
 */

#define DCACHEDATA_FIT		1000
#define DCACHEDATA_SB_COUNT	2
#define DCACHEDATA_SB_TIME	168h

event fault.cpu.amd.dcachedata@chip/cpu, FITrate=DCACHEDATA_FIT,
	FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.dcachedata_sb@chip/cpu;
event error.cpu.amd.dcachedata_mb@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (1)->
    error.cpu.amd.dcachedata_sb@chip/cpu,
    error.cpu.amd.dcachedata_mb@chip/cpu;

/* 								#DCD_SINGLE#
 * A single bit data array fault in a D cache can cause:
 *
 *  - data_ecc1 : reported by dc on this cpu by scrubber
 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic.
 */

event ereport.cpu.amd.dc.data_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu{within(5s)};
event ereport.cpu.amd.dc_sb_trip@chip/cpu;

engine serd.cpu.amd.dc_sb@chip/cpu,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

engine serd.cpu.amd.dc_sb_uc@chip/cpu,
    N=0, T=1hr, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

event upset.cpu.amd.dc_sb@chip/cpu,
	engine=serd.cpu.amd.dc_sb@chip/cpu;

event upset.cpu.amd.dc_sb_uc@chip/cpu,
	engine=serd.cpu.amd.dc_sb_uc@chip/cpu;

prop upset.cpu.amd.dc_sb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu;

prop upset.cpu.amd.dc_sb_uc@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

prop error.cpu.amd.dcachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.dc_sb_trip@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

/* 								#DCD_MULTI#
 * A multi-bit data array fault in a D cache can cause:
 *
 *  - data_eccm : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop error.cpu.amd.dcachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

/* 								#DCT_PAR#
 * A tag array parity fault in a D cache can cause:
 *
 *  - tag_par : reported by dc on this cpu
 */

#define DCACHETAG_FIT		1000

event ereport.cpu.amd.dc.tag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachetag@chip/cpu, FITrate=DCACHETAG_FIT,
	FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachetag@chip/cpu (1)->
    ereport.cpu.amd.dc.tag_par@chip/cpu;

/* 								#DCT_SNOOP#
 * A snoop tag array parity fault in a D cache can cause:
 *
 *  - stag_par : reported by dc on this cpu
 */

#define DCACHESTAG_FIT		1000

event ereport.cpu.amd.dc.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachestag@chip/cpu, FITrate=DCACHESTAG_FIT,
	FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachestag@chip/cpu (1)->
    ereport.cpu.amd.dc.stag_par@chip/cpu;

/* 								#DCTLB_1#
 * An l1tlb parity fault in a D cache can cause:
 *
 *  - l1tlb_par : reported by dc on this cpu
 */

#define L1DTLB_FIT		1000

event ereport.cpu.amd.dc.l1tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l1dtlb@chip/cpu, FITrate=L1DTLB_FIT,
	FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l1dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l1tlb_par@chip/cpu;

/* 								#DCTLB_2#
 * An l2tlb parity fault in a D cache can cause:
 *
 *  - l2tlb_par : reported by dc on this cpu
 */

#define L2DTLB_FIT		1000

event ereport.cpu.amd.dc.l2tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l2dtlb@chip/cpu, FITrate=L2DTLB_FIT,
	FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l2dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l2tlb_par@chip/cpu;

/*								#MISC#
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do.  These fall into a few categories:
 *
 *	- the corresponding detector is not enabled, typically because
 *	  detection/handling of the event is taking place elsewhere
 *	  (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *	- the event is associated with a sync flood so even if the detector is
 *	  enabled we will never handle the event and generate an ereport *and*
 *	  even if the ereport did arrive we could perform no useful diagnosis
 *	  e.g., the NB can be configured for sync flood on nb.mem_eccm
 *	  but we don't choose to discard that ereport here since we could have
 *	  made a useful diagnosis from it had it been delivered
 *	  (nb.ht_sync, nb.ht_crc)
 *	- events that will be accompanied by an immediate panic and
 *	  delivery of the ereport during subsequent reboot but from
 *	  which no useful diagnosis can be made (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection.  We will perform a null diagnosis of all these ereports in order
 * to avoid "no subscription" complaints during test harness runs.
 */

event ereport.cpu.amd.nb.ma@cpu;
event ereport.cpu.amd.nb.ta@cpu;
event ereport.cpu.amd.ls.s_rde@cpu;
event ereport.cpu.amd.ic.rdde@cpu;
event ereport.cpu.amd.bu.s_rde@cpu;
event ereport.cpu.amd.nb.gart_walk@cpu;
event ereport.cpu.amd.nb.ht_sync@cpu;
event ereport.cpu.amd.nb.ht_crc@cpu;
event ereport.cpu.amd.nb.rmw@cpu;
event ereport.cpu.amd.nb.wdog@cpu;
event ereport.cpu.amd.unknown@cpu;

event upset.null_diag@cpu;

prop upset.null_diag@cpu (1)->
    ereport.cpu.amd.nb.ma@cpu,
    ereport.cpu.amd.nb.ta@cpu,
    ereport.cpu.amd.ls.s_rde@cpu,
    ereport.cpu.amd.ic.rdde@cpu,
    ereport.cpu.amd.bu.s_rde@cpu,
    ereport.cpu.amd.nb.gart_walk@cpu,
    ereport.cpu.amd.nb.ht_sync@cpu,
    ereport.cpu.amd.nb.ht_crc@cpu,
    ereport.cpu.amd.nb.rmw@cpu,
    ereport.cpu.amd.nb.wdog@cpu,
    ereport.cpu.amd.unknown@cpu;