xref: /illumos-gate/usr/src/cmd/fm/eversholt/files/i386/i86pc/amd64.esc (revision d6bb6a8465e557cb946ef49d56ed3202f6218652)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#pragma dictionary "AMD"
30
31/*
32 * Eversholt rules for the AMD Opteron CPU/Memory
33 */
34
35fru dimm;
36asru dimm;
37
38fru chip;
39asru chip/cpu;
40
41
42/*								#MEM#
43 * GET_ADDR relies on the fact that variables have global scope across an FME.
44 * Thus for each FME the assignment only occurs for the first invocation
45 * but the comparison happens on each. Thus if the new address matches the
46 * address of an existing open FME, then we return true running in the context
47 * of that FME. If the new address doesn't match the address of any existing
48 * open FME, then we return true in the context of a newly opened FME.
49 */
50#define GET_ADDR (defined($addr) ? ($addr == payloadprop("addr")) :	\
51	($addr = payloadprop("addr")))
52
53#define	GET_OFFSET ($offset = payloadprop("resource[0].hc-specific.offset"))
54
55/*
56 * SET_ADDR is used to set a payload value in the fault that we diagnose
57 * for page faults, to record the physical address of the faulting page.
58 */
59#define	SET_ADDR (setpayloadprop("asru-physaddr", $addr))
60
61#define	SET_OFFSET (setpayloadprop("asru-offset", $offset))
62
63/*
64 * RESOURCE_EXISTS is true if a pair with name "resource" exists in the
65 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
66 */
67#define	RESOURCE_EXISTS	(payloadprop_defined("resource"))
68
69/*
70 * CONTAINS_DIMM is true if the "resource" nvlist array (as used in memory
71 * ereports) exists and one of its members matches the path for the
72 * dimm node.  Our memory propagations are of the form "foo@dimm -> blah@cpu"
73 * since cpus detect memory errors;  in eversholt such a propagation, where
74 * the lhs path and rhs path do not match, expands to the cross-product of
75 * all dimms and cpus in the system.  We use CONTAINS_DIMM to constrain
76 * the propagation such that it only happens if the payload resource
77 * matches the dimm.
78 */
79#define	CONTAINS_DIMM (payloadprop_contains("resource", asru(dimm)))
80
81/*
82 * The following will tell us whether a syndrome that is known to be
83 * correctable (from a mem_ecc1) is single-bit or multi-bit.  For a
84 * correctable ChipKill syndrome the number of bits set in the lowest
85 * nibble indicates how many bits were in error.
86 */
87
88#define	CBITMASK(synd) ((synd) & 0xf)
89
90#define	CKSINGLE(synd)							\
91	((synd) == 0 ||							\
92	(CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 ||		\
93	CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))
94
95#define	SINGLE_BIT_CE							\
96	(payloadprop("syndrome-type") == "E" ||				\
97	(payloadprop("syndrome-type") == "C" &&				\
98	CKSINGLE(payloadprop("syndrome"))))
99
100#define	MULTI_BIT_CE							\
101	(payloadprop("syndrome-type") == "C" &&				\
102	!CKSINGLE(payloadprop("syndrome")))
103
104/*
105 * A single bit fault in a memory dimm can cause:
106 *
107 *  - mem_ce : reported by nb for an access from a remote cpu
108 *
109 * Single-bit errors are fed into a per-DIMM SERD engine; if a SERD engine
110 * trips we diagnose a fault.memory.page so that the response agent can
111 * retire the page that caused the trip.  If the total number of pages
112 * faulted in this way on a single DIMM exceeds a threshold we will
113 * diagnose a fault.memory.dimm_sb against the DIMM.
114 *
115 * Multibit ChipKill-correctable errors produce an immediate page fault.
116 * This is achieved through SERD engines using N=0 so the facility is there
117 * to be a little more tolerant of these errors in future.
118 *
119 * Uncorrectable errors produce an immediate page fault and corresponding
120 * fault.memory.dimm_ue.
121 *
122 * Page faults are essentially internal - action is only required when
123 * they are accompanied by a dimm fault.  As such we include message=0
124 * on DIMM faults.
125 */
126
127event ereport.cpu.amd.nb.mem_ce@cpu;
128
129/*
130 * If the address is not valid then no resource member will be included
131 * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare.
132 * We will discard such ereports.  An alternative may be to SERD them
133 * on a per MC basis and trip if we see too many such events.
134 */
135
136event upset.memory.discard@cpu;
137
138/*								#PAGE#
139 * Page faults of all types diagnose to a single fault class and are
140 * counted with a stat.
141 *
142 * Single-bit errors are diagnosed as upsets and feed into per-DIMM
143 * SERD engines which diagnose fault.memory.page if they trip.
144 */
145
146#define PAGE_FIT		1
147#define PAGE_SB_COUNT		2
148#define PAGE_SB_TIME		72h
149#define	PAGE_CK_COUNT		0
150#define	PAGE_CK_TIME		1h
151
152engine stat.page_fault@dimm;
153event fault.memory.page@dimm, FITrate=PAGE_FIT,
154    ASRU=dimm, message=0, count=stat.page_fault@dimm,
155    action=confcall("rewrite-ASRU");
156event error.memory.page_sb@dimm;
157event error.memory.page_ck@dimm;
158event error.memory.page_ue@dimm;
159
160prop fault.memory.page@dimm (1)->
161    error.memory.page_sb@dimm,
162    error.memory.page_ck@dimm,
163    error.memory.page_ue@dimm;
164
165event ereport.memory.page_sb_trip@dimm;
166engine serd.memory.page_sb@dimm, N=PAGE_SB_COUNT, T=PAGE_SB_TIME,
167    method=persistent, trip=ereport.memory.page_sb_trip@dimm;
168event upset.memory.page_sb@dimm, engine=serd.memory.page_sb@dimm;
169
170event ereport.memory.page_ck_trip@dimm;
171engine serd.memory.page_ck@dimm, N=PAGE_CK_COUNT, T=PAGE_CK_TIME,
172    method=persistent, trip=ereport.memory.page_ck_trip@dimm;
173event upset.memory.page_ck@dimm, engine=serd.memory.page_ck@dimm;
174
175prop upset.memory.page_sb@dimm (0)->
176    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && SINGLE_BIT_CE };
177
178prop upset.memory.page_ck@dimm (0)->
179    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE };
180
181prop error.memory.page_sb@dimm (1)->
182    ereport.memory.page_sb_trip@dimm;
183
184prop error.memory.page_ck@dimm (1)->
185    ereport.memory.page_ck_trip@dimm;
186
187prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
188    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && GET_ADDR && GET_OFFSET };
189
190prop upset.memory.discard@cpu (1)->
191    ereport.cpu.amd.nb.mem_ce@cpu { !RESOURCE_EXISTS };
192
193/*								#DIMM_SB#
194 * Single-bit DIMM faults are diagnosed when the number of page faults
195 * (of all types since they all are counted in a single per-DIMM stat engine)
196 * reaches a threshold.  Since our tolerance of ChipKill and UE faults
197 * is much lower than that for single-bit errors the threshold will only be
198 * reached for repeated single-bit page faults.  We do not stop diagnosing
199 * further single-bit page faults once we have declared a single-bit DIMM
200 * fault - we continue diagnosing them and response agents can continue to
201 * retire those pages up to the system-imposed retirement limit.
202 *
203 * We maintain a parallel SERD engine to the page_sb engine which trips
204 * in unison, but on trip it generates a distinct ereport which we
205 * diagnose to a dimm_sb fault if the threshold has been reached, or
206 * to a throwaway upset if not.
207 */
208
209#define DIMM_SB_FIT		2000
210#define DIMM_SB_THRESH		128
211
212event fault.memory.dimm_sb@dimm, FITrate=DIMM_SB_FIT, FRU=dimm, ASRU=dimm,
213    action=confcall("rewrite-ASRU");
214
215event ereport.memory.dimm_sb_trip@dimm;
216event upset.memory.discard@dimm;
217engine serd.memory.dimm_sb@dimm, N=PAGE_SB_COUNT, T=PAGE_SB_TIME,
218    method=persistent, trip=ereport.memory.dimm_sb_trip@dimm;
219event upset.memory.dimm_sb@dimm, engine=serd.memory.dimm_sb@dimm;
220
221prop upset.memory.dimm_sb@dimm (0)->
222    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM };	/* sb and ck */
223
224prop upset.memory.discard@dimm (1)->
225    ereport.memory.dimm_sb_trip@dimm;
226
227prop fault.memory.dimm_sb@dimm (0)->
228    ereport.memory.dimm_sb_trip@dimm {
229	count(stat.page_fault@dimm) >= DIMM_SB_THRESH };
230
231/*								#DIMM_CK#
232 * ChipKill-correctable multi-bit errors produce immediate page faults.
233 * If the fault is indeed isolated to just a few cells then we have contained
234 * the error;  if not, say if the SDRAM device is failing, then we will hit a
235 * number of other similar errors in a short space of time.  Thus we will
236 * SERD these in diagnosing a fault.memory.dimm_ck and not simply fault
237 * the DIMM at the first instance.
238 */
239
240#define DIMM_CK_FIT		4000
241#define	DIMM_CK_COUNT		2
242#define	DIMM_CK_TIME		72h
243
244event fault.memory.dimm_ck@dimm, FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm,
245    action=confcall("rewrite-ASRU");
246
247event ereport.memory.dimm_ck_trip@dimm;
248engine serd.memory.dimm_ck@dimm, N=DIMM_CK_COUNT, T=DIMM_CK_TIME,
249    method=persistent, trip=ereport.memory.dimm_ck_trip@dimm;
250event upset.memory.dimm_ck@dimm, engine=serd.memory.dimm_ck@dimm;
251
252prop upset.memory.dimm_ck@dimm (0)->
253    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE };
254
255prop fault.memory.dimm_ck@dimm (1)->
256    ereport.memory.dimm_ck_trip@dimm;
257
258prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
259    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE &&
260    GET_ADDR && GET_OFFSET };
261
262/* 								#DIMM_UE#
263 * A multi-bit fault in a memory dimm can cause:
264 *
265 *  - ue    : reported by nb for an access from a remote cpu
266 *
267 * Note we use a SERD engine here simply as a way of ensuring that we get
268 * both dimm and page faults reported
269 */
270
271#define DIMM_UE_FIT		6000
272
273event ereport.cpu.amd.nb.mem_ue@cpu;
274event ereport.memory.page_ue_trip@dimm;
275event ereport.memory.dimm_ue_trip@dimm;
276event fault.memory.dimm_ue@dimm, FITrate=DIMM_UE_FIT, FRU=dimm, ASRU=dimm,
277    action=confcall("rewrite-ASRU");
278event upset.memory.page_ue@dimm, engine=serd.memory.page_ue@dimm;
279event upset.memory.dimm_ue@dimm, engine=serd.memory.dimm_ue@dimm;
280
281engine serd.memory.dimm_ue@dimm, N=0, T=1h,
282    method=persistent, trip=ereport.memory.dimm_ue_trip@dimm;
283
284engine serd.memory.page_ue@dimm, N=0, T=1h,
285    method=persistent, trip=ereport.memory.page_ue_trip@dimm;
286
287prop upset.memory.page_ue@dimm (0)->
288    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM };
289
290prop upset.memory.dimm_ue@dimm (0)->
291    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM };
292
293prop error.memory.page_ue@dimm (1)->
294    ereport.memory.page_ue_trip@dimm;
295
296prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->	/* ue page */
297    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM && GET_ADDR && GET_OFFSET };
298
299prop fault.memory.dimm_ue@dimm (1)->
300    ereport.memory.dimm_ue_trip@dimm;
301
302prop upset.memory.discard@cpu (1)->
303    ereport.cpu.amd.nb.mem_ue@cpu { !RESOURCE_EXISTS };	/* no resource */
304
305/*								#L2D#
306 * l2 cache data errors.
307 */
308
309#define L2CACHEDATA_FIT		1000
310#define L2CACHEDATA_SB_COUNT	3
311#define L2CACHEDATA_SB_TIME	12h
312
313event fault.cpu.amd.l2cachedata@chip/cpu, FITrate=L2CACHEDATA_FIT,
314	FRU=chip, ASRU=chip/cpu;
315event error.cpu.amd.l2cachedata_sb@chip/cpu;
316event error.cpu.amd.l2cachedata_mb@chip/cpu;
317
318prop fault.cpu.amd.l2cachedata@chip/cpu (1)->
319    error.cpu.amd.l2cachedata_sb@chip/cpu,
320    error.cpu.amd.l2cachedata_mb@chip/cpu;
321
322/* 								#L2D_SINGLE#
323 * A single bit data array fault in an l2 cache can cause:
324 *
325 *  - inf_l2_ecc1 : reported by ic on this cpu
326 *  - inf_l2_ecc1 : reported by dc on this cpu
327 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
328 *
329 * Single-bit errors are diagnosed to cache upsets.  SERD engines are used
330 * to count upsets resulting from CEs.
331 */
332
333event ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu{within(5s)};
334event ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu{within(5s)};
335event ereport.cpu.amd.bu.l2d_ecc1@chip/cpu{within(5s)};
336event ereport.cpu.amd.l2d_sb_trip@chip/cpu;
337
338engine serd.cpu.amd.l2d_sb@chip/cpu,
339    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME, method=persistent,
340    trip=ereport.cpu.amd.l2d_sb_trip@chip/cpu;
341
342event upset.cpu.amd.l2d_sb@chip/cpu,
343	engine=serd.cpu.amd.l2d_sb@chip/cpu;
344
345prop upset.cpu.amd.l2d_sb@chip/cpu (1)->
346    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
347    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
348    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;
349
350prop error.cpu.amd.l2cachedata_sb@chip/cpu (1)->
351    ereport.cpu.amd.l2d_sb_trip@chip/cpu;
352
353prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
354    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
355    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
356    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;
357
358/* 								#L2D_MULTI#
359 * A multi-bit data array fault in an l2 cache can cause:
360 *
361 *  - inf_l2_eccm : reported by ic on this cpu
362 *  - inf_l2_eccm : reported by dc on this cpu
363 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
364 */
365
366event ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu;
367event ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu;
368event ereport.cpu.amd.bu.l2d_eccm@chip/cpu;
369
370prop error.cpu.amd.l2cachedata_mb@chip/cpu (1)->
371    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
372    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
373    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;
374
375prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
376    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
377    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
378    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;
379
380/*								#L2T#
381 * l2 cache main tag errors
382 */
383
384#define L2CACHETAG_FIT		1000
385#define L2CACHETAG_SB_COUNT	3
386#define L2CACHETAG_SB_TIME	12h
387
388event fault.cpu.amd.l2cachetag@chip/cpu, FITrate=L2CACHETAG_FIT,
389	FRU=chip, ASRU=chip/cpu;
390event error.cpu.amd.l2cachetag_sb@chip/cpu;
391event error.cpu.amd.l2cachetag_mb@chip/cpu;
392
393prop fault.cpu.amd.l2cachetag@chip/cpu (1)->
394    error.cpu.amd.l2cachetag_sb@chip/cpu,
395    error.cpu.amd.l2cachetag_mb@chip/cpu;
396
397/* 								#L2T_SINGLE#
398 * A single bit tag array fault in an l2 cache can cause:
399 *
400 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
401 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
402 *
403 * Note that the bu.l2t_par ereport could be due to a single bit or multi bit
404 * event. If the l2t_sb_trip has already triggered it will be treated as another
405 * ce, otherwise it will be treated as a ue event.
406 */
407
408event ereport.cpu.amd.bu.l2t_ecc1@chip/cpu{within(5s)};
409event ereport.cpu.amd.bu.l2t_par@chip/cpu;
410event ereport.cpu.amd.l2t_sb_trip@chip/cpu;
411
412engine serd.cpu.amd.l2t_sb@chip/cpu,
413    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME, method=persistent,
414    trip=ereport.cpu.amd.l2t_sb_trip@chip/cpu;
415
416event upset.cpu.amd.l2t_sb@chip/cpu,
417	engine=serd.cpu.amd.l2t_sb@chip/cpu;
418
419prop upset.cpu.amd.l2t_sb@chip/cpu (1)->
420    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
421    ereport.cpu.amd.bu.l2t_par@chip/cpu;
422
423prop error.cpu.amd.l2cachetag_sb@chip/cpu (1)->
424    ereport.cpu.amd.l2t_sb_trip@chip/cpu;
425
426prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
427    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
428    ereport.cpu.amd.bu.l2t_par@chip/cpu;
429
430/* 								#L2T_MULTI#
431 * A multi-bit tag array fault in an l2 cache can cause:
432 *
433 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
434 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
435 */
436
437event ereport.cpu.amd.bu.l2t_eccm@chip/cpu;
438
439prop error.cpu.amd.l2cachetag_mb@chip/cpu (1)->
440    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
441    ereport.cpu.amd.bu.l2t_par@chip/cpu;
442
443prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
444    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
445    ereport.cpu.amd.bu.l2t_par@chip/cpu;
446
447/* 								#ICD_PAR#
448 * A data array parity fault in an I cache can cause:
449 *
450 *  - data_par : reported by ic on this cpu
451 */
452
453#define ICACHEDATA_FIT		1000
454#define ICACHEDATA_SB_COUNT	2
455#define ICACHEDATA_SB_TIME	168h
456
457event ereport.cpu.amd.ic.data_par@chip/cpu{within(5s)};
458event ereport.cpu.amd.ic_dp_trip@chip/cpu;
459
460event fault.cpu.amd.icachedata@chip/cpu, FITrate=ICACHEDATA_FIT,
461	FRU=chip, ASRU=chip/cpu;
462
463engine serd.cpu.amd.icachedata@chip/cpu,
464    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME, method=persistent,
465    trip=ereport.cpu.amd.ic_dp_trip@chip/cpu;
466
467event upset.cpu.amd.icachedata@chip/cpu,
468	engine=serd.cpu.amd.icachedata@chip/cpu;
469
470prop upset.cpu.amd.icachedata@chip/cpu (1)->
471    ereport.cpu.amd.ic.data_par@chip/cpu;
472
473prop fault.cpu.amd.icachedata@chip/cpu (1)->
474    ereport.cpu.amd.ic_dp_trip@chip/cpu;
475
476prop fault.cpu.amd.icachedata@chip/cpu (0)->
477    ereport.cpu.amd.ic.data_par@chip/cpu;
478
479/* 								#ICT_PAR#
480 * A tag array parity fault in an I cache can cause:
481 *
482 *  - tag_par : reported by ic on this cpu
483 */
484
485#define ICACHETAG_FIT		1000
486#define ICACHETAG_SB_COUNT	2
487#define ICACHETAG_SB_TIME	168h
488
489event ereport.cpu.amd.ic.tag_par@chip/cpu{within(5s)};
490event ereport.cpu.amd.ic_tp_trip@chip/cpu;
491
492event fault.cpu.amd.icachetag@chip/cpu, FITrate=ICACHETAG_FIT,
493	FRU=chip, ASRU=chip/cpu;
494
495engine serd.cpu.amd.icachetag@chip/cpu,
496    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME, method=persistent,
497    trip=ereport.cpu.amd.ic_tp_trip@chip/cpu;
498
499event upset.cpu.amd.icachetag@chip/cpu,
500	engine=serd.cpu.amd.icachetag@chip/cpu;
501
502prop upset.cpu.amd.icachetag@chip/cpu (1)->
503    ereport.cpu.amd.ic.tag_par@chip/cpu;
504
505prop fault.cpu.amd.icachetag@chip/cpu (1)->
506    ereport.cpu.amd.ic_tp_trip@chip/cpu;
507
508prop fault.cpu.amd.icachetag@chip/cpu (0)->
509    ereport.cpu.amd.ic.tag_par@chip/cpu;
510
511/* 								#ICT_SNOOP#
512 * A snoop tag array parity fault in an I cache can cause:
513 *
514 *  - stag_par : reported by ic on this cpu
515 */
516
517#define ICACHESTAG_FIT		1000
518
519event ereport.cpu.amd.ic.stag_par@chip/cpu{within(5s)};
520
521event fault.cpu.amd.icachestag@chip/cpu, FITrate=ICACHESTAG_FIT,
522	FRU=chip, ASRU=chip/cpu;
523
524prop fault.cpu.amd.icachestag@chip/cpu (1)->
525    ereport.cpu.amd.ic.stag_par@chip/cpu;
526
527/* 								#ICTLB_1#
528 * An l1tlb parity fault in an I cache can cause:
529 *
530 *  - l1tlb_par : reported by ic on this cpu
531 */
532
533#define ICACHEL1TLB_FIT		1000
534#define ICACHEL1TLB_SB_COUNT	2
535#define ICACHEL1TLB_SB_TIME	168h
536
537event ereport.cpu.amd.ic.l1tlb_par@chip/cpu{within(5s)};
538event ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;
539
540event fault.cpu.amd.l1itlb@chip/cpu, FITrate=ICACHEL1TLB_FIT,
541	FRU=chip, ASRU=chip/cpu;
542
543engine serd.cpu.amd.l1itlb@chip/cpu,
544    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME, method=persistent,
545    trip=ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;
546
547event upset.cpu.amd.l1itlb@chip/cpu,
548	engine=serd.cpu.amd.l1itlb@chip/cpu;
549
550prop upset.cpu.amd.l1itlb@chip/cpu (1)->
551    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;
552
553prop fault.cpu.amd.l1itlb@chip/cpu (1)->
554    ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;
555
556prop fault.cpu.amd.l1itlb@chip/cpu (0)->
557    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;
558
559/* 								#ICTLB_2#
560 * An l2tlb parity fault in an I cache can cause:
561 *
562 *  - l2tlb_par : reported by ic on this cpu
563 */
564
565#define ICACHEL2TLB_FIT		1000
566#define ICACHEL2TLB_SB_COUNT	2
567#define ICACHEL2TLB_SB_TIME	168h
568
569event ereport.cpu.amd.ic.l2tlb_par@chip/cpu{within(5s)};
570event ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;
571
572event fault.cpu.amd.l2itlb@chip/cpu, FITrate=ICACHEL2TLB_FIT,
573	FRU=chip, ASRU=chip/cpu;
574
575engine serd.cpu.amd.l2itlb@chip/cpu,
576    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME, method=persistent,
577    trip=ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;
578
579event upset.cpu.amd.l2itlb@chip/cpu,
580	engine=serd.cpu.amd.l2itlb@chip/cpu;
581
582prop upset.cpu.amd.l2itlb@chip/cpu (1)->
583    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;
584
585prop fault.cpu.amd.l2itlb@chip/cpu (1)->
586    ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;
587
588prop fault.cpu.amd.l2itlb@chip/cpu (0)->
589    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;
590
591/*								#DCD#
592 * dcache data errors
593 */
594
595#define DCACHEDATA_FIT		1000
596#define DCACHEDATA_SB_COUNT	2
597#define DCACHEDATA_SB_TIME	168h
598
599event fault.cpu.amd.dcachedata@chip/cpu, FITrate=DCACHEDATA_FIT,
600	FRU=chip, ASRU=chip/cpu;
601event error.cpu.amd.dcachedata_sb@chip/cpu;
602event error.cpu.amd.dcachedata_mb@chip/cpu;
603
604prop fault.cpu.amd.dcachedata@chip/cpu (1)->
605    error.cpu.amd.dcachedata_sb@chip/cpu,
606    error.cpu.amd.dcachedata_mb@chip/cpu;
607
608/* 								#DCD_SINGLE#
609 * A single bit data array fault in a D cache can cause:
610 *
611 *  - data_ecc1 : reported by dc on this cpu by scrubber
612 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
613 *
614 * Make data_ecc1_uc fault immediately as it may have caused a panic
615 */
616
617event ereport.cpu.amd.dc.data_ecc1@chip/cpu{within(5s)};
618event ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu{within(5s)};
619event ereport.cpu.amd.dc_sb_trip@chip/cpu;
620
621engine serd.cpu.amd.dc_sb@chip/cpu,
622    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME, method=persistent,
623    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;
624
625engine serd.cpu.amd.dc_sb_uc@chip/cpu,	/* N=0: fault immediately */
626    N=0, T=1h, method=persistent,
627    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;
628
629event upset.cpu.amd.dc_sb@chip/cpu,
630	engine=serd.cpu.amd.dc_sb@chip/cpu;
631
632event upset.cpu.amd.dc_sb_uc@chip/cpu,
633	engine=serd.cpu.amd.dc_sb_uc@chip/cpu;
634
635prop upset.cpu.amd.dc_sb@chip/cpu (1)->
636    ereport.cpu.amd.dc.data_ecc1@chip/cpu;
637
638prop upset.cpu.amd.dc_sb_uc@chip/cpu (1)->
639    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;
640
641prop error.cpu.amd.dcachedata_sb@chip/cpu (1)->
642    ereport.cpu.amd.dc_sb_trip@chip/cpu;
643
644prop fault.cpu.amd.dcachedata@chip/cpu (0)->
645    ereport.cpu.amd.dc.data_ecc1@chip/cpu,
646    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;
647
648/* 								#DCD_MULTI#
649 * A multi-bit data array fault in a D cache can cause:
650 *
651 *  - data_eccm : reported by dc on this cpu
652 */
653
654event ereport.cpu.amd.dc.data_eccm@chip/cpu;
655
656prop error.cpu.amd.dcachedata_mb@chip/cpu (1)->
657    ereport.cpu.amd.dc.data_eccm@chip/cpu;
658
659prop fault.cpu.amd.dcachedata@chip/cpu (0)->
660    ereport.cpu.amd.dc.data_eccm@chip/cpu;
661
662/* 								#DCT_PAR#
663 * A tag array parity fault in a D cache can cause:
664 *
665 *  - tag_par : reported by dc on this cpu
666 */
667
668#define DCACHETAG_FIT		1000
669
670event ereport.cpu.amd.dc.tag_par@chip/cpu{within(5s)};
671
672event fault.cpu.amd.dcachetag@chip/cpu, FITrate=DCACHETAG_FIT,
673	FRU=chip, ASRU=chip/cpu;
674
675prop fault.cpu.amd.dcachetag@chip/cpu (1)->
676    ereport.cpu.amd.dc.tag_par@chip/cpu;
677
678/* 								#DCT_SNOOP#
679 * A snoop tag array parity fault in a D cache can cause:
680 *
681 *  - stag_par : reported by dc on this cpu
682 */
683
684#define DCACHESTAG_FIT		1000
685
686event ereport.cpu.amd.dc.stag_par@chip/cpu{within(5s)};
687
688event fault.cpu.amd.dcachestag@chip/cpu, FITrate=DCACHESTAG_FIT,
689	FRU=chip, ASRU=chip/cpu;
690
691prop fault.cpu.amd.dcachestag@chip/cpu (1)->
692    ereport.cpu.amd.dc.stag_par@chip/cpu;
693
694/* 								#DCTLB_1#
695 * An l1tlb parity fault in a D cache can cause:
696 *
697 *  - l1tlb_par : reported by dc on this cpu
698 */
699
700#define L1DTLB_FIT		1000
701
702event ereport.cpu.amd.dc.l1tlb_par@chip/cpu{within(5s)};
703
704event fault.cpu.amd.l1dtlb@chip/cpu, FITrate=L1DTLB_FIT,
705	FRU=chip, ASRU=chip/cpu;
706
707prop fault.cpu.amd.l1dtlb@chip/cpu (1)->
708    ereport.cpu.amd.dc.l1tlb_par@chip/cpu;
709
710/* 								#DCTLB_2#
711 * An l2tlb parity fault in a D cache can cause:
712 *
713 *  - l2tlb_par : reported by dc on this cpu
714 */
715
716#define L2DTLB_FIT		1000
717
718event ereport.cpu.amd.dc.l2tlb_par@chip/cpu{within(5s)};
719
720event fault.cpu.amd.l2dtlb@chip/cpu, FITrate=L2DTLB_FIT,
721	FRU=chip, ASRU=chip/cpu;
722
723prop fault.cpu.amd.l2dtlb@chip/cpu (1)->
724    ereport.cpu.amd.dc.l2tlb_par@chip/cpu;
725
726/*								#DPATH_SB#
727 * Datapath errors between NB/MC and core.
728 */
729
730#define	CPU_DP_FIT		1000
731
732event fault.cpu.amd.datapath@chip/cpu, FITrate=CPU_DP_FIT, FRU=chip,
733	ASRU=chip/cpu;
734event error.cpu.amd.datapath_sb@chip/cpu;
735event error.cpu.amd.datapath_mb@chip/cpu;
736
737prop fault.cpu.amd.datapath@chip/cpu (1)->
738    error.cpu.amd.datapath_sb@chip/cpu,
739    error.cpu.amd.datapath_mb@chip/cpu;
740
741/*
742 * A single bit fault in the datapath between the NB and requesting core
743 * can cause:
744 *
745 *  - inf_sys_ecc1 : reported by ic on access from a local cpu
746 *  - inf_sys_ecc1 : reported by dc on access from a local cpu
747 *  - s_ecc1 : reported by bu on access from a local cpu (hw prefetch etc)
748 *
749 * Empirical observations show that in 64/8 ECC mode some memory CEs *can*
750 * travel past the DRAM controller and on to the IC/DC/BU to be reported
751 * via the above errors.  This is not the case with ChipKill enabled.
752 * We should not be diagnosing datapath/chip errors for these.  While
753 * this behaviour is clarified the serd parameters will be set to infinity
754 * (and the multibit counterparts will not be seen because of sync flood).
755 */
756
757#define	CPU_DP_COUNT	5000
758#define	CPU_DP_TIME	1m
759
760event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)};
761event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)};
762event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)};
763event upset.cpu.dp_sb@chip/cpu, engine=serd.cpu.dp_sb@chip/cpu;
764event ereport.cpu.amd.dp_sb_trip@chip/cpu;
765
766engine serd.cpu.dp_sb@chip/cpu, N=CPU_DP_COUNT, T=CPU_DP_TIME,
767    method=persistent, trip=ereport.cpu.amd.dp_sb_trip@chip/cpu;
768
769prop upset.cpu.dp_sb@chip/cpu (1)->
770    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu,
771    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
772    ereport.cpu.amd.bu.s_ecc1@chip/cpu;
773
774prop error.cpu.amd.datapath_sb@chip/cpu (1)->
775    ereport.cpu.amd.dp_sb_trip@chip/cpu;
776
777prop fault.cpu.amd.datapath@chip/cpu (0)->
778    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu,
779    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
780    ereport.cpu.amd.bu.s_ecc1@chip/cpu;
781
782/*								#DPATH_MB#
783 * A multi-bit fault in the datapath between the NB and requesting core
784 * can cause:
785 *
786 *  - inf_sys_eccm : reported by ic on access from a local cpu
787 *  - inf_sys_eccm : reported by dc on access from a local cpu
788 *  - s_eccm : reported by bu on access from a local cpu (hw prefetch etc)
789 */
790
791event ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu;
792event ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu;
793event ereport.cpu.amd.bu.s_eccm@chip/cpu;
794
795prop error.cpu.amd.datapath_mb@chip/cpu (1)->
796    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
797    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
798    ereport.cpu.amd.bu.s_eccm@chip/cpu;
799
800prop fault.cpu.amd.datapath@chip/cpu (0)->
801    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
802    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
803    ereport.cpu.amd.bu.s_eccm@chip/cpu;
804
805/*
806 * Ereports that should not normally happen and which we will discard
807 * without diagnosis if they do.  These fall into a few categories:
808 *
809 *	- the corresponding detector is not enabled, typically because
810 *	  detection/handling of the event is taking place elsewhere
811 *	  (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
812 *	- the event is associated with a sync flood so even if the detector is
813 *	  enabled we will never handle the event and generate an ereport *and*
814 *	  even if the ereport did arrive we could perform no useful diagnosis
815 *	  e.g., the NB can be configured for sync flood on nb.mem_eccm
816 *	  but we don't choose to discard that ereport here since we could have
817 *	  made a useful diagnosis from it had it been delivered
818 *	  (nb.ht_sync, nb.ht_crc)
819 *	- events that will be accompanied by an immediate panic and
820 *	  delivery of the ereport during subsequent reboot but from
821 *	  which no useful diagnosis can be made. (nb.rmw, nb.wdog)
822 *
823 * Ereports for all of these can be generated by error simulation and
824 * injection.  We will perform a null diagnosis of all these ereports in order
825 * to avoid "no subscription" complaints during test harness runs.
826 */
827
828event ereport.cpu.amd.nb.ma@cpu;
829event ereport.cpu.amd.nb.ta@cpu;
830event ereport.cpu.amd.ls.s_rde@cpu;
831event ereport.cpu.amd.ic.rdde@cpu;
832event ereport.cpu.amd.bu.s_rde@cpu;
833event ereport.cpu.amd.nb.gart_walk@cpu;
834event ereport.cpu.amd.nb.ht_sync@cpu;
835event ereport.cpu.amd.nb.ht_crc@cpu;
836event ereport.cpu.amd.nb.rmw@cpu;
837event ereport.cpu.amd.nb.wdog@cpu;
838event ereport.cpu.amd.unknown@cpu;
839
840event upset.null_diag@cpu;
841
842prop upset.null_diag@cpu (1)->
843    ereport.cpu.amd.nb.ma@cpu,
844    ereport.cpu.amd.nb.ta@cpu,
845    ereport.cpu.amd.ls.s_rde@cpu,
846    ereport.cpu.amd.ic.rdde@cpu,
847    ereport.cpu.amd.bu.s_rde@cpu,
848    ereport.cpu.amd.nb.gart_walk@cpu,
849    ereport.cpu.amd.nb.ht_sync@cpu,
850    ereport.cpu.amd.nb.ht_crc@cpu,
851    ereport.cpu.amd.nb.rmw@cpu,
852    ereport.cpu.amd.nb.wdog@cpu,
853    ereport.cpu.amd.unknown@cpu;
854