xref: /illumos-gate/usr/src/cmd/fm/eversholt/files/i386/i86pc/amd64.esc (revision cf327f5a61bfa78d5cf81410e439640e480f850b)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma dictionary "AMD"
28
29/*
30 * Eversholt rules for the AMD Opteron CPU/Memory
31 */
32
/* Simple arithmetic helpers, used below to clamp page-retire thresholds. */
#define	MAX(x, y) ((x) >= (y) ? (x) : (y))
#define	MIN(x, y) ((x) <= (y) ? (x) : (y))
35
36/*
37 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
38 * we diagnose for page faults, to record the physical address of the faulting
39 * page.
40 */
41#define	SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))
42
43#define	SET_OFFSET (setpayloadprop("asru-offset", \
44	payloadprop("resource[0].hc-specific.offset")))
45
46/*
47 * RESOURCE_EXISTS is true if a member with name "resource" exists in the
48 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
49 */
50#define	RESOURCE_EXISTS	(payloadprop_defined("resource"))
51
52/*
53 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
54 * ereports) exists and one if its members matches the path for the
55 * rank node.  Our memory propogation are of the form
56 *
57 * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/core/strand"
58 *
59 * since cpus detect memory errors;  in eversholt such a propogation, where
60 * the lhs path and rhs path do not match, expands to the cross-product of
61 * all dimms, ranks and cpus on the same chip (since chip appears in the
62 * path on both sides).  We use CONTAINS_RANK to constrain the propogation
63 * such that it only happens if the payload resource matches the rank.
64 */
65#define	CONTAINS_RANK (payloadprop_contains("resource", \
66	asru(chip/memory-controller/dimm/rank)) \
67	|| payloadprop_contains("resource", \
68	asru(chip/memory-controller/dimm)))
69
70/*
71 * The following will tell us whether a syndrome that is known to be
72 * correctable (from a mem_ce ereport) is single-bit or multi-bit.  For a
73 * correctable ChipKill syndrome the number of bits set in the lowest
74 * nibble indicates how many bits were in error.
75 */
76
77#define	CBITMASK(synd) ((synd) & 0xf)
78
79#define	CKSINGLE(synd)							\
80	((synd) == 0 ||							\
81	(CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 ||		\
82	CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))
83
84#define	SINGLE_BIT_CE							\
85	(payloadprop("syndrome-type") == "E" ||				\
86	(payloadprop("syndrome-type") == "C" &&				\
87	CKSINGLE(payloadprop("syndrome"))))
88
89#define	MULTI_BIT_CE							\
90	(payloadprop("syndrome-type") == "C" &&				\
91	!CKSINGLE(payloadprop("syndrome")))
92
93/*								#PAGE#
94 *								#DIMM_SCU#
95 * A single bit fault in a memory rank can cause:
96 *
97 *  - mem_ce : reported by nb
98 *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
99 *    ic do not record a syndrome; these errors will not be triggered in
100 *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
101 *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
102 *    ECC mode (the NB corrects all ECC in that mode)
103 *
104 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
105 * trips we diagnose a fault.memory.page so that the response agent can
106 * retire the page that caused the trip.  If the total number of pages
107 * faulted in this way on a single rank exceeds a threshold we will
108 * diagnose a fault.memory.dimm_sb against the containing dimm.
109 *
110 * Multibit ChipKill-correctable errors are treated identically to
111 * single-bit errors, but via separate serd engines to allow distinct
112 * parameters if desired.
113 *
114 * Uncorrectable errors produce an immediate page fault and corresponding
115 * fault.memory.dimm_ue.
116 *
117 * Page faults are essentially internal - action is only required when
118 * they are accompanied by a dimm fault.  As such we include message=0
119 * on page faults.
120 */
121
/* Correctable-error ereports that feed the per-rank page diagnosis below. */
event ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/core/strand{within(5s)};
event ereport.cpu.amd.nb.mem_ce@chip/core/strand{within(5s)};
126
127/*
128 * Single-bit correctable errors feed into per-rank
129 * SERD engines which diagnose fault.memory.page_sb if they trip.
130 *
131 * Multi-bit correctable (via ChipKill) errors feed
132 * into additional per-rank SERD engines which diagnose fault.memory.page_ck
133 * if they trip.
134 *
135 * The number of fault.memory.page and fault.memory.page_ck diagnosed is
136 * counted in stat engines for each type.  These are used in deciding
137 * whether to declare a dimm faulty after repeated page faults.
138 */
139
140#define PAGE_SB_COUNT		2
141#define PAGE_SB_TIME		72h
142#define	PAGE_CK_COUNT		2
143#define	PAGE_CK_TIME		72h
144
145engine stat.sbpgflt@chip/memory-controller/dimm/rank;
146engine stat.ckpgflt@chip/memory-controller/dimm/rank;
147engine serd.memory.page_sb@chip/memory-controller/dimm/rank,
148    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
149engine serd.memory.page_ck@chip/memory-controller/dimm/rank,
150    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
151engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank,
152    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
153engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank,
154    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
155event fault.memory.page_sb@chip/memory-controller/dimm/rank, message=0,
156    count=stat.sbpgflt@chip/memory-controller/dimm/rank, response=0,
157    engine=serd.memory.page_sb@chip/memory-controller/dimm/rank;
158event fault.memory.page_ck@chip/memory-controller/dimm/rank, message=0,
159    count=stat.ckpgflt@chip/memory-controller/dimm/rank, response=0,
160    engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;
161event fault.memory.dimm_sb@chip/memory-controller/dimm/rank,
162    engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank;
163event fault.memory.dimm_ck@chip/memory-controller/dimm/rank,
164    engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank;
165
166/*
167 * The fraction of pages on a single rank that must be diagnosed as faulty
168 * with single correctable unit faults before we will fault the rank.
169 * Once we have faulted the rank we will continue to diagnose any further page
170 * faults on the rank up to some maximum multiple of the threshold at which
171 * we faulted the dimm.  This allows us to potentially contain some fairly
172 * far-reaching but still limited-extent fault (such as a partial column
173 * failure) without getting carried away and allowing a single faulty rank to
174 * use up the entire system-imposed page retirenment limit (which, once
175 * reached, causes retirement request to have no effect other than to fill
176 * the fault manager cache and logs).
177 *
178 * This fraction is specified in basis points, where 100 basis points are
179 * equivalent to 1 percent.  It is applied on a per-rank basis.
180 *
181 * The system imposes an absolute maximum on the number of pages it will
182 * retire;  the current value is 10 basis points, or 0.1% of 'physmem'.  Note
183 * that 'physmem' is reduced from installed memory pages by an amount
184 * reflecting permanent kernel memory allocations.  This system page retire
185 * limit bounds the maximum real response to page faults across all ranks
186 * that fault manager response agents can effect, but it should not be confused
187 * with any diagnosis threshold (i.e., the number of faulty pages we are
188 * prepared to tolerate from a single rank before faulting the rank is
189 * distinct from the total number of pages we are prepared to retire from use
190 * in response to that and other faults).  It is, however, desirable to
191 * arrange that the maximum number of pages we are prepared to fault from
192 * any one rank is less than the system-wide quota.
193 */
194#define	PAGE_RETIRE_LIMIT_BPS	5		/* or 0.05%; ~ 131 pages/GB %/
195
196/*
197 * A macro to manipulate the above fraction.  Given a size in bytes convert
198 * this to pages (4K pagesize) and calculate the number of those pages
199 * indicated by PAGE_RETIRE_LIMIT_BPS basis points.
200 */
201#define	_BPS_PGCNT(totalbytes) \
202	((((totalbytes) / 4096 ) * PAGE_RETIRE_LIMIT_BPS) / 10000)
203
204/*
205 * The single-correctable-unit threshold at which number of faulted pages
206 * on a rank we we fault the rank.  We insist that this be at least 128 and
207 * never more than 512.
208 */
209#define	RANK_THRESH MIN(512, MAX(128, \
210	_BPS_PGCNT(confprop(chip/memory-controller/dimm/rank, "size"))))
211
212/*
213 * The maximum number of single-correctable-unit page faults we will diagnose
214 * on a single rank (must be greater than RANK_THRESH).  We set
215 * this at twice the rank fault threshold.
216 */
217#define	RANK_PGFLT_MAX (2 * RANK_THRESH)
218
219#define	SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank))
220#define	CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank))
221
222/*
223 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of
224 * page faults (diagnosed from repeated single-bit or multibit-chipkills)
225 * from any one rank on that DIMM reaches a threshold.  A "correctable unit"
226 * is a single bit in normal 64/8 ECC mode, or a single symbol in ChipKill
227 * 128/16 mode (i.e., nibble-aligned nibble for the code used on Opteron).
228 *
229 * We do not stop diagnosing further single-bit page faults once we have
230 * declared a single-bit DIMM fault - we continue diagnosing them and
231 * response agents can continue to retire those pages up to the system-imposed
232 * retirement limit.
233 *
234 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and
235 * fault.memory.dimm_ck.  Which one is diagnosed depends on whether we
236 * have reached the threshold for a majority of single-bit page faults or
237 * multibit page faults.
238 *
239 * Implementation: we maintain parallel SERD engines to the page_sb and
240 * page_ck engines, which trip in unison.  On trip it generates a distinct
241 * ereport which we diagnose to a fault if the threshold has been reached.
242 */
243prop fault.memory.page_sb@chip/memory-controller/dimm/rank
244    { CONTAINS_RANK && SINGLE_BIT_CE &&
245      SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
246    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand,
247    ereport.cpu.amd.bu.s_ecc1@chip/core/strand,
248    ereport.cpu.amd.nb.mem_ce@chip/core/strand;
249
250prop fault.memory.page_ck@chip/memory-controller/dimm/rank
251    { CONTAINS_RANK && !SINGLE_BIT_CE &&
252      SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
253    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand,
254    ereport.cpu.amd.bu.s_ecc1@chip/core/strand,
255    ereport.cpu.amd.nb.mem_ce@chip/core/strand;
256
257prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank
258    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
259      SB_PGFLTS > RANK_THRESH / 2 } (1)->
260    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand,
261    ereport.cpu.amd.bu.s_ecc1@chip/core/strand,
262    ereport.cpu.amd.nb.mem_ce@chip/core/strand;
263
264prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank
265    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
266      CK_PGFLTS > RANK_THRESH / 2 } (1)->
267    ereport.cpu.amd.nb.mem_ce@chip/core/strand;
268
269/*
270 * If the address is not valid then no resource member will be included
271 * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare.
272 * We will also discard all inf_sys_ecc1 events detected at the ic since they
273 * have no syndrome and therefore no resource information.
274 * We will discard such ereports.  An alternative may be to SERD them
275 * on a per MC basis and trip if we see too many such events.
276 */
277event upset.memory.discard1@chip/core/strand;
278prop upset.memory.discard1@chip/core/strand
279    { !RESOURCE_EXISTS } (1)->
280    ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand,
281    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand,
282    ereport.cpu.amd.bu.s_ecc1@chip/core/strand,
283    ereport.cpu.amd.nb.mem_ce@chip/core/strand;
284
285/* 								#DIMM_UE#
286 *								#PAGE_UE#
287 * An uncorrectable multi-bit fault in a memory dimm can cause:
288 *
289 *  - mem_ue    	   : reported by nb for an access from a remote cpu
290 *  - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome
291 *  - s_eccm	   : reported by bu
292 *
293 * Since on production systems we force HT Sync Flood on uncorrectable
294 * memory errors (if not already set as such by the BIOS, as it should be)
295 * we won't actually receive these ereports since the system will be reset.
296 */
297
298event ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand{within(5s)};
299event ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand{within(5s)};
300event ereport.cpu.amd.bu.s_eccm@chip/core/strand{within(5s)};
301event ereport.cpu.amd.nb.mem_ue@chip/core/strand{within(5s)};
302
303event fault.memory.dimm_ue@chip/memory-controller/dimm/rank;
304event fault.memory.page_ue@chip/memory-controller/dimm/rank, message=0,
305    response=0;
306
307prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank
308    { CONTAINS_RANK } (1)->
309    ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand,
310    ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand,
311    ereport.cpu.amd.bu.s_eccm@chip/core/strand,
312    ereport.cpu.amd.nb.mem_ue@chip/core/strand;
313
314prop fault.memory.page_ue@chip/memory-controller/dimm/rank
315    { CONTAINS_RANK && SET_ADDR && SET_OFFSET } (1)->
316    ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand,
317    ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand,
318    ereport.cpu.amd.bu.s_eccm@chip/core/strand,
319    ereport.cpu.amd.nb.mem_ue@chip/core/strand;
320
321event upset.memory.discard3@chip/core/strand;
322prop upset.memory.discard3@chip/core/strand
323    { !RESOURCE_EXISTS } (1)->
324    ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand,
325    ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand,
326    ereport.cpu.amd.bu.s_eccm@chip/core/strand,
327    ereport.cpu.amd.nb.mem_ue@chip/core/strand;
328
329/*								#CSTESTFAIL#
330 * If the BIOS fails a chip-select during POST, or perhaps after a
331 * sync flood from an uncorrectable error, then on revision F and G it
332 * should mark that chip-select as TestFail in the CS Base register.
333 * When the memory-controller driver discovers all the MC configuration
334 * it notes such failed chip-selects and creates topology nodes for the
335 * chip-select and associated dimms and ranks, and produces an ereport for each
336 * failed chip-select with detector set to the memory-controller node
337 * and resource indicating the failed chip-select.
338 */
339
340event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller{within(5s)};
341event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank;
342event error.memory.cs_testfail@chip/memory-controller/chip-select;
343
344#define	CONTAINS_CS (payloadprop_contains("resource", \
345	asru(chip/memory-controller/chip-select)))
346
347prop error.memory.cs_testfail@chip/memory-controller/chip-select (1)->
348    ereport.cpu.amd.mc.cs_testfail@chip/memory-controller
349    { CONTAINS_CS };
350
351#define CSMATCH(s) \
352	(confprop_defined(chip/memory-controller/chip-select, s) && \
353	confprop(chip/memory-controller/chip-select, s) == \
354	confprop(chip/memory-controller/dimm/rank, "csname"))
355
356prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank (1)->
357    error.memory.cs_testfail@chip/memory-controller/chip-select
358    { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname")};
359
360/*								#ADDRPAR#
361 * DRAM Command/Address Parity Errors.
362 *
363 *  - dramaddr_par : reported by the nb; the NB status register includes
364 *    a bit indicating which dram controller channel (A or B) experienced
365 *    the error.
366 */
367
368event ereport.cpu.amd.nb.dramaddr_par@chip/core/strand{within(5s)};
369event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel, response=0;
370
371prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)->
372    ereport.cpu.amd.nb.dramaddr_par@chip/core/strand {
373    ((payloadprop("IA32_MCi_STATUS") >> 32 & 0x200) ? 1 : 0) == y };
374
375/* 								#L2D_SINGLE#
376 * A single bit data array fault in an l2 cache can cause:
377 *
378 *  - inf_l2_ecc1 : reported by ic on this cpu
379 *  - inf_l2_ecc1 : reported by dc on this cpu
380 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
381 */
382
383#define L2CACHEDATA_SB_COUNT	3
384#define L2CACHEDATA_SB_TIME	12h
385
386event ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand{within(5s)};
387event ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand{within(5s)};
388event ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand{within(5s)};
389engine serd.cpu.amd.l2d_sb@chip/core/strand,
390    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME;
391event fault.cpu.amd.l2cachedata@chip/core/strand, engine=serd.cpu.amd.l2d_sb@chip/core/strand;
392
393prop fault.cpu.amd.l2cachedata@chip/core/strand (0)->
394    ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand,
395    ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand,
396    ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand;
397
398/* 								#L2D_MULTI#
399 * A multi-bit data array fault in an l2 cache can cause:
400 *
401 *  - inf_l2_eccm : reported by ic on this cpu
402 *  - inf_l2_eccm : reported by dc on this cpu
403 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
404 */
405
406event ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand{within(5s)};
407event ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand{within(5s)};
408event ereport.cpu.amd.bu.l2d_eccm@chip/core/strand{within(5s)};
409
410prop fault.cpu.amd.l2cachedata@chip/core/strand
411    { setserdincrement(L2CACHEDATA_SB_COUNT + 1) } (0)->
412    ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand,
413    ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand,
414    ereport.cpu.amd.bu.l2d_eccm@chip/core/strand;
415
416/* 								#L2T_SINGLE#
417 * A single bit tag array fault in an l2 cache can cause:
418 *
419 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
420 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
421 */
422
423#define L2CACHETAG_SB_COUNT	3
424#define L2CACHETAG_SB_TIME	12h
425
426event ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand{within(5s)};
427event ereport.cpu.amd.bu.l2t_par@chip/core/strand{within(5s)};
428engine serd.cpu.amd.l2t_sb@chip/core/strand,
429    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME;
430event fault.cpu.amd.l2cachetag@chip/core/strand, engine=serd.cpu.amd.l2t_sb@chip/core/strand;
431
432prop fault.cpu.amd.l2cachetag@chip/core/strand (0)->
433    ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand,
434    ereport.cpu.amd.bu.l2t_par@chip/core/strand;
435
436/* 								#L2T_MULTI#
437 * A multi-bit tag array fault in an l2 cache can cause:
438 *
439 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
440 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
441 */
442
443event ereport.cpu.amd.bu.l2t_eccm@chip/core/strand{within(5s)};
444
445prop fault.cpu.amd.l2cachetag@chip/core/strand
446    { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
447    ereport.cpu.amd.bu.l2t_eccm@chip/core/strand,
448    ereport.cpu.amd.bu.l2t_par@chip/core/strand;
449
450/* 								#ICD_PAR#
451 * A data array parity fault in an I cache can cause:
452 *
453 *  - data_par : reported by ic on this cpu
454 */
455
456#define ICACHEDATA_SB_COUNT	2
457#define ICACHEDATA_SB_TIME	168h
458
459event ereport.cpu.amd.ic.data_par@chip/core/strand{within(5s)};
460engine serd.cpu.amd.icachedata@chip/core/strand,
461    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME;
462event fault.cpu.amd.icachedata@chip/core/strand,
463    engine=serd.cpu.amd.icachedata@chip/core/strand;
464
465prop fault.cpu.amd.icachedata@chip/core/strand (0)->
466    ereport.cpu.amd.ic.data_par@chip/core/strand;
467
468/* 								#ICT_PAR#
469 * A tag array parity fault in an I cache can cause:
470 *
471 *  - tag_par : reported by ic on this cpu
472 */
473
474#define ICACHETAG_SB_COUNT	2
475#define ICACHETAG_SB_TIME	168h
476
477event ereport.cpu.amd.ic.tag_par@chip/core/strand{within(5s)};
478engine serd.cpu.amd.icachetag@chip/core/strand,
479    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME;
480event fault.cpu.amd.icachetag@chip/core/strand, engine=serd.cpu.amd.icachetag@chip/core/strand;
481
482prop fault.cpu.amd.icachetag@chip/core/strand (0)->
483    ereport.cpu.amd.ic.tag_par@chip/core/strand;
484
485/* 								#ICT_SNOOP#
486 * A snoop tag array parity fault in an I cache can cause:
487 *
488 *  - stag_par : reported by ic on this cpu
489 */
490
491event ereport.cpu.amd.ic.stag_par@chip/core/strand{within(5s)};
492event fault.cpu.amd.icachestag@chip/core/strand;
493
494prop fault.cpu.amd.icachestag@chip/core/strand (1)->
495    ereport.cpu.amd.ic.stag_par@chip/core/strand;
496
497/* 								#ICTLB_1#
498 * An l1tlb parity fault in an I cache can cause:
499 *
500 *  - l1tlb_par : reported by ic on this cpu
501 */
502
503#define ICACHEL1TLB_SB_COUNT	2
504#define ICACHEL1TLB_SB_TIME	168h
505
506event ereport.cpu.amd.ic.l1tlb_par@chip/core/strand{within(5s)};
507engine serd.cpu.amd.l1itlb@chip/core/strand,
508    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME;
509event fault.cpu.amd.l1itlb@chip/core/strand, engine=serd.cpu.amd.l1itlb@chip/core/strand;
510
511prop fault.cpu.amd.l1itlb@chip/core/strand (0)->
512    ereport.cpu.amd.ic.l1tlb_par@chip/core/strand;
513
514/* 								#ICTLB_2#
515 * An l2tlb parity fault in an I cache can cause:
516 *
517 *  - l2tlb_par : reported by ic on this cpu
518 */
519
520#define ICACHEL2TLB_SB_COUNT	2
521#define ICACHEL2TLB_SB_TIME	168h
522
523event ereport.cpu.amd.ic.l2tlb_par@chip/core/strand{within(5s)};
524engine serd.cpu.amd.l2itlb@chip/core/strand,
525    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME;
526event fault.cpu.amd.l2itlb@chip/core/strand, engine=serd.cpu.amd.l2itlb@chip/core/strand;
527
528prop fault.cpu.amd.l2itlb@chip/core/strand (0)->
529    ereport.cpu.amd.ic.l2tlb_par@chip/core/strand;
530
531/* 								#DCD_SINGLE#
532 * A single bit data array fault in an D cache can cause:
533 *
534 *  - data_ecc1 : reported by dc on this cpu by scrubber
535 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
536 *
537 * Make data_ecc1_uc fault immediately as it may have caused a panic, so
538 * it is handled by the multi-bit case in the following section.
539 */
540
541#define DCACHEDATA_SB_COUNT	2
542#define DCACHEDATA_SB_TIME	168h
543
544event ereport.cpu.amd.dc.data_ecc1@chip/core/strand{within(5s)};
545event ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand{within(5s)};
546engine serd.cpu.amd.dc_sb@chip/core/strand,
547    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME;
548event fault.cpu.amd.dcachedata@chip/core/strand, engine=serd.cpu.amd.dc_sb@chip/core/strand;
549
550prop fault.cpu.amd.dcachedata@chip/core/strand (0)->
551    ereport.cpu.amd.dc.data_ecc1@chip/core/strand;
552
553/* 								#DCD_MULTI#
554 * A multi-bit data array fault in an D cache can cause:
555 *
556 *  - data_eccm : reported by dc on this cpu
557 */
558
559event ereport.cpu.amd.dc.data_eccm@chip/core/strand{within(5s)};
560
561prop fault.cpu.amd.dcachedata@chip/core/strand
562    { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
563    ereport.cpu.amd.dc.data_eccm@chip/core/strand,
564    ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand;
565
566/* 								#DCT_PAR#
567 * A tag array parity fault in an D cache can cause:
568 *
569 *  - tag_par : reported by dc on this cpu
570 */
571
572event ereport.cpu.amd.dc.tag_par@chip/core/strand{within(5s)};
573event fault.cpu.amd.dcachetag@chip/core/strand;
574
575prop fault.cpu.amd.dcachetag@chip/core/strand (1)->
576    ereport.cpu.amd.dc.tag_par@chip/core/strand;
577
578/* 								#DCT_SNOOP#
579 * A snoop tag array parity fault in an D cache can cause:
580 *
581 *  - stag_par : reported by dc on this cpu
582 */
583
584event ereport.cpu.amd.dc.stag_par@chip/core/strand{within(5s)};
585event fault.cpu.amd.dcachestag@chip/core/strand;
586
587prop fault.cpu.amd.dcachestag@chip/core/strand (1)->
588    ereport.cpu.amd.dc.stag_par@chip/core/strand;
589
590/* 								#DCTLB_1#
591 * An l1tlb parity fault in an D cache can cause:
592 *
593 *  - l1tlb_par : reported by dc on this cpu
594 */
595
596event ereport.cpu.amd.dc.l1tlb_par@chip/core/strand{within(5s)};
597event fault.cpu.amd.l1dtlb@chip/core/strand;
598
599prop fault.cpu.amd.l1dtlb@chip/core/strand (1)->
600    ereport.cpu.amd.dc.l1tlb_par@chip/core/strand;
601
602/* 								#DCTLB_2#
603 * An l2tlb parity fault in an D cache can cause:
604 *
605 *  - l2tlb_par : reported by dc on this cpu
606 */
607
608event ereport.cpu.amd.dc.l2tlb_par@chip/core/strand{within(5s)};
609event fault.cpu.amd.l2dtlb@chip/core/strand;
610
611prop fault.cpu.amd.l2dtlb@chip/core/strand (1)->
612    ereport.cpu.amd.dc.l2tlb_par@chip/core/strand;
613
614/*								#MISC#
615 * Ereports that should not normally happen and which we will discard
616 * without diagnosis if they do.  These fall into a few categories:
617 *
618 *	- the corresponding detector is not enabled, typically because
619 *	  detection/handling of the event is taking place elsewhere
620 *	  (nb.ma, nb.ta, ls.rde, ic.rdde, bu.s_rde, nb.gart_walk)
621 *	- the event is associated with a sync flood so even if the detector is
622 *	  enabled we will never handle the event and generate an ereport *and*
623 *	  even if the ereport did arrive we could perform no useful diagnosis
624 *	  e.g., the NB can be configured for sync flood on nb.mem_eccm
625 *	  but we don't choose to discard that ereport here since we could have
626 *	  made a useful diagnosis from it had it been delivered
627 *	  (nb.ht_sync, nb.ht_crc)
628 *	- events that will be accompanied by an immediate panic and
629 *	  delivery of the ereport during subsequent reboot but from
630 *	  which no useful diagnosis can be made. (nb.rmw, nb.wdog)
631 *
632 * Ereports for all of these can be generated by error simulation and
633 * injection.  We will perform a null diagnosos of all these ereports in order
634 * to avoid "no subscription" complaints during test harness runs.
635 */
636
637event ereport.cpu.amd.nb.ma@strand{within(5s)};
638event ereport.cpu.amd.nb.ta@strand{within(5s)};
639event ereport.cpu.amd.ls.s_rde@strand{within(5s)};
640event ereport.cpu.amd.ic.rdde@strand{within(5s)};
641event ereport.cpu.amd.bu.s_rde@strand{within(5s)};
642event ereport.cpu.amd.nb.gart_walk@strand{within(5s)};
643event ereport.cpu.amd.nb.ht_sync@strand{within(5s)};
644event ereport.cpu.amd.nb.ht_crc@strand{within(5s)};
645event ereport.cpu.amd.nb.rmw@strand{within(5s)};
646event ereport.cpu.amd.nb.wdog@strand{within(5s)};
647event ereport.cpu.amd.unknown@strand{within(5s)};
648
649event upset.null_diag@strand;
650
651prop upset.null_diag@strand (1)->
652    ereport.cpu.amd.nb.ma@strand,
653    ereport.cpu.amd.nb.ta@strand,
654    ereport.cpu.amd.ls.s_rde@strand,
655    ereport.cpu.amd.ic.rdde@strand,
656    ereport.cpu.amd.bu.s_rde@strand,
657    ereport.cpu.amd.nb.gart_walk@strand,
658    ereport.cpu.amd.nb.ht_sync@strand,
659    ereport.cpu.amd.nb.ht_crc@strand,
660    ereport.cpu.amd.nb.rmw@strand,
661    ereport.cpu.amd.nb.wdog@strand,
662    ereport.cpu.amd.unknown@strand;
663