1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27#pragma ident "%Z%%M% %I% %E% SMI" 28 29#pragma dictionary "AMD" 30 31/* 32 * Eversholt rules for the AMD Opteron CPU/Memory 33 */ 34 35fru motherboard; 36fru chip; 37fru dimm; 38 39asru chip/cpu; 40asru dimm; 41asru dimm/rank; 42asru dram-channel; 43asru chip/memory-controller/chip-select; 44 45#define MAX(x, y) ((x) >= (y) ? (x) : (y)) 46#define MIN(x, y) ((x) <= (y) ? (x) : (y)) 47 48/* 49 * GET_ADDR relies on the fact that variables have global scope across an FME. 50 * Thus for each FME the assignment only occurs for the first invocation 51 * but the comparison happens on each. Thus if the new address matches the 52 * address of an existing open FME, then we return true running in the context 53 * of that FME. If the new address doesn't match the address of any existing 54 * open FME, then we return true in the context of a newly opened FME. 55 */ 56#define GET_ADDR (defined($addr) ? 
($addr == payloadprop("addr")) : \ 57 ($addr = payloadprop("addr"))) 58 59#define GET_OFFSET ($offset = payloadprop("resource[0].hc-specific.offset")) 60 61/* 62 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that 63 * we diagnose for page faults, to record the physical address of the faulting 64 * page. The "asru-" prefix is hooked in the "rewrite-ASRU" confcalls made on 65 * diagnosis of associated faults when the libtopo mem scheme rewrites the 66 * asru in "mem" scheme. 67 */ 68#define SET_ADDR (setpayloadprop("asru-physaddr", $addr)) 69 70#define SET_OFFSET (setpayloadprop("asru-offset", $offset)) 71 72/* 73 * RESOURCE_EXISTS is true if a member with name "resource" exists in the 74 * payload - regardless of type (e.g., nvlist or nvlist array) or value. 75 */ 76#define RESOURCE_EXISTS (payloadprop_defined("resource")) 77 78/* 79 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory 80 * ereports) exists and one if its members matches the path for the 81 * rank node. Our memory propogation are of the form 82 * 83 * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/cpu" 84 * 85 * since cpus detect memory errors; in eversholt such a propogation, where 86 * the lhs path and rhs path do not match, expands to the cross-product of 87 * all dimms, ranks and cpus on the same chip (since chip appears in the 88 * path on both sides). We use CONTAINS_RANK to constrain the propogation 89 * such that it only happens if the payload resource matches the rank. 90 */ 91#define CONTAINS_RANK (payloadprop_contains("resource", \ 92 asru(chip/memory-controller/dimm/rank))) 93 94/* 95 * The following will tell us whether a syndrome that is known to be 96 * correctable (from a mem_ce ereport) is single-bit or multi-bit. For a 97 * correctable ChipKill syndrome the number of bits set in the lowest 98 * nibble indicates how many bits were in error. 
99 */ 100 101#define CBITMASK(synd) ((synd) & 0xf) 102 103#define CKSINGLE(synd) \ 104 ((synd) == 0 || \ 105 (CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 || \ 106 CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8)) 107 108#define SINGLE_BIT_CE \ 109 (payloadprop("syndrome-type") == "E" || \ 110 (payloadprop("syndrome-type") == "C" && \ 111 CKSINGLE(payloadprop("syndrome")))) 112 113#define MULTI_BIT_CE \ 114 (payloadprop("syndrome-type") == "C" && \ 115 !CKSINGLE(payloadprop("syndrome"))) 116 117/* 118 * A single bit fault in a memory rank can cause: 119 * 120 * - mem_ce : reported by nb 121 * - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the 122 * ic do not record a syndrome; these errors will not be triggered in 123 * ChipKill ECC mode (the NB corrects all ECC errors in that mode) 124 * - s_ecc1: reported by bu; this error will not be triggered in ChipKill 125 * ECC mode (the NB corrects all ECC in that mode) 126 * 127 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine 128 * trips we diagnose a fault.memory.page so that the response agent can 129 * retire the page that caused the trip. If the total number of pages 130 * faulted in this way on a single rank exceeds a threshold we will 131 * diagnose a fault.memory.dimm_sb against the containing. 132 * 133 * Multibit ChipKill-correctable errors are treated identically to 134 * single-bit errors, but via separate serd engines to allow distinct 135 * parameters if desired. 136 * 137 * Uncorrectable errors produce an immediate page fault and corresponding 138 * fault.memory.dimm_ue. 139 * 140 * Page faults are essentially internal - action is only required when 141 * they are accompanied by a dimm fault. As such we include message=0 142 * on page faults. 
143 */ 144 145event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)}; 146event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)}; 147event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)}; 148event ereport.cpu.amd.nb.mem_ce@chip/cpu{within(5s)}; 149 150/* 151 * If the address is not valid then no resource member will be included 152 * in a nb.mem_ce or nb.mem_ue ereport. These cases should be rare. 153 * We will also discard all inf_sys_ecc1 events detected at the ic since they 154 * have no syndrome and therefore no resource information. 155 * We will discard such ereports. An alternative may be to SERD them 156 * on a per MC basis and trip if we see too many such events. 157 */ 158 159event upset.memory.discard1@chip/cpu; 160 161/* #PAGE# 162 * Single-bit correctable errors are diagnosed as upsets and feed into per-rank 163 * SERD engines which diagnose fault.memory.page_sb if they trip. 164 * 165 * Multi-bit correctable (via ChipKill) errors are diagnosed as upsets and feed 166 * into additional per-rank SERD engines which diagnose fault.memory.page_ck 167 * if they trip. 168 * 169 * The number of fault.memory.page and fault.memory.page_ck diagnosed is 170 * counted in stat engines for each type. These are used in deciding 171 * whether to declare a dimm faulty after repeated page faults. 172 */ 173 174#define PAGE_FIT 1 175#define PAGE_SB_COUNT 2 176#define PAGE_SB_TIME 72h 177#define PAGE_CK_COUNT 2 178#define PAGE_CK_TIME 72h 179 180/* 181 * The fraction of pages on a single rank that must be diagnosed as faulty 182 * with single correctable unit faults before we will fault the rank. 183 * Once we have faulted the rank we will continue to diagnose any further page 184 * faults on the rank up to some maximum multiple of the threshold at which 185 * we faulted the dimm. 
This allows us to potentially contain some fairly
 * far-reaching but still limited-extent fault (such as a partial column
 * failure) without getting carried away and allowing a single faulty rank to
 * use up the entire system-imposed page retirement limit (which, once
 * reached, causes retirement requests to have no effect other than to fill
 * the fault manager cache and logs).
 *
 * This fraction is specified in basis points, where 100 basis points are
 * equivalent to 1 percent.  It is applied on a per-rank basis.
 *
 * The system imposes an absolute maximum on the number of pages it will
 * retire;  the current value is 10 basis points, or 0.1% of 'physmem'.  Note
 * that 'physmem' is reduced from installed memory pages by an amount
 * reflecting permanent kernel memory allocations.  This system page retire
 * limit bounds the maximum real response to page faults across all ranks
 * that fault manager response agents can effect, but it should not be confused
 * with any diagnosis threshold (i.e., the number of faulty pages we are
 * prepared to tolerate from a single rank before faulting the rank is
 * distinct from the total number of pages we are prepared to retire from use
 * in response to that and other faults).  It is, however, desirable to
 * arrange that the maximum number of pages we are prepared to fault from
 * any one rank is less than the system-wide quota.
 */
#define PAGE_RETIRE_LIMIT_BPS 5		/* or 0.05%;  ~ 131 pages/GB */

/*
 * A macro to manipulate the above fraction.  Given a size in bytes convert
 * this to pages (4K pagesize) and calculate the number of those pages
 * indicated by PAGE_RETIRE_LIMIT_BPS basis points.
 */
#define	_BPS_PGCNT(totalbytes) \
	((((totalbytes) / 4096 ) * PAGE_RETIRE_LIMIT_BPS) / 10000)

/*
 * The single-correctable-unit threshold of faulted pages
 * on a rank beyond which we fault the rank.
We insist that this be at least 128 and 221 * never more than 512. 222 */ 223#define RANK_THRESH MIN(512, MAX(128, \ 224 _BPS_PGCNT(confprop(asru(chip/memory-controller/dimm/rank), "size")))) 225 226/* 227 * The maximum number of single-correctable-unit page faults we will diagnose 228 * on a single rank (must be greater than RANK_THRESH). We set 229 * this at twice the rank fault threshold. 230 */ 231#define RANK_PGFLT_MAX (2 * RANK_THRESH) 232 233engine stat.sbpgflt@chip/memory-controller/dimm/rank; 234engine stat.ckpgflt@chip/memory-controller/dimm/rank; 235 236event fault.memory.page_sb@chip/memory-controller/dimm/rank, 237 FITrate=PAGE_FIT, ASRU=dimm/rank, message=0, 238 count=stat.sbpgflt@chip/memory-controller/dimm/rank, 239 action=confcall("rewrite-ASRU"); /* rewrite ASRU to identify page in rank */ 240 241#define SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank)) 242 243event fault.memory.page_ck@chip/memory-controller/dimm/rank, 244 FITrate=PAGE_FIT, ASRU=dimm/rank, message=0, 245 count=stat.ckpgflt@chip/memory-controller/dimm/rank, 246 action=confcall("rewrite-ASRU"); /* rewrite ASRU to identify page in rank */ 247 248#define CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank)) 249 250#define RANK_PGFLT_LIMIT_REACHED \ 251 (SB_PGFLTS + CK_PGFLTS > RANK_PGFLT_MAX) 252 253event ereport.memory.page_sb_trip@chip/memory-controller/dimm/rank; 254engine serd.memory.page_sb@chip/memory-controller/dimm/rank, 255 N=PAGE_SB_COUNT, T=PAGE_SB_TIME, method=persistent, 256 trip=ereport.memory.page_sb_trip@chip/memory-controller/dimm/rank; 257event upset.memory.page_sb@chip/memory-controller/dimm/rank, 258 engine=serd.memory.page_sb@chip/memory-controller/dimm/rank; 259 260event ereport.memory.page_ck_trip@chip/memory-controller/dimm/rank; 261engine serd.memory.page_ck@chip/memory-controller/dimm/rank, 262 N=PAGE_CK_COUNT, T=PAGE_CK_TIME, method=persistent, 263 trip=ereport.memory.page_ck_trip@chip/memory-controller/dimm/rank; 264event 
upset.memory.page_ck@chip/memory-controller/dimm/rank,
    engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;

/* engine-less upset used to absorb ereports once the rank's page quota is hit */
event upset.memory.overpgfltlimit@chip/memory-controller/dimm/rank;

/*
 * If we have not reached the per-rank limit on faulted pages then
 * continue to explain ereport observations as upsets which can
 * lead to page fault diagnoses if the serd engine trips.
 */
prop upset.memory.page_sb@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu
    { CONTAINS_RANK && SINGLE_BIT_CE && !RANK_PGFLT_LIMIT_REACHED },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu
    { CONTAINS_RANK && SINGLE_BIT_CE && !RANK_PGFLT_LIMIT_REACHED },
    ereport.cpu.amd.nb.mem_ce@chip/cpu
    { CONTAINS_RANK && SINGLE_BIT_CE && !RANK_PGFLT_LIMIT_REACHED };

prop upset.memory.page_ck@chip/memory-controller/dimm/rank (0)->
    /* no dc.inf_sys_ecc1 or bu.s_ecc1 in ChipKill mode */
    ereport.cpu.amd.nb.mem_ce@chip/cpu
    { CONTAINS_RANK && MULTI_BIT_CE && !RANK_PGFLT_LIMIT_REACHED };

/*
 * If we have reached the per-rank limit on faulted pages then diagnose
 * further observations on the rank to an engine-less upset (i.e., discard
 * them).
291 */ 292prop upset.memory.overpgfltlimit@chip/memory-controller/dimm/rank (1)-> 293 ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu 294 { CONTAINS_RANK && RANK_PGFLT_LIMIT_REACHED }, 295 ereport.cpu.amd.bu.s_ecc1@chip/cpu 296 { CONTAINS_RANK && RANK_PGFLT_LIMIT_REACHED }, 297 ereport.cpu.amd.nb.mem_ce@chip/cpu 298 { CONTAINS_RANK && RANK_PGFLT_LIMIT_REACHED }; 299 300prop fault.memory.page_sb@chip/memory-controller/dimm/rank (1)-> 301 ereport.memory.page_sb_trip@chip/memory-controller/dimm/rank; 302 303prop fault.memory.page_ck@chip/memory-controller/dimm/rank (1)-> 304 ereport.memory.page_ck_trip@chip/memory-controller/dimm/rank; 305 306prop fault.memory.page_sb@chip/memory-controller/dimm/rank 307 { SET_ADDR && SET_OFFSET } (0)-> 308 ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu 309 { CONTAINS_RANK && GET_ADDR && GET_OFFSET }, 310 ereport.cpu.amd.bu.s_ecc1@chip/cpu 311 { CONTAINS_RANK && GET_ADDR && GET_OFFSET }, 312 ereport.cpu.amd.nb.mem_ce@chip/cpu 313 { CONTAINS_RANK && GET_ADDR && GET_OFFSET }; 314 315prop fault.memory.page_ck@chip/memory-controller/dimm/rank 316 { SET_ADDR && SET_OFFSET } (0)-> 317 ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu 318 { CONTAINS_RANK && GET_ADDR && GET_OFFSET }, 319 ereport.cpu.amd.bu.s_ecc1@chip/cpu 320 { CONTAINS_RANK && GET_ADDR && GET_OFFSET }, 321 ereport.cpu.amd.nb.mem_ce@chip/cpu 322 { CONTAINS_RANK && GET_ADDR && GET_OFFSET }; 323 324prop upset.memory.discard1@chip/cpu (1)-> 325 ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu; /* always discard - no resource */ 326 327prop upset.memory.discard1@chip/cpu (1)-> 328 ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu { !RESOURCE_EXISTS }, 329 ereport.cpu.amd.bu.s_ecc1@chip/cpu { !RESOURCE_EXISTS }, 330 ereport.cpu.amd.nb.mem_ce@chip/cpu { !RESOURCE_EXISTS }; 331 332/* #DIMM_SCU# 333 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of 334 * page faults (diagnosed from repeated single-bit or multibit-chipkills) 335 * from any one rank on that DIMM reaches a threshold. 
A "correctable unit" 336 * is a single bit in normal 64/8 ECC mode, or a single symbol in ChipKill 337 * 128/16 mode (i.e., nibble-aligned nibble for the code used on Opteron). 338 * 339 * We do not stop diagnosing further single-bit page faults once we have 340 * declared a single-bit DIMM fault - we continue diagnosing them and 341 * response agents can continue to retire those pages up to the system-imposed 342 * retirement limit. 343 * 344 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and 345 * fault.memory.dimm_ck. Which one is diagnosed depends on whether we 346 * have reached the threshold for a majority of single-bit page faults or 347 * multibit page faults. 348 * 349 * Implementation: we maintain parallel SERD engines to the page_sb and 350 * page_ck engines, which trip in unison. On trip it generates a distinct 351 * ereport which we diagnose to a fault if the threshold has been 352 * reached, or to a throwaway upset if not. 353 * 354 */ 355 356#define DIMM_SB_FIT 2000 357#define DIMM_CK_FIT 4000 358 359event fault.memory.dimm_sb@chip/memory-controller/dimm/rank, 360 FITrate=DIMM_SB_FIT, FRU=dimm, ASRU=dimm, 361 action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */ 362 363event fault.memory.dimm_ck@chip/memory-controller/dimm/rank, 364 FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm, 365 action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */ 366 367event ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank 368 { within(5s) }; 369engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank, 370 N=PAGE_SB_COUNT, T=PAGE_SB_TIME, method=persistent, 371 trip=ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank; 372event upset.memory.dimm_sb@chip/memory-controller/dimm/rank, 373 engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank; 374 375event ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank 376 { within(5s) }; 377engine 
serd.memory.dimm_ck@chip/memory-controller/dimm/rank, 378 N=PAGE_CK_COUNT, T=PAGE_CK_TIME, method=persistent, 379 trip=ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank; 380event upset.memory.dimm_ck@chip/memory-controller/dimm/rank, 381 engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank; 382 383event upset.memory.discard2@chip/memory-controller/dimm/rank; 384 385prop upset.memory.dimm_sb@chip/memory-controller/dimm/rank (0)-> 386 ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu { CONTAINS_RANK && SINGLE_BIT_CE }, 387 ereport.cpu.amd.bu.s_ecc1@chip/cpu { CONTAINS_RANK && SINGLE_BIT_CE }, 388 ereport.cpu.amd.nb.mem_ce@chip/cpu { CONTAINS_RANK && SINGLE_BIT_CE }; 389 390prop upset.memory.dimm_ck@chip/memory-controller/dimm/rank (0)-> 391 ereport.cpu.amd.nb.mem_ce@chip/cpu { CONTAINS_RANK && MULTI_BIT_CE }; 392 393/* 394 * The following two propogations diagnose a fault.memory.dimm_sb when 395 * either the dimm_sb or dimm_ck engine trips (for a new page fault) 396 * and the total number of page faults (sb and ck) exceeds the threshold 397 * value with the majority being from sb page faults. 398 */ 399prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank (0)-> 400 ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank 401 { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && SB_PGFLTS > RANK_THRESH / 2 }; 402 403prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank (0)-> 404 ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank 405 { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && SB_PGFLTS > RANK_THRESH / 2 }; 406 407/* 408 * The following two propogation diagnose a fault.memory.dimm_ck when 409 * either the dimm_sb or dimm_ck engine trip (for a new page fault) 410 * and the total number of page faults (sb and ck) exceeds the threshold 411 * value with the majority being from ck page faults. 
412 */ 413prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank (0)-> 414 ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank 415 { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && CK_PGFLTS > RANK_THRESH / 2 }; 416 417prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank (0)-> 418 ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank 419 { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && CK_PGFLTS > RANK_THRESH / 2 }; 420 421prop upset.memory.discard2@chip/memory-controller/dimm/rank (1)-> 422 ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank, 423 ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank; 424 425/* #DIMM_UE# 426 * #PAGE_UE# 427 * An uncorrectable multi-bit fault in a memory dimm can cause: 428 * 429 * - mem_ue : reported by nb for an access from a remote cpu 430 * - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome 431 * - s_eccm : reported by bu 432 * 433 * Note we use a SERD engine here simply as a way of ensuring that we get 434 * both dimm and page faults reported. 435 * 436 * Since on production systems we force HT Sync Flood on uncorrectable 437 * memory errors (if not already set as such by the BIOS, as it should be) 438 * we won't actually receive these ereports since the system will be reset. 
439 */ 440 441#define DIMM_UE_FIT 6000 442 443event ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu{within(5s)}; 444event ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu{within(5s)}; 445event ereport.cpu.amd.bu.s_eccm@chip/cpu{within(5s)}; 446event ereport.cpu.amd.nb.mem_ue@chip/cpu{within(5s)}; 447 448event fault.memory.dimm_ue@chip/memory-controller/dimm/rank, 449 FITrate=DIMM_UE_FIT, FRU=dimm, ASRU=dimm, 450 action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */ 451 452event fault.memory.page_ue@chip/memory-controller/dimm/rank, 453 FITrate=PAGE_FIT, ASRU=dimm/rank, message=0, 454 action=confcall("rewrite-ASRU"); /* rewrite ASRU to identify page in rank */ 455 456event ereport.memory.dimm_ue_trip@chip/memory-controller/dimm/rank; 457engine serd.memory.dimm_ue@chip/memory-controller/dimm/rank, 458 N=0, T=1h, method=persistent, 459 trip=ereport.memory.dimm_ue_trip@chip/memory-controller/dimm/rank; 460event upset.memory.dimm_ue@chip/memory-controller/dimm/rank, 461 engine=serd.memory.dimm_ue@chip/memory-controller/dimm/rank; 462 463event ereport.memory.page_ue_trip@chip/memory-controller/dimm/rank; 464engine serd.memory.page_ue@chip/memory-controller/dimm/rank, 465 N=0, T=1h, method=persistent, 466 trip=ereport.memory.page_ue_trip@chip/memory-controller/dimm/rank; 467event upset.memory.page_ue@chip/memory-controller/dimm/rank, 468 engine=serd.memory.page_ue@chip/memory-controller/dimm/rank; 469 470event upset.memory.discard3@chip/cpu; 471 472prop upset.memory.page_ue@chip/memory-controller/dimm/rank (0)-> 473 ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu { CONTAINS_RANK }, 474 ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu { CONTAINS_RANK }, 475 ereport.cpu.amd.bu.s_eccm@chip/cpu { CONTAINS_RANK }, 476 ereport.cpu.amd.nb.mem_ue@chip/cpu { CONTAINS_RANK }; 477 478prop upset.memory.dimm_ue@chip/memory-controller/dimm/rank (0)-> 479 ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu { CONTAINS_RANK }, 480 ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu { CONTAINS_RANK }, 481 
ereport.cpu.amd.bu.s_eccm@chip/cpu { CONTAINS_RANK }, 482 ereport.cpu.amd.nb.mem_ue@chip/cpu { CONTAINS_RANK }; 483 484prop fault.memory.page_ue@chip/memory-controller/dimm/rank (1)-> 485 ereport.memory.page_ue_trip@chip/memory-controller/dimm/rank; 486 487prop fault.memory.page_ue@chip/memory-controller/dimm/rank 488 { SET_ADDR && SET_OFFSET } (0)-> 489 ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu 490 { CONTAINS_RANK && GET_ADDR && GET_OFFSET}, 491 ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu 492 { CONTAINS_RANK && GET_ADDR && GET_OFFSET}, 493 ereport.cpu.amd.bu.s_eccm@chip/cpu 494 { CONTAINS_RANK && GET_ADDR && GET_OFFSET}, 495 ereport.cpu.amd.nb.mem_ue@chip/cpu 496 { CONTAINS_RANK && GET_ADDR && GET_OFFSET }; 497 498prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank (1)-> 499 ereport.memory.dimm_ue_trip@chip/memory-controller/dimm/rank; 500 501prop upset.memory.discard3@chip/cpu (1)-> 502 ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu { !RESOURCE_EXISTS }, 503 ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu { !RESOURCE_EXISTS }, 504 ereport.cpu.amd.bu.s_eccm@chip/cpu { !RESOURCE_EXISTS }, 505 ereport.cpu.amd.nb.mem_ce@chip/cpu { !RESOURCE_EXISTS }; 506 507/* #CSTESTFAIL# 508 * If the BIOS fails a chip-select during POST, or perhaps after a 509 * sync flood from an uncorrectable error, then on revision F and G it 510 * should mark that chip-select as TestFail in the CS Base register. 511 * When the memory-controller driver discovers all the MC configuration 512 * it notes such failed chip-selects and creates topology nodes for the 513 * chip-select and associated dimms and ranks, and produces an ereport for each 514 * failed chip-select with detector set to the memory-controller node 515 * and resource indicating the failed chip-select. 
516 */ 517 518event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller; 519 520event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank, 521 FITrate=1000, ASRU=dimm, FRU=dimm, 522 action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */ 523 524event error.memory.cs_testfail@chip/memory-controller/chip-select; 525 526#define CONTAINS_CS (payloadprop_contains("resource", \ 527 asru(chip/memory-controller/chip-select))) 528 529prop error.memory.cs_testfail@chip/memory-controller/chip-select -> 530 ereport.cpu.amd.mc.cs_testfail@chip/memory-controller 531 { CONTAINS_CS }; 532 533#define CSMATCH(s) \ 534 (confprop_defined(asru(chip/memory-controller/chip-select), s) && \ 535 confprop(asru(chip/memory-controller/chip-select), s) == \ 536 confprop(asru(chip/memory-controller/dimm/rank), "csname")) 537 538prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank -> 539 error.memory.cs_testfail@chip/memory-controller/chip-select 540 { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname")}; 541 542/* #ADDRPAR# 543 * DRAM Command/Address Parity Errors. 544 * 545 * - dramaddr_par : reported by the nb; the NB status register includes 546 * a bit indicating which dram controller channel (A or B) experienced 547 * the error. 548 */ 549 550event ereport.cpu.amd.nb.dramaddr_par@chip/cpu; 551 552event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel, 553 FITrate=1000, ASRU=dram-channel; 554 555#define GET_CHANNEL ($chan = (payloadprop("bank-status") >> 32 & 0x200) ? \ 556 1 : 0) 557 558prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)-> 559 ereport.cpu.amd.nb.dramaddr_par@chip/cpu { GET_CHANNEL && $chan == y }; 560 561/* 562 * l2 cache data errors. 
563 */ 564 565#define L2CACHEDATA_FIT 1000 566#define L2CACHEDATA_SB_COUNT 3 567#define L2CACHEDATA_SB_TIME 12h 568 569event fault.cpu.amd.l2cachedata@chip/cpu, FITrate=L2CACHEDATA_FIT, 570 FRU=chip, ASRU=chip/cpu; 571event error.cpu.amd.l2cachedata_sb@chip/cpu; 572event error.cpu.amd.l2cachedata_mb@chip/cpu; 573 574prop fault.cpu.amd.l2cachedata@chip/cpu (1)-> 575 error.cpu.amd.l2cachedata_sb@chip/cpu, 576 error.cpu.amd.l2cachedata_mb@chip/cpu; 577 578/* #L2D_SINGLE# 579 * A single bit data array fault in an l2 cache can cause: 580 * 581 * - inf_l2_ecc1 : reported by ic on this cpu 582 * - inf_l2_ecc1 : reported by dc on this cpu 583 * - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu 584 * 585 * Single-bit errors are diagnosed to cache upsets. SERD engines are used 586 * to count upsets resulting from CEs. 587 */ 588 589event ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu{within(5s)}; 590event ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu{within(5s)}; 591event ereport.cpu.amd.bu.l2d_ecc1@chip/cpu{within(5s)}; 592event ereport.cpu.amd.l2d_sb_trip@chip/cpu; 593 594engine serd.cpu.amd.l2d_sb@chip/cpu, 595 N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME, method=persistent, 596 trip=ereport.cpu.amd.l2d_sb_trip@chip/cpu; 597 598event upset.cpu.amd.l2d_sb@chip/cpu, 599 engine=serd.cpu.amd.l2d_sb@chip/cpu; 600 601prop upset.cpu.amd.l2d_sb@chip/cpu (1)-> 602 ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu, 603 ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu, 604 ereport.cpu.amd.bu.l2d_ecc1@chip/cpu; 605 606prop error.cpu.amd.l2cachedata_sb@chip/cpu (1)-> 607 ereport.cpu.amd.l2d_sb_trip@chip/cpu; 608 609prop fault.cpu.amd.l2cachedata@chip/cpu (0)-> 610 ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu, 611 ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu, 612 ereport.cpu.amd.bu.l2d_ecc1@chip/cpu; 613 614/* #L2D_MULTI# 615 * A multi-bit data array fault in an l2 cache can cause: 616 * 617 * - inf_l2_eccm : reported by ic on this cpu 618 * - inf_l2_eccm : reported by dc on this cpu 619 * - l2d_eccm : 
reported by bu on copyback or on snoop from another cpu 620 */ 621 622event ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu; 623event ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu; 624event ereport.cpu.amd.bu.l2d_eccm@chip/cpu; 625 626prop error.cpu.amd.l2cachedata_mb@chip/cpu (1)-> 627 ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu, 628 ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu, 629 ereport.cpu.amd.bu.l2d_eccm@chip/cpu; 630 631prop fault.cpu.amd.l2cachedata@chip/cpu (0)-> 632 ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu, 633 ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu, 634 ereport.cpu.amd.bu.l2d_eccm@chip/cpu; 635 636/* 637 * l2 cache main tag errors 638 */ 639 640#define L2CACHETAG_FIT 1000 641#define L2CACHETAG_SB_COUNT 3 642#define L2CACHETAG_SB_TIME 12h 643 644event fault.cpu.amd.l2cachetag@chip/cpu, FITrate=L2CACHETAG_FIT, 645 FRU=chip, ASRU=chip/cpu; 646event error.cpu.amd.l2cachetag_sb@chip/cpu; 647event error.cpu.amd.l2cachetag_mb@chip/cpu; 648 649prop fault.cpu.amd.l2cachetag@chip/cpu (1)-> 650 error.cpu.amd.l2cachetag_sb@chip/cpu, 651 error.cpu.amd.l2cachetag_mb@chip/cpu; 652 653/* #L2T_SINGLE# 654 * A single bit tag array fault in an l2 cache can cause: 655 * 656 * - l2t_ecc1 : reported by bu on this cpu when detected during snoop 657 * - l2t_par : reported by bu on this cpu when detected other than during snoop 658 * 659 * Note that the bu.l2t_par ereport could be due to a single bit or multi bit 660 * event. If the l2t_sb_trip has already triggered it will be treated as another 661 * ce, otherwise it will be treated as a ue event. 
 */

event ereport.cpu.amd.bu.l2t_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/cpu;
event ereport.cpu.amd.l2t_sb_trip@chip/cpu;

/*
 * Single-bit L2 tag errors are counted by a SERD engine; the l2cachetag
 * fault is only diagnosed via the synthesized trip ereport once the
 * N-in-T threshold is exceeded.
 */
engine serd.cpu.amd.l2t_sb@chip/cpu,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2t_sb_trip@chip/cpu;

event upset.cpu.amd.l2t_sb@chip/cpu,
    engine=serd.cpu.amd.l2t_sb@chip/cpu;

prop upset.cpu.amd.l2t_sb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop error.cpu.amd.l2cachetag_sb@chip/cpu (1)->
    ereport.cpu.amd.l2t_sb_trip@chip/cpu;

/*
 * NOTE(review): the (0)-> props here and below appear to follow the esc
 * idiom of associating the raw ereports with the fault without requiring
 * them for a diagnosis — confirm against the Eversholt language spec.
 */
prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* #L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/cpu;

/* Multi-bit tag errors are diagnosed immediately - no SERD engine */
prop error.cpu.amd.l2cachetag_mb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* #ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 */

#define ICACHEDATA_FIT		1000
#define ICACHEDATA_SB_COUNT	2
#define ICACHEDATA_SB_TIME	168h

event ereport.cpu.amd.ic.data_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_dp_trip@chip/cpu;

event fault.cpu.amd.icachedata@chip/cpu, FITrate=ICACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachedata@chip/cpu,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_dp_trip@chip/cpu;

event upset.cpu.amd.icachedata@chip/cpu,
    engine=serd.cpu.amd.icachedata@chip/cpu;

prop upset.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic_dp_trip@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

/* #ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 */

#define ICACHETAG_FIT		1000
#define ICACHETAG_SB_COUNT	2
#define ICACHETAG_SB_TIME	168h

event ereport.cpu.amd.ic.tag_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_tp_trip@chip/cpu;

event fault.cpu.amd.icachetag@chip/cpu, FITrate=ICACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachetag@chip/cpu,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_tp_trip@chip/cpu;

event upset.cpu.amd.icachetag@chip/cpu,
    engine=serd.cpu.amd.icachetag@chip/cpu;

prop upset.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic_tp_trip@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (0)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

/* #ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 */

#define ICACHESTAG_FIT		1000

event ereport.cpu.amd.ic.stag_par@chip/cpu{within(5s)};

/* Diagnosed immediately on a single ereport - no SERD engine */
event fault.cpu.amd.icachestag@chip/cpu, FITrate=ICACHESTAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.icachestag@chip/cpu (1)->
    ereport.cpu.amd.ic.stag_par@chip/cpu;

/* #ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 */

#define ICACHEL1TLB_FIT		1000
#define ICACHEL1TLB_SB_COUNT	2
#define ICACHEL1TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l1tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event fault.cpu.amd.l1itlb@chip/cpu, FITrate=ICACHEL1TLB_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l1itlb@chip/cpu,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event upset.cpu.amd.l1itlb@chip/cpu,
    engine=serd.cpu.amd.l1itlb@chip/cpu;

prop upset.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

/* #ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 */

#define ICACHEL2TLB_FIT		1000
#define ICACHEL2TLB_SB_COUNT	2
#define ICACHEL2TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l2tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event fault.cpu.amd.l2itlb@chip/cpu, FITrate=ICACHEL2TLB_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l2itlb@chip/cpu,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event upset.cpu.amd.l2itlb@chip/cpu,
    engine=serd.cpu.amd.l2itlb@chip/cpu;

prop upset.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

/*
 * dcache data errors - a single dcachedata fault explained by either the
 * single-bit (SERD-gated) or multi-bit (immediate) error event below.
 */

#define DCACHEDATA_FIT		1000
#define DCACHEDATA_SB_COUNT	2
#define DCACHEDATA_SB_TIME	168h

event fault.cpu.amd.dcachedata@chip/cpu, FITrate=DCACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.dcachedata_sb@chip/cpu;
event error.cpu.amd.dcachedata_mb@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (1)->
    error.cpu.amd.dcachedata_sb@chip/cpu,
    error.cpu.amd.dcachedata_mb@chip/cpu;

/* #DCD_SINGLE#
 * A single bit data array fault in a D cache can cause:
 *
 *  - data_ecc1 : reported by dc on this cpu by scrubber
 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic
 */

event ereport.cpu.amd.dc.data_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu{within(5s)};
event ereport.cpu.amd.dc_sb_trip@chip/cpu;

engine serd.cpu.amd.dc_sb@chip/cpu,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

/* N=0 so the very first data_ecc1_uc ereport trips the engine */
engine serd.cpu.amd.dc_sb_uc@chip/cpu,
    N=0, T=1hr, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

event upset.cpu.amd.dc_sb@chip/cpu,
    engine=serd.cpu.amd.dc_sb@chip/cpu;

event upset.cpu.amd.dc_sb_uc@chip/cpu,
    engine=serd.cpu.amd.dc_sb_uc@chip/cpu;

prop upset.cpu.amd.dc_sb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu;

prop upset.cpu.amd.dc_sb_uc@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

prop error.cpu.amd.dcachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.dc_sb_trip@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

/* #DCD_MULTI#
 * A multi-bit data array fault in a D cache can cause:
 *
 *  - data_eccm : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop error.cpu.amd.dcachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

/* #DCT_PAR#
 * A tag array parity fault in a D cache can cause:
 *
 *  - tag_par : reported by dc on this cpu
 */

#define DCACHETAG_FIT		1000

event ereport.cpu.amd.dc.tag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachetag@chip/cpu, FITrate=DCACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachetag@chip/cpu (1)->
    ereport.cpu.amd.dc.tag_par@chip/cpu;

/* #DCT_SNOOP#
 * A snoop tag array parity fault in a D cache can cause:
 *
 *  - stag_par : reported by dc on this cpu
 */

#define DCACHESTAG_FIT		1000

event ereport.cpu.amd.dc.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachestag@chip/cpu, FITrate=DCACHESTAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachestag@chip/cpu (1)->
    ereport.cpu.amd.dc.stag_par@chip/cpu;

/* #DCTLB_1#
 * An l1tlb parity fault in a D cache can cause:
 *
 *  - l1tlb_par : reported by dc on this cpu
 */

#define L1DTLB_FIT		1000

event ereport.cpu.amd.dc.l1tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l1dtlb@chip/cpu, FITrate=L1DTLB_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l1dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l1tlb_par@chip/cpu;

/* #DCTLB_2#
 * An l2tlb parity fault in a D cache can cause:
 *
 *  - l2tlb_par : reported by dc on this cpu
 */

#define L2DTLB_FIT		1000

event ereport.cpu.amd.dc.l2tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l2dtlb@chip/cpu, FITrate=L2DTLB_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l2dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l2tlb_par@chip/cpu;

/* #MISC#
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do.  These fall into a few categories:
 *
 *  - the corresponding detector is not enabled, typically because
 *    detection/handling of the event is taking place elsewhere
 *    (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *  - the event is associated with a sync flood so even if the detector is
 *    enabled we will never handle the event and generate an ereport *and*
 *    even if the ereport did arrive we could perform no useful diagnosis
 *    e.g., the NB can be configured for sync flood on nb.mem_eccm
 *    but we don't choose to discard that ereport here since we could have
 *    made a useful diagnosis from it had it been delivered
 *    (nb.ht_sync, nb.ht_crc)
 *  - events that will be accompanied by an immediate panic and
 *    delivery of the ereport during subsequent reboot but from
 *    which no useful diagnosis can be made. (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection.  We will perform a null diagnosis of all these ereports in order
 * to avoid "no subscription" complaints during test harness runs.
 */

event ereport.cpu.amd.nb.ma@cpu;
event ereport.cpu.amd.nb.ta@cpu;
event ereport.cpu.amd.ls.s_rde@cpu;
event ereport.cpu.amd.ic.rdde@cpu;
event ereport.cpu.amd.bu.s_rde@cpu;
event ereport.cpu.amd.nb.gart_walk@cpu;
event ereport.cpu.amd.nb.ht_sync@cpu;
event ereport.cpu.amd.nb.ht_crc@cpu;
event ereport.cpu.amd.nb.rmw@cpu;
event ereport.cpu.amd.nb.wdog@cpu;
event ereport.cpu.amd.unknown@cpu;

event upset.null_diag@cpu;

/* Route all of the above to a single no-op upset so they are "subscribed" */
prop upset.null_diag@cpu (1)->
    ereport.cpu.amd.nb.ma@cpu,
    ereport.cpu.amd.nb.ta@cpu,
    ereport.cpu.amd.ls.s_rde@cpu,
    ereport.cpu.amd.ic.rdde@cpu,
    ereport.cpu.amd.bu.s_rde@cpu,
    ereport.cpu.amd.nb.gart_walk@cpu,
    ereport.cpu.amd.nb.ht_sync@cpu,
    ereport.cpu.amd.nb.ht_crc@cpu,
    ereport.cpu.amd.nb.rmw@cpu,
    ereport.cpu.amd.nb.wdog@cpu,
    ereport.cpu.amd.unknown@cpu;