/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

fru motherboard;
fru chip;
fru dimm;

asru chip/cpu;
asru dimm;
asru dimm/rank;
asru dram-channel;
asru chip/memory-controller/chip-select;

#define MAX(x, y) ((x) >= (y) ? (x) : (y))
#define MIN(x, y) ((x) <= (y) ? (x) : (y))

/*
 * GET_ADDR relies on the fact that variables have global scope across an FME.
 * Thus for each FME the assignment only occurs for the first invocation,
 * but the comparison happens on each.  Thus if the new address matches the
 * address of an existing open FME, then we return true running in the context
 * of that FME.  If the new address doesn't match the address of any existing
 * open FME, then we return true in the context of a newly opened FME.
 */
#define GET_ADDR (defined($addr) ? ($addr == payloadprop("addr")) : \
    ($addr = payloadprop("addr")))

#define GET_OFFSET ($offset = payloadprop("resource[0].hc-specific.offset"))

/*
 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
 * we diagnose for page faults, to record the physical address of the faulting
 * page.  The "asru-" prefix is hooked in the "rewrite-ASRU" confcalls made on
 * diagnosis of associated faults when the libtopo mem scheme rewrites the
 * asru in "mem" scheme.
 */
#define SET_ADDR (setpayloadprop("asru-physaddr", $addr))

#define SET_OFFSET (setpayloadprop("asru-offset", $offset))

/*
 * RESOURCE_EXISTS is true if a member with name "resource" exists in the
 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
 */
#define RESOURCE_EXISTS (payloadprop_defined("resource"))

/*
 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
 * ereports) exists and one of its members matches the path for the
 * rank node.  Our memory propagations are of the form
 *
 * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/cpu"
 *
 * since cpus detect memory errors; in eversholt such a propagation, where
 * the lhs path and rhs path do not match, expands to the cross-product of
 * all dimms, ranks and cpus on the same chip (since chip appears in the
 * path on both sides).  We use CONTAINS_RANK to constrain the propagation
 * such that it only happens if the payload resource matches the rank.
 */
#define CONTAINS_RANK (payloadprop_contains("resource", \
    asru(chip/memory-controller/dimm/rank)))
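
/*
 * Illustrative expansion (the instance numbers below are hypothetical,
 * not part of the rules): on a chip carrying two dimms of two ranks
 * each, a propagation such as
 *
 *	prop foo@chip/memory-controller/dimm/rank -> blah@chip/cpu
 *
 * instantiates once for every dimm/rank pair on the chip.  With
 * CONTAINS_RANK in the constraint, only the instance whose rank path
 * matches the ereport's "resource" payload member survives; the rest of
 * the cross-product is pruned.
 */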

/*
 * The following will tell us whether a syndrome that is known to be
 * correctable (from a mem_ce ereport) is single-bit or multi-bit.  For a
 * correctable ChipKill syndrome the number of bits set in the lowest
 * nibble indicates how many bits were in error.
 */

#define CBITMASK(synd) ((synd) & 0xf)

#define CKSINGLE(synd) \
    ((synd) == 0 || \
    (CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 || \
    CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))

#define SINGLE_BIT_CE \
    (payloadprop("syndrome-type") == "E" || \
    (payloadprop("syndrome-type") == "C" && \
    CKSINGLE(payloadprop("syndrome"))))

#define MULTI_BIT_CE \
    (payloadprop("syndrome-type") == "C" && \
    !CKSINGLE(payloadprop("syndrome")))
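
/*
 * Worked example (the syndrome values are arbitrary illustrations): a
 * mem_ce ereport with syndrome-type "C" and syndrome 0x604 has
 * CBITMASK(0x604) == 0x4 - one bit set in the low nibble - so CKSINGLE
 * is true and the ereport classifies as SINGLE_BIT_CE.  A syndrome of
 * 0x606 gives CBITMASK == 0x6 (two bits set), so it classifies as
 * MULTI_BIT_CE.  Syndrome-type "E" (normal 64/8 ECC) is single-bit by
 * definition.
 */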

/*
 * A single bit fault in a memory rank can cause:
 *
 *  - mem_ce : reported by nb
 *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
 *    ic do not record a syndrome; these errors will not be triggered in
 *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
 *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
 *    ECC mode (the NB corrects all ECC in that mode)
 *
 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
 * trips we diagnose a fault.memory.page_sb so that the response agent can
 * retire the page that caused the trip.  If the total number of pages
 * faulted in this way on a single rank exceeds a threshold we will
 * diagnose a fault.memory.dimm_sb against the containing dimm.
 *
 * Multibit ChipKill-correctable errors are treated identically to
 * single-bit errors, but via separate serd engines to allow distinct
 * parameters if desired.
 *
 * Uncorrectable errors produce an immediate page fault and corresponding
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is only required when
 * they are accompanied by a dimm fault.  As such we include message=0
 * on page faults.
 */

event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.nb.mem_ce@chip/cpu{within(5s)};

/*
 * If the address is not valid then no resource member will be included
 * in a nb.mem_ce or nb.mem_ue ereport; these cases should be rare, and
 * we will discard such ereports.  We will also discard all inf_sys_ecc1
 * events detected at the ic since they have no syndrome and therefore no
 * resource information.  An alternative may be to SERD them on a per-MC
 * basis and trip if we see too many such events.
 */

event upset.memory.discard1@chip/cpu;

/* #PAGE#
 * Single-bit correctable errors are diagnosed as upsets and feed into per-rank
 * SERD engines which diagnose fault.memory.page_sb if they trip.
 *
 * Multi-bit correctable (via ChipKill) errors are diagnosed as upsets and feed
 * into additional per-rank SERD engines which diagnose fault.memory.page_ck
 * if they trip.
 *
 * The number of fault.memory.page_sb and fault.memory.page_ck faults
 * diagnosed is counted in stat engines for each type.  These are used in
 * deciding whether to declare a dimm faulty after repeated page faults.
 */

#define PAGE_FIT		1
#define PAGE_SB_COUNT		2
#define PAGE_SB_TIME		72h
#define PAGE_CK_COUNT		2
#define PAGE_CK_TIME		72h

/*
 * The fraction of pages on a single rank that must be diagnosed as faulty
 * with single correctable unit faults before we will fault the rank.
 * Once we have faulted the rank we will continue to diagnose any further page
 * faults on the rank up to some maximum multiple of the threshold at which
 * we faulted the dimm.  This allows us to potentially contain some fairly
 * far-reaching but still limited-extent fault (such as a partial column
 * failure) without getting carried away and allowing a single faulty rank to
 * use up the entire system-imposed page retirement limit (which, once
 * reached, causes retirement requests to have no effect other than to fill
 * the fault manager cache and logs).
 *
 * This fraction is specified in basis points, where 100 basis points are
 * equivalent to 1 percent.  It is applied on a per-rank basis.
 *
 * The system imposes an absolute maximum on the number of pages it will
 * retire; the current value is 10 basis points, or 0.1% of 'physmem'.  Note
 * that 'physmem' is reduced from installed memory pages by an amount
 * reflecting permanent kernel memory allocations.  This system page retire
 * limit bounds the maximum real response to page faults across all ranks
 * that fault manager response agents can effect, but it should not be
 * confused with any diagnosis threshold (i.e., the number of faulty pages we
 * are prepared to tolerate from a single rank before faulting the rank is
 * distinct from the total number of pages we are prepared to retire from use
 * in response to that and other faults).  It is, however, desirable to
 * arrange that the maximum number of pages we are prepared to fault from
 * any one rank is less than the system-wide quota.
 */
#define PAGE_RETIRE_LIMIT_BPS	5	/* or 0.05%; ~131 pages/GB */

/*
 * A macro to manipulate the above fraction.  Given a size in bytes, convert
 * it to pages (4K pagesize) and calculate the number of those pages
 * indicated by PAGE_RETIRE_LIMIT_BPS basis points.
 */
#define _BPS_PGCNT(totalbytes) \
    ((((totalbytes) / 4096 ) * PAGE_RETIRE_LIMIT_BPS) / 10000)

/*
 * The threshold number of single-correctable-unit page faults on a rank
 * at which we will fault the rank.  We insist that this be at least 128
 * and never more than 512.
 */
#define RANK_THRESH MIN(512, MAX(128, \
    _BPS_PGCNT(confprop(asru(chip/memory-controller/dimm/rank), "size"))))

/*
 * The maximum number of single-correctable-unit page faults we will diagnose
 * on a single rank (must be greater than RANK_THRESH).  We set
 * this at twice the rank fault threshold.
 */
#define RANK_PGFLT_MAX (2 * RANK_THRESH)
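
/*
 * Worked example (the rank sizes are hypothetical): a 1GB rank holds
 * 1073741824 / 4096 = 262144 pages, so _BPS_PGCNT yields
 * (262144 * 5) / 10000 = 131 pages and RANK_THRESH is
 * MIN(512, MAX(128, 131)) = 131, giving a RANK_PGFLT_MAX of 262.
 * A 512MB rank works out at 65 pages and is clamped up to the floor of
 * 128; a 16GB rank works out at 2097 pages and is clamped down to the
 * ceiling of 512.
 */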

engine stat.sbpgflt@chip/memory-controller/dimm/rank;
engine stat.ckpgflt@chip/memory-controller/dimm/rank;

event fault.memory.page_sb@chip/memory-controller/dimm/rank,
    FITrate=PAGE_FIT, ASRU=dimm/rank, message=0,
    count=stat.sbpgflt@chip/memory-controller/dimm/rank,
    action=confcall("rewrite-ASRU"); /* rewrite ASRU to identify page in rank */

#define SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank))

event fault.memory.page_ck@chip/memory-controller/dimm/rank,
    FITrate=PAGE_FIT, ASRU=dimm/rank, message=0,
    count=stat.ckpgflt@chip/memory-controller/dimm/rank,
    action=confcall("rewrite-ASRU"); /* rewrite ASRU to identify page in rank */

#define CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank))

#define RANK_PGFLT_LIMIT_REACHED \
    (SB_PGFLTS + CK_PGFLTS > RANK_PGFLT_MAX)

event ereport.memory.page_sb_trip@chip/memory-controller/dimm/rank;
engine serd.memory.page_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME, method=persistent,
    trip=ereport.memory.page_sb_trip@chip/memory-controller/dimm/rank;
event upset.memory.page_sb@chip/memory-controller/dimm/rank,
    engine=serd.memory.page_sb@chip/memory-controller/dimm/rank;

event ereport.memory.page_ck_trip@chip/memory-controller/dimm/rank;
engine serd.memory.page_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME, method=persistent,
    trip=ereport.memory.page_ck_trip@chip/memory-controller/dimm/rank;
event upset.memory.page_ck@chip/memory-controller/dimm/rank,
    engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;

event upset.memory.overpgfltlimit@chip/memory-controller/dimm/rank;

/*
 * If we have not reached the per-rank limit on faulted pages then we
 * continue to explain ereport observations as upsets, which can lead
 * to page fault diagnoses if the serd engine trips.
 */
prop upset.memory.page_sb@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu
    { CONTAINS_RANK && SINGLE_BIT_CE && !RANK_PGFLT_LIMIT_REACHED },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu
    { CONTAINS_RANK && SINGLE_BIT_CE && !RANK_PGFLT_LIMIT_REACHED },
    ereport.cpu.amd.nb.mem_ce@chip/cpu
    { CONTAINS_RANK && SINGLE_BIT_CE && !RANK_PGFLT_LIMIT_REACHED };

prop upset.memory.page_ck@chip/memory-controller/dimm/rank (0)->
    /* no dc.inf_sys_ecc1 or bu.s_ecc1 in ChipKill mode */
    ereport.cpu.amd.nb.mem_ce@chip/cpu
    { CONTAINS_RANK && MULTI_BIT_CE && !RANK_PGFLT_LIMIT_REACHED };
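
/*
 * Illustrative sequence (the timings are hypothetical): nb.mem_ce
 * ereports carrying single-bit syndromes for the same rank, arriving
 * hours apart, accumulate in serd.memory.page_sb for that rank until its
 * N=PAGE_SB_COUNT/T=PAGE_SB_TIME parameters are satisfied and the engine
 * trips.  The synthetic page_sb_trip ereport is then diagnosed as
 * fault.memory.page_sb by the propagations below, and the fault's count=
 * property increments the rank's stat.sbpgflt engine.
 */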

/*
 * If we have reached the per-rank limit on faulted pages then diagnose
 * further observations on the rank to an engine-less upset (i.e., discard
 * them).
 */
prop upset.memory.overpgfltlimit@chip/memory-controller/dimm/rank (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu
    { CONTAINS_RANK && RANK_PGFLT_LIMIT_REACHED },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu
    { CONTAINS_RANK && RANK_PGFLT_LIMIT_REACHED },
    ereport.cpu.amd.nb.mem_ce@chip/cpu
    { CONTAINS_RANK && RANK_PGFLT_LIMIT_REACHED };

prop fault.memory.page_sb@chip/memory-controller/dimm/rank (1)->
    ereport.memory.page_sb_trip@chip/memory-controller/dimm/rank;

prop fault.memory.page_ck@chip/memory-controller/dimm/rank (1)->
    ereport.memory.page_ck_trip@chip/memory-controller/dimm/rank;

prop fault.memory.page_sb@chip/memory-controller/dimm/rank
    { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu
    { CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu
    { CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.nb.mem_ce@chip/cpu
    { CONTAINS_RANK && GET_ADDR && GET_OFFSET };

prop fault.memory.page_ck@chip/memory-controller/dimm/rank
    { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu
    { CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu
    { CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.nb.mem_ce@chip/cpu
    { CONTAINS_RANK && GET_ADDR && GET_OFFSET };

/*
 * Discard memory ereports that do not indicate a resource.
 */
prop upset.memory.discard1@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.nb.mem_ce@chip/cpu { !RESOURCE_EXISTS };
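
/*
 * Illustrative flow (the address value is hypothetical): suppose the
 * ereports that tripped an engine carried addr 0x12345000.  GET_ADDR
 * captures that value in $addr within the FME and GET_OFFSET captures
 * the rank offset; the { SET_ADDR && SET_OFFSET } propagations above
 * then copy them into the diagnosed fault as "asru-physaddr" and
 * "asru-offset", which the "rewrite-ASRU" confcall uses to produce a
 * page-level asru in the mem scheme.
 */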

/* #DIMM_SCU#
 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of
 * page faults (diagnosed from repeated single-bit or multibit-ChipKill
 * correctable errors) from any one rank on that DIMM reaches a threshold.
 * A "correctable unit" is a single bit in normal 64/8 ECC mode, or a single
 * symbol in ChipKill 128/16 mode (i.e., a nibble-aligned nibble for the code
 * used on Opteron).
 *
 * We do not stop diagnosing further single-bit page faults once we have
 * declared a single-bit DIMM fault - we continue diagnosing them and
 * response agents can continue to retire those pages up to the system-imposed
 * retirement limit.
 *
 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and
 * fault.memory.dimm_ck.  Which one is diagnosed depends on whether the
 * majority of the page faults at the time we reach the threshold were
 * single-bit or multibit.
 *
 * Implementation: we maintain SERD engines parallel to the page_sb and
 * page_ck engines, which trip in unison.  On trip a distinct ereport is
 * generated, which we diagnose to a fault if the threshold has been
 * reached, or to a throwaway upset if not.
 */

#define DIMM_SB_FIT	2000
#define DIMM_CK_FIT	4000

event fault.memory.dimm_sb@chip/memory-controller/dimm/rank,
    FITrate=DIMM_SB_FIT, FRU=dimm, ASRU=dimm,
    action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */

event fault.memory.dimm_ck@chip/memory-controller/dimm/rank,
    FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm,
    action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */

event ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank
    { within(5s) };
engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME, method=persistent,
    trip=ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank;
event upset.memory.dimm_sb@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank;

event ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank
    { within(5s) };
engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME, method=persistent,
    trip=ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank;
event upset.memory.dimm_ck@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank;

event upset.memory.discard2@chip/memory-controller/dimm/rank;

prop upset.memory.dimm_sb@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu { CONTAINS_RANK && SINGLE_BIT_CE },
    ereport.cpu.amd.bu.s_ecc1@chip/cpu { CONTAINS_RANK && SINGLE_BIT_CE },
    ereport.cpu.amd.nb.mem_ce@chip/cpu { CONTAINS_RANK && SINGLE_BIT_CE };

prop upset.memory.dimm_ck@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.nb.mem_ce@chip/cpu { CONTAINS_RANK && MULTI_BIT_CE };

/*
 * The following two propagations diagnose a fault.memory.dimm_sb when
 * either the dimm_sb or dimm_ck engine trips (for a new page fault)
 * and the total number of page faults (sb and ck) exceeds the threshold
 * value with the majority being from sb page faults.
 */
prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank (0)->
    ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank
    { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && SB_PGFLTS > RANK_THRESH / 2 };

prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank (0)->
    ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank
    { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && SB_PGFLTS > RANK_THRESH / 2 };
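
/*
 * Worked example (the counts are hypothetical): with RANK_THRESH
 * computed as 131, suppose a rank has accumulated SB_PGFLTS = 70 and
 * CK_PGFLTS = 65 when an engine trips.  The total of 135 exceeds
 * RANK_THRESH, and SB_PGFLTS = 70 exceeds RANK_THRESH / 2 (65 with
 * integer division), so the propagations above diagnose
 * fault.memory.dimm_sb regardless of which of the two engines tripped.
 */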

/*
 * The following two propagations diagnose a fault.memory.dimm_ck when
 * either the dimm_sb or dimm_ck engine trips (for a new page fault)
 * and the total number of page faults (sb and ck) exceeds the threshold
 * value with the majority being from ck page faults.
 */
prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank (0)->
    ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank
    { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && CK_PGFLTS > RANK_THRESH / 2 };

prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank (0)->
    ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank
    { SB_PGFLTS + CK_PGFLTS > RANK_THRESH && CK_PGFLTS > RANK_THRESH / 2 };

prop upset.memory.discard2@chip/memory-controller/dimm/rank (1)->
    ereport.memory.dimm_sb_trip@chip/memory-controller/dimm/rank,
    ereport.memory.dimm_ck_trip@chip/memory-controller/dimm/rank;

/* #DIMM_UE#
 * #PAGE_UE#
 * An uncorrectable multi-bit fault in a memory dimm can cause:
 *
 *  - mem_ue : reported by nb for an access from a remote cpu
 *  - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome
 *  - s_eccm : reported by bu
 *
 * Note we use a SERD engine here simply as a way of ensuring that we get
 * both dimm and page faults reported.
 *
 * Since on production systems we force HT Sync Flood on uncorrectable
 * memory errors (if not already set as such by the BIOS, as it should be)
 * we won't actually receive these ereports, since the system will be reset.
 */

#define DIMM_UE_FIT	6000

event ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.nb.mem_ue@chip/cpu{within(5s)};

event fault.memory.dimm_ue@chip/memory-controller/dimm/rank,
    FITrate=DIMM_UE_FIT, FRU=dimm, ASRU=dimm,
    action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */

event fault.memory.page_ue@chip/memory-controller/dimm/rank,
    FITrate=PAGE_FIT, ASRU=dimm/rank, message=0,
    action=confcall("rewrite-ASRU"); /* rewrite ASRU to identify page in rank */

event ereport.memory.dimm_ue_trip@chip/memory-controller/dimm/rank;
engine serd.memory.dimm_ue@chip/memory-controller/dimm/rank,
    N=0, T=1h, method=persistent,
    trip=ereport.memory.dimm_ue_trip@chip/memory-controller/dimm/rank;
event upset.memory.dimm_ue@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_ue@chip/memory-controller/dimm/rank;

event ereport.memory.page_ue_trip@chip/memory-controller/dimm/rank;
engine serd.memory.page_ue@chip/memory-controller/dimm/rank,
    N=0, T=1h, method=persistent,
    trip=ereport.memory.page_ue_trip@chip/memory-controller/dimm/rank;
event upset.memory.page_ue@chip/memory-controller/dimm/rank,
    engine=serd.memory.page_ue@chip/memory-controller/dimm/rank;

event upset.memory.discard3@chip/cpu;

prop upset.memory.page_ue@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.bu.s_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.nb.mem_ue@chip/cpu { CONTAINS_RANK };

prop upset.memory.dimm_ue@chip/memory-controller/dimm/rank (0)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.bu.s_eccm@chip/cpu { CONTAINS_RANK },
    ereport.cpu.amd.nb.mem_ue@chip/cpu { CONTAINS_RANK };

prop fault.memory.page_ue@chip/memory-controller/dimm/rank (1)->
    ereport.memory.page_ue_trip@chip/memory-controller/dimm/rank;
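
/*
 * Illustrative flow: because the page_ue and dimm_ue engines have N=0,
 * a single uncorrectable ereport naming a rank (say one bu.s_eccm)
 * trips both engines at once, so the one observation is explained as a
 * fault.memory.page_ue and a fault.memory.dimm_ue together - the
 * "both dimm and page faults" behaviour noted above.
 */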

prop fault.memory.page_ue@chip/memory-controller/dimm/rank
    { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu
    { CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu
    { CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.bu.s_eccm@chip/cpu
    { CONTAINS_RANK && GET_ADDR && GET_OFFSET },
    ereport.cpu.amd.nb.mem_ue@chip/cpu
    { CONTAINS_RANK && GET_ADDR && GET_OFFSET };

prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank (1)->
    ereport.memory.dimm_ue_trip@chip/memory-controller/dimm/rank;

prop upset.memory.discard3@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.bu.s_eccm@chip/cpu { !RESOURCE_EXISTS },
    ereport.cpu.amd.nb.mem_ue@chip/cpu { !RESOURCE_EXISTS };

/* #CSTESTFAIL#
 * If the BIOS fails a chip-select during POST, or perhaps after a
 * sync flood from an uncorrectable error, then on revisions F and G it
 * should mark that chip-select as TestFail in the CS Base register.
 * When the memory-controller driver discovers all the MC configuration
 * it notes such failed chip-selects and creates topology nodes for the
 * chip-select and associated dimms and ranks, and produces an ereport for each
 * failed chip-select with detector set to the memory-controller node
 * and resource indicating the failed chip-select.
 */

event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller;

event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank,
    FITrate=1000, ASRU=dimm, FRU=dimm,
    action=confcall("rewrite-ASRU"); /* rewrite non-leaf ASRU in mem scheme */

event error.memory.cs_testfail@chip/memory-controller/chip-select;

#define CONTAINS_CS (payloadprop_contains("resource", \
    asru(chip/memory-controller/chip-select)))

prop error.memory.cs_testfail@chip/memory-controller/chip-select ->
    ereport.cpu.amd.mc.cs_testfail@chip/memory-controller
    { CONTAINS_CS };

#define CSMATCH(s) \
    (confprop_defined(asru(chip/memory-controller/chip-select), s) && \
    confprop(asru(chip/memory-controller/chip-select), s) == \
    confprop(asru(chip/memory-controller/dimm/rank), "csname"))

prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank ->
    error.memory.cs_testfail@chip/memory-controller/chip-select
    { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname") };

/* #ADDRPAR#
 * DRAM Command/Address Parity Errors.
 *
 *  - dramaddr_par : reported by the nb; the NB status register includes
 *    a bit indicating which dram controller channel (A or B) experienced
 *    the error.
 */

event ereport.cpu.amd.nb.dramaddr_par@chip/cpu;

event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel,
    FITrate=1000, ASRU=dram-channel;

#define GET_CHANNEL ($chan = (payloadprop("bank-status") >> 32 & 0x200) ? \
    1 : 0)

prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)->
    ereport.cpu.amd.nb.dramaddr_par@chip/cpu { GET_CHANNEL && $chan == y };
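
/*
 * Worked example (the status value is hypothetical): GET_CHANNEL tests
 * bit 41 of the raw "bank-status" payload member.  A bank-status of
 * 0x9432020000000800 has ((bank-status >> 32) & 0x200) != 0, so $chan
 * is set to 1 and the propagation above binds the fault to
 * dram-channel 1 (channel B); with bit 41 clear the fault binds to
 * channel A.
 */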

/*
 * l2 cache data errors
 */

#define L2CACHEDATA_FIT		1000
#define L2CACHEDATA_SB_COUNT	3
#define L2CACHEDATA_SB_TIME	12h

event fault.cpu.amd.l2cachedata@chip/cpu, FITrate=L2CACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.l2cachedata_sb@chip/cpu;
event error.cpu.amd.l2cachedata_mb@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (1)->
    error.cpu.amd.l2cachedata_sb@chip/cpu,
    error.cpu.amd.l2cachedata_mb@chip/cpu;

/* #L2D_SINGLE#
 * A single bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_ecc1 : reported by ic on this cpu
 *  - inf_l2_ecc1 : reported by dc on this cpu
 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
 *
 * Single-bit errors are diagnosed to cache upsets.  SERD engines are used
 * to count upsets resulting from CEs.
 */

event ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2d_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.l2d_sb_trip@chip/cpu;

engine serd.cpu.amd.l2d_sb@chip/cpu,
    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2d_sb_trip@chip/cpu;

event upset.cpu.amd.l2d_sb@chip/cpu,
    engine=serd.cpu.amd.l2d_sb@chip/cpu;

prop upset.cpu.amd.l2d_sb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

prop error.cpu.amd.l2cachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.l2d_sb_trip@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

/* #L2D_MULTI#
 * A multi-bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_eccm : reported by ic on this cpu
 *  - inf_l2_eccm : reported by dc on this cpu
 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
 */

event ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu;
event ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu;
event ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

prop error.cpu.amd.l2cachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;
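
/*
 * Illustrative structure: the error.cpu.amd.l2cachedata_sb/_mb events
 * above act as intermediate nodes, so the single fault.cpu.amd.l2cachedata
 * can be reached either via an l2d_sb_trip (enough correctable events to
 * satisfy the serd engine) or directly from a multi-bit eccm ereport.
 * The (0)-> propagations from the fault to the raw ecc1/eccm ereports
 * allow those ereports to be explained by the fault without requiring
 * them for the diagnosis.
 */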

/*
 * l2 cache main tag errors
 */

#define L2CACHETAG_FIT		1000
#define L2CACHETAG_SB_COUNT	3
#define L2CACHETAG_SB_TIME	12h

event fault.cpu.amd.l2cachetag@chip/cpu, FITrate=L2CACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.l2cachetag_sb@chip/cpu;
event error.cpu.amd.l2cachetag_mb@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (1)->
    error.cpu.amd.l2cachetag_sb@chip/cpu,
    error.cpu.amd.l2cachetag_mb@chip/cpu;

/* #L2T_SINGLE#
 * A single bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 *
 * Note that the bu.l2t_par ereport could be due to a single-bit or multi-bit
 * event.  If the l2t_sb_trip has already triggered it will be treated as
 * another CE, otherwise it will be treated as a UE event.
 */

event ereport.cpu.amd.bu.l2t_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/cpu;
event ereport.cpu.amd.l2t_sb_trip@chip/cpu;

engine serd.cpu.amd.l2t_sb@chip/cpu,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2t_sb_trip@chip/cpu;

event upset.cpu.amd.l2t_sb@chip/cpu,
    engine=serd.cpu.amd.l2t_sb@chip/cpu;

prop upset.cpu.amd.l2t_sb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop error.cpu.amd.l2cachetag_sb@chip/cpu (1)->
    ereport.cpu.amd.l2t_sb_trip@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* #L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/cpu;

prop error.cpu.amd.l2cachetag_mb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* #ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 */

#define ICACHEDATA_FIT		1000
#define ICACHEDATA_SB_COUNT	2
#define ICACHEDATA_SB_TIME	168h

event ereport.cpu.amd.ic.data_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_dp_trip@chip/cpu;

event fault.cpu.amd.icachedata@chip/cpu, FITrate=ICACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachedata@chip/cpu,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_dp_trip@chip/cpu;

event upset.cpu.amd.icachedata@chip/cpu,
    engine=serd.cpu.amd.icachedata@chip/cpu;

prop upset.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic_dp_trip@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

/* #ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 */

#define ICACHETAG_FIT		1000
#define ICACHETAG_SB_COUNT	2
#define ICACHETAG_SB_TIME	168h

event ereport.cpu.amd.ic.tag_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_tp_trip@chip/cpu;

event fault.cpu.amd.icachetag@chip/cpu, FITrate=ICACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachetag@chip/cpu,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_tp_trip@chip/cpu;

event upset.cpu.amd.icachetag@chip/cpu,
    engine=serd.cpu.amd.icachetag@chip/cpu;

prop upset.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic_tp_trip@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (0)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;
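
/*
 * Note the contrast within these icache rules (parameters as defined
 * above): the icachedata and icachetag faults sit behind serd engines,
 * so a lone data_par or tag_par ereport is recorded only as an upset and
 * repeated occurrences within ICACHEDATA_SB_TIME/ICACHETAG_SB_TIME are
 * needed to trip a diagnosis; the snoop tag fault below has no engine,
 * so a single stag_par ereport is diagnosed immediately.
 */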

/* #ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 */

#define ICACHESTAG_FIT		1000

event ereport.cpu.amd.ic.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.icachestag@chip/cpu, FITrate=ICACHESTAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.icachestag@chip/cpu (1)->
    ereport.cpu.amd.ic.stag_par@chip/cpu;

/* #ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 */

#define ICACHEL1TLB_FIT		1000
#define ICACHEL1TLB_SB_COUNT	2
#define ICACHEL1TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l1tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event fault.cpu.amd.l1itlb@chip/cpu, FITrate=ICACHEL1TLB_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l1itlb@chip/cpu,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event upset.cpu.amd.l1itlb@chip/cpu,
    engine=serd.cpu.amd.l1itlb@chip/cpu;

prop upset.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

/* #ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 */

#define ICACHEL2TLB_FIT		1000
#define ICACHEL2TLB_SB_COUNT	2
#define ICACHEL2TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l2tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event fault.cpu.amd.l2itlb@chip/cpu, FITrate=ICACHEL2TLB_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l2itlb@chip/cpu,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event upset.cpu.amd.l2itlb@chip/cpu,
    engine=serd.cpu.amd.l2itlb@chip/cpu;

prop upset.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

/*
 * dcache data errors
 */

#define DCACHEDATA_FIT		1000
#define DCACHEDATA_SB_COUNT	2
#define DCACHEDATA_SB_TIME	168h

event fault.cpu.amd.dcachedata@chip/cpu, FITrate=DCACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.dcachedata_sb@chip/cpu;
event error.cpu.amd.dcachedata_mb@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (1)->
    error.cpu.amd.dcachedata_sb@chip/cpu,
    error.cpu.amd.dcachedata_mb@chip/cpu;

/* #DCD_SINGLE#
 * A single bit data array fault in a D cache can cause:
 *
 *  - data_ecc1 : reported by dc on this cpu by the scrubber
 *  - data_ecc1_uc : reported by dc on this cpu other than by the scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic.
 */

event ereport.cpu.amd.dc.data_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu{within(5s)};
event ereport.cpu.amd.dc_sb_trip@chip/cpu;

engine serd.cpu.amd.dc_sb@chip/cpu,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

engine serd.cpu.amd.dc_sb_uc@chip/cpu,
    N=0, T=1hr, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

event upset.cpu.amd.dc_sb@chip/cpu,
    engine=serd.cpu.amd.dc_sb@chip/cpu;

event upset.cpu.amd.dc_sb_uc@chip/cpu,
    engine=serd.cpu.amd.dc_sb_uc@chip/cpu;
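
/*
 * Illustrative flow: both dcache data serd engines trip to the same
 * dc_sb_trip ereport.  Scrubber-detected data_ecc1 events accumulate in
 * serd.cpu.amd.dc_sb (N=DCACHEDATA_SB_COUNT over DCACHEDATA_SB_TIME),
 * while a data_ecc1_uc event feeds the N=0 serd.cpu.amd.dc_sb_uc engine
 * and so trips it on the first event - the "fault immediately" behaviour
 * noted above, since the access that hit the error may already have
 * caused a panic.
 */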

prop upset.cpu.amd.dc_sb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu;

prop upset.cpu.amd.dc_sb_uc@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

prop error.cpu.amd.dcachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.dc_sb_trip@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

/* #DCD_MULTI#
 * A multi-bit data array fault in a D cache can cause:
 *
 *  - data_eccm : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop error.cpu.amd.dcachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

/* #DCT_PAR#
 * A tag array parity fault in a D cache can cause:
 *
 *  - tag_par : reported by dc on this cpu
 */

#define DCACHETAG_FIT		1000

event ereport.cpu.amd.dc.tag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachetag@chip/cpu, FITrate=DCACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachetag@chip/cpu (1)->
    ereport.cpu.amd.dc.tag_par@chip/cpu;

/* #DCT_SNOOP#
 * A snoop tag array parity fault in a D cache can cause:
 *
 *  - stag_par : reported by dc on this cpu
 */

#define DCACHESTAG_FIT		1000

event ereport.cpu.amd.dc.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachestag@chip/cpu, FITrate=DCACHESTAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachestag@chip/cpu (1)->
    ereport.cpu.amd.dc.stag_par@chip/cpu;

/* #DCTLB_1#
 * An l1tlb parity fault in a D cache can cause:
 *
 *  - l1tlb_par : reported by dc on this cpu
 */

#define L1DTLB_FIT		1000

event ereport.cpu.amd.dc.l1tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l1dtlb@chip/cpu, FITrate=L1DTLB_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l1dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l1tlb_par@chip/cpu;

/* #DCTLB_2#
 * An l2tlb parity fault in a D cache can cause:
 *
 *  - l2tlb_par : reported by dc on this cpu
 */

#define L2DTLB_FIT		1000

event ereport.cpu.amd.dc.l2tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l2dtlb@chip/cpu, FITrate=L2DTLB_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l2dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l2tlb_par@chip/cpu;

/* #MISC#
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do.  These fall into a few categories:
 *
 *  - the corresponding detector is not enabled, typically because
 *    detection/handling of the event is taking place elsewhere
 *    (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *  - the event is associated with a sync flood so even if the detector is
 *    enabled we will never handle the event and generate an ereport *and*
 *    even if the ereport did arrive we could perform no useful diagnosis
 *    e.g., the NB can be configured for sync flood on nb.mem_eccm
 *    but we don't choose to discard that ereport here since we could have
 *    made a useful diagnosis from it had it been delivered
 *    (nb.ht_sync, nb.ht_crc)
 *  - events that will be accompanied by an immediate panic and
 *    delivery of the ereport during subsequent reboot but from
 *    which no useful diagnosis can be made (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection.  We will perform a null diagnosis of all these ereports in
 * order to avoid "no subscription" complaints during test harness runs.
 */

event ereport.cpu.amd.nb.ma@cpu;
event ereport.cpu.amd.nb.ta@cpu;
event ereport.cpu.amd.ls.s_rde@cpu;
event ereport.cpu.amd.ic.rdde@cpu;
event ereport.cpu.amd.bu.s_rde@cpu;
event ereport.cpu.amd.nb.gart_walk@cpu;
event ereport.cpu.amd.nb.ht_sync@cpu;
event ereport.cpu.amd.nb.ht_crc@cpu;
event ereport.cpu.amd.nb.rmw@cpu;
event ereport.cpu.amd.nb.wdog@cpu;
event ereport.cpu.amd.unknown@cpu;

event upset.null_diag@cpu;

prop upset.null_diag@cpu (1)->
    ereport.cpu.amd.nb.ma@cpu,
    ereport.cpu.amd.nb.ta@cpu,
    ereport.cpu.amd.ls.s_rde@cpu,
    ereport.cpu.amd.ic.rdde@cpu,
    ereport.cpu.amd.bu.s_rde@cpu,
    ereport.cpu.amd.nb.gart_walk@cpu,
    ereport.cpu.amd.nb.ht_sync@cpu,
    ereport.cpu.amd.nb.ht_crc@cpu,
    ereport.cpu.amd.nb.rmw@cpu,
    ereport.cpu.amd.nb.wdog@cpu,
    ereport.cpu.amd.unknown@cpu;