/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

fru dimm;
asru dimm;

fru chip;
asru chip/cpu;


/* #MEM#
 * GET_ADDR depends on variables being global in scope across an FME.
 * Within any one FME the assignment branch therefore runs only on the
 * first invocation, while the comparison branch runs on every later one.
 * If the incoming address equals the address held by an existing open
 * FME we return true in the context of that FME; if it equals none of
 * them we return true in the context of a newly opened FME.
 */
#define	GET_ADDR						\
	(defined($addr) ?					\
	    ($addr == payloadprop("addr")) :			\
	    ($addr = payloadprop("addr")))

#define	GET_OFFSET						\
	($offset = payloadprop("resource[0].hc-specific.offset"))

/*
 * SET_ADDR records the physical address of the faulting page as a
 * payload value of the page fault that we diagnose.
 */
#define	SET_ADDR	(setpayloadprop("asru-physaddr", $addr))

#define	SET_OFFSET	(setpayloadprop("asru-offset", $offset))

/*
 * RESOURCE_EXISTS is true when the payload has a pair named "resource",
 * whatever its type (e.g., nvlist or nvlist array) or value.
 */
#define	RESOURCE_EXISTS	(payloadprop_defined("resource"))

/*
 * CONTAINS_DIMM is true when the "resource" nvlist array (as used in
 * memory ereports) exists and one of its members matches the path of
 * the dimm node. Memory propagations here have the form
 * "foo@dimm -> blah@cpu" because it is the cpus that detect memory
 * errors. In eversholt a propagation whose lhs and rhs paths do not
 * match expands to the cross-product of all dimms and cpus in the
 * system, so CONTAINS_DIMM is used to constrain the propagation to the
 * case where the payload resource matches the dimm.
 */
#define	CONTAINS_DIMM	(payloadprop_contains("resource", asru(dimm)))

/*
 * The macros below classify a syndrome already known to be correctable
 * (from a mem_ecc1) as single-bit or multi-bit. For a correctable
 * ChipKill syndrome the count of bits set in the lowest nibble gives
 * the number of bits that were in error.
 */

#define	CBITMASK(synd)	((synd) & 0xf)

#define	CKSINGLE(synd)						\
	((synd) == 0 ||						\
	    CBITMASK(synd) == 0x1 ||				\
	    CBITMASK(synd) == 0x2 ||				\
	    CBITMASK(synd) == 0x4 ||				\
	    CBITMASK(synd) == 0x8)

#define	SINGLE_BIT_CE						\
	(payloadprop("syndrome-type") == "E" ||			\
	    (payloadprop("syndrome-type") == "C" &&		\
	    CKSINGLE(payloadprop("syndrome"))))

#define	MULTI_BIT_CE						\
	(payloadprop("syndrome-type") == "C" &&			\
	    !CKSINGLE(payloadprop("syndrome")))

/*
 * A single bit fault in a memory dimm can cause:
 *
 *  - mem_ce : reported by nb for an access from a remote cpu
 *
 * Single-bit errors feed a per-DIMM SERD engine. When that engine trips
 * we diagnose a fault.memory.page so the response agent can retire the
 * page responsible for the trip. Once the number of pages faulted this
 * way on one DIMM passes a threshold we diagnose a fault.memory.dimm_sb
 * against the DIMM.
 *
 * Multibit ChipKill-correctable errors yield an immediate page fault
 * together with a fault.memory.dimm_ck. This is done with SERD engines
 * using N=0, which leaves room to become a little more tolerant of
 * these errors.
 *
 * Uncorrectable errors yield an immediate page fault together with a
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is needed only when a
 * dimm fault accompanies them. As such we include message=0 on DIMM
 * faults.
 */

event ereport.cpu.amd.nb.mem_ce@cpu;

/*
 * When the address is not valid, a nb.mem_ce or nb.mem_ue ereport will
 * carry no resource member. Such cases should be rare and we discard
 * those ereports. An alternative would be to SERD them per MC and trip
 * when too many are seen.
 */

event upset.memory.discard@cpu;

/* #PAGE#
 * All types of page fault diagnose to one fault class and are counted
 * with a stat.
 *
 * Single-bit errors are diagnosed as upsets feeding per-DIMM SERD
 * engines, which diagnose fault.memory.page when they trip.
 */

#define	PAGE_FIT	1
#define	PAGE_SB_COUNT	2
#define	PAGE_SB_TIME	72h
#define	PAGE_CK_COUNT	0
#define	PAGE_CK_TIME	1h

engine stat.page_fault@dimm;
event fault.memory.page@dimm,
    FITrate=PAGE_FIT,
    ASRU=dimm,
    message=0,
    count=stat.page_fault@dimm,
    action=confcall("rewrite-ASRU");
event error.memory.page_sb@dimm;
event error.memory.page_ck@dimm;
event error.memory.page_ue@dimm;

prop fault.memory.page@dimm (1)->
    error.memory.page_sb@dimm,
    error.memory.page_ck@dimm,
    error.memory.page_ue@dimm;

/* Single-bit page engine and the upset that feeds it. */
event ereport.memory.page_sb_trip@dimm;
engine serd.memory.page_sb@dimm,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME, method=persistent,
    trip=ereport.memory.page_sb_trip@dimm;
event upset.memory.page_sb@dimm, engine=serd.memory.page_sb@dimm;

/* Multi-bit ChipKill page engine and the upset that feeds it. */
event ereport.memory.page_ck_trip@dimm;
engine serd.memory.page_ck@dimm,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME, method=persistent,
    trip=ereport.memory.page_ck_trip@dimm;
event upset.memory.page_ck@dimm, engine=serd.memory.page_ck@dimm;

prop upset.memory.page_sb@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && SINGLE_BIT_CE };

prop upset.memory.page_ck@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE };

prop error.memory.page_sb@dimm (1)->
    ereport.memory.page_sb_trip@dimm;

prop error.memory.page_ck@dimm (1)->
    ereport.memory.page_ck_trip@dimm;

prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.nb.mem_ce@cpu {
	CONTAINS_DIMM && GET_ADDR && GET_OFFSET };

/* mem_ce ereports without a resource member are discarded. */
prop upset.memory.discard@cpu (1)->
    ereport.cpu.amd.nb.mem_ce@cpu { !RESOURCE_EXISTS };

/* #DIMM_SB#
 * Single-bit DIMM faults are diagnosed when the number of page faults
 * (of all types since they all are counted in a single per-DIMM stat engine)
 * reaches a threshold. Since our tolerance of ChipKill and UE faults
 * is much lower than that for single-bit errors the threshold will only be
 * reached for repeated single-bit page faults. We do not stop diagnosing
 * further single-bit page faults once we have declared a single-bit DIMM
 * fault - we continue diagnosing them and response agents can continue to
 * retire those pages up to the system-imposed retirement limit.
 *
 * We maintain a parallel SERD engine to the page_sb engine which trips
 * in unison, but on trip it generates a distinct ereport which we
 * diagnose to a dimm_sb fault if the threshold has been reached, or
 * to a throwaway upset if not.
 */

#define	DIMM_SB_FIT		2000
#define	DIMM_SB_THRESH		128

event fault.memory.dimm_sb@dimm, FITrate=DIMM_SB_FIT, FRU=dimm, ASRU=dimm;

event ereport.memory.dimm_sb_trip@dimm;
event upset.memory.discard@dimm;
engine serd.memory.dimm_sb@dimm, N=PAGE_SB_COUNT, T=PAGE_SB_TIME,
    method=persistent, trip=ereport.memory.dimm_sb_trip@dimm;
event upset.memory.dimm_sb@dimm, engine=serd.memory.dimm_sb@dimm;

prop upset.memory.dimm_sb@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM };	/* sb and ck */

/* Throwaway diagnosis for a trip that arrives below the threshold. */
prop upset.memory.discard@dimm (1)->
    ereport.memory.dimm_sb_trip@dimm;

/* The DIMM fault is only possible once enough pages have been faulted. */
prop fault.memory.dimm_sb@dimm (0)->
    ereport.memory.dimm_sb_trip@dimm {
	count(stat.page_fault@dimm) >= DIMM_SB_THRESH };

/* #DIMM_CK#
 * ChipKill-correctable multi-bit faults indicate a likely failing SDRAM
 * part. We will SERD them but with a very low/zero tolerance.
 */

#define	DIMM_CK_FIT		4000
#define	DIMM_CK_COUNT		0
#define	DIMM_CK_TIME		1h

event fault.memory.dimm_ck@dimm, FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm;

event ereport.memory.dimm_ck_trip@dimm;
engine serd.memory.dimm_ck@dimm, N=DIMM_CK_COUNT, T=DIMM_CK_TIME,
    method=persistent, trip=ereport.memory.dimm_ck_trip@dimm;
event upset.memory.dimm_ck@dimm, engine=serd.memory.dimm_ck@dimm;

prop upset.memory.dimm_ck@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE };

prop fault.memory.dimm_ck@dimm (1)->
    ereport.memory.dimm_ck_trip@dimm;

prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE &&
    GET_ADDR && GET_OFFSET };

/* #DIMM_UE#
 * A multi-bit fault in a memory dimm can cause:
 *
 *  - ue    : reported by nb for an access from a remote cpu
 *
 * Note we use a SERD engine here simply as a way of ensuring that we get
 * both dimm and page faults reported.
 */

#define	DIMM_UE_FIT		6000

event ereport.cpu.amd.nb.mem_ue@cpu;
event ereport.memory.page_ue_trip@dimm;
event ereport.memory.dimm_ue_trip@dimm;
event fault.memory.dimm_ue@dimm, FITrate=DIMM_UE_FIT, FRU=dimm, ASRU=dimm;
event upset.memory.page_ue@dimm, engine=serd.memory.page_ue@dimm;
event upset.memory.dimm_ue@dimm, engine=serd.memory.dimm_ue@dimm;

engine serd.memory.dimm_ue@dimm, N=0, T=1h,
    method=persistent, trip=ereport.memory.dimm_ue_trip@dimm;

engine serd.memory.page_ue@dimm, N=0, T=1h,
    method=persistent, trip=ereport.memory.page_ue_trip@dimm;

prop upset.memory.page_ue@dimm (0)->
    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM };

prop upset.memory.dimm_ue@dimm (0)->
    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM };

prop error.memory.page_ue@dimm (1)->
    ereport.memory.page_ue_trip@dimm;

/*
 * The constraint must join GET_ADDR and GET_OFFSET with logical &&, as
 * the mem_ce page-fault propagations do. A bitwise & would AND the two
 * values together and can produce 0 even when both are non-zero, which
 * would wrongly suppress this propagation.
 */
prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM && GET_ADDR && GET_OFFSET };

prop fault.memory.dimm_ue@dimm (1)->
    ereport.memory.dimm_ue_trip@dimm;

/*
 * A mem_ue ereport with no resource member (address not valid) cannot be
 * attributed to a dimm, so discard it in the same way as a resource-less
 * mem_ce ereport.
 */
prop upset.memory.discard@cpu (1)->
    ereport.cpu.amd.nb.mem_ue@cpu { !RESOURCE_EXISTS };

/* #L2D#
 * l2 cache data errors.
 */

#define	L2CACHEDATA_FIT		1000
#define	L2CACHEDATA_SB_COUNT	3
#define	L2CACHEDATA_SB_TIME	12h

event fault.cpu.amd.l2cachedata@chip/cpu, FITrate=L2CACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.l2cachedata_sb@chip/cpu;
event error.cpu.amd.l2cachedata_mb@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (1)->
    error.cpu.amd.l2cachedata_sb@chip/cpu,
    error.cpu.amd.l2cachedata_mb@chip/cpu;

/* #L2D_SINGLE#
 * A single bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_ecc1 : reported by ic on this cpu
 *  - inf_l2_ecc1 : reported by dc on this cpu
 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
 *
 * Single-bit errors are diagnosed to cache upsets. SERD engines are used
 * to count upsets resulting from CEs.
 */

event ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2d_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.l2d_sb_trip@chip/cpu;

engine serd.cpu.amd.l2d_sb@chip/cpu,
    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2d_sb_trip@chip/cpu;

event upset.cpu.amd.l2d_sb@chip/cpu,
    engine=serd.cpu.amd.l2d_sb@chip/cpu;

prop upset.cpu.amd.l2d_sb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

prop error.cpu.amd.l2cachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.l2d_sb_trip@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

/* #L2D_MULTI#
 * A multi-bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_eccm : reported by ic on this cpu
 *  - inf_l2_eccm : reported by dc on this cpu
 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
 */

event ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu;
event ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu;
event ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

prop error.cpu.amd.l2cachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

/* #L2T#
 * l2 cache main tag errors
 */

#define	L2CACHETAG_FIT		1000
#define	L2CACHETAG_SB_COUNT	3
#define	L2CACHETAG_SB_TIME	12h

event fault.cpu.amd.l2cachetag@chip/cpu, FITrate=L2CACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.l2cachetag_sb@chip/cpu;
event error.cpu.amd.l2cachetag_mb@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (1)->
    error.cpu.amd.l2cachetag_sb@chip/cpu,
    error.cpu.amd.l2cachetag_mb@chip/cpu;

/* #L2T_SINGLE#
 * A single bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 *
 * Note that the bu.l2t_par ereport could be due to a single bit or multi bit
 * event. If the l2t_sb_trip has already triggered it will be treated as another
 * ce, otherwise it will be treated as a ue event.
 */

event ereport.cpu.amd.bu.l2t_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/cpu;
event ereport.cpu.amd.l2t_sb_trip@chip/cpu;

engine serd.cpu.amd.l2t_sb@chip/cpu,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2t_sb_trip@chip/cpu;

event upset.cpu.amd.l2t_sb@chip/cpu,
    engine=serd.cpu.amd.l2t_sb@chip/cpu;

prop upset.cpu.amd.l2t_sb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop error.cpu.amd.l2cachetag_sb@chip/cpu (1)->
    ereport.cpu.amd.l2t_sb_trip@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* #L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/cpu;

prop error.cpu.amd.l2cachetag_mb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* #ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 */

#define	ICACHEDATA_FIT		1000
#define	ICACHEDATA_SB_COUNT	2
#define	ICACHEDATA_SB_TIME	168h

event ereport.cpu.amd.ic.data_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_dp_trip@chip/cpu;

event fault.cpu.amd.icachedata@chip/cpu, FITrate=ICACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachedata@chip/cpu,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_dp_trip@chip/cpu;

event upset.cpu.amd.icachedata@chip/cpu,
    engine=serd.cpu.amd.icachedata@chip/cpu;

prop upset.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic_dp_trip@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

/* #ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 */

#define	ICACHETAG_FIT		1000
#define	ICACHETAG_SB_COUNT	2
#define	ICACHETAG_SB_TIME	168h

event ereport.cpu.amd.ic.tag_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_tp_trip@chip/cpu;

event fault.cpu.amd.icachetag@chip/cpu, FITrate=ICACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachetag@chip/cpu,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_tp_trip@chip/cpu;

event upset.cpu.amd.icachetag@chip/cpu,
    engine=serd.cpu.amd.icachetag@chip/cpu;

prop upset.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic_tp_trip@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (0)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

/* #ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 */

#define	ICACHESTAG_FIT		1000

event ereport.cpu.amd.ic.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.icachestag@chip/cpu, FITrate=ICACHESTAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.icachestag@chip/cpu (1)->
    ereport.cpu.amd.ic.stag_par@chip/cpu;

/* #ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 */

#define	ICACHEL1TLB_FIT		1000
#define	ICACHEL1TLB_SB_COUNT	2
#define	ICACHEL1TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l1tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event fault.cpu.amd.l1itlb@chip/cpu, FITrate=ICACHEL1TLB_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l1itlb@chip/cpu,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event upset.cpu.amd.l1itlb@chip/cpu,
    engine=serd.cpu.amd.l1itlb@chip/cpu;

prop upset.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

/* #ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 */

#define	ICACHEL2TLB_FIT		1000
#define	ICACHEL2TLB_SB_COUNT	2
#define	ICACHEL2TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l2tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event fault.cpu.amd.l2itlb@chip/cpu, FITrate=ICACHEL2TLB_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l2itlb@chip/cpu,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event upset.cpu.amd.l2itlb@chip/cpu,
    engine=serd.cpu.amd.l2itlb@chip/cpu;

prop upset.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

/* #DCD#
 * dcache data errors
 */

#define	DCACHEDATA_FIT		1000
#define	DCACHEDATA_SB_COUNT	2
#define	DCACHEDATA_SB_TIME	168h

event fault.cpu.amd.dcachedata@chip/cpu, FITrate=DCACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.dcachedata_sb@chip/cpu;
event error.cpu.amd.dcachedata_mb@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (1)->
    error.cpu.amd.dcachedata_sb@chip/cpu,
    error.cpu.amd.dcachedata_mb@chip/cpu;

/* #DCD_SINGLE#
 * A single bit data array fault in a D cache can cause:
 *
 *  - data_ecc1 : reported by dc on this cpu by scrubber
 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic
 */

event ereport.cpu.amd.dc.data_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu{within(5s)};
event ereport.cpu.amd.dc_sb_trip@chip/cpu;

engine serd.cpu.amd.dc_sb@chip/cpu,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

/*
 * N=0 engine for data_ecc1_uc. It shares the dc_sb trip ereport, so it
 * leads to the same dcachedata_sb error as the counted engine above.
 * The interval is written "1h" to match every other engine in this file.
 */
engine serd.cpu.amd.dc_sb_uc@chip/cpu,
    N=0, T=1h, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

event upset.cpu.amd.dc_sb@chip/cpu,
    engine=serd.cpu.amd.dc_sb@chip/cpu;

event upset.cpu.amd.dc_sb_uc@chip/cpu,
    engine=serd.cpu.amd.dc_sb_uc@chip/cpu;

prop upset.cpu.amd.dc_sb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu;

prop upset.cpu.amd.dc_sb_uc@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

prop error.cpu.amd.dcachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.dc_sb_trip@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

/* #DCD_MULTI#
 * A multi-bit data array fault in a D cache can cause:
 *
 *  - data_eccm : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop error.cpu.amd.dcachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

/* #DCT_PAR#
 * A tag array parity fault in a D cache can cause:
 *
 *  - tag_par : reported by dc on this cpu
 */

#define	DCACHETAG_FIT		1000

event ereport.cpu.amd.dc.tag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachetag@chip/cpu, FITrate=DCACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachetag@chip/cpu (1)->
    ereport.cpu.amd.dc.tag_par@chip/cpu;

/* #DCT_SNOOP#
 * A snoop tag array parity fault in a D cache can cause:
 *
 *  - stag_par : reported by dc on this cpu
 */

#define	DCACHESTAG_FIT		1000

event ereport.cpu.amd.dc.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachestag@chip/cpu, FITrate=DCACHESTAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachestag@chip/cpu (1)->
    ereport.cpu.amd.dc.stag_par@chip/cpu;

/* #DCTLB_1#
 * An l1tlb parity fault in a D cache can cause:
 *
 *  - l1tlb_par : reported by dc on this cpu
 */

#define	L1DTLB_FIT		1000

event ereport.cpu.amd.dc.l1tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l1dtlb@chip/cpu, FITrate=L1DTLB_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l1dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l1tlb_par@chip/cpu;

/* #DCTLB_2#
 * An l2tlb parity fault in a D cache can cause:
 *
 *  - l2tlb_par : reported by dc on this cpu
 */

#define	L2DTLB_FIT		1000

event ereport.cpu.amd.dc.l2tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l2dtlb@chip/cpu, FITrate=L2DTLB_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l2dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l2tlb_par@chip/cpu;

/* #DPATH_SB#
 * Datapath errors between NB/MC and core.
 */

#define	CPU_DP_FIT		1000

event fault.cpu.amd.datapath@chip/cpu, FITrate=CPU_DP_FIT, FRU=chip,
    ASRU=chip/cpu;
event error.cpu.amd.datapath_sb@chip/cpu;
event error.cpu.amd.datapath_mb@chip/cpu;

prop fault.cpu.amd.datapath@chip/cpu (1)->
    error.cpu.amd.datapath_sb@chip/cpu,
    error.cpu.amd.datapath_mb@chip/cpu;

/*
 * A single bit fault in the datapath between the NB and requesting core
 * can cause:
 *
 *  - inf_sys_ecc1 : reported by ic on access from a local cpu
 *  - inf_sys_ecc1 : reported by dc on access from a local cpu
 *  - s_ecc1 : reported by bu on access from a local cpu (hw prefetch etc)
 */

#define	CPU_DP_COUNT	3
#define	CPU_DP_TIME	12h

event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)};
event upset.cpu.dp_sb@chip/cpu, engine=serd.cpu.dp_sb@chip/cpu;
event ereport.cpu.amd.dp_sb_trip@chip/cpu;

engine serd.cpu.dp_sb@chip/cpu, N=CPU_DP_COUNT, T=CPU_DP_TIME,
    method=persistent, trip=ereport.cpu.amd.dp_sb_trip@chip/cpu;

prop upset.cpu.dp_sb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu;

prop error.cpu.amd.datapath_sb@chip/cpu (1)->
    ereport.cpu.amd.dp_sb_trip@chip/cpu;

prop fault.cpu.amd.datapath@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu;

/* #DPATH_MB#
 * A multi-bit fault in the datapath between the NB and requesting core
 * can cause:
 *
 *  - inf_sys_eccm : reported by ic on access from a local cpu
 *  - inf_sys_eccm : reported by dc on access from a local cpu
 *  - s_eccm : reported by bu on access from a local cpu (hw prefetch etc)
 */

event ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu;
event ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu;
event ereport.cpu.amd.bu.s_eccm@chip/cpu;

prop error.cpu.amd.datapath_mb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu;

prop fault.cpu.amd.datapath@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu;

/*
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do. These fall into a few categories:
 *
 *  - the corresponding detector is not enabled, typically because
 *    detection/handling of the event is taking place elsewhere
 *    (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *  - the event is associated with a sync flood so even if the detector is
 *    enabled we will never handle the event and generate an ereport *and*
 *    even if the ereport did arrive we could perform no useful diagnosis
 *    e.g., the NB can be configured for sync flood on nb.mem_eccm
 *    but we don't choose to discard that ereport here since we could have
 *    made a useful diagnosis from it had it been delivered
 *    (nb.ht_sync, nb.ht_crc)
 *  - events that will be accompanied by an immediate panic and
 *    delivery of the ereport during subsequent reboot but from
 *    which no useful diagnosis can be made. (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection. We will perform a null diagnosis of all these ereports in order
 * to avoid "no subscription" complaints during test harness runs.
 */

event ereport.cpu.amd.nb.ma@cpu;
event ereport.cpu.amd.nb.ta@cpu;
event ereport.cpu.amd.ls.s_rde@cpu;
event ereport.cpu.amd.ic.rdde@cpu;
event ereport.cpu.amd.bu.s_rde@cpu;
event ereport.cpu.amd.nb.gart_walk@cpu;
event ereport.cpu.amd.nb.ht_sync@cpu;
event ereport.cpu.amd.nb.ht_crc@cpu;
event ereport.cpu.amd.nb.rmw@cpu;
event ereport.cpu.amd.nb.wdog@cpu;
event ereport.cpu.amd.unknown@cpu;

event upset.null_diag@cpu;

prop upset.null_diag@cpu (1)->
    ereport.cpu.amd.nb.ma@cpu,
    ereport.cpu.amd.nb.ta@cpu,
    ereport.cpu.amd.ls.s_rde@cpu,
    ereport.cpu.amd.ic.rdde@cpu,
    ereport.cpu.amd.bu.s_rde@cpu,
    ereport.cpu.amd.nb.gart_walk@cpu,
    ereport.cpu.amd.nb.ht_sync@cpu,
    ereport.cpu.amd.nb.ht_crc@cpu,
    ereport.cpu.amd.nb.rmw@cpu,
    ereport.cpu.amd.nb.wdog@cpu,
    ereport.cpu.amd.unknown@cpu;