/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

fru dimm;
asru dimm;

fru chip;
asru chip/cpu;

/* #MEM#
 * GET_ADDR relies on the fact that variables have global scope across an FME.
 * Thus, for each FME, the assignment occurs only on the first invocation,
 * while the comparison happens on every subsequent one.  If the new address
 * matches the address of an existing open FME, then we return true running
 * in the context of that FME.  If the new address doesn't match the address
 * of any existing open FME, then we return true in the context of a newly
 * opened FME.
 */
#define	GET_ADDR (defined($addr) ? ($addr == payloadprop("addr")) : \
	($addr = payloadprop("addr")))

#define	GET_OFFSET ($offset = payloadprop("resource[0].hc-specific.offset"))
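/*
 * An illustrative walk-through of GET_ADDR (the addresses are hypothetical,
 * for the sake of the example): a first nb.mem_ce arrives with addr=0x1000;
 * no FME has $addr defined, so GET_ADDR takes the assignment branch and the
 * newly opened FME records $addr = 0x1000.  A second mem_ce with addr=0x1000
 * takes the comparison branch in that FME and matches, so it is diagnosed
 * there.  A third with addr=0x2000 fails the comparison in the open FME and
 * instead assigns $addr = 0x2000 in the context of a new FME.
 */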
/*
 * SET_ADDR is used to set a payload value in the fault that we diagnose
 * for page faults, to record the physical address of the faulting page.
 */
#define	SET_ADDR (setpayloadprop("asru-physaddr", $addr))

#define	SET_OFFSET (setpayloadprop("asru-offset", $offset))

/*
 * RESOURCE_EXISTS is true if a pair with name "resource" exists in the
 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
 */
#define	RESOURCE_EXISTS (payloadprop_defined("resource"))

/*
 * CONTAINS_DIMM is true if the "resource" nvlist array (as used in memory
 * ereports) exists and one of its members matches the path for the
 * dimm node.  Our memory propagations are of the form "foo@dimm -> blah@cpu"
 * since cpus detect memory errors; in eversholt such a propagation, where
 * the lhs path and rhs path do not match, expands to the cross-product of
 * all dimms and cpus in the system.  We use CONTAINS_DIMM to constrain
 * the propagation so that it only happens if the payload resource
 * matches the dimm.
 */
#define	CONTAINS_DIMM (payloadprop_contains("resource", asru(dimm)))

/*
 * The following will tell us whether a syndrome that is known to be
 * correctable (from a mem_ecc1) is single-bit or multi-bit.  For a
 * correctable ChipKill syndrome the number of bits set in the lowest
 * nibble indicates how many bits were in error.
 */

#define	CBITMASK(synd) ((synd) & 0xf)

#define	CKSINGLE(synd) \
	((synd) == 0 || \
	(CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 || \
	CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))

#define	SINGLE_BIT_CE \
	(payloadprop("syndrome-type") == "E" || \
	(payloadprop("syndrome-type") == "C" && \
	CKSINGLE(payloadprop("syndrome"))))

#define	MULTI_BIT_CE \
	(payloadprop("syndrome-type") == "C" && \
	!CKSINGLE(payloadprop("syndrome")))
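/*
 * A worked example with hypothetical syndrome values: a "C"-type (ChipKill)
 * syndrome of 0x8 has exactly one bit set in its low nibble
 * (CBITMASK(0x8) == 0x8), so CKSINGLE is true and the event classifies as
 * SINGLE_BIT_CE.  A syndrome of 0x3 has two bits set in its low nibble
 * (CBITMASK(0x3) == 0x3, which is none of 0x1/0x2/0x4/0x8), so CKSINGLE is
 * false and the same "C"-type event classifies as MULTI_BIT_CE.
 */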
/*
 * A single-bit fault in a memory dimm can cause:
 *
 * - mem_ce : reported by nb for an access from a remote cpu
 *
 * Single-bit errors are fed into a per-DIMM SERD engine; if a SERD engine
 * trips we diagnose a fault.memory.page so that the response agent can
 * retire the page that caused the trip.  If the total number of pages
 * faulted in this way on a single DIMM exceeds a threshold we will
 * diagnose a fault.memory.dimm_sb against the DIMM.
 *
 * Multi-bit ChipKill-correctable errors produce an immediate page fault.
 * This is achieved through SERD engines using N=0, so the facility is there
 * to be a little more tolerant of these errors in the future.
 *
 * Uncorrectable errors produce an immediate page fault and a corresponding
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is only required when
 * they are accompanied by a dimm fault.  As such we include message=0
 * on page faults.
 */

event ereport.cpu.amd.nb.mem_ce@cpu;

/*
 * If the address is not valid then no resource member will be included
 * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare.
 * We will discard such ereports.  An alternative may be to SERD them
 * on a per-MC basis and trip if we see too many such events.
 */

event upset.memory.discard@cpu;

/* #PAGE#
 * Page faults of all types diagnose to a single fault class and are
 * counted with a stat.
 *
 * Single-bit errors are diagnosed as upsets and feed into per-DIMM
 * SERD engines which diagnose fault.memory.page if they trip.
 */

#define	PAGE_FIT		1
#define	PAGE_SB_COUNT		2
#define	PAGE_SB_TIME		72h
#define	PAGE_CK_COUNT		0
#define	PAGE_CK_TIME		1h

engine stat.page_fault@dimm;
event fault.memory.page@dimm, FITrate=PAGE_FIT,
    ASRU=dimm, message=0, count=stat.page_fault@dimm,
    action=confcall("rewrite-ASRU");
event error.memory.page_sb@dimm;
event error.memory.page_ck@dimm;
event error.memory.page_ue@dimm;

prop fault.memory.page@dimm (1)->
    error.memory.page_sb@dimm,
    error.memory.page_ck@dimm,
    error.memory.page_ue@dimm;

event ereport.memory.page_sb_trip@dimm;
engine serd.memory.page_sb@dimm, N=PAGE_SB_COUNT, T=PAGE_SB_TIME,
    method=persistent, trip=ereport.memory.page_sb_trip@dimm;
event upset.memory.page_sb@dimm, engine=serd.memory.page_sb@dimm;

event ereport.memory.page_ck_trip@dimm;
engine serd.memory.page_ck@dimm, N=PAGE_CK_COUNT, T=PAGE_CK_TIME,
    method=persistent, trip=ereport.memory.page_ck_trip@dimm;
event upset.memory.page_ck@dimm, engine=serd.memory.page_ck@dimm;

prop upset.memory.page_sb@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && SINGLE_BIT_CE };

prop upset.memory.page_ck@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE };

prop error.memory.page_sb@dimm (1)->
    ereport.memory.page_sb_trip@dimm;

prop error.memory.page_ck@dimm (1)->
    ereport.memory.page_ck_trip@dimm;

prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && GET_ADDR && GET_OFFSET };

prop upset.memory.discard@cpu (1)->
    ereport.cpu.amd.nb.mem_ce@cpu { !RESOURCE_EXISTS };

/* #DIMM_SB#
 * Single-bit DIMM faults are diagnosed when the number of page faults
 * (of all types, since they are all counted in a single per-DIMM stat
 * engine) reaches a threshold.  Since our tolerance of ChipKill and UE
 * faults is much lower than that for single-bit errors, the threshold will
 * only be reached for repeated single-bit page faults.  We do not stop
 * diagnosing further single-bit page faults once we have declared a
 * single-bit DIMM fault - we continue diagnosing them, and response agents
 * can continue to retire those pages up to the system-imposed retirement
 * limit.
 *
 * We maintain a SERD engine parallel to the page_sb engine which trips
 * in unison, but on trip it generates a distinct ereport which we
 * diagnose to a dimm_sb fault if the threshold has been reached, or
 * to a throwaway upset if not.
 */

#define	DIMM_SB_FIT		2000
#define	DIMM_SB_THRESH		128

event fault.memory.dimm_sb@dimm, FITrate=DIMM_SB_FIT, FRU=dimm, ASRU=dimm,
    action=confcall("rewrite-ASRU");

event ereport.memory.dimm_sb_trip@dimm;
event upset.memory.discard@dimm;
engine serd.memory.dimm_sb@dimm, N=PAGE_SB_COUNT, T=PAGE_SB_TIME,
    method=persistent, trip=ereport.memory.dimm_sb_trip@dimm;
event upset.memory.dimm_sb@dimm, engine=serd.memory.dimm_sb@dimm;

prop upset.memory.dimm_sb@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM };	/* sb and ck */

prop upset.memory.discard@dimm (1)->
    ereport.memory.dimm_sb_trip@dimm;

prop fault.memory.dimm_sb@dimm (0)->
    ereport.memory.dimm_sb_trip@dimm {
    count(stat.page_fault@dimm) >= DIMM_SB_THRESH };
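/*
 * A sketch of the flow above, using the values defined in this file: each
 * trip of serd.memory.page_sb (its threshold of N=PAGE_SB_COUNT events
 * within T=PAGE_SB_TIME exceeded) produces a fault.memory.page and bumps
 * stat.page_fault for that dimm.  The parallel serd.memory.dimm_sb engine
 * trips in unison, but its dimm_sb_trip ereport diagnoses to
 * fault.memory.dimm_sb only once count(stat.page_fault@dimm) has reached
 * DIMM_SB_THRESH (128); before that it diagnoses to the throwaway
 * upset.memory.discard@dimm.
 */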
/* #DIMM_CK#
 * ChipKill-correctable multi-bit errors produce immediate page faults.
 * If the fault is indeed isolated to just a few cells then we have contained
 * the error; if not, say if the SDRAM device is failing, then we will hit a
 * number of other similar errors in a short space of time.  Thus we SERD
 * these in diagnosing a fault.memory.dimm_ck rather than simply faulting
 * the DIMM at the first instance.
 */

#define	DIMM_CK_FIT		4000
#define	DIMM_CK_COUNT		2
#define	DIMM_CK_TIME		72h

event fault.memory.dimm_ck@dimm, FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm,
    action=confcall("rewrite-ASRU");

event ereport.memory.dimm_ck_trip@dimm;
engine serd.memory.dimm_ck@dimm, N=DIMM_CK_COUNT, T=DIMM_CK_TIME,
    method=persistent, trip=ereport.memory.dimm_ck_trip@dimm;
event upset.memory.dimm_ck@dimm, engine=serd.memory.dimm_ck@dimm;

prop upset.memory.dimm_ck@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE };

prop fault.memory.dimm_ck@dimm (1)->
    ereport.memory.dimm_ck_trip@dimm;

prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE &&
    GET_ADDR && GET_OFFSET };

/* #DIMM_UE#
 * A multi-bit fault in a memory dimm can cause:
 *
 * - ue : reported by nb for an access from a remote cpu
 *
 * Note that we use SERD engines here simply as a way of ensuring that we
 * get both dimm and page faults reported.
 */

#define	DIMM_UE_FIT		6000

event ereport.cpu.amd.nb.mem_ue@cpu;
event ereport.memory.page_ue_trip@dimm;
event ereport.memory.dimm_ue_trip@dimm;
event fault.memory.dimm_ue@dimm, FITrate=DIMM_UE_FIT, FRU=dimm, ASRU=dimm,
    action=confcall("rewrite-ASRU");
event upset.memory.page_ue@dimm, engine=serd.memory.page_ue@dimm;
event upset.memory.dimm_ue@dimm, engine=serd.memory.dimm_ue@dimm;

engine serd.memory.dimm_ue@dimm, N=0, T=1h,
    method=persistent, trip=ereport.memory.dimm_ue_trip@dimm;

engine serd.memory.page_ue@dimm, N=0, T=1h,
    method=persistent, trip=ereport.memory.page_ue_trip@dimm;

prop upset.memory.page_ue@dimm (0)->
    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM };

prop upset.memory.dimm_ue@dimm (0)->
    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM };

prop error.memory.page_ue@dimm (1)->
    ereport.memory.page_ue_trip@dimm;

prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM && GET_ADDR && GET_OFFSET };

prop fault.memory.dimm_ue@dimm (1)->
    ereport.memory.dimm_ue_trip@dimm;

prop upset.memory.discard@cpu (1)->
    ereport.cpu.amd.nb.mem_ue@cpu { !RESOURCE_EXISTS };
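/*
 * To sketch the fan-out this buys us: a single nb.mem_ue ereport feeds both
 * upset.memory.page_ue and upset.memory.dimm_ue.  Since both engines use
 * N=0, each trips on the first event, so one UE yields two trip ereports
 * and hence both a fault.memory.page (via error.memory.page_ue) and a
 * fault.memory.dimm_ue within the same FME.
 */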
/* #L2D#
 * l2 cache data errors.
 */

#define	L2CACHEDATA_FIT		1000
#define	L2CACHEDATA_SB_COUNT	3
#define	L2CACHEDATA_SB_TIME	12h

event fault.cpu.amd.l2cachedata@chip/cpu, FITrate=L2CACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.l2cachedata_sb@chip/cpu;
event error.cpu.amd.l2cachedata_mb@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (1)->
    error.cpu.amd.l2cachedata_sb@chip/cpu,
    error.cpu.amd.l2cachedata_mb@chip/cpu;

/* #L2D_SINGLE#
 * A single-bit data array fault in an l2 cache can cause:
 *
 * - inf_l2_ecc1 : reported by ic on this cpu
 * - inf_l2_ecc1 : reported by dc on this cpu
 * - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
 *
 * Single-bit errors are diagnosed to cache upsets.  SERD engines are used
 * to count upsets resulting from CEs.
 */

event ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2d_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.l2d_sb_trip@chip/cpu;

engine serd.cpu.amd.l2d_sb@chip/cpu,
    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2d_sb_trip@chip/cpu;

event upset.cpu.amd.l2d_sb@chip/cpu,
    engine=serd.cpu.amd.l2d_sb@chip/cpu;

prop upset.cpu.amd.l2d_sb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

prop error.cpu.amd.l2cachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.l2d_sb_trip@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

/* #L2D_MULTI#
 * A multi-bit data array fault in an l2 cache can cause:
 *
 * - inf_l2_eccm : reported by ic on this cpu
 * - inf_l2_eccm : reported by dc on this cpu
 * - l2d_eccm : reported by bu on copyback or on snoop from another cpu
 */

event ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu;
event ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu;
event ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

prop error.cpu.amd.l2cachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

/* #L2T#
 * l2 cache main tag errors.
 */

#define	L2CACHETAG_FIT		1000
#define	L2CACHETAG_SB_COUNT	3
#define	L2CACHETAG_SB_TIME	12h

event fault.cpu.amd.l2cachetag@chip/cpu, FITrate=L2CACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.l2cachetag_sb@chip/cpu;
event error.cpu.amd.l2cachetag_mb@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (1)->
    error.cpu.amd.l2cachetag_sb@chip/cpu,
    error.cpu.amd.l2cachetag_mb@chip/cpu;

/* #L2T_SINGLE#
 * A single-bit tag array fault in an l2 cache can cause:
 *
 * - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 * - l2t_par : reported by bu on this cpu when detected other than during
 *   snoop
 *
 * Note that the bu.l2t_par ereport could be due to a single-bit or
 * multi-bit event.  If the l2t_sb_trip has already triggered it will be
 * treated as another CE, otherwise it will be treated as a UE event.
 */
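/*
 * To sketch how that dual treatment falls out of the props below: bu.l2t_par
 * appears both in the upset.cpu.amd.l2t_sb propagation (where, like
 * l2t_ecc1, it feeds the serd.cpu.amd.l2t_sb engine) and in the
 * error.cpu.amd.l2cachetag_mb propagation, so the same ereport can be
 * explained either as one more correctable event or as part of a multi-bit
 * tag fault, depending on which hypothesis the FME accepts.
 */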
event ereport.cpu.amd.bu.l2t_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/cpu;
event ereport.cpu.amd.l2t_sb_trip@chip/cpu;

engine serd.cpu.amd.l2t_sb@chip/cpu,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2t_sb_trip@chip/cpu;

event upset.cpu.amd.l2t_sb@chip/cpu,
    engine=serd.cpu.amd.l2t_sb@chip/cpu;

prop upset.cpu.amd.l2t_sb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop error.cpu.amd.l2cachetag_sb@chip/cpu (1)->
    ereport.cpu.amd.l2t_sb_trip@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* #L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 * - l2t_eccm : reported by bu on this cpu when detected during snoop
 * - l2t_par : reported by bu on this cpu when detected other than during
 *   snoop
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/cpu;

prop error.cpu.amd.l2cachetag_mb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* #ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 * - data_par : reported by ic on this cpu
 */

#define	ICACHEDATA_FIT		1000
#define	ICACHEDATA_SB_COUNT	2
#define	ICACHEDATA_SB_TIME	168h

event ereport.cpu.amd.ic.data_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_dp_trip@chip/cpu;

event fault.cpu.amd.icachedata@chip/cpu, FITrate=ICACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachedata@chip/cpu,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_dp_trip@chip/cpu;

event upset.cpu.amd.icachedata@chip/cpu,
    engine=serd.cpu.amd.icachedata@chip/cpu;

prop upset.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic_dp_trip@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

/* #ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 * - tag_par : reported by ic on this cpu
 */

#define	ICACHETAG_FIT		1000
#define	ICACHETAG_SB_COUNT	2
#define	ICACHETAG_SB_TIME	168h

event ereport.cpu.amd.ic.tag_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_tp_trip@chip/cpu;

event fault.cpu.amd.icachetag@chip/cpu, FITrate=ICACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachetag@chip/cpu,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_tp_trip@chip/cpu;

event upset.cpu.amd.icachetag@chip/cpu,
    engine=serd.cpu.amd.icachetag@chip/cpu;

prop upset.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic_tp_trip@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (0)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

/* #ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 * - stag_par : reported by ic on this cpu
 */

#define	ICACHESTAG_FIT		1000

event ereport.cpu.amd.ic.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.icachestag@chip/cpu, FITrate=ICACHESTAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.icachestag@chip/cpu (1)->
    ereport.cpu.amd.ic.stag_par@chip/cpu;

/* #ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 * - l1tlb_par : reported by ic on this cpu
 */

#define	ICACHEL1TLB_FIT		1000
#define	ICACHEL1TLB_SB_COUNT	2
#define	ICACHEL1TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l1tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event fault.cpu.amd.l1itlb@chip/cpu, FITrate=ICACHEL1TLB_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l1itlb@chip/cpu,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event upset.cpu.amd.l1itlb@chip/cpu,
    engine=serd.cpu.amd.l1itlb@chip/cpu;

prop upset.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

/* #ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 * - l2tlb_par : reported by ic on this cpu
 */

#define	ICACHEL2TLB_FIT		1000
#define	ICACHEL2TLB_SB_COUNT	2
#define	ICACHEL2TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l2tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event fault.cpu.amd.l2itlb@chip/cpu, FITrate=ICACHEL2TLB_FIT,
    FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l2itlb@chip/cpu,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event upset.cpu.amd.l2itlb@chip/cpu,
    engine=serd.cpu.amd.l2itlb@chip/cpu;

prop upset.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

/* #DCD#
 * dcache data errors.
 */

#define	DCACHEDATA_FIT		1000
#define	DCACHEDATA_SB_COUNT	2
#define	DCACHEDATA_SB_TIME	168h

event fault.cpu.amd.dcachedata@chip/cpu, FITrate=DCACHEDATA_FIT,
    FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.dcachedata_sb@chip/cpu;
event error.cpu.amd.dcachedata_mb@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (1)->
    error.cpu.amd.dcachedata_sb@chip/cpu,
    error.cpu.amd.dcachedata_mb@chip/cpu;

/* #DCD_SINGLE#
 * A single-bit data array fault in a D cache can cause:
 *
 * - data_ecc1 : reported by dc on this cpu by scrubber
 * - data_ecc1_uc : reported by dc on this cpu other than by scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic.
 */
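/*
 * A sketch of how "fault immediately" is expressed below: dc.data_ecc1
 * feeds serd.cpu.amd.dc_sb (N=DCACHEDATA_SB_COUNT, tolerating a couple of
 * scrubber CEs per 168h window), while dc.data_ecc1_uc feeds the separate
 * serd.cpu.amd.dc_sb_uc engine whose N=0 makes it trip on its first event.
 * Both engines trip to the same dc_sb_trip ereport, so either path
 * diagnoses the same fault.cpu.amd.dcachedata.
 */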
event ereport.cpu.amd.dc.data_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu{within(5s)};
event ereport.cpu.amd.dc_sb_trip@chip/cpu;

engine serd.cpu.amd.dc_sb@chip/cpu,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

engine serd.cpu.amd.dc_sb_uc@chip/cpu,
    N=0, T=1hr, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

event upset.cpu.amd.dc_sb@chip/cpu,
    engine=serd.cpu.amd.dc_sb@chip/cpu;

event upset.cpu.amd.dc_sb_uc@chip/cpu,
    engine=serd.cpu.amd.dc_sb_uc@chip/cpu;

prop upset.cpu.amd.dc_sb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu;

prop upset.cpu.amd.dc_sb_uc@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

prop error.cpu.amd.dcachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.dc_sb_trip@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

/* #DCD_MULTI#
 * A multi-bit data array fault in a D cache can cause:
 *
 * - data_eccm : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop error.cpu.amd.dcachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

/* #DCT_PAR#
 * A tag array parity fault in a D cache can cause:
 *
 * - tag_par : reported by dc on this cpu
 */

#define	DCACHETAG_FIT		1000

event ereport.cpu.amd.dc.tag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachetag@chip/cpu, FITrate=DCACHETAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachetag@chip/cpu (1)->
    ereport.cpu.amd.dc.tag_par@chip/cpu;

/* #DCT_SNOOP#
 * A snoop tag array parity fault in a D cache can cause:
 *
 * - stag_par : reported by dc on this cpu
 */

#define	DCACHESTAG_FIT		1000

event ereport.cpu.amd.dc.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachestag@chip/cpu, FITrate=DCACHESTAG_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachestag@chip/cpu (1)->
    ereport.cpu.amd.dc.stag_par@chip/cpu;

/* #DCTLB_1#
 * An l1tlb parity fault in a D cache can cause:
 *
 * - l1tlb_par : reported by dc on this cpu
 */

#define	L1DTLB_FIT		1000

event ereport.cpu.amd.dc.l1tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l1dtlb@chip/cpu, FITrate=L1DTLB_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l1dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l1tlb_par@chip/cpu;

/* #DCTLB_2#
 * An l2tlb parity fault in a D cache can cause:
 *
 * - l2tlb_par : reported by dc on this cpu
 */

#define	L2DTLB_FIT		1000

event ereport.cpu.amd.dc.l2tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l2dtlb@chip/cpu, FITrate=L2DTLB_FIT,
    FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l2dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l2tlb_par@chip/cpu;

/* #DPATH_SB#
 * Datapath errors between NB/MC and core.
 */

#define	CPU_DP_FIT		1000

event fault.cpu.amd.datapath@chip/cpu, FITrate=CPU_DP_FIT, FRU=chip,
    ASRU=chip/cpu;
event error.cpu.amd.datapath_sb@chip/cpu;
event error.cpu.amd.datapath_mb@chip/cpu;

prop fault.cpu.amd.datapath@chip/cpu (1)->
    error.cpu.amd.datapath_sb@chip/cpu,
    error.cpu.amd.datapath_mb@chip/cpu;

/*
 * A single-bit fault in the datapath between the NB and requesting core
 * can cause:
 *
 * - inf_sys_ecc1 : reported by ic on access from a local cpu
 * - inf_sys_ecc1 : reported by dc on access from a local cpu
 * - s_ecc1 : reported by bu on access from a local cpu (hw prefetch etc)
 *
 * Empirical observations show that in 64/8 ECC mode some memory CEs *can*
 * travel past the DRAM controller and on to the IC/DC/BU to be reported
 * via the above errors.  This is not the case with ChipKill enabled.
 * We should not be diagnosing datapath/chip faults for these.  Until this
 * behaviour is clarified the SERD parameters are set effectively to
 * infinity (and the multi-bit counterparts will not be seen because of
 * sync flood).
 */
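/*
 * To make "effectively infinity" concrete with the values defined below:
 * serd.cpu.dp_sb would have to see more than CPU_DP_COUNT (5000) qualifying
 * ereports within CPU_DP_TIME (one minute) before dp_sb_trip could fire, a
 * rate no plausible stream of CEs approaches, so in practice the single-bit
 * datapath propagations never trip a fault.
 */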
#define	CPU_DP_COUNT	5000
#define	CPU_DP_TIME	1m

event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)};
event upset.cpu.dp_sb@chip/cpu, engine=serd.cpu.dp_sb@chip/cpu;
event ereport.cpu.amd.dp_sb_trip@chip/cpu;

engine serd.cpu.dp_sb@chip/cpu, N=CPU_DP_COUNT, T=CPU_DP_TIME,
    method=persistent, trip=ereport.cpu.amd.dp_sb_trip@chip/cpu;

prop upset.cpu.dp_sb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu;

prop error.cpu.amd.datapath_sb@chip/cpu (1)->
    ereport.cpu.amd.dp_sb_trip@chip/cpu;

prop fault.cpu.amd.datapath@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu;

/* #DPATH_MB#
 * A multi-bit fault in the datapath between the NB and requesting core
 * can cause:
 *
 * - inf_sys_eccm : reported by ic on access from a local cpu
 * - inf_sys_eccm : reported by dc on access from a local cpu
 * - s_eccm : reported by bu on access from a local cpu (hw prefetch etc)
 */

event ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu;
event ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu;
event ereport.cpu.amd.bu.s_eccm@chip/cpu;

prop error.cpu.amd.datapath_mb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu;

prop fault.cpu.amd.datapath@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu;

/*
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do.  These fall into a few categories:
 *
 * - the corresponding detector is not enabled, typically because
 *   detection/handling of the event is taking place elsewhere
 *   (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
 * - the event is associated with a sync flood, so even if the detector is
 *   enabled we will never handle the event and generate an ereport *and*
 *   even if the ereport did arrive we could perform no useful diagnosis,
 *   e.g., the NB can be configured for sync flood on nb.mem_eccm
 *   but we don't choose to discard that ereport here since we could have
 *   made a useful diagnosis from it had it been delivered
 *   (nb.ht_sync, nb.ht_crc)
 * - events that will be accompanied by an immediate panic and
 *   delivery of the ereport during subsequent reboot, but from
 *   which no useful diagnosis can be made (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection.  We will perform a null diagnosis of all these ereports in
 * order to avoid "no subscription" complaints during test harness runs.
 */
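/*
 * To sketch the mechanism: upset.null_diag below has no SERD engine and no
 * associated fault, so each of the listed ereports is fully explained by a
 * transient upset, satisfying the subscription while publishing nothing.
 */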
event ereport.cpu.amd.nb.ma@cpu;
event ereport.cpu.amd.nb.ta@cpu;
event ereport.cpu.amd.ls.s_rde@cpu;
event ereport.cpu.amd.ic.rdde@cpu;
event ereport.cpu.amd.bu.s_rde@cpu;
event ereport.cpu.amd.nb.gart_walk@cpu;
event ereport.cpu.amd.nb.ht_sync@cpu;
event ereport.cpu.amd.nb.ht_crc@cpu;
event ereport.cpu.amd.nb.rmw@cpu;
event ereport.cpu.amd.nb.wdog@cpu;
event ereport.cpu.amd.unknown@cpu;

event upset.null_diag@cpu;

prop upset.null_diag@cpu (1)->
    ereport.cpu.amd.nb.ma@cpu,
    ereport.cpu.amd.nb.ta@cpu,
    ereport.cpu.amd.ls.s_rde@cpu,
    ereport.cpu.amd.ic.rdde@cpu,
    ereport.cpu.amd.bu.s_rde@cpu,
    ereport.cpu.amd.nb.gart_walk@cpu,
    ereport.cpu.amd.nb.ht_sync@cpu,
    ereport.cpu.amd.nb.ht_crc@cpu,
    ereport.cpu.amd.nb.rmw@cpu,
    ereport.cpu.amd.nb.wdog@cpu,
    ereport.cpu.amd.unknown@cpu;