/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

#define MAX(x, y) ((x) >= (y) ? (x) : (y))
#define MIN(x, y) ((x) <= (y) ? (x) : (y))

/*
 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
 * we diagnose for page faults, to record the physical address of the faulting
 * page.
 */
#define SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))

#define SET_OFFSET (setpayloadprop("asru-offset", \
    payloadprop("resource[0].hc-specific.offset")))

/*
 * RESOURCE_EXISTS is true if a member with name "resource" exists in the
 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
 */
#define RESOURCE_EXISTS (payloadprop_defined("resource"))

/*
 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
 * ereports) exists and one of its members matches the path for the
 * rank node.  Our memory propagations are of the form
 *
 *	"prop foo@chip/memory-controller/dimm/rank -> blah@chip/cpu"
 *
 * since cpus detect memory errors; in eversholt such a propagation, where
 * the lhs path and rhs path do not match, expands to the cross-product of
 * all dimms, ranks and cpus on the same chip (since chip appears in the
 * path on both sides).  We use CONTAINS_RANK to constrain the propagation
 * such that it only happens if the payload resource matches the rank.
 */
#define CONTAINS_RANK (payloadprop_contains("resource", \
    asru(chip/memory-controller/dimm/rank)) \
    || payloadprop_contains("resource", \
    asru(chip/memory-controller/dimm)))

/*
 * The following will tell us whether a syndrome that is known to be
 * correctable (from a mem_ce ereport) is single-bit or multi-bit.  For a
 * correctable ChipKill syndrome the number of bits set in the lowest
 * nibble indicates how many bits were in error.
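 *
 * For example (illustrative values only): a correctable ChipKill syndrome
 * whose low nibble is 0x1, 0x2, 0x4 or 0x8 has a single bit set there and
 * is classified as single-bit by CKSINGLE below, while a low nibble of
 * 0x3 (two bits set) is classified as multi-bit.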
 */

#define CBITMASK(synd) ((synd) & 0xf)

#define CKSINGLE(synd) \
    ((synd) == 0 || \
    (CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 || \
    CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))

#define SINGLE_BIT_CE \
    (payloadprop("syndrome-type") == "E" || \
    (payloadprop("syndrome-type") == "C" && \
    CKSINGLE(payloadprop("syndrome"))))

#define MULTI_BIT_CE \
    (payloadprop("syndrome-type") == "C" && \
    !CKSINGLE(payloadprop("syndrome")))

/* #PAGE#
 * #DIMM_SCU#
 * A single bit fault in a memory rank can cause:
 *
 *  - mem_ce : reported by nb
 *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
 *    ic do not record a syndrome; these errors will not be triggered in
 *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
 *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
 *    ECC mode (the NB corrects all ECC errors in that mode)
 *
 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
 * trips we diagnose a fault.memory.page_sb so that the response agent can
 * retire the page that caused the trip.  If the total number of pages
 * faulted in this way on a single rank exceeds a threshold we will
 * diagnose a fault.memory.dimm_sb against the containing dimm.
 *
 * Multi-bit ChipKill-correctable errors are treated identically to
 * single-bit errors, but via separate SERD engines to allow distinct
 * parameters if desired.
 *
 * Uncorrectable errors produce an immediate page fault and corresponding
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is only required when
 * they are accompanied by a dimm fault.  As such we include message=0
 * on page faults.
 */

event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.nb.mem_ce@chip/cpu{within(5s)};

/*
 * Single-bit correctable errors feed into per-rank
 * SERD engines which diagnose fault.memory.page_sb if they trip.
 *
 * Multi-bit correctable (via ChipKill) errors feed
 * into additional per-rank SERD engines which diagnose fault.memory.page_ck
 * if they trip.
 *
 * The number of fault.memory.page_sb and fault.memory.page_ck faults
 * diagnosed is counted in stat engines for each type.  These are used in
 * deciding whether to declare a dimm faulty after repeated page faults.
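 *
 * As an illustrative example using the default tunables that follow: with
 * N=PAGE_SB_COUNT and T=PAGE_SB_TIME, once enough single-bit CEs arrive on
 * one rank within the 72 hour window, the serd.memory.page_sb engine for
 * that rank trips, a fault.memory.page_sb is diagnosed, and stat.sbpgflt
 * for that rank is bumped; it is that stat count which the dimm_sb/dimm_ck
 * propagations later compare against RANK_THRESH.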
 */

#define PAGE_SB_COUNT 2
#define PAGE_SB_TIME 72h
#define PAGE_CK_COUNT 2
#define PAGE_CK_TIME 72h

engine stat.sbpgflt@chip/memory-controller/dimm/rank;
engine stat.ckpgflt@chip/memory-controller/dimm/rank;
engine serd.memory.page_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
engine serd.memory.page_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
event fault.memory.page_sb@chip/memory-controller/dimm/rank, message=0,
    count=stat.sbpgflt@chip/memory-controller/dimm/rank, response=0,
    engine=serd.memory.page_sb@chip/memory-controller/dimm/rank;
event fault.memory.page_ck@chip/memory-controller/dimm/rank, message=0,
    count=stat.ckpgflt@chip/memory-controller/dimm/rank, response=0,
    engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;
event fault.memory.dimm_sb@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank;
event fault.memory.dimm_ck@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank;

/*
 * The fraction of pages on a single rank that must be diagnosed as faulty
 * with single correctable unit faults before we will fault the rank.
 * Once we have faulted the rank we will continue to diagnose any further page
 * faults on the rank, up to some maximum multiple of the threshold at which
 * we faulted the dimm.  This allows us to potentially contain some fairly
 * far-reaching but still limited-extent fault (such as a partial column
 * failure) without getting carried away and allowing a single faulty rank to
 * use up the entire system-imposed page retirement limit (which, once
 * reached, causes retirement requests to have no effect other than to fill
 * the fault manager cache and logs).
 *
 * This fraction is specified in basis points, where 100 basis points are
 * equivalent to 1 percent.  It is applied on a per-rank basis.
 *
 * The system imposes an absolute maximum on the number of pages it will
 * retire; the current value is 10 basis points, or 0.1% of 'physmem'.  Note
 * that 'physmem' is reduced from installed memory pages by an amount
 * reflecting permanent kernel memory allocations.  This system page retire
 * limit bounds the maximum real response to page faults across all ranks
 * that fault manager response agents can effect, but it should not be confused
 * with any diagnosis threshold (i.e., the number of faulty pages we are
 * prepared to tolerate from a single rank before faulting the rank is
 * distinct from the total number of pages we are prepared to retire from use
 * in response to that and other faults).  It is, however, desirable to
 * arrange that the maximum number of pages we are prepared to fault from
 * any one rank is less than the system-wide quota.
 */
#define PAGE_RETIRE_LIMIT_BPS 5		/* or 0.05%; ~ 131 pages/GB */

/*
 * A macro to manipulate the above fraction.  Given a size in bytes convert
 * this to pages (4K pagesize) and calculate the number of those pages
 * indicated by PAGE_RETIRE_LIMIT_BPS basis points.
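 *
 * A worked example (illustrative only): a 1 GB rank holds
 * 1073741824 / 4096 = 262144 4K pages, and 5 basis points of that is
 * (262144 * 5) / 10000 = 131 pages, matching the "~ 131 pages/GB" note
 * against PAGE_RETIRE_LIMIT_BPS above.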
 */
#define _BPS_PGCNT(totalbytes) \
    ((((totalbytes) / 4096 ) * PAGE_RETIRE_LIMIT_BPS) / 10000)

/*
 * The single-correctable-unit threshold number of faulted pages on a rank
 * at which we fault the rank.  We insist that this be at least 128 and
 * never more than 512.
 */
#define RANK_THRESH MIN(512, MAX(128, \
    _BPS_PGCNT(confprop(chip/memory-controller/dimm/rank, "size"))))

/*
 * The maximum number of single-correctable-unit page faults we will diagnose
 * on a single rank (must be greater than RANK_THRESH).  We set
 * this at twice the rank fault threshold.
 */
#define RANK_PGFLT_MAX (2 * RANK_THRESH)

#define SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank))
#define CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank))

/*
 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of
 * page faults (diagnosed from repeated single-bit or multi-bit ChipKill CEs)
 * from any one rank on that DIMM reaches a threshold.  A "correctable unit"
 * is a single bit in normal 64/8 ECC mode, or a single symbol in ChipKill
 * 128/16 mode (i.e., nibble-aligned nibble for the code used on Opteron).
 *
 * We do not stop diagnosing further single-bit page faults once we have
 * declared a single-bit DIMM fault - we continue diagnosing them and
 * response agents can continue to retire those pages up to the system-imposed
 * retirement limit.
 *
 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and
 * fault.memory.dimm_ck.  Which one is diagnosed depends on whether the
 * majority of the page faults that reached the threshold were single-bit
 * or multi-bit.
 *
 * Implementation: we maintain SERD engines parallel to the page_sb and
 * page_ck engines, which trip in unison.  On a trip a distinct ereport is
 * generated, which we diagnose to a fault if the threshold has been reached.
 */
prop fault.memory.page_sb@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SINGLE_BIT_CE &&
    SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu,
    ereport.cpu.amd.nb.mem_ce@chip/cpu;

prop fault.memory.page_ck@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && !SINGLE_BIT_CE &&
    SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu,
    ereport.cpu.amd.nb.mem_ce@chip/cpu;

prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
    SB_PGFLTS > RANK_THRESH / 2 } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu,
    ereport.cpu.amd.nb.mem_ce@chip/cpu;

prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
    CK_PGFLTS > RANK_THRESH / 2 } (1)->
    ereport.cpu.amd.nb.mem_ce@chip/cpu;

/*
 * If the address is not valid then no resource member will be included
 * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare.
 * We will also discard all inf_sys_ecc1 events detected at the ic since they
 * have no syndrome and therefore no resource information.
 * We discard all such ereports.  An alternative may be to SERD them
 * on a per-MC basis and trip if we see too many such events.
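 *
 * As with the null diagnosis described under #MISC# at the end of this
 * file, propagating the resource-less ereports to an upset event below
 * gives them a subscriber so they are consumed without declaring any
 * memory fault.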
 */
event upset.memory.discard1@chip/cpu;
prop upset.memory.discard1@chip/cpu
    { !RESOURCE_EXISTS } (1)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu,
    ereport.cpu.amd.nb.mem_ce@chip/cpu;

/* #DIMM_UE#
 * #PAGE_UE#
 * An uncorrectable multi-bit fault in a memory dimm can cause:
 *
 *  - mem_ue : reported by nb for an access from a remote cpu
 *  - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome
 *  - s_eccm : reported by bu
 *
 * Since on production systems we force HT Sync Flood on uncorrectable
 * memory errors (if not already set as such by the BIOS, as it should be)
 * we won't actually receive these ereports since the system will be reset.
 */

event ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.nb.mem_ue@chip/cpu{within(5s)};

event fault.memory.dimm_ue@chip/memory-controller/dimm/rank;
event fault.memory.page_ue@chip/memory-controller/dimm/rank, message=0,
    response=0;

prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank
    { CONTAINS_RANK } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu,
    ereport.cpu.amd.nb.mem_ue@chip/cpu;

prop fault.memory.page_ue@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu,
    ereport.cpu.amd.nb.mem_ue@chip/cpu;

event upset.memory.discard3@chip/cpu;
prop upset.memory.discard3@chip/cpu
    { !RESOURCE_EXISTS } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu,
    ereport.cpu.amd.nb.mem_ue@chip/cpu;

/* #CSTESTFAIL#
 * If the BIOS fails a chip-select during POST, or perhaps after a
 * sync flood from an uncorrectable error, then on revision F and G it
 * should mark that chip-select as TestFail in the CS Base register.
 * When the memory-controller driver discovers all the MC configuration
 * it notes such failed chip-selects and creates topology nodes for the
 * chip-select and associated dimms and ranks, and produces an ereport for each
 * failed chip-select with detector set to the memory-controller node
 * and resource indicating the failed chip-select.
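 *
 * The CSMATCH macro below shows how that resource is tied back to
 * dimm/rank nodes: the rank's "csname" configuration property is compared
 * against the chip-select node's "dimm1-csname" and "dimm2-csname"
 * properties, and the dimm_testfail fault propagates only where they match.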
 */

event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller{within(5s)};
event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank;
event error.memory.cs_testfail@chip/memory-controller/chip-select;

#define CONTAINS_CS (payloadprop_contains("resource", \
    asru(chip/memory-controller/chip-select)))

prop error.memory.cs_testfail@chip/memory-controller/chip-select (1)->
    ereport.cpu.amd.mc.cs_testfail@chip/memory-controller
    { CONTAINS_CS };

#define CSMATCH(s) \
    (confprop_defined(chip/memory-controller/chip-select, s) && \
    confprop(chip/memory-controller/chip-select, s) == \
    confprop(chip/memory-controller/dimm/rank, "csname"))

prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank (1)->
    error.memory.cs_testfail@chip/memory-controller/chip-select
    { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname") };

/* #ADDRPAR#
 * DRAM Command/Address Parity Errors.
 *
 *  - dramaddr_par : reported by the nb; the NB status register includes
 *    a bit indicating which dram controller channel (A or B) experienced
 *    the error.
 */

event ereport.cpu.amd.nb.dramaddr_par@chip/cpu{within(5s)};
event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel,
    response=0;

prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)->
    ereport.cpu.amd.nb.dramaddr_par@chip/cpu {
    ((payloadprop("IA32_MCi_STATUS") >> 32 & 0x200) ? 1 : 0) == y };

/* #L2D_SINGLE#
 * A single bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_ecc1 : reported by ic on this cpu
 *  - inf_l2_ecc1 : reported by dc on this cpu
 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
 */

#define L2CACHEDATA_SB_COUNT 3
#define L2CACHEDATA_SB_TIME 12h

event ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2d_ecc1@chip/cpu{within(5s)};
engine serd.cpu.amd.l2d_sb@chip/cpu,
    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME;
event fault.cpu.amd.l2cachedata@chip/cpu, engine=serd.cpu.amd.l2d_sb@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

/* #L2D_MULTI#
 * A multi-bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_eccm : reported by ic on this cpu
 *  - inf_l2_eccm : reported by dc on this cpu
 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
 */

event ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2d_eccm@chip/cpu{within(5s)};

prop fault.cpu.amd.l2cachedata@chip/cpu
    { setserdincrement(L2CACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

/* #L2T_SINGLE#
 * A single bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

#define L2CACHETAG_SB_COUNT 3
#define L2CACHETAG_SB_TIME 12h

event ereport.cpu.amd.bu.l2t_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/cpu{within(5s)};
engine serd.cpu.amd.l2t_sb@chip/cpu,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME;
event fault.cpu.amd.l2cachetag@chip/cpu, engine=serd.cpu.amd.l2t_sb@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* #L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/cpu{within(5s)};

prop fault.cpu.amd.l2cachetag@chip/cpu
    { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* #ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 */

#define ICACHEDATA_SB_COUNT 2
#define ICACHEDATA_SB_TIME 168h

event ereport.cpu.amd.ic.data_par@chip/cpu{within(5s)};
engine serd.cpu.amd.icachedata@chip/cpu,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME;
event fault.cpu.amd.icachedata@chip/cpu,
    engine=serd.cpu.amd.icachedata@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

/* #ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 */

#define ICACHETAG_SB_COUNT 2
#define ICACHETAG_SB_TIME 168h

event ereport.cpu.amd.ic.tag_par@chip/cpu{within(5s)};
engine serd.cpu.amd.icachetag@chip/cpu,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME;
event fault.cpu.amd.icachetag@chip/cpu, engine=serd.cpu.amd.icachetag@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (0)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

/* #ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 */

event ereport.cpu.amd.ic.stag_par@chip/cpu{within(5s)};
event fault.cpu.amd.icachestag@chip/cpu;

prop fault.cpu.amd.icachestag@chip/cpu (1)->
    ereport.cpu.amd.ic.stag_par@chip/cpu;

/* #ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 */

#define ICACHEL1TLB_SB_COUNT 2
#define ICACHEL1TLB_SB_TIME 168h

event ereport.cpu.amd.ic.l1tlb_par@chip/cpu{within(5s)};
engine serd.cpu.amd.l1itlb@chip/cpu,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME;
event fault.cpu.amd.l1itlb@chip/cpu, engine=serd.cpu.amd.l1itlb@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

/* #ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 */

#define ICACHEL2TLB_SB_COUNT 2
#define ICACHEL2TLB_SB_TIME 168h

event ereport.cpu.amd.ic.l2tlb_par@chip/cpu{within(5s)};
engine serd.cpu.amd.l2itlb@chip/cpu,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME;
event fault.cpu.amd.l2itlb@chip/cpu, engine=serd.cpu.amd.l2itlb@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

/* #DCD_SINGLE#
 * A single bit data array fault in a D cache can cause:
 *
 *  - data_ecc1 : reported by dc on this cpu by scrubber
 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic, so
 * it is handled by the multi-bit case in the following section.
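 *
 * (The "fault immediately" behaviour comes from the setserdincrement
 * calls used in the multi-bit propagations: bumping the SERD engine by
 * more than its N threshold on a single event pushes it straight past
 * the trip point, so one such ereport is enough to diagnose the fault.)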
 */

#define DCACHEDATA_SB_COUNT 2
#define DCACHEDATA_SB_TIME 168h

event ereport.cpu.amd.dc.data_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu{within(5s)};
engine serd.cpu.amd.dc_sb@chip/cpu,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME;
event fault.cpu.amd.dcachedata@chip/cpu, engine=serd.cpu.amd.dc_sb@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu;

/* #DCD_MULTI#
 * A multi-bit data array fault in a D cache can cause:
 *
 *  - data_eccm : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.data_eccm@chip/cpu{within(5s)};

prop fault.cpu.amd.dcachedata@chip/cpu
    { setserdincrement(DCACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

/* #DCT_PAR#
 * A tag array parity fault in a D cache can cause:
 *
 *  - tag_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.tag_par@chip/cpu{within(5s)};
event fault.cpu.amd.dcachetag@chip/cpu;

prop fault.cpu.amd.dcachetag@chip/cpu (1)->
    ereport.cpu.amd.dc.tag_par@chip/cpu;

/* #DCT_SNOOP#
 * A snoop tag array parity fault in a D cache can cause:
 *
 *  - stag_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.stag_par@chip/cpu{within(5s)};
event fault.cpu.amd.dcachestag@chip/cpu;

prop fault.cpu.amd.dcachestag@chip/cpu (1)->
    ereport.cpu.amd.dc.stag_par@chip/cpu;

/* #DCTLB_1#
 * An l1tlb parity fault in a D cache can cause:
 *
 *  - l1tlb_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.l1tlb_par@chip/cpu{within(5s)};
event fault.cpu.amd.l1dtlb@chip/cpu;

prop fault.cpu.amd.l1dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l1tlb_par@chip/cpu;

/* #DCTLB_2#
 * An l2tlb parity fault in a D cache can cause:
 *
 *  - l2tlb_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.l2tlb_par@chip/cpu{within(5s)};
event fault.cpu.amd.l2dtlb@chip/cpu;

prop fault.cpu.amd.l2dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l2tlb_par@chip/cpu;

/* #MISC#
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do.  These fall into a few categories:
 *
 *  - the corresponding detector is not enabled, typically because
 *    detection/handling of the event is taking place elsewhere
 *    (nb.ma, nb.ta, ls.rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *  - the event is associated with a sync flood so even if the detector is
 *    enabled we will never handle the event and generate an ereport *and*
 *    even if the ereport did arrive we could perform no useful diagnosis
 *    e.g., the NB can be configured for sync flood on nb.mem_eccm
 *    but we don't choose to discard that ereport here since we could have
 *    made a useful diagnosis from it had it been delivered
 *    (nb.ht_sync, nb.ht_crc)
 *  - events that will be accompanied by an immediate panic and
 *    delivery of the ereport during subsequent reboot but from
 *    which no useful diagnosis can be made (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection.  We will perform a null diagnosis of all these ereports in order
 * to avoid "no subscription" complaints during test harness runs.
 */

event ereport.cpu.amd.nb.ma@cpu{within(5s)};
event ereport.cpu.amd.nb.ta@cpu{within(5s)};
event ereport.cpu.amd.ls.s_rde@cpu{within(5s)};
event ereport.cpu.amd.ic.rdde@cpu{within(5s)};
event ereport.cpu.amd.bu.s_rde@cpu{within(5s)};
event ereport.cpu.amd.nb.gart_walk@cpu{within(5s)};
event ereport.cpu.amd.nb.ht_sync@cpu{within(5s)};
event ereport.cpu.amd.nb.ht_crc@cpu{within(5s)};
event ereport.cpu.amd.nb.rmw@cpu{within(5s)};
event ereport.cpu.amd.nb.wdog@cpu{within(5s)};
event ereport.cpu.amd.unknown@cpu{within(5s)};

event upset.null_diag@cpu;

prop upset.null_diag@cpu (1)->
    ereport.cpu.amd.nb.ma@cpu,
    ereport.cpu.amd.nb.ta@cpu,
    ereport.cpu.amd.ls.s_rde@cpu,
    ereport.cpu.amd.ic.rdde@cpu,
    ereport.cpu.amd.bu.s_rde@cpu,
    ereport.cpu.amd.nb.gart_walk@cpu,
    ereport.cpu.amd.nb.ht_sync@cpu,
    ereport.cpu.amd.nb.ht_crc@cpu,
    ereport.cpu.amd.nb.rmw@cpu,
    ereport.cpu.amd.nb.wdog@cpu,
    ereport.cpu.amd.unknown@cpu;