/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma dictionary "INTEL"

/*
 * Eversholt diagnosis rules for Intel CPU and memory errors.
 */

/*
 * Simple error codes.
 *
 * SMPL_EVENT(leaf, window) declares ereport.cpu.intel.<leaf> at both
 * detector paths used by this file (chip/cpu and chip/core/strand), each
 * with the given "within" window.
 */
#define SMPL_EVENT(leaf, window) \
    event ereport.cpu.intel.leaf@chip/cpu { within(window) }; \
    event ereport.cpu.intel.leaf@chip/core/strand { within(window) }

SMPL_EVENT(unknown, 1s);
SMPL_EVENT(unclassified, 1s);
SMPL_EVENT(microcode_rom_parity, 1s);
SMPL_EVENT(external, 1s);
SMPL_EVENT(frc, 1s);
SMPL_EVENT(internal_timer, 1s);
SMPL_EVENT(internal_parity, 1s);
SMPL_EVENT(internal_unclassified, 1s);

/*
 * Propagations for every simple error except "external" and "unknown".
 *
 * fault.cpu.intel.internal is tied to a SERD engine with N=3, T=72h.
 * An ereport whose "error_uncorrected" payload is 1 bumps the engine by 4,
 * so the fault is declared at once; any other ereport counts once and the
 * fault is declared only when the SERD engine trips.
 *
 * SMPL_FLT_PROP(where) emits the engine, the fault event and the
 * propagation for one detector path.
 */
#define SMPL_FLT_PROP(where) \
    engine serd.cpu.intel.simple@where, N=3, T=72h; \
    event fault.cpu.intel.internal@where, \
        engine=serd.cpu.intel.simple@where; \
    prop fault.cpu.intel.internal@where \
        { payloadprop("error_uncorrected") == 1 ? setserdincrement(4) : 1} \
        (0)-> \
        ereport.cpu.intel.microcode_rom_parity@where, \
        ereport.cpu.intel.internal_timer@where, \
        ereport.cpu.intel.internal_parity@where, \
        ereport.cpu.intel.unclassified@where, \
        ereport.cpu.intel.internal_unclassified@where, \
        ereport.cpu.intel.frc@where

SMPL_FLT_PROP(chip/cpu);
SMPL_FLT_PROP(chip/core/strand);

/*
 * Compound error codes.
 *
 * Every compound class comes as a pair: "foo" for the corrected form and
 * "foo_uc" for the uncorrected form.  CMPND_EVENT(leaf, window) declares
 * both members of the pair at chip/cpu and at chip/core/strand.
 */
#define CMPND_EVENT(leaf, window) \
    event ereport.cpu.intel.leaf@chip/cpu { within(window) }; \
    event ereport.cpu.intel.leaf/**/_uc@chip/cpu { within(window) }; \
    event ereport.cpu.intel.leaf@chip/core/strand { within(window) }; \
    event ereport.cpu.intel.leaf/**/_uc@chip/core/strand { within(window) }

/* Compound ereports: generic cache errors */
CMPND_EVENT(l0cache, 1s);
CMPND_EVENT(l1cache, 1s);
CMPND_EVENT(l2cache, 1s);
CMPND_EVENT(cache, 1s);

/* Compound ereports: TLB errors (data, instruction, unified) */
CMPND_EVENT(l0dtlb, 1s);
CMPND_EVENT(l1dtlb, 1s);
CMPND_EVENT(l2dtlb, 1s);
CMPND_EVENT(dtlb, 1s);

CMPND_EVENT(l0itlb, 1s);
CMPND_EVENT(l1itlb, 1s);
CMPND_EVENT(l2itlb, 1s);
CMPND_EVENT(itlb, 1s);

CMPND_EVENT(l0tlb, 1s);
CMPND_EVENT(l1tlb, 1s);
CMPND_EVENT(l2tlb, 1s);
CMPND_EVENT(tlb, 1s);

/* Compound ereports: memory hierarchy (data and instruction caches) */
CMPND_EVENT(l0dcache, 1s);
CMPND_EVENT(l1dcache, 1s);
CMPND_EVENT(l2dcache, 1s);
CMPND_EVENT(dcache, 1s);

CMPND_EVENT(l0icache, 1s);
CMPND_EVENT(l1icache, 1s);
CMPND_EVENT(l2icache, 1s);
CMPND_EVENT(icache, 1s);

/* Compound ereports: bus and interconnect errors */
CMPND_EVENT(bus_interconnect, 1s);
CMPND_EVENT(bus_interconnect_memory, 1s);
CMPND_EVENT(bus_interconnect_io, 1s);

/*
 * Compound error propagations.
 *
 * We resist the temptation to propagate, for example, a single dcache
 * fault to all ereports mentioning dcache (l0dcache, l1dcache, l2dcache,
 * dcache).  Instead a distinct fault is diagnosed for each possible cache
 * level, whether or not current chips have dcaches at all levels.
 *
 * Corrected errors are SERDed and produce a fault when the engine fires;
 * the same fault is diagnosed immediately for a corresponding uncorrected
 * error, because the "_uc" propagation bumps the engine by N + 1.
 */

/*
 * The four propagations shared by CMPND_FLT_PROP_1 and CMPND_FLT_PROP_2:
 * corrected ereport -> fault (counted once), then uncorrected ereport ->
 * fault with a SERD increment of cnt + 1, at each detector path.
 */
#define CMPND_PROPS(erpt, flt, cnt) \
    prop fault.cpu.intel.flt@chip/cpu (0)-> \
        ereport.cpu.intel.erpt@chip/cpu; \
    prop fault.cpu.intel.flt@chip/core/strand (0)-> \
        ereport.cpu.intel.erpt@chip/core/strand; \
    prop fault.cpu.intel.flt@chip/cpu \
        { setserdincrement(cnt + 1) } (0)-> \
        ereport.cpu.intel.erpt/**/_uc@chip/cpu; \
    prop fault.cpu.intel.flt@chip/core/strand \
        { setserdincrement(cnt + 1) } (0)-> \
        ereport.cpu.intel.erpt/**/_uc@chip/core/strand

/*
 * CMPND_FLT_PROP_1(erpt, flt, cnt, period): declare the SERD engine
 * (N=cnt, T=period) and fault.cpu.intel.<flt> at both detector paths, then
 * wire ereport.cpu.intel.<erpt> and <erpt>_uc to that fault.
 */
#define CMPND_FLT_PROP_1(erpt, flt, cnt, period) \
    engine serd.cpu.intel.flt@chip/cpu, N=cnt, T=period; \
    event fault.cpu.intel.flt@chip/cpu, \
        engine=serd.cpu.intel.flt@chip/cpu; \
    engine serd.cpu.intel.flt@chip/core/strand, N=cnt, T=period; \
    event fault.cpu.intel.flt@chip/core/strand, \
        engine=serd.cpu.intel.flt@chip/core/strand; \
    CMPND_PROPS(erpt, flt, cnt)

/*
 * CMPND_FLT_PROP_2 differs from CMPND_FLT_PROP_1 only in that the fault
 * events are declared with retire=0, response=0.
 */
#define CMPND_FLT_PROP_2(erpt, flt, cnt, period) \
    engine serd.cpu.intel.flt@chip/cpu, N=cnt, T=period; \
    event fault.cpu.intel.flt@chip/cpu, retire=0, response=0, \
        engine=serd.cpu.intel.flt@chip/cpu; \
    engine serd.cpu.intel.flt@chip/core/strand, N=cnt, T=period; \
    event fault.cpu.intel.flt@chip/core/strand, retire=0, response=0, \
        engine=serd.cpu.intel.flt@chip/core/strand; \
    CMPND_PROPS(erpt, flt, cnt)

CMPND_FLT_PROP_1(l0cache, l0cache, 3, 72h);
CMPND_FLT_PROP_1(l1cache, l1cache, 3, 72h);
CMPND_FLT_PROP_1(l2cache, l2cache, 3, 72h);
CMPND_FLT_PROP_1(cache, cache, 12, 72h);

/*
 * Remaining compound fault propagations.  In every row the ereport leaf
 * and the fault leaf are the same name, so each level gets its own fault
 * class and its own SERD engine.
 */
CMPND_FLT_PROP_1(l0dtlb, l0dtlb, 3, 72h);
CMPND_FLT_PROP_1(l1dtlb, l1dtlb, 3, 72h);
CMPND_FLT_PROP_1(l2dtlb, l2dtlb, 3, 72h);
CMPND_FLT_PROP_1(dtlb, dtlb, 12, 72h);

CMPND_FLT_PROP_1(l0itlb, l0itlb, 3, 72h);
CMPND_FLT_PROP_1(l1itlb, l1itlb, 3, 72h);
CMPND_FLT_PROP_1(l2itlb, l2itlb, 3, 72h);
CMPND_FLT_PROP_1(itlb, itlb, 12, 72h);

/*
 * Unified TLB levels.  These three rows used to name the fault leaf
 * "litlb" for all of l0tlb/l1tlb/l2tlb, which declared the same engine and
 * fault three times and lumped three levels into one class; each level now
 * maps to its own fault like every other family.
 */
CMPND_FLT_PROP_1(l0tlb, l0tlb, 3, 72h);
CMPND_FLT_PROP_1(l1tlb, l1tlb, 3, 72h);
CMPND_FLT_PROP_1(l2tlb, l2tlb, 3, 72h);
CMPND_FLT_PROP_1(tlb, tlb, 12, 72h);

CMPND_FLT_PROP_1(l0dcache, l0dcache, 3, 72h);
CMPND_FLT_PROP_1(l1dcache, l1dcache, 3, 72h);
CMPND_FLT_PROP_1(l2dcache, l2dcache, 3, 72h);
CMPND_FLT_PROP_1(dcache, dcache, 12, 72h);

CMPND_FLT_PROP_1(l0icache, l0icache, 3, 72h);
CMPND_FLT_PROP_1(l1icache, l1icache, 3, 72h);
CMPND_FLT_PROP_1(l2icache, l2icache, 3, 72h);
CMPND_FLT_PROP_1(icache, icache, 12, 72h);

CMPND_FLT_PROP_2(bus_interconnect, bus_interconnect, 10, 72h);
CMPND_FLT_PROP_2(bus_interconnect_memory, bus_interconnect_memory, 10, 72h);
CMPND_FLT_PROP_2(bus_interconnect_io, bus_interconnect_io, 10, 72h);

/*
 * "external" and "unknown" simple ereports are diagnosed to a discard
 * upset at each detector path.
 */
event upset.discard@chip/cpu;

prop upset.discard@chip/cpu (0)->
    ereport.cpu.intel.external@chip/cpu,
    ereport.cpu.intel.unknown@chip/cpu;

event upset.discard@chip/core/strand;

prop upset.discard@chip/core/strand (0)->
    ereport.cpu.intel.external@chip/core/strand,
    ereport.cpu.intel.unknown@chip/core/strand;

/* errors detected in northbridge */


/*
 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
 * we diagnose for page faults, to record the physical address of the faulting
 * page.  Each evaluates true when the source property is absent, so a
 * missing "physaddr" or "offset" does not by itself fail the constraint.
 */
#define SET_ADDR (!payloadprop_defined("physaddr") || \
    setpayloadprop("asru-physaddr", payloadprop("physaddr")))

#define SET_OFFSET (!payloadprop_defined("offset") || \
    setpayloadprop("asru-offset", payloadprop("offset")))

/*
 * The CPU-side bus/interconnect and "external" ereports, at both detector
 * paths.  Used as the optional "(0)->" right-hand side of the northbridge
 * and memory propagations in this file.
 */
#define EREPORT_BUS_ERROR \
    ereport.cpu.intel.bus_interconnect_memory_uc@chip/cpu, \
    ereport.cpu.intel.bus_interconnect_uc@chip/cpu, \
    ereport.cpu.intel.bus_interconnect_memory@chip/cpu, \
    ereport.cpu.intel.bus_interconnect@chip/cpu, \
    ereport.cpu.intel.external@chip/cpu, \
    ereport.cpu.intel.bus_interconnect_memory_uc@chip/core/strand, \
    ereport.cpu.intel.bus_interconnect_uc@chip/core/strand, \
    ereport.cpu.intel.bus_interconnect_memory@chip/core/strand, \
    ereport.cpu.intel.bus_interconnect@chip/core/strand, \
    ereport.cpu.intel.external@chip/core/strand

/* Per-DIMM counter; referenced by the page_ce fault's count= property. */
engine stat.ce_pgflt@memory-controller/dram-channel/dimm;

/*
 * Uncorrectable memory errors (mem_ue, fbd.ma) reported against the
 * memory controller.
 */
event ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller{within(12s)};
event ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller{within(12s)};
event fault.memory.intel.page_ue@
    motherboard/memory-controller/dram-channel/dimm/rank,
    message=0, response=0;
event fault.memory.intel.dimm_ue@
    motherboard/memory-controller/dram-channel/dimm/rank;

/*
 * Page fault for the rank named by the "rank" payload property; requires
 * "physaddr" or "offset" and copies them into the fault payload.
 */
prop fault.memory.intel.page_ue@
    motherboard/memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

prop fault.memory.intel.page_ue@
    motherboard/memory-controller/dram-channel/dimm/rank (1)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

prop fault.memory.intel.page_ue@
    motherboard/memory-controller/dram-channel/dimm/rank (0)->
    EREPORT_BUS_ERROR;

/* DIMM fault for the rank named by the "rank" payload property. */
prop fault.memory.intel.dimm_ue@
    motherboard/memory-controller/dram-channel<channel_num>/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") } (1)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

prop fault.memory.intel.dimm_ue@
    motherboard/memory-controller/dram-channel/dimm/rank (1)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

prop fault.memory.intel.dimm_ue@
    motherboard/memory-controller/dram-channel/dimm/rank (0)->
    EREPORT_BUS_ERROR;

/* UE ereports that carry no "rank" property are diagnosed to a discard. */
event upset.memory.intel.discard@motherboard/memory-controller{within(1s)};

prop upset.memory.intel.discard@motherboard/memory-controller
    { !payloadprop_defined("rank") } (1)->
    ereport.cpu.intel.nb.mem_ue@motherboard/memory-controller,
    ereport.cpu.intel.nb.fbd.ma@motherboard/memory-controller;

prop upset.memory.intel.discard@motherboard/memory-controller (0)->
    EREPORT_BUS_ERROR;

/* Default SERD N/T for the page and DIMM correctable-error engines. */
#define PAGE_CE_COUNT 2
#define PAGE_CE_TIME 72h
#define DIMM_CE_COUNT 10
#define DIMM_CE_TIME 1week

event ereport.cpu.intel.nb.mem_ce@dimm/rank{within(12s)};

/*
 * Correctable errors, page level: SERDed per rank; each page fault also
 * bumps stat.ce_pgflt@dimm through the count= property.
 */
engine serd.memory.intel.page_ce@dimm/rank, N=PAGE_CE_COUNT, T=PAGE_CE_TIME;
event fault.memory.intel.page_ce@dimm/rank, message=0, response=0,
    count=stat.ce_pgflt@dimm, engine=serd.memory.intel.page_ce@dimm/rank;
prop fault.memory.intel.page_ce@dimm/rank
    { (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.intel.nb.mem_ce@dimm/rank;

/*
 * Correctable errors, DIMM level.  The fault propagates to the ereport
 * both directly and through error.memory.intel.dimm_ce@dimm.  When the
 * DIMM has no "dimm-size" configuration property, the default engine
 * parameters apply and the error requires more than 512 counted page
 * faults.
 */
engine serd.memory.intel.dimm_ce@dimm/rank, N=DIMM_CE_COUNT, T=DIMM_CE_TIME;
event fault.memory.intel.dimm_ce@dimm/rank,
    engine=serd.memory.intel.dimm_ce@dimm/rank;
event error.memory.intel.dimm_ce@dimm;
prop fault.memory.intel.dimm_ce@dimm/rank (1)->
    ereport.cpu.intel.nb.mem_ce@dimm/rank;
prop fault.memory.intel.dimm_ce@dimm/rank
    { !confprop_defined(dimm, "dimm-size") } (1)->
    error.memory.intel.dimm_ce@dimm;
prop error.memory.intel.dimm_ce@dimm
    { !confprop_defined(dimm, "dimm-size") &&
    count(stat.ce_pgflt@dimm) > 512 } (1)->
    ereport.cpu.intel.nb.mem_ce@dimm/rank;

/*
 * DIMM_CE(dimm_size, n, t, fault_rate): for a DIMM whose "dimm-size"
 * configuration property equals dimm_size, call setserdn(n) and
 * setserdt(t) on the fault's engine and require more than fault_rate
 * counted page faults.  The macro body ends in ';', so invocations below
 * carry none.
 * NOTE(review): setserdn and setserdt are joined with a single '&',
 * presumably so both are always evaluated -- confirm before changing.
 */
#define DIMM_CE(dimm_size, n, t, fault_rate) \
    prop fault.memory.intel.dimm_ce@dimm/rank { \
        confprop(dimm, "dimm-size") == dimm_size && \
        setserdn(n) & setserdt(t) } (1)-> \
        error.memory.intel.dimm_ce@dimm; \
    prop error.memory.intel.dimm_ce@dimm { \
        confprop(dimm, "dimm-size") == dimm_size && \
        count(stat.ce_pgflt@dimm) > fault_rate } (1)-> \
        ereport.cpu.intel.nb.mem_ce@dimm/rank;

DIMM_CE("8G", 8, 1week, 2000)
DIMM_CE("4G", 4, 1week, 1500)
DIMM_CE("2G", 4, 2week, 1000)
DIMM_CE("1G", 4, 4week, 500)
DIMM_CE("512M", 4, 8week, 250)
DIMM_CE("256M", 4, 16week, 125)

/* FBD alert, reported against a rank. */
event ereport.cpu.intel.nb.fbd.alert@rank{within(12s)};
event fault.memory.intel.fbd.alert@rank, retire=0;

prop fault.memory.intel.fbd.alert@rank (1)->
    ereport.cpu.intel.nb.fbd.alert@rank;

prop fault.memory.intel.fbd.alert@rank (0)->
    EREPORT_BUS_ERROR;

/* FBD CRC error, reported against a rank. */
event ereport.cpu.intel.nb.fbd.crc@rank{within(12s)};
event fault.memory.intel.fbd.crc@rank, retire=0;

prop fault.memory.intel.fbd.crc@rank (1)->
    ereport.cpu.intel.nb.fbd.crc@rank;

prop fault.memory.intel.fbd.crc@rank (0)-> EREPORT_BUS_ERROR;

/* FBD reset timeout, reported against the memory controller. */
event ereport.cpu.intel.nb.fbd.reset_timeout@memory-controller {within(12s)};
event fault.memory.intel.fbd.reset_timeout@memory-controller, retire=0;

prop fault.memory.intel.fbd.reset_timeout@memory-controller (1)->
    ereport.cpu.intel.nb.fbd.reset_timeout@memory-controller;

prop fault.memory.intel.fbd.reset_timeout@memory-controller (0)->
    EREPORT_BUS_ERROR;

/* FBD channel error: SERDed per dram-channel (N=2, T=1month). */
event ereport.cpu.intel.nb.fbd.ch@dram-channel {within(12s)};
engine serd.cpu.intel.nb.fbd.ch@dram-channel, N=2, T=1month;
event fault.memory.intel.fbd.ch@dram-channel, retire=0,
    engine=serd.cpu.intel.nb.fbd.ch@dram-channel;

prop fault.memory.intel.fbd.ch@dram-channel (1)->
    ereport.cpu.intel.nb.fbd.ch@dram-channel;

prop fault.memory.intel.fbd.ch@dram-channel (0)->
    EREPORT_BUS_ERROR;

/* FBD otf error: SERDed per dram-channel (N=2, T=1week). */
event ereport.cpu.intel.nb.fbd.otf@dram-channel {within(12s)};
engine serd.cpu.intel.nb.fbd_otf@dram-channel, N=2, T=1week;
event fault.memory.intel.fbd.otf@dram-channel, retire=0, response=0,
    engine=serd.cpu.intel.nb.fbd_otf@dram-channel;

prop fault.memory.intel.fbd.otf@dram-channel (1)->
    ereport.cpu.intel.nb.fbd.otf@dram-channel;

/* Northbridge otf error, reported against the motherboard. */
event ereport.cpu.intel.nb.otf@motherboard {within(12s)};
event fault.cpu.intel.nb.otf@motherboard, retire=0, response=0;

prop fault.cpu.intel.nb.otf@motherboard (1)->
    ereport.cpu.intel.nb.otf@motherboard;

/* Unknown and spd northbridge ereports are diagnosed to a discard upset. */
event ereport.cpu.intel.nb.unknown@memory-controller {within(12s)};
event ereport.cpu.intel.nb.unknown@memory-controller/dram-channel {within(12s)};
event ereport.cpu.intel.nb.spd@memory-controller/dram-channel {within(12s)};
event upset.discard@memory-controller;

prop upset.discard@memory-controller (0)->
    ereport.cpu.intel.nb.unknown@memory-controller,
    ereport.cpu.intel.nb.unknown@memory-controller/dram-channel,
    ereport.cpu.intel.nb.spd@memory-controller/dram-channel;

/* mem_ds: fault the rank named by the "rank" payload property. */
event ereport.cpu.intel.nb.mem_ds@memory-controller{within(30s)};
event fault.memory.intel.fbd.mem_ds@memory-controller/dram-channel/dimm/rank,
    retire=0;

prop fault.memory.intel.fbd.mem_ds@
    memory-controller/dram-channel/dimm/rank[rank_num]
    { payloadprop_defined("rank") && rank_num == payloadprop("rank") } (1)->
    ereport.cpu.intel.nb.mem_ds@memory-controller;

/* Front-side bus error, reported against a chip. */
event ereport.cpu.intel.nb.fsb@chip{within(12s)};
event fault.cpu.intel.nb.fsb@chip, retire=0;

prop fault.cpu.intel.nb.fsb@chip (1)->
    ereport.cpu.intel.nb.fsb@chip;

prop fault.cpu.intel.nb.fsb@chip (0)-> EREPORT_BUS_ERROR;

/* Northbridge "ie" error, reported against the motherboard. */
event ereport.cpu.intel.nb.ie@motherboard{within(12s)};
event fault.cpu.intel.nb.ie@motherboard, retire=0;

prop fault.cpu.intel.nb.ie@motherboard (1)->
    ereport.cpu.intel.nb.ie@motherboard;

prop fault.cpu.intel.nb.ie@motherboard (0)-> EREPORT_BUS_ERROR;

/* Northbridge DMA error, reported against the motherboard. */
event ereport.cpu.intel.nb.dma@motherboard{within(12s)};
event fault.cpu.intel.nb.dma@motherboard, retire=0, response=0;

prop fault.cpu.intel.nb.dma@motherboard (1)->
    ereport.cpu.intel.nb.dma@motherboard;

prop fault.cpu.intel.nb.dma@motherboard (0)-> EREPORT_BUS_ERROR;

/* esi and pex ereports are diagnosed to an upset on the hostbridge. */
event ereport.cpu.intel.nb.esi@motherboard{within(12s)};
event ereport.cpu.intel.nb.pex@hostbridge{within(12s)};
event upset.cpu.intel.nb.pex@hostbridge;

prop upset.cpu.intel.nb.pex@hostbridge (1)->
    ereport.cpu.intel.nb.esi@motherboard,
    ereport.cpu.intel.nb.pex@hostbridge;

prop upset.cpu.intel.nb.pex@hostbridge (0)-> EREPORT_BUS_ERROR;

/* Unknown northbridge ereports against a rank are discarded. */
event ereport.cpu.intel.nb.unknown@rank{within(12s)};
event upset.discard@rank;

prop upset.discard@rank (1)->
    ereport.cpu.intel.nb.unknown@rank;

prop upset.discard@rank (0)-> EREPORT_BUS_ERROR;

/*
 * CPU integrated memory controller
 */

/*
 * True when the ereport's "resource" payload contains the ASRU of the
 * rank, or of the DIMM, being considered.
 */
#define CONTAINS_RANK (payloadprop_contains("resource", \
    asru(motherboard/chip/memory-controller/dram-channel/dimm/rank)) || \
    payloadprop_contains("resource", \
    asru(motherboard/chip/memory-controller/dram-channel/dimm)))

/*
 * Count of page faults recorded against a DIMM.
 * NOTE(review): not referenced by the rules visible in this file; the
 * dimm_ce rules below spell out count(stat.ce_pgflt@dimm) instead.
 */
#define CPU_MEM_CE_PGFLTS \
    (count(stat.ce_pgflt@motherboard/chip/memory-controller/dram-channel/dimm))

engine stat.ce_pgflt@motherboard/chip/memory-controller/dram-channel/dimm;

event ereport.cpu.intel.quickpath.mem_ue@motherboard/chip/memory-controller
    {within(12s)};

event fault.memory.intel.page_ue@
    motherboard/chip/memory-controller/dram-channel/dimm/rank,
    message=0, response=0;    /* do not message individual pageflts */

/*
 * Uncorrectable error, page level: requires "physaddr" or "offset" and
 * copies them into the fault payload.
 */
prop fault.memory.intel.page_ue@
    motherboard/chip/memory-controller/dram-channel/dimm/rank
    { CONTAINS_RANK &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.intel.quickpath.mem_ue@motherboard/chip/memory-controller;

event fault.memory.intel.dimm_ue@
    motherboard/chip/memory-controller/dram-channel/dimm/rank;

prop fault.memory.intel.dimm_ue@
    motherboard/chip/memory-controller/dram-channel/dimm/rank
    { CONTAINS_RANK } (1)->
    ereport.cpu.intel.quickpath.mem_ue@motherboard/chip/memory-controller;

prop fault.memory.intel.dimm_ue@
    motherboard/chip/memory-controller/dram-channel/dimm/rank (0)->
    EREPORT_BUS_ERROR;

event ereport.cpu.intel.quickpath.mem_ce@
    motherboard/chip/memory-controller{within(12s)};

/*
 * Correctable error, page level: SERDed per rank; each page fault also
 * bumps the per-DIMM stat.ce_pgflt counter through count=.
 */
engine serd.memory.intel.page_ce@
    motherboard/chip/memory-controller/dram-channel/dimm/rank,
    N=PAGE_CE_COUNT, T=PAGE_CE_TIME;

event fault.memory.intel.page_ce@
    motherboard/chip/memory-controller/dram-channel/dimm/rank,
    message=0, response=0,
    count=stat.ce_pgflt@motherboard/chip/memory-controller/dram-channel/dimm,
    engine=serd.memory.intel.page_ce@
    motherboard/chip/memory-controller/dram-channel/dimm/rank;

prop fault.memory.intel.page_ce@
    motherboard/chip/memory-controller/dram-channel/dimm/rank
    { CONTAINS_RANK &&
    (payloadprop_defined("physaddr") || payloadprop_defined("offset")) &&
    SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.intel.quickpath.mem_ce@motherboard/chip/memory-controller;

/*
 * Correctable error, DIMM level.
 * NOTE(review): this engine takes its defaults from PAGE_CE_COUNT and
 * PAGE_CE_TIME, whereas the northbridge dimm_ce engine uses DIMM_CE_COUNT
 * and DIMM_CE_TIME -- confirm whether that difference is intended.
 */
engine serd.memory.intel.dimm_ce@
    motherboard/chip/memory-controller/dram-channel/dimm,
    N=PAGE_CE_COUNT, T=PAGE_CE_TIME;
event fault.memory.intel.dimm_ce@
    motherboard/chip/memory-controller/dram-channel/dimm,
    engine=serd.memory.intel.dimm_ce@
    motherboard/chip/memory-controller/dram-channel/dimm;
event error.memory.intel.dimm_ce@
    motherboard/chip/memory-controller/dram-channel/dimm;
prop fault.memory.intel.dimm_ce@
    motherboard/chip/memory-controller/dram-channel/dimm
    { !confprop_defined(dimm, "dimm-size") } (1)->
    error.memory.intel.dimm_ce@
    motherboard/chip/memory-controller/dram-channel/dimm;
prop error.memory.intel.dimm_ce@
    motherboard/chip/memory-controller/dram-channel/dimm
    { !confprop_defined(dimm, "dimm-size") &&
    count(stat.ce_pgflt@dimm) > 512 } (1)->
    ereport.cpu.intel.quickpath.mem_ce@motherboard/chip/memory-controller;

/*
 * CPU_MEM_DIMM_CE(dimm_size, n, t, fault_rate): for a DIMM whose
 * "dimm-size" configuration property equals dimm_size, call setserdn(n)
 * and setserdt(t) on the fault's engine and require more than fault_rate
 * counted page faults.  The macro body ends in ';', so invocations below
 * carry none.
 */
#define CPU_MEM_DIMM_CE(dimm_size, n, t, fault_rate) \
    prop fault.memory.intel.dimm_ce@ \
        motherboard/chip/memory-controller/dram-channel/dimm { \
        confprop(dimm, "dimm-size") == dimm_size && \
        setserdn(n) & setserdt(t) } (1)-> \
        error.memory.intel.dimm_ce@ \
        motherboard/chip/memory-controller/dram-channel/dimm; \
    prop error.memory.intel.dimm_ce@ \
        motherboard/chip/memory-controller/dram-channel/dimm { \
        confprop(dimm, "dimm-size") == dimm_size && \
        count(stat.ce_pgflt@dimm) > fault_rate } (1)-> \
        ereport.cpu.intel.quickpath.mem_ce@ \
        motherboard/chip/memory-controller;

CPU_MEM_DIMM_CE("16G", 16, 1week, 2000)
CPU_MEM_DIMM_CE("8G", 8, 1week, 2000)
CPU_MEM_DIMM_CE("4G", 4, 1week, 1500)
CPU_MEM_DIMM_CE("2G", 4, 2week, 1000)
CPU_MEM_DIMM_CE("1G", 4, 4week, 500)
CPU_MEM_DIMM_CE("512M", 4, 8week, 250)

/* mem_unknown ereports at any level are diagnosed to a discard upset. */
event ereport.cpu.intel.quickpath.mem_unknown@motherboard/chip/memory-controller {within(12s)};
event ereport.cpu.intel.quickpath.mem_unknown@motherboard/chip/memory-controller/dram-channel
    {within(12s)};
event ereport.cpu.intel.quickpath.mem_unknown@
    motherboard/chip/memory-controller/dram-channel/dimm/rank{within(12s)};
event upset.discard@motherboard/chip/memory-controller;
event upset.discard@motherboard/chip/memory-controller/dram-channel/dimm/rank;

prop upset.discard@motherboard/chip/memory-controller (0)->
    ereport.cpu.intel.quickpath.mem_unknown@motherboard/chip/memory-controller,
    ereport.cpu.intel.quickpath.mem_unknown@
    motherboard/chip/memory-controller/dram-channel;

prop upset.discard@
    motherboard/chip/memory-controller/dram-channel/dimm/rank (1)->
    ereport.cpu.intel.quickpath.mem_unknown@
    motherboard/chip/memory-controller/dram-channel/dimm/rank;

/* mem_parity: fault the memory controller. */
event ereport.cpu.intel.quickpath.mem_parity@motherboard/chip/memory-controller {within(1s)};
event fault.cpu.intel.quickpath.mem_parity@motherboard/chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_parity@motherboard/chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_parity@motherboard/chip/memory-controller;

/*
 * mem_addr_parity: fault the memory controller, and also the DIMM whose
 * ASRU appears in the ereport's "resource" payload.
 */
event ereport.cpu.intel.quickpath.mem_addr_parity@motherboard/chip/memory-controller {within(1s)};
event fault.cpu.intel.quickpath.mem_addr_parity@
    motherboard/chip/memory-controller/dram-channel/dimm;
event fault.cpu.intel.quickpath.mem_addr_parity@
    motherboard/chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_addr_parity@
    motherboard/chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_addr_parity@motherboard/chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_addr_parity@
    motherboard/chip/memory-controller/dram-channel/dimm
    { payloadprop_contains("resource", asru(motherboard/chip/memory-controller/dram-channel/dimm)) } (1)->
    ereport.cpu.intel.quickpath.mem_addr_parity@motherboard/chip/memory-controller;

/* mem_bad_addr: fault the memory controller. */
event ereport.cpu.intel.quickpath.mem_bad_addr@motherboard/chip/memory-controller {within(1s)};
event fault.cpu.intel.quickpath.mem_bad_addr@motherboard/chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_bad_addr@motherboard/chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_bad_addr@motherboard/chip/memory-controller;

/* mem_spare: fault the DIMM. */
event ereport.cpu.intel.quickpath.mem_spare@motherboard/chip/memory-controller {within(1s)};
event fault.cpu.intel.quickpath.mem_spare@
    motherboard/chip/memory-controller/dram-channel/dimm;

prop fault.cpu.intel.quickpath.mem_spare@
    motherboard/chip/memory-controller/dram-channel/dimm (1)->
    ereport.cpu.intel.quickpath.mem_spare@motherboard/chip/memory-controller;

/* mem_bad_id: fault the memory controller. */
event ereport.cpu.intel.quickpath.mem_bad_id@motherboard/chip/memory-controller {within(1s)};
event fault.cpu.intel.quickpath.mem_bad_id@motherboard/chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_bad_id@motherboard/chip/memory-controller (1)->
    ereport.cpu.intel.quickpath.mem_bad_id@motherboard/chip/memory-controller;

/*
 * mem_redundant: fault the DIMM, SERDed on the memory controller
 * (N=2, T=72h).
 */
event ereport.cpu.intel.quickpath.mem_redundant@motherboard/chip/memory-controller {within(1s)};
engine serd.cpu.intel.quickpath.mem_redundant@motherboard/chip/memory-controller,
    N=2, T=72h;
event fault.cpu.intel.quickpath.mem_redundant@
    motherboard/chip/memory-controller/dram-channel/dimm,
    engine=serd.cpu.intel.quickpath.mem_redundant@
    motherboard/chip/memory-controller;

prop fault.cpu.intel.quickpath.mem_redundant@
    motherboard/chip/memory-controller/dram-channel/dimm (1)->
    ereport.cpu.intel.quickpath.mem_redundant@
    motherboard/chip/memory-controller;

/*
 * STATUS_UC is true for an uncorrected error.  It was used below without
 * a definition anywhere in this file; it is defined here with the same
 * "error_uncorrected" test the simple-error propagations use, and guarded
 * so that a definition supplied from elsewhere still wins.
 * NOTE(review): assumes quickpath.interconnect ereports carry the
 * "error_uncorrected" payload property -- confirm against the ereport
 * generator.
 */
#ifndef STATUS_UC
#define STATUS_UC (payloadprop("error_uncorrected") == 1)
#endif

event ereport.cpu.intel.quickpath.interconnect@motherboard/chip
    {within(1s)};
event upset.cpu.intel.quickpath.interconnect@motherboard/chip;
/* Diagnose corrected events to upsets */
prop upset.cpu.intel.quickpath.interconnect@motherboard/chip
    { !STATUS_UC } (1)->
    ereport.cpu.intel.quickpath.interconnect@motherboard/chip;


engine serd.cpu.intel.quickpath.interconnect@motherboard/chip,
    N=3, T=72h;
event fault.cpu.intel.quickpath.interconnect@motherboard/chip,
    engine=serd.cpu.intel.quickpath.interconnect@motherboard/chip;

/*
 * Diagnose uncorrected events to faults.
 * NOTE(review): unlike the simple and compound uncorrected propagations,
 * this one applies no setserdincrement, so the fault is gated by the
 * N=3, T=72h engine above -- confirm that is the intended behaviour.
 */
prop fault.cpu.intel.quickpath.interconnect@motherboard/chip
    { STATUS_UC } (0)->
    ereport.cpu.intel.quickpath.interconnect@motherboard/chip;