/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * Eversholt rules for generic AMD processors with on-chip
 * memory-controller(s), as seen in AMD family 0xf and 0x10.
 *
 * How a memory error is reported depends on the level of support present:
 *
 *   - No model-specific support.  Memory errors observed via MCA (typically
 *     through an on-chip memory-controller) surface as
 *     ereport.cpu.generic-x86.bus_interconnect_memory[_uc] ereports and
 *     are diagnosed via the generic rules in gcpu.esc.
 *
 *   - Full model-specific support, including full NorthBridge support.
 *     Memory ereports surface in a more-specific subclass such as
 *     ereport.cpu.amd.mem_ce; these are diagnosed in amd64.esc.
 *
 *   - Some "vendor generic" support.  Memory errors are reported as
 *     ereport.cpu.generic-x86.mem_{ce,ue} and include a syndrome and
 *     syndrome-type, and usually also a resource FMRI to identify the
 *     affected resource.
 *
 * In the AMD case a resource FMRI is included for those chip versions that
 * include an Online Spare Control register; this register provides counts
 * of ECC errors seen per channel and chip-select on a NorthBridge node.
 * In these cases the resource FMRI has the form
 *
 *	hc:///motherboard/chip/memory-controller/dram-channel/chip-select
 */

#pragma dictionary "GMCA"

/*
 * CS_DIMMSB_THRESH
 *	The number of pages that must be faulted on a chip-select for
 *	repeated correctable errors before we will consider one of the
 *	component dimms faulty.
 */
#define	CS_DIMMSB_THRESH	64

/*
 * CS_PAGEFLT_MAX
 *	The maximum number of pages we will diagnose as faulty on any one
 *	chip-select (must be at least CS_DIMMSB_THRESH).  If a chip-select
 *	has a fault that will affect a very large number of pages, this
 *	limit stops us diagnosing excessive numbers of page faults.
 */
#define	CS_PAGEFLT_MAX		(2 * CS_DIMMSB_THRESH)

/*
 * PAGE_SB_COUNT, PAGE_SB_TIME
 *	SERD parameters for individual page faults.  When more than
 *	PAGE_SB_COUNT correctable ereports are experienced on a single
 *	chip-select within PAGE_SB_TIME the engine will fire and we will
 *	fault the most recent page.
 */
#define	PAGE_SB_COUNT		3
#define	PAGE_SB_TIME		24h

fru chip;

/* Path of a chip-select; used for every per-chip-select declaration below. */
#define	CSPATH	chip/memory-controller/dram-channel/chip-select

asru chip/cpu;
asru CSPATH;

/*
 * ADDR_VALID
 *	True if the ereport payload includes IA32_MCi_ADDR.
 */
#define	ADDR_VALID	(payloadprop_defined("IA32_MCi_ADDR"))

/*
 * CONTAINS_CS
 *	True if the resource nvlist array exists and one of its members
 *	matches the chip-select path.  This is used to constrain
 *	propagations to those for which a resource element matches the
 *	chip-select path of the propagation.  This is necessary because
 *	the detector element of memory ereports is a cpu and not the
 *	chip-select itself.
 */
#define	CONTAINS_CS	(payloadprop_contains("resource", asru(CSPATH)))

/*
 * SET_ADDR
 *	Sets the "asru-physaddr" payload property from the IA32_MCi_ADDR
 *	member of the ereport payload.
 */
#define	SET_ADDR \
	(setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))

/*
 * Generic memory ereports.
 */
event ereport.cpu.generic-x86.mem_ce@chip/cpu
	{ within(1s) };
event ereport.cpu.generic-x86.mem_ue@chip/cpu
	{ within(1s) };

/*
 * ---------------------------------------------------------------------
 * Propagations for correctable page faults
 * ---------------------------------------------------------------------
 *
 * Discard mem_ce with no resource in the ereport payload.
 * Discard mem_ce with no address info - we can't fault the corresponding
 * page without it.
 *
 * For a mem_ce ereport detected by a given chip/cpu (as per the payload
 * detector info) whose resource payload member includes a
 * chip/memory-controller/dram-channel/chip-select (CSPATH) for the same
 * chip number, diagnose to an upset event associated with a per-CSPATH
 * SERD engine as long as we are below the page fault limit for this
 * CSPATH (defined below); if we are over that limit then discard the
 * event since we will already have faulted a dimm and there is no point
 * in continuing to diagnose endless page faults from a dimm with
 * something like a pin failure.
 *
 * When the per-CSPATH SERD engine fires we fault the page containing the
 * address included in the ereport that caused the trip, and increment a
 * per-CSPATH counter to count page faults on that chip-select from
 * repeated correctable errors.
 *
 * A mem_ue ereport produces an immediate page_ue fault.
 */

/*
 * Counter for page faults diagnosed on a chip-select.
 */
engine stat.cepgflt@CSPATH;

/* True once the chip-select's page fault count exceeds CS_PAGEFLT_MAX. */
#define	CS_PGFLT_LIMIT_REACHED \
	(count(stat.cepgflt@CSPATH) > CS_PAGEFLT_MAX)

/*
 * Page fault event for repeated correctable errors.
 *
 *	FITrate	meaningless
 *	message	0 - do not message individual page faults
 *	count	stat incremented on each page fault diagnosis
 *	action	rewrite-ASRU, to identify the page in the chip-select
 */
event fault.memory.generic-x86.page_ce@CSPATH,
	FITrate=1000,
	message=0,
	ASRU=CSPATH,
	count=stat.cepgflt@CSPATH,
	action=confcall("rewrite-ASRU");

/*
 * Upset to diagnose correctable ereports to.
 */
event upset.memory.generic-x86.page_ce@CSPATH,
	engine=serd.memory.generic-x86.page_ce@CSPATH;

/*
 * Synthetic ereport generated when the page_ce SERD engine trips.
 */
event ereport.memory.generic-x86.page_ce_trip@CSPATH
	{ within(1s) };

/*
 * SERD engine for each chip-select.
 */
engine serd.memory.generic-x86.page_ce@CSPATH,
	N=PAGE_SB_COUNT,
	T=PAGE_SB_TIME,
	method=persistent,
	trip=ereport.memory.generic-x86.page_ce_trip@CSPATH;

/*
 * Upset to discard events to when we're over the page fault limit.
 */
event upset.memory.generic-x86.overpgfltlimit@CSPATH;

/*
 * Discard correctable ereports with no resource or no address info.
 */
event upset.memory.generic-x86.discard@chip/cpu;

prop upset.memory.generic-x86.discard@chip/cpu
	{ !payloadprop_defined("resource") || !ADDR_VALID }
	(1)-> ereport.cpu.generic-x86.mem_ce@chip/cpu;

/*
 * For as long as we are below the page fault limit, diagnose correctable
 * ereport observations as upsets to feed the SERD engine.
 */
prop upset.memory.generic-x86.page_ce@CSPATH
	{ ADDR_VALID && CONTAINS_CS && !CS_PGFLT_LIMIT_REACHED }
	(0)-> ereport.cpu.generic-x86.mem_ce@chip/cpu;

/*
 * Discard ereports if we are above the page fault limit on this
 * chip-select.
 */
prop upset.memory.generic-x86.overpgfltlimit@CSPATH
	{ ADDR_VALID && CONTAINS_CS && CS_PGFLT_LIMIT_REACHED }
	(1)-> ereport.cpu.generic-x86.mem_ce@chip/cpu;

/*
 * Diagnose a page fault when the page fault SERD engine trips.
 */
prop fault.memory.generic-x86.page_ce@CSPATH
	(1)-> ereport.memory.generic-x86.page_ce_trip@CSPATH;

/*
 * Include address info in the page fault diagnosed, for rewrite-ASRU.
 */
prop fault.memory.generic-x86.page_ce@CSPATH
	{ ADDR_VALID && CONTAINS_CS && SET_ADDR }
	(0)-> ereport.cpu.generic-x86.mem_ce@chip/cpu;

/*
 * ---------------------------------------------------------------------
 * Propagations for correctable DIMM faults
 * ---------------------------------------------------------------------
 *
 * A dimm_ce fault is diagnosed when we have diagnosed an excessive number
 * of page faults on a chip-select; the rule below checks for the
 * per-chip-select page fault count being equal to CS_DIMMSB_THRESH.
 *
 * A dimm_ue fault is diagnosed on the first uncorrectable ereport from a
 * chip-select.
 */

/*
 * DIMM fault event for CE failures.
 *
 *	FITrate	meaningless
 *	action	rewrite-ASRU, to rewrite in the "mem" FMRI scheme
 */
event fault.memory.generic-x86.dimm_ce@CSPATH,
	ASRU=CSPATH,
	FITrate=1000,
	action=confcall("rewrite-ASRU");

/* True when the chip-select's page fault count equals CS_DIMMSB_THRESH. */
#define	CS_DIMMSB_THRESH_REACHED \
	(count(stat.cepgflt@CSPATH) == CS_DIMMSB_THRESH)

/*
 * This upset is diagnosed in parallel with
 * upset.memory.generic-x86.page_ce on the CSPATH, and the associated SERD
 * engine has the same parameters as serd.memory.generic-x86.page_ce@CSPATH
 * so they fire at the same time.  When this one fires we check whether we
 * have reached the diagnosis threshold for a dimm_ce.
 */
event upset.memory.generic-x86.dimm_ce@CSPATH,
	engine=serd.memory.generic-x86.dimm_ce_limitchk@CSPATH;

/* Synthetic ereport generated when the dimm_ce_limitchk engine trips. */
event ereport.memory.generic-x86.dimm_ce_limitchk@CSPATH
	{ within(1s) };

engine serd.memory.generic-x86.dimm_ce_limitchk@CSPATH,
	N=PAGE_SB_COUNT,
	T=PAGE_SB_TIME,
	method=persistent,
	trip=ereport.memory.generic-x86.dimm_ce_limitchk@CSPATH;

prop upset.memory.generic-x86.dimm_ce@CSPATH
	{ ADDR_VALID && CONTAINS_CS }
	(0)-> ereport.cpu.generic-x86.mem_ce@chip/cpu;

/* Threshold reached at the limit check: diagnose the dimm fault. */
prop fault.memory.generic-x86.dimm_ce@CSPATH
	{ CS_DIMMSB_THRESH_REACHED }
	(0)-> ereport.memory.generic-x86.dimm_ce_limitchk@CSPATH;

/* Threshold not reached at the limit check: diagnose to a discard upset. */
event upset.memory.generic-x86.discard2@CSPATH;

prop upset.memory.generic-x86.discard2@CSPATH
	{ !CS_DIMMSB_THRESH_REACHED }
	(0)-> ereport.memory.generic-x86.dimm_ce_limitchk@CSPATH;

/*
 * ---------------------------------------------------------------------
 * Propagations for uncorrectable page faults
 * ---------------------------------------------------------------------
 *
 * A UE produces an immediate page fault.  But we also want a
 * corresponding dimm fault, and since we do not like multi-entry suspect
 * lists we arrange two distinct fault management exercises by diagnosing
 * a mem_ue to two upset events that feed instant-trip SERD engines.  Yuck.
 */

/*
 * Page fault event for uncorrectable errors.
 *
 *	FITrate	meaningless
 *	message	0 - do not message individual page faults
 *	count	stat incremented on each page fault diagnosis
 *	action	rewrite-ASRU, to identify the page in the chip-select
 */
event fault.memory.generic-x86.page_ue@CSPATH,
	FITrate=1000,
	message=0,
	count=stat.cepgflt@CSPATH,
	action=confcall("rewrite-ASRU");

/*
 * Upset for page fault.
 */
event upset.memory.generic-x86.page_ue@CSPATH,
	engine=serd.memory.generic-x86.page_ue@CSPATH;

/*
 * Synthetic ereport generated when the page_ue SERD engine trips.
 */
event ereport.memory.generic-x86.page_ue_trip@CSPATH
	{ within(1s) };

/*
 * Instant-trip engine for page fault: N=0 so it trips on the first upset.
 */
engine serd.memory.generic-x86.page_ue@CSPATH,
	N=0,
	T=1h,
	method=persistent,
	trip=ereport.memory.generic-x86.page_ue_trip@CSPATH;

/*
 * Discard uncorrectable ereports with no resource or no address info.
 */
event upset.memory.generic-x86.discard3@CSPATH;

prop upset.memory.generic-x86.discard3@CSPATH
	{ !payloadprop_defined("resource") || !ADDR_VALID }
	(1)-> ereport.cpu.generic-x86.mem_ue@chip/cpu;

/*
 * Diagnose a page_ue upset on a mem_ue event.
 */
prop upset.memory.generic-x86.page_ue@CSPATH
	{ ADDR_VALID && CONTAINS_CS }
	(0)-> ereport.cpu.generic-x86.mem_ue@chip/cpu;

/*
 * On the immediate SERD trip diagnose a page fault.
 */
prop fault.memory.generic-x86.page_ue@CSPATH
	(1)-> ereport.memory.generic-x86.page_ue_trip@CSPATH;

/*
 * Include address info in the page fault diagnosed, for rewrite-ASRU.
 */
prop fault.memory.generic-x86.page_ue@CSPATH
	{ ADDR_VALID && CONTAINS_CS && SET_ADDR }
	(0)-> ereport.cpu.generic-x86.mem_ue@chip/cpu;

/*
 * ---------------------------------------------------------------------
 * Propagations for uncorrectable dimm faults
 * ---------------------------------------------------------------------
 *
 * A UE produces an immediate dimm fault.  As explained in the
 * uncorrectable page fault section above, we split the exercise in two
 * in order to produce independent page_ue and dimm_ue diagnoses.
 */

/*
 * Dimm fault for an uncorrectable error.
 *
 *	FITrate	meaningless
 *	action	rewrite-ASRU, to rewrite in the "mem" FMRI scheme
 */
event fault.memory.generic-x86.dimm_ue@CSPATH,
	ASRU=CSPATH,
	FITrate=1000,
	action=confcall("rewrite-ASRU");

/*
 * Upset for dimm fault.
 */
event upset.memory.generic-x86.dimm_ue@CSPATH,
	engine=serd.memory.generic-x86.dimm_ue@CSPATH;

/*
 * Synthetic ereport generated when the dimm_ue SERD engine trips.
 */
event ereport.memory.generic-x86.dimm_ue_trip@CSPATH
	{ within(1s) };

/*
 * Instant-trip engine for dimm fault: N=0 so it trips on the first upset.
 */
engine serd.memory.generic-x86.dimm_ue@CSPATH,
	N=0,
	T=1h,
	method=persistent,
	trip=ereport.memory.generic-x86.dimm_ue_trip@CSPATH;

/*
 * Diagnose a dimm_ue upset on a mem_ue event (in addition to the page_ue
 * upset).  Unlike the page_ue propagation this one is not constrained by
 * ADDR_VALID.
 */
prop upset.memory.generic-x86.dimm_ue@CSPATH
	{ CONTAINS_CS }
	(0)-> ereport.cpu.generic-x86.mem_ue@chip/cpu;

/*
 * On the immediate SERD trip diagnose a dimm fault.
 */
prop fault.memory.generic-x86.dimm_ue@CSPATH
	(1)-> ereport.memory.generic-x86.dimm_ue_trip@CSPATH;

/*
 * ---------------------------------------------------------------------
 * Propagations for GART Table Walk Errors
 * ---------------------------------------------------------------------
 *
 * These are usually due to software mis-programming of the GART TLB
 * rather than from hardware errors.  It would be incorrect to fault and
 * potentially offline a cpu in response to these, so they are diagnosed
 * to their own upset class to facilitate us ignoring them.
 */

event ereport.cpu.generic-x86.gart_tbl_walk@chip/cpu
	{ within(1s) };
event upset.cpu.generic-x86.gart_tbl_walk@chip/cpu;

prop upset.cpu.generic-x86.gart_tbl_walk@chip/cpu
	(1)-> ereport.cpu.generic-x86.gart_tbl_walk@chip/cpu;