/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * Eversholt rules for generic AMD with on-chip memory-controller(s), as seen
 * in AMD family 0xf and 0x10.
 *
 * In the absence of any model-specific support, any memory errors that
 * are observed via MCA (typically through an on-chip memory-controller)
 * will surface as ereport.cpu.generic-x86.bus_interconnect_memory[_uc]
 * ereports and are diagnosed via generic rules in gcpu.esc.
 *
 * If full model-specific support is available, including full NorthBridge
 * support, then memory ereports will surface in a more-specific subclass
 * such as ereport.cpu.amd.mem_ce; these are diagnosed in amd64.esc.
 *
 * In the case where some "vendor generic" support is present, memory errors
 * are reported as ereport.cpu.generic-x86.mem_{ce,ue} and include a
 * syndrome and syndrome-type, and usually also a resource FMRI to identify
 * the affected resource. In the AMD case a resource FMRI is included for
 * those chip versions that include an Online Spare Control register; this
 * register provides counts of ECC errors seen per channel and chip-select
 * on a NorthBridge node. The resource FMRI has form
 *     hc:///motherboard/chip/memory-controller/dram-channel/chip-select
 * in these cases.
 */

#pragma dictionary "GMCA"

/*
 * The number of pages that must be faulted on a chip-select for repeated
 * correctable errors before we will consider one of the component dimms
 * faulty.
 */
#define CS_DIMMSB_THRESH 64

/*
 * The maximum number of pages we will diagnose as faulty on any one
 * chip-select (must be at least CS_DIMMSB_THRESH). If a chip-select
 * has a fault that will affect zillions of pages this limit stops us
 * diagnosing excessive numbers of page faults.
 */
#define CS_PAGEFLT_MAX (2 * CS_DIMMSB_THRESH)

/*
 * SERD parameters for individual page faults. When more than PAGE_SB_COUNT
 * correctable ereports are experienced on a single chip-select within
 * PAGE_SB_TIME the engine will fire and we will fault the most recent
 * page.
 */
#define PAGE_SB_COUNT 3
#define PAGE_SB_TIME 24h

/*
 * Path, below the motherboard, of the chip-select that the fault events,
 * SERD engines and page-fault counter in this file are declared on.
 */
#define CSPATH chip/memory-controller/dram-channel/chip-select

/*
 * ADDR_VALID is true if the ereport payload includes IA32_MCi_ADDR.
 */
#define ADDR_VALID (payloadprop_defined("IA32_MCi_ADDR"))

/*
 * CONTAINS_CS is true if the resource nvlist array exists and one of its
 * members matches the chip-select path. This is used to constrain
 * propagations to those for which a resource element matches the
 * chip-select path of the propagation. This is necessary because the
 * detector element of memory ereports is a cpu and not the chip-select itself.
 */
#define CONTAINS_CS (payloadprop_contains("resource", asru(CSPATH)))

/*
 * SET_ADDR sets the "asru-physaddr" payload property to the value of the
 * ereport's IA32_MCi_ADDR payload member. It is used as the final term of
 * the page fault constraints, after ADDR_VALID has been checked.
 */
#define SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))

/* Generic memory ereports. */
event ereport.cpu.generic-x86.mem_ce@chip/cpu { within(1s) };
event ereport.cpu.generic-x86.mem_ue@chip/cpu { within(1s) };

/*
 * ========= Propagations for correctable memory faults ==========
 * |                                                               |
 * | Discard mem_ce with no resource in the ereport payload.       |
 * | Discard mem_ce with no address info - we can't fault the      |
 * | corresponding page without it.                                |
 * |                                                               |
 * | For a mem_ce ereport detected by a given chip/cpu (as per     |
 * | the payload detector info) whose resource payload member      |
 * | includes a chip/memory-controller/dram-channel/chip-select    |
 * | (CSPATH) for the same chip number, diagnose to a fault event  |
 * | associated with a per-CSPATH SERD engine as long as we are    |
 * | below the page fault limit for this CSPATH (defined below);   |
 * | if we are over that limit then discard the event since we     |
 * | will already have faulted a dimm and there is no point in     |
 * | continuing to diagnose endless page faults from a dimm with   |
 * | something like a pin failure.                                 |
 * |                                                               |
 * | When the per-CSPATH SERD engine fires we fault the page       |
 * | containing the address included in the ereport that caused    |
 * | the trip, and increment a per-CSPATH counter to count page    |
 * | faults on that chip-select from repeated correctable errors.  |
 * |                                                               |
 * | A dimm_ce fault is diagnosed when we have faulted an          |
 * | excessive number of page_ce faults on a chip-select -         |
 * | CS_DIMMSB_THRESH or more.                                     |
 * |===============================================================|
 */

/*
 * Both limits are tested against the same per-chip-select counter,
 * stat.cepgflt. Page faults stop being diagnosed once the count exceeds
 * CS_PAGEFLT_MAX; the dimm_ce propagation becomes eligible once the count
 * reaches CS_DIMMSB_THRESH.
 */
#define CS_PGFLT_LIMIT_REACHED (count(stat.cepgflt@CSPATH) > CS_PAGEFLT_MAX)
#define CS_DIMMSB_THRESH_REACHED \
    (count(stat.cepgflt@CSPATH) >= CS_DIMMSB_THRESH)

/*
 * Per-chip-select page fault counter. It is named in the count= property
 * of fault.memory.generic-x86.page_ce below and also of
 * fault.memory.generic-x86.page_ue further down, so both kinds of page
 * fault advance it.
 */
engine stat.cepgflt@CSPATH;

/* Page fault for repeated correctable errors, gated by its own SERD engine. */
engine serd.memory.generic-x86.page_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
event fault.memory.generic-x86.page_ce@CSPATH,
    message=0, response=0,      /* do not message individual pageflts */
    count=stat.cepgflt@CSPATH,  /* increment on pageflt diagnosis */
    engine=serd.memory.generic-x86.page_ce@CSPATH;

/* Dimm fault for correctable errors; uses the same N and T as page_ce. */
engine serd.memory.generic-x86.dimm_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
event fault.memory.generic-x86.dimm_ce@CSPATH,
    engine=serd.memory.generic-x86.dimm_ce@CSPATH;

/*
 * page_ce: the ereport must carry an address and name this chip-select,
 * and the chip-select must not be over its page fault limit. SET_ADDR is
 * last so the address is only recorded once the other terms hold.
 */
prop fault.memory.generic-x86.page_ce@CSPATH
    { ADDR_VALID && CONTAINS_CS && !CS_PGFLT_LIMIT_REACHED && SET_ADDR } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/cpu;

/*
 * dimm_ce: same address and chip-select checks, but only once the
 * chip-select has accumulated CS_DIMMSB_THRESH page faults.
 */
prop fault.memory.generic-x86.dimm_ce@CSPATH
    { ADDR_VALID && CONTAINS_CS && CS_DIMMSB_THRESH_REACHED } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/cpu;

/* Discard mem_ce ereports that lack a resource or an address. */
event upset.memory.generic-x86.discard@chip/cpu;
prop upset.memory.generic-x86.discard@chip/cpu
    { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/cpu;

/*
 * ========= Propagations for uncorrectable page faults ==========
 * |                                                               |
 * | A UE produces an immediate page fault. A dimm_ue fault on     |
 * | the same chip-select also propagates to the mem_ue ereport.   |
 * |===============================================================|
 */

/*
 * Neither UE fault names a SERD engine. page_ue increments the same
 * stat.cepgflt counter that the correctable-error limits are tested
 * against.
 */
event fault.memory.generic-x86.page_ue@CSPATH,
    message=0, response=0,      /* do not message individual pageflts */
    count=stat.cepgflt@CSPATH;  /* increment on pageflt diagnosis */
event fault.memory.generic-x86.dimm_ue@CSPATH;

/* page_ue: requires an address and a matching chip-select; records address. */
prop fault.memory.generic-x86.page_ue@CSPATH
    { ADDR_VALID && CONTAINS_CS && SET_ADDR } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/cpu;

/* dimm_ue: same checks as page_ue, without recording the address. */
prop fault.memory.generic-x86.dimm_ue@CSPATH
    { ADDR_VALID && CONTAINS_CS } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/cpu;

/*
 * Discard mem_ue ereports that lack a resource or an address. Unlike the
 * mem_ce discard upset, this one is declared on CSPATH rather than chip/cpu.
 */
event upset.memory.generic-x86.discard3@CSPATH;
prop upset.memory.generic-x86.discard3@CSPATH
    { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/cpu;

/*
 * ========= Propagations for GART Table Walk Errors =============
 * |                                                               |
 * | These are usually due to software mis-programming of the GART |
 * | TLB rather than from hardware errors. It would be incorrect   |
 * | to fault and potentially offline a cpu in response to these   |
 * | so they have their own upset class to facilitate us ignoring  |
 * | them.                                                         |
 * |===============================================================|
 */

event ereport.cpu.generic-x86.gart_tbl_walk@chip/cpu { within(1s) };
event upset.cpu.generic-x86.gart_tbl_walk@chip/cpu;

/* Unconditional: every gart_tbl_walk ereport is explained by the upset. */
prop upset.cpu.generic-x86.gart_tbl_walk@chip/cpu (1)->
    ereport.cpu.generic-x86.gart_tbl_walk@chip/cpu;