/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Eversholt rules for generic AMD with on-chip memory-controller(s), as seen
 * in AMD family 0xf and 0x10.
 *
 * In the absence of any model-specific support, any memory errors that
 * are observed via MCA (typically through an on-chip memory-controller)
 * will surface as ereport.cpu.generic-x86.bus_interconnect_memory[_uc]
 * ereports and are diagnosed via generic rules in gcpu.esc.
 *
 * If full model-specific support is available, including full NorthBridge
 * support, then memory ereports will surface in a more-specific subclass
 * such as ereport.cpu.amd.mem_ce; these are diagnosed in amd64.esc.
 *
 * In the case where some "vendor generic" support is present, memory errors
 * are reported as ereport.cpu.generic-x86.mem_{ce,ue} and include a
 * syndrome and syndrome-type, and usually also a resource FMRI to identify
 * the affected resource. In the AMD case a resource FMRI is included for
 * those chip versions that include an Online Spare Control register; this
 * register provides counts of ECC errors seen per channel and chip-select
 * on a NorthBridge node. The resource FMRI has form
 * hc:///motherboard/chip/memory-controller/dram-channel/chip-select
 * in these cases.
 */

#pragma dictionary "GMCA"

/*
 * The number of pages that must be faulted on a chip-select for repeated
 * correctable errors before we will consider one of the component dimms
 * faulty.
 */
#define CS_DIMMSB_THRESH 64

/*
 * The maximum number of pages we will diagnose as faulty on any one
 * chip-select (must be at least CS_DIMMSB_THRESH). If a chip-select
 * has a fault that will affect zillions of pages this limit stops us
 * diagnosing excessive numbers of page faults.
 */
#define CS_PAGEFLT_MAX (2 * CS_DIMMSB_THRESH)

/*
 * SERD parameters for individual page faults. When more than PAGE_SB_COUNT
 * correctable ereports are experienced on a single chip-select within
 * PAGE_SB_TIME the engine will fire and we will fault the most recent
 * page.
 */
#define PAGE_SB_COUNT 3
#define PAGE_SB_TIME 24h

#define CSPATH chip/memory-controller/dram-channel/chip-select

/*
 * ADDR_VALID is true if the ereport payload includes IA32_MCi_ADDR.
 */
#define ADDR_VALID (payloadprop_defined("IA32_MCi_ADDR"))

/*
 * CONTAINS_CS is true if the resource nvlist array exists and one of its
 * members matches the chip-select path. This is used to constrain
 * propagations to those for which a resource element matches the
 * chip-select path of the propagation. This is necessary because the
 * detector element of memory ereports is a cpu and not the chip-select
 * itself.
 */
#define CONTAINS_CS (payloadprop_contains("resource", asru(CSPATH)))

/*
 * SET_ADDR copies the error address from the ereport into the asru-physaddr
 * payload member of the diagnosed fault so that the affected page can be
 * identified.
 */
#define SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))

/* Generic memory ereports. */
event ereport.cpu.generic-x86.mem_ce@chip/core/strand { within(1s) };
event ereport.cpu.generic-x86.mem_ue@chip/core/strand { within(1s) };

/*
 * ========= Propagations for correctable memory faults ==========
 * |                                                               |
 * | Discard mem_ce with no resource in the ereport payload.       |
 * | Discard mem_ce with no address info - we can't fault the      |
 * | corresponding page without it.                                |
 * |                                                               |
 * | For a mem_ce ereport detected by a given chip/cpu (as per     |
 * | the payload detector info) whose resource payload member      |
 * | includes a chip/memory-controller/dram-channel/chip-select    |
 * | (CSPATH) for the same chip number, diagnose to a fault event  |
 * | associated with a per-CSPATH SERD engine as long as we are    |
 * | below the page fault limit for this CSPATH (defined below);   |
 * | if we are over that limit then discard the event since we     |
 * | will already have faulted a dimm and there is no point in     |
 * | continuing to diagnose endless page faults from a dimm with   |
 * | something like a pin failure.                                 |
 * |                                                               |
 * | When the per-CSPATH SERD engine fires we fault the page       |
 * | containing the address included in the ereport that caused    |
 * | the trip, and increment a per-CSPATH counter to count page    |
 * | faults on that chip-select from repeated correctable errors.  |
 * |                                                               |
 * | A dimm_ce fault is diagnosed when we have faulted an          |
 * | excessive number of page_ce faults on a chip-select - more    |
 * | than CS_DIMMSB_THRESH.                                        |
 * |===============================================================|
 */

#define CS_PGFLT_LIMIT_REACHED (count(stat.cepgflt@CSPATH) > CS_PAGEFLT_MAX)
#define CS_DIMMSB_THRESH_REACHED \
    (count(stat.cepgflt@CSPATH) >= CS_DIMMSB_THRESH)

engine stat.cepgflt@CSPATH;
engine serd.memory.generic-x86.page_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
event fault.memory.generic-x86.page_ce@CSPATH,
    message=0, response=0,	/* do not message individual pageflts */
    count=stat.cepgflt@CSPATH,	/* increment on pageflt diagnosis */
    engine=serd.memory.generic-x86.page_ce@CSPATH;
engine serd.memory.generic-x86.dimm_ce@CSPATH, N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
event fault.memory.generic-x86.dimm_ce@CSPATH,
    engine=serd.memory.generic-x86.dimm_ce@CSPATH;

prop fault.memory.generic-x86.page_ce@CSPATH
    { ADDR_VALID && CONTAINS_CS && !CS_PGFLT_LIMIT_REACHED && SET_ADDR } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/core<>/strand<>;

prop fault.memory.generic-x86.dimm_ce@CSPATH
    { ADDR_VALID && CONTAINS_CS && CS_DIMMSB_THRESH_REACHED } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/core<>/strand<>;

event upset.memory.generic-x86.discard@chip/core/strand;
prop upset.memory.generic-x86.discard@chip/core/strand
    { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
    ereport.cpu.generic-x86.mem_ce@chip/core/strand;

/*
 * ========= Propagations for uncorrectable page faults ==========
 * |                                                               |
 * | A UE produces an immediate page fault.                        |
 * |===============================================================|
 */

event fault.memory.generic-x86.page_ue@CSPATH,
    message=0, response=0,	/* do not message individual pageflts */
    count=stat.cepgflt@CSPATH;	/* increment on pageflt diagnosis */
event fault.memory.generic-x86.dimm_ue@CSPATH;

prop fault.memory.generic-x86.page_ue@CSPATH
    { ADDR_VALID && CONTAINS_CS && SET_ADDR } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;

prop fault.memory.generic-x86.dimm_ue@CSPATH
    { ADDR_VALID && CONTAINS_CS } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;

event upset.memory.generic-x86.discard3@CSPATH;
prop upset.memory.generic-x86.discard3@CSPATH
    { !payloadprop_defined("resource") || !ADDR_VALID } (1)->
    ereport.cpu.generic-x86.mem_ue@chip/core<>/strand<>;

/*
 * ========= Propagations for GART Table Walk Errors =============
 * |                                                               |
 * | These are usually due to software mis-programming of the GART |
 * | TLB rather than from hardware errors. It would be incorrect   |
 * | to fault and potentially offline a cpu in response to these   |
 * | so they have their own fault class to facilitate us ignoring  |
 * | them.                                                         |
 * |===============================================================|
 */

event ereport.cpu.generic-x86.gart_tbl_walk@chip/core/strand { within(1s) };
event upset.cpu.generic-x86.gart_tbl_walk@chip/core/strand;

prop upset.cpu.generic-x86.gart_tbl_walk@chip/core/strand (1)->
    ereport.cpu.generic-x86.gart_tbl_walk@chip/core/strand;