/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma dictionary "DISK"

/*
 * Every event, engine and propagation in this file is declared against
 * the single path component P.
 */
#define P disk

fru P;
asru P;

/*
 * Overall comments for this file:
 * <disk-as-detector> The disk-as-detector DE provides the mapping between
 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
 */

/*
 * SERD engine for media error fault propagation:
 *
 * This strategy is designed to give a file system, like ZFS, the
 * ability to attempt data recovery/relocation without faulting a disk.
 * This implementation depends on a file system retry to the same lba
 * to trigger a fault when recovery/relocation is not possible.
 *
 * We let the engine propagate an error at most once every minute, and
 * then if we still get 2 or more errors within 24 hours for the same
 * LBA, there is a fault.
 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P, N=1, T=24h;

/*
 * disk-as-detector: fault events.
 *
 * Only the merr fault is gated by the SERD engine declared above;
 * the derr fault has no engine attached.
 */
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;

/*
 * The uderr fault will be defined at some future time.
 * event fault.io.scsi.cmd.disk.dev.uderr@P;
 */

/*
 * disk-as-detector: upset events.
 * NOTE: For now we define an upset to implement discard.
 */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.tran@P;
event upset.io.scsi.cmd.disk.recovered@P;

/*
 * disk-as-detector: ereports from the kernel.
 *
 * We don't know the topology for all scsi disks, but the kernel will always
 * generate ereport telemetry assuming that we do. We define these ereports
 * with 'discard_if_config_unknown=1', which permits ereports against things
 * with unknown topology to be silently discarded. The ereport data is logged
 * in either case, and can be viewed via 'fmdump -eV'.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P, discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P, discard_if_config_unknown=1;

/*
 * For some ereports we let the 'driver-assessment', communicated as part of
 * the ereport payload, determine fault vs. upset via propagation constraints.
 * NONFATAL is defined as the exact negation of FATAL, so the two constraints
 * are mutually exclusive.
 */
#define DRIVER_ASSESSMENT_FATAL \
    (payloadprop_contains("driver-assessment", "fatal"))
#define DRIVER_ASSESSMENT_NONFATAL (!DRIVER_ASSESSMENT_FATAL)

/*
 * disk-as-detector: propagations from faults (based on
 * DRIVER_ASSESSMENT_FATAL).
 * We need to set additional fault payloads to indicate fault details.
 * The payloads we may need are listed as follows:
 * fault.io.scsi.cmd.disk.dev.rqs.derr
 *         op_code, key, asc, ascq
 * fault.io.scsi.cmd.disk.dev.rqs.merr
 *         op_code, key, asc, ascq, lba
 *
 * NOTE(review): op_code is listed above, but neither propagation below
 * copies it into the fault payload (only key, asc, ascq and, for merr,
 * lba are set). Confirm whether that omission is intentional.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_FATAL &&
    setpayloadprop("key", payloadprop("key")) &&
    setpayloadprop("asc", payloadprop("asc")) &&
    setpayloadprop("ascq", payloadprop("ascq"))};

/*
 * Utilize setserdsuffix with the specific LBA, so that
 * the serd engine only triggers if the fault recurs on the same LBA.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_FATAL &&
    setserdsuffix(payloadprop("lba")) &&
    setpayloadprop("key", payloadprop("key")) &&
    setpayloadprop("asc", payloadprop("asc")) &&
    setpayloadprop("ascq", payloadprop("ascq")) &&
    setpayloadprop("lba", payloadprop("lba"))};

/*
 * NOTE: this propagation uses the "may" propagation of eversholt.
 * The ereport need never exist. It's just a way of making
 * the diagnosis wait for the within time on that ereport
 * to complete. Once it has completed the diagnosis continues
 * even though the dummy ereport didn't occur.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P {within(60s)};
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;

/*
 * The uderr fault will be propagated at some future time.
 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
 *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
 */

/*
 * disk-as-detector: propagations from upsets (based on
 * DRIVER_ASSESSMENT_NONFATAL).
 */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P{ DRIVER_ASSESSMENT_NONFATAL };

prop upset.io.scsi.cmd.disk.dev.rqs.merr@P->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P{ DRIVER_ASSESSMENT_NONFATAL };

/*
 * disk-as-detector: propagations from upsets (independent of
 * driver-assessment).
 */

prop upset.io.scsi.cmd.disk.dev.serr@P->
    ereport.io.scsi.cmd.disk.dev.serr@P;

prop upset.io.scsi.cmd.disk.dev.uderr@P->
    ereport.io.scsi.cmd.disk.dev.uderr@P;

prop upset.io.scsi.cmd.disk.recovered@P->
    ereport.io.scsi.cmd.disk.recovered@P;

prop upset.io.scsi.cmd.disk.tran@P->
    ereport.io.scsi.cmd.disk.tran@P;

/*
 * --------------------------------------
 * The remainder of this file contains rules associated with the operation of
 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
 *
 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
 * generated by the disk-transport fmd module, and the resulting faults.
 */

/*
 * Fault events.
 *
 * Each fault names P as both its FRU and its ASRU and carries a single
 * FITrate property. FITrate was previously listed twice, with the same
 * value, on the predictive-failure and self-test-failure declarations;
 * the redundant copy has been dropped.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;

/*
 * ereports.
 */
event ereport.io.scsi.disk.over-temperature@P;
event ereport.io.scsi.disk.predictive-failure@P;
event ereport.io.scsi.disk.self-test-failure@P;

/*
 * Propagations: each fault maps to the ereport of the same name, with no
 * constraint attached.
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;

prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;

prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P;