xref: /illumos-gate/usr/src/cmd/fm/eversholt/files/common/disk.esc (revision fb2a9bae0030340ad72b9c26ba1ffee2ee3cafec)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
/* Message dictionary used for the diagnoses produced by this file. */
#pragma dictionary "DISK"

/* Every event in this file is declared against the "disk" component. */
#define	P	disk

/* The disk is named as both the FRU and the ASRU. */
fru P;
asru P;
32
33/*
 * Overall comments for this file:
35 * <disk-as-detector> The disk-as-detector DE provides the mapping between
36 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
37 */
38
39/*
40 * SERD engine for media error fault propagation:
41 *
42 * This strategy is designed to give a file system, like ZFS, the
43 * ability to attempt data recovery/relocation without faulting a disk.
44 * This implementation depends on a file system retry to the same lba
45 * to trigger a fault when recovery/relocation is not possible.
46 *
 * We let the engine propagate an error at most once per minute; if we
 * then still get 2 or more errors within 24 hours for the same LBA,
 * there is a fault.
49 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P,
    N=1, T=24h;

/*
 * disk-as-detector: fault events.
 *
 * Only the merr fault is attached to the SERD engine declared above;
 * the derr fault is declared without an engine.
 */
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
58
59/*
60 * The uderr fault will be defined at some future time.
61 * event fault.io.scsi.cmd.disk.dev.uderr@P;
62 */
63
64/*
65 * disk-as-detector: upset events.
66 * NOTE: For now we define an upset to implement discard.
67 */
/* Device-level (dev.*) upsets. */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;

/* Upsets declared directly under cmd.disk. */
event upset.io.scsi.cmd.disk.recovered@P;
event upset.io.scsi.cmd.disk.tran@P;
74
75/*
76 * disk-as-detector: ereports from the kernel.
77 *
78 * We don't know the topology for all scsi disks, but the kernel will always
79 * generate ereport telemetry assuming that we do. We define these ereports
80 * with 'discard_if_config_unknown=1', which permits ereports against things
81 * with unknown topology to be silently discarded.  The ereport data is logged
82 * in either case, and can be viewed via 'fmdump -eV'.
83 */
/* Each ereport carries discard_if_config_unknown=1. */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P,
    discard_if_config_unknown=1;
90
91/*
 * For some ereports we let the 'driver-assessment', communicated as part of
 * the ereport payload, determine fault vs. upset via propagation constraints.
94 */
/* True when the "driver-assessment" payload property contains "fatal". */
#define	DRIVER_ASSESSMENT_FATAL	\
	(payloadprop_contains("driver-assessment", "fatal"))

/* Logical complement of DRIVER_ASSESSMENT_FATAL. */
#define	DRIVER_ASSESSMENT_NONFATAL	\
	(!DRIVER_ASSESSMENT_FATAL)
98
99/*
 * disk-as-detector: propagations from faults (based on
 * DRIVER_ASSESSMENT_FATAL).
 * We need to set additional fault payloads to indicate fault details.
 * The payloads we may need are as follows:
104 * fault.io.scsi.cmd.disk.dev.rqs.derr
105 *     op_code, key, asc, ascq
106 * fault.io.scsi.cmd.disk.dev.rqs.merr
107 *     op_code, key, asc, ascq, lba
108 */
/*
 * A derr ereport assessed as fatal propagates from the derr fault.
 * The key, asc and ascq payload properties are copied from the ereport
 * into the fault; the constraint terms are evaluated left to right.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P {
	DRIVER_ASSESSMENT_FATAL &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq"))
    };
114
115/*
 * Use setserdsuffix with the specific LBA, so that the serd engine only
 * triggers if the fault recurs on the same LBA.
118 */
/*
 * A merr ereport assessed as fatal propagates from the merr fault.
 * The SERD suffix is set from the "lba" payload property, then key,
 * asc, ascq and lba are copied from the ereport into the fault.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P {
	DRIVER_ASSESSMENT_FATAL &&
	setserdsuffix(payloadprop("lba")) &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq")) &&
	setpayloadprop("lba", payloadprop("lba"))
    };
126
127/*
128 * NOTE: this propagation uses the "may" propagation of eversholt.
129 * The ereport need never exist. It's just a way of making
130 * the diagnosis wait for the within time on that ereport
131 * to complete. Once it has completed the diagnosis continues
132 * even though the dummy ereport didn't occur.
133 */
/* Placeholder ereport whose only role is its 60 second "within" window. */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P { within(60s) };

/* (0) marks this as a "may" propagation: the dummy ereport need not occur. */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
137
138/*
139 * The uderr fault will be propagated at some future time.
140 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
141 *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
142 */
143
144/*
145 * disk-as-detector: propagations from upsets(based on
146 * DRIVER_ASSESSMENT_NONFATAL).
147 */
/* derr ereport not assessed as fatal: explained by the derr upset. */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P {
	DRIVER_ASSESSMENT_NONFATAL
    };

/* merr ereport not assessed as fatal: explained by the merr upset. */
prop upset.io.scsi.cmd.disk.dev.rqs.merr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P {
	DRIVER_ASSESSMENT_NONFATAL
    };
153
154/*
155 * disk-as-detector: propagations from upsets(independent of
156 * driver-assessment)
157 */
158
/*
 * Each of these ereports maps straight to the upset of the same name;
 * no constraint is applied.
 */
prop upset.io.scsi.cmd.disk.dev.serr@P ->
    ereport.io.scsi.cmd.disk.dev.serr@P;

prop upset.io.scsi.cmd.disk.dev.uderr@P ->
    ereport.io.scsi.cmd.disk.dev.uderr@P;

prop upset.io.scsi.cmd.disk.recovered@P ->
    ereport.io.scsi.cmd.disk.recovered@P;

prop upset.io.scsi.cmd.disk.tran@P ->
    ereport.io.scsi.cmd.disk.tran@P;
170
171/*
172 * --------------------------------------
173 * The remainder of this file contains rules associated with the operation of
174 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
175 *
176 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
177 * generated by the disk-transport fmd module, and the resulting faults.
178 */
179
180/*
181 * Fault events.
182 */
/*
 * The three disk faults share the same properties: FITrate=10, with
 * the disk itself (P) named as both FRU and ASRU.  Each property is
 * given exactly once per event.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;
189
190/*
191 * ereports.
192 */
event ereport.io.scsi.disk.over-temperature@P;
event ereport.io.scsi.disk.predictive-failure@P;
event ereport.io.scsi.disk.self-test-failure@P;

/*
 * Propagations: each ereport is tied to the fault.io.disk fault of the
 * same name, with no constraints.
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;

prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P;

prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;
208