xref: /illumos-gate/usr/src/cmd/fm/eversholt/files/common/disk.esc (revision 96b6509c49b81cb0d89ec222d92d421d946caa0c)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
24 */
25
/* Bind the rules in this file to the "DISK" dictionary. */
#pragma dictionary "DISK"

/* Shorthand for the path that every event in this file is declared on. */
#define	P			disk

/* Declare P as both a fru and an asru. */
fru P;
asru P;
32
/*
 * Overall comments for this file:
 * <disk-as-detector> The disk-as-detector DE provides the mapping between
 * ereports generated by a kernel disk driver sd(7D) and resulting faults.
 */
38
/*
 * SERD engine for media error (merr) fault propagation.
 *
 * The goal is to let a file system such as ZFS attempt data
 * recovery/relocation without the disk being faulted right away.  The
 * scheme depends on the file system retrying the same LBA: a fault is
 * only triggered when recovery/relocation turns out not to be possible.
 *
 * The engine propagates at most one error per minute; if 2 or more errors
 * are then still seen for the same LBA within 24 hours, the result is a
 * fault.
 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P,
    N=1, T=24h;
52
/*
 * disk-as-detector: fault events.
 *
 * The merr fault is tied to the SERD engine of the same name; the derr
 * fault is declared without an engine.
 */
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
59
60/*
61 * The uderr fault will be defined at some future time.
62 * event fault.io.scsi.cmd.disk.dev.uderr@P;
63 */
64
/*
 * disk-as-detector: upset events.
 *
 * NOTE: for now an upset is simply the mechanism used to implement
 * "discard" for an ereport.
 */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.tran@P;
event upset.io.scsi.cmd.disk.recovered@P;
75
/*
 * disk-as-detector: ereports from the kernel.
 *
 * The topology is not known for every scsi disk, yet the kernel always
 * generates ereport telemetry as if it were.  Each ereport is therefore
 * declared with 'discard_if_config_unknown=1', which allows an ereport
 * against something with unknown topology to be discarded silently.  The
 * ereport data is logged either way and can be viewed with 'fmdump -eV'.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P,
    discard_if_config_unknown=1;
91
/*
 * For some ereports the 'driver-assessment' value carried in the ereport
 * payload decides, by way of propagation constraints, whether the outcome
 * is a fault or an upset.
 *
 * DRIVER_ASSESSMENT_FATAL	payload "driver-assessment" contains "fatal"
 * DRIVER_ASSESSMENT_NONFATAL	the negation of the above
 */
#define	DRIVER_ASSESSMENT_FATAL		\
	(payloadprop_contains("driver-assessment", "fatal"))
#define	DRIVER_ASSESSMENT_NONFATAL	(!DRIVER_ASSESSMENT_FATAL)
99
/*
 * disk-as-detector: propagations from faults (taken only when
 * DRIVER_ASSESSMENT_FATAL holds).
 *
 * Additional fault payload properties are set to convey the details of the
 * fault.  The properties that may be needed are:
 *
 *	fault.io.scsi.cmd.disk.dev.rqs.derr	op_code, key, asc, ascq
 *	fault.io.scsi.cmd.disk.dev.rqs.merr	op_code, key, asc, ascq, lba
 *
 * NOTE: the constraints below copy key, asc and ascq (plus lba for merr)
 * from the ereport payload; op_code is not copied at present.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P {
	DRIVER_ASSESSMENT_FATAL &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq"))
    };

/*
 * setserdsuffix() is handed the LBA from the ereport payload, so the SERD
 * engine only triggers when the error recurs on that same LBA.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P {
	DRIVER_ASSESSMENT_FATAL &&
	setserdsuffix(payloadprop("lba")) &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq")) &&
	setpayloadprop("lba", payloadprop("lba"))
    };
127
/*
 * NOTE: the propagation below is an eversholt "may" propagation.
 *
 * The dummy ereport never has to occur.  Declaring it with within(60s) is
 * just a way of making the diagnosis wait until that time window has run
 * out; once it has, the diagnosis continues even though the dummy ereport
 * was never seen.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P { within(60s) };

prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
138
139/*
140 * The uderr fault will be propagated at some future time.
141 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
142 *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
143 */
144
/*
 * disk-as-detector: propagations from upsets (taken only when
 * DRIVER_ASSESSMENT_NONFATAL holds).
 *
 * A derr or merr ereport whose driver-assessment is not "fatal" is
 * explained by an upset rather than by a fault.
 */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P {
	DRIVER_ASSESSMENT_NONFATAL
    };

prop upset.io.scsi.cmd.disk.dev.rqs.merr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P {
	DRIVER_ASSESSMENT_NONFATAL
    };
154
/*
 * disk-as-detector: propagations from upsets (independent of
 * driver-assessment).
 *
 * Each of these ereports propagates from its upset unconditionally; no
 * constraint is attached.
 */
prop upset.io.scsi.cmd.disk.dev.serr@P ->
    ereport.io.scsi.cmd.disk.dev.serr@P;

prop upset.io.scsi.cmd.disk.dev.uderr@P ->
    ereport.io.scsi.cmd.disk.dev.uderr@P;

prop upset.io.scsi.cmd.disk.recovered@P ->
    ereport.io.scsi.cmd.disk.recovered@P;

prop upset.io.scsi.cmd.disk.tran@P ->
    ereport.io.scsi.cmd.disk.tran@P;
171
172/*
173 * --------------------------------------
174 * The remainder of this file contains rules associated with the operation of
175 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
176 *
177 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
178 * generated by the disk-transport fmd module, and the resulting faults.
179 */
180
/*
 * Fault events.
 *
 * Each of the first three faults names P as both its FRU and its ASRU and
 * carries a FITrate of 10.  FITrate is given exactly once per event.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;
/*
 * NOTE(review): unlike the faults above, ssm-wearout is declared without
 * explicit FITrate/FRU/ASRU properties -- confirm that this is intended.
 */
event fault.io.disk.ssm-wearout@P;
191
/*
 * ereports, one for each of the fault.io.disk.* faults.
 */
event ereport.io.scsi.disk.over-temperature@P;
event ereport.io.scsi.disk.predictive-failure@P;
event ereport.io.scsi.disk.self-test-failure@P;
event ereport.io.scsi.disk.ssm-wearout@P;
199
/*
 * Propagations.
 *
 * Every fault.io.disk.* fault propagates to the ereport.io.scsi.disk.*
 * ereport of the same name.  Two of them also copy ereport payload values
 * into the fault payload under different property names.
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;

prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;

/*
 * additional-sense-code		-> asc
 * additional-sense-code-qualifier	-> ascq
 */
prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P {
	setpayloadprop("asc",
	    payloadprop("additional-sense-code")) &&
	setpayloadprop("ascq",
	    payloadprop("additional-sense-code-qualifier"))
    };

/*
 * current-ssm-wearout		-> current-wearout-percentage
 * threshold-ssm-wearout	-> threshold-wearout-percentage
 */
prop fault.io.disk.ssm-wearout@P ->
    ereport.io.scsi.disk.ssm-wearout@P {
	setpayloadprop("current-wearout-percentage",
	    payloadprop("current-ssm-wearout")) &&
	setpayloadprop("threshold-wearout-percentage",
	    payloadprop("threshold-ssm-wearout"))
    };
220