xref: /titanic_52/usr/src/cmd/fm/eversholt/files/common/disk.esc (revision 17a2b317610f531d565bf4e940433aab2d9e6985)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
/* Select the "DISK" dictionary for the rules in this file. */
#pragma dictionary "DISK"

/* P abbreviates the "disk" path component that every event below hangs off. */
#define	P	disk

/* Declare P as both a FRU path and an ASRU path. */
fru P;
asru P;
31
/*
 * Overall comments for this file:
 * <disk-as-detector> The disk-as-detector DE provides the mapping between
 * the ereports generated by the kernel disk driver sd(7D) and the resulting
 * faults.
 */
37
/*
 * SERD engine for media error (merr) fault propagation.
 *
 * The goal is to give a file system, such as ZFS, the chance to attempt
 * data recovery/relocation without the disk being faulted.  The scheme
 * relies on the file system retrying the same LBA: a fault is triggered
 * only when recovery/relocation turned out not to be possible.
 *
 * The engine is allowed to propagate at most one error per minute; if 2 or
 * more errors are still seen within 24 hours for the same LBA, the disk is
 * faulted.
 */
engine serd.io.scsi.cmd.disk.dev.rqs.merr@P,
    N=1, T=24h;
50
/*
 * disk-as-detector: fault events.
 *
 * The merr fault is tied to the media-error SERD engine; the derr fault has
 * no engine attached.
 */
event fault.io.scsi.cmd.disk.dev.rqs.merr@P,
    engine=serd.io.scsi.cmd.disk.dev.rqs.merr@P;
event fault.io.scsi.cmd.disk.dev.rqs.derr@P;
57
58/*
59 * The uderr fault will be defined at some future time.
60 * event fault.io.scsi.cmd.disk.dev.uderr@P;
61 */
62
/*
 * disk-as-detector: upset events.
 *
 * NOTE: for now an upset is simply the mechanism used to discard an
 * ereport.  One upset is declared per ereport class.
 */
event upset.io.scsi.cmd.disk.dev.rqs.derr@P;
event upset.io.scsi.cmd.disk.dev.rqs.merr@P;
event upset.io.scsi.cmd.disk.dev.serr@P;
event upset.io.scsi.cmd.disk.dev.uderr@P;
event upset.io.scsi.cmd.disk.recovered@P;
event upset.io.scsi.cmd.disk.tran@P;
73
/*
 * disk-as-detector: ereports from the kernel.
 *
 * The topology is not known for every scsi disk, yet the kernel always
 * generates ereport telemetry as if it were.  Each ereport is therefore
 * declared with 'discard_if_config_unknown=1', so that ereports against
 * things with unknown topology are silently discarded.  The ereport data is
 * logged either way and can be inspected with 'fmdump -eV'.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.derr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.rqs.merr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.serr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.dev.uderr@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.recovered@P,
    discard_if_config_unknown=1;
event ereport.io.scsi.cmd.disk.tran@P,
    discard_if_config_unknown=1;
89
/*
 * For some ereports the 'driver-assessment' member of the ereport payload
 * decides between fault and upset.  The propagation constraints test it
 * through these macros: FATAL holds when the assessment contains "fatal",
 * NONFATAL is its negation.
 */
#define	DRIVER_ASSESSMENT_FATAL	\
	(payloadprop_contains("driver-assessment", "fatal"))
#define	DRIVER_ASSESSMENT_NONFATAL	\
	(!DRIVER_ASSESSMENT_FATAL)
97
/*
 * disk-as-detector: propagations from faults, taken only when
 * DRIVER_ASSESSMENT_FATAL holds.
 *
 * Extra members are set in the fault payload to carry the fault details.
 * The members that may be needed are:
 *   fault.io.scsi.cmd.disk.dev.rqs.derr: op_code, key, asc, ascq
 *   fault.io.scsi.cmd.disk.dev.rqs.merr: op_code, key, asc, ascq, lba
 *
 * NOTE(review): op_code is listed above, but the derr rule below copies
 * only key, asc and ascq.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.derr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P {
	DRIVER_ASSESSMENT_FATAL &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq"))
    };
113
/*
 * The ereport's LBA is passed to setserdsuffix(), so the SERD engine only
 * triggers when the error recurs on the same LBA.  The key, asc, ascq and
 * lba members are copied from the ereport into the fault payload.
 *
 * The assessment test comes first in the constraint so that nothing else is
 * evaluated for a non-fatal ereport.
 */
prop fault.io.scsi.cmd.disk.dev.rqs.merr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P {
	DRIVER_ASSESSMENT_FATAL &&
	setserdsuffix(payloadprop("lba")) &&
	setpayloadprop("key", payloadprop("key")) &&
	setpayloadprop("asc", payloadprop("asc")) &&
	setpayloadprop("ascq", payloadprop("ascq")) &&
	setpayloadprop("lba", payloadprop("lba"))
    };
125
/*
 * NOTE: this rule uses eversholt's "may" propagation, written as (0).
 *
 * The dummy ereport never has to exist.  It is only a way of making the
 * diagnosis wait until the 'within' time on that ereport (60 seconds) has
 * run out.  Once it has, the diagnosis continues even though the dummy
 * ereport did not occur.
 */
event ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P { within(60s) };

prop fault.io.scsi.cmd.disk.dev.rqs.merr@P (0) ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr.dummy@P;
136
137/*
138 * The uderr fault will be propagated at some future time.
139 * prop fault.io.scsi.cmd.disk.dev.uderr@P->
140 *     ereport.io.scsi.cmd.disk.dev.uderr@P{ DRIVER_ASSESSMENT_FATAL };
141 */
142
/*
 * disk-as-detector: propagations from upsets, taken only when
 * DRIVER_ASSESSMENT_NONFATAL holds.  These are the counterparts of the
 * derr and merr fault propagations.
 */
prop upset.io.scsi.cmd.disk.dev.rqs.derr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.derr@P {
	DRIVER_ASSESSMENT_NONFATAL
    };

prop upset.io.scsi.cmd.disk.dev.rqs.merr@P ->
    ereport.io.scsi.cmd.disk.dev.rqs.merr@P {
	DRIVER_ASSESSMENT_NONFATAL
    };
152
/*
 * disk-as-detector: propagations from upsets that do not look at the
 * driver-assessment.  Each of these ereports always maps to its upset.
 */
prop upset.io.scsi.cmd.disk.dev.serr@P ->
    ereport.io.scsi.cmd.disk.dev.serr@P;
prop upset.io.scsi.cmd.disk.dev.uderr@P ->
    ereport.io.scsi.cmd.disk.dev.uderr@P;
prop upset.io.scsi.cmd.disk.recovered@P ->
    ereport.io.scsi.cmd.disk.recovered@P;
prop upset.io.scsi.cmd.disk.tran@P ->
    ereport.io.scsi.cmd.disk.tran@P;
169
170/*
171 * --------------------------------------
172 * The remainder of this file contains rules associated with the operation of
173 * cmd/fm/modules/common/disk-monitor/disk_monitor.c code.
174 *
175 * The disk DE provides a very simple 1-to-1 mapping between SCSI disk events
176 * generated by the disk-transport fmd module, and the resulting faults.
177 */
178
/*
 * Fault events.
 *
 * All three faults carry the same properties: FITrate=10, with the disk
 * itself (P) named as both the FRU and the ASRU.  Each property is given
 * exactly once per declaration.
 */
event fault.io.disk.over-temperature@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.predictive-failure@P,
    FITrate=10, FRU=P, ASRU=P;
event fault.io.disk.self-test-failure@P,
    FITrate=10, FRU=P, ASRU=P;
188
/*
 * ereports, one per fault.io.disk.* fault declared in this file.
 */
event ereport.io.scsi.disk.over-temperature@P;

event ereport.io.scsi.disk.predictive-failure@P;

event ereport.io.scsi.disk.self-test-failure@P;
195
/*
 * Propagations: each ereport.io.scsi.disk.* ereport maps to the
 * fault.io.disk.* fault of the same name.
 *
 * For predictive-failure the ereport's "additional-sense-code" and
 * "additional-sense-code-qualifier" members are copied into the fault
 * payload as "asc" and "ascq".
 */
prop fault.io.disk.over-temperature@P ->
    ereport.io.scsi.disk.over-temperature@P;

prop fault.io.disk.predictive-failure@P ->
    ereport.io.scsi.disk.predictive-failure@P {
	setpayloadprop("asc", payloadprop("additional-sense-code")) &&
	setpayloadprop("ascq", payloadprop("additional-sense-code-qualifier"))
    };

prop fault.io.disk.self-test-failure@P ->
    ereport.io.scsi.disk.self-test-failure@P;
209