1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 /*
29 * Support routines for managing potential page and bank faults that have
30 * been deferred due to a datapath error. Currently deferment only occurs
31 * if a memory UE occurs while a datapath error is active. When this happens
32 * a page case is created with a special subtype of CMD_PTR_DP_PAGE_DEFER. An
33 * entry (a cmd_dp_defer_t) is added to a list of deferred pages. The entry
34 * links to the cmd_page_t in the cmd_pages list and also keeps track of what
35 * memory controller ids are associated with the first AFAR and any more that
36 * are seen while the page is deferred. This information is used to determine
 * if the page should be faulted or if the fault should be skipped because an
38 * intervening datapath fault has occurred. If a page is faulted when it is
39 * replayed, the corresponding bank is faulted, too, since the original error
40 * was a UE. Note that no action is taken to undo any action taken by the
41 * kernel when the UE was detected. Currently the kernel will attempt to
42 * immediately retire the page where a UE is detected and the retire may or
43 * may not have completed by the time FMA receives an ereport. The possibility
44 * of a datapath fault resulting in memory UEs is very small, so the likelihood
45 * of encountering this scenario is also very small.
46 */
47
48 #include <cmd.h>
49 #include <cmd_dp.h>
50 #include <cmd_dp_page.h>
51 #include <cmd_bank.h>
52 #include <cmd_page.h>
53
54 #include <fm/fmd_api.h>
55 #include <sys/nvpair.h>
56
57 extern void cmd_bank_fault(fmd_hdl_t *, cmd_bank_t *);
58
59 static void
dp_page_defer_data_write(fmd_hdl_t * hdl,cmd_dp_defer_t * dpage)60 dp_page_defer_data_write(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage)
61 {
62 fmd_buf_write(hdl, dpage->dp_defer_page->page_case.cc_cp, "mcids",
63 &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids));
64 }
65
66 static void
dp_page_defer_data_restore(fmd_hdl_t * hdl,cmd_dp_defer_t * dpage)67 dp_page_defer_data_restore(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage)
68 {
69 fmd_buf_read(hdl, dpage->dp_defer_page->page_case.cc_cp, "mcids",
70 &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids));
71 }
72
73 static void
dp_page_defer_add_data(fmd_hdl_t * hdl,cmd_dp_defer_t * dpage,uint64_t afar)74 dp_page_defer_add_data(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage, uint64_t afar)
75 {
76 int mcid;
77 int i;
78
79 if (cmd_dp_get_mcid(afar, &mcid) < 0)
80 fmd_hdl_abort(hdl, "cmd_dp_get_mcid failed");
81
82 for (i = 0; i < DP_MAX_MCS; i++) {
83 if (dpage->dp_defer_mcids[i] == -1) {
84 dpage->dp_defer_mcids[i] = mcid;
85 break;
86 }
87 if (dpage->dp_defer_mcids[i] == mcid)
88 break;
89 }
90
91 if (i == DP_MAX_MCS)
92 fmd_hdl_abort(hdl, "too many mcids for deferred page");
93
94 dp_page_defer_data_write(hdl, dpage);
95 }
96
97 static cmd_dp_defer_t *
dp_page_defer_create(fmd_hdl_t * hdl,cmd_page_t * page,uint64_t afar)98 dp_page_defer_create(fmd_hdl_t *hdl, cmd_page_t *page, uint64_t afar)
99 {
100 cmd_dp_defer_t *dpage;
101 int i;
102
103 dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP);
104
105 dpage->dp_defer_page = page;
106
107 for (i = 0; i < DP_MAX_MCS; i++)
108 dpage->dp_defer_mcids[i] = -1;
109
110 dp_page_defer_add_data(hdl, dpage, afar);
111
112 cmd_list_append(&cmd.cmd_deferred_pages, dpage);
113
114 return (dpage);
115 }
116
117 static cmd_dp_defer_t *
dp_page_defer_lookup(cmd_page_t * page)118 dp_page_defer_lookup(cmd_page_t *page)
119 {
120 cmd_dp_defer_t *dpage;
121
122 for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL;
123 dpage = cmd_list_next(dpage)) {
124 if (dpage->dp_defer_page == page)
125 return (dpage);
126 }
127
128 return (NULL);
129 }
130
131 void
cmd_dp_page_defer(fmd_hdl_t * hdl,nvlist_t * modasru,fmd_event_t * ep,uint64_t afar)132 cmd_dp_page_defer(fmd_hdl_t *hdl, nvlist_t *modasru, fmd_event_t *ep,
133 uint64_t afar)
134 {
135 cmd_dp_defer_t *dpage;
136 cmd_page_t *page = cmd_page_lookup(afar);
137 const char *uuid;
138
139 if (page == NULL) {
140 page = cmd_page_create(hdl, modasru, afar);
141 dpage = dp_page_defer_create(hdl, page, afar);
142 page->page_case.cc_cp = cmd_case_create(hdl, &page->page_header,
143 CMD_PTR_DP_PAGE_DEFER, &uuid);
144 fmd_case_setprincipal(hdl, page->page_case.cc_cp, ep);
145 } else {
146 dpage = dp_page_defer_lookup(page);
147 if (dpage == NULL)
148 fmd_hdl_abort(hdl, "deferred page with no defer data");
149 fmd_case_add_ereport(hdl, page->page_case.cc_cp, ep);
150 }
151
152 dp_page_defer_add_data(hdl, dpage, afar);
153 }
154
155 int
cmd_dp_page_check(fmd_hdl_t * hdl,cmd_dp_defer_t * dpage)156 cmd_dp_page_check(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage)
157 {
158 int i;
159
160 for (i = 0; i < DP_MAX_MCS; i++) {
161 if (dpage->dp_defer_mcids[i] == -1)
162 break;
163 /*
164 * If there's no datapath fault corresponding to
165 * an mcid, that means the page incurred an error
166 * not attributable to a datapath fault.
167 */
168 if (cmd_dp_lookup_fault(hdl, dpage->dp_defer_mcids[i]) == 0)
169 return (0);
170 }
171
172 return (1);
173 }
174
175 void
cmd_dp_page_replay(fmd_hdl_t * hdl)176 cmd_dp_page_replay(fmd_hdl_t *hdl)
177 {
178 fmd_event_t *ep;
179 cmd_page_t *page;
180 cmd_bank_t *bank;
181 cmd_dp_defer_t *dpage;
182 nvlist_t *nvl;
183
184 while ((dpage = cmd_list_next(&cmd.cmd_deferred_pages)) != NULL) {
185 fmd_hdl_debug(hdl, "replaying deferred page, "
186 "pa=%llx\n", dpage->dp_defer_page->page_physbase);
187
188 page = dpage->dp_defer_page;
189
190 if (cmd_dp_page_check(hdl, dpage)) {
191 fmd_hdl_debug(hdl, "deferred memory UE overtaken by "
192 "dp fault");
193 CMD_STAT_BUMP(dp_ignored_ue);
194 fmd_case_close(hdl, page->page_case.cc_cp);
195 cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
196 fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
197 cmd_page_destroy(hdl, page);
198 continue;
199 }
200
201 nvl = page->page_asru_nvl;
202
203 bank = cmd_bank_lookup(hdl, nvl);
204
205 ep = fmd_case_getprincipal(hdl, page->page_case.cc_cp);
206 fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep);
207
208 bank->bank_nretired++;
209 bank->bank_retstat.fmds_value.ui64++;
210 cmd_bank_dirty(hdl, bank);
211
212 fmd_case_reset(hdl, page->page_case.cc_cp);
213 cmd_case_fini(hdl, page->page_case.cc_cp, FMD_B_TRUE);
214
215 page->page_case.cc_cp = NULL;
216 cmd_page_fault(hdl, nvl, nvl, ep, page->page_physbase);
217 cmd_bank_fault(hdl, bank);
218
219 cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
220 fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
221 }
222
223 fmd_hdl_debug(hdl, "cmd_page_defer_replay() complete\n");
224 }
225
226 void
cmd_dp_page_restore(fmd_hdl_t * hdl,cmd_page_t * page)227 cmd_dp_page_restore(fmd_hdl_t *hdl, cmd_page_t *page)
228 {
229 cmd_dp_defer_t *dpage;
230
231 dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP);
232
233 dpage->dp_defer_page = page;
234
235 dp_page_defer_data_restore(hdl, dpage);
236
237 cmd_list_append(&cmd.cmd_deferred_pages, dpage);
238 }
239
240 void
cmd_dp_page_validate(fmd_hdl_t * hdl)241 cmd_dp_page_validate(fmd_hdl_t *hdl)
242 {
243 cmd_dp_defer_t *dpage, *next;
244 cmd_page_t *page;
245
246 for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL;
247 dpage = next) {
248 next = cmd_list_next(dpage);
249
250 page = dpage->dp_defer_page;
251
252 if (!fmd_nvl_fmri_present(hdl, page->page_asru_nvl)) {
253 cmd_page_destroy(hdl, page);
254 cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
255 fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
256 }
257 }
258 }
259
260 /*ARGSUSED*/
261 int
cmd_dp_page_isdeferred(fmd_hdl_t * hdl,cmd_page_t * page)262 cmd_dp_page_isdeferred(fmd_hdl_t *hdl, cmd_page_t *page)
263 {
264 cmd_dp_defer_t *dpage, *next;
265
266 for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL;
267 dpage = next) {
268 next = cmd_list_next(dpage);
269
270 if (dpage->dp_defer_page == page) {
271 return (1);
272 }
273 }
274
275 return (0);
276 }
277