xref: /titanic_41/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_dp_page.c (revision 14ea4bb737263733ad80a36b4f73f681c30a6b45)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Support routines for managing potential page and bank faults that have
30  * been deferred due to a datapath error.  Currently deferment only occurs
31  * if a memory UE occurs while a datapath error is active.  When this happens
32  * a page case is created with a special subtype of CMD_PTR_DP_PAGE_DEFER.  An
33  * entry (a cmd_dp_defer_t) is added to a list of deferred pages.  The entry
34  * links to the cmd_page_t in the cmd_pages list and also keeps track of what
35  * memory controller ids are associated with the first AFAR and any more that
36  * are seen while the page is deferred.  This information is used to determine
37  * if the page should be faulted or if the fault should be skipped because an
38  * intervening datapath fault has occurred.  If a page is faulted when it is
39  * replayed, the corresponding bank is faulted, too, since the original error
40  * was a UE.  Note that no action is taken to undo any action taken by the
41  * kernel when the UE was detected.  Currently the kernel will attempt to
42  * immediately retire the page where a UE is detected and the retire may or
43  * may not have completed by the time FMA receives an ereport.  The possibility
44  * of a datapath fault resulting in memory UEs is very small, so the likelihood
45  * of encountering this scenario is also very small.
46  */
47 
48 #include <cmd.h>
49 #include <cmd_dp.h>
50 #include <cmd_dp_page.h>
51 #include <cmd_bank.h>
52 #include <cmd_page.h>
53 
54 #include <fm/fmd_api.h>
55 #include <sys/nvpair.h>
56 
57 extern void cmd_bank_fault(fmd_hdl_t *, cmd_bank_t *);
58 
59 static void
dp_page_defer_data_write(fmd_hdl_t * hdl,cmd_dp_defer_t * dpage)60 dp_page_defer_data_write(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage)
61 {
62 	fmd_buf_write(hdl, dpage->dp_defer_page->page_case.cc_cp, "mcids",
63 	    &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids));
64 }
65 
66 static void
dp_page_defer_data_restore(fmd_hdl_t * hdl,cmd_dp_defer_t * dpage)67 dp_page_defer_data_restore(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage)
68 {
69 	fmd_buf_read(hdl, dpage->dp_defer_page->page_case.cc_cp, "mcids",
70 	    &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids));
71 }
72 
73 static void
dp_page_defer_add_data(fmd_hdl_t * hdl,cmd_dp_defer_t * dpage,uint64_t afar)74 dp_page_defer_add_data(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage, uint64_t afar)
75 {
76 	int mcid;
77 	int i;
78 
79 	if (cmd_dp_get_mcid(afar, &mcid) < 0)
80 		fmd_hdl_abort(hdl, "cmd_dp_get_mcid failed");
81 
82 	for (i = 0; i < DP_MAX_MCS; i++) {
83 		if (dpage->dp_defer_mcids[i] == -1) {
84 			dpage->dp_defer_mcids[i] = mcid;
85 			break;
86 		}
87 		if (dpage->dp_defer_mcids[i] == mcid)
88 			break;
89 	}
90 
91 	if (i == DP_MAX_MCS)
92 		fmd_hdl_abort(hdl, "too many mcids for deferred page");
93 
94 	dp_page_defer_data_write(hdl, dpage);
95 }
96 
97 static cmd_dp_defer_t *
dp_page_defer_create(fmd_hdl_t * hdl,cmd_page_t * page,uint64_t afar)98 dp_page_defer_create(fmd_hdl_t *hdl, cmd_page_t *page, uint64_t afar)
99 {
100 	cmd_dp_defer_t *dpage;
101 	int i;
102 
103 	dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP);
104 
105 	dpage->dp_defer_page = page;
106 
107 	for (i = 0; i < DP_MAX_MCS; i++)
108 		dpage->dp_defer_mcids[i] = -1;
109 
110 	dp_page_defer_add_data(hdl, dpage, afar);
111 
112 	cmd_list_append(&cmd.cmd_deferred_pages, dpage);
113 
114 	return (dpage);
115 }
116 
117 static cmd_dp_defer_t *
dp_page_defer_lookup(cmd_page_t * page)118 dp_page_defer_lookup(cmd_page_t *page)
119 {
120 	cmd_dp_defer_t *dpage;
121 
122 	for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL;
123 	    dpage = cmd_list_next(dpage)) {
124 		if (dpage->dp_defer_page == page)
125 			return (dpage);
126 	}
127 
128 	return (NULL);
129 }
130 
131 void
cmd_dp_page_defer(fmd_hdl_t * hdl,nvlist_t * modasru,fmd_event_t * ep,uint64_t afar)132 cmd_dp_page_defer(fmd_hdl_t *hdl, nvlist_t *modasru, fmd_event_t *ep,
133     uint64_t afar)
134 {
135 	cmd_dp_defer_t *dpage;
136 	cmd_page_t *page = cmd_page_lookup(afar);
137 	const char *uuid;
138 
139 	if (page == NULL) {
140 		page = cmd_page_create(hdl, modasru, afar);
141 		dpage = dp_page_defer_create(hdl, page, afar);
142 		page->page_case.cc_cp = cmd_case_create(hdl, &page->page_header,
143 		    CMD_PTR_DP_PAGE_DEFER, &uuid);
144 		fmd_case_setprincipal(hdl, page->page_case.cc_cp, ep);
145 	} else {
146 		dpage = dp_page_defer_lookup(page);
147 		if (dpage == NULL)
148 			fmd_hdl_abort(hdl, "deferred page with no defer data");
149 		fmd_case_add_ereport(hdl, page->page_case.cc_cp, ep);
150 	}
151 
152 	dp_page_defer_add_data(hdl, dpage, afar);
153 }
154 
155 int
cmd_dp_page_check(fmd_hdl_t * hdl,cmd_dp_defer_t * dpage)156 cmd_dp_page_check(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage)
157 {
158 	int i;
159 
160 	for (i = 0; i < DP_MAX_MCS; i++) {
161 		if (dpage->dp_defer_mcids[i] == -1)
162 			break;
163 		/*
164 		 * If there's no datapath fault corresponding to
165 		 * an mcid, that means the page incurred an error
166 		 * not attributable to a datapath fault.
167 		 */
168 		if (cmd_dp_lookup_fault(hdl, dpage->dp_defer_mcids[i]) == 0)
169 			return (0);
170 	}
171 
172 	return (1);
173 }
174 
175 void
cmd_dp_page_replay(fmd_hdl_t * hdl)176 cmd_dp_page_replay(fmd_hdl_t *hdl)
177 {
178 	fmd_event_t *ep;
179 	cmd_page_t *page;
180 	cmd_bank_t *bank;
181 	cmd_dp_defer_t *dpage;
182 	nvlist_t *nvl;
183 
184 	while ((dpage = cmd_list_next(&cmd.cmd_deferred_pages)) != NULL) {
185 		fmd_hdl_debug(hdl, "replaying deferred page, "
186 		    "pa=%llx\n", dpage->dp_defer_page->page_physbase);
187 
188 		page = dpage->dp_defer_page;
189 
190 		if (cmd_dp_page_check(hdl, dpage)) {
191 			fmd_hdl_debug(hdl, "deferred memory UE  overtaken by "
192 			    "dp fault");
193 			CMD_STAT_BUMP(dp_ignored_ue);
194 			fmd_case_close(hdl, page->page_case.cc_cp);
195 			cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
196 			fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
197 			cmd_page_destroy(hdl, page);
198 			continue;
199 		}
200 
201 		nvl = page->page_asru_nvl;
202 
203 		bank = cmd_bank_lookup(hdl, nvl);
204 
205 		ep = fmd_case_getprincipal(hdl, page->page_case.cc_cp);
206 		fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep);
207 
208 		bank->bank_nretired++;
209 		bank->bank_retstat.fmds_value.ui64++;
210 		cmd_bank_dirty(hdl, bank);
211 
212 		fmd_case_reset(hdl, page->page_case.cc_cp);
213 		cmd_case_fini(hdl, page->page_case.cc_cp, FMD_B_TRUE);
214 
215 		page->page_case.cc_cp = NULL;
216 		cmd_page_fault(hdl, nvl, nvl, ep, page->page_physbase);
217 		cmd_bank_fault(hdl, bank);
218 
219 		cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
220 		fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
221 	}
222 
223 	fmd_hdl_debug(hdl, "cmd_page_defer_replay() complete\n");
224 }
225 
226 void
cmd_dp_page_restore(fmd_hdl_t * hdl,cmd_page_t * page)227 cmd_dp_page_restore(fmd_hdl_t *hdl, cmd_page_t *page)
228 {
229 	cmd_dp_defer_t *dpage;
230 
231 	dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP);
232 
233 	dpage->dp_defer_page = page;
234 
235 	dp_page_defer_data_restore(hdl, dpage);
236 
237 	cmd_list_append(&cmd.cmd_deferred_pages, dpage);
238 }
239 
240 void
cmd_dp_page_validate(fmd_hdl_t * hdl)241 cmd_dp_page_validate(fmd_hdl_t *hdl)
242 {
243 	cmd_dp_defer_t *dpage, *next;
244 	cmd_page_t *page;
245 
246 	for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL;
247 	    dpage = next) {
248 		next = cmd_list_next(dpage);
249 
250 		page = dpage->dp_defer_page;
251 
252 		if (!fmd_nvl_fmri_present(hdl, page->page_asru_nvl)) {
253 			cmd_page_destroy(hdl, page);
254 			cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
255 			fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
256 		}
257 	}
258 }
259 
260 /*ARGSUSED*/
261 int
cmd_dp_page_isdeferred(fmd_hdl_t * hdl,cmd_page_t * page)262 cmd_dp_page_isdeferred(fmd_hdl_t *hdl, cmd_page_t *page)
263 {
264 	cmd_dp_defer_t *dpage, *next;
265 
266 	for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL;
267 	    dpage = next) {
268 		next = cmd_list_next(dpage);
269 
270 		if (dpage->dp_defer_page == page) {
271 			return (1);
272 		}
273 	}
274 
275 	return (0);
276 }
277