xref: /illumos-gate/usr/src/cmd/fm/modules/common/cpumem-retire/cma_page.c (revision a07094369b21309434206d9b3601d162693466fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 /*
31  * Page retirement can be an extended process due to the fact that a retirement
32  * may not be possible when the original request is made.  The kernel will
33  * repeatedly attempt to retire a given page, but will not let us know when the
34  * page has been retired.  We therefore have to poll to see if the retirement
35  * has been completed.  This poll is implemented with a bounded exponential
36  * backoff to reduce the burden which we impose upon the system.
37  *
38  * To reduce the burden on fmd in the face of retirement storms, we schedule
39  * all retries as a group.  In the simplest case, we attempt to retire a single
40  * page.  When forced to retry, we initially schedule a retry at a configurable
41  * interval t.  If the retry fails, we schedule another at 2 * t, and so on,
42  * until t reaches the maximum interval (also configurable).  Future retries
43  * for that page will occur with t equal to the maximum interval value.  We
44  * will never give up on a retirement.
45  *
46  * With multiple retirements, the situation gets slightly more complicated.  As
47  * indicated above, we schedule retries as a group.  We don't want to deny new
48  * pages their short retry intervals, so we'll (re)set the retry interval to the
49  * value appropriate for the newest page.
50  */
51 
52 #include <cma.h>
53 
54 #include <time.h>
55 #include <fcntl.h>
56 #include <errno.h>
57 #include <unistd.h>
58 #include <strings.h>
59 #include <fm/fmd_api.h>
60 #include <fm/libtopo.h>
61 #include <sys/fm/protocol.h>
62 #include <sys/mem.h>
63 
64 static int
65 cma_page_cmd(fmd_hdl_t *hdl, int cmd, nvlist_t *nvl)
66 {
67 	mem_page_t mpage;
68 	char *fmribuf;
69 	size_t fmrisz;
70 	int fd, rc, err;
71 
72 	if ((fd = open("/dev/mem", O_RDONLY)) < 0)
73 		return (-1); /* errno is set for us */
74 
75 	if ((errno = nvlist_size(nvl, &fmrisz, NV_ENCODE_NATIVE)) != 0 ||
76 	    fmrisz > MEM_FMRI_MAX_BUFSIZE ||
77 	    (fmribuf = fmd_hdl_alloc(hdl, fmrisz, FMD_SLEEP)) == NULL) {
78 		(void) close(fd);
79 		return (-1); /* errno is set for us */
80 	}
81 
82 	if ((errno = nvlist_pack(nvl, &fmribuf, &fmrisz,
83 	    NV_ENCODE_NATIVE, 0)) != 0) {
84 		fmd_hdl_free(hdl, fmribuf, fmrisz);
85 		(void) close(fd);
86 		return (-1); /* errno is set for us */
87 	}
88 
89 	mpage.m_fmri = fmribuf;
90 	mpage.m_fmrisz = fmrisz;
91 
92 	if ((rc = ioctl(fd, cmd, &mpage)) < 0)
93 		err = errno;
94 
95 	fmd_hdl_free(hdl, fmribuf, fmrisz);
96 
97 	(void) close(fd);
98 
99 	if (rc < 0) {
100 		errno = err;
101 		return (-1);
102 	}
103 
104 	return (0);
105 }
106 
107 static void
108 cma_page_free(fmd_hdl_t *hdl, cma_page_t *page)
109 {
110 	if (page->pg_fmri != NULL)
111 		nvlist_free(page->pg_fmri);
112 	fmd_hdl_free(hdl, page, sizeof (cma_page_t));
113 }
114 
115 /*
116  * Retire the specified ASRU, referring to a memory page by PA or by DIMM
117  * offset (i.e. the encoded coordinates internal bank, row, and column).
118  * In the initial FMA implementation, fault.memory.page exported an ASRU
119  * with an explicit physical address, which is valid at the initial time of
120  * diagnosis but may not be later following DR, DIMM removal, or interleave
121  * changes.  On SPARC, this issue was solved by exporting the DIMM offset
122  * and pushing the entire FMRI to the platform memory controller through
123  * /dev/mem so it can derive the current PA from the DIMM and offset.
124  * On x64, we also use DIMM and offset, but the mem:/// unum string is an
125  * encoded hc:/// FMRI that is then used by the x64 memory controller driver.
126  * At some point these three approaches need to be rationalized: all platforms
127  * should use the same scheme, either with decoding in the kernel or decoding
128  * in userland (i.e. with a libtopo method to compute and update the PA).
129  */
/*ARGSUSED*/
void
cma_page_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid)
{
	cma_page_t *page;
	uint64_t pageaddr;
	char *unumstr;
	nvlist_t *asrucp = NULL;	/* modified copy of asru, if built */

	/* It should already be expanded, but we'll do it again anyway */
	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
		fmd_hdl_debug(hdl, "failed to expand page asru\n");
		cma_stats.bad_flts.fmds_value.ui64++;
		return;
	}

	/* The physical address is needed for debug output and retry state */
	if (nvlist_lookup_uint64(asru, FM_FMRI_MEM_PHYSADDR, &pageaddr) != 0) {
		fmd_hdl_debug(hdl, "mem fault missing '%s'\n",
		    FM_FMRI_MEM_PHYSADDR);
		cma_stats.bad_flts.fmds_value.ui64++;
		return;
	}

	/* Page retirement can be disabled via the cma_page_doretire tunable */
	if (!cma.cma_page_doretire) {
		fmd_hdl_debug(hdl, "suppressed retire of page %llx\n",
		    (u_longlong_t)pageaddr);
		cma_stats.page_supp.fmds_value.ui64++;
		return;
	}

	/* If the page is no longer present (e.g. after DR), close the case */
	if (!fmd_nvl_fmri_present(hdl, asru)) {
		fmd_hdl_debug(hdl, "page retire overtaken by events\n");
		cma_stats.page_nonent.fmds_value.ui64++;
		if (uuid != NULL)
			fmd_case_uuclose(hdl, uuid);
		return;
	}

	/*
	 * If the unum is an hc fmri string expand it to an fmri and include
	 * that in a modified asru nvlist.
	 */
	if (nvlist_lookup_string(asru, FM_FMRI_MEM_UNUM, &unumstr) == 0 &&
	    strncmp(unumstr, "hc:/", 4) == 0) {
		int err;
		nvlist_t *unumfmri;
		struct topo_hdl *thp = fmd_hdl_topology(hdl, TOPO_VERSION);

		if (topo_fmri_str2nvl(thp, unumstr, &unumfmri, &err) != 0) {
			fmd_hdl_debug(hdl, "page retire str2nvl failed: %s\n",
			    topo_strerror(err));
			return;
		}

		/* Duplicate the asru so the caller's nvlist stays unmodified */
		if (nvlist_dup(asru, &asrucp, 0) != 0) {
			fmd_hdl_debug(hdl, "page retire nvlist dup failed\n");
			nvlist_free(unumfmri);
			return;
		}

		/* nvlist_add_nvlist copies unumfmri; our copy is freed below */
		if (nvlist_add_nvlist(asrucp, FM_FMRI_MEM_UNUM "-fmri",
		    unumfmri) != 0) {
			fmd_hdl_debug(hdl, "page retire failed to add "
			    "unumfmri to modified asru");
			nvlist_free(unumfmri);
			nvlist_free(asrucp);
			return;
		}
		nvlist_free(unumfmri);
	}

	/* First attempt: ask the kernel to retire the page immediately */
	if (cma_page_cmd(hdl, MEM_PAGE_FMRI_RETIRE,
	    asrucp ? asrucp : asru) == 0) {
		fmd_hdl_debug(hdl, "retired page 0x%llx\n",
		    (u_longlong_t)pageaddr);
		cma_stats.page_flts.fmds_value.ui64++;
		if (uuid != NULL)
			fmd_case_uuclose(hdl, uuid);
		if (asrucp)
			nvlist_free(asrucp);
		return;
	} else if (errno != EAGAIN) {
		/* Only EAGAIN means "in progress"; anything else is fatal */
		fmd_hdl_debug(hdl, "retire of page 0x%llx failed, will not "
		    "retry: %s\n", (u_longlong_t)pageaddr, strerror(errno));
		if (asrucp)
			nvlist_free(asrucp);
		return;
	}

	/*
	 * The page didn't immediately retire.  We'll need to periodically
	 * check to see if it has been retired.
	 */
	fmd_hdl_debug(hdl, "page didn't retire - sleeping\n");

	page = fmd_hdl_zalloc(hdl, sizeof (cma_page_t), FMD_SLEEP);
	page->pg_addr = pageaddr;
	if (asrucp) {
		/* The retry entry takes ownership of the modified copy */
		page->pg_fmri = asrucp;
	} else {
		/*
		 * NOTE(review): if this dup fails, pg_fmri is left NULL;
		 * page_retry() appears to tolerate that and eventually gives
		 * up on the entry -- confirm.
		 */
		(void) nvlist_dup(asru, &page->pg_fmri, 0);
	}
	if (uuid != NULL)
		page->pg_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);

	page->pg_next = cma.cma_pages;
	cma.cma_pages = page;

	/*
	 * (Re)arm the shared retry timer at the minimum interval so the
	 * newest page gets the short initial retry described in the block
	 * comment at the top of this file.
	 */
	if (cma.cma_page_timerid != 0)
		fmd_timer_remove(hdl, cma.cma_page_timerid);

	cma.cma_page_curdelay = cma.cma_page_mindelay;

	cma.cma_page_timerid =
	    fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay);
}
246 
247 static int
248 page_retry(fmd_hdl_t *hdl, cma_page_t *page)
249 {
250 	if (page->pg_fmri != NULL && !fmd_nvl_fmri_present(hdl,
251 	    page->pg_fmri)) {
252 		fmd_hdl_debug(hdl, "page retire overtaken by events");
253 		cma_stats.page_nonent.fmds_value.ui64++;
254 
255 		if (page->pg_uuid != NULL)
256 			fmd_case_uuclose(hdl, page->pg_uuid);
257 		return (1); /* no longer a page to retire */
258 	}
259 
260 	if (cma_page_cmd(hdl, MEM_PAGE_FMRI_ISRETIRED, page->pg_fmri) == 0) {
261 		fmd_hdl_debug(hdl, "retired page 0x%llx on retry %u\n",
262 		    page->pg_addr, page->pg_nretries);
263 		cma_stats.page_flts.fmds_value.ui64++;
264 
265 		if (page->pg_uuid != NULL)
266 			fmd_case_uuclose(hdl, page->pg_uuid);
267 		return (1); /* page retired */
268 	}
269 
270 	if (errno == EAGAIN) {
271 		fmd_hdl_debug(hdl, "scheduling another retry for 0x%llx\n",
272 		    page->pg_addr);
273 		return (0); /* schedule another retry */
274 	} else {
275 		if (errno == EIO) {
276 			fmd_hdl_debug(hdl, "failed to retry page 0x%llx "
277 			    "retirement: page isn't scheduled for retirement\n",
278 			    page->pg_addr);
279 		} else {
280 			fmd_hdl_debug(hdl, "failed to retry page 0x%llx "
281 			    "retirement: %s\n", page->pg_addr,
282 			    strerror(errno));
283 		}
284 
285 		cma_stats.page_fails.fmds_value.ui64++;
286 		return (1); /* give up */
287 	}
288 }
289 
/*
 * Timer callback: walk the pending-retirement list, removing entries whose
 * retirement completed (or will never complete), and reschedule the timer
 * with an exponentially backed-off delay bounded by cma_page_maxdelay.
 */
void
cma_page_retry(fmd_hdl_t *hdl)
{
	cma_page_t **pagep;

	/* The timer that got us here has fired; forget its id */
	cma.cma_page_timerid = 0;

	fmd_hdl_debug(hdl, "page_retry: timer fired\n");

	/* Walk with a pointer-to-pointer so entries can be unlinked in place */
	pagep = &cma.cma_pages;
	while (*pagep != NULL) {
		cma_page_t *page = *pagep;

		if (page_retry(hdl, page)) {
			/*
			 * Successful retry or we're giving up - remove from
			 * the list
			 */
			*pagep = page->pg_next;

			if (page->pg_uuid != NULL)
				fmd_hdl_strfree(hdl, page->pg_uuid);

			cma_page_free(hdl, page);
		} else if (cma.cma_page_maxretries == 0 ||
		    page->pg_nretries < cma.cma_page_maxretries) {
			/* maxretries of 0 means retry forever */
			page->pg_nretries++;
			pagep = &page->pg_next;
		} else {
			/*
			 * Tunable maxretries was set and we reached
			 * the max, so just close the case.
			 */
			fmd_hdl_debug(hdl,
			    "giving up page retire 0x%llx on retry %u\n",
			    page->pg_addr, page->pg_nretries);
			cma_stats.page_retmax.fmds_value.ui64++;

			if (page->pg_uuid != NULL) {
				fmd_case_uuclose(hdl, page->pg_uuid);
				fmd_hdl_strfree(hdl, page->pg_uuid);
			}

			*pagep = page->pg_next;

			cma_page_free(hdl, page);
		}
	}

	if (cma.cma_pages == NULL)
		return; /* no more retirements */

	/*
	 * We still have retirements that haven't completed.  Back the delay
	 * off, and schedule a retry.
	 */
	cma.cma_page_curdelay = MIN(cma.cma_page_curdelay * 2,
	    cma.cma_page_maxdelay);

	fmd_hdl_debug(hdl, "scheduled page retirement retry for %llu secs\n",
	    (u_longlong_t)(cma.cma_page_curdelay / NANOSEC));

	cma.cma_page_timerid =
	    fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay);
}
355 
356 void
357 cma_page_fini(fmd_hdl_t *hdl)
358 {
359 	cma_page_t *page;
360 
361 	while ((page = cma.cma_pages) != NULL) {
362 		cma.cma_pages = page->pg_next;
363 		cma_page_free(hdl, page);
364 	}
365 }
366