/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Page retirement can be an extended process due to the fact that a retirement
 * may not be possible when the original request is made.  The kernel will
 * repeatedly attempt to retire a given page, but will not let us know when the
 * page has been retired.  We therefore have to poll to see if the retirement
 * has been completed.  This poll is implemented with a bounded exponential
 * backoff to reduce the burden which we impose upon the system.
 *
 * To reduce the burden on fmd in the face of retirement storms, we schedule
 * all retries as a group.  In the simplest case, we attempt to retire a single
 * page.  When forced to retry, we initially schedule a retry at a configurable
 * interval t.  If the retry fails, we schedule another at 2 * t, and so on,
 * until t reaches the maximum interval (also configurable).  Future retries
 * for that page will occur with t equal to the maximum interval value.
We 43 * will never give up on a retirement. 44 * 45 * With multiple retirements, the situation gets slightly more complicated. As 46 * indicated above, we schedule retries as a group. We don't want to deny new 47 * pages their short retry intervals, so we'll (re)set the retry interval to the 48 * value appropriate for the newest page. 49 */ 50 51 #include <cma.h> 52 53 #include <time.h> 54 #include <fcntl.h> 55 #include <errno.h> 56 #include <unistd.h> 57 #include <strings.h> 58 #include <fm/fmd_api.h> 59 #include <fm/libtopo.h> 60 #include <sys/fm/protocol.h> 61 #include <sys/mem.h> 62 63 static int 64 cma_page_cmd(fmd_hdl_t *hdl, int cmd, nvlist_t *nvl) 65 { 66 mem_page_t mpage; 67 char *fmribuf; 68 size_t fmrisz; 69 int fd, rc, err; 70 71 if ((fd = open("/dev/mem", O_RDONLY)) < 0) 72 return (-1); /* errno is set for us */ 73 74 if ((errno = nvlist_size(nvl, &fmrisz, NV_ENCODE_NATIVE)) != 0 || 75 fmrisz > MEM_FMRI_MAX_BUFSIZE || 76 (fmribuf = fmd_hdl_alloc(hdl, fmrisz, FMD_SLEEP)) == NULL) { 77 (void) close(fd); 78 return (-1); /* errno is set for us */ 79 } 80 81 if ((errno = nvlist_pack(nvl, &fmribuf, &fmrisz, 82 NV_ENCODE_NATIVE, 0)) != 0) { 83 fmd_hdl_free(hdl, fmribuf, fmrisz); 84 (void) close(fd); 85 return (-1); /* errno is set for us */ 86 } 87 88 mpage.m_fmri = fmribuf; 89 mpage.m_fmrisz = fmrisz; 90 91 if ((rc = ioctl(fd, cmd, &mpage)) < 0) 92 err = errno; 93 94 fmd_hdl_free(hdl, fmribuf, fmrisz); 95 96 (void) close(fd); 97 98 if (rc < 0) { 99 errno = err; 100 return (-1); 101 } 102 103 return (0); 104 } 105 106 static void 107 cma_page_free(fmd_hdl_t *hdl, cma_page_t *page) 108 { 109 if (page->pg_fmri != NULL) 110 nvlist_free(page->pg_fmri); 111 fmd_hdl_free(hdl, page, sizeof (cma_page_t)); 112 } 113 114 /* 115 * Retire the specified ASRU, referring to a memory page by PA or by DIMM 116 * offset (i.e. the encoded coordinates internal bank, row, and column). 
117 * In the initial FMA implementation, fault.memory.page exported an ASRU 118 * with an explicit physical address, which is valid at the initial time of 119 * diagnosis but may not be later following DR, DIMM removal, or interleave 120 * changes. On SPARC, this issue was solved by exporting the DIMM offset 121 * and pushing the entire FMRI to the platform memory controller through 122 * /dev/mem so it can derive the current PA from the DIMM and offset. 123 * On x64, we also use DIMM and offset, but the mem:/// unum string is an 124 * encoded hc:/// FMRI that is then used by the x64 memory controller driver. 125 * At some point these three approaches need to be rationalized: all platforms 126 * should use the same scheme, either with decoding in the kernel or decoding 127 * in userland (i.e. with a libtopo method to compute and update the PA). 128 */ 129 /*ARGSUSED*/ 130 int 131 cma_page_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid) 132 { 133 cma_page_t *page; 134 uint64_t pageaddr; 135 char *unumstr; 136 nvlist_t *asrucp = NULL; 137 138 /* It should already be expanded, but we'll do it again anyway */ 139 if (fmd_nvl_fmri_expand(hdl, asru) < 0) { 140 fmd_hdl_debug(hdl, "failed to expand page asru\n"); 141 cma_stats.bad_flts.fmds_value.ui64++; 142 return (CMA_RA_FAILURE); 143 } 144 145 if (nvlist_lookup_uint64(asru, FM_FMRI_MEM_PHYSADDR, &pageaddr) != 0) { 146 fmd_hdl_debug(hdl, "mem fault missing '%s'\n", 147 FM_FMRI_MEM_PHYSADDR); 148 cma_stats.bad_flts.fmds_value.ui64++; 149 return (CMA_RA_FAILURE); 150 } 151 152 if (!cma.cma_page_doretire) { 153 fmd_hdl_debug(hdl, "suppressed retire of page %llx\n", 154 (u_longlong_t)pageaddr); 155 cma_stats.page_supp.fmds_value.ui64++; 156 return (CMA_RA_FAILURE); 157 } 158 159 if (!fmd_nvl_fmri_present(hdl, asru)) { 160 fmd_hdl_debug(hdl, "page retire overtaken by events\n"); 161 cma_stats.page_nonent.fmds_value.ui64++; 162 return (CMA_RA_SUCCESS); 163 } 164 165 /* 166 * If the unum is an hc fmri 
string expand it to an fmri and include 167 * that in a modified asru nvlist. 168 */ 169 if (nvlist_lookup_string(asru, FM_FMRI_MEM_UNUM, &unumstr) == 0 && 170 strncmp(unumstr, "hc:/", 4) == 0) { 171 int err; 172 nvlist_t *unumfmri; 173 struct topo_hdl *thp = fmd_hdl_topo_hold(hdl, TOPO_VERSION); 174 175 if (topo_fmri_str2nvl(thp, unumstr, &unumfmri, &err) != 0) { 176 fmd_hdl_topo_rele(hdl, thp); 177 fmd_hdl_debug(hdl, "page retire str2nvl failed: %s\n", 178 topo_strerror(err)); 179 return (CMA_RA_FAILURE); 180 } 181 fmd_hdl_topo_rele(hdl, thp); 182 183 if (nvlist_dup(asru, &asrucp, 0) != 0) { 184 fmd_hdl_debug(hdl, "page retire nvlist dup failed\n"); 185 nvlist_free(unumfmri); 186 return (CMA_RA_FAILURE); 187 } 188 189 if (nvlist_add_nvlist(asrucp, FM_FMRI_MEM_UNUM "-fmri", 190 unumfmri) != 0) { 191 fmd_hdl_debug(hdl, "page retire failed to add " 192 "unumfmri to modified asru"); 193 nvlist_free(unumfmri); 194 nvlist_free(asrucp); 195 return (CMA_RA_FAILURE); 196 } 197 nvlist_free(unumfmri); 198 } 199 200 if (cma_page_cmd(hdl, MEM_PAGE_FMRI_RETIRE, 201 asrucp ? asrucp : asru) == 0) { 202 fmd_hdl_debug(hdl, "retired page 0x%llx\n", 203 (u_longlong_t)pageaddr); 204 cma_stats.page_flts.fmds_value.ui64++; 205 if (asrucp) 206 nvlist_free(asrucp); 207 return (CMA_RA_SUCCESS); 208 } else if (errno != EAGAIN) { 209 fmd_hdl_debug(hdl, "retire of page 0x%llx failed, will not " 210 "retry: %s\n", (u_longlong_t)pageaddr, strerror(errno)); 211 if (asrucp) 212 nvlist_free(asrucp); 213 if (uuid != NULL && cma.cma_page_maxretries != 0) 214 return (CMA_RA_SUCCESS); 215 return (CMA_RA_FAILURE); 216 } 217 218 /* 219 * The page didn't immediately retire. We'll need to periodically 220 * check to see if it has been retired. 
221 */ 222 fmd_hdl_debug(hdl, "page didn't retire - sleeping\n"); 223 224 page = fmd_hdl_zalloc(hdl, sizeof (cma_page_t), FMD_SLEEP); 225 page->pg_addr = pageaddr; 226 if (asrucp) { 227 page->pg_fmri = asrucp; 228 } else { 229 (void) nvlist_dup(asru, &page->pg_fmri, 0); 230 } 231 if (uuid != NULL) 232 page->pg_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP); 233 234 page->pg_next = cma.cma_pages; 235 cma.cma_pages = page; 236 237 if (cma.cma_page_timerid != 0) 238 fmd_timer_remove(hdl, cma.cma_page_timerid); 239 240 cma.cma_page_curdelay = cma.cma_page_mindelay; 241 242 cma.cma_page_timerid = 243 fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay); 244 245 return (CMA_RA_FAILURE); 246 } 247 248 static int 249 page_retry(fmd_hdl_t *hdl, cma_page_t *page) 250 { 251 if (page->pg_fmri != NULL && !fmd_nvl_fmri_present(hdl, 252 page->pg_fmri)) { 253 fmd_hdl_debug(hdl, "page retire overtaken by events"); 254 cma_stats.page_nonent.fmds_value.ui64++; 255 256 if (page->pg_uuid != NULL) 257 fmd_case_uuclose(hdl, page->pg_uuid); 258 return (1); /* no longer a page to retire */ 259 } 260 261 if (cma_page_cmd(hdl, MEM_PAGE_FMRI_ISRETIRED, page->pg_fmri) == 0) { 262 fmd_hdl_debug(hdl, "retired page 0x%llx on retry %u\n", 263 page->pg_addr, page->pg_nretries); 264 cma_stats.page_flts.fmds_value.ui64++; 265 266 if (page->pg_uuid != NULL) 267 fmd_case_uuclose(hdl, page->pg_uuid); 268 return (1); /* page retired */ 269 } 270 271 if (errno == EAGAIN) { 272 fmd_hdl_debug(hdl, "scheduling another retry for 0x%llx\n", 273 page->pg_addr); 274 return (0); /* schedule another retry */ 275 } else { 276 if (errno == EIO) { 277 fmd_hdl_debug(hdl, "failed to retry page 0x%llx " 278 "retirement: page isn't scheduled for retirement" 279 "(request made beyond page_retire limit?)\n", 280 page->pg_addr); 281 } else { 282 fmd_hdl_debug(hdl, "failed to retry page 0x%llx " 283 "retirement: %s\n", page->pg_addr, 284 strerror(errno)); 285 } 286 287 if (page->pg_uuid != NULL && cma.cma_page_maxretries 
!= 0) 288 fmd_case_uuclose(hdl, page->pg_uuid); 289 290 cma_stats.page_fails.fmds_value.ui64++; 291 return (1); /* give up */ 292 } 293 } 294 295 void 296 cma_page_retry(fmd_hdl_t *hdl) 297 { 298 cma_page_t **pagep; 299 300 cma.cma_page_timerid = 0; 301 302 fmd_hdl_debug(hdl, "page_retry: timer fired\n"); 303 304 pagep = &cma.cma_pages; 305 while (*pagep != NULL) { 306 cma_page_t *page = *pagep; 307 308 if (page_retry(hdl, page)) { 309 /* 310 * Successful retry or we're giving up - remove from 311 * the list 312 */ 313 *pagep = page->pg_next; 314 315 if (page->pg_uuid != NULL) 316 fmd_hdl_strfree(hdl, page->pg_uuid); 317 318 cma_page_free(hdl, page); 319 } else if (cma.cma_page_maxretries == 0 || 320 page->pg_nretries < cma.cma_page_maxretries) { 321 page->pg_nretries++; 322 pagep = &page->pg_next; 323 } else { 324 /* 325 * Tunable maxretries was set and we reached 326 * the max, so just close the case. 327 */ 328 fmd_hdl_debug(hdl, 329 "giving up page retire 0x%llx on retry %u\n", 330 page->pg_addr, page->pg_nretries); 331 cma_stats.page_retmax.fmds_value.ui64++; 332 333 if (page->pg_uuid != NULL) { 334 fmd_case_uuclose(hdl, page->pg_uuid); 335 fmd_hdl_strfree(hdl, page->pg_uuid); 336 } 337 338 *pagep = page->pg_next; 339 340 cma_page_free(hdl, page); 341 } 342 } 343 344 if (cma.cma_pages == NULL) 345 return; /* no more retirements */ 346 347 /* 348 * We still have retirements that haven't completed. Back the delay 349 * off, and schedule a retry. 
350 */ 351 cma.cma_page_curdelay = MIN(cma.cma_page_curdelay * 2, 352 cma.cma_page_maxdelay); 353 354 fmd_hdl_debug(hdl, "scheduled page retirement retry for %llu secs\n", 355 (u_longlong_t)(cma.cma_page_curdelay / NANOSEC)); 356 357 cma.cma_page_timerid = 358 fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay); 359 } 360 361 void 362 cma_page_fini(fmd_hdl_t *hdl) 363 { 364 cma_page_t *page; 365 366 while ((page = cma.cma_pages) != NULL) { 367 cma.cma_pages = page->pg_next; 368 cma_page_free(hdl, page); 369 } 370 } 371