1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 25 * Use is subject to license terms. 26 */ 27 28 #pragma ident "%Z%%M% %I% %E% SMI" 29 30 /* 31 * Page retirement can be an extended process due to the fact that a retirement 32 * may not be possible when the original request is made. The kernel will 33 * repeatedly attempt to retire a given page, but will not let us know when the 34 * page has been retired. We therefore have to poll to see if the retirement 35 * has been completed. This poll is implemented with a bounded exponential 36 * backoff to reduce the burden which we impose upon the system. 37 * 38 * To reduce the burden on fmd in the face of retirement storms, we schedule 39 * all retries as a group. In the simplest case, we attempt to retire a single 40 * page. When forced to retry, we initially schedule a retry at a configurable 41 * interval t. If the retry fails, we schedule another at 2 * t, and so on, 42 * until t reaches the maximum interval (also configurable). Future retries 43 * for that page will occur with t equal to the maximum interval value. We 44 * will never give up on a retirement. 45 * 46 * With multiple retirements, the situation gets slightly more complicated. As 47 * indicated above, we schedule retries as a group. We don't want to deny new 48 * pages their short retry intervals, so we'll (re)set the retry interval to the 49 * value appropriate for the newest page. 50 */ 51 52 #include <cma.h> 53 54 #include <time.h> 55 #include <fcntl.h> 56 #include <errno.h> 57 #include <unistd.h> 58 #include <strings.h> 59 #include <fm/fmd_api.h> 60 #include <fm/libtopo.h> 61 #include <sys/fm/protocol.h> 62 #include <sys/mem.h> 63 64 static int 65 cma_page_cmd(fmd_hdl_t *hdl, int cmd, nvlist_t *nvl) 66 { 67 mem_page_t mpage; 68 char *fmribuf; 69 size_t fmrisz; 70 int fd, rc, err; 71 72 if ((fd = open("/dev/mem", O_RDONLY)) < 0) 73 return (-1); /* errno is set for us */ 74 75 if ((errno = nvlist_size(nvl, &fmrisz, NV_ENCODE_NATIVE)) != 0 || 76 fmrisz > MEM_FMRI_MAX_BUFSIZE || 77 (fmribuf = fmd_hdl_alloc(hdl, fmrisz, FMD_SLEEP)) == NULL) { 78 (void) close(fd); 79 return (-1); /* errno is set for us */ 80 } 81 82 if ((errno = nvlist_pack(nvl, &fmribuf, &fmrisz, 83 NV_ENCODE_NATIVE, 0)) != 0) { 84 fmd_hdl_free(hdl, fmribuf, fmrisz); 85 (void) close(fd); 86 return (-1); /* errno is set for us */ 87 } 88 89 mpage.m_fmri = fmribuf; 90 mpage.m_fmrisz = fmrisz; 91 92 if ((rc = ioctl(fd, cmd, &mpage)) < 0) 93 err = errno; 94 95 fmd_hdl_free(hdl, fmribuf, fmrisz); 96 97 (void) close(fd); 98 99 if (rc < 0) { 100 errno = err; 101 return (-1); 102 } 103 104 return (0); 105 } 106 107 static void 108 cma_page_free(fmd_hdl_t *hdl, cma_page_t *page) 109 { 110 if (page->pg_fmri != NULL) 111 nvlist_free(page->pg_fmri); 112 fmd_hdl_free(hdl, page, sizeof (cma_page_t)); 113 } 114 115 /* 116 * Retire the specified ASRU, referring to a memory page by PA or by DIMM 117 * offset (i.e. the encoded coordinates internal bank, row, and column). 118 * In the initial FMA implementation, fault.memory.page exported an ASRU 119 * with an explicit physical address, which is valid at the initial time of 120 * diagnosis but may not be later following DR, DIMM removal, or interleave 121 * changes. On SPARC, this issue was solved by exporting the DIMM offset 122 * and pushing the entire FMRI to the platform memory controller through 123 * /dev/mem so it can derive the current PA from the DIMM and offset. 124 * On x64, we also use DIMM and offset, but the mem:/// unum string is an 125 * encoded hc:/// FMRI that is then used by the x64 memory controller driver. 126 * At some point these three approaches need to be rationalized: all platforms 127 * should use the same scheme, either with decoding in the kernel or decoding 128 * in userland (i.e. with a libtopo method to compute and update the PA). 129 */ 130 /*ARGSUSED*/ 131 void 132 cma_page_retire(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru, const char *uuid) 133 { 134 cma_page_t *page; 135 uint64_t pageaddr; 136 char *unumstr; 137 nvlist_t *asrucp = NULL; 138 139 /* It should already be expanded, but we'll do it again anyway */ 140 if (fmd_nvl_fmri_expand(hdl, asru) < 0) { 141 fmd_hdl_debug(hdl, "failed to expand page asru\n"); 142 cma_stats.bad_flts.fmds_value.ui64++; 143 return; 144 } 145 146 if (nvlist_lookup_uint64(asru, FM_FMRI_MEM_PHYSADDR, &pageaddr) != 0) { 147 fmd_hdl_debug(hdl, "mem fault missing '%s'\n", 148 FM_FMRI_MEM_PHYSADDR); 149 cma_stats.bad_flts.fmds_value.ui64++; 150 return; 151 } 152 153 if (!cma.cma_page_doretire) { 154 fmd_hdl_debug(hdl, "suppressed retire of page %llx\n", 155 (u_longlong_t)pageaddr); 156 cma_stats.page_supp.fmds_value.ui64++; 157 return; 158 } 159 160 if (!fmd_nvl_fmri_present(hdl, asru)) { 161 fmd_hdl_debug(hdl, "page retire overtaken by events\n"); 162 cma_stats.page_nonent.fmds_value.ui64++; 163 if (uuid != NULL) 164 fmd_case_uuclose(hdl, uuid); 165 return; 166 } 167 168 /* 169 * If the unum is an hc fmri string expand it to an fmri and include 170 * that in a modified asru nvlist. 171 */ 172 if (nvlist_lookup_string(asru, FM_FMRI_MEM_UNUM, &unumstr) == 0 && 173 strncmp(unumstr, "hc:/", 4) == 0) { 174 int err; 175 nvlist_t *unumfmri; 176 struct topo_hdl *thp = fmd_hdl_topology(hdl, TOPO_VERSION); 177 178 if (topo_fmri_str2nvl(thp, unumstr, &unumfmri, &err) != 0) { 179 fmd_hdl_debug(hdl, "page retire str2nvl failed: %s\n", 180 topo_strerror(err)); 181 return; 182 } 183 184 if (nvlist_dup(asru, &asrucp, 0) != 0) { 185 fmd_hdl_debug(hdl, "page retire nvlist dup failed\n"); 186 nvlist_free(unumfmri); 187 return; 188 } 189 190 if (nvlist_add_nvlist(asrucp, FM_FMRI_MEM_UNUM "-fmri", 191 unumfmri) != 0) { 192 fmd_hdl_debug(hdl, "page retire failed to add " 193 "unumfmri to modified asru"); 194 nvlist_free(unumfmri); 195 nvlist_free(asrucp); 196 return; 197 } 198 nvlist_free(unumfmri); 199 } 200 201 if (cma_page_cmd(hdl, MEM_PAGE_FMRI_RETIRE, 202 asrucp ? asrucp : asru) == 0) { 203 fmd_hdl_debug(hdl, "retired page 0x%llx\n", 204 (u_longlong_t)pageaddr); 205 cma_stats.page_flts.fmds_value.ui64++; 206 if (uuid != NULL) 207 fmd_case_uuclose(hdl, uuid); 208 if (asrucp) 209 nvlist_free(asrucp); 210 return; 211 } else if (errno != EAGAIN) { 212 fmd_hdl_debug(hdl, "retire of page 0x%llx failed, will not " 213 "retry: %s\n", (u_longlong_t)pageaddr, strerror(errno)); 214 if (asrucp) 215 nvlist_free(asrucp); 216 return; 217 } 218 219 /* 220 * The page didn't immediately retire. We'll need to periodically 221 * check to see if it has been retired. 222 */ 223 fmd_hdl_debug(hdl, "page didn't retire - sleeping\n"); 224 225 page = fmd_hdl_zalloc(hdl, sizeof (cma_page_t), FMD_SLEEP); 226 page->pg_addr = pageaddr; 227 if (asrucp) { 228 page->pg_fmri = asrucp; 229 } else { 230 (void) nvlist_dup(asru, &page->pg_fmri, 0); 231 } 232 if (uuid != NULL) 233 page->pg_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP); 234 235 page->pg_next = cma.cma_pages; 236 cma.cma_pages = page; 237 238 if (cma.cma_page_timerid != 0) 239 fmd_timer_remove(hdl, cma.cma_page_timerid); 240 241 cma.cma_page_curdelay = cma.cma_page_mindelay; 242 243 cma.cma_page_timerid = 244 fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay); 245 } 246 247 static int 248 page_retry(fmd_hdl_t *hdl, cma_page_t *page) 249 { 250 if (page->pg_fmri != NULL && !fmd_nvl_fmri_present(hdl, 251 page->pg_fmri)) { 252 fmd_hdl_debug(hdl, "page retire overtaken by events"); 253 cma_stats.page_nonent.fmds_value.ui64++; 254 255 if (page->pg_uuid != NULL) 256 fmd_case_uuclose(hdl, page->pg_uuid); 257 return (1); /* no longer a page to retire */ 258 } 259 260 if (cma_page_cmd(hdl, MEM_PAGE_FMRI_ISRETIRED, page->pg_fmri) == 0) { 261 fmd_hdl_debug(hdl, "retired page 0x%llx on retry %u\n", 262 page->pg_addr, page->pg_nretries); 263 cma_stats.page_flts.fmds_value.ui64++; 264 265 if (page->pg_uuid != NULL) 266 fmd_case_uuclose(hdl, page->pg_uuid); 267 return (1); /* page retired */ 268 } 269 270 if (errno == EAGAIN) { 271 fmd_hdl_debug(hdl, "scheduling another retry for 0x%llx\n", 272 page->pg_addr); 273 return (0); /* schedule another retry */ 274 } else { 275 if (errno == EIO) { 276 fmd_hdl_debug(hdl, "failed to retry page 0x%llx " 277 "retirement: page isn't scheduled for retirement\n", 278 page->pg_addr); 279 } else { 280 fmd_hdl_debug(hdl, "failed to retry page 0x%llx " 281 "retirement: %s\n", page->pg_addr, 282 strerror(errno)); 283 } 284 285 cma_stats.page_fails.fmds_value.ui64++; 286 return (1); /* give up */ 287 } 288 } 289 290 void 291 cma_page_retry(fmd_hdl_t *hdl) 292 { 293 cma_page_t **pagep; 294 295 cma.cma_page_timerid = 0; 296 297 fmd_hdl_debug(hdl, "page_retry: timer fired\n"); 298 299 pagep = &cma.cma_pages; 300 while (*pagep != NULL) { 301 cma_page_t *page = *pagep; 302 303 if (page_retry(hdl, page)) { 304 /* 305 * Successful retry or we're giving up - remove from 306 * the list 307 */ 308 *pagep = page->pg_next; 309 310 if (page->pg_uuid != NULL) 311 fmd_hdl_strfree(hdl, page->pg_uuid); 312 313 cma_page_free(hdl, page); 314 } else if (cma.cma_page_maxretries == 0 || 315 page->pg_nretries < cma.cma_page_maxretries) { 316 page->pg_nretries++; 317 pagep = &page->pg_next; 318 } else { 319 /* 320 * Tunable maxretries was set and we reached 321 * the max, so just close the case. 322 */ 323 fmd_hdl_debug(hdl, 324 "giving up page retire 0x%llx on retry %u\n", 325 page->pg_addr, page->pg_nretries); 326 cma_stats.page_retmax.fmds_value.ui64++; 327 328 if (page->pg_uuid != NULL) { 329 fmd_case_uuclose(hdl, page->pg_uuid); 330 fmd_hdl_strfree(hdl, page->pg_uuid); 331 } 332 333 *pagep = page->pg_next; 334 335 cma_page_free(hdl, page); 336 } 337 } 338 339 if (cma.cma_pages == NULL) 340 return; /* no more retirements */ 341 342 /* 343 * We still have retirements that haven't completed. Back the delay 344 * off, and schedule a retry. 345 */ 346 cma.cma_page_curdelay = MIN(cma.cma_page_curdelay * 2, 347 cma.cma_page_maxdelay); 348 349 fmd_hdl_debug(hdl, "scheduled page retirement retry for %llu secs\n", 350 (u_longlong_t)(cma.cma_page_curdelay / NANOSEC)); 351 352 cma.cma_page_timerid = 353 fmd_timer_install(hdl, NULL, NULL, cma.cma_page_curdelay); 354 } 355 356 void 357 cma_page_fini(fmd_hdl_t *hdl) 358 { 359 cma_page_t *page; 360 361 while ((page = cma.cma_pages) != NULL) { 362 cma.cma_pages = page->pg_next; 363 cma_page_free(hdl, page); 364 } 365 } 366