1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Support routines for managing potential page and bank faults that have 30 * been deferred due to a datapath error. Currently deferment only occurs 31 * if a memory UE occurs while a datapath error is active. When this happens 32 * a page case is created with a special subtype of CMD_PTR_DP_PAGE_DEFER. An 33 * entry (a cmd_dp_defer_t) is added to a list of deferred pages. The entry 34 * links to the cmd_page_t in the cmd_pages list and also keeps track of what 35 * memory controller ids are associated with the first AFAR and any more that 36 * are seen while the page is deferred. This information is used to determine 37 * if the page should be faulted if the fault should be skipped because an 38 * intervening datapath fault has occurred. If a page is faulted when it is 39 * replayed, the corresponding bank is faulted, too, since the original error 40 * was a UE. Note that no action is taken to undo any action taken by the 41 * kernel when the UE was detected. Currently the kernel will attempt to 42 * immediately retire the page where a UE is detected and the retire may or 43 * may not have completed by the time FMA receives an ereport. The possibility 44 * of a datapath fault resulting in memory UEs is very small, so the likelihood 45 * of encountering this scenario is also very small. 46 */ 47 48 #include <cmd.h> 49 #include <cmd_dp.h> 50 #include <cmd_dp_page.h> 51 #include <cmd_bank.h> 52 #include <cmd_page.h> 53 54 #include <fm/fmd_api.h> 55 #include <sys/nvpair.h> 56 57 extern void cmd_bank_fault(fmd_hdl_t *, cmd_bank_t *); 58 59 static void 60 dp_page_defer_data_write(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage) 61 { 62 fmd_buf_write(hdl, dpage->dp_defer_page->page_case.cc_cp, "mcids", 63 &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids)); 64 } 65 66 static void 67 dp_page_defer_data_restore(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage) 68 { 69 fmd_buf_read(hdl, dpage->dp_defer_page->page_case.cc_cp, "mcids", 70 &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids)); 71 } 72 73 static void 74 dp_page_defer_add_data(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage, uint64_t afar) 75 { 76 int mcid; 77 int i; 78 79 if (cmd_dp_get_mcid(afar, &mcid) < 0) 80 fmd_hdl_abort(hdl, "cmd_dp_get_mcid failed"); 81 82 for (i = 0; i < DP_MAX_MCS; i++) { 83 if (dpage->dp_defer_mcids[i] == -1) { 84 dpage->dp_defer_mcids[i] = mcid; 85 break; 86 } 87 if (dpage->dp_defer_mcids[i] == mcid) 88 break; 89 } 90 91 if (i == DP_MAX_MCS) 92 fmd_hdl_abort(hdl, "too many mcids for deferred page"); 93 94 dp_page_defer_data_write(hdl, dpage); 95 } 96 97 static cmd_dp_defer_t * 98 dp_page_defer_create(fmd_hdl_t *hdl, cmd_page_t *page, uint64_t afar) 99 { 100 cmd_dp_defer_t *dpage; 101 int i; 102 103 dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP); 104 105 dpage->dp_defer_page = page; 106 107 for (i = 0; i < DP_MAX_MCS; i++) 108 dpage->dp_defer_mcids[i] = -1; 109 110 dp_page_defer_add_data(hdl, dpage, afar); 111 112 cmd_list_append(&cmd.cmd_deferred_pages, dpage); 113 114 return (dpage); 115 } 116 117 static cmd_dp_defer_t * 118 dp_page_defer_lookup(cmd_page_t *page) 119 { 120 cmd_dp_defer_t *dpage; 121 122 for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL; 123 dpage = cmd_list_next(dpage)) { 124 if (dpage->dp_defer_page == page) 125 return (dpage); 126 } 127 128 return (NULL); 129 } 130 131 void 132 cmd_dp_page_defer(fmd_hdl_t *hdl, nvlist_t *modasru, fmd_event_t *ep, 133 uint64_t afar) 134 { 135 cmd_dp_defer_t *dpage; 136 cmd_page_t *page = cmd_page_lookup(afar); 137 const char *uuid; 138 139 if (page == NULL) { 140 page = cmd_page_create(hdl, modasru, afar); 141 dpage = dp_page_defer_create(hdl, page, afar); 142 page->page_case.cc_cp = cmd_case_create(hdl, &page->page_header, 143 CMD_PTR_DP_PAGE_DEFER, &uuid); 144 fmd_case_setprincipal(hdl, page->page_case.cc_cp, ep); 145 } else { 146 dpage = dp_page_defer_lookup(page); 147 if (dpage == NULL) 148 fmd_hdl_abort(hdl, "deferred page with no defer data"); 149 fmd_case_add_ereport(hdl, page->page_case.cc_cp, ep); 150 } 151 152 dp_page_defer_add_data(hdl, dpage, afar); 153 } 154 155 int 156 cmd_dp_page_check(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage) 157 { 158 int i; 159 160 for (i = 0; i < DP_MAX_MCS; i++) { 161 if (dpage->dp_defer_mcids[i] == -1) 162 break; 163 /* 164 * If there's no datapath fault corresponding to 165 * an mcid, that means the page incurred an error 166 * not attributable to a datapath fault. 167 */ 168 if (cmd_dp_lookup_fault(hdl, dpage->dp_defer_mcids[i]) == 0) 169 return (0); 170 } 171 172 return (1); 173 } 174 175 void 176 cmd_dp_page_replay(fmd_hdl_t *hdl) 177 { 178 fmd_event_t *ep; 179 cmd_page_t *page; 180 cmd_bank_t *bank; 181 cmd_dp_defer_t *dpage; 182 nvlist_t *nvl; 183 184 while ((dpage = cmd_list_next(&cmd.cmd_deferred_pages)) != NULL) { 185 fmd_hdl_debug(hdl, "replaying deferred page, " 186 "pa=%llx\n", dpage->dp_defer_page->page_physbase); 187 188 page = dpage->dp_defer_page; 189 190 if (cmd_dp_page_check(hdl, dpage)) { 191 fmd_hdl_debug(hdl, "deferred memory UE overtaken by " 192 "dp fault"); 193 CMD_STAT_BUMP(dp_ignored_ue); 194 fmd_case_close(hdl, page->page_case.cc_cp); 195 cmd_list_delete(&cmd.cmd_deferred_pages, dpage); 196 fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t)); 197 cmd_page_destroy(hdl, page); 198 continue; 199 } 200 201 nvl = page->page_asru_nvl; 202 203 bank = cmd_bank_lookup(hdl, nvl); 204 205 ep = fmd_case_getprincipal(hdl, page->page_case.cc_cp); 206 fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep); 207 208 bank->bank_nretired++; 209 bank->bank_retstat.fmds_value.ui64++; 210 cmd_bank_dirty(hdl, bank); 211 212 fmd_case_reset(hdl, page->page_case.cc_cp); 213 cmd_case_fini(hdl, page->page_case.cc_cp, FMD_B_TRUE); 214 215 page->page_case.cc_cp = NULL; 216 cmd_page_fault(hdl, nvl, nvl, ep, page->page_physbase); 217 cmd_bank_fault(hdl, bank); 218 219 cmd_list_delete(&cmd.cmd_deferred_pages, dpage); 220 fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t)); 221 } 222 223 fmd_hdl_debug(hdl, "cmd_page_defer_replay() complete\n"); 224 } 225 226 void 227 cmd_dp_page_restore(fmd_hdl_t *hdl, cmd_page_t *page) 228 { 229 cmd_dp_defer_t *dpage; 230 231 dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP); 232 233 dpage->dp_defer_page = page; 234 235 dp_page_defer_data_restore(hdl, dpage); 236 237 cmd_list_append(&cmd.cmd_deferred_pages, dpage); 238 } 239 240 void 241 cmd_dp_page_validate(fmd_hdl_t *hdl) 242 { 243 cmd_dp_defer_t *dpage, *next; 244 cmd_page_t *page; 245 246 for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL; 247 dpage = next) { 248 next = cmd_list_next(dpage); 249 250 page = dpage->dp_defer_page; 251 252 if (!fmd_nvl_fmri_present(hdl, page->page_asru_nvl)) { 253 cmd_page_destroy(hdl, page); 254 cmd_list_delete(&cmd.cmd_deferred_pages, dpage); 255 fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t)); 256 } 257 } 258 } 259 260 /*ARGSUSED*/ 261 int 262 cmd_dp_page_isdeferred(fmd_hdl_t *hdl, cmd_page_t *page) 263 { 264 cmd_dp_defer_t *dpage, *next; 265 266 for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL; 267 dpage = next) { 268 next = cmd_list_next(dpage); 269 270 if (dpage->dp_defer_page == page) { 271 return (1); 272 } 273 } 274 275 return (0); 276 } 277