/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * OPL platform specific functions for
 * CPU/Memory error diagnosis engine.
 */
#include <cmd.h>
#include <cmd_dimm.h>
#include <cmd_bank.h>
#include <cmd_page.h>
#include <cmd_opl.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/ioctl.h>	/* ioctl() for MCIOC_FAULT_PAGE */

#include <sys/fm/protocol.h>
#include <sys/fm/io/opl_mc_fm.h>
#include <sys/async.h>
#include <sys/opl_olympus_regs.h>
#include <sys/fm/cpu/SPARC64-VI.h>
#include <sys/int_const.h>
#include <sys/mutex.h>
#include <sys/dditypes.h>
#include <opl/sys/mc-opl.h>

/*
 * The following is the common function for handling
 * memory UE with EID=MEM.
 * The error could be detected by either CPU/IO.
 */
cmd_evdisp_t
opl_ue_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    int hdlr_type)
{
	nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
	uint64_t ubc_ue_log_reg, pa;
	cmd_page_t *page;

	if (nvlist_lookup_nvlist(nvl,
	    FM_EREPORT_PAYLOAD_NAME_RESOURCE, &rsrc) != 0)
		return (CMD_EVD_BAD);

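	/*
	 * The physical address of the UE comes from a different payload
	 * member depending on the detector: a CPU detected UE carries it
	 * in the SFAR, while an I/O (Oberon) detected UE carries it in
	 * the UBC UE log register.
	 */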
	switch (hdlr_type) {
	case CMD_OPL_HDLR_CPU:

		if (nvlist_lookup_uint64(nvl,
		    FM_EREPORT_PAYLOAD_NAME_SFAR, &pa) != 0)
			return (CMD_EVD_BAD);

		fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
		    (u_longlong_t)pa);
		break;

	case CMD_OPL_HDLR_IO:

		if (nvlist_lookup_uint64(nvl, OBERON_UBC_MUE,
		    &ubc_ue_log_reg) != 0)
			return (CMD_EVD_BAD);

		pa = (ubc_ue_log_reg & UBC_UE_ADR_MASK);

		fmd_hdl_debug(hdl, "cmd_ue_mem: ue_log_reg=%llx\n",
		    (u_longlong_t)ubc_ue_log_reg);
		fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
		    (u_longlong_t)pa);
		break;

	default:

		return (CMD_EVD_BAD);
	}

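	/*
	 * If a page retire case covering this PA is already solved,
	 * this ereport adds nothing new.
	 */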
	if ((page = cmd_page_lookup(pa)) != NULL &&
	    page->page_case.cc_cp != NULL &&
	    fmd_case_solved(hdl, page->page_case.cc_cp))
		return (CMD_EVD_REDUND);

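	/*
	 * Build the page ASRU from the ereport resource and derive the
	 * memory FRU from that ASRU.
	 */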
	if (nvlist_dup(rsrc, &asru, 0) != 0) {
		fmd_hdl_debug(hdl, "opl_ue_mem nvlist dup failed\n");
		return (CMD_EVD_BAD);
	}

	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
		nvlist_free(asru);
		CMD_STAT_BUMP(bad_mem_asru);
		return (CMD_EVD_BAD);
	}

	if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
		nvlist_free(asru);
		return (CMD_EVD_BAD);
	}

	cmd_page_fault(hdl, asru, fru, ep, pa);
	nvlist_free(asru);
	nvlist_free(fru);
	return (CMD_EVD_OK);
}

/*
 * The following is the main function for generating the sibling CPU
 * suspect list for CPU detected UE errors.  It handles the multiple
 * strand/core architecture of the OPL platform.
 */
cmd_evdisp_t
cmd_opl_ue_cpu(fmd_hdl_t *hdl, fmd_event_t *ep,
    const char *class, const char *fltname,
    cmd_ptrsubtype_t ptr, cmd_cpu_t *cpu,
    cmd_case_t *cc, uint8_t cpumask)
{
	const char *uuid;
	cmd_cpu_t *main_cpu, *sib_cpu;
	nvlist_t *fmri;
	cmd_list_t *cpu_list;
	opl_cpu_t *opl_cpu;
	uint32_t main_cpuid, nsusp = 1;
	uint8_t cert;

	fmd_hdl_debug(hdl,
	    "Enter OPL_CPUUE_HANDLER for class %s\n", class);

	main_cpu = cpu;
	main_cpuid = cpu->cpu_cpuid;

	if (strcmp(fltname, "core") == 0)
		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
		    IS_CORE);
	else if (strcmp(fltname, "chip") == 0)
		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
		    IS_CHIP);
	else
		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
		    IS_STRAND);

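	/*
	 * First pass: bind each CPU in the sibling list to its cmd_cpu_t
	 * state, skipping siblings that are absent or already faulting,
	 * and set up the UE case on the detecting CPU.
	 */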
	for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
	    opl_cpu = cmd_list_next(opl_cpu)) {
		if (opl_cpu->oc_cpuid == main_cpuid) {
			sib_cpu = main_cpu;
			opl_cpu->oc_cmd_cpu = main_cpu;
		} else {
			fmri = cmd_cpu_fmri_create(opl_cpu->oc_cpuid, cpumask);
			if (fmri == NULL) {
				opl_cpu->oc_cmd_cpu = NULL;
				fmd_hdl_debug(hdl,
				    "missing asru, cpuid %u excluded\n",
				    opl_cpu->oc_cpuid);
				continue;
			}

			sib_cpu = cmd_cpu_lookup(hdl, fmri, class,
			    CMD_CPU_LEVEL_THREAD);
			if (sib_cpu == NULL || sib_cpu->cpu_faulting) {
				nvlist_free(fmri);
				opl_cpu->oc_cmd_cpu = NULL;
				fmd_hdl_debug(hdl,
				    "cpu not present, cpuid %u excluded\n",
				    opl_cpu->oc_cpuid);
				continue;
			}
			opl_cpu->oc_cmd_cpu = sib_cpu;
			nvlist_free(fmri);
			nsusp++;
		}
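		/*
		 * Manage the UE case on the detecting CPU: if it is
		 * already solved the ereport is redundant, otherwise
		 * create the case if needed, clear any stale SERD state,
		 * and attach this ereport.
		 */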
		if (cpu->cpu_cpuid == main_cpuid) {
			if (cc->cc_cp != NULL &&
			    fmd_case_solved(hdl, cc->cc_cp)) {
				if (cpu_list != NULL)
					opl_cpulist_free(hdl, cpu_list);
				return (CMD_EVD_REDUND);
			}

			if (cc->cc_cp == NULL)
				cc->cc_cp = cmd_case_create(hdl,
				    &cpu->cpu_header, ptr, &uuid);

			if (cc->cc_serdnm != NULL) {
				fmd_hdl_debug(hdl,
				    "destroying existing %s state for class %s\n",
				    cc->cc_serdnm, class);
				fmd_serd_destroy(hdl, cc->cc_serdnm);
				fmd_hdl_strfree(hdl, cc->cc_serdnm);
				cc->cc_serdnm = NULL;
				fmd_case_reset(hdl, cc->cc_cp);
			}
			fmd_case_add_ereport(hdl, cc->cc_cp, ep);
		}
	}
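	/*
	 * Certainty is distributed evenly across all suspects.  Second
	 * pass: add one suspect fault per sibling that was successfully
	 * bound above.
	 */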
	cert = opl_avg(100, nsusp);
	for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
	    opl_cpu = cmd_list_next(opl_cpu)) {
		if (opl_cpu->oc_cmd_cpu != NULL) {
			nvlist_t *cpu_rsrc;

			cpu_rsrc = opl_cpursrc_create(hdl, opl_cpu->oc_cpuid);
			if (cpu_rsrc == NULL) {
				fmd_hdl_debug(hdl,
				    "missing rsrc, cpuid %u excluded\n",
				    opl_cpu->oc_cpuid);
				continue;
			}
			cmd_cpu_create_faultlist(hdl, cc->cc_cp,
			    opl_cpu->oc_cmd_cpu, fltname, cpu_rsrc, cert);
			nvlist_free(cpu_rsrc);
		}
	}
	fmd_case_solve(hdl, cc->cc_cp);
	if (cpu_list != NULL)
		opl_cpulist_free(hdl, cpu_list);
	return (CMD_EVD_OK);
}

/*
 * Generates a DIMM fault when the permanent CE page-retire
 * threshold is exceeded.
 */
static void
opl_ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
{
	nvlist_t *dflt;
	fmd_case_t *cp;

	fmd_hdl_debug(hdl,
	    "Permanent CE event threshold checking.\n");

	if (dimm->dimm_flags & CMD_MEM_F_FAULTING) {
		/* We've already complained about this DIMM */
		return;
	}

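	/*
	 * dimm_nretired counts pages retired for CEs on this DIMM; fault
	 * the DIMM once it reaches the max_perm_ce_dimm property limit.
	 */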
	if (dimm->dimm_nretired >= fmd_prop_get_int32(hdl,
	    "max_perm_ce_dimm")) {
		dimm->dimm_flags |= CMD_MEM_F_FAULTING;
		cp = fmd_case_open(hdl, NULL);
		dflt = cmd_dimm_create_fault(hdl, dimm, "fault.memory.dimm",
		    CMD_FLTMAXCONF);
		fmd_case_add_suspect(hdl, cp, dflt);
		fmd_case_solve(hdl, cp);
	}
}

/*
 * Notify fault page information (pa and errlog) to XSCF via mc-opl.
 */
#define	MC_PHYDEV_DIR	"/devices"
#define	MC_PHYPREFIX	"pseudo-mc@"
static int
opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl)
{
	uint32_t *eadd, *elog;
	uint_t n;
	uint64_t pa;
	char path[MAXPATHLEN];
	char *unum;
	nvlist_t *rsrc;
	DIR *mcdir;
	struct dirent *dp;
	mc_flt_page_t flt_page;
	cmd_page_t *page;
	struct stat statbuf;

	/*
	 * Extract ereport.
	 * Sanity check of pa is already done at cmd_opl_mac_common().
	 * mc-opl sets only one entry for MC_OPL_ERR_ADD, MC_OPL_ERR_LOG,
	 * and MC_OPL_BANK.
	 */
	if ((nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0) ||
	    (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &n) != 0) ||
	    (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &n) != 0)) {
		fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n");
		return (-1);
	}
	if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE,
	    &rsrc) != 0) {
		fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n");
		return (-1);
	}
	if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) {
		fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n");
		return (-1);
	}

	page = cmd_page_lookup(pa);
	if (page != NULL && page->page_flags & CMD_MEM_F_FAULTING) {
		/*
		 * fault.memory.page will not be created.
		 */
		return (0);
	}

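	/*
	 * Pack the MAC error address/log registers and the DIMM unum
	 * into the ioctl payload.
	 */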
	flt_page.err_add = eadd[0];
	flt_page.err_log = elog[0];
	flt_page.fmri_addr = (uint64_t)(uintptr_t)unum;
	flt_page.fmri_sz = strlen(unum) + 1;

	fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%d)\n",
	    unum, strlen(unum) + 1);
	fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n",
	    pa, eadd[0], elog[0]);

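	/*
	 * Walk the pseudo-mc nodes under /devices and hand the fault
	 * page to the first one that accepts the MCIOC_FAULT_PAGE
	 * ioctl; mc-opl passes the data on to the XSCF.
	 */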
	if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) {
		while ((dp = readdir(mcdir)) != NULL) {
			int fd;

			if (strncmp(dp->d_name, MC_PHYPREFIX,
			    strlen(MC_PHYPREFIX)) != 0)
				continue;

			(void) snprintf(path, sizeof (path),
			    "%s/%s", MC_PHYDEV_DIR, dp->d_name);

			if (stat(path, &statbuf) != 0 ||
			    !S_ISCHR(statbuf.st_mode)) {
				/* skip if not a character device */
				continue;
			}

			if ((fd = open(path, O_RDONLY)) < 0)
				continue;

			if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) {
				fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n",
				    path);
				(void) close(fd);
				(void) closedir(mcdir);
				return (0);
			}
			(void) close(fd);
		}
		(void) closedir(mcdir);
	}

	fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n");

	return (-1);
}

/*
 * This is the common function for processing MAC detected
 * Intermittent and Permanent CEs.
 */

cmd_evdisp_t
cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class,
    nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl)
{
	cmd_dimm_t *dimm;
	const char *uuid;

	fmd_hdl_debug(hdl,
	    "Processing CE ereport\n");

	if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
	    (dimm = cmd_dimm_create(hdl, asru)) == NULL)
		return (CMD_EVD_UNUSED);

	if (dimm->dimm_case.cc_cp == NULL) {
		dimm->dimm_case.cc_cp = cmd_case_create(hdl,
		    &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
	}

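	/*
	 * Intermittent CEs (ptrl-ice) feed a per-DIMM SERD engine; the
	 * page is retired and the XSCF notified only when the engine
	 * fires.  Permanent (sticky) CEs fall through and retire the
	 * page immediately.
	 */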
	if (strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
		CMD_STAT_BUMP(ce_interm);
		fmd_hdl_debug(hdl, "adding FJ-Intermittent event "
		    "to CE serd engine\n");

		if (dimm->dimm_case.cc_serdnm == NULL) {
			dimm->dimm_case.cc_serdnm =
			    cmd_mem_serdnm_create(hdl,
			    "dimm", dimm->dimm_unum);
			fmd_serd_create(hdl, dimm->dimm_case.cc_serdnm,
			    fmd_prop_get_int32(hdl, "ce_n"),
			    fmd_prop_get_int64(hdl, "ce_t"));
		}

		if (fmd_serd_record(hdl, dimm->dimm_case.cc_serdnm, ep) ==
		    FMD_B_FALSE) {
			return (CMD_EVD_OK); /* engine hasn't fired */
		}
		fmd_hdl_debug(hdl, "ce serd fired\n");
		fmd_case_add_serd(hdl, dimm->dimm_case.cc_cp,
		    dimm->dimm_case.cc_serdnm);
		fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm);

		(void) opl_scf_log(hdl, nvl);
	} else {
		CMD_STAT_BUMP(ce_sticky);
	}

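	/*
	 * Count the retirement against the DIMM, retire the page, and
	 * fault the DIMM if the permanent CE threshold has been reached.
	 */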
	dimm->dimm_nretired++;
	dimm->dimm_retstat.fmds_value.ui64++;
	cmd_dimm_dirty(hdl, dimm);

	cmd_page_fault(hdl, asru, fru, ep, pa);
	opl_ce_thresh_check(hdl, dimm);

	return (CMD_EVD_OK);
}

/*
 * This is the common entry point for processing MAC detected errors.
 * It is responsible for generating the memory page fault event.
 * The permanent CE (sticky) in normal mode is also handled here
 * in the same way as the UE case.
 */
/*ARGSUSED*/
cmd_evdisp_t
cmd_opl_mac_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class, cmd_errcl_t clcode)
{
	uint64_t pa;
	nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
	cmd_page_t *page;

	fmd_hdl_debug(hdl, "cmd_mac_common: clcode=%llx\n",
	    (u_longlong_t)clcode);

	if (nvlist_lookup_nvlist(nvl, MC_OPL_RESOURCE, &rsrc) != 0)
		return (CMD_EVD_BAD);

	if (nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0)
		return (CMD_EVD_BAD);

	/*
	 * Check for an invalid pa.
	 * The most significant bit must not be set; such an address
	 * would be outside the range of physical addresses possible
	 * from the MAC's view.
	 */
	if (((uint64_t)1 << 63) & pa)
		return (CMD_EVD_BAD);

	if ((page = cmd_page_lookup(pa)) != NULL &&
	    page->page_case.cc_cp != NULL &&
	    fmd_case_solved(hdl, page->page_case.cc_cp))
		return (CMD_EVD_REDUND);

	if (nvlist_dup(rsrc, &asru, 0) != 0) {
		fmd_hdl_debug(hdl, "cmd_opl_mac_common nvlist dup failed\n");
		return (CMD_EVD_BAD);
	}

	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
		fmd_hdl_debug(hdl, "cmd_opl_mac_common expand failed\n");
		nvlist_free(asru);
		CMD_STAT_BUMP(bad_mem_asru);
		return (CMD_EVD_BAD);
	}

	if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
		fmd_hdl_debug(hdl, "cmd_opl_mac_common fru_create failed\n");
		nvlist_free(asru);
		return (CMD_EVD_BAD);
	}

	/*
	 * process PCE and ICE to create DIMM fault
	 */
	if (strcmp(class, "ereport.asic.mac.mi-ce") == 0 ||
	    strcmp(class, "ereport.asic.mac.ptrl-ce") == 0 ||
	    strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
		cmd_evdisp_t ret;

		ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl);
		nvlist_free(asru);
		nvlist_free(fru);
		if (ret != CMD_EVD_OK) {
			fmd_hdl_debug(hdl,
			    "cmd_opl_mac_common: mac_ce failed\n");
			return (CMD_EVD_BAD);
		} else
			return (CMD_EVD_OK);
	}

	/* The following code handles page retires for UEs and CMPEs. */

	cmd_page_fault(hdl, asru, fru, ep, pa);
	nvlist_free(asru);
	nvlist_free(fru);
	return (CMD_EVD_OK);
}

/*
 * Common entry points for handling CPU/IO detected UE with
 * respect to EID=MEM.
 */
/*ARGSUSED*/
cmd_evdisp_t
cmd_opl_cpu_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class, cmd_errcl_t clcode)
{
	return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_CPU));
}

/*ARGSUSED*/
cmd_evdisp_t
cmd_opl_io_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class, cmd_errcl_t clcode)
{
	return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_IO));
}