1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28
29 /*
30 * OPL platform specific functions for
31 * CPU/Memory error diagnosis engine.
32 */
33 #include <cmd.h>
34 #include <cmd_dimm.h>
35 #include <cmd_bank.h>
36 #include <cmd_page.h>
37 #include <cmd_opl.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <fcntl.h>
41 #include <unistd.h>
42 #include <dirent.h>
43 #include <sys/stat.h>
44
45 #include <sys/fm/protocol.h>
46 #include <sys/fm/io/opl_mc_fm.h>
47 #include <sys/async.h>
48 #include <sys/opl_olympus_regs.h>
49 #include <sys/fm/cpu/SPARC64-VI.h>
50 #include <sys/int_const.h>
51 #include <sys/mutex.h>
52 #include <sys/dditypes.h>
53 #include <opl/sys/mc-opl.h>
54
55 /*
56 * The following is the common function for handling
57 * memory UE with EID=MEM.
58 * The error could be detected by either CPU/IO.
59 */
60 cmd_evdisp_t
opl_ue_mem(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,int hdlr_type)61 opl_ue_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
62 int hdlr_type)
63 {
64 nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
65 uint64_t ubc_ue_log_reg, pa;
66 cmd_page_t *page;
67
68 if (nvlist_lookup_nvlist(nvl,
69 FM_EREPORT_PAYLOAD_NAME_RESOURCE, &rsrc) != 0)
70 return (CMD_EVD_BAD);
71
72 switch (hdlr_type) {
73 case CMD_OPL_HDLR_CPU:
74
75 if (nvlist_lookup_uint64(nvl,
76 FM_EREPORT_PAYLOAD_NAME_SFAR, &pa) != 0)
77 return (CMD_EVD_BAD);
78
79 fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
80 (u_longlong_t)pa);
81 break;
82
83 case CMD_OPL_HDLR_IO:
84
85 if (nvlist_lookup_uint64(nvl, OBERON_UBC_MUE,
86 &ubc_ue_log_reg) != 0)
87 return (CMD_EVD_BAD);
88
89 pa = (ubc_ue_log_reg & UBC_UE_ADR_MASK);
90
91 fmd_hdl_debug(hdl, "cmd_ue_mem: ue_log_reg=%llx\n",
92 (u_longlong_t)ubc_ue_log_reg);
93 fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
94 (u_longlong_t)pa);
95 break;
96
97 default:
98
99 return (CMD_EVD_BAD);
100 }
101
102 if ((page = cmd_page_lookup(pa)) != NULL &&
103 page->page_case.cc_cp != NULL &&
104 fmd_case_solved(hdl, page->page_case.cc_cp))
105 return (CMD_EVD_REDUND);
106
107 if (nvlist_dup(rsrc, &asru, 0) != 0) {
108 fmd_hdl_debug(hdl, "opl_ue_mem nvlist dup failed\n");
109 return (CMD_EVD_BAD);
110 }
111
112 if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
113 nvlist_free(asru);
114 CMD_STAT_BUMP(bad_mem_asru);
115 return (CMD_EVD_BAD);
116 }
117
118 if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
119 nvlist_free(asru);
120 return (CMD_EVD_BAD);
121 }
122
123 cmd_page_fault(hdl, asru, fru, ep, pa);
124 nvlist_free(asru);
125 nvlist_free(fru);
126 return (CMD_EVD_OK);
127 }
128
129 /*
130 * The following is the main function to handle generating
131 * the sibling cpu suspect list for the CPU detected UE
132 * error cases. This is to handle the
133 * multiple strand/core architecture on the OPL platform.
134 */
135 cmd_evdisp_t
cmd_opl_ue_cpu(fmd_hdl_t * hdl,fmd_event_t * ep,const char * class,const char * fltname,cmd_ptrsubtype_t ptr,cmd_cpu_t * cpu,cmd_case_t * cc,uint8_t cpumask)136 cmd_opl_ue_cpu(fmd_hdl_t *hdl, fmd_event_t *ep,
137 const char *class, const char *fltname,
138 cmd_ptrsubtype_t ptr, cmd_cpu_t *cpu,
139 cmd_case_t *cc, uint8_t cpumask)
140 {
141 const char *uuid;
142 cmd_cpu_t *main_cpu, *sib_cpu;
143 nvlist_t *fmri;
144 cmd_list_t *cpu_list;
145 opl_cpu_t *opl_cpu;
146 uint32_t main_cpuid, nsusp = 1;
147 uint8_t cert;
148
149 fmd_hdl_debug(hdl,
150 "Enter OPL_CPUUE_HANDLER for class %x\n", class);
151
152 main_cpu = cpu;
153 main_cpuid = cpu->cpu_cpuid;
154
155 if (strcmp(fltname, "core") == 0)
156 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
157 IS_CORE);
158 else if (strcmp(fltname, "chip") == 0)
159 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
160 IS_CHIP);
161 else
162 cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
163 IS_STRAND);
164
165 for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
166 opl_cpu = cmd_list_next(opl_cpu)) {
167 if (opl_cpu->oc_cpuid == main_cpuid) {
168 sib_cpu = main_cpu;
169 opl_cpu->oc_cmd_cpu = main_cpu;
170 } else {
171 fmri = cmd_cpu_fmri_create(opl_cpu->oc_cpuid, cpumask);
172 if (fmri == NULL) {
173 opl_cpu->oc_cmd_cpu = NULL;
174 fmd_hdl_debug(hdl,
175 "missing asru, cpuid %u excluded\n",
176 opl_cpu->oc_cpuid);
177 continue;
178 }
179
180 sib_cpu = cmd_cpu_lookup(hdl, fmri, class,
181 CMD_CPU_LEVEL_THREAD);
182 if (sib_cpu == NULL || sib_cpu->cpu_faulting) {
183 if (fmri != NULL)
184 nvlist_free(fmri);
185 opl_cpu->oc_cmd_cpu = NULL;
186 fmd_hdl_debug(hdl,
187 "cpu not present, cpuid %u excluded\n",
188 opl_cpu->oc_cpuid);
189 continue;
190 }
191 opl_cpu->oc_cmd_cpu = sib_cpu;
192 if (fmri != NULL)
193 nvlist_free(fmri);
194 nsusp++;
195 }
196 if (cpu->cpu_cpuid == main_cpuid) {
197 if (cc->cc_cp != NULL &&
198 fmd_case_solved(hdl, cc->cc_cp)) {
199 if (cpu_list != NULL)
200 opl_cpulist_free(hdl, cpu_list);
201 return (CMD_EVD_REDUND);
202 }
203
204 if (cc->cc_cp == NULL)
205 cc->cc_cp = cmd_case_create(hdl,
206 &cpu->cpu_header, ptr, &uuid);
207
208 if (cc->cc_serdnm != NULL) {
209 fmd_hdl_debug(hdl,
210 "destroying existing %s state for class %x\n",
211 cc->cc_serdnm, class);
212 fmd_serd_destroy(hdl, cc->cc_serdnm);
213 fmd_hdl_strfree(hdl, cc->cc_serdnm);
214 cc->cc_serdnm = NULL;
215 fmd_case_reset(hdl, cc->cc_cp);
216 }
217 fmd_case_add_ereport(hdl, cc->cc_cp, ep);
218 }
219 }
220 cert = opl_avg(100, nsusp);
221 for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
222 opl_cpu = cmd_list_next(opl_cpu)) {
223 if (opl_cpu->oc_cmd_cpu != NULL) {
224 nvlist_t *cpu_rsrc;
225
226 cpu_rsrc = opl_cpursrc_create(hdl, opl_cpu->oc_cpuid);
227 if (cpu_rsrc == NULL) {
228 fmd_hdl_debug(hdl,
229 "missing rsrc, cpuid %u excluded\n",
230 opl_cpu->oc_cpuid);
231 continue;
232 }
233 cmd_cpu_create_faultlist(hdl, cc->cc_cp,
234 opl_cpu->oc_cmd_cpu, fltname, cpu_rsrc, cert);
235 nvlist_free(cpu_rsrc);
236 }
237 }
238 fmd_case_solve(hdl, cc->cc_cp);
239 if (cpu_list != NULL)
240 opl_cpulist_free(hdl, cpu_list);
241 return (CMD_EVD_OK);
242 }
243
244 /*
245 * Generates DIMM fault if the number of Permanent CE
246 * threshold is exceeded.
247 */
248 static void
opl_ce_thresh_check(fmd_hdl_t * hdl,cmd_dimm_t * dimm)249 opl_ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
250 {
251 nvlist_t *dflt;
252 fmd_case_t *cp;
253
254 fmd_hdl_debug(hdl,
255 "Permanent CE event threshold checking.\n");
256
257 if (dimm->dimm_flags & CMD_MEM_F_FAULTING) {
258 /* We've already complained about this DIMM */
259 return;
260 }
261
262 if (dimm->dimm_nretired >= fmd_prop_get_int32(hdl,
263 "max_perm_ce_dimm")) {
264 dimm->dimm_flags |= CMD_MEM_F_FAULTING;
265 cp = fmd_case_open(hdl, NULL);
266 dflt = cmd_dimm_create_fault(hdl, dimm, "fault.memory.dimm",
267 CMD_FLTMAXCONF);
268 fmd_case_add_suspect(hdl, cp, dflt);
269 fmd_case_solve(hdl, cp);
270 }
271 }
272
273 /*
274 * Notify fault page information (pa and errlog) to XSCF via mc-opl
275 */
276 #define MC_PHYDEV_DIR "/devices"
277 #define MC_PHYPREFIX "pseudo-mc@"
278 static int
opl_scf_log(fmd_hdl_t * hdl,nvlist_t * nvl)279 opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl)
280 {
281 uint32_t *eadd, *elog;
282 uint_t n;
283 uint64_t pa;
284 char path[MAXPATHLEN];
285 char *unum;
286 nvlist_t *rsrc;
287 DIR *mcdir;
288 struct dirent *dp;
289 mc_flt_page_t flt_page;
290 cmd_page_t *page;
291 struct stat statbuf;
292
293 /*
294 * Extract ereport.
295 * Sanity check of pa is already done at cmd_opl_mac_common().
296 * mc-opl sets only one entry for MC_OPL_ERR_ADD, MC_OPL_ERR_LOG,
297 * and MC_OPL_BANK.
298 */
299 if ((nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0) ||
300 (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &n) != 0) ||
301 (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &n) != 0)) {
302 fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n");
303 return (-1);
304 }
305 if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE,
306 &rsrc) != 0) {
307 fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n");
308 return (-1);
309 }
310 if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) {
311 fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n");
312 return (-1);
313 }
314
315 page = cmd_page_lookup(pa);
316 if (page != NULL && page->page_flags & CMD_MEM_F_FAULTING) {
317 /*
318 * fault.memory.page will not be created.
319 */
320 return (0);
321 }
322
323 flt_page.err_add = eadd[0];
324 flt_page.err_log = elog[0];
325 flt_page.fmri_addr = (uint64_t)(uint32_t)unum;
326 flt_page.fmri_sz = strlen(unum) + 1;
327
328 fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%d)\n",
329 unum, strlen(unum) + 1);
330 fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n",
331 pa, eadd[0], elog[0]);
332
333 if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) {
334 while ((dp = readdir(mcdir)) != NULL) {
335 int fd;
336
337 if (strncmp(dp->d_name, MC_PHYPREFIX,
338 strlen(MC_PHYPREFIX)) != 0)
339 continue;
340
341 (void) snprintf(path, sizeof (path),
342 "%s/%s", MC_PHYDEV_DIR, dp->d_name);
343
344 if (stat(path, &statbuf) != 0 ||
345 (statbuf.st_mode & S_IFCHR) == 0) {
346 /* skip if not a character device */
347 continue;
348 }
349
350 if ((fd = open(path, O_RDONLY)) < 0)
351 continue;
352
353 if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) {
354 fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n",
355 path);
356 (void) close(fd);
357 (void) closedir(mcdir);
358 return (0);
359 }
360 (void) close(fd);
361 }
362 (void) closedir(mcdir);
363 }
364
365 fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n");
366
367 return (-1);
368 }
369
370 /*
371 * This is the common function for processing MAC detected
372 * Intermittent and Permanent CEs.
373 */
374
375 cmd_evdisp_t
cmd_opl_mac_ce(fmd_hdl_t * hdl,fmd_event_t * ep,const char * class,nvlist_t * asru,nvlist_t * fru,uint64_t pa,nvlist_t * nvl)376 cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class,
377 nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl)
378 {
379 cmd_dimm_t *dimm;
380 const char *uuid;
381
382 fmd_hdl_debug(hdl,
383 "Processing CE ereport\n");
384
385 if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
386 (dimm = cmd_dimm_create(hdl, asru)) == NULL)
387 return (CMD_EVD_UNUSED);
388
389 if (dimm->dimm_case.cc_cp == NULL) {
390 dimm->dimm_case.cc_cp = cmd_case_create(hdl,
391 &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
392 }
393
394 if (strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
395 CMD_STAT_BUMP(ce_interm);
396 fmd_hdl_debug(hdl, "adding FJ-Intermittent event "
397 "to CE serd engine\n");
398
399 if (dimm->dimm_case.cc_serdnm == NULL) {
400 dimm->dimm_case.cc_serdnm =
401 cmd_mem_serdnm_create(hdl,
402 "dimm", dimm->dimm_unum);
403 fmd_serd_create(hdl, dimm->dimm_case.cc_serdnm,
404 fmd_prop_get_int32(hdl, "ce_n"),
405 fmd_prop_get_int64(hdl, "ce_t"));
406 }
407
408 if (fmd_serd_record(hdl, dimm->dimm_case.cc_serdnm, ep) ==
409 FMD_B_FALSE) {
410 return (CMD_EVD_OK); /* engine hasn't fired */
411 }
412 fmd_hdl_debug(hdl, "ce serd fired\n");
413 fmd_case_add_serd(hdl, dimm->dimm_case.cc_cp,
414 dimm->dimm_case.cc_serdnm);
415 fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm);
416
417 (void) opl_scf_log(hdl, nvl);
418 } else {
419 CMD_STAT_BUMP(ce_sticky);
420 }
421
422 dimm->dimm_nretired++;
423 dimm->dimm_retstat.fmds_value.ui64++;
424 cmd_dimm_dirty(hdl, dimm);
425
426 cmd_page_fault(hdl, asru, fru, ep, pa);
427 opl_ce_thresh_check(hdl, dimm);
428
429 return (CMD_EVD_OK);
430 }
431
432 /*
433 * This is the common entry for processing MAC detected errors.
434 * It is responsible for generating the memory page fault event.
435 * The permanent CE (sticky) in normal mode is handled here also
436 * in the same way as in the UE case.
437 */
438 /*ARGSUSED*/
439 cmd_evdisp_t
cmd_opl_mac_common(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)440 cmd_opl_mac_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
441 const char *class, cmd_errcl_t clcode)
442 {
443 uint64_t pa;
444 nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
445 cmd_page_t *page;
446
447 fmd_hdl_debug(hdl, "cmd_mac_common: clcode=%ll\n", clcode);
448
449 if (nvlist_lookup_nvlist(nvl, MC_OPL_RESOURCE, &rsrc) != 0)
450 return (CMD_EVD_BAD);
451
452 if (nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa)
453 != 0)
454 return (CMD_EVD_BAD);
455
456 /*
457 * Check for invalid pa.
458 * The most sig. bit should not be on.
459 * It would be out of the range of possible pa
460 * in MAC's view.
461 */
462 if (((uint64_t)1 << 63) & pa)
463 return (CMD_EVD_BAD);
464
465 if ((page = cmd_page_lookup(pa)) != NULL &&
466 page->page_case.cc_cp != NULL &&
467 fmd_case_solved(hdl, page->page_case.cc_cp))
468 return (CMD_EVD_REDUND);
469
470 if (nvlist_dup(rsrc, &asru, 0) != 0) {
471 fmd_hdl_debug(hdl, "cmd_opl_mac_common nvlist dup failed\n");
472 return (CMD_EVD_BAD);
473 }
474
475 if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
476 fmd_hdl_debug(hdl, "cmd_opl_mac_common expand failed\n");
477 nvlist_free(asru);
478 CMD_STAT_BUMP(bad_mem_asru);
479 return (CMD_EVD_BAD);
480 }
481
482 if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
483 fmd_hdl_debug(hdl, "cmd_opl_mac_common fru_create failed\n");
484 nvlist_free(asru);
485 return (CMD_EVD_BAD);
486 }
487
488 /*
489 * process PCE and ICE to create DIMM fault
490 */
491 if (strcmp(class, "ereport.asic.mac.mi-ce") == 0 ||
492 strcmp(class, "ereport.asic.mac.ptrl-ce") == 0 ||
493 strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
494 cmd_evdisp_t ret;
495
496 ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl);
497 nvlist_free(asru);
498 nvlist_free(fru);
499 if (ret != CMD_EVD_OK) {
500 fmd_hdl_debug(hdl,
501 "cmd_opl_mac_common: mac_ce failed\n");
502 return (CMD_EVD_BAD);
503 } else
504 return (CMD_EVD_OK);
505 }
506
507 /* The following code handles page retires for UEs and CMPEs. */
508
509 cmd_page_fault(hdl, asru, fru, ep, pa);
510 nvlist_free(asru);
511 nvlist_free(fru);
512 return (CMD_EVD_OK);
513 }
514
515 /*
516 * Common entry points for handling CPU/IO detected UE with
517 * respect to EID=MEM.
518 */
519 /*ARGSUSED*/
520 cmd_evdisp_t
cmd_opl_cpu_mem(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)521 cmd_opl_cpu_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
522 const char *class, cmd_errcl_t clcode)
523 {
524 return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_CPU));
525 }
526
527 /*ARGSUSED*/
528 cmd_evdisp_t
cmd_opl_io_mem(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,cmd_errcl_t clcode)529 cmd_opl_io_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
530 const char *class, cmd_errcl_t clcode)
531 {
532 return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_IO));
533 }
534