xref: /titanic_52/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_oplerr.c (revision aab83bb83be7342f6cfccaed8d5fe0b2f404855d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * OPL platform specific functions for
29  * CPU/Memory error diagnosis engine.
30  */
31 #include <cmd.h>
32 #include <cmd_dimm.h>
33 #include <cmd_bank.h>
34 #include <cmd_page.h>
35 #include <cmd_opl.h>
36 #include <string.h>
37 #include <errno.h>
38 #include <fcntl.h>
39 #include <unistd.h>
40 #include <dirent.h>
41 #include <sys/stat.h>
42 
43 #include <sys/fm/protocol.h>
44 #include <sys/fm/io/opl_mc_fm.h>
45 #include <sys/async.h>
46 #include <sys/opl_olympus_regs.h>
47 #include <sys/fm/cpu/SPARC64-VI.h>
48 #include <sys/int_const.h>
49 #include <sys/mutex.h>
50 #include <sys/dditypes.h>
51 #include <opl/sys/mc-opl.h>
52 
53 /*
54  * The following is the common function for handling
55  * memory UE with EID=MEM.
56  * The error could be detected by either CPU/IO.
57  */
58 cmd_evdisp_t
59 opl_ue_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
60     int hdlr_type)
61 {
62 	nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
63 	uint64_t ubc_ue_log_reg, pa;
64 	cmd_page_t *page;
65 
66 	if (nvlist_lookup_nvlist(nvl,
67 	    FM_EREPORT_PAYLOAD_NAME_RESOURCE, &rsrc) != 0)
68 		return (CMD_EVD_BAD);
69 
70 	switch (hdlr_type) {
71 	case CMD_OPL_HDLR_CPU:
72 
73 		if (nvlist_lookup_uint64(nvl,
74 		    FM_EREPORT_PAYLOAD_NAME_SFAR, &pa) != 0)
75 			return (CMD_EVD_BAD);
76 
77 		fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
78 		    (u_longlong_t)pa);
79 		break;
80 
81 	case CMD_OPL_HDLR_IO:
82 
83 		if (nvlist_lookup_uint64(nvl, OBERON_UBC_MUE,
84 		    &ubc_ue_log_reg) != 0)
85 			return (CMD_EVD_BAD);
86 
87 		pa = (ubc_ue_log_reg & UBC_UE_ADR_MASK);
88 
89 		fmd_hdl_debug(hdl, "cmd_ue_mem: ue_log_reg=%llx\n",
90 		    (u_longlong_t)ubc_ue_log_reg);
91 		fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
92 		    (u_longlong_t)pa);
93 		break;
94 
95 	default:
96 
97 		return (CMD_EVD_BAD);
98 	}
99 
100 	if ((page = cmd_page_lookup(pa)) != NULL &&
101 	    page->page_case.cc_cp != NULL &&
102 	    fmd_case_solved(hdl, page->page_case.cc_cp))
103 		return (CMD_EVD_REDUND);
104 
105 	if (nvlist_dup(rsrc, &asru, 0) != 0) {
106 		fmd_hdl_debug(hdl, "opl_ue_mem nvlist dup failed\n");
107 		return (CMD_EVD_BAD);
108 	}
109 
110 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
111 		nvlist_free(asru);
112 		CMD_STAT_BUMP(bad_mem_asru);
113 		return (CMD_EVD_BAD);
114 	}
115 
116 	if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
117 		nvlist_free(asru);
118 		return (CMD_EVD_BAD);
119 	}
120 
121 	cmd_page_fault(hdl, asru, fru, ep, pa);
122 	nvlist_free(asru);
123 	nvlist_free(fru);
124 	return (CMD_EVD_OK);
125 }
126 
127 /*
128  * The following is the main function to handle generating
129  * the sibling cpu suspect list for the CPU detected UE
130  * error cases.  This is to handle the
131  * multiple strand/core architecture on the OPL platform.
132  */
133 cmd_evdisp_t
134 cmd_opl_ue_cpu(fmd_hdl_t *hdl, fmd_event_t *ep,
135     const char *class, const char *fltname,
136     cmd_ptrsubtype_t ptr, cmd_cpu_t *cpu,
137     cmd_case_t *cc, uint8_t cpumask)
138 {
139 	const char *uuid;
140 	cmd_cpu_t *main_cpu, *sib_cpu;
141 	nvlist_t *fmri;
142 	cmd_list_t *cpu_list;
143 	opl_cpu_t *opl_cpu;
144 	uint32_t main_cpuid, nsusp = 1;
145 	uint8_t cert;
146 
147 	fmd_hdl_debug(hdl,
148 	    "Enter OPL_CPUUE_HANDLER for class %x\n", class);
149 
150 	main_cpu = cpu;
151 	main_cpuid = cpu->cpu_cpuid;
152 
153 	if (strcmp(fltname, "core") == 0)
154 		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
155 		    IS_CORE);
156 	else if (strcmp(fltname, "chip") == 0)
157 		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
158 		    IS_CHIP);
159 	else
160 		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
161 		    IS_STRAND);
162 
163 	for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
164 	    opl_cpu = cmd_list_next(opl_cpu)) {
165 		if (opl_cpu->oc_cpuid == main_cpuid) {
166 			sib_cpu = main_cpu;
167 			opl_cpu->oc_cmd_cpu = main_cpu;
168 		} else {
169 			fmri = cmd_cpu_fmri_create(opl_cpu->oc_cpuid, cpumask);
170 			if (fmri == NULL) {
171 				opl_cpu->oc_cmd_cpu = NULL;
172 				fmd_hdl_debug(hdl,
173 				    "missing asru, cpuid %u excluded\n",
174 				    opl_cpu->oc_cpuid);
175 				continue;
176 			}
177 
178 			sib_cpu = cmd_cpu_lookup(hdl, fmri, class,
179 			    CMD_CPU_LEVEL_THREAD);
180 			if (sib_cpu == NULL || sib_cpu->cpu_faulting) {
181 				nvlist_free(fmri);
182 				opl_cpu->oc_cmd_cpu = NULL;
183 				fmd_hdl_debug(hdl,
184 				"cpu not present, cpuid %u excluded\n",
185 				    opl_cpu->oc_cpuid);
186 				continue;
187 			}
188 			opl_cpu->oc_cmd_cpu = sib_cpu;
189 			nvlist_free(fmri);
190 			nsusp++;
191 		}
192 		if (cpu->cpu_cpuid == main_cpuid) {
193 			if (cc->cc_cp != NULL &&
194 			    fmd_case_solved(hdl, cc->cc_cp)) {
195 				if (cpu_list != NULL)
196 					opl_cpulist_free(hdl, cpu_list);
197 				return (CMD_EVD_REDUND);
198 			}
199 
200 			if (cc->cc_cp == NULL)
201 				cc->cc_cp = cmd_case_create(hdl,
202 				    &cpu->cpu_header, ptr, &uuid);
203 
204 			if (cc->cc_serdnm != NULL) {
205 				fmd_hdl_debug(hdl,
206 			"destroying existing %s state for class %x\n",
207 				    cc->cc_serdnm, class);
208 				fmd_serd_destroy(hdl, cc->cc_serdnm);
209 				fmd_hdl_strfree(hdl, cc->cc_serdnm);
210 				cc->cc_serdnm = NULL;
211 				fmd_case_reset(hdl, cc->cc_cp);
212 			}
213 			fmd_case_add_ereport(hdl, cc->cc_cp, ep);
214 		}
215 	}
216 	cert = opl_avg(100, nsusp);
217 	for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
218 	    opl_cpu = cmd_list_next(opl_cpu)) {
219 		if (opl_cpu->oc_cmd_cpu != NULL) {
220 			nvlist_t *cpu_rsrc;
221 
222 			cpu_rsrc = opl_cpursrc_create(hdl, opl_cpu->oc_cpuid);
223 			if (cpu_rsrc == NULL) {
224 				fmd_hdl_debug(hdl,
225 				"missing rsrc, cpuid %u excluded\n",
226 				    opl_cpu->oc_cpuid);
227 				continue;
228 			}
229 			cmd_cpu_create_faultlist(hdl, cc->cc_cp,
230 			    opl_cpu->oc_cmd_cpu, fltname, cpu_rsrc, cert);
231 			nvlist_free(cpu_rsrc);
232 		}
233 	}
234 	fmd_case_solve(hdl, cc->cc_cp);
235 	if (cpu_list != NULL)
236 		opl_cpulist_free(hdl, cpu_list);
237 	return (CMD_EVD_OK);
238 }
239 
240 /*
241  * Generates DIMM fault if the number of Permanent CE
242  * threshold is exceeded.
243  */
244 static void
245 opl_ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
246 {
247 	nvlist_t *dflt;
248 	fmd_case_t *cp;
249 
250 	fmd_hdl_debug(hdl,
251 	    "Permanent CE event threshold checking.\n");
252 
253 	if (dimm->dimm_flags & CMD_MEM_F_FAULTING) {
254 		/* We've already complained about this DIMM */
255 		return;
256 	}
257 
258 	if (dimm->dimm_nretired >= fmd_prop_get_int32(hdl,
259 	    "max_perm_ce_dimm")) {
260 		dimm->dimm_flags |= CMD_MEM_F_FAULTING;
261 		cp = fmd_case_open(hdl, NULL);
262 		dflt = cmd_dimm_create_fault(hdl, dimm, "fault.memory.dimm",
263 		    CMD_FLTMAXCONF);
264 		fmd_case_add_suspect(hdl, cp, dflt);
265 		fmd_case_solve(hdl, cp);
266 	}
267 }
268 
269 /*
270  * Notify fault page information (pa and errlog) to XSCF via mc-opl
271  */
272 #define	MC_PHYDEV_DIR	"/devices"
273 #define	MC_PHYPREFIX	"pseudo-mc@"
274 static int
275 opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl)
276 {
277 	uint32_t *eadd, *elog;
278 	uint_t n;
279 	uint64_t pa;
280 	char path[MAXPATHLEN];
281 	char *unum;
282 	nvlist_t *rsrc;
283 	DIR *mcdir;
284 	struct dirent *dp;
285 	mc_flt_page_t flt_page;
286 	cmd_page_t *page;
287 	struct stat statbuf;
288 
289 	/*
290 	 * Extract ereport.
291 	 * Sanity check of pa is already done at cmd_opl_mac_common().
292 	 * mc-opl sets only one entry for MC_OPL_ERR_ADD, MC_OPL_ERR_LOG,
293 	 * and MC_OPL_BANK.
294 	 */
295 	if ((nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0) ||
296 	    (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &n) != 0) ||
297 	    (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &n) != 0)) {
298 		fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n");
299 		return (-1);
300 	}
301 	if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE,
302 	    &rsrc) != 0) {
303 		fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n");
304 		return (-1);
305 	}
306 	if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) {
307 		fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n");
308 		return (-1);
309 	}
310 
311 	page = cmd_page_lookup(pa);
312 	if (page != NULL && page->page_flags & CMD_MEM_F_FAULTING) {
313 		/*
314 		 * fault.memory.page will not be created.
315 		 */
316 		return (0);
317 	}
318 
319 	flt_page.err_add = eadd[0];
320 	flt_page.err_log = elog[0];
321 	flt_page.fmri_addr = (uint64_t)(uint32_t)unum;
322 	flt_page.fmri_sz = strlen(unum) + 1;
323 
324 	fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%d)\n",
325 	    unum, strlen(unum) + 1);
326 	fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n",
327 	    pa, eadd[0], elog[0]);
328 
329 	if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) {
330 		while ((dp = readdir(mcdir)) != NULL) {
331 			int fd;
332 
333 			if (strncmp(dp->d_name, MC_PHYPREFIX,
334 			    strlen(MC_PHYPREFIX)) != 0)
335 				continue;
336 
337 			(void) snprintf(path, sizeof (path),
338 			    "%s/%s", MC_PHYDEV_DIR, dp->d_name);
339 
340 			if (stat(path, &statbuf) != 0 ||
341 			    (statbuf.st_mode & S_IFCHR) == 0) {
342 				/* skip if not a character device */
343 				continue;
344 			}
345 
346 			if ((fd = open(path, O_RDONLY)) < 0)
347 				continue;
348 
349 			if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) {
350 				fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n",
351 				    path);
352 				(void) close(fd);
353 				(void) closedir(mcdir);
354 				return (0);
355 			}
356 			(void) close(fd);
357 		}
358 		(void) closedir(mcdir);
359 	}
360 
361 	fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n");
362 
363 	return (-1);
364 }
365 
366 /*
367  * This is the common function for processing MAC detected
368  * Intermittent and Permanent CEs.
369  */
370 
371 cmd_evdisp_t
372 cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class,
373     nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl)
374 {
375 	cmd_dimm_t *dimm;
376 	const char *uuid;
377 
378 	fmd_hdl_debug(hdl,
379 	    "Processing CE ereport\n");
380 
381 	if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
382 	    (dimm = cmd_dimm_create(hdl, asru)) == NULL)
383 		return (CMD_EVD_UNUSED);
384 
385 	if (dimm->dimm_case.cc_cp == NULL) {
386 		dimm->dimm_case.cc_cp = cmd_case_create(hdl,
387 		    &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
388 	}
389 
390 	if (strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
391 		CMD_STAT_BUMP(ce_interm);
392 		fmd_hdl_debug(hdl, "adding FJ-Intermittent event "
393 		    "to CE serd engine\n");
394 
395 		if (dimm->dimm_case.cc_serdnm == NULL) {
396 			dimm->dimm_case.cc_serdnm =
397 			    cmd_mem_serdnm_create(hdl,
398 			    "dimm", dimm->dimm_unum);
399 			fmd_serd_create(hdl, dimm->dimm_case.cc_serdnm,
400 			    fmd_prop_get_int32(hdl, "ce_n"),
401 			    fmd_prop_get_int64(hdl, "ce_t"));
402 		}
403 
404 		if (fmd_serd_record(hdl, dimm->dimm_case.cc_serdnm, ep) ==
405 		    FMD_B_FALSE) {
406 			return (CMD_EVD_OK); /* engine hasn't fired */
407 		}
408 		fmd_hdl_debug(hdl, "ce serd fired\n");
409 		fmd_case_add_serd(hdl, dimm->dimm_case.cc_cp,
410 		    dimm->dimm_case.cc_serdnm);
411 		fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm);
412 
413 		(void) opl_scf_log(hdl, nvl);
414 	} else {
415 		CMD_STAT_BUMP(ce_sticky);
416 	}
417 
418 	dimm->dimm_nretired++;
419 	dimm->dimm_retstat.fmds_value.ui64++;
420 	cmd_dimm_dirty(hdl, dimm);
421 
422 	cmd_page_fault(hdl, asru, fru, ep, pa);
423 	opl_ce_thresh_check(hdl, dimm);
424 
425 	return (CMD_EVD_OK);
426 }
427 
428 /*
429  * This is the common entry for processing MAC detected errors.
430  * It is responsible for generating the memory page fault event.
431  * The permanent CE (sticky) in normal mode is handled here also
432  * in the same way as in the UE case.
433  */
434 /*ARGSUSED*/
435 cmd_evdisp_t
436 cmd_opl_mac_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
437     const char *class, cmd_errcl_t clcode)
438 {
439 	uint64_t pa;
440 	nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
441 	cmd_page_t *page;
442 
443 	fmd_hdl_debug(hdl, "cmd_mac_common: clcode=%ll\n", clcode);
444 
445 	if (nvlist_lookup_nvlist(nvl, MC_OPL_RESOURCE, &rsrc) != 0)
446 		return (CMD_EVD_BAD);
447 
448 	if (nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa)
449 	    != 0)
450 		return (CMD_EVD_BAD);
451 
452 	/*
453 	 * Check for invalid pa.
454 	 * The most sig. bit should not be on.
455 	 * It would be out of the range of possible pa
456 	 * in MAC's view.
457 	 */
458 	if (((uint64_t)1 << 63) & pa)
459 		return (CMD_EVD_BAD);
460 
461 	if ((page = cmd_page_lookup(pa)) != NULL &&
462 	    page->page_case.cc_cp != NULL &&
463 	    fmd_case_solved(hdl, page->page_case.cc_cp))
464 		return (CMD_EVD_REDUND);
465 
466 	if (nvlist_dup(rsrc, &asru, 0) != 0) {
467 		fmd_hdl_debug(hdl, "cmd_opl_mac_common nvlist dup failed\n");
468 		return (CMD_EVD_BAD);
469 	}
470 
471 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
472 		fmd_hdl_debug(hdl, "cmd_opl_mac_common expand failed\n");
473 		nvlist_free(asru);
474 		CMD_STAT_BUMP(bad_mem_asru);
475 		return (CMD_EVD_BAD);
476 	}
477 
478 	if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
479 		fmd_hdl_debug(hdl, "cmd_opl_mac_common fru_create failed\n");
480 		nvlist_free(asru);
481 		return (CMD_EVD_BAD);
482 	}
483 
484 	/*
485 	 * process PCE and ICE to create DIMM fault
486 	 */
487 	if (strcmp(class, "ereport.asic.mac.mi-ce") == 0 ||
488 	    strcmp(class, "ereport.asic.mac.ptrl-ce") == 0 ||
489 	    strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
490 		cmd_evdisp_t ret;
491 
492 		ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl);
493 		nvlist_free(asru);
494 		nvlist_free(fru);
495 		if (ret != CMD_EVD_OK) {
496 			fmd_hdl_debug(hdl,
497 			    "cmd_opl_mac_common: mac_ce failed\n");
498 			return (CMD_EVD_BAD);
499 		} else
500 			return (CMD_EVD_OK);
501 	}
502 
503 	/* The following code handles page retires for UEs and CMPEs.  */
504 
505 	cmd_page_fault(hdl, asru, fru, ep, pa);
506 	nvlist_free(asru);
507 	nvlist_free(fru);
508 	return (CMD_EVD_OK);
509 }
510 
511 /*
512  * Common entry points for handling CPU/IO detected UE with
513  * respect to EID=MEM.
514  */
/*ARGSUSED*/
cmd_evdisp_t
cmd_opl_cpu_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class, cmd_errcl_t clcode)
{
	/* CPU-detected memory UE (EID=MEM): defer to the common handler. */
	return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_CPU));
}
522 
/*ARGSUSED*/
cmd_evdisp_t
cmd_opl_io_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
    const char *class, cmd_errcl_t clcode)
{
	/* IO-detected memory UE (EID=MEM): defer to the common handler. */
	return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_IO));
}
530