xref: /titanic_41/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_oplerr.c (revision 0b240fcdeb4772e65fed050aee3e3dc63308ae72)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * OPL platform-specific functions for the
31  * CPU/Memory error diagnosis engine.
32  */
33 #include <cmd.h>
34 #include <cmd_dimm.h>
35 #include <cmd_bank.h>
36 #include <cmd_page.h>
37 #include <cmd_opl.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <fcntl.h>
41 #include <unistd.h>
42 #include <dirent.h>
43 #include <sys/stat.h>
44 
45 #include <sys/fm/protocol.h>
46 #include <sys/fm/io/opl_mc_fm.h>
47 #include <sys/async.h>
48 #include <sys/opl_olympus_regs.h>
49 #include <sys/fm/cpu/SPARC64-VI.h>
50 #include <sys/int_const.h>
51 #include <sys/mutex.h>
52 #include <sys/dditypes.h>
53 #include <opl/sys/mc-opl.h>
54 
55 /*
56  * The following is the common function for handling
57  * a memory UE with EID=MEM.
58  * The error may be detected by either the CPU or I/O.
59  */
60 cmd_evdisp_t
61 opl_ue_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
62     int hdlr_type)
63 {
64 	nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
65 	uint64_t ubc_ue_log_reg, pa;
66 	cmd_page_t *page;
67 
68 	if (nvlist_lookup_nvlist(nvl,
69 	    FM_EREPORT_PAYLOAD_NAME_RESOURCE, &rsrc) != 0)
70 		return (CMD_EVD_BAD);
71 
72 	switch (hdlr_type) {
73 	case CMD_OPL_HDLR_CPU:
74 
75 		if (nvlist_lookup_uint64(nvl,
76 		    FM_EREPORT_PAYLOAD_NAME_SFAR, &pa) != 0)
77 			return (CMD_EVD_BAD);
78 
79 		fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
80 		    (u_longlong_t)pa);
81 		break;
82 
83 	case CMD_OPL_HDLR_IO:
84 
85 		if (nvlist_lookup_uint64(nvl, OBERON_UBC_MUE,
86 		    &ubc_ue_log_reg) != 0)
87 			return (CMD_EVD_BAD);
88 
89 		pa = (ubc_ue_log_reg & UBC_UE_ADR_MASK);
90 
91 		fmd_hdl_debug(hdl, "cmd_ue_mem: ue_log_reg=%llx\n",
92 		    (u_longlong_t)ubc_ue_log_reg);
93 		fmd_hdl_debug(hdl, "cmd_ue_mem: pa=%llx\n",
94 		    (u_longlong_t)pa);
95 		break;
96 
97 	default:
98 
99 		return (CMD_EVD_BAD);
100 	}
101 
102 	if ((page = cmd_page_lookup(pa)) != NULL &&
103 	    page->page_case.cc_cp != NULL &&
104 	    fmd_case_solved(hdl, page->page_case.cc_cp))
105 		return (CMD_EVD_REDUND);
106 
107 	if (nvlist_dup(rsrc, &asru, 0) != 0) {
108 		fmd_hdl_debug(hdl, "opl_ue_mem nvlist dup failed\n");
109 		return (CMD_EVD_BAD);
110 	}
111 
112 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
113 		nvlist_free(asru);
114 		CMD_STAT_BUMP(bad_mem_asru);
115 		return (CMD_EVD_BAD);
116 	}
117 
118 	if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
119 		nvlist_free(asru);
120 		return (CMD_EVD_BAD);
121 	}
122 
123 	cmd_page_fault(hdl, asru, fru, ep, pa);
124 	nvlist_free(asru);
125 	nvlist_free(fru);
126 	return (CMD_EVD_OK);
127 }
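/*
 * Editorial sketch (not part of the original source): a minimal
 * CPU-detected payload that opl_ue_mem() accepts could be assembled
 * with libnvpair roughly as follows; the resource FMRI contents are
 * elided here and would normally come straight from the ereport.
 *
 *	nvlist_t *nvl, *rsrc;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_alloc(&rsrc, NV_UNIQUE_NAME, 0);
 *	... populate rsrc with the memory FMRI ...
 *	(void) nvlist_add_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE, rsrc);
 *	(void) nvlist_add_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_SFAR, pa);
 *	disp = opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_CPU);
 *
 * The IO-detected variant carries OBERON_UBC_MUE instead of the SFAR.
 */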
128 
129 /*
130  * The following is the main function for generating
131  * the sibling CPU suspect list for a CPU-detected UE.
132  * It accounts for the multiple strand/core architecture
133  * of the OPL platform.
134  */
135 cmd_evdisp_t
136 cmd_opl_ue_cpu(fmd_hdl_t *hdl, fmd_event_t *ep,
137     const char *class, const char *fltname,
138     cmd_ptrsubtype_t ptr, cmd_cpu_t *cpu,
139     cmd_case_t *cc, uint8_t cpumask)
140 {
141 	const char *uuid;
142 	cmd_cpu_t *main_cpu, *sib_cpu;
143 	nvlist_t *fmri;
144 	cmd_list_t *cpu_list;
145 	opl_cpu_t *opl_cpu;
146 	uint32_t main_cpuid, nsusp = 1;
147 	uint8_t cert;
148 
149 	fmd_hdl_debug(hdl,
150 	    "Enter OPL_CPUUE_HANDLER for class %s\n", class);
151 
152 	main_cpu = cpu;
153 	main_cpuid = cpu->cpu_cpuid;
154 
155 	if (strcmp(fltname, "core") == 0)
156 		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
157 		    IS_CORE);
158 	else if (strcmp(fltname, "chip") == 0)
159 		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
160 		    IS_CHIP);
161 	else
162 		cpu_list = opl_cpulist_insert(hdl, cpu->cpu_cpuid,
163 		    IS_STRAND);
164 
165 	for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
166 	    opl_cpu = cmd_list_next(opl_cpu)) {
167 		if (opl_cpu->oc_cpuid == main_cpuid) {
168 			sib_cpu = main_cpu;
169 			opl_cpu->oc_cmd_cpu = main_cpu;
170 		} else {
171 			fmri = cmd_cpu_fmri_create(opl_cpu->oc_cpuid, cpumask);
172 			if (fmri == NULL) {
173 				opl_cpu->oc_cmd_cpu = NULL;
174 				fmd_hdl_debug(hdl,
175 				    "missing asru, cpuid %u excluded\n",
176 				    opl_cpu->oc_cpuid);
177 				continue;
178 			}
179 
180 			sib_cpu = cmd_cpu_lookup(hdl, fmri, class,
181 			    CMD_CPU_LEVEL_THREAD);
182 			if (sib_cpu == NULL || sib_cpu->cpu_faulting) {
183 				if (fmri != NULL)
184 					nvlist_free(fmri);
185 				opl_cpu->oc_cmd_cpu = NULL;
186 				fmd_hdl_debug(hdl,
187 				"cpu not present, cpuid %u excluded\n",
188 				    opl_cpu->oc_cpuid);
189 				continue;
190 			}
191 			opl_cpu->oc_cmd_cpu = sib_cpu;
192 			if (fmri != NULL)
193 				nvlist_free(fmri);
194 			nsusp++;
195 		}
196 		if (cpu->cpu_cpuid == main_cpuid) {
197 			if (cc->cc_cp != NULL &&
198 			    fmd_case_solved(hdl, cc->cc_cp)) {
199 				if (cpu_list != NULL)
200 					opl_cpulist_free(hdl, cpu_list);
201 				return (CMD_EVD_REDUND);
202 			}
203 
204 			if (cc->cc_cp == NULL)
205 				cc->cc_cp = cmd_case_create(hdl,
206 				    &cpu->cpu_header, ptr, &uuid);
207 
208 			if (cc->cc_serdnm != NULL) {
209 				fmd_hdl_debug(hdl,
210 			"destroying existing %s state for class %s\n",
211 				    cc->cc_serdnm, class);
212 				fmd_serd_destroy(hdl, cc->cc_serdnm);
213 				fmd_hdl_strfree(hdl, cc->cc_serdnm);
214 				cc->cc_serdnm = NULL;
215 				fmd_case_reset(hdl, cc->cc_cp);
216 			}
217 			fmd_case_add_ereport(hdl, cc->cc_cp, ep);
218 		}
219 	}
220 	cert = opl_avg(100, nsusp);
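	/*
	 * Editorial note (not in the original source): opl_avg() is
	 * assumed to split the total certainty evenly among the
	 * suspects, e.g. a 4-entry suspect list yields a certainty
	 * of 25 for each fault created below.
	 */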
221 	for (opl_cpu = cmd_list_next(cpu_list); opl_cpu != NULL;
222 	    opl_cpu = cmd_list_next(opl_cpu)) {
223 		if (opl_cpu->oc_cmd_cpu != NULL) {
224 			nvlist_t *cpu_rsrc;
225 
226 			cpu_rsrc = opl_cpursrc_create(hdl, opl_cpu->oc_cpuid);
227 			if (cpu_rsrc == NULL) {
228 				fmd_hdl_debug(hdl,
229 				"missing rsrc, cpuid %u excluded\n",
230 				    opl_cpu->oc_cpuid);
231 				continue;
232 			}
233 			cmd_cpu_create_faultlist(hdl, cc->cc_cp,
234 			    opl_cpu->oc_cmd_cpu, fltname, cpu_rsrc, cert);
235 			nvlist_free(cpu_rsrc);
236 		}
237 	}
238 	fmd_case_solve(hdl, cc->cc_cp);
239 	if (cpu_list != NULL)
240 		opl_cpulist_free(hdl, cpu_list);
241 	return (CMD_EVD_OK);
242 }
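/*
 * Illustrative usage (editorial): the fault name selects the suspect
 * scope; "core" and "chip" presumably expand to all sibling strands of
 * the core or chip, while anything else degenerates to the single
 * detecting strand.  A caller might look roughly like the following,
 * where ptr_subtype and uecase are placeholders for the caller's real
 * case bookkeeping:
 *
 *	disp = cmd_opl_ue_cpu(hdl, ep, class, "core",
 *	    ptr_subtype, cpu, uecase, cpumask);
 */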
243 
244 /*
245  * Generates a DIMM fault if the number of permanent CEs
246  * exceeds the threshold.
247  */
248 static void
249 opl_ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
250 {
251 	nvlist_t *dflt;
252 	fmd_case_t *cp;
253 
254 	fmd_hdl_debug(hdl,
255 	    "Permanent CE event threshold checking.\n");
256 
257 	if (dimm->dimm_flags & CMD_MEM_F_FAULTING) {
258 		/* We've already complained about this DIMM */
259 		return;
260 	}
261 
262 	if (dimm->dimm_nretired >= fmd_prop_get_int32(hdl,
263 	    "max_perm_ce_dimm")) {
264 		dimm->dimm_flags |= CMD_MEM_F_FAULTING;
265 		cp = fmd_case_open(hdl, NULL);
266 		dflt = cmd_dimm_create_fault(hdl, dimm, "fault.memory.dimm",
267 		    CMD_FLTMAXCONF);
268 		fmd_case_add_suspect(hdl, cp, dflt);
269 		fmd_case_solve(hdl, cp);
270 	}
271 }
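/*
 * Editorial note: "max_perm_ce_dimm" is read with fmd_prop_get_int32(),
 * so the retire threshold can be tuned from the module's .conf file,
 * e.g. with a hypothetical override such as:
 *
 *	setprop max_perm_ce_dimm 128
 *
 * The default presumably lives in the module's fmd_prop_t table.
 */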
272 
273 /*
274  * Notify the XSCF of fault page information (pa and errlog) via mc-opl
275  */
276 #define	MC_PHYDEV_DIR	"/devices"
277 #define	MC_PHYPREFIX	"pseudo-mc@"
278 static int
279 opl_scf_log(fmd_hdl_t *hdl, nvlist_t *nvl)
280 {
281 	uint32_t *eadd, *elog;
282 	uint_t n;
283 	uint64_t pa;
284 	char path[MAXPATHLEN];
285 	char *unum;
286 	nvlist_t *rsrc;
287 	DIR *mcdir;
288 	struct dirent *dp;
289 	mc_flt_page_t flt_page;
290 	cmd_page_t *page;
291 	struct stat statbuf;
292 
293 	/*
294 	 * Extract ereport.
295 	 * Sanity check of pa is already done at cmd_opl_mac_common().
296 	 * mc-opl sets only one entry for MC_OPL_ERR_ADD, MC_OPL_ERR_LOG,
297 	 * and MC_OPL_BANK.
298 	 */
299 	if ((nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa) != 0) ||
300 	    (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_ADD, &eadd, &n) != 0) ||
301 	    (nvlist_lookup_uint32_array(nvl, MC_OPL_ERR_LOG, &elog, &n) != 0)) {
302 		fmd_hdl_debug(hdl, "opl_scf_log failed to extract ereport.\n");
303 		return (-1);
304 	}
305 	if (nvlist_lookup_nvlist(nvl, FM_EREPORT_PAYLOAD_NAME_RESOURCE,
306 	    &rsrc) != 0) {
307 		fmd_hdl_debug(hdl, "opl_scf_log failed to get resource.\n");
308 		return (-1);
309 	}
310 	if (nvlist_lookup_string(rsrc, FM_FMRI_MEM_UNUM, &unum) != 0) {
311 		fmd_hdl_debug(hdl, "opl_scf_log failed to get unum.\n");
312 		return (-1);
313 	}
314 
315 	page = cmd_page_lookup(pa);
316 	if (page != NULL && page->page_flags & CMD_MEM_F_FAULTING) {
317 		/*
318 		 * fault.memory.page will not be created.
319 		 */
320 		return (0);
321 	}
322 
323 	flt_page.err_add = eadd[0];
324 	flt_page.err_log = elog[0];
325 	flt_page.fmri_addr = (uint64_t)(uintptr_t)unum;
326 	flt_page.fmri_sz = strlen(unum) + 1;
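	/*
	 * Editorial note: fmri_addr/fmri_sz pass the DIMM unum string to
	 * the mc-opl driver by address and length, so it can presumably
	 * be forwarded to the XSCF together with the error address and
	 * log register values above.
	 */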
327 
328 	fmd_hdl_debug(hdl, "opl_scf_log DIMM: %s (%d)\n",
329 	    unum, strlen(unum) + 1);
330 	fmd_hdl_debug(hdl, "opl_scf_log pa:%llx add:%x log:%x\n",
331 	    pa, eadd[0], elog[0]);
332 
333 	if ((mcdir = opendir(MC_PHYDEV_DIR)) != NULL) {
334 		while ((dp = readdir(mcdir)) != NULL) {
335 			int fd;
336 
337 			if (strncmp(dp->d_name, MC_PHYPREFIX,
338 			    strlen(MC_PHYPREFIX)) != 0)
339 				continue;
340 
341 			(void) snprintf(path, sizeof (path),
342 			    "%s/%s", MC_PHYDEV_DIR, dp->d_name);
343 
344 			if (stat(path, &statbuf) != 0 ||
345 			    !S_ISCHR(statbuf.st_mode)) {
346 				/* skip if not a character device */
347 				continue;
348 			}
349 
350 			if ((fd = open(path, O_RDONLY)) < 0)
351 				continue;
352 
353 			if (ioctl(fd, MCIOC_FAULT_PAGE, &flt_page) == 0) {
354 				fmd_hdl_debug(hdl, "opl_scf_log ioctl(%s)\n",
355 				    path);
356 				(void) close(fd);
357 				(void) closedir(mcdir);
358 				return (0);
359 			}
360 			(void) close(fd);
361 		}
362 		(void) closedir(mcdir);
363 	}
364 
365 	fmd_hdl_debug(hdl, "opl_scf_log failed ioctl().\n");
366 
367 	return (-1);
368 }
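/*
 * Editorial note: opl_scf_log() returns 0 both when an mc-opl instance
 * accepted the MCIOC_FAULT_PAGE ioctl and when the page is already
 * marked CMD_MEM_F_FAULTING (nothing to report); -1 means the payload
 * was unusable or no pseudo-mc device accepted the ioctl.
 */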
369 
370 /*
371  * This is the common function for processing MAC detected
372  * Intermittent and Permanent CEs.
373  */
374 
375 cmd_evdisp_t
376 cmd_opl_mac_ce(fmd_hdl_t *hdl, fmd_event_t *ep, const char *class,
377     nvlist_t *asru, nvlist_t *fru, uint64_t pa, nvlist_t *nvl)
378 {
379 	cmd_dimm_t *dimm;
380 	const char *uuid;
381 
382 	fmd_hdl_debug(hdl,
383 	    "Processing CE ereport\n");
384 
385 	if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
386 	    (dimm = cmd_dimm_create(hdl, asru)) == NULL)
387 		return (CMD_EVD_UNUSED);
388 
389 	if (dimm->dimm_case.cc_cp == NULL) {
390 		dimm->dimm_case.cc_cp = cmd_case_create(hdl,
391 		    &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
392 	}
393 
394 	if (strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
395 		CMD_STAT_BUMP(ce_interm);
396 		fmd_hdl_debug(hdl, "adding FJ-Intermittent event "
397 		    "to CE serd engine\n");
398 
399 		if (dimm->dimm_case.cc_serdnm == NULL) {
400 			dimm->dimm_case.cc_serdnm =
401 			    cmd_mem_serdnm_create(hdl,
402 			    "dimm", dimm->dimm_unum);
403 			fmd_serd_create(hdl, dimm->dimm_case.cc_serdnm,
404 			    fmd_prop_get_int32(hdl, "ce_n"),
405 			    fmd_prop_get_int64(hdl, "ce_t"));
406 		}
407 
408 		if (fmd_serd_record(hdl, dimm->dimm_case.cc_serdnm, ep) ==
409 		    FMD_B_FALSE) {
410 			return (CMD_EVD_OK); /* engine hasn't fired */
411 		}
412 		fmd_hdl_debug(hdl, "ce serd fired\n");
413 		fmd_case_add_serd(hdl, dimm->dimm_case.cc_cp,
414 		    dimm->dimm_case.cc_serdnm);
415 		fmd_serd_reset(hdl, dimm->dimm_case.cc_serdnm);
416 
417 		(void) opl_scf_log(hdl, nvl);
418 	} else {
419 		CMD_STAT_BUMP(ce_sticky);
420 	}
421 
422 	dimm->dimm_nretired++;
423 	dimm->dimm_retstat.fmds_value.ui64++;
424 	cmd_dimm_dirty(hdl, dimm);
425 
426 	cmd_page_fault(hdl, asru, fru, ep, pa);
427 	opl_ce_thresh_check(hdl, dimm);
428 
429 	return (CMD_EVD_OK);
430 }
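/*
 * Editorial note: the intermittent-CE SERD engine above is sized by the
 * module properties "ce_n" (count) and "ce_t" (time window), so it can
 * be tuned from the module's .conf file, e.g. with hypothetical values:
 *
 *	setprop ce_n 3
 *	setprop ce_t 72h
 *
 * Once the engine fires, the code falls through to the page retire and
 * DIMM threshold logic shared with permanent CEs.
 */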
431 
432 /*
433  * This is the common entry for processing MAC detected errors.
434  * It is responsible for generating the memory page fault event.
435  * A permanent (sticky) CE in normal mode is also handled here,
436  * in the same way as the UE case.
437  */
438 /*ARGSUSED*/
439 cmd_evdisp_t
440 cmd_opl_mac_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
441     const char *class, cmd_errcl_t clcode)
442 {
443 	uint64_t pa;
444 	nvlist_t *rsrc = NULL, *asru = NULL, *fru = NULL;
445 	cmd_page_t *page;
446 
447 	fmd_hdl_debug(hdl, "cmd_mac_common: clcode=%llx\n", (u_longlong_t)clcode);
448 
449 	if (nvlist_lookup_nvlist(nvl, MC_OPL_RESOURCE, &rsrc) != 0)
450 		return (CMD_EVD_BAD);
451 
452 	if (nvlist_lookup_uint64(nvl, MC_OPL_PA, &pa)
453 	    != 0)
454 		return (CMD_EVD_BAD);
455 
456 	/*
457 	 * Check for invalid pa.
458 	 * The most significant bit must not be set; such an address
459 	 * would be outside the range of physical addresses
460 	 * possible from the MAC's point of view.
461 	 */
462 	if (((uint64_t)1 << 63) & pa)
463 		return (CMD_EVD_BAD);
464 
465 	if ((page = cmd_page_lookup(pa)) != NULL &&
466 	    page->page_case.cc_cp != NULL &&
467 	    fmd_case_solved(hdl, page->page_case.cc_cp))
468 		return (CMD_EVD_REDUND);
469 
470 	if (nvlist_dup(rsrc, &asru, 0) != 0) {
471 		fmd_hdl_debug(hdl, "cmd_opl_mac_common nvlist dup failed\n");
472 		return (CMD_EVD_BAD);
473 	}
474 
475 	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
476 		fmd_hdl_debug(hdl, "cmd_opl_mac_common expand failed\n");
477 		nvlist_free(asru);
478 		CMD_STAT_BUMP(bad_mem_asru);
479 		return (CMD_EVD_BAD);
480 	}
481 
482 	if ((fru = opl_mem_fru_create(hdl, asru)) == NULL) {
483 		fmd_hdl_debug(hdl, "cmd_opl_mac_common fru_create failed\n");
484 		nvlist_free(asru);
485 		return (CMD_EVD_BAD);
486 	}
487 
488 	/*
489 	 * process PCE and ICE to create DIMM fault
490 	 */
491 	if (strcmp(class, "ereport.asic.mac.mi-ce") == 0 ||
492 	    strcmp(class, "ereport.asic.mac.ptrl-ce") == 0 ||
493 	    strcmp(class, "ereport.asic.mac.ptrl-ice") == 0) {
494 		cmd_evdisp_t ret;
495 
496 		ret = cmd_opl_mac_ce(hdl, ep, class, asru, fru, pa, nvl);
497 		nvlist_free(asru);
498 		nvlist_free(fru);
499 		if (ret != CMD_EVD_OK) {
500 			fmd_hdl_debug(hdl,
501 			    "cmd_opl_mac_common: mac_ce failed\n");
502 			return (CMD_EVD_BAD);
503 		} else
504 			return (CMD_EVD_OK);
505 	}
506 
507 	/* The following code handles page retires for UEs and CMPEs.  */
508 
509 	cmd_page_fault(hdl, asru, fru, ep, pa);
510 	nvlist_free(asru);
511 	nvlist_free(fru);
512 	return (CMD_EVD_OK);
513 }
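/*
 * Editorial summary of the dispositions above:
 *
 *	mi-ce / ptrl-ce / ptrl-ice       -> cmd_opl_mac_ce() (DIMM + page logic)
 *	remaining MAC classes (UE, CMPE) -> cmd_page_fault() only
 *
 * with CMD_EVD_REDUND returned early when the page already has a
 * solved case.
 */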
514 
515 /*
516  * Common entry points for handling CPU- and I/O-detected UEs
517  * with respect to EID=MEM.
518  */
519 /*ARGSUSED*/
520 cmd_evdisp_t
521 cmd_opl_cpu_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
522     const char *class, cmd_errcl_t clcode)
523 {
524 	return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_CPU));
525 }
526 
527 /*ARGSUSED*/
528 cmd_evdisp_t
529 cmd_opl_io_mem(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
530     const char *class, cmd_errcl_t clcode)
531 {
532 	return (opl_ue_mem(hdl, ep, nvl, CMD_OPL_HDLR_IO));
533 }
534