xref: /titanic_41/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.c (revision d4ac42a1cd3016618a9ba0330862d410f0058f89)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Support routines for DIMMs.
27  */
28 
29 #include <cmd_mem.h>
30 #include <limits.h>
31 #include <cmd_dimm.h>
32 #include <cmd_bank.h>
33 #include <cmd.h>
34 
35 #include <errno.h>
36 #include <string.h>
37 #include <strings.h>
38 #include <fcntl.h>
39 #include <unistd.h>
40 #include <fm/fmd_api.h>
41 #include <sys/fm/protocol.h>
42 #include <sys/mem.h>
43 #include <sys/nvpair.h>
44 #ifdef sun4v
45 #include <cmd_hc_sun4v.h>
46 #include <cmd_branch.h>
47 #endif /* sun4v */
48 
49 /*
50  * Some errors (RxE/FRx pairs) don't have accurate DIMM (resource) FMRIs,
51  * because sufficient information was unavailable prior to correlation.
52  * When the DE completes the pair, it uses this routine to retrieve the
53  * correct FMRI.
54  */
55 nvlist_t *
cmd_dimm_fmri_derive(fmd_hdl_t * hdl,uint64_t afar,uint16_t synd,uint64_t afsr)56 cmd_dimm_fmri_derive(fmd_hdl_t *hdl, uint64_t afar, uint16_t synd,
57     uint64_t afsr)
58 {
59 	nvlist_t *fmri;
60 
61 	if ((fmri = cmd_mem_fmri_derive(hdl, afar, afsr, synd)) == NULL)
62 		return (NULL);
63 
64 	if (fmd_nvl_fmri_expand(hdl, fmri) < 0) {
65 		nvlist_free(fmri);
66 		return (NULL);
67 	}
68 
69 	return (fmri);
70 }
71 
72 nvlist_t *
cmd_dimm_fru(cmd_dimm_t * dimm)73 cmd_dimm_fru(cmd_dimm_t *dimm)
74 {
75 	return (dimm->dimm_asru_nvl);
76 }
77 
78 nvlist_t *
cmd_dimm_create_fault(fmd_hdl_t * hdl,cmd_dimm_t * dimm,const char * fltnm,uint_t cert)79 cmd_dimm_create_fault(fmd_hdl_t *hdl, cmd_dimm_t *dimm, const char *fltnm,
80     uint_t cert)
81 {
82 #ifdef sun4v
83 	nvlist_t *flt, *nvlfru;
84 	/*
85 	 * Do NOT issue hc scheme FRU FMRIs for ultraSPARC-T1 platforms.
86 	 * The SP will misinterpret the FRU. Instead, reuse the ASRU FMRI
87 	 *
88 	 * Use the BR string as a distinguisher. BR (branch) is only
89 	 * present in ultraSPARC-T2/T2plus DIMM unums
90 	 */
91 	if (strstr(dimm->dimm_unum, "BR") == NULL) {
92 		flt = cmd_nvl_create_fault(hdl, fltnm, cert,
93 		    dimm->dimm_asru_nvl, dimm->dimm_asru_nvl, NULL);
94 	} else {
95 		nvlfru = cmd_mem2hc(hdl, dimm->dimm_asru_nvl);
96 		flt = cmd_nvl_create_fault(hdl, fltnm, cert,
97 		    dimm->dimm_asru_nvl, nvlfru, NULL);
98 		if (nvlfru != NULL)
99 			nvlist_free(nvlfru);
100 	}
101 	return (cmd_fault_add_location(hdl, flt, dimm->dimm_unum));
102 #else
103 	return (cmd_nvl_create_fault(hdl, fltnm, cert, dimm->dimm_asru_nvl,
104 	    dimm->dimm_asru_nvl, NULL));
105 #endif /* sun4v */
106 }
107 
108 static void
cmd_dimm_free(fmd_hdl_t * hdl,cmd_dimm_t * dimm,int destroy)109 cmd_dimm_free(fmd_hdl_t *hdl, cmd_dimm_t *dimm, int destroy)
110 {
111 	cmd_case_t *cc = &dimm->dimm_case;
112 	int i;
113 	cmd_mq_t *q;
114 	tstamp_t  *tsp, *next;
115 
116 #ifdef sun4v
117 	cmd_branch_t *branch;
118 #endif
119 	if (cc->cc_cp != NULL) {
120 		cmd_case_fini(hdl, cc->cc_cp, destroy);
121 		if (cc->cc_serdnm != NULL) {
122 			if (fmd_serd_exists(hdl, cc->cc_serdnm) &&
123 			    destroy)
124 				fmd_serd_destroy(hdl, cc->cc_serdnm);
125 			fmd_hdl_strfree(hdl, cc->cc_serdnm);
126 		}
127 	}
128 
129 	for (i = 0; i < CMD_MAX_CKWDS; i++) {
130 		while ((q = cmd_list_next(&dimm->mq_root[i])) != NULL) {
131 			if (q->mq_serdnm != NULL) {
132 				if (fmd_serd_exists(hdl, q->mq_serdnm)) {
133 					fmd_serd_destroy(hdl, q->mq_serdnm);
134 				}
135 				fmd_hdl_strfree(hdl, q->mq_serdnm);
136 				q->mq_serdnm = NULL;
137 			}
138 
139 			for (tsp = cmd_list_next(&q->mq_dupce_tstamp);
140 			    tsp != NULL; tsp = next) {
141 				next = cmd_list_next(tsp);
142 				cmd_list_delete(&q->mq_dupce_tstamp,
143 				    &tsp->ts_l);
144 				fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
145 			}
146 
147 			cmd_list_delete(&dimm->mq_root[i], q);
148 			fmd_hdl_free(hdl, q, sizeof (cmd_mq_t));
149 		}
150 	}
151 
152 	if (dimm->dimm_bank != NULL)
153 		cmd_bank_remove_dimm(hdl, dimm->dimm_bank, dimm);
154 
155 #ifdef sun4v
156 	branch = cmd_branch_lookup_by_unum(hdl, dimm->dimm_unum);
157 	if (branch != NULL)
158 		cmd_branch_remove_dimm(hdl, branch, dimm);
159 #endif
160 
161 	cmd_fmri_fini(hdl, &dimm->dimm_asru, destroy);
162 
163 	if (destroy)
164 		fmd_buf_destroy(hdl, NULL, dimm->dimm_bufname);
165 
166 	cmd_list_delete(&cmd.cmd_dimms, dimm);
167 	fmd_hdl_free(hdl, dimm, sizeof (cmd_dimm_t));
168 }
169 
170 void
cmd_dimm_destroy(fmd_hdl_t * hdl,cmd_dimm_t * dimm)171 cmd_dimm_destroy(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
172 {
173 
174 	fmd_stat_destroy(hdl, 1, &(dimm->dimm_retstat));
175 	cmd_dimm_free(hdl, dimm, FMD_B_TRUE);
176 }
177 
178 static cmd_dimm_t *
dimm_lookup_by_unum(const char * unum)179 dimm_lookup_by_unum(const char *unum)
180 {
181 	cmd_dimm_t *dimm;
182 
183 	for (dimm = cmd_list_next(&cmd.cmd_dimms); dimm != NULL;
184 	    dimm = cmd_list_next(dimm)) {
185 		if (strcmp(dimm->dimm_unum, unum) == 0)
186 			return (dimm);
187 	}
188 
189 	return (NULL);
190 }
191 
192 static void
dimm_attach_to_bank(fmd_hdl_t * hdl,cmd_dimm_t * dimm)193 dimm_attach_to_bank(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
194 {
195 	cmd_bank_t *bank;
196 
197 	for (bank = cmd_list_next(&cmd.cmd_banks); bank != NULL;
198 	    bank = cmd_list_next(bank)) {
199 		if (fmd_nvl_fmri_contains(hdl, bank->bank_asru_nvl,
200 		    dimm->dimm_asru_nvl)) {
201 			cmd_bank_add_dimm(hdl, bank, dimm);
202 			return;
203 		}
204 	}
205 }
206 
207 cmd_dimm_t *
cmd_dimm_create(fmd_hdl_t * hdl,nvlist_t * asru)208 cmd_dimm_create(fmd_hdl_t *hdl, nvlist_t *asru)
209 {
210 	cmd_dimm_t *dimm;
211 	const char *unum;
212 	nvlist_t *fmri;
213 	size_t nserids = 0;
214 	char **serids = NULL;
215 
216 	if (!fmd_nvl_fmri_present(hdl, asru)) {
217 		fmd_hdl_debug(hdl, "dimm_lookup: discarding old ereport\n");
218 		return (NULL);
219 	}
220 
221 	if ((unum = cmd_fmri_get_unum(asru)) == NULL) {
222 		CMD_STAT_BUMP(bad_mem_asru);
223 		return (NULL);
224 	}
225 
226 #ifdef sun4v
227 	if (nvlist_lookup_string_array(asru, FM_FMRI_HC_SERIAL_ID, &serids,
228 	    &nserids) != 0) {
229 		fmd_hdl_debug(hdl, "sun4v mem: FMRI does not"
230 		    " have serial_ids\n");
231 		CMD_STAT_BUMP(bad_mem_asru);
232 		return (NULL);
233 	}
234 #endif
235 	fmri = cmd_mem_fmri_create(unum, serids, nserids);
236 	if (fmd_nvl_fmri_expand(hdl, fmri) < 0) {
237 		CMD_STAT_BUMP(bad_mem_asru);
238 		nvlist_free(fmri);
239 		return (NULL);
240 	}
241 
242 	fmd_hdl_debug(hdl, "dimm_create: creating new DIMM %s\n", unum);
243 	CMD_STAT_BUMP(dimm_creat);
244 
245 	dimm = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
246 	dimm->dimm_nodetype = CMD_NT_DIMM;
247 	dimm->dimm_version = CMD_DIMM_VERSION;
248 	dimm->dimm_phys_addr_low = ULLONG_MAX;
249 	dimm->dimm_phys_addr_hi = 0;
250 	dimm->dimm_syl_error = USHRT_MAX;
251 
252 	cmd_bufname(dimm->dimm_bufname, sizeof (dimm->dimm_bufname), "dimm_%s",
253 	    unum);
254 	cmd_fmri_init(hdl, &dimm->dimm_asru, fmri, "dimm_asru_%s", unum);
255 
256 	nvlist_free(fmri);
257 
258 	(void) nvlist_lookup_string(dimm->dimm_asru_nvl, FM_FMRI_MEM_UNUM,
259 	    (char **)&dimm->dimm_unum);
260 
261 	dimm_attach_to_bank(hdl, dimm);
262 
263 	cmd_mem_retirestat_create(hdl, &dimm->dimm_retstat, dimm->dimm_unum, 0,
264 	    CMD_DIMM_STAT_PREFIX);
265 
266 	cmd_list_append(&cmd.cmd_dimms, dimm);
267 	cmd_dimm_dirty(hdl, dimm);
268 
269 	return (dimm);
270 }
271 
272 cmd_dimm_t *
cmd_dimm_lookup(fmd_hdl_t * hdl,nvlist_t * asru)273 cmd_dimm_lookup(fmd_hdl_t *hdl, nvlist_t *asru)
274 {
275 	cmd_dimm_t *dimm;
276 	const char *unum;
277 
278 	if ((unum = cmd_fmri_get_unum(asru)) == NULL) {
279 		CMD_STAT_BUMP(bad_mem_asru);
280 		return (NULL);
281 	}
282 
283 	dimm = dimm_lookup_by_unum(unum);
284 
285 	if (dimm != NULL && !fmd_nvl_fmri_present(hdl, dimm->dimm_asru_nvl)) {
286 		/*
287 		 * The DIMM doesn't exist anymore, so we need to delete the
288 		 * state structure, which is now out of date.  The containing
289 		 * bank (if any) is also out of date, so blow it away too.
290 		 */
291 		fmd_hdl_debug(hdl, "dimm_lookup: discarding old dimm\n");
292 
293 		if (dimm->dimm_bank != NULL)
294 			cmd_bank_destroy(hdl, dimm->dimm_bank);
295 		cmd_dimm_destroy(hdl, dimm);
296 
297 		return (NULL);
298 	}
299 
300 	return (dimm);
301 }
302 
303 static cmd_dimm_t *
dimm_v0tov2(fmd_hdl_t * hdl,cmd_dimm_0_t * old,size_t oldsz)304 dimm_v0tov2(fmd_hdl_t *hdl, cmd_dimm_0_t *old, size_t oldsz)
305 {
306 	cmd_dimm_t *new;
307 
308 	if (oldsz != sizeof (cmd_dimm_0_t)) {
309 		fmd_hdl_abort(hdl, "size of state doesn't match size of "
310 		    "version 0 state (%u bytes).\n", sizeof (cmd_dimm_0_t));
311 	}
312 
313 	new = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
314 	new->dimm_header = old->dimm0_header;
315 	new->dimm_version = CMD_DIMM_VERSION;
316 	new->dimm_asru = old->dimm0_asru;
317 	new->dimm_nretired = old->dimm0_nretired;
318 	new->dimm_phys_addr_hi = 0;
319 	new->dimm_phys_addr_low = ULLONG_MAX;
320 
321 	fmd_hdl_free(hdl, old, oldsz);
322 	return (new);
323 }
324 
325 static cmd_dimm_t *
dimm_v1tov2(fmd_hdl_t * hdl,cmd_dimm_1_t * old,size_t oldsz)326 dimm_v1tov2(fmd_hdl_t *hdl, cmd_dimm_1_t *old, size_t oldsz)
327 {
328 
329 	cmd_dimm_t *new;
330 
331 	if (oldsz != sizeof (cmd_dimm_1_t)) {
332 		fmd_hdl_abort(hdl, "size of state doesn't match size of "
333 		    "version 1 state (%u bytes).\n", sizeof (cmd_dimm_1_t));
334 	}
335 
336 	new = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
337 
338 	new->dimm_header = old->dimm1_header;
339 	new->dimm_version = CMD_DIMM_VERSION;
340 	new->dimm_asru = old->dimm1_asru;
341 	new->dimm_nretired = old->dimm1_nretired;
342 	new->dimm_flags = old->dimm1_flags;
343 	new->dimm_phys_addr_hi = 0;
344 	new->dimm_phys_addr_low = ULLONG_MAX;
345 
346 	fmd_hdl_free(hdl, old, oldsz);
347 	return (new);
348 }
349 
350 static cmd_dimm_t *
dimm_wrapv2(fmd_hdl_t * hdl,cmd_dimm_pers_t * pers,size_t psz)351 dimm_wrapv2(fmd_hdl_t *hdl, cmd_dimm_pers_t *pers, size_t psz)
352 {
353 	cmd_dimm_t *dimm;
354 
355 	if (psz != sizeof (cmd_dimm_pers_t)) {
356 		fmd_hdl_abort(hdl, "size of state doesn't match size of "
357 		    "version 1 state (%u bytes).\n", sizeof (cmd_dimm_pers_t));
358 	}
359 
360 	dimm = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
361 	bcopy(pers, dimm, sizeof (cmd_dimm_pers_t));
362 	fmd_hdl_free(hdl, pers, psz);
363 	return (dimm);
364 }
365 
366 void *
cmd_dimm_restore(fmd_hdl_t * hdl,fmd_case_t * cp,cmd_case_ptr_t * ptr)367 cmd_dimm_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr)
368 {
369 	cmd_dimm_t *dimm;
370 
371 	for (dimm = cmd_list_next(&cmd.cmd_dimms); dimm != NULL;
372 	    dimm = cmd_list_next(dimm)) {
373 		if (strcmp(dimm->dimm_bufname, ptr->ptr_name) == 0)
374 			break;
375 	}
376 
377 	if (dimm == NULL) {
378 		int migrated = 0;
379 		size_t dimmsz;
380 
381 		fmd_hdl_debug(hdl, "restoring dimm from %s\n", ptr->ptr_name);
382 
383 		if ((dimmsz = fmd_buf_size(hdl, NULL, ptr->ptr_name)) == 0) {
384 			fmd_hdl_abort(hdl, "dimm referenced by case %s does "
385 			    "not exist in saved state\n",
386 			    fmd_case_uuid(hdl, cp));
387 		} else if (dimmsz > CMD_DIMM_MAXSIZE ||
388 		    dimmsz < CMD_DIMM_MINSIZE) {
389 			fmd_hdl_abort(hdl,
390 			    "dimm buffer referenced by case %s "
391 			    "is out of bounds (is %u bytes, max %u, min %u)\n",
392 			    fmd_case_uuid(hdl, cp), dimmsz,
393 			    CMD_DIMM_MAXSIZE, CMD_DIMM_MINSIZE);
394 		}
395 
396 		if ((dimm = cmd_buf_read(hdl, NULL, ptr->ptr_name,
397 		    dimmsz)) == NULL) {
398 			fmd_hdl_abort(hdl, "failed to read dimm buf %s",
399 			    ptr->ptr_name);
400 		}
401 
402 		fmd_hdl_debug(hdl, "found %d in version field\n",
403 		    dimm->dimm_version);
404 
405 		if (CMD_DIMM_VERSIONED(dimm)) {
406 			switch (dimm->dimm_version) {
407 			case CMD_DIMM_VERSION_1:
408 				dimm = dimm_v1tov2(hdl, (cmd_dimm_1_t *)dimm,
409 				    dimmsz);
410 				break;
411 			case CMD_DIMM_VERSION_2:
412 				dimm = dimm_wrapv2(hdl, (cmd_dimm_pers_t *)dimm,
413 				    dimmsz);
414 				break;
415 			default:
416 				fmd_hdl_abort(hdl, "unknown version (found %d) "
417 				    "for dimm state referenced by case %s.\n",
418 				    dimm->dimm_version, fmd_case_uuid(hdl, cp));
419 				break;
420 			}
421 		} else {
422 			dimm = dimm_v0tov2(hdl, (cmd_dimm_0_t *)dimm, dimmsz);
423 			migrated = 1;
424 		}
425 
426 		if (migrated) {
427 			CMD_STAT_BUMP(dimm_migrat);
428 			cmd_dimm_dirty(hdl, dimm);
429 		}
430 
431 		cmd_fmri_restore(hdl, &dimm->dimm_asru);
432 
433 		if ((errno = nvlist_lookup_string(dimm->dimm_asru_nvl,
434 		    FM_FMRI_MEM_UNUM, (char **)&dimm->dimm_unum)) != 0)
435 			fmd_hdl_abort(hdl, "failed to retrieve unum from asru");
436 
437 		dimm_attach_to_bank(hdl, dimm);
438 
439 		cmd_mem_retirestat_create(hdl, &dimm->dimm_retstat,
440 		    dimm->dimm_unum, dimm->dimm_nretired, CMD_DIMM_STAT_PREFIX);
441 
442 		cmd_list_append(&cmd.cmd_dimms, dimm);
443 	}
444 
445 	switch (ptr->ptr_subtype) {
446 	case BUG_PTR_DIMM_CASE:
447 		fmd_hdl_debug(hdl, "recovering from out of order dimm ptr\n");
448 		cmd_case_redirect(hdl, cp, CMD_PTR_DIMM_CASE);
449 		/*FALLTHROUGH*/
450 	case CMD_PTR_DIMM_CASE:
451 		cmd_mem_case_restore(hdl, &dimm->dimm_case, cp, "dimm",
452 		    dimm->dimm_unum);
453 		break;
454 	default:
455 		fmd_hdl_abort(hdl, "invalid %s subtype %d\n",
456 		    ptr->ptr_name, ptr->ptr_subtype);
457 	}
458 
459 	return (dimm);
460 }
461 
462 void
cmd_dimm_validate(fmd_hdl_t * hdl)463 cmd_dimm_validate(fmd_hdl_t *hdl)
464 {
465 	cmd_dimm_t *dimm, *next;
466 
467 	for (dimm = cmd_list_next(&cmd.cmd_dimms); dimm != NULL; dimm = next) {
468 		next = cmd_list_next(dimm);
469 
470 		if (!fmd_nvl_fmri_present(hdl, dimm->dimm_asru_nvl))
471 			cmd_dimm_destroy(hdl, dimm);
472 	}
473 }
474 
475 void
cmd_dimm_dirty(fmd_hdl_t * hdl,cmd_dimm_t * dimm)476 cmd_dimm_dirty(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
477 {
478 	if (fmd_buf_size(hdl, NULL, dimm->dimm_bufname) !=
479 	    sizeof (cmd_dimm_pers_t))
480 		fmd_buf_destroy(hdl, NULL, dimm->dimm_bufname);
481 
482 	/* No need to rewrite the FMRIs in the dimm - they don't change */
483 	fmd_buf_write(hdl, NULL, dimm->dimm_bufname, &dimm->dimm_pers,
484 	    sizeof (cmd_dimm_pers_t));
485 }
486 
487 void
cmd_dimm_gc(fmd_hdl_t * hdl)488 cmd_dimm_gc(fmd_hdl_t *hdl)
489 {
490 	cmd_dimm_validate(hdl);
491 }
492 
493 void
cmd_dimm_fini(fmd_hdl_t * hdl)494 cmd_dimm_fini(fmd_hdl_t *hdl)
495 {
496 	cmd_dimm_t *dimm;
497 
498 	while ((dimm = cmd_list_next(&cmd.cmd_dimms)) != NULL)
499 		cmd_dimm_free(hdl, dimm, FMD_B_FALSE);
500 }
501 
502 
503 void
cmd_dimm_save_symbol_error(cmd_dimm_t * dimm,uint16_t upos)504 cmd_dimm_save_symbol_error(cmd_dimm_t *dimm, uint16_t upos)
505 {
506 	cmd_dimm_t *d = NULL, *next = NULL;
507 
508 	for (d = cmd_list_next(&cmd.cmd_dimms); d != NULL; d = next) {
509 		next = cmd_list_next(d);
510 		if (cmd_same_datapath_dimms(dimm, d))
511 			d->dimm_syl_error = upos;
512 	}
513 }
514 
515 int
cmd_dimm_check_symbol_error(cmd_dimm_t * dimm,uint16_t synd)516 cmd_dimm_check_symbol_error(cmd_dimm_t *dimm, uint16_t synd)
517 {
518 	int upos;
519 	cmd_dimm_t *d, *next;
520 
521 	if ((upos = cmd_synd2upos(synd)) < 0)
522 		return (0);
523 
524 	for (d = cmd_list_next(&cmd.cmd_dimms); d != NULL; d = next) {
525 		next = cmd_list_next(d);
526 		if (cmd_same_datapath_dimms(dimm, d) &&
527 		    (d->dimm_syl_error == upos))
528 			return (1);
529 	}
530 
531 	return (0);
532 }
533