xref: /titanic_50/usr/src/cmd/fm/modules/sun4/cpumem-diagnosis/cmd_dimm.c (revision aab83bb83be7342f6cfccaed8d5fe0b2f404855d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Support routines for DIMMs.
27  */
28 
29 #include <cmd_mem.h>
30 #include <limits.h>
31 #include <cmd_dimm.h>
32 #include <cmd_bank.h>
33 #include <cmd.h>
34 
35 #include <errno.h>
36 #include <string.h>
37 #include <strings.h>
38 #include <fcntl.h>
39 #include <unistd.h>
40 #include <fm/fmd_api.h>
41 #include <sys/fm/protocol.h>
42 #include <sys/mem.h>
43 #include <sys/nvpair.h>
44 #ifdef sun4v
45 #include <cmd_hc_sun4v.h>
46 #include <cmd_branch.h>
47 #endif /* sun4v */
48 
49 /*
50  * Some errors (RxE/FRx pairs) don't have accurate DIMM (resource) FMRIs,
51  * because sufficient information was unavailable prior to correlation.
52  * When the DE completes the pair, it uses this routine to retrieve the
53  * correct FMRI.
54  */
55 nvlist_t *
cmd_dimm_fmri_derive(fmd_hdl_t * hdl,uint64_t afar,uint16_t synd,uint64_t afsr)56 cmd_dimm_fmri_derive(fmd_hdl_t *hdl, uint64_t afar, uint16_t synd,
57     uint64_t afsr)
58 {
59 	nvlist_t *fmri;
60 
61 	if ((fmri = cmd_mem_fmri_derive(hdl, afar, afsr, synd)) == NULL)
62 		return (NULL);
63 
64 	if (fmd_nvl_fmri_expand(hdl, fmri) < 0) {
65 		nvlist_free(fmri);
66 		return (NULL);
67 	}
68 
69 	return (fmri);
70 }
71 
72 nvlist_t *
cmd_dimm_fru(cmd_dimm_t * dimm)73 cmd_dimm_fru(cmd_dimm_t *dimm)
74 {
75 	return (dimm->dimm_asru_nvl);
76 }
77 
78 nvlist_t *
cmd_dimm_create_fault(fmd_hdl_t * hdl,cmd_dimm_t * dimm,const char * fltnm,uint_t cert)79 cmd_dimm_create_fault(fmd_hdl_t *hdl, cmd_dimm_t *dimm, const char *fltnm,
80     uint_t cert)
81 {
82 #ifdef sun4v
83 	nvlist_t *flt, *nvlfru;
84 	/*
85 	 * Do NOT issue hc scheme FRU FMRIs for ultraSPARC-T1 platforms.
86 	 * The SP will misinterpret the FRU. Instead, reuse the ASRU FMRI
87 	 *
88 	 * Use the BR string as a distinguisher. BR (branch) is only
89 	 * present in ultraSPARC-T2/T2plus DIMM unums
90 	 */
91 	if (strstr(dimm->dimm_unum, "BR") == NULL) {
92 		flt = cmd_nvl_create_fault(hdl, fltnm, cert,
93 		    dimm->dimm_asru_nvl, dimm->dimm_asru_nvl, NULL);
94 	} else {
95 		nvlfru = cmd_mem2hc(hdl, dimm->dimm_asru_nvl);
96 		flt = cmd_nvl_create_fault(hdl, fltnm, cert,
97 		    dimm->dimm_asru_nvl, nvlfru, NULL);
98 		nvlist_free(nvlfru);
99 	}
100 	return (cmd_fault_add_location(hdl, flt, dimm->dimm_unum));
101 #else
102 	return (cmd_nvl_create_fault(hdl, fltnm, cert, dimm->dimm_asru_nvl,
103 	    dimm->dimm_asru_nvl, NULL));
104 #endif /* sun4v */
105 }
106 
107 static void
cmd_dimm_free(fmd_hdl_t * hdl,cmd_dimm_t * dimm,int destroy)108 cmd_dimm_free(fmd_hdl_t *hdl, cmd_dimm_t *dimm, int destroy)
109 {
110 	cmd_case_t *cc = &dimm->dimm_case;
111 	int i;
112 	cmd_mq_t *q;
113 	tstamp_t  *tsp, *next;
114 
115 #ifdef sun4v
116 	cmd_branch_t *branch;
117 #endif
118 	if (cc->cc_cp != NULL) {
119 		cmd_case_fini(hdl, cc->cc_cp, destroy);
120 		if (cc->cc_serdnm != NULL) {
121 			if (fmd_serd_exists(hdl, cc->cc_serdnm) &&
122 			    destroy)
123 				fmd_serd_destroy(hdl, cc->cc_serdnm);
124 			fmd_hdl_strfree(hdl, cc->cc_serdnm);
125 		}
126 	}
127 
128 	for (i = 0; i < CMD_MAX_CKWDS; i++) {
129 		while ((q = cmd_list_next(&dimm->mq_root[i])) != NULL) {
130 			if (q->mq_serdnm != NULL) {
131 				if (fmd_serd_exists(hdl, q->mq_serdnm)) {
132 					fmd_serd_destroy(hdl, q->mq_serdnm);
133 				}
134 				fmd_hdl_strfree(hdl, q->mq_serdnm);
135 				q->mq_serdnm = NULL;
136 			}
137 
138 			for (tsp = cmd_list_next(&q->mq_dupce_tstamp);
139 			    tsp != NULL; tsp = next) {
140 				next = cmd_list_next(tsp);
141 				cmd_list_delete(&q->mq_dupce_tstamp,
142 				    &tsp->ts_l);
143 				fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
144 			}
145 
146 			cmd_list_delete(&dimm->mq_root[i], q);
147 			fmd_hdl_free(hdl, q, sizeof (cmd_mq_t));
148 		}
149 	}
150 
151 	if (dimm->dimm_bank != NULL)
152 		cmd_bank_remove_dimm(hdl, dimm->dimm_bank, dimm);
153 
154 #ifdef sun4v
155 	branch = cmd_branch_lookup_by_unum(hdl, dimm->dimm_unum);
156 	if (branch != NULL)
157 		cmd_branch_remove_dimm(hdl, branch, dimm);
158 #endif
159 
160 	cmd_fmri_fini(hdl, &dimm->dimm_asru, destroy);
161 
162 	if (destroy)
163 		fmd_buf_destroy(hdl, NULL, dimm->dimm_bufname);
164 
165 	cmd_list_delete(&cmd.cmd_dimms, dimm);
166 	fmd_hdl_free(hdl, dimm, sizeof (cmd_dimm_t));
167 }
168 
169 void
cmd_dimm_destroy(fmd_hdl_t * hdl,cmd_dimm_t * dimm)170 cmd_dimm_destroy(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
171 {
172 
173 	fmd_stat_destroy(hdl, 1, &(dimm->dimm_retstat));
174 	cmd_dimm_free(hdl, dimm, FMD_B_TRUE);
175 }
176 
177 static cmd_dimm_t *
dimm_lookup_by_unum(const char * unum)178 dimm_lookup_by_unum(const char *unum)
179 {
180 	cmd_dimm_t *dimm;
181 
182 	for (dimm = cmd_list_next(&cmd.cmd_dimms); dimm != NULL;
183 	    dimm = cmd_list_next(dimm)) {
184 		if (strcmp(dimm->dimm_unum, unum) == 0)
185 			return (dimm);
186 	}
187 
188 	return (NULL);
189 }
190 
191 static void
dimm_attach_to_bank(fmd_hdl_t * hdl,cmd_dimm_t * dimm)192 dimm_attach_to_bank(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
193 {
194 	cmd_bank_t *bank;
195 
196 	for (bank = cmd_list_next(&cmd.cmd_banks); bank != NULL;
197 	    bank = cmd_list_next(bank)) {
198 		if (fmd_nvl_fmri_contains(hdl, bank->bank_asru_nvl,
199 		    dimm->dimm_asru_nvl)) {
200 			cmd_bank_add_dimm(hdl, bank, dimm);
201 			return;
202 		}
203 	}
204 }
205 
206 cmd_dimm_t *
cmd_dimm_create(fmd_hdl_t * hdl,nvlist_t * asru)207 cmd_dimm_create(fmd_hdl_t *hdl, nvlist_t *asru)
208 {
209 	cmd_dimm_t *dimm;
210 	const char *unum;
211 	nvlist_t *fmri;
212 	size_t nserids = 0;
213 	char **serids = NULL;
214 
215 	if (!fmd_nvl_fmri_present(hdl, asru)) {
216 		fmd_hdl_debug(hdl, "dimm_lookup: discarding old ereport\n");
217 		return (NULL);
218 	}
219 
220 	if ((unum = cmd_fmri_get_unum(asru)) == NULL) {
221 		CMD_STAT_BUMP(bad_mem_asru);
222 		return (NULL);
223 	}
224 
225 #ifdef sun4v
226 	if (nvlist_lookup_string_array(asru, FM_FMRI_HC_SERIAL_ID, &serids,
227 	    &nserids) != 0) {
228 		fmd_hdl_debug(hdl, "sun4v mem: FMRI does not"
229 		    " have serial_ids\n");
230 		CMD_STAT_BUMP(bad_mem_asru);
231 		return (NULL);
232 	}
233 #endif
234 	fmri = cmd_mem_fmri_create(unum, serids, nserids);
235 	if (fmd_nvl_fmri_expand(hdl, fmri) < 0) {
236 		CMD_STAT_BUMP(bad_mem_asru);
237 		nvlist_free(fmri);
238 		return (NULL);
239 	}
240 
241 	fmd_hdl_debug(hdl, "dimm_create: creating new DIMM %s\n", unum);
242 	CMD_STAT_BUMP(dimm_creat);
243 
244 	dimm = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
245 	dimm->dimm_nodetype = CMD_NT_DIMM;
246 	dimm->dimm_version = CMD_DIMM_VERSION;
247 	dimm->dimm_phys_addr_low = ULLONG_MAX;
248 	dimm->dimm_phys_addr_hi = 0;
249 	dimm->dimm_syl_error = USHRT_MAX;
250 
251 	cmd_bufname(dimm->dimm_bufname, sizeof (dimm->dimm_bufname), "dimm_%s",
252 	    unum);
253 	cmd_fmri_init(hdl, &dimm->dimm_asru, fmri, "dimm_asru_%s", unum);
254 
255 	nvlist_free(fmri);
256 
257 	(void) nvlist_lookup_string(dimm->dimm_asru_nvl, FM_FMRI_MEM_UNUM,
258 	    (char **)&dimm->dimm_unum);
259 
260 	dimm_attach_to_bank(hdl, dimm);
261 
262 	cmd_mem_retirestat_create(hdl, &dimm->dimm_retstat, dimm->dimm_unum, 0,
263 	    CMD_DIMM_STAT_PREFIX);
264 
265 	cmd_list_append(&cmd.cmd_dimms, dimm);
266 	cmd_dimm_dirty(hdl, dimm);
267 
268 	return (dimm);
269 }
270 
271 cmd_dimm_t *
cmd_dimm_lookup(fmd_hdl_t * hdl,nvlist_t * asru)272 cmd_dimm_lookup(fmd_hdl_t *hdl, nvlist_t *asru)
273 {
274 	cmd_dimm_t *dimm;
275 	const char *unum;
276 
277 	if ((unum = cmd_fmri_get_unum(asru)) == NULL) {
278 		CMD_STAT_BUMP(bad_mem_asru);
279 		return (NULL);
280 	}
281 
282 	dimm = dimm_lookup_by_unum(unum);
283 
284 	if (dimm != NULL && !fmd_nvl_fmri_present(hdl, dimm->dimm_asru_nvl)) {
285 		/*
286 		 * The DIMM doesn't exist anymore, so we need to delete the
287 		 * state structure, which is now out of date.  The containing
288 		 * bank (if any) is also out of date, so blow it away too.
289 		 */
290 		fmd_hdl_debug(hdl, "dimm_lookup: discarding old dimm\n");
291 
292 		if (dimm->dimm_bank != NULL)
293 			cmd_bank_destroy(hdl, dimm->dimm_bank);
294 		cmd_dimm_destroy(hdl, dimm);
295 
296 		return (NULL);
297 	}
298 
299 	return (dimm);
300 }
301 
302 static cmd_dimm_t *
dimm_v0tov2(fmd_hdl_t * hdl,cmd_dimm_0_t * old,size_t oldsz)303 dimm_v0tov2(fmd_hdl_t *hdl, cmd_dimm_0_t *old, size_t oldsz)
304 {
305 	cmd_dimm_t *new;
306 
307 	if (oldsz != sizeof (cmd_dimm_0_t)) {
308 		fmd_hdl_abort(hdl, "size of state doesn't match size of "
309 		    "version 0 state (%u bytes).\n", sizeof (cmd_dimm_0_t));
310 	}
311 
312 	new = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
313 	new->dimm_header = old->dimm0_header;
314 	new->dimm_version = CMD_DIMM_VERSION;
315 	new->dimm_asru = old->dimm0_asru;
316 	new->dimm_nretired = old->dimm0_nretired;
317 	new->dimm_phys_addr_hi = 0;
318 	new->dimm_phys_addr_low = ULLONG_MAX;
319 
320 	fmd_hdl_free(hdl, old, oldsz);
321 	return (new);
322 }
323 
324 static cmd_dimm_t *
dimm_v1tov2(fmd_hdl_t * hdl,cmd_dimm_1_t * old,size_t oldsz)325 dimm_v1tov2(fmd_hdl_t *hdl, cmd_dimm_1_t *old, size_t oldsz)
326 {
327 
328 	cmd_dimm_t *new;
329 
330 	if (oldsz != sizeof (cmd_dimm_1_t)) {
331 		fmd_hdl_abort(hdl, "size of state doesn't match size of "
332 		    "version 1 state (%u bytes).\n", sizeof (cmd_dimm_1_t));
333 	}
334 
335 	new = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
336 
337 	new->dimm_header = old->dimm1_header;
338 	new->dimm_version = CMD_DIMM_VERSION;
339 	new->dimm_asru = old->dimm1_asru;
340 	new->dimm_nretired = old->dimm1_nretired;
341 	new->dimm_flags = old->dimm1_flags;
342 	new->dimm_phys_addr_hi = 0;
343 	new->dimm_phys_addr_low = ULLONG_MAX;
344 
345 	fmd_hdl_free(hdl, old, oldsz);
346 	return (new);
347 }
348 
349 static cmd_dimm_t *
dimm_wrapv2(fmd_hdl_t * hdl,cmd_dimm_pers_t * pers,size_t psz)350 dimm_wrapv2(fmd_hdl_t *hdl, cmd_dimm_pers_t *pers, size_t psz)
351 {
352 	cmd_dimm_t *dimm;
353 
354 	if (psz != sizeof (cmd_dimm_pers_t)) {
355 		fmd_hdl_abort(hdl, "size of state doesn't match size of "
356 		    "version 1 state (%u bytes).\n", sizeof (cmd_dimm_pers_t));
357 	}
358 
359 	dimm = fmd_hdl_zalloc(hdl, sizeof (cmd_dimm_t), FMD_SLEEP);
360 	bcopy(pers, dimm, sizeof (cmd_dimm_pers_t));
361 	fmd_hdl_free(hdl, pers, psz);
362 	return (dimm);
363 }
364 
365 void *
cmd_dimm_restore(fmd_hdl_t * hdl,fmd_case_t * cp,cmd_case_ptr_t * ptr)366 cmd_dimm_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr)
367 {
368 	cmd_dimm_t *dimm;
369 
370 	for (dimm = cmd_list_next(&cmd.cmd_dimms); dimm != NULL;
371 	    dimm = cmd_list_next(dimm)) {
372 		if (strcmp(dimm->dimm_bufname, ptr->ptr_name) == 0)
373 			break;
374 	}
375 
376 	if (dimm == NULL) {
377 		int migrated = 0;
378 		size_t dimmsz;
379 
380 		fmd_hdl_debug(hdl, "restoring dimm from %s\n", ptr->ptr_name);
381 
382 		if ((dimmsz = fmd_buf_size(hdl, NULL, ptr->ptr_name)) == 0) {
383 			fmd_hdl_abort(hdl, "dimm referenced by case %s does "
384 			    "not exist in saved state\n",
385 			    fmd_case_uuid(hdl, cp));
386 		} else if (dimmsz > CMD_DIMM_MAXSIZE ||
387 		    dimmsz < CMD_DIMM_MINSIZE) {
388 			fmd_hdl_abort(hdl,
389 			    "dimm buffer referenced by case %s "
390 			    "is out of bounds (is %u bytes, max %u, min %u)\n",
391 			    fmd_case_uuid(hdl, cp), dimmsz,
392 			    CMD_DIMM_MAXSIZE, CMD_DIMM_MINSIZE);
393 		}
394 
395 		if ((dimm = cmd_buf_read(hdl, NULL, ptr->ptr_name,
396 		    dimmsz)) == NULL) {
397 			fmd_hdl_abort(hdl, "failed to read dimm buf %s",
398 			    ptr->ptr_name);
399 		}
400 
401 		fmd_hdl_debug(hdl, "found %d in version field\n",
402 		    dimm->dimm_version);
403 
404 		if (CMD_DIMM_VERSIONED(dimm)) {
405 			switch (dimm->dimm_version) {
406 			case CMD_DIMM_VERSION_1:
407 				dimm = dimm_v1tov2(hdl, (cmd_dimm_1_t *)dimm,
408 				    dimmsz);
409 				break;
410 			case CMD_DIMM_VERSION_2:
411 				dimm = dimm_wrapv2(hdl, (cmd_dimm_pers_t *)dimm,
412 				    dimmsz);
413 				break;
414 			default:
415 				fmd_hdl_abort(hdl, "unknown version (found %d) "
416 				    "for dimm state referenced by case %s.\n",
417 				    dimm->dimm_version, fmd_case_uuid(hdl, cp));
418 				break;
419 			}
420 		} else {
421 			dimm = dimm_v0tov2(hdl, (cmd_dimm_0_t *)dimm, dimmsz);
422 			migrated = 1;
423 		}
424 
425 		if (migrated) {
426 			CMD_STAT_BUMP(dimm_migrat);
427 			cmd_dimm_dirty(hdl, dimm);
428 		}
429 
430 		cmd_fmri_restore(hdl, &dimm->dimm_asru);
431 
432 		if ((errno = nvlist_lookup_string(dimm->dimm_asru_nvl,
433 		    FM_FMRI_MEM_UNUM, (char **)&dimm->dimm_unum)) != 0)
434 			fmd_hdl_abort(hdl, "failed to retrieve unum from asru");
435 
436 		dimm_attach_to_bank(hdl, dimm);
437 
438 		cmd_mem_retirestat_create(hdl, &dimm->dimm_retstat,
439 		    dimm->dimm_unum, dimm->dimm_nretired, CMD_DIMM_STAT_PREFIX);
440 
441 		cmd_list_append(&cmd.cmd_dimms, dimm);
442 	}
443 
444 	switch (ptr->ptr_subtype) {
445 	case BUG_PTR_DIMM_CASE:
446 		fmd_hdl_debug(hdl, "recovering from out of order dimm ptr\n");
447 		cmd_case_redirect(hdl, cp, CMD_PTR_DIMM_CASE);
448 		/*FALLTHROUGH*/
449 	case CMD_PTR_DIMM_CASE:
450 		cmd_mem_case_restore(hdl, &dimm->dimm_case, cp, "dimm",
451 		    dimm->dimm_unum);
452 		break;
453 	default:
454 		fmd_hdl_abort(hdl, "invalid %s subtype %d\n",
455 		    ptr->ptr_name, ptr->ptr_subtype);
456 	}
457 
458 	return (dimm);
459 }
460 
461 void
cmd_dimm_validate(fmd_hdl_t * hdl)462 cmd_dimm_validate(fmd_hdl_t *hdl)
463 {
464 	cmd_dimm_t *dimm, *next;
465 
466 	for (dimm = cmd_list_next(&cmd.cmd_dimms); dimm != NULL; dimm = next) {
467 		next = cmd_list_next(dimm);
468 
469 		if (!fmd_nvl_fmri_present(hdl, dimm->dimm_asru_nvl))
470 			cmd_dimm_destroy(hdl, dimm);
471 	}
472 }
473 
474 void
cmd_dimm_dirty(fmd_hdl_t * hdl,cmd_dimm_t * dimm)475 cmd_dimm_dirty(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
476 {
477 	if (fmd_buf_size(hdl, NULL, dimm->dimm_bufname) !=
478 	    sizeof (cmd_dimm_pers_t))
479 		fmd_buf_destroy(hdl, NULL, dimm->dimm_bufname);
480 
481 	/* No need to rewrite the FMRIs in the dimm - they don't change */
482 	fmd_buf_write(hdl, NULL, dimm->dimm_bufname, &dimm->dimm_pers,
483 	    sizeof (cmd_dimm_pers_t));
484 }
485 
486 void
cmd_dimm_gc(fmd_hdl_t * hdl)487 cmd_dimm_gc(fmd_hdl_t *hdl)
488 {
489 	cmd_dimm_validate(hdl);
490 }
491 
492 void
cmd_dimm_fini(fmd_hdl_t * hdl)493 cmd_dimm_fini(fmd_hdl_t *hdl)
494 {
495 	cmd_dimm_t *dimm;
496 
497 	while ((dimm = cmd_list_next(&cmd.cmd_dimms)) != NULL)
498 		cmd_dimm_free(hdl, dimm, FMD_B_FALSE);
499 }
500 
501 
502 void
cmd_dimm_save_symbol_error(cmd_dimm_t * dimm,uint16_t upos)503 cmd_dimm_save_symbol_error(cmd_dimm_t *dimm, uint16_t upos)
504 {
505 	cmd_dimm_t *d = NULL, *next = NULL;
506 
507 	for (d = cmd_list_next(&cmd.cmd_dimms); d != NULL; d = next) {
508 		next = cmd_list_next(d);
509 		if (cmd_same_datapath_dimms(dimm, d))
510 			d->dimm_syl_error = upos;
511 	}
512 }
513 
514 int
cmd_dimm_check_symbol_error(cmd_dimm_t * dimm,uint16_t synd)515 cmd_dimm_check_symbol_error(cmd_dimm_t *dimm, uint16_t synd)
516 {
517 	int upos;
518 	cmd_dimm_t *d, *next;
519 
520 	if ((upos = cmd_synd2upos(synd)) < 0)
521 		return (0);
522 
523 	for (d = cmd_list_next(&cmd.cmd_dimms); d != NULL; d = next) {
524 		next = cmd_list_next(d);
525 		if (cmd_same_datapath_dimms(dimm, d) &&
526 		    (d->dimm_syl_error == upos))
527 			return (1);
528 	}
529 
530 	return (0);
531 }
532