xref: /titanic_52/usr/src/cmd/fm/modules/sun4u/cpumem-diagnosis/cmd_dp.c (revision 8ca4fa23f8750b90c13a6933cc51ddb7d29abf22)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <strings.h>
29 #include <string.h>
30 #include <errno.h>
31 #include <fm/fmd_api.h>
32 #include <sys/fm/protocol.h>
33 #include <sys/async.h>
34 #include <sys/time.h>
35 #include <cmd.h>
36 #include <cmd_state.h>
37 #include <cmd_mem.h>
38 #include <cmd_dp.h>
39 #include <cmd_dp_page.h>
40 #include <libnvpair.h>
41 #include <fcntl.h>
42 #include <unistd.h>
43 #include <sys/mem.h>
44 #include <sys/plat_datapath.h>
45 
46 /*ARGSUSED*/
47 static nvlist_t *
48 dp_cpu_fmri(fmd_hdl_t *hdl, uint32_t cpuid, uint64_t serial_id)
49 {
50 	nvlist_t	*nvl = NULL;
51 	int		err;
52 	char sbuf[21]; /* sizeof (UINT64_MAX) + '\0' */
53 
54 	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
55 		return (NULL);
56 
57 	err = nvlist_add_string(nvl, FM_FMRI_SCHEME, FM_FMRI_SCHEME_CPU);
58 	err |= nvlist_add_uint8(nvl, FM_VERSION, FM_CPU_SCHEME_VERSION);
59 	err |= nvlist_add_uint32(nvl, FM_FMRI_CPU_ID, cpuid);
60 
61 	/*
62 	 * Version 1 calls for a string-based serial number
63 	 */
64 	(void) snprintf(sbuf, sizeof (sbuf), "%llX", (u_longlong_t)serial_id);
65 	err |= nvlist_add_string(nvl, FM_FMRI_CPU_SERIAL_ID, sbuf);
66 	if (err != 0) {
67 		nvlist_free(nvl);
68 		return (NULL);
69 	}
70 	return (nvl);
71 }
72 
73 cmd_dp_t *
74 cmd_dp_lookup_fault(fmd_hdl_t *hdl, uint32_t cpuid)
75 {
76 	cmd_dp_t	*ptr;
77 	int		i, found = 0;
78 
79 	/*
80 	 * Scan the cmd.cmd_datapaths list to see if there is
81 	 * a fault event present that impacts 'cpuid'
82 	 */
83 	for (ptr = cmd_list_next(&cmd.cmd_datapaths); ptr != NULL;
84 	    ptr = cmd_list_next(ptr)) {
85 		if (ptr->dp_erpt_type == DP_FAULT) {
86 			for (i = 0; i < ptr->dp_ncpus; i++) {
87 				if (ptr->dp_cpuid_list[i] == cpuid) {
88 					found = 1;
89 					break;
90 				}
91 			}
92 		}
93 		if (found)
94 			break;
95 	}
96 
97 	/*
98 	 * Check if the FMRI for the found cpuid exists in the domain.
99 	 * If it does not, it implies a DR has been done and this DP_FAULT
100 	 * is no longer needed.
101 	 */
102 	if (ptr != NULL) {
103 		nvlist_t	*nvl;
104 
105 		nvl = dp_cpu_fmri(hdl, ptr->dp_cpuid_list[i],
106 		    ptr->dp_serid_list[i]);
107 
108 		if (nvl != NULL) {
109 			if (!fmd_nvl_fmri_present(hdl, nvl)) {
110 				cmd_dp_destroy(hdl, ptr);
111 				ptr = NULL;
112 			}
113 			nvlist_free(nvl);
114 		}
115 	}
116 	return (ptr);
117 }
118 
119 cmd_dp_t *
120 cmd_dp_lookup_error(cmd_dp_t *dp)
121 {
122 	cmd_dp_t	*ptr;
123 
124 	/*
125 	 * Scan the cmd.cmd_datapaths list to see if there is
126 	 * an existing error that matches 'dp'. A match is if
127 	 * both dp_err and the base cpuid are identical
128 	 */
129 	for (ptr = cmd_list_next(&cmd.cmd_datapaths); ptr != NULL;
130 	    ptr = cmd_list_next(ptr)) {
131 		if (ptr->dp_erpt_type == DP_ERROR) {
132 			if ((ptr->dp_err == dp->dp_err) &&
133 			    (ptr->dp_cpuid_list[0] == dp->dp_cpuid_list[0]))
134 				return (ptr);
135 		}
136 	}
137 	return (NULL);
138 }
139 
140 /*
141  * Allocates an nvlist_t, and sets ASRU information according to
142  * the cmd_dp_t provided.
143  */
144 /*ARGSUSED*/
145 nvlist_t *
146 cmd_dp_setasru(fmd_hdl_t *hdl, cmd_dp_t *dpt)
147 {
148 	nvlist_t	*asru, *hcelem[DP_MAX_ASRUS];
149 	int		i, j, sz, err;
150 	char		buf[DP_MAX_BUF];
151 
152 	sz = dpt->dp_ncpus;
153 
154 	/* put ASRUs in an nvlist */
155 	for (i = 0; i < sz; i++) {
156 		(void) snprintf(buf, DP_MAX_BUF, "%d", dpt->dp_cpuid_list[i]);
157 		if (nvlist_alloc(&hcelem[i], NV_UNIQUE_NAME, 0) != 0)
158 			return (NULL);
159 
160 		err = nvlist_add_string(hcelem[i], FM_FMRI_HC_NAME,
161 		    FM_FMRI_CPU_ID);
162 		err |= nvlist_add_string(hcelem[i], FM_FMRI_HC_ID, buf);
163 		if (err != 0) {
164 			for (j = 0; j < i + 1; j++)
165 				nvlist_free(hcelem[j]);
166 			return (NULL);
167 		}
168 	}
169 
170 	/* put it in an HC scheme */
171 	if (nvlist_alloc(&asru, NV_UNIQUE_NAME, 0) != 0) {
172 		for (j = 0; j < sz; j++)
173 			nvlist_free(hcelem[j]);
174 		return (NULL);
175 	}
176 	err = nvlist_add_uint8(asru, FM_VERSION, FM_HC_SCHEME_VERSION);
177 	err |= nvlist_add_string(asru, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
178 	err |= nvlist_add_string(asru, FM_FMRI_HC_ROOT, "");
179 	err |= nvlist_add_uint32(asru, FM_FMRI_HC_LIST_SZ, sz);
180 	err |= nvlist_add_nvlist_array(asru, FM_FMRI_HC_LIST, &hcelem[0],
181 	    dpt->dp_ncpus);
182 	if (err != 0) {
183 		for (j = 0; j < sz; j++)
184 			nvlist_free(hcelem[j]);
185 		nvlist_free(asru);
186 		return (NULL);
187 	}
188 
189 	/* free up memory */
190 	for (j = 0; j < sz; j++)
191 		nvlist_free(hcelem[j]);
192 
193 	/* return the ASRU */
194 	return (asru);
195 }
196 
197 void
198 dp_buf_write(fmd_hdl_t *hdl, cmd_dp_t *dp)
199 {
200 	size_t sz;
201 
202 	if ((sz = fmd_buf_size(hdl, NULL, dp->dp_bufname)) != 0 &&
203 	    sz != sizeof (cmd_dp_pers_t))
204 		fmd_buf_destroy(hdl, NULL, dp->dp_bufname);
205 
206 	fmd_buf_write(hdl, NULL, dp->dp_bufname, &dp->dp_pers,
207 	    sizeof (cmd_dp_pers_t));
208 }
209 
210 static cmd_dp_t *
211 dp_wrapv0(fmd_hdl_t *hdl, cmd_dp_pers_t *pers, size_t psz)
212 {
213 	cmd_dp_t *dp;
214 
215 	if (psz != sizeof (cmd_dp_pers_t)) {
216 		fmd_hdl_abort(hdl, "size of state doesn't match size of "
217 		    "version 1 state (%u bytes).\n", sizeof (cmd_dp_pers_t));
218 	}
219 
220 	dp = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_t), FMD_SLEEP);
221 	bcopy(pers, dp, sizeof (cmd_dp_pers_t));
222 	fmd_hdl_free(hdl, pers, psz);
223 	return (dp);
224 }
225 
226 void *
227 cmd_dp_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr)
228 {
229 	cmd_dp_t *dp;
230 
231 	for (dp = cmd_list_next(&cmd.cmd_datapaths); dp != NULL;
232 	    dp = cmd_list_next(dp)) {
233 		if (dp->dp_case == cp)
234 			break;
235 	}
236 
237 	if (dp == NULL) {
238 		size_t dpsz;
239 
240 		fmd_hdl_debug(hdl, "restoring dp from %s\n", ptr->ptr_name);
241 
242 		if ((dpsz = fmd_buf_size(hdl, NULL, ptr->ptr_name)) == 0) {
243 			if (fmd_case_solved(hdl, cp) ||
244 			    fmd_case_closed(hdl, cp)) {
245 				fmd_hdl_debug(hdl, "dp %s from case %s not "
246 				    "found. Case is already solved or closed\n",
247 				    ptr->ptr_name, fmd_case_uuid(hdl, cp));
248 				return (NULL);
249 			} else {
250 				fmd_hdl_abort(hdl, "dp referenced by case %s "
251 				    "does not exist in saved state\n",
252 				    fmd_case_uuid(hdl, cp));
253 			}
254 		} else if (dpsz > CMD_DP_MAXSIZE ||
255 		    dpsz < CMD_DP_MINSIZE) {
256 			fmd_hdl_abort(hdl, "dp buffer referenced by "
257 			    "case %s is out of bounds (is %u bytes, "
258 			    "max %u, min %u)\n", fmd_case_uuid(hdl, cp),
259 			    dpsz, CMD_DP_MAXSIZE, CMD_DP_MINSIZE);
260 		}
261 
262 		if ((dp = cmd_buf_read(hdl, NULL, ptr->ptr_name, dpsz)) == NULL)
263 			fmd_hdl_abort(hdl, "failed to read dp buf %s",
264 			    ptr->ptr_name);
265 
266 		switch (dp->dp_version) {
267 		case CMD_DP_VERSION_0:
268 			dp = dp_wrapv0(hdl, (cmd_dp_pers_t *)dp, dpsz);
269 			break;
270 		default:
271 			fmd_hdl_abort(hdl, "unknown version (found %d) "
272 			    "for dp state referenced by case %s.\n",
273 			    dp->dp_version, fmd_case_uuid(hdl, cp));
274 			break;
275 		}
276 
277 		dp->dp_case = cp;
278 
279 		if (dp->dp_erpt_type == DP_ERROR) {
280 			fmd_event_t *ep = fmd_case_getprincipal(hdl, cp);
281 
282 			++cmd.cmd_dp_flag;
283 
284 			dp->dp_id = fmd_timer_install(hdl,
285 			    (void *)CMD_TIMERTYPE_DP, ep,
286 			    (hrtime_t)NANOSEC * (dp->dp_t_value + 120));
287 		}
288 
289 		cmd_list_append(&cmd.cmd_datapaths, dp);
290 	}
291 
292 	return (dp);
293 }
294 
295 void
296 cmd_dp_close(fmd_hdl_t *hdl, void *arg)
297 {
298 	cmd_dp_destroy(hdl, arg);
299 }
300 
301 void
302 cmd_dp_timeout(fmd_hdl_t *hdl, id_t id)
303 {
304 	cmd_dp_t		*dp;
305 
306 	/* close case associated with the timer */
307 	for (dp = cmd_list_next(&cmd.cmd_datapaths); dp != NULL;
308 	    dp = cmd_list_next(dp)) {
309 		if (dp->dp_id == id) {
310 			cmd_dp_destroy(hdl, dp);
311 			break;
312 		}
313 	}
314 
315 	fmd_hdl_debug(hdl, "cmd_dp_timeout() complete\n");
316 }
317 
318 /*
319  * Validate by matching each cmd_dp_t cpu and serial id to what is
320  * installed and active on this machine or domain. Delete the cmd_dp_t
321  * if no match is made.
322  */
323 void
324 cmd_dp_validate(fmd_hdl_t *hdl)
325 {
326 	cmd_dp_t *dp, *next;
327 	nvlist_t *nvl;
328 	int i, no_match;
329 
330 	for (dp = cmd_list_next(&cmd.cmd_datapaths); dp != NULL; dp = next) {
331 		next = cmd_list_next(dp);
332 
333 		for (i = 0, no_match = 0; i < dp->dp_ncpus; i++) {
334 			nvl = dp_cpu_fmri(hdl, dp->dp_cpuid_list[i],
335 			    dp->dp_serid_list[i]);
336 
337 			if (nvl == NULL)
338 				fmd_hdl_abort(hdl, "could not make CPU fmri");
339 
340 			if (!fmd_nvl_fmri_present(hdl, nvl))
341 				no_match = 1;
342 
343 			nvlist_free(nvl);
344 
345 			if (no_match) {
346 				cmd_dp_destroy(hdl, dp);
347 				break;
348 			}
349 		}
350 	}
351 }
352 
353 static void
354 cmd_dp_free(fmd_hdl_t *hdl, cmd_dp_t *dp, int destroy)
355 {
356 	if (dp->dp_case != NULL)
357 		cmd_case_fini(hdl, dp->dp_case, destroy);
358 
359 	if (destroy && dp->dp_erpt_type == DP_ERROR) {
360 		--cmd.cmd_dp_flag;
361 		/*
362 		 * If there are no active datapath events, replay any
363 		 * pages that were deferred.
364 		 */
365 		if (cmd.cmd_dp_flag == 0)
366 			cmd_dp_page_replay(hdl);
367 	}
368 
369 	if (destroy)
370 		fmd_buf_destroy(hdl, NULL, dp->dp_bufname);
371 
372 	cmd_list_delete(&cmd.cmd_datapaths, dp);
373 	fmd_hdl_free(hdl, dp, sizeof (cmd_dp_t));
374 }
375 
376 void
377 cmd_dp_destroy(fmd_hdl_t *hdl, cmd_dp_t *dp)
378 {
379 	cmd_dp_free(hdl, dp, FMD_B_TRUE);
380 }
381 
382 /*ARGSUSED*/
383 int
384 cmd_dp_error(fmd_hdl_t *hdl)
385 {
386 	if (cmd.cmd_dp_flag)
387 		return (1);
388 	else
389 		return (0);
390 }
391 
392 int
393 cmd_dp_get_mcid(uint64_t addr, int *mcid)
394 {
395 	int fd, rc;
396 	mem_info_t data;
397 
398 	if ((fd = open("/dev/mem", O_RDONLY)) < 0)
399 		return (-1);
400 
401 	data.m_addr = addr;
402 	data.m_synd = 0;
403 	if ((rc = ioctl(fd, MEM_INFO, &data)) < 0) {
404 		(void) close(fd);
405 		return (rc);
406 	}
407 
408 	(void) close(fd);
409 	*mcid = data.m_mcid;
410 
411 	return (0);
412 }
413 
414 /*ARGSUSED*/
415 int
416 cmd_dp_fault(fmd_hdl_t *hdl, uint64_t addr)
417 {
418 	int mcid;
419 
420 	if (cmd_dp_get_mcid(addr, &mcid) < 0)
421 		fmd_hdl_abort(hdl, "cmd_dp_get_mcid failed");
422 
423 	if (cmd_dp_lookup_fault(hdl, mcid) != NULL)
424 		return (1);
425 	else
426 		return (0);
427 }
428 
429 void
430 cmd_dp_fini(fmd_hdl_t *hdl)
431 {
432 	cmd_dp_t *dp;
433 	cmd_dp_defer_t *dpage;
434 
435 	while ((dp = cmd_list_next(&cmd.cmd_datapaths)) != NULL)
436 		cmd_dp_free(hdl, dp, FMD_B_FALSE);
437 
438 	while ((dpage = cmd_list_next(&cmd.cmd_deferred_pages)) != NULL) {
439 		cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
440 		fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
441 	}
442 }
443