xref: /titanic_44/usr/src/cmd/fm/fmd/common/fmd_log.c (revision f808c858fa61e7769218966759510a8b1190dfcf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 /*
31  * FMD Log File Subsystem
32  *
33  * Events are written to one of two log files as they are received or created;
34  * the error log tracks all ereport.* events received on the inbound event
35  * transport, and the fault log tracks all list.* events generated by fmd or
36  * its client modules.  In addition, we use the same log file format to cache
37  * state and events associated with ASRUs that are named in a diagnosis.
38  *
39  * The log files use the exacct format manipulated by libexacct(3LIB) and
40  * originally defined in PSARC 1999/119.  However, the exacct library was
41  * designed primarily for read-only clients and without the synchronous i/o
42  * considerations and seeking required for fmd, so we use libexacct here only
43  * to read and write the file headers and to pack data from memory into a file
44  * bytestream.  All of the i/o and file offset manipulations are performed by
45  * the fmd code below.  Our exacct file management uses the following grammar:
46  *
47  * file := hdr toc event*
48  * hdr := EXD_FMA_LABEL EXD_FMA_VERSION EXD_FMA_OSREL EXD_FMA_OSVER
49  * EXD_FMA_PLAT EXD_FMA_UUID
50  * toc := EXD_FMA_OFFSET
51  * event := EXD_FMA_TODSEC EXD_FMA_TODNSEC EXD_FMA_NVLIST evref* or legacy evref
52  * evref := EXD_FMA_UUID EXD_FMA_OFFSET
53  * legacy evref := EXD_FMA_MAJOR EXD_FMA_MINOR EXD_FMA_INODE EXD_FMA_OFFSET
54  *
55  * Any event can be uniquely identified by the tuple (file, offset) where file
56  * is encoded as (uuid) when we are cross-linking files.  For legacy file
57  * formats we still support encoding the reference as (major, minor, inode).
 * Note that we break the file's dev_t out into its two 32-bit components to
59  * permit development of either 32-bit or 64-bit log readers and writers; the
60  * LFS APIs do not yet export a 64-bit dev_t to fstat64(), so there is no way
61  * for a 32-bit application to retrieve and store a 64-bit dev_t.
62  *
63  * In order to replay events in the event of an fmd crash, events are initially
64  * written to the error log using the group catalog tag EXD_GROUP_RFMA by the
65  * fmd_log_append() function.  Later, once an event transitions from the
66  * received state to one of its other states (see fmd_event.c for details),
67  * fmd_log_commit() is used to overwrite the tag with EXD_GROUP_FMA, indicating
68  * that the event is fully processed and no longer needs to be replayed.
69  */
70 
71 #include <sys/types.h>
72 #include <sys/mkdev.h>
73 #include <sys/statvfs.h>
74 #include <sys/fm/protocol.h>
75 #include <sys/exacct_impl.h>
76 #include <uuid/uuid.h>
77 
78 #include <unistd.h>
79 #include <limits.h>
80 #include <fcntl.h>
81 #include <ctype.h>
82 
83 #include <fmd_alloc.h>
84 #include <fmd_error.h>
85 #include <fmd_string.h>
86 #include <fmd_event.h>
87 #include <fmd_conf.h>
88 #include <fmd_subr.h>
89 #include <fmd_case.h>
90 #include <fmd_log.h>
91 
92 #include <fmd.h>
93 
/*
 * Exacct catalog tags for the record groups written by this file.  Events
 * appended to a log that has FMD_LF_REPLAY set are tagged CAT_FMA_RGROUP by
 * fmd_log_append(); fmd_log_commit() later overwrites that tag in place with
 * CAT_FMA_GROUP.  The header and table-of-contents groups always use
 * CAT_FMA_GROUP.
 */
#define	CAT_FMA_RGROUP	(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_RFMA)
#define	CAT_FMA_GROUP	(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_FMA)

/*
 * Exacct catalog tags for the individual items stored inside the groups
 * above.  Each tag combines the item's exacct data type (string, 32-bit or
 * 64-bit unsigned integer, or raw bytes) with the FMA data identifier named
 * in the file grammar at the top of this file.
 */
#define	CAT_FMA_LABEL	(EXT_STRING | EXC_DEFAULT | EXD_FMA_LABEL)
#define	CAT_FMA_VERSION	(EXT_STRING | EXC_DEFAULT | EXD_FMA_VERSION)
#define	CAT_FMA_OSREL	(EXT_STRING | EXC_DEFAULT | EXD_FMA_OSREL)
#define	CAT_FMA_OSVER	(EXT_STRING | EXC_DEFAULT | EXD_FMA_OSVER)
#define	CAT_FMA_PLAT	(EXT_STRING | EXC_DEFAULT | EXD_FMA_PLAT)
#define	CAT_FMA_UUID	(EXT_STRING | EXC_DEFAULT | EXD_FMA_UUID)
#define	CAT_FMA_TODSEC	(EXT_UINT64 | EXC_DEFAULT | EXD_FMA_TODSEC)
#define	CAT_FMA_TODNSEC	(EXT_UINT64 | EXC_DEFAULT | EXD_FMA_TODNSEC)
#define	CAT_FMA_NVLIST	(EXT_RAW | EXC_DEFAULT | EXD_FMA_NVLIST)
#define	CAT_FMA_MAJOR	(EXT_UINT32 | EXC_DEFAULT | EXD_FMA_MAJOR)
#define	CAT_FMA_MINOR	(EXT_UINT32 | EXC_DEFAULT | EXD_FMA_MINOR)
#define	CAT_FMA_INODE	(EXT_UINT64 | EXC_DEFAULT | EXD_FMA_INODE)
#define	CAT_FMA_OFFSET	(EXT_UINT64 | EXC_DEFAULT | EXD_FMA_OFFSET)
110 
111 static ssize_t
112 fmd_log_write(fmd_log_t *lp, const void *buf, size_t n)
113 {
114 	ssize_t resid = n;
115 	ssize_t len;
116 
117 	ASSERT(MUTEX_HELD(&lp->log_lock));
118 
119 	while (resid != 0) {
120 		if ((len = write(lp->log_fd, buf, resid)) <= 0)
121 			break;
122 
123 		resid -= len;
124 		buf = (char *)buf + len;
125 	}
126 
127 	if (resid == n && n != 0)
128 		return (-1);
129 
130 	return (n - resid);
131 }
132 
133 static int
134 fmd_log_write_hdr(fmd_log_t *lp, const char *tag)
135 {
136 	ea_object_t hdr, toc, i0, i1, i2, i3, i4, i5, i6;
137 	const char *osrel, *osver, *plat;
138 	off64_t off = 0;
139 	int err = 0;
140 	uuid_t uuid;
141 
142 	(void) fmd_conf_getprop(fmd.d_conf, "osrelease", &osrel);
143 	(void) fmd_conf_getprop(fmd.d_conf, "osversion", &osver);
144 	(void) fmd_conf_getprop(fmd.d_conf, "platform", &plat);
145 	(void) fmd_conf_getprop(fmd.d_conf, "uuidlen", &lp->log_uuidlen);
146 
147 	lp->log_uuid = fmd_zalloc(lp->log_uuidlen + 1, FMD_SLEEP);
148 	uuid_generate(uuid);
149 	uuid_unparse(uuid, lp->log_uuid);
150 
151 	err |= ea_set_group(&hdr, CAT_FMA_GROUP);
152 	err |= ea_set_group(&toc, CAT_FMA_GROUP);
153 
154 	err |= ea_set_item(&i0, CAT_FMA_LABEL, tag, 0);
155 	err |= ea_set_item(&i1, CAT_FMA_VERSION, fmd.d_version, 0);
156 	err |= ea_set_item(&i2, CAT_FMA_OSREL, osrel, 0);
157 	err |= ea_set_item(&i3, CAT_FMA_OSVER, osver, 0);
158 	err |= ea_set_item(&i4, CAT_FMA_PLAT, plat, 0);
159 	err |= ea_set_item(&i5, CAT_FMA_UUID, lp->log_uuid, 0);
160 	err |= ea_set_item(&i6, CAT_FMA_OFFSET, &off, 0);
161 
162 	(void) ea_attach_to_group(&hdr, &i0);
163 	(void) ea_attach_to_group(&hdr, &i1);
164 	(void) ea_attach_to_group(&hdr, &i2);
165 	(void) ea_attach_to_group(&hdr, &i3);
166 	(void) ea_attach_to_group(&hdr, &i4);
167 	(void) ea_attach_to_group(&hdr, &i5);
168 	(void) ea_attach_to_group(&toc, &i6);
169 
170 	if (err == 0) {
171 		size_t hdr_size = ea_pack_object(&hdr, NULL, 0);
172 		size_t toc_size = ea_pack_object(&toc, NULL, 0);
173 
174 		size_t size = hdr_size + toc_size;
175 		void *buf = fmd_alloc(size, FMD_SLEEP);
176 
177 		(void) ea_pack_object(&hdr, buf, hdr_size);
178 		(void) ea_pack_object(&toc, (char *)buf + hdr_size, toc_size);
179 
180 		if ((lp->log_off = lseek64(lp->log_fd, 0, SEEK_END)) == -1L)
181 			fmd_panic("failed to seek log %s", lp->log_name);
182 
183 		if (fmd_log_write(lp, buf, size) != size)
184 			err = errno; /* save errno for fmd_set_errno() below */
185 
186 		fmd_free(buf, size);
187 
188 		lp->log_toc = lp->log_off + hdr_size;
189 		lp->log_beg = lp->log_off + hdr_size + toc_size;
190 		lp->log_off = lp->log_off + hdr_size + toc_size;
191 
192 		if (lp->log_off != lseek64(lp->log_fd, 0, SEEK_END))
193 			fmd_panic("eof off != log_off 0x%llx\n", lp->log_off);
194 	} else
195 		err = EFMD_LOG_EXACCT;
196 
197 	(void) ea_free_item(&i0, EUP_ALLOC);
198 	(void) ea_free_item(&i1, EUP_ALLOC);
199 	(void) ea_free_item(&i2, EUP_ALLOC);
200 	(void) ea_free_item(&i3, EUP_ALLOC);
201 	(void) ea_free_item(&i4, EUP_ALLOC);
202 	(void) ea_free_item(&i5, EUP_ALLOC);
203 	(void) ea_free_item(&i6, EUP_ALLOC);
204 
205 	return (err ? fmd_set_errno(err) : 0);
206 }
207 
208 static int
209 fmd_log_check_err(fmd_log_t *lp, int err, const char *msg)
210 {
211 	int eaerr = ea_error();
212 	char buf[BUFSIZ];
213 
214 	(void) snprintf(buf, sizeof (buf), "%s: %s: %s\n",
215 	    lp->log_name, msg, eaerr != EXR_OK ?
216 	    fmd_ea_strerror(eaerr) : "catalog tag mismatch");
217 
218 	fmd_error(err, buf);
219 	return (fmd_set_errno(err));
220 }
221 
222 static int
223 fmd_log_check_hdr(fmd_log_t *lp, const char *tag)
224 {
225 	int got_version = 0, got_label = 0;
226 	ea_object_t *grp, *obj;
227 	off64_t hdr_off, hdr_size;
228 	int dvers, fvers;
229 	const char *p;
230 
231 	ea_clear(&lp->log_ea); /* resync exacct file */
232 
233 	if ((hdr_off = lseek64(lp->log_fd, 0, SEEK_CUR)) == -1L)
234 		fmd_panic("failed to seek log %s", lp->log_name);
235 
236 	/*
237 	 * Read the first group of log meta-data: the write-once read-only
238 	 * file header.  We read all records in this group, ignoring all but
239 	 * the VERSION and LABEL, which are required and must be verified.
240 	 */
241 	if ((grp = ea_get_object_tree(&lp->log_ea, 1)) == NULL ||
242 	    grp->eo_catalog != CAT_FMA_GROUP) {
243 		ea_free_object(grp, EUP_ALLOC);
244 		return (fmd_log_check_err(lp, EFMD_LOG_INVAL,
245 		    "invalid fma hdr record group"));
246 	}
247 
248 	for (obj = grp->eo_group.eg_objs; obj != NULL; obj = obj->eo_next) {
249 		switch (obj->eo_catalog) {
250 		case CAT_FMA_VERSION:
251 			for (dvers = 0, p = fmd.d_version;
252 			    *p != '\0'; p++) {
253 				if (isdigit(*p))
254 					dvers = dvers * 10 + (*p - '0');
255 				else
256 					break;
257 			}
258 
259 			for (fvers = 0, p = obj->eo_item.ei_string;
260 			    *p != '\0'; p++) {
261 				if (isdigit(*p))
262 					fvers = fvers * 10 + (*p - '0');
263 				else
264 					break;
265 			}
266 
267 			if (fvers > dvers) {
268 				fmd_error(EFMD_LOG_INVAL, "%s: log version "
269 				    "%s is not supported by this daemon\n",
270 				    lp->log_name, obj->eo_item.ei_string);
271 				ea_free_object(grp, EUP_ALLOC);
272 				return (fmd_set_errno(EFMD_LOG_VERSION));
273 			}
274 
275 			got_version++;
276 			break;
277 
278 		case CAT_FMA_LABEL:
279 			if (strcmp(obj->eo_item.ei_string, tag) != 0) {
280 				fmd_error(EFMD_LOG_INVAL, "%s: log tag '%s' "
281 				    "does not matched expected tag '%s'\n",
282 				    lp->log_name, obj->eo_item.ei_string, tag);
283 				ea_free_object(grp, EUP_ALLOC);
284 				return (fmd_set_errno(EFMD_LOG_INVAL));
285 			}
286 			got_label++;
287 			break;
288 		case CAT_FMA_UUID:
289 			lp->log_uuid = fmd_strdup(obj->eo_item.ei_string,
290 			    FMD_SLEEP);
291 			lp->log_uuidlen = strlen(lp->log_uuid);
292 			break;
293 		}
294 	}
295 
296 	hdr_size = ea_pack_object(grp, NULL, 0);
297 	ea_free_object(grp, EUP_ALLOC);
298 
299 	if (!got_version || !got_label) {
300 		fmd_error(EFMD_LOG_INVAL, "%s: fmd hdr record group did not "
301 		    "include mandatory version and/or label\n", lp->log_name);
302 		return (fmd_set_errno(EFMD_LOG_INVAL));
303 	}
304 
305 	/*
306 	 * Read the second group of log meta-data: the table of contents.  We
307 	 * expect this group to contain an OFFSET object indicating the current
308 	 * value of log_skip.  We save this in our fmd_log_t and then return.
309 	 */
310 	if ((grp = ea_get_object_tree(&lp->log_ea, 1)) == NULL ||
311 	    grp->eo_catalog != CAT_FMA_GROUP || grp->eo_group.eg_nobjs < 1 ||
312 	    grp->eo_group.eg_objs->eo_catalog != CAT_FMA_OFFSET) {
313 		ea_free_object(grp, EUP_ALLOC);
314 		return (fmd_log_check_err(lp, EFMD_LOG_INVAL,
315 		    "invalid fma toc record group"));
316 	}
317 
318 	lp->log_toc = hdr_off + hdr_size;
319 	lp->log_beg = hdr_off + hdr_size + ea_pack_object(grp, NULL, 0);
320 	lp->log_off = lseek64(lp->log_fd, 0, SEEK_END);
321 	lp->log_skip = grp->eo_group.eg_objs->eo_item.ei_uint64;
322 
323 	if (lp->log_skip > lp->log_off) {
324 		fmd_error(EFMD_LOG_INVAL, "%s: skip %llx exceeds file size; "
325 		    "resetting to zero\n", lp->log_name, lp->log_skip);
326 		lp->log_skip = 0;
327 	}
328 
329 	ea_free_object(grp, EUP_ALLOC);
330 	return (0);
331 }
332 
333 static int
334 fmd_log_open_exacct(fmd_log_t *lp, int aflags, int oflags)
335 {
336 	int fd = dup(lp->log_fd);
337 	const char *creator;
338 
339 	(void) fmd_conf_getprop(fmd.d_conf, "log.creator", &creator);
340 
341 	if (ea_fdopen(&lp->log_ea, fd, creator, aflags, oflags) != 0) {
342 		fmd_error(EFMD_LOG_EXACCT, "%s: failed to open log file: %s\n",
343 		    lp->log_name, fmd_ea_strerror(ea_error()));
344 		(void) close(fd);
345 		return (fmd_set_errno(EFMD_LOG_EXACCT));
346 	}
347 
348 	lp->log_flags |= FMD_LF_EAOPEN;
349 	return (0);
350 }
351 
352 static fmd_log_t *
353 fmd_log_xopen(const char *root, const char *name, const char *tag, int oflags)
354 {
355 	fmd_log_t *lp = fmd_zalloc(sizeof (fmd_log_t), FMD_SLEEP);
356 
357 	char buf[PATH_MAX];
358 	size_t len;
359 	int err;
360 
361 	(void) pthread_mutex_init(&lp->log_lock, NULL);
362 	(void) pthread_cond_init(&lp->log_cv, NULL);
363 	(void) pthread_mutex_lock(&lp->log_lock);
364 
365 	len = strlen(root) + strlen(name) + 2; /* for "/" and "\0" */
366 	lp->log_name = fmd_alloc(len, FMD_SLEEP);
367 	(void) snprintf(lp->log_name, len, "%s/%s", root, name);
368 	lp->log_tag = fmd_strdup(tag, FMD_SLEEP);
369 	(void) fmd_conf_getprop(fmd.d_conf, "log.minfree", &lp->log_minfree);
370 
371 	if (strcmp(lp->log_tag, FMD_LOG_ERROR) == 0)
372 		lp->log_flags |= FMD_LF_REPLAY;
373 
374 	if (strcmp(lp->log_tag, FMD_LOG_XPRT) == 0)
375 		oflags &= ~O_SYNC;
376 
377 top:
378 	if ((lp->log_fd = open64(lp->log_name, oflags, 0644)) == -1 ||
379 	    fstat64(lp->log_fd, &lp->log_stat) == -1) {
380 		fmd_error(EFMD_LOG_OPEN, "failed to open log %s", lp->log_name);
381 		fmd_log_close(lp);
382 		return (NULL);
383 	}
384 
385 	/*
386 	 * If our open() created the log file, use libexacct to write a header
387 	 * and position the file just after the header (EO_TAIL).  If the log
388 	 * file already existed, use libexacct to validate the header and again
389 	 * position the file just after the header (EO_HEAD).  Note that we lie
390 	 * to libexacct about 'oflags' in order to achieve the desired result.
391 	 */
392 	if (lp->log_stat.st_size == 0) {
393 		err = fmd_log_open_exacct(lp, EO_VALID_HDR | EO_TAIL,
394 		    O_CREAT | O_WRONLY) || fmd_log_write_hdr(lp, tag);
395 	} else {
396 		err = fmd_log_open_exacct(lp, EO_VALID_HDR | EO_HEAD,
397 		    O_RDONLY) || fmd_log_check_hdr(lp, tag);
398 	}
399 
400 	/*
401 	 * If ea_fdopen() failed and the log was pre-existing, attempt to move
402 	 * it aside and start a new one.  If we created the log but failed to
403 	 * initialize it, then we have no choice but to give up (e.g. EROFS).
404 	 */
405 	if (err) {
406 		fmd_error(EFMD_LOG_OPEN,
407 		    "failed to initialize log %s", lp->log_name);
408 
409 		if (lp->log_flags & FMD_LF_EAOPEN) {
410 			lp->log_flags &= ~FMD_LF_EAOPEN;
411 			(void) ea_close(&lp->log_ea);
412 		}
413 
414 		(void) close(lp->log_fd);
415 		lp->log_fd = -1;
416 
417 		if (lp->log_stat.st_size != 0 && snprintf(buf,
418 		    sizeof (buf), "%s-", lp->log_name) < PATH_MAX &&
419 		    rename(lp->log_name, buf) == 0) {
420 			TRACE((FMD_DBG_LOG, "mv %s to %s", lp->log_name, buf));
421 			if (oflags & O_CREAT)
422 				goto top;
423 		}
424 
425 		fmd_log_close(lp);
426 		return (NULL);
427 	}
428 
429 	lp->log_refs++;
430 	(void) pthread_mutex_unlock(&lp->log_lock);
431 
432 	return (lp);
433 }
434 
435 fmd_log_t *
436 fmd_log_tryopen(const char *root, const char *name, const char *tag)
437 {
438 	return (fmd_log_xopen(root, name, tag, O_RDWR | O_SYNC));
439 }
440 
441 fmd_log_t *
442 fmd_log_open(const char *root, const char *name, const char *tag)
443 {
444 	return (fmd_log_xopen(root, name, tag, O_RDWR | O_CREAT | O_SYNC));
445 }
446 
447 void
448 fmd_log_close(fmd_log_t *lp)
449 {
450 	ASSERT(MUTEX_HELD(&lp->log_lock));
451 	ASSERT(lp->log_refs == 0);
452 
453 	if ((lp->log_flags & FMD_LF_EAOPEN) && ea_close(&lp->log_ea) != 0) {
454 		fmd_error(EFMD_LOG_CLOSE, "failed to close log %s: %s\n",
455 		    lp->log_name, fmd_ea_strerror(ea_error()));
456 	}
457 
458 	if (lp->log_fd >= 0 && close(lp->log_fd) != 0) {
459 		fmd_error(EFMD_LOG_CLOSE,
460 		    "failed to close log %s", lp->log_name);
461 	}
462 
463 	fmd_strfree(lp->log_name);
464 	fmd_strfree(lp->log_tag);
465 	if (lp->log_uuid != NULL)
466 		fmd_free(lp->log_uuid, lp->log_uuidlen + 1);
467 
468 	fmd_free(lp, sizeof (fmd_log_t));
469 }
470 
471 void
472 fmd_log_hold_pending(fmd_log_t *lp)
473 {
474 	(void) pthread_mutex_lock(&lp->log_lock);
475 
476 	lp->log_refs++;
477 	ASSERT(lp->log_refs != 0);
478 
479 	if (lp->log_flags & FMD_LF_REPLAY) {
480 		lp->log_pending++;
481 		ASSERT(lp->log_pending != 0);
482 	}
483 
484 	(void) pthread_mutex_unlock(&lp->log_lock);
485 }
486 
487 void
488 fmd_log_hold(fmd_log_t *lp)
489 {
490 	(void) pthread_mutex_lock(&lp->log_lock);
491 	lp->log_refs++;
492 	ASSERT(lp->log_refs != 0);
493 	(void) pthread_mutex_unlock(&lp->log_lock);
494 }
495 
496 void
497 fmd_log_rele(fmd_log_t *lp)
498 {
499 	(void) pthread_mutex_lock(&lp->log_lock);
500 	ASSERT(lp->log_refs != 0);
501 
502 	if (--lp->log_refs == 0)
503 		fmd_log_close(lp);
504 	else
505 		(void) pthread_mutex_unlock(&lp->log_lock);
506 }
507 
508 void
509 fmd_log_append(fmd_log_t *lp, fmd_event_t *e, fmd_case_t *cp)
510 {
511 	fmd_event_impl_t *ep = (fmd_event_impl_t *)e;
512 	fmd_case_impl_t *cip = (fmd_case_impl_t *)cp;
513 	int err = 0;
514 
515 	ea_object_t grp0, grp1, i0, i1, i2, *items;
516 	ea_object_t **fe = NULL;
517 	size_t nvsize, easize, itsize, frsize;
518 	char *nvbuf, *eabuf;
519 	statvfs64_t stv;
520 
521 	(void) pthread_mutex_lock(&ep->ev_lock);
522 
523 	ASSERT(ep->ev_flags & FMD_EVF_VOLATILE);
524 	ASSERT(ep->ev_log == NULL);
525 
526 	(void) nvlist_size(ep->ev_nvl, &nvsize, NV_ENCODE_XDR);
527 	nvbuf = fmd_alloc(nvsize, FMD_SLEEP);
528 	(void) nvlist_pack(ep->ev_nvl, &nvbuf, &nvsize, NV_ENCODE_XDR, 0);
529 
530 	if (lp->log_flags & FMD_LF_REPLAY)
531 		err |= ea_set_group(&grp0, CAT_FMA_RGROUP);
532 	else
533 		err |= ea_set_group(&grp0, CAT_FMA_GROUP);
534 
535 	err |= ea_set_item(&i0, CAT_FMA_TODSEC, &ep->ev_time.ftv_sec, 0);
536 	err |= ea_set_item(&i1, CAT_FMA_TODNSEC, &ep->ev_time.ftv_nsec, 0);
537 	err |= ea_set_item(&i2, CAT_FMA_NVLIST, nvbuf, nvsize);
538 
539 	if (err != 0) {
540 		(void) pthread_mutex_unlock(&ep->ev_lock);
541 		err = EFMD_LOG_EXACCT;
542 		goto exerr;
543 	}
544 
545 	(void) ea_attach_to_group(&grp0, &i0);
546 	(void) ea_attach_to_group(&grp0, &i1);
547 	(void) ea_attach_to_group(&grp0, &i2);
548 
549 	/*
550 	 * If this event has a case associated with it (i.e. it is a list),
551 	 * then allocate a block of ea_object_t's and fill in a group for
552 	 * each event saved in the case's item list.  For each such group,
553 	 * we attach it to grp1, which in turn will be attached to grp0.
554 	 */
555 	if (cp != NULL) {
556 		ea_object_t *egrp, *ip, **fp;
557 		fmd_event_impl_t *eip;
558 		fmd_case_item_t *cit;
559 
560 		(void) ea_set_group(&grp1, CAT_FMA_GROUP);
561 		frsize = sizeof (ea_object_t *) * cip->ci_nitems;
562 		itsize = sizeof (ea_object_t) * cip->ci_nitems * 5;
563 		items = ip = fmd_alloc(itsize, FMD_SLEEP);
564 
565 		for (cit = cip->ci_items; cit != NULL; cit = cit->cit_next) {
566 			major_t maj;
567 			minor_t min;
568 
569 			eip = (fmd_event_impl_t *)cit->cit_event;
570 
571 			if (eip->ev_log == NULL)
572 				continue; /* event was never logged */
573 
574 			maj = major(eip->ev_log->log_stat.st_dev);
575 			min = minor(eip->ev_log->log_stat.st_dev);
576 
577 			(void) ea_set_group(ip, CAT_FMA_GROUP);
578 			egrp = ip++; /* first obj is group */
579 
580 			/*
581 			 * If the event log file is in legacy format,
582 			 * then write the xref to the file in the legacy
583 			 * maj/min/inode method else write it using the
584 			 * file uuid.
585 			 */
586 			if (eip->ev_log->log_uuid == NULL) {
587 				(void) ea_set_item(ip, CAT_FMA_MAJOR, &maj, 0);
588 				(void) ea_attach_to_group(egrp, ip++);
589 				(void) ea_set_item(ip, CAT_FMA_MINOR, &min, 0);
590 				(void) ea_attach_to_group(egrp, ip++);
591 				(void) ea_set_item(ip, CAT_FMA_INODE,
592 				    &eip->ev_log->log_stat.st_ino, 0);
593 				(void) ea_attach_to_group(egrp, ip++);
594 			} else {
595 				if (ea_set_item(ip, CAT_FMA_UUID,
596 				    eip->ev_log->log_uuid, 0) == -1) {
597 					err = EFMD_LOG_EXACCT;
598 					goto exerrcp;
599 				}
600 				if (fe == NULL)
601 					fe = fp = fmd_zalloc(frsize, FMD_SLEEP);
602 				*fp++ = ip;
603 				(void) ea_attach_to_group(egrp, ip++);
604 			}
605 			(void) ea_set_item(ip, CAT_FMA_OFFSET, &eip->ev_off, 0);
606 			(void) ea_attach_to_group(egrp, ip++);
607 			(void) ea_attach_to_group(&grp1, egrp);
608 		}
609 		(void) ea_attach_to_group(&grp0, &grp1);
610 	}
611 
612 	easize = ea_pack_object(&grp0, NULL, 0);
613 	eabuf = fmd_alloc(easize, FMD_SLEEP);
614 	(void) ea_pack_object(&grp0, eabuf, easize);
615 
616 	/*
617 	 * Before writing the record, check to see if this would cause the free
618 	 * space in the filesystem to drop below our minfree threshold.  If so,
619 	 * don't bother attempting the write and instead pretend it failed.  As
620 	 * fmd(1M) runs as root, it will be able to access the space "reserved"
621 	 * for root, and therefore can run the system of out of disk space in a
622 	 * heavy error load situation, violating the basic design principle of
623 	 * fmd(1M) that we don't want to make a bad situation even worse.
624 	 */
625 	(void) pthread_mutex_lock(&lp->log_lock);
626 
627 	if (lp->log_minfree != 0 && fstatvfs64(lp->log_fd, &stv) == 0 &&
628 	    stv.f_bavail * stv.f_frsize < lp->log_minfree + easize) {
629 
630 		TRACE((FMD_DBG_LOG, "append %s crosses minfree", lp->log_tag));
631 		err = EFMD_LOG_MINFREE;
632 
633 	} else if (fmd_log_write(lp, eabuf, easize) == easize) {
634 		TRACE((FMD_DBG_LOG, "append %s %p off=0x%llx",
635 		    lp->log_tag, (void *)ep, (u_longlong_t)lp->log_off));
636 
637 		ep->ev_flags &= ~FMD_EVF_VOLATILE;
638 		ep->ev_log = lp;
639 		ep->ev_off = lp->log_off;
640 		ep->ev_len = easize;
641 
642 		if (lp->log_flags & FMD_LF_REPLAY) {
643 			lp->log_pending++;
644 			ASSERT(lp->log_pending != 0);
645 		}
646 
647 		lp->log_refs++;
648 		ASSERT(lp->log_refs != 0);
649 		lp->log_off += easize;
650 	} else {
651 		err = errno; /* save errno for fmd_error() call below */
652 
653 		/*
654 		 * If we can't write append the record, seek the file back to
655 		 * the original location and truncate it there in order to make
656 		 * sure the file is always in a sane state w.r.t. libexacct.
657 		 */
658 		(void) lseek64(lp->log_fd, lp->log_off, SEEK_SET);
659 		(void) ftruncate64(lp->log_fd, lp->log_off);
660 	}
661 
662 	(void) pthread_mutex_unlock(&lp->log_lock);
663 	(void) pthread_mutex_unlock(&ep->ev_lock);
664 
665 	fmd_free(eabuf, easize);
666 
667 exerrcp:
668 	if (cp != NULL) {
669 		if (fe != NULL) {
670 			ea_object_t **fp = fe;
671 			int i = 0;
672 
673 			for (; *fp != NULL && i < cip->ci_nitems; i++)
674 				(void) ea_free_item(*fp++, EUP_ALLOC);
675 			fmd_free(fe, frsize);
676 		}
677 
678 		fmd_free(items, itsize);
679 	}
680 
681 exerr:
682 	fmd_free(nvbuf, nvsize);
683 
684 	(void) ea_free_item(&i0, EUP_ALLOC);
685 	(void) ea_free_item(&i1, EUP_ALLOC);
686 	(void) ea_free_item(&i2, EUP_ALLOC);
687 
688 	/*
689 	 * Keep track of out-of-space errors using global statistics.  As we're
690 	 * out of disk space, it's unlikely the EFMD_LOG_APPEND will be logged.
691 	 */
692 	if (err == ENOSPC || err == EFMD_LOG_MINFREE) {
693 		fmd_stat_t *sp;
694 
695 		if (lp == fmd.d_errlog)
696 			sp = &fmd.d_stats->ds_err_enospc;
697 		else if (lp == fmd.d_fltlog)
698 			sp = &fmd.d_stats->ds_flt_enospc;
699 		else
700 			sp = &fmd.d_stats->ds_oth_enospc;
701 
702 		(void) pthread_mutex_lock(&fmd.d_stats_lock);
703 		sp->fmds_value.ui64++;
704 		(void) pthread_mutex_unlock(&fmd.d_stats_lock);
705 	}
706 
707 	if (err != 0) {
708 		fmd_error(EFMD_LOG_APPEND, "failed to log_append %s %p: %s\n",
709 		    lp->log_tag, (void *)ep, fmd_strerror(err));
710 	}
711 }
712 
713 /*
714  * Commit an event to the log permanently, indicating that it should not be
715  * replayed on restart.  This is done by overwriting the event group's catalog
716  * code with EXD_GROUP_FMA (from EXD_GROUP_RFMA used in fmd_log_append()).  We
717  * use pwrite64() to update the existing word directly, using somewhat guilty
718  * knowledge that exacct stores the 32-bit catalog word first for each object.
719  * Since we are overwriting an existing log location using pwrite64() and hold
720  * the event lock, we do not need to hold the log_lock during the i/o.
721  */
722 void
723 fmd_log_commit(fmd_log_t *lp, fmd_event_t *e)
724 {
725 	fmd_event_impl_t *ep = (fmd_event_impl_t *)e;
726 	ea_catalog_t c;
727 	int err = 0;
728 
729 	if (!(lp->log_flags & FMD_LF_REPLAY))
730 		return; /* log does not require replay tagging */
731 
732 	ASSERT(MUTEX_HELD(&ep->ev_lock));
733 	ASSERT(ep->ev_log == lp && ep->ev_off != 0);
734 
735 	c = CAT_FMA_GROUP;
736 	exacct_order32(&c);
737 
738 	if (pwrite64(lp->log_fd, &c, sizeof (c), ep->ev_off) == sizeof (c)) {
739 		TRACE((FMD_DBG_LOG, "commit %s %p", lp->log_tag, (void *)ep));
740 		ep->ev_flags &= ~FMD_EVF_REPLAY;
741 
742 		/*
743 		 * If we have committed the event, check to see if the TOC skip
744 		 * offset needs to be updated, and decrement the pending count.
745 		 */
746 		(void) pthread_mutex_lock(&lp->log_lock);
747 
748 		if (lp->log_skip == ep->ev_off) {
749 			lp->log_flags |= FMD_LF_DIRTY;
750 			lp->log_skip += ep->ev_len;
751 		}
752 
753 		ASSERT(lp->log_pending != 0);
754 		lp->log_pending--;
755 
756 		(void) pthread_cond_broadcast(&lp->log_cv);
757 		(void) pthread_mutex_unlock(&lp->log_lock);
758 
759 	} else {
760 		fmd_error(EFMD_LOG_COMMIT, "failed to log_commit %s %p: %s\n",
761 		    lp->log_tag, (void *)ep, fmd_strerror(err));
762 	}
763 }
764 
765 /*
766  * If we need to destroy an event and it wasn't able to be committed, we permit
767  * the owner to decommit from ever trying again.  This operation decrements the
768  * pending count on the log and broadcasts to anyone waiting on log_cv.
769  */
770 void
771 fmd_log_decommit(fmd_log_t *lp, fmd_event_t *e)
772 {
773 	fmd_event_impl_t *ep = (fmd_event_impl_t *)e;
774 
775 	if (!(lp->log_flags & FMD_LF_REPLAY))
776 		return; /* log does not require replay tagging */
777 
778 	ASSERT(MUTEX_HELD(&ep->ev_lock));
779 	ASSERT(ep->ev_log == lp);
780 
781 	(void) pthread_mutex_lock(&lp->log_lock);
782 
783 	TRACE((FMD_DBG_LOG, "decommit %s %p", lp->log_tag, (void *)ep));
784 	ep->ev_flags &= ~FMD_EVF_REPLAY;
785 
786 	ASSERT(lp->log_pending != 0);
787 	lp->log_pending--;
788 
789 	(void) pthread_cond_broadcast(&lp->log_cv);
790 	(void) pthread_mutex_unlock(&lp->log_lock);
791 }
792 
793 static fmd_event_t *
794 fmd_log_unpack(fmd_log_t *lp, ea_object_t *grp, off64_t off)
795 {
796 	fmd_timeval_t ftv = { -1ULL, -1ULL };
797 	nvlist_t *nvl = NULL;
798 
799 	ea_object_t *obj;
800 	char *class;
801 	int err;
802 
803 	for (obj = grp->eo_group.eg_objs; obj != NULL; obj = obj->eo_next) {
804 		switch (obj->eo_catalog) {
805 		case CAT_FMA_NVLIST:
806 			if ((err = nvlist_xunpack(obj->eo_item.ei_raw,
807 			    obj->eo_item.ei_size, &nvl, &fmd.d_nva)) != 0) {
808 				fmd_error(EFMD_LOG_UNPACK, "failed to unpack "
809 				    "log nvpair: %s\n", fmd_strerror(err));
810 				return (NULL);
811 			}
812 			break;
813 
814 		case CAT_FMA_TODSEC:
815 			ftv.ftv_sec = obj->eo_item.ei_uint64;
816 			break;
817 
818 		case CAT_FMA_TODNSEC:
819 			ftv.ftv_nsec = obj->eo_item.ei_uint64;
820 			break;
821 		}
822 	}
823 
824 	if (nvl == NULL || ftv.ftv_sec == -1ULL || ftv.ftv_nsec == -1ULL) {
825 		fmd_error(EFMD_LOG_UNPACK, "failed to unpack log event: "
826 		    "required object(s) missing from record group\n");
827 		nvlist_free(nvl);
828 		return (NULL);
829 	}
830 
831 	if (nvlist_lookup_string(nvl, FM_CLASS, &class) != 0) {
832 		fmd_error(EFMD_LOG_UNPACK, "failed to unpack log event: "
833 		    "record is missing required '%s' nvpair\n", FM_CLASS);
834 		nvlist_free(nvl);
835 		return (NULL);
836 	}
837 
838 	return (fmd_event_recreate(FMD_EVT_PROTOCOL,
839 	    &ftv, nvl, class, lp, off, ea_pack_object(grp, NULL, 0)));
840 }
841 
/*
 * Replay event(s) from the specified log by invoking the specified callback
 * function 'func' for each event.  If the log has the FMD_LF_REPLAY flag set,
 * we replay all events after log_skip that have the FMA_RGROUP group tag.
 * This mode is used for the error telemetry log.  If the log does not have
 * this flag set (used for ASRU logs), only the most recent event is replayed.
 *
 * The callback is invoked as func(lp, ep, data) with log_lock dropped and a
 * hold on the event that is released when the callback returns.  Only one
 * replay runs on a log at a time: FMD_LF_BUSY is set for the duration and
 * other callers wait on log_cv until it is cleared.  Before returning, the
 * file is repositioned to log_off, and if the skip offset moved it is saved
 * in log_skip and the log is marked FMD_LF_DIRTY.
 */
void
fmd_log_replay(fmd_log_t *lp, fmd_log_f *func, void *data)
{
	ea_object_t obj, *grp;
	ea_object_type_t type;
	ea_catalog_t c;
	fmd_event_t *ep;
	off64_t off, skp;
	uint_t n = 0;	/* number of events handed to the callback */

	(void) pthread_mutex_lock(&lp->log_lock);

	if (lp->log_stat.st_size == 0 && (lp->log_flags & FMD_LF_REPLAY)) {
		(void) pthread_mutex_unlock(&lp->log_lock);
		return; /* we just created this log: never replay events */
	}

	/* wait for any replay that is already in progress to finish */
	while (lp->log_flags & FMD_LF_BUSY)
		(void) pthread_cond_wait(&lp->log_cv, &lp->log_lock);

	if (lp->log_off == lp->log_beg) {
		(void) pthread_mutex_unlock(&lp->log_lock);
		return; /* no records appended yet */
	}

	lp->log_flags |= FMD_LF_BUSY;
	skp = lp->log_skip;
	ea_clear(&lp->log_ea); /* resync exacct file */

	/*
	 * If FMD_LF_REPLAY is set, begin our replay at either log_skip (if it
	 * is non-zero) or at log_beg.  Otherwise replay from the end (log_off)
	 */
	if (lp->log_flags & FMD_LF_REPLAY) {
		off = MAX(lp->log_beg, lp->log_skip);
		c = CAT_FMA_RGROUP;
	} else {
		off = lp->log_off;
		c = CAT_FMA_GROUP;
	}

	if (lseek64(lp->log_fd, off, SEEK_SET) != off) {
		fmd_panic("failed to seek %s to 0x%llx\n",
		    lp->log_name, (u_longlong_t)off);
	}

	/*
	 * If FMD_LF_REPLAY is not set, back up to the start of the previous
	 * object and make sure this object is an EO_GROUP; otherwise return.
	 */
	if (!(lp->log_flags & FMD_LF_REPLAY) &&
	    (type = ea_previous_object(&lp->log_ea, &obj)) != EO_GROUP) {
		fmd_error(EFMD_LOG_REPLAY, "last log object is of unexpected "
		    "type %d (log may be truncated or corrupt)\n", type);
		goto out;
	}

	while ((grp = ea_get_object_tree(&lp->log_ea, 1)) != NULL) {
		/*
		 * When replaying from the end, 'off' still names the end of
		 * the file, so move it back by the size of the group we just
		 * read to obtain the offset at which that group begins.  When
		 * replaying forward, committed groups seen before the first
		 * replayed event advance the skip offset.
		 */
		if (!(lp->log_flags & FMD_LF_REPLAY))
			off -= ea_pack_object(grp, NULL, 0);
		else if (n == 0 && grp->eo_catalog == CAT_FMA_GROUP)
			skp = off; /* update skip */

		/*
		 * We temporarily drop log_lock around the call to unpack the
		 * event, hold it, and perform the callback, because these
		 * operations may try to acquire log_lock to bump log_refs.
		 * We cannot lose control because the FMD_LF_BUSY flag is set.
		 */
		(void) pthread_mutex_unlock(&lp->log_lock);

		/* only groups carrying the tag selected above are replayed */
		if (grp->eo_catalog == c &&
		    (ep = fmd_log_unpack(lp, grp, off)) != NULL) {

			TRACE((FMD_DBG_LOG, "replay %s %p off %llx",
			    lp->log_tag, (void *)ep, (u_longlong_t)off));

			fmd_event_hold(ep);
			func(lp, ep, data);
			fmd_event_rele(ep);
			n++;
		}

		(void) pthread_mutex_lock(&lp->log_lock);
		off += ea_pack_object(grp, NULL, 0); /* advance past group */
		ea_free_object(grp, EUP_ALLOC);
	}

	/* the loop should only have ended because we reached end-of-file */
	if (ea_error() != EXR_EOF) {
		fmd_error(EFMD_LOG_REPLAY, "failed to replay %s event at "
		    "offset 0x%llx: %s\n", lp->log_name, (u_longlong_t)off,
		    fmd_ea_strerror(ea_error()));
	}

	if (n == 0)
		skp = off; /* if no replays, move skip to where we ended up */

out:
	/* restore the file position to the append point at the end of log */
	if (lseek64(lp->log_fd, lp->log_off, SEEK_SET) != lp->log_off) {
		fmd_panic("failed to seek %s to 0x%llx\n",
		    lp->log_name, (u_longlong_t)lp->log_off);
	}

	if (skp != lp->log_skip) {
		lp->log_flags |= FMD_LF_DIRTY;
		lp->log_skip = skp;
	}

	lp->log_flags &= ~FMD_LF_BUSY;
	(void) pthread_cond_broadcast(&lp->log_cv);
	(void) pthread_mutex_unlock(&lp->log_lock);
}
961 
962 void
963 fmd_log_update(fmd_log_t *lp)
964 {
965 	ea_object_t toc, item;
966 	off64_t skip = 0;
967 	size_t size;
968 	void *buf;
969 
970 	(void) pthread_mutex_lock(&lp->log_lock);
971 
972 	if (lp->log_flags & FMD_LF_DIRTY) {
973 		lp->log_flags &= ~FMD_LF_DIRTY;
974 		skip = lp->log_skip;
975 	}
976 
977 	(void) pthread_mutex_unlock(&lp->log_lock);
978 
979 	/*
980 	 * If the skip needs to be updated, construct a TOC record group
981 	 * containing the skip offset and overwrite the TOC in-place.
982 	 */
983 	if (skip != 0 && ea_set_group(&toc, CAT_FMA_GROUP) == 0 &&
984 	    ea_set_item(&item, CAT_FMA_OFFSET, &skip, 0) == 0) {
985 
986 		(void) ea_attach_to_group(&toc, &item);
987 		size = ea_pack_object(&toc, NULL, 0);
988 		buf = fmd_alloc(size, FMD_SLEEP);
989 
990 		(void) ea_pack_object(&toc, buf, size);
991 		ASSERT(lp->log_toc + size == lp->log_beg);
992 
993 		if (pwrite64(lp->log_fd, buf, size, lp->log_toc) == size) {
994 			TRACE((FMD_DBG_LOG, "updated skip to %llx", skip));
995 		} else {
996 			fmd_error(EFMD_LOG_UPDATE,
997 			    "failed to log_update %s", lp->log_tag);
998 		}
999 
1000 		fmd_free(buf, size);
1001 		(void) ea_free_item(&item, EUP_ALLOC);
1002 	}
1003 }
1004 
1005 /*
1006  * Rotate the specified log by renaming its underlying file to a staging file
1007  * that can be handed off to logadm(1M) or an administrator script.  If the
1008  * rename succeeds, open a new log file using the old path and return it.
1009  * Note that we are relying our caller to use some higher-level mechanism to
1010  * ensure that fmd_log_rotate() cannot be called while other threads are
1011  * attempting fmd_log_append() using the same log (fmd's d_log_lock is used
1012  * for the global errlog and fltlog).
1013  */
1014 fmd_log_t *
1015 fmd_log_rotate(fmd_log_t *lp)
1016 {
1017 	char npath[PATH_MAX];
1018 	fmd_log_t *nlp;
1019 
1020 	(void) snprintf(npath, sizeof (npath), "%s.0-", lp->log_name);
1021 	(void) pthread_mutex_lock(&lp->log_lock);
1022 
1023 	/*
1024 	 * Check for any pending commits to drain before proceeding.  We can't
1025 	 * rotate the log out if commits are pending because if we die after
1026 	 * the log is moved aside, we won't be able to replay them on restart.
1027 	 */
1028 	if (lp->log_pending != 0) {
1029 		(void) pthread_mutex_unlock(&lp->log_lock);
1030 		(void) fmd_set_errno(EFMD_LOG_ROTBUSY);
1031 		return (NULL);
1032 	}
1033 
1034 	if (rename(lp->log_name, npath) != 0) {
1035 		(void) pthread_mutex_unlock(&lp->log_lock);
1036 		fmd_error(EFMD_LOG_ROTATE, "failed to rename %s", lp->log_name);
1037 		(void) fmd_set_errno(EFMD_LOG_ROTATE);
1038 		return (NULL);
1039 	}
1040 
1041 	if ((nlp = fmd_log_open("", lp->log_name, lp->log_tag)) == NULL) {
1042 		(void) rename(npath, lp->log_name);
1043 		(void) pthread_mutex_unlock(&lp->log_lock);
1044 		fmd_error(EFMD_LOG_ROTATE, "failed to reopen %s", lp->log_name);
1045 		(void) fmd_set_errno(EFMD_LOG_ROTATE);
1046 		return (NULL);
1047 	}
1048 
1049 	/*
1050 	 * If we've rotated the log, no pending events exist so we don't have
1051 	 * any more commits coming, and our caller should have arranged for
1052 	 * no more calls to append.  As such, we can close log_fd for good.
1053 	 */
1054 	if (lp->log_flags & FMD_LF_EAOPEN) {
1055 		(void) ea_close(&lp->log_ea);
1056 		lp->log_flags &= ~FMD_LF_EAOPEN;
1057 	}
1058 
1059 	(void) close(lp->log_fd);
1060 	lp->log_fd = -1;
1061 
1062 	(void) pthread_mutex_unlock(&lp->log_lock);
1063 	return (nlp);
1064 }
1065