xref: /titanic_52/usr/src/uts/common/os/exacct.c (revision 03831d35f7499c87d51205817c93e9a8d42c4bae)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/exacct.h>
30 #include <sys/exacct_catalog.h>
31 #include <sys/disp.h>
32 #include <sys/task.h>
33 #include <sys/proc.h>
34 #include <sys/cmn_err.h>
35 #include <sys/kmem.h>
36 #include <sys/project.h>
37 #include <sys/systm.h>
38 #include <sys/vnode.h>
39 #include <sys/file.h>
40 #include <sys/acctctl.h>
41 #include <sys/time.h>
42 #include <sys/utsname.h>
43 #include <sys/session.h>
44 #include <sys/sysmacros.h>
45 #include <sys/bitmap.h>
46 #include <sys/msacct.h>
47 
48 /*
49  * exacct usage and recording routines
50  *
51  * wracct(2), getacct(2), and the records written at process or task
52  * termination are constructed using the exacct_assemble_[task,proc]_usage()
53  * functions, which take a callback that takes the appropriate action on
54  * the packed exacct record for the task or process.  For the process-related
55  * actions, we partition the routines such that the data collecting component
56  * can be performed while holding p_lock, and all sleeping or blocking
57  * operations can be performed without acquiring p_lock.
58  *
59  * putacct(2), which allows an application to construct a customized record
60  * associated with an existing process or task, has its own entry points:
61  * exacct_tag_task() and exacct_tag_proc().
62  */
63 
64 taskq_t *exacct_queue;
65 kmem_cache_t *exacct_object_cache;
66 
67 zone_key_t exacct_zone_key = ZONE_KEY_UNINITIALIZED;
68 
69 static const uint32_t exacct_version = EXACCT_VERSION;
70 static const char exacct_header[] = "exacct";
71 static const char exacct_creator[] = "SunOS";
72 
73 ea_object_t *
74 ea_alloc_item(ea_catalog_t catalog, void *buf, size_t bufsz)
75 {
76 	ea_object_t *item;
77 
78 	item = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
79 	bzero(item, sizeof (ea_object_t));
80 	(void) ea_set_item(item, catalog, buf, bufsz);
81 	return (item);
82 }
83 
84 ea_object_t *
85 ea_alloc_group(ea_catalog_t catalog)
86 {
87 	ea_object_t *group;
88 
89 	group = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
90 	bzero(group, sizeof (ea_object_t));
91 	(void) ea_set_group(group, catalog);
92 	return (group);
93 }
94 
95 ea_object_t *
96 ea_attach_item(ea_object_t *grp, void *buf, size_t bufsz, ea_catalog_t catalog)
97 {
98 	ea_object_t *item;
99 
100 	item = ea_alloc_item(catalog, buf, bufsz);
101 	(void) ea_attach_to_group(grp, item);
102 	return (item);
103 }
104 
105 /*
106  * exacct_vn_write() is a vn_rdwr wrapper that protects us from corrupting the
107  * accounting file in case of an I/O or filesystem error.  acctctl() prevents
108  * the two accounting vnodes from being equal, and the appropriate ac_lock is
109  * held across the call, so we're single threaded through this code for each
110  * file.
111  */
112 static int
113 exacct_vn_write(ac_info_t *info, void *buf, ssize_t bufsize)
114 {
115 	int error = 0;
116 	ssize_t resid;
117 	struct vattr va;
118 
119 	if (info == NULL)
120 		return (0);
121 
122 	mutex_enter(&info->ac_lock);
123 
124 	/*
125 	 * Don't do anything unless accounting file is set.
126 	 */
127 	if (info->ac_vnode == NULL) {
128 		mutex_exit(&info->ac_lock);
129 		return (0);
130 	}
131 
132 	/*
133 	 * Save the size. If vn_rdwr fails, reset the size to avoid corrupting
134 	 * the present accounting file.
135 	 */
136 	va.va_mask = AT_SIZE;
137 	error = VOP_GETATTR(info->ac_vnode, &va, 0, kcred);
138 	if (error == 0) {
139 		error = vn_rdwr(UIO_WRITE, info->ac_vnode, (caddr_t)buf,
140 		    bufsize, 0LL, UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFFSET_T,
141 		    kcred, &resid);
142 		if (error) {
143 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
144 		} else if (resid != 0) {
145 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
146 			error = ENOSPC;
147 		}
148 	}
149 	mutex_exit(&info->ac_lock);
150 
151 	return (error);
152 }
153 
154 /*
155  * void *exacct_create_header(size_t *)
156  *
157  * Overview
158  *   exacct_create_header() constructs an exacct file header identifying the
159  *   accounting file as the output of the kernel.  exacct_create_header() and
160  *   the static write_header() and verify_header() routines in libexacct must
161  *   remain synchronized.
162  *
163  * Return values
164  *   A pointer to a packed exacct buffer containing the appropriate header is
165  *   returned; the size of the buffer is placed in the location indicated by
166  *   sizep.
167  *
168  * Caller's context
169  *   Suitable for KM_SLEEP allocations.
170  */
171 void *
172 exacct_create_header(size_t *sizep)
173 {
174 	ea_object_t *hdr_grp;
175 	uint32_t bskip;
176 	void *buf;
177 	size_t bufsize;
178 
179 	hdr_grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_HEADER);
180 	(void) ea_attach_item(hdr_grp, (void *)&exacct_version, 0,
181 	    EXT_UINT32 | EXC_DEFAULT | EXD_VERSION);
182 	(void) ea_attach_item(hdr_grp, (void *)exacct_header, 0,
183 	    EXT_STRING | EXC_DEFAULT | EXD_FILETYPE);
184 	(void) ea_attach_item(hdr_grp, (void *)exacct_creator, 0,
185 	    EXT_STRING | EXC_DEFAULT | EXD_CREATOR);
186 	(void) ea_attach_item(hdr_grp, uts_nodename(), 0,
187 	    EXT_STRING | EXC_DEFAULT | EXD_HOSTNAME);
188 
189 	bufsize = ea_pack_object(hdr_grp, NULL, 0);
190 	buf = kmem_alloc(bufsize, KM_SLEEP);
191 	(void) ea_pack_object(hdr_grp, buf, bufsize);
192 	ea_free_object(hdr_grp, EUP_ALLOC);
193 
194 	/*
195 	 * To prevent reading the header when reading the file backwards,
196 	 * set the large backskip of the header group to 0 (last 4 bytes).
197 	 */
198 	bskip = 0;
199 	exacct_order32(&bskip);
200 	bcopy(&bskip, (char *)buf + bufsize - sizeof (bskip),
201 	    sizeof (bskip));
202 
203 	*sizep = bufsize;
204 	return (buf);
205 }
206 
207 /*
208  * int exacct_write_header(ac_info_t *, void *, size_t)
209  *
210  * Overview
211  *   exacct_write_header() writes the given header buffer to the indicated
212  *   vnode, and frees the buffer.
213  *
214  * Return values
215  *   The result of the write operation is returned.
216  *
217  * Caller's context
218  *   Caller must not hold the ac_lock of the appropriate accounting file
219  *   information block (ac_info_t).
220  */
221 int
222 exacct_write_header(ac_info_t *info, void *hdr, size_t hdrsize)
223 {
224 	int error;
225 
226 	error = exacct_vn_write(info, hdr, hdrsize);
227 	kmem_free(hdr, hdrsize);
228 	return (error);
229 }
230 
231 static void
232 exacct_get_interval_task_usage(task_t *tk, task_usage_t *tu,
233     task_usage_t **tu_buf)
234 {
235 	task_usage_t *oldtu, *newtu;
236 	task_usage_t **prevusage;
237 
238 	ASSERT(MUTEX_HELD(&tk->tk_usage_lock));
239 	if (getzoneid() != GLOBAL_ZONEID) {
240 		prevusage = &tk->tk_zoneusage;
241 	} else {
242 		prevusage = &tk->tk_prevusage;
243 	}
244 	if ((oldtu = *prevusage) != NULL) {
245 		/*
246 		 * In case we have any accounting information
247 		 * saved from the previous interval record.
248 		 */
249 		newtu = *tu_buf;
250 		bcopy(tu, newtu, sizeof (task_usage_t));
251 		tu->tu_minflt	-= oldtu->tu_minflt;
252 		tu->tu_majflt	-= oldtu->tu_majflt;
253 		tu->tu_sndmsg	-= oldtu->tu_sndmsg;
254 		tu->tu_rcvmsg	-= oldtu->tu_rcvmsg;
255 		tu->tu_ioch	-= oldtu->tu_ioch;
256 		tu->tu_iblk	-= oldtu->tu_iblk;
257 		tu->tu_oblk	-= oldtu->tu_oblk;
258 		tu->tu_vcsw	-= oldtu->tu_vcsw;
259 		tu->tu_icsw	-= oldtu->tu_icsw;
260 		tu->tu_nsig	-= oldtu->tu_nsig;
261 		tu->tu_nswp	-= oldtu->tu_nswp;
262 		tu->tu_nscl	-= oldtu->tu_nscl;
263 		tu->tu_utime	-= oldtu->tu_utime;
264 		tu->tu_stime	-= oldtu->tu_stime;
265 
266 		tu->tu_startsec = oldtu->tu_finishsec;
267 		tu->tu_startnsec = oldtu->tu_finishnsec;
268 		/*
269 		 * Copy the data from our temporary storage to the task's
270 		 * previous interval usage structure for future reference.
271 		 */
272 		bcopy(newtu, oldtu, sizeof (task_usage_t));
273 	} else {
274 		/*
275 		 * Store current statistics in the task's previous interval
276 		 * usage structure for future references.
277 		 */
278 		*prevusage = *tu_buf;
279 		bcopy(tu, *prevusage, sizeof (task_usage_t));
280 		*tu_buf = NULL;
281 	}
282 }
283 
284 static void
285 exacct_snapshot_task_usage(task_t *tk, task_usage_t *tu)
286 {
287 	timestruc_t ts;
288 	proc_t *p;
289 
290 	ASSERT(MUTEX_HELD(&pidlock));
291 
292 	if ((p = tk->tk_memb_list) == NULL)
293 		return;
294 
295 	/*
296 	 * exacct_snapshot_task_usage() provides an approximate snapshot of the
297 	 * usage of the potentially many members of the task.  Since we don't
298 	 * guarantee exactness, we don't acquire the p_lock of any of the member
299 	 * processes.
300 	 */
301 	do {
302 		mutex_enter(&p->p_lock);
303 		tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
304 		tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
305 		mutex_exit(&p->p_lock);
306 		tu->tu_minflt	+= p->p_ru.minflt;
307 		tu->tu_majflt	+= p->p_ru.majflt;
308 		tu->tu_sndmsg	+= p->p_ru.msgsnd;
309 		tu->tu_rcvmsg	+= p->p_ru.msgrcv;
310 		tu->tu_ioch	+= p->p_ru.ioch;
311 		tu->tu_iblk	+= p->p_ru.inblock;
312 		tu->tu_oblk	+= p->p_ru.oublock;
313 		tu->tu_vcsw	+= p->p_ru.nvcsw;
314 		tu->tu_icsw	+= p->p_ru.nivcsw;
315 		tu->tu_nsig	+= p->p_ru.nsignals;
316 		tu->tu_nswp	+= p->p_ru.nswap;
317 		tu->tu_nscl	+= p->p_ru.sysc;
318 	} while ((p = p->p_tasknext) != tk->tk_memb_list);
319 
320 	gethrestime(&ts);
321 	tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
322 	tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
323 }
324 
325 /*
326  * exacct_update_task_mstate() updates the task's microstate accounting
327  * statistics with accumulated counters for the exiting process.
328  */
329 static void
330 exacct_update_task_mstate(proc_t *p)
331 {
332 	task_usage_t *tu;
333 
334 	mutex_enter(&p->p_task->tk_usage_lock);
335 	tu = p->p_task->tk_usage;
336 	mutex_enter(&p->p_lock);
337 	tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
338 	tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
339 	mutex_exit(&p->p_lock);
340 	tu->tu_minflt	+= p->p_ru.minflt;
341 	tu->tu_majflt	+= p->p_ru.majflt;
342 	tu->tu_sndmsg	+= p->p_ru.msgsnd;
343 	tu->tu_rcvmsg	+= p->p_ru.msgrcv;
344 	tu->tu_ioch	+= p->p_ru.ioch;
345 	tu->tu_iblk	+= p->p_ru.inblock;
346 	tu->tu_oblk	+= p->p_ru.oublock;
347 	tu->tu_vcsw	+= p->p_ru.nvcsw;
348 	tu->tu_icsw	+= p->p_ru.nivcsw;
349 	tu->tu_nsig	+= p->p_ru.nsignals;
350 	tu->tu_nswp	+= p->p_ru.nswap;
351 	tu->tu_nscl	+= p->p_ru.sysc;
352 	mutex_exit(&p->p_task->tk_usage_lock);
353 }
354 
355 static void
356 exacct_calculate_task_usage(task_t *tk, task_usage_t *tu, int flag)
357 {
358 	timestruc_t ts;
359 	task_usage_t *tu_buf;
360 
361 	switch (flag) {
362 	case EW_PARTIAL:
363 		/*
364 		 * For partial records we must report the sum of current
365 		 * accounting statistics with previously accumulated
366 		 * statistics.
367 		 */
368 		mutex_enter(&pidlock);
369 		mutex_enter(&tk->tk_usage_lock);
370 
371 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
372 		exacct_snapshot_task_usage(tk, tu);
373 
374 		mutex_exit(&tk->tk_usage_lock);
375 		mutex_exit(&pidlock);
376 		break;
377 	case EW_INTERVAL:
378 		/*
379 		 * We need to allocate spare task_usage_t buffer before
380 		 * grabbing pidlock because we might need it later in
381 		 * exacct_get_interval_task_usage().
382 		 */
383 		tu_buf = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
384 		mutex_enter(&pidlock);
385 		mutex_enter(&tk->tk_usage_lock);
386 
387 		/*
388 		 * For interval records, we deduct the previous microstate
389 		 * accounting data and cpu usage times from previously saved
390 		 * results and update the previous task usage structure.
391 		 */
392 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
393 		exacct_snapshot_task_usage(tk, tu);
394 		exacct_get_interval_task_usage(tk, tu, &tu_buf);
395 
396 		mutex_exit(&tk->tk_usage_lock);
397 		mutex_exit(&pidlock);
398 
399 		if (tu_buf != NULL)
400 			kmem_free(tu_buf, sizeof (task_usage_t));
401 		break;
402 	case EW_FINAL:
403 		/*
404 		 * For final records, we only have to record task's finish
405 		 * time because all other stuff has been calculated already.
406 		 */
407 		mutex_enter(&tk->tk_usage_lock);
408 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
409 		mutex_exit(&tk->tk_usage_lock);
410 
411 		gethrestime(&ts);
412 		tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
413 		tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
414 
415 		break;
416 	}
417 }
418 
419 static int
420 exacct_attach_task_item(task_t *tk, task_usage_t *tu, ea_object_t *record,
421     int res)
422 {
423 	int attached = 1;
424 
425 	switch (res) {
426 	case AC_TASK_TASKID:
427 		(void) ea_attach_item(record, &tk->tk_tkid,
428 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_TASKID);
429 		break;
430 	case AC_TASK_PROJID:
431 		(void) ea_attach_item(record, &tk->tk_proj->kpj_id,
432 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_PROJID);
433 		break;
434 	case AC_TASK_CPU: {
435 			timestruc_t ts;
436 			uint64_t ui;
437 
438 			hrt2ts(tu->tu_stime, &ts);
439 			ui = ts.tv_sec;
440 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
441 			    EXT_UINT64 | EXD_TASK_CPU_SYS_SEC);
442 			ui = ts.tv_nsec;
443 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
444 			    EXT_UINT64 | EXD_TASK_CPU_SYS_NSEC);
445 
446 			hrt2ts(tu->tu_utime, &ts);
447 			ui = ts.tv_sec;
448 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
449 			    EXT_UINT64 | EXD_TASK_CPU_USER_SEC);
450 			ui = ts.tv_nsec;
451 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
452 			    EXT_UINT64 | EXD_TASK_CPU_USER_NSEC);
453 		}
454 		break;
455 	case AC_TASK_TIME:
456 		(void) ea_attach_item(record, &tu->tu_startsec,
457 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_SEC);
458 		(void) ea_attach_item(record, &tu->tu_startnsec,
459 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_NSEC);
460 		(void) ea_attach_item(record, &tu->tu_finishsec,
461 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_SEC);
462 		(void) ea_attach_item(record, &tu->tu_finishnsec,
463 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_NSEC);
464 		break;
465 	case AC_TASK_HOSTNAME:
466 		(void) ea_attach_item(record, tk->tk_zone->zone_nodename,
467 		    strlen(tk->tk_zone->zone_nodename) + 1,
468 		    EXT_STRING | EXD_TASK_HOSTNAME);
469 			break;
470 	case AC_TASK_MICROSTATE:
471 		(void) ea_attach_item(record, &tu->tu_majflt,
472 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MAJOR);
473 		(void) ea_attach_item(record, &tu->tu_minflt,
474 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MINOR);
475 		(void) ea_attach_item(record, &tu->tu_sndmsg,
476 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_SND);
477 		(void) ea_attach_item(record, &tu->tu_rcvmsg,
478 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_RCV);
479 		(void) ea_attach_item(record, &tu->tu_iblk,
480 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_IN);
481 		(void) ea_attach_item(record, &tu->tu_oblk,
482 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_OUT);
483 		(void) ea_attach_item(record, &tu->tu_ioch,
484 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CHARS_RDWR);
485 		(void) ea_attach_item(record, &tu->tu_vcsw,
486 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_VOL);
487 		(void) ea_attach_item(record, &tu->tu_icsw,
488 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_INV);
489 		(void) ea_attach_item(record, &tu->tu_nsig,
490 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SIGNALS);
491 		(void) ea_attach_item(record, &tu->tu_nswp,
492 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SWAPS);
493 		(void) ea_attach_item(record, &tu->tu_nscl,
494 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SYSCALLS);
495 		break;
496 	case AC_TASK_ANCTASKID:
497 		(void) ea_attach_item(record, &tu->tu_anctaskid,
498 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_ANCTASKID);
499 		break;
500 	case AC_TASK_ZONENAME:
501 		(void) ea_attach_item(record, tk->tk_zone->zone_name,
502 		    strlen(tk->tk_zone->zone_name) + 1,
503 		    EXT_STRING | EXD_TASK_ZONENAME);
504 		break;
505 	default:
506 		attached = 0;
507 	}
508 	return (attached);
509 }
510 
511 static ea_object_t *
512 exacct_assemble_task_record(task_t *tk, task_usage_t *tu, ulong_t *mask,
513     ea_catalog_t record_type)
514 {
515 	int res, count;
516 	ea_object_t *record;
517 
518 	/*
519 	 * Assemble usage values into group.
520 	 */
521 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
522 	for (res = 1, count = 0; res <= AC_TASK_MAX_RES; res++)
523 		if (BT_TEST(mask, res))
524 			count += exacct_attach_task_item(tk, tu, record, res);
525 	if (count == 0) {
526 		ea_free_object(record, EUP_ALLOC);
527 		record = NULL;
528 	}
529 	return (record);
530 }
531 
532 /*
533  * int exacct_assemble_task_usage(task_t *, int (*)(void *, size_t, void *,
534  *	size_t, size_t *), void *, size_t, size_t *, int)
535  *
536  * Overview
537  *   exacct_assemble_task_usage() builds the packed exacct buffer for the
538  *   indicated task, executes the given callback function, and free the packed
539  *   buffer.
540  *
541  * Return values
542  *   Returns 0 on success; otherwise the appropriate error code is returned.
543  *
544  * Caller's context
545  *   Suitable for KM_SLEEP allocations.
546  */
547 int
548 exacct_assemble_task_usage(ac_info_t *ac_task, task_t *tk,
549     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
550     void *ubuf, size_t ubufsize, size_t *actual, int flag)
551 {
552 	ulong_t mask[AC_MASK_SZ];
553 	ea_object_t *task_record;
554 	ea_catalog_t record_type;
555 	task_usage_t *tu;
556 	void *buf;
557 	size_t bufsize;
558 	int ret;
559 
560 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL || flag == EW_INTERVAL);
561 
562 	mutex_enter(&ac_task->ac_lock);
563 	if (ac_task->ac_state == AC_OFF) {
564 		mutex_exit(&ac_task->ac_lock);
565 		return (ENOTACTIVE);
566 	}
567 	bt_copy(ac_task->ac_mask, mask, AC_MASK_SZ);
568 	mutex_exit(&ac_task->ac_lock);
569 
570 	switch (flag) {
571 	case EW_FINAL:
572 		record_type = EXD_GROUP_TASK;
573 		break;
574 	case EW_PARTIAL:
575 		record_type = EXD_GROUP_TASK_PARTIAL;
576 		break;
577 	case EW_INTERVAL:
578 		record_type = EXD_GROUP_TASK_INTERVAL;
579 		break;
580 	}
581 
582 	/*
583 	 * Calculate task usage and assemble it into the task record.
584 	 */
585 	tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
586 	exacct_calculate_task_usage(tk, tu, flag);
587 	task_record = exacct_assemble_task_record(tk, tu, mask, record_type);
588 	if (task_record == NULL) {
589 		/*
590 		 * The current configuration of the accounting system has
591 		 * resulted in records with no data; accordingly, we don't write
592 		 * these, but we return success.
593 		 */
594 		kmem_free(tu, sizeof (task_usage_t));
595 		return (0);
596 	}
597 
598 	/*
599 	 * Pack object into buffer and run callback on it.
600 	 */
601 	bufsize = ea_pack_object(task_record, NULL, 0);
602 	buf = kmem_alloc(bufsize, KM_SLEEP);
603 	(void) ea_pack_object(task_record, buf, bufsize);
604 	ret = callback(ac_task, ubuf, ubufsize, buf, bufsize, actual);
605 
606 	/*
607 	 * Free all previously allocated structures.
608 	 */
609 	kmem_free(buf, bufsize);
610 	ea_free_object(task_record, EUP_ALLOC);
611 	kmem_free(tu, sizeof (task_usage_t));
612 	return (ret);
613 }
614 
615 /*
616  * void exacct_commit_task(void *)
617  *
618  * Overview
619  *   exacct_commit_task() calculates the final usage for a task, updating the
620  *   task usage if task accounting is active, and writing a task record if task
621  *   accounting is active.  exacct_commit_task() is intended for being called
622  *   from a task queue (taskq_t).
623  *
624  * Return values
625  *   None.
626  *
627  * Caller's context
628  *   Suitable for KM_SLEEP allocations.
629  */
630 
631 void
632 exacct_commit_task(void *arg)
633 {
634 	task_t *tk = (task_t *)arg;
635 	size_t size;
636 	zone_t *zone = tk->tk_zone;
637 	struct exacct_globals *acg;
638 
639 	ASSERT(tk != task0p);
640 	ASSERT(tk->tk_memb_list == NULL);
641 
642 	/*
643 	 * Don't do any extra work if the acctctl module isn't loaded.
644 	 */
645 	if (exacct_zone_key != ZONE_KEY_UNINITIALIZED) {
646 		acg = zone_getspecific(exacct_zone_key, zone);
647 		(void) exacct_assemble_task_usage(&acg->ac_task, tk,
648 		    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
649 		if (tk->tk_zone != global_zone) {
650 			acg = zone_getspecific(exacct_zone_key, global_zone);
651 			(void) exacct_assemble_task_usage(&acg->ac_task, tk,
652 			    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
653 		}
654 	}
655 	/*
656 	 * Release associated project and finalize task.
657 	 */
658 	task_end(tk);
659 }
660 
661 static int
662 exacct_attach_proc_item(proc_usage_t *pu, ea_object_t *record, int res)
663 {
664 	int attached = 1;
665 
666 	switch (res) {
667 	case AC_PROC_PID:
668 		(void) ea_attach_item(record, &pu->pu_pid,
669 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PID);
670 		break;
671 	case AC_PROC_UID:
672 		(void) ea_attach_item(record, &pu->pu_ruid,
673 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_UID);
674 		break;
675 	case AC_PROC_FLAG:
676 		(void) ea_attach_item(record, &pu->pu_acflag,
677 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ACCT_FLAGS);
678 		break;
679 	case AC_PROC_GID:
680 		(void) ea_attach_item(record, &pu->pu_rgid,
681 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_GID);
682 		break;
683 	case AC_PROC_PROJID:
684 		(void) ea_attach_item(record, &pu->pu_projid,
685 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PROJID);
686 		break;
687 	case AC_PROC_TASKID:
688 		(void) ea_attach_item(record, &pu->pu_taskid,
689 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TASKID);
690 		break;
691 	case AC_PROC_CPU:
692 		(void) ea_attach_item(record, &pu->pu_utimesec,
693 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_SEC);
694 		(void) ea_attach_item(record, &pu->pu_utimensec,
695 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_NSEC);
696 		(void) ea_attach_item(record, &pu->pu_stimesec,
697 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_SEC);
698 		(void) ea_attach_item(record, &pu->pu_stimensec,
699 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_NSEC);
700 		break;
701 	case AC_PROC_TIME:
702 		(void) ea_attach_item(record, &pu->pu_startsec,
703 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_SEC);
704 		(void) ea_attach_item(record, &pu->pu_startnsec,
705 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_NSEC);
706 		(void) ea_attach_item(record, &pu->pu_finishsec,
707 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_SEC);
708 		(void) ea_attach_item(record, &pu->pu_finishnsec,
709 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_NSEC);
710 		break;
711 	case AC_PROC_COMMAND:
712 		(void) ea_attach_item(record, pu->pu_command,
713 		    strlen(pu->pu_command) + 1, EXT_STRING | EXD_PROC_COMMAND);
714 		break;
715 	case AC_PROC_HOSTNAME:
716 		(void) ea_attach_item(record, pu->pu_nodename,
717 		    strlen(pu->pu_nodename) + 1,
718 		    EXT_STRING | EXD_PROC_HOSTNAME);
719 		break;
720 	case AC_PROC_TTY:
721 		(void) ea_attach_item(record, &pu->pu_major,
722 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MAJOR);
723 		(void) ea_attach_item(record, &pu->pu_minor,
724 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MINOR);
725 		break;
726 	case AC_PROC_MICROSTATE:
727 		(void) ea_attach_item(record, &pu->pu_majflt,
728 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MAJOR);
729 		(void) ea_attach_item(record, &pu->pu_minflt,
730 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MINOR);
731 		(void) ea_attach_item(record, &pu->pu_sndmsg,
732 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_SND);
733 		(void) ea_attach_item(record, &pu->pu_rcvmsg,
734 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_RCV);
735 		(void) ea_attach_item(record, &pu->pu_iblk,
736 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_IN);
737 		(void) ea_attach_item(record, &pu->pu_oblk,
738 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_OUT);
739 		(void) ea_attach_item(record, &pu->pu_ioch,
740 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CHARS_RDWR);
741 		(void) ea_attach_item(record, &pu->pu_vcsw,
742 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_VOL);
743 		(void) ea_attach_item(record, &pu->pu_icsw,
744 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_INV);
745 		(void) ea_attach_item(record, &pu->pu_nsig,
746 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SIGNALS);
747 		(void) ea_attach_item(record, &pu->pu_nswp,
748 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SWAPS);
749 		(void) ea_attach_item(record, &pu->pu_nscl,
750 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SYSCALLS);
751 		break;
752 	case AC_PROC_ANCPID:
753 		(void) ea_attach_item(record, &pu->pu_ancpid,
754 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ANCPID);
755 		break;
756 	case AC_PROC_WAIT_STATUS:
757 		(void) ea_attach_item(record, &pu->pu_wstat,
758 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_WAIT_STATUS);
759 		break;
760 	case AC_PROC_ZONENAME:
761 		(void) ea_attach_item(record, pu->pu_zonename,
762 		    strlen(pu->pu_zonename) + 1,
763 		    EXT_STRING | EXD_PROC_ZONENAME);
764 		break;
765 	case AC_PROC_MEM:
766 		(void) ea_attach_item(record, &pu->pu_mem_rss_avg,
767 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_AVG_K);
768 		(void) ea_attach_item(record, &pu->pu_mem_rss_max,
769 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_MAX_K);
770 		break;
771 	default:
772 		attached = 0;
773 	}
774 	return (attached);
775 }
776 
777 static ea_object_t *
778 exacct_assemble_proc_record(proc_usage_t *pu, ulong_t *mask,
779     ea_catalog_t record_type)
780 {
781 	int res, count;
782 	ea_object_t *record;
783 
784 	/*
785 	 * Assemble usage values into group.
786 	 */
787 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
788 	for (res = 1, count = 0; res <= AC_PROC_MAX_RES; res++)
789 		if (BT_TEST(mask, res))
790 		    count += exacct_attach_proc_item(pu, record, res);
791 	if (count == 0) {
792 		ea_free_object(record, EUP_ALLOC);
793 		record = NULL;
794 	}
795 	return (record);
796 }
797 
798 /*
799  * The following two routines assume that process's p_lock is held or
800  * exacct_commit_proc has been called from exit() when all lwps are stopped.
801  */
802 static void
803 exacct_calculate_proc_mstate(proc_t *p, proc_usage_t *pu)
804 {
805 	kthread_t *t;
806 
807 	ASSERT(MUTEX_HELD(&p->p_lock));
808 	if ((t = p->p_tlist) == NULL)
809 		return;
810 
811 	do {
812 		pu->pu_minflt	+= t->t_lwp->lwp_ru.minflt;
813 		pu->pu_majflt	+= t->t_lwp->lwp_ru.majflt;
814 		pu->pu_sndmsg	+= t->t_lwp->lwp_ru.msgsnd;
815 		pu->pu_rcvmsg	+= t->t_lwp->lwp_ru.msgrcv;
816 		pu->pu_ioch	+= t->t_lwp->lwp_ru.ioch;
817 		pu->pu_iblk	+= t->t_lwp->lwp_ru.inblock;
818 		pu->pu_oblk	+= t->t_lwp->lwp_ru.oublock;
819 		pu->pu_vcsw	+= t->t_lwp->lwp_ru.nvcsw;
820 		pu->pu_icsw	+= t->t_lwp->lwp_ru.nivcsw;
821 		pu->pu_nsig	+= t->t_lwp->lwp_ru.nsignals;
822 		pu->pu_nswp	+= t->t_lwp->lwp_ru.nswap;
823 		pu->pu_nscl	+= t->t_lwp->lwp_ru.sysc;
824 	} while ((t = t->t_forw) != p->p_tlist);
825 }
826 
827 static void
828 exacct_copy_proc_mstate(proc_t *p, proc_usage_t *pu)
829 {
830 	pu->pu_minflt	= p->p_ru.minflt;
831 	pu->pu_majflt	= p->p_ru.majflt;
832 	pu->pu_sndmsg	= p->p_ru.msgsnd;
833 	pu->pu_rcvmsg	= p->p_ru.msgrcv;
834 	pu->pu_ioch	= p->p_ru.ioch;
835 	pu->pu_iblk	= p->p_ru.inblock;
836 	pu->pu_oblk	= p->p_ru.oublock;
837 	pu->pu_vcsw	= p->p_ru.nvcsw;
838 	pu->pu_icsw	= p->p_ru.nivcsw;
839 	pu->pu_nsig	= p->p_ru.nsignals;
840 	pu->pu_nswp	= p->p_ru.nswap;
841 	pu->pu_nscl	= p->p_ru.sysc;
842 }
843 
844 void
845 exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask,
846     int flag, int wstat)
847 {
848 	timestruc_t ts, ts_run;
849 
850 	ASSERT(MUTEX_HELD(&p->p_lock));
851 
852 	/*
853 	 * Convert CPU and execution times to sec/nsec format.
854 	 */
855 	if (BT_TEST(mask, AC_PROC_CPU)) {
856 		hrt2ts(mstate_aggr_state(p, LMS_USER), &ts);
857 		pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec;
858 		pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec;
859 		hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts);
860 		pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec;
861 		pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec;
862 	}
863 	if (BT_TEST(mask, AC_PROC_TIME)) {
864 		gethrestime(&ts);
865 		pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
866 		pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
867 		hrt2ts(gethrtime() - p->p_mstart, &ts_run);
868 		ts.tv_sec -= ts_run.tv_sec;
869 		ts.tv_nsec -= ts_run.tv_nsec;
870 		if (ts.tv_nsec < 0) {
871 			ts.tv_sec--;
872 			if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) {
873 				ts.tv_sec++;
874 				ts.tv_nsec -= NANOSEC;
875 			}
876 		}
877 		pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec;
878 		pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec;
879 	}
880 
881 	pu->pu_pid = p->p_pidp->pid_id;
882 	pu->pu_acflag = p->p_user.u_acflag;
883 	pu->pu_projid = p->p_task->tk_proj->kpj_id;
884 	pu->pu_taskid = p->p_task->tk_tkid;
885 	pu->pu_major = getmajor(p->p_sessp->s_dev);
886 	pu->pu_minor = getminor(p->p_sessp->s_dev);
887 	pu->pu_ancpid = p->p_ancpid;
888 	pu->pu_wstat = wstat;
889 	/*
890 	 * Compute average RSS in K.  The denominator is the number of
891 	 * samples:  the number of clock ticks plus the initial value.
892 	 */
893 	pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) *
894 	    (PAGESIZE / 1024);
895 	pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024);
896 
897 	mutex_enter(&p->p_crlock);
898 	pu->pu_ruid = crgetruid(p->p_cred);
899 	pu->pu_rgid = crgetrgid(p->p_cred);
900 	mutex_exit(&p->p_crlock);
901 
902 	bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1);
903 	bcopy(p->p_zone->zone_name, pu->pu_zonename,
904 	    strlen(p->p_zone->zone_name) + 1);
905 	bcopy(p->p_zone->zone_nodename, pu->pu_nodename,
906 	    strlen(p->p_zone->zone_nodename) + 1);
907 
908 	/*
909 	 * Calculate microstate accounting data for a process that is still
910 	 * running.  Presently, we explicitly collect all of the LWP usage into
911 	 * the proc usage structure here.
912 	 */
913 	if (flag & EW_PARTIAL)
914 		exacct_calculate_proc_mstate(p, pu);
915 	if (flag & EW_FINAL)
916 		exacct_copy_proc_mstate(p, pu);
917 }
918 
919 /*
920  * int exacct_assemble_proc_usage(proc_usage_t *, int (*)(void *, size_t, void
921  *	*, size_t, size_t *), void *, size_t, size_t *)
922  *
923  * Overview
924  *   Assemble record with miscellaneous accounting information about the process
925  *   and execute the callback on it. It is the callback's job to set "actual" to
926  *   the size of record.
927  *
928  * Return values
929  *   The result of the callback function, unless the extended process accounting
930  *   feature is not active, in which case ENOTACTIVE is returned.
931  *
932  * Caller's context
933  *   Suitable for KM_SLEEP allocations.
934  */
935 int
936 exacct_assemble_proc_usage(ac_info_t *ac_proc, proc_usage_t *pu,
937     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
938     void *ubuf, size_t ubufsize, size_t *actual, int flag)
939 {
940 	ulong_t mask[AC_MASK_SZ];
941 	ea_object_t *proc_record;
942 	ea_catalog_t record_type;
943 	void *buf;
944 	size_t bufsize;
945 	int ret;
946 
947 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL);
948 
949 	mutex_enter(&ac_proc->ac_lock);
950 	if (ac_proc->ac_state == AC_OFF) {
951 		mutex_exit(&ac_proc->ac_lock);
952 		return (ENOTACTIVE);
953 	}
954 	bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
955 	mutex_exit(&ac_proc->ac_lock);
956 
957 	switch (flag) {
958 	case EW_FINAL:
959 		record_type = EXD_GROUP_PROC;
960 		break;
961 	case EW_PARTIAL:
962 		record_type = EXD_GROUP_PROC_PARTIAL;
963 		break;
964 	}
965 
966 	proc_record = exacct_assemble_proc_record(pu, mask, record_type);
967 	if (proc_record == NULL)
968 		return (0);
969 
970 	/*
971 	 * Pack object into buffer and pass to callback.
972 	 */
973 	bufsize = ea_pack_object(proc_record, NULL, 0);
974 	buf = kmem_alloc(bufsize, KM_SLEEP);
975 	(void) ea_pack_object(proc_record, buf, bufsize);
976 
977 	ret = callback(ac_proc, ubuf, ubufsize, buf, bufsize, actual);
978 
979 	/*
980 	 * Free all previously allocations.
981 	 */
982 	kmem_free(buf, bufsize);
983 	ea_free_object(proc_record, EUP_ALLOC);
984 	return (ret);
985 }
986 
987 /*
988  * int exacct_commit_callback(ac_info_t *, void *, size_t, void *, size_t,
989  * 	size_t *)
990  *
991  * Overview
992  *   exacct_commit_callback() writes the indicated buffer to the indicated
993  *   extended accounting file.
994  *
995  * Return values
996  *   The result of the write operation is returned.  "actual" is updated to
997  *   contain the number of bytes actually written.
998  *
999  * Caller's context
1000  *   Suitable for a vn_rdwr() operation.
1001  */
1002 /*ARGSUSED*/
1003 int
1004 exacct_commit_callback(ac_info_t *info, void *ubuf, size_t ubufsize,
1005     void *buf, size_t bufsize, size_t *actual)
1006 {
1007 	int error = 0;
1008 
1009 	*actual = 0;
1010 	if ((error = exacct_vn_write(info, buf, bufsize)) == 0)
1011 		*actual = bufsize;
1012 	return (error);
1013 }
1014 
1015 static void
1016 exacct_do_commit_proc(ac_info_t *ac_proc, proc_t *p, int wstat)
1017 {
1018 	size_t size;
1019 	proc_usage_t *pu;
1020 	ulong_t mask[AC_MASK_SZ];
1021 
1022 	mutex_enter(&ac_proc->ac_lock);
1023 	if (ac_proc->ac_state == AC_ON) {
1024 		bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1025 		mutex_exit(&ac_proc->ac_lock);
1026 	} else {
1027 		mutex_exit(&ac_proc->ac_lock);
1028 		return;
1029 	}
1030 
1031 	mutex_enter(&p->p_lock);
1032 	size = strlen(p->p_user.u_comm) + 1;
1033 	mutex_exit(&p->p_lock);
1034 
1035 	pu = kmem_alloc(sizeof (proc_usage_t), KM_SLEEP);
1036 	pu->pu_command = kmem_alloc(size, KM_SLEEP);
1037 	mutex_enter(&p->p_lock);
1038 	exacct_calculate_proc_usage(p, pu, mask, EW_FINAL, wstat);
1039 	mutex_exit(&p->p_lock);
1040 
1041 	(void) exacct_assemble_proc_usage(ac_proc, pu,
1042 	    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
1043 
1044 	kmem_free(pu->pu_command, strlen(pu->pu_command) + 1);
1045 	kmem_free(pu, sizeof (proc_usage_t));
1046 }
1047 /*
1048  * void exacct_commit_proc(proc_t *, int)
1049  *
1050  * Overview
1051  *   exacct_commit_proc() calculates the final usage for a process, updating the
1052  *   task usage if task accounting is active, and writing a process record if
1053  *   process accounting is active.  exacct_commit_proc() is intended for being
1054  *   called from proc_exit().
1055  *
1056  * Return values
1057  *   None.
1058  *
1059  * Caller's context
1060  *   Suitable for KM_SLEEP allocations.  p_lock must not be held at entry.
1061  */
1062 void
1063 exacct_commit_proc(proc_t *p, int wstat)
1064 {
1065 	zone_t *zone = p->p_zone;
1066 	struct exacct_globals *acg, *gacg = NULL;
1067 
1068 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1069 		/*
1070 		 * acctctl module not loaded.  Nothing to do.
1071 		 */
1072 		return;
1073 	}
1074 	acg = zone_getspecific(exacct_zone_key, zone);
1075 	if (zone != global_zone)
1076 		gacg = zone_getspecific(exacct_zone_key, global_zone);
1077 	if (acg->ac_task.ac_state == AC_ON ||
1078 	    (gacg != NULL && gacg->ac_task.ac_state == AC_ON)) {
1079 		exacct_update_task_mstate(p);
1080 	}
1081 
1082 	exacct_do_commit_proc(&acg->ac_proc, p, wstat);
1083 	if (p->p_zone != global_zone)
1084 		exacct_do_commit_proc(&gacg->ac_proc, p, wstat);
1085 }
1086 
1087 static int
1088 exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res)
1089 {
1090 	int attached = 1;
1091 
1092 	switch (res) {
1093 	case AC_FLOW_SADDR:
1094 		if (fu->fu_isv4) {
1095 			(void) ea_attach_item(record, &fu->fu_saddr[3],
1096 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4SADDR);
1097 		} else {
1098 			(void) ea_attach_item(record, &fu->fu_saddr,
1099 			    sizeof (fu->fu_saddr), EXT_RAW |
1100 			    EXD_FLOW_V6SADDR);
1101 		}
1102 		break;
1103 	case AC_FLOW_DADDR:
1104 		if (fu->fu_isv4) {
1105 			(void) ea_attach_item(record, &fu->fu_daddr[3],
1106 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4DADDR);
1107 		} else {
1108 			(void) ea_attach_item(record, &fu->fu_daddr,
1109 			    sizeof (fu->fu_daddr), EXT_RAW |
1110 			    EXD_FLOW_V6DADDR);
1111 		}
1112 		break;
1113 	case AC_FLOW_SPORT:
1114 		(void) ea_attach_item(record, &fu->fu_sport,
1115 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_SPORT);
1116 		break;
1117 	case AC_FLOW_DPORT:
1118 		(void) ea_attach_item(record, &fu->fu_dport,
1119 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_DPORT);
1120 		break;
1121 	case AC_FLOW_PROTOCOL:
1122 		(void) ea_attach_item(record, &fu->fu_protocol,
1123 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_PROTOCOL);
1124 		break;
1125 	case AC_FLOW_DSFIELD:
1126 		(void) ea_attach_item(record, &fu->fu_dsfield,
1127 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_DSFIELD);
1128 		break;
1129 	case AC_FLOW_CTIME:
1130 		(void) ea_attach_item(record, &fu->fu_ctime,
1131 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_CTIME);
1132 		break;
1133 	case AC_FLOW_LSEEN:
1134 		(void) ea_attach_item(record, &fu->fu_lseen,
1135 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_LSEEN);
1136 		break;
1137 	case AC_FLOW_NBYTES:
1138 		(void) ea_attach_item(record, &fu->fu_nbytes,
1139 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NBYTES);
1140 		break;
1141 	case AC_FLOW_NPKTS:
1142 		(void) ea_attach_item(record, &fu->fu_npackets,
1143 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NPKTS);
1144 		break;
1145 	case AC_FLOW_PROJID:
1146 		if (fu->fu_projid >= 0) {
1147 			(void) ea_attach_item(record, &fu->fu_projid,
1148 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_PROJID);
1149 		}
1150 		break;
1151 	case AC_FLOW_UID:
1152 		if (fu->fu_userid >= 0) {
1153 			(void) ea_attach_item(record, &fu->fu_userid,
1154 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID);
1155 		}
1156 		break;
1157 	case AC_FLOW_ANAME:
1158 		(void) ea_attach_item(record, fu->fu_aname,
1159 		    strlen(fu->fu_aname) + 1, EXT_STRING | EXD_FLOW_ANAME);
1160 		break;
1161 	default:
1162 		attached = 0;
1163 	}
1164 	return (attached);
1165 }
1166 
1167 static ea_object_t *
1168 exacct_assemble_flow_record(flow_usage_t *fu, ulong_t *mask,
1169     ea_catalog_t record_type)
1170 {
1171 	int res, count;
1172 	ea_object_t *record;
1173 
1174 	/*
1175 	 * Assemble usage values into group.
1176 	 */
1177 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1178 	for (res = 1, count = 0; res <= AC_FLOW_MAX_RES; res++)
1179 		if (BT_TEST(mask, res))
1180 			count += exacct_attach_flow_item(fu, record, res);
1181 	if (count == 0) {
1182 		ea_free_object(record, EUP_ALLOC);
1183 		record = NULL;
1184 	}
1185 	return (record);
1186 }
1187 
1188 int
1189 exacct_assemble_flow_usage(ac_info_t *ac_flow, flow_usage_t *fu,
1190     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1191     void *ubuf, size_t ubufsize, size_t *actual)
1192 {
1193 	ulong_t mask[AC_MASK_SZ];
1194 	ea_object_t *flow_usage;
1195 	ea_catalog_t record_type;
1196 	void *buf;
1197 	size_t bufsize;
1198 	int ret;
1199 
1200 	mutex_enter(&ac_flow->ac_lock);
1201 	if (ac_flow->ac_state == AC_OFF) {
1202 		mutex_exit(&ac_flow->ac_lock);
1203 		return (ENOTACTIVE);
1204 	}
1205 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1206 	mutex_exit(&ac_flow->ac_lock);
1207 
1208 	record_type = EXD_GROUP_FLOW;
1209 
1210 	flow_usage = exacct_assemble_flow_record(fu, mask, record_type);
1211 	if (flow_usage == NULL) {
1212 		return (0);
1213 	}
1214 
1215 	/*
1216 	 * Pack object into buffer and pass to callback.
1217 	 */
1218 	bufsize = ea_pack_object(flow_usage, NULL, 0);
1219 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1220 	if (buf == NULL) {
1221 		return (ENOMEM);
1222 	}
1223 
1224 	(void) ea_pack_object(flow_usage, buf, bufsize);
1225 
1226 	ret = callback(ac_flow, ubuf, ubufsize, buf, bufsize, actual);
1227 
1228 	/*
1229 	 * Free all previously allocations.
1230 	 */
1231 	kmem_free(buf, bufsize);
1232 	ea_free_object(flow_usage, EUP_ALLOC);
1233 	return (ret);
1234 }
1235 
1236 void
1237 exacct_commit_flow(void *arg)
1238 {
1239 	flow_usage_t *f = (flow_usage_t *)arg;
1240 	size_t size;
1241 	ulong_t mask[AC_MASK_SZ];
1242 	struct exacct_globals *acg;
1243 	ac_info_t *ac_flow;
1244 
1245 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1246 		/*
1247 		 * acctctl module not loaded. Nothing to do.
1248 		 */
1249 		return;
1250 	}
1251 
1252 	/*
1253 	 * Even though each zone nominally has its own flow accounting settings
1254 	 * (ac_flow), these are only maintained by and for the global zone.
1255 	 *
1256 	 * If this were to change in the future, this function should grow a
1257 	 * second zoneid (or zone) argument, and use the corresponding zone's
1258 	 * settings rather than always using those of the global zone.
1259 	 */
1260 	acg = zone_getspecific(exacct_zone_key, global_zone);
1261 	ac_flow = &acg->ac_flow;
1262 
1263 	mutex_enter(&ac_flow->ac_lock);
1264 	if (ac_flow->ac_state == AC_OFF) {
1265 		mutex_exit(&ac_flow->ac_lock);
1266 		return;
1267 	}
1268 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1269 	mutex_exit(&ac_flow->ac_lock);
1270 
1271 	(void) exacct_assemble_flow_usage(ac_flow, f, exacct_commit_callback,
1272 	    NULL, 0, &size);
1273 }
1274 
1275 /*
1276  * int exacct_tag_task(task_t *, void *, size_t, int)
1277  *
1278  * Overview
1279  *   exacct_tag_task() provides the exacct record construction and writing
1280  *   support required by putacct(2) for task entities.
1281  *
1282  * Return values
1283  *   The result of the write operation is returned, unless the extended
1284  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1285  *
1286  * Caller's context
1287  *   Suitable for KM_SLEEP allocations.
1288  */
1289 int
1290 exacct_tag_task(ac_info_t *ac_task, task_t *tk, void *ubuf, size_t ubufsz,
1291     int flags)
1292 {
1293 	int error = 0;
1294 	void *buf;
1295 	size_t bufsize;
1296 	ea_catalog_t cat;
1297 	ea_object_t *tag;
1298 
1299 	mutex_enter(&ac_task->ac_lock);
1300 	if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) {
1301 		mutex_exit(&ac_task->ac_lock);
1302 		return (ENOTACTIVE);
1303 	}
1304 	mutex_exit(&ac_task->ac_lock);
1305 
1306 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_TASK_TAG);
1307 	(void) ea_attach_item(tag, &tk->tk_tkid, 0,
1308 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1309 	(void) ea_attach_item(tag, tk->tk_zone->zone_nodename, 0,
1310 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1311 	if (flags == EP_RAW)
1312 		cat = EXT_RAW | EXC_DEFAULT | EXD_TASK_TAG;
1313 	else
1314 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_TASK_TAG;
1315 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1316 
1317 	bufsize = ea_pack_object(tag, NULL, 0);
1318 	buf = kmem_alloc(bufsize, KM_SLEEP);
1319 	(void) ea_pack_object(tag, buf, bufsize);
1320 	error = exacct_vn_write(ac_task, buf, bufsize);
1321 	kmem_free(buf, bufsize);
1322 	ea_free_object(tag, EUP_ALLOC);
1323 	return (error);
1324 }
1325 
1326 /*
1327  * exacct_tag_proc(pid_t, taskid_t, void *, size_t, int, char *)
1328  *
1329  * Overview
1330  *   exacct_tag_proc() provides the exacct record construction and writing
1331  *   support required by putacct(2) for processes.
1332  *
1333  * Return values
1334  *   The result of the write operation is returned, unless the extended
1335  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1336  *
1337  * Caller's context
1338  *   Suitable for KM_SLEEP allocations.
1339  */
1340 int
1341 exacct_tag_proc(ac_info_t *ac_proc, pid_t pid, taskid_t tkid, void *ubuf,
1342     size_t ubufsz, int flags, const char *hostname)
1343 {
1344 	int error = 0;
1345 	void *buf;
1346 	size_t bufsize;
1347 	ea_catalog_t cat;
1348 	ea_object_t *tag;
1349 
1350 	mutex_enter(&ac_proc->ac_lock);
1351 	if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) {
1352 		mutex_exit(&ac_proc->ac_lock);
1353 		return (ENOTACTIVE);
1354 	}
1355 	mutex_exit(&ac_proc->ac_lock);
1356 
1357 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_PROC_TAG);
1358 	(void) ea_attach_item(tag, &pid, sizeof (uint32_t),
1359 	    EXT_UINT32 | EXC_DEFAULT | EXD_PROC_PID);
1360 	(void) ea_attach_item(tag, &tkid, 0,
1361 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1362 	(void) ea_attach_item(tag, (void *)hostname, 0,
1363 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1364 	if (flags == EP_RAW)
1365 		cat = EXT_RAW | EXC_DEFAULT | EXD_PROC_TAG;
1366 	else
1367 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_PROC_TAG;
1368 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1369 
1370 	bufsize = ea_pack_object(tag, NULL, 0);
1371 	buf = kmem_alloc(bufsize, KM_SLEEP);
1372 	(void) ea_pack_object(tag, buf, bufsize);
1373 	error = exacct_vn_write(ac_proc, buf, bufsize);
1374 	kmem_free(buf, bufsize);
1375 	ea_free_object(tag, EUP_ALLOC);
1376 	return (error);
1377 }
1378 
1379 /*
1380  * void exacct_init(void)
1381  *
1382  * Overview
1383  *   Initialized the extended accounting subsystem.
1384  *
1385  * Return values
1386  *   None.
1387  *
1388  * Caller's context
1389  *   Suitable for KM_SLEEP allocations.
1390  */
1391 void
1392 exacct_init()
1393 {
1394 	exacct_queue = system_taskq;
1395 	exacct_object_cache = kmem_cache_create("exacct_object_cache",
1396 	    sizeof (ea_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1397 }
1398