xref: /titanic_51/usr/src/uts/common/os/exacct.c (revision 980a6e61aeb2038ab2b640d7ac80b36cf5c7d84b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/exacct.h>
29 #include <sys/exacct_catalog.h>
30 #include <sys/disp.h>
31 #include <sys/task.h>
32 #include <sys/proc.h>
33 #include <sys/cmn_err.h>
34 #include <sys/kmem.h>
35 #include <sys/project.h>
36 #include <sys/systm.h>
37 #include <sys/vnode.h>
38 #include <sys/file.h>
39 #include <sys/acctctl.h>
40 #include <sys/time.h>
41 #include <sys/utsname.h>
42 #include <sys/session.h>
43 #include <sys/sysmacros.h>
44 #include <sys/bitmap.h>
45 #include <sys/msacct.h>
46 
47 /*
48  * exacct usage and recording routines
49  *
50  * wracct(2), getacct(2), and the records written at process or task
51  * termination are constructed using the exacct_assemble_[task,proc]_usage()
52  * functions, which take a callback that takes the appropriate action on
53  * the packed exacct record for the task or process.  For the process-related
54  * actions, we partition the routines such that the data collecting component
55  * can be performed while holding p_lock, and all sleeping or blocking
56  * operations can be performed without acquiring p_lock.
57  *
58  * putacct(2), which allows an application to construct a customized record
59  * associated with an existing process or task, has its own entry points:
60  * exacct_tag_task() and exacct_tag_proc().
61  */
62 
/*
 * Task queue for exacct work.
 * NOTE(review): not referenced in this part of the file; presumably
 * exacct_commit_task() is dispatched on it -- confirm against callers.
 */
taskq_t *exacct_queue;
/* kmem cache from which ea_alloc_item() and ea_alloc_group() allocate. */
kmem_cache_t *exacct_object_cache;

/*
 * Key passed to zone_getspecific() to obtain a zone's exacct_globals.
 * exacct_commit_task() treats ZONE_KEY_UNINITIALIZED as "the acctctl module
 * is not loaded".
 */
zone_key_t exacct_zone_key = ZONE_KEY_UNINITIALIZED;

/* Constant items attached to the file header by exacct_create_header(). */
static const uint32_t exacct_version = EXACCT_VERSION;
static const char exacct_header[] = "exacct";
static const char exacct_creator[] = "SunOS";
71 
72 ea_object_t *
73 ea_alloc_item(ea_catalog_t catalog, void *buf, size_t bufsz)
74 {
75 	ea_object_t *item;
76 
77 	item = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
78 	bzero(item, sizeof (ea_object_t));
79 	(void) ea_set_item(item, catalog, buf, bufsz);
80 	return (item);
81 }
82 
83 ea_object_t *
84 ea_alloc_group(ea_catalog_t catalog)
85 {
86 	ea_object_t *group;
87 
88 	group = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
89 	bzero(group, sizeof (ea_object_t));
90 	(void) ea_set_group(group, catalog);
91 	return (group);
92 }
93 
94 ea_object_t *
95 ea_attach_item(ea_object_t *grp, void *buf, size_t bufsz, ea_catalog_t catalog)
96 {
97 	ea_object_t *item;
98 
99 	item = ea_alloc_item(catalog, buf, bufsz);
100 	(void) ea_attach_to_group(grp, item);
101 	return (item);
102 }
103 
104 /*
105  * exacct_add_task_mstate() and exacct_sub_task_mstate() add and subtract
106  * microstate accounting data and resource usage counters from one task_usage_t
107  * from those supplied in another. These functions do not operate on *all*
108  * members of a task_usage_t: for some (e.g. tu_anctaskid) it would not make
109  * sense.
110  */
111 static void
112 exacct_add_task_mstate(task_usage_t *tu, task_usage_t *delta)
113 {
114 	tu->tu_utime  += delta->tu_utime;
115 	tu->tu_stime  += delta->tu_stime;
116 	tu->tu_minflt += delta->tu_minflt;
117 	tu->tu_majflt += delta->tu_majflt;
118 	tu->tu_sndmsg += delta->tu_sndmsg;
119 	tu->tu_rcvmsg += delta->tu_rcvmsg;
120 	tu->tu_ioch   += delta->tu_ioch;
121 	tu->tu_iblk   += delta->tu_iblk;
122 	tu->tu_oblk   += delta->tu_oblk;
123 	tu->tu_vcsw   += delta->tu_vcsw;
124 	tu->tu_icsw   += delta->tu_icsw;
125 	tu->tu_nsig   += delta->tu_nsig;
126 	tu->tu_nswp   += delta->tu_nswp;
127 	tu->tu_nscl   += delta->tu_nscl;
128 }
129 
130 /*
131  * See the comments for exacct_add_task_mstate(), above.
132  */
133 static void
134 exacct_sub_task_mstate(task_usage_t *tu, task_usage_t *delta)
135 {
136 	tu->tu_utime  -= delta->tu_utime;
137 	tu->tu_stime  -= delta->tu_stime;
138 	tu->tu_minflt -= delta->tu_minflt;
139 	tu->tu_majflt -= delta->tu_majflt;
140 	tu->tu_sndmsg -= delta->tu_sndmsg;
141 	tu->tu_rcvmsg -= delta->tu_rcvmsg;
142 	tu->tu_ioch   -= delta->tu_ioch;
143 	tu->tu_iblk   -= delta->tu_iblk;
144 	tu->tu_oblk   -= delta->tu_oblk;
145 	tu->tu_vcsw   -= delta->tu_vcsw;
146 	tu->tu_icsw   -= delta->tu_icsw;
147 	tu->tu_nsig   -= delta->tu_nsig;
148 	tu->tu_nswp   -= delta->tu_nswp;
149 	tu->tu_nscl   -= delta->tu_nscl;
150 }
151 
152 /*
153  * Wrapper for vn_rdwr() used by exacct_vn_write() and exacct_write_header()
154  * to write to the accounting file without corrupting it in case of an I/O or
155  * filesystem error.
156  */
157 static int
158 exacct_vn_write_impl(ac_info_t *info, void *buf, ssize_t bufsize)
159 {
160 	int error;
161 	ssize_t resid;
162 	struct vattr va;
163 
164 	ASSERT(info != NULL);
165 	ASSERT(info->ac_vnode != NULL);
166 	ASSERT(MUTEX_HELD(&info->ac_lock));
167 
168 	/*
169 	 * Save the size. If vn_rdwr fails, reset the size to avoid corrupting
170 	 * the present accounting file.
171 	 */
172 	va.va_mask = AT_SIZE;
173 	error = VOP_GETATTR(info->ac_vnode, &va, 0, kcred, NULL);
174 	if (error == 0) {
175 		error = vn_rdwr(UIO_WRITE, info->ac_vnode, (caddr_t)buf,
176 		    bufsize, 0LL, UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFFSET_T,
177 		    kcred, &resid);
178 		if (error) {
179 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
180 		} else if (resid != 0) {
181 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
182 			error = ENOSPC;
183 		}
184 	}
185 	return (error);
186 }
187 
188 /*
189  * exacct_vn_write() safely writes to an accounting file.  acctctl() prevents
190  * the two accounting vnodes from being equal, and the appropriate ac_lock is
191  * held across the call, so we're single threaded through this code for each
192  * file.
193  */
194 static int
195 exacct_vn_write(ac_info_t *info, void *buf, ssize_t bufsize)
196 {
197 	int error;
198 
199 	if (info == NULL)
200 		return (0);
201 
202 	mutex_enter(&info->ac_lock);
203 
204 	/*
205 	 * Don't do anything unless accounting file is set.
206 	 */
207 	if (info->ac_vnode == NULL) {
208 		mutex_exit(&info->ac_lock);
209 		return (0);
210 	}
211 	error = exacct_vn_write_impl(info, buf, bufsize);
212 	mutex_exit(&info->ac_lock);
213 
214 	return (error);
215 }
216 
217 /*
218  * void *exacct_create_header(size_t *)
219  *
220  * Overview
221  *   exacct_create_header() constructs an exacct file header identifying the
222  *   accounting file as the output of the kernel.  exacct_create_header() and
223  *   the static write_header() and verify_header() routines in libexacct must
224  *   remain synchronized.
225  *
226  * Return values
227  *   A pointer to a packed exacct buffer containing the appropriate header is
228  *   returned; the size of the buffer is placed in the location indicated by
229  *   sizep.
230  *
231  * Caller's context
232  *   Suitable for KM_SLEEP allocations.
233  */
234 void *
235 exacct_create_header(size_t *sizep)
236 {
237 	ea_object_t *hdr_grp;
238 	uint32_t bskip;
239 	void *buf;
240 	size_t bufsize;
241 
242 	hdr_grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_HEADER);
243 	(void) ea_attach_item(hdr_grp, (void *)&exacct_version, 0,
244 	    EXT_UINT32 | EXC_DEFAULT | EXD_VERSION);
245 	(void) ea_attach_item(hdr_grp, (void *)exacct_header, 0,
246 	    EXT_STRING | EXC_DEFAULT | EXD_FILETYPE);
247 	(void) ea_attach_item(hdr_grp, (void *)exacct_creator, 0,
248 	    EXT_STRING | EXC_DEFAULT | EXD_CREATOR);
249 	(void) ea_attach_item(hdr_grp, uts_nodename(), 0,
250 	    EXT_STRING | EXC_DEFAULT | EXD_HOSTNAME);
251 
252 	bufsize = ea_pack_object(hdr_grp, NULL, 0);
253 	buf = kmem_alloc(bufsize, KM_SLEEP);
254 	(void) ea_pack_object(hdr_grp, buf, bufsize);
255 	ea_free_object(hdr_grp, EUP_ALLOC);
256 
257 	/*
258 	 * To prevent reading the header when reading the file backwards,
259 	 * set the large backskip of the header group to 0 (last 4 bytes).
260 	 */
261 	bskip = 0;
262 	exacct_order32(&bskip);
263 	bcopy(&bskip, (char *)buf + bufsize - sizeof (bskip),
264 	    sizeof (bskip));
265 
266 	*sizep = bufsize;
267 	return (buf);
268 }
269 
270 /*
271  * int exacct_write_header(ac_info_t *, void *, size_t)
272  *
273  * Overview
274  *   exacct_write_header() writes the given header buffer to the indicated
275  *   vnode.
276  *
277  * Return values
278  *   The result of the write operation is returned.
279  *
280  * Caller's context
281  *   Caller must hold the ac_lock of the appropriate accounting file
282  *   information block (ac_info_t).
283  */
284 int
285 exacct_write_header(ac_info_t *info, void *hdr, size_t hdrsize)
286 {
287 	if (info != NULL && info->ac_vnode != NULL)
288 		return (exacct_vn_write_impl(info, hdr, hdrsize));
289 
290 	return (0);
291 }
292 
293 static void
294 exacct_get_interval_task_usage(task_t *tk, task_usage_t *tu,
295     task_usage_t **tu_buf)
296 {
297 	task_usage_t *oldtu, *newtu;
298 	task_usage_t **prevusage;
299 
300 	ASSERT(MUTEX_HELD(&tk->tk_usage_lock));
301 	if (getzoneid() != GLOBAL_ZONEID) {
302 		prevusage = &tk->tk_zoneusage;
303 	} else {
304 		prevusage = &tk->tk_prevusage;
305 	}
306 	if ((oldtu = *prevusage) != NULL) {
307 		/*
308 		 * In case we have any accounting information
309 		 * saved from the previous interval record.
310 		 */
311 		newtu = *tu_buf;
312 		bcopy(tu, newtu, sizeof (task_usage_t));
313 		tu->tu_minflt	-= oldtu->tu_minflt;
314 		tu->tu_majflt	-= oldtu->tu_majflt;
315 		tu->tu_sndmsg	-= oldtu->tu_sndmsg;
316 		tu->tu_rcvmsg	-= oldtu->tu_rcvmsg;
317 		tu->tu_ioch	-= oldtu->tu_ioch;
318 		tu->tu_iblk	-= oldtu->tu_iblk;
319 		tu->tu_oblk	-= oldtu->tu_oblk;
320 		tu->tu_vcsw	-= oldtu->tu_vcsw;
321 		tu->tu_icsw	-= oldtu->tu_icsw;
322 		tu->tu_nsig	-= oldtu->tu_nsig;
323 		tu->tu_nswp	-= oldtu->tu_nswp;
324 		tu->tu_nscl	-= oldtu->tu_nscl;
325 		tu->tu_utime	-= oldtu->tu_utime;
326 		tu->tu_stime	-= oldtu->tu_stime;
327 
328 		tu->tu_startsec = oldtu->tu_finishsec;
329 		tu->tu_startnsec = oldtu->tu_finishnsec;
330 		/*
331 		 * Copy the data from our temporary storage to the task's
332 		 * previous interval usage structure for future reference.
333 		 */
334 		bcopy(newtu, oldtu, sizeof (task_usage_t));
335 	} else {
336 		/*
337 		 * Store current statistics in the task's previous interval
338 		 * usage structure for future references.
339 		 */
340 		*prevusage = *tu_buf;
341 		bcopy(tu, *prevusage, sizeof (task_usage_t));
342 		*tu_buf = NULL;
343 	}
344 }
345 
346 static void
347 exacct_snapshot_task_usage(task_t *tk, task_usage_t *tu)
348 {
349 	timestruc_t ts;
350 	proc_t *p;
351 
352 	ASSERT(MUTEX_HELD(&pidlock));
353 
354 	if ((p = tk->tk_memb_list) == NULL)
355 		return;
356 
357 	/*
358 	 * exacct_snapshot_task_usage() provides an approximate snapshot of the
359 	 * usage of the potentially many members of the task.  Since we don't
360 	 * guarantee exactness, we don't acquire the p_lock of any of the member
361 	 * processes.
362 	 */
363 	do {
364 		mutex_enter(&p->p_lock);
365 		tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
366 		tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
367 		mutex_exit(&p->p_lock);
368 		tu->tu_minflt	+= p->p_ru.minflt;
369 		tu->tu_majflt	+= p->p_ru.majflt;
370 		tu->tu_sndmsg	+= p->p_ru.msgsnd;
371 		tu->tu_rcvmsg	+= p->p_ru.msgrcv;
372 		tu->tu_ioch	+= p->p_ru.ioch;
373 		tu->tu_iblk	+= p->p_ru.inblock;
374 		tu->tu_oblk	+= p->p_ru.oublock;
375 		tu->tu_vcsw	+= p->p_ru.nvcsw;
376 		tu->tu_icsw	+= p->p_ru.nivcsw;
377 		tu->tu_nsig	+= p->p_ru.nsignals;
378 		tu->tu_nswp	+= p->p_ru.nswap;
379 		tu->tu_nscl	+= p->p_ru.sysc;
380 	} while ((p = p->p_tasknext) != tk->tk_memb_list);
381 
382 	/*
383 	 * The resource usage accounted for so far will include that
384 	 * contributed by the task's first process. If this process
385 	 * came from another task, then its accumulated resource usage
386 	 * will include a contribution from work performed there.
387 	 * We must therefore subtract any resource usage that was
388 	 * inherited with the first process.
389 	 */
390 	exacct_sub_task_mstate(tu, tk->tk_inherited);
391 
392 	gethrestime(&ts);
393 	tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
394 	tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
395 }
396 
397 /*
398  * void exacct_update_task_mstate(proc_t *)
399  *
400  * Overview
401  *   exacct_update_task_mstate() updates the task usage; it is intended
402  *   to be called from proc_exit().
403  *
404  * Return values
405  *   None.
406  *
407  * Caller's context
408  *   p_lock must be held at entry.
409  */
410 void
411 exacct_update_task_mstate(proc_t *p)
412 {
413 	task_usage_t *tu;
414 
415 	mutex_enter(&p->p_task->tk_usage_lock);
416 	tu = p->p_task->tk_usage;
417 	tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
418 	tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
419 	tu->tu_minflt	+= p->p_ru.minflt;
420 	tu->tu_majflt	+= p->p_ru.majflt;
421 	tu->tu_sndmsg	+= p->p_ru.msgsnd;
422 	tu->tu_rcvmsg	+= p->p_ru.msgrcv;
423 	tu->tu_ioch	+= p->p_ru.ioch;
424 	tu->tu_iblk	+= p->p_ru.inblock;
425 	tu->tu_oblk	+= p->p_ru.oublock;
426 	tu->tu_vcsw	+= p->p_ru.nvcsw;
427 	tu->tu_icsw	+= p->p_ru.nivcsw;
428 	tu->tu_nsig	+= p->p_ru.nsignals;
429 	tu->tu_nswp	+= p->p_ru.nswap;
430 	tu->tu_nscl	+= p->p_ru.sysc;
431 	mutex_exit(&p->p_task->tk_usage_lock);
432 }
433 
434 static void
435 exacct_calculate_task_usage(task_t *tk, task_usage_t *tu, int flag)
436 {
437 	timestruc_t ts;
438 	task_usage_t *tu_buf;
439 
440 	switch (flag) {
441 	case EW_PARTIAL:
442 		/*
443 		 * For partial records we must report the sum of current
444 		 * accounting statistics with previously accumulated
445 		 * statistics.
446 		 */
447 		mutex_enter(&pidlock);
448 		mutex_enter(&tk->tk_usage_lock);
449 
450 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
451 		exacct_snapshot_task_usage(tk, tu);
452 
453 		mutex_exit(&tk->tk_usage_lock);
454 		mutex_exit(&pidlock);
455 		break;
456 	case EW_INTERVAL:
457 		/*
458 		 * We need to allocate spare task_usage_t buffer before
459 		 * grabbing pidlock because we might need it later in
460 		 * exacct_get_interval_task_usage().
461 		 */
462 		tu_buf = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
463 		mutex_enter(&pidlock);
464 		mutex_enter(&tk->tk_usage_lock);
465 
466 		/*
467 		 * For interval records, we deduct the previous microstate
468 		 * accounting data and cpu usage times from previously saved
469 		 * results and update the previous task usage structure.
470 		 */
471 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
472 		exacct_snapshot_task_usage(tk, tu);
473 		exacct_get_interval_task_usage(tk, tu, &tu_buf);
474 
475 		mutex_exit(&tk->tk_usage_lock);
476 		mutex_exit(&pidlock);
477 
478 		if (tu_buf != NULL)
479 			kmem_free(tu_buf, sizeof (task_usage_t));
480 		break;
481 	case EW_FINAL:
482 		/*
483 		 * For final records, we deduct, from the task's current
484 		 * usage, any usage that was inherited with the arrival
485 		 * of a process from a previous task. We then record
486 		 * the task's finish time.
487 		 */
488 		mutex_enter(&tk->tk_usage_lock);
489 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
490 		exacct_sub_task_mstate(tu, tk->tk_inherited);
491 		mutex_exit(&tk->tk_usage_lock);
492 
493 		gethrestime(&ts);
494 		tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
495 		tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
496 
497 		break;
498 	}
499 }
500 
501 static int
502 exacct_attach_task_item(task_t *tk, task_usage_t *tu, ea_object_t *record,
503     int res)
504 {
505 	int attached = 1;
506 
507 	switch (res) {
508 	case AC_TASK_TASKID:
509 		(void) ea_attach_item(record, &tk->tk_tkid,
510 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_TASKID);
511 		break;
512 	case AC_TASK_PROJID:
513 		(void) ea_attach_item(record, &tk->tk_proj->kpj_id,
514 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_PROJID);
515 		break;
516 	case AC_TASK_CPU: {
517 			timestruc_t ts;
518 			uint64_t ui;
519 
520 			hrt2ts(tu->tu_stime, &ts);
521 			ui = ts.tv_sec;
522 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
523 			    EXT_UINT64 | EXD_TASK_CPU_SYS_SEC);
524 			ui = ts.tv_nsec;
525 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
526 			    EXT_UINT64 | EXD_TASK_CPU_SYS_NSEC);
527 
528 			hrt2ts(tu->tu_utime, &ts);
529 			ui = ts.tv_sec;
530 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
531 			    EXT_UINT64 | EXD_TASK_CPU_USER_SEC);
532 			ui = ts.tv_nsec;
533 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
534 			    EXT_UINT64 | EXD_TASK_CPU_USER_NSEC);
535 		}
536 		break;
537 	case AC_TASK_TIME:
538 		(void) ea_attach_item(record, &tu->tu_startsec,
539 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_SEC);
540 		(void) ea_attach_item(record, &tu->tu_startnsec,
541 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_NSEC);
542 		(void) ea_attach_item(record, &tu->tu_finishsec,
543 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_SEC);
544 		(void) ea_attach_item(record, &tu->tu_finishnsec,
545 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_NSEC);
546 		break;
547 	case AC_TASK_HOSTNAME:
548 		(void) ea_attach_item(record, tk->tk_zone->zone_nodename,
549 		    strlen(tk->tk_zone->zone_nodename) + 1,
550 		    EXT_STRING | EXD_TASK_HOSTNAME);
551 			break;
552 	case AC_TASK_MICROSTATE:
553 		(void) ea_attach_item(record, &tu->tu_majflt,
554 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MAJOR);
555 		(void) ea_attach_item(record, &tu->tu_minflt,
556 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MINOR);
557 		(void) ea_attach_item(record, &tu->tu_sndmsg,
558 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_SND);
559 		(void) ea_attach_item(record, &tu->tu_rcvmsg,
560 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_RCV);
561 		(void) ea_attach_item(record, &tu->tu_iblk,
562 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_IN);
563 		(void) ea_attach_item(record, &tu->tu_oblk,
564 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_OUT);
565 		(void) ea_attach_item(record, &tu->tu_ioch,
566 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CHARS_RDWR);
567 		(void) ea_attach_item(record, &tu->tu_vcsw,
568 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_VOL);
569 		(void) ea_attach_item(record, &tu->tu_icsw,
570 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_INV);
571 		(void) ea_attach_item(record, &tu->tu_nsig,
572 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SIGNALS);
573 		(void) ea_attach_item(record, &tu->tu_nswp,
574 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SWAPS);
575 		(void) ea_attach_item(record, &tu->tu_nscl,
576 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SYSCALLS);
577 		break;
578 	case AC_TASK_ANCTASKID:
579 		(void) ea_attach_item(record, &tu->tu_anctaskid,
580 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_ANCTASKID);
581 		break;
582 	case AC_TASK_ZONENAME:
583 		(void) ea_attach_item(record, tk->tk_zone->zone_name,
584 		    strlen(tk->tk_zone->zone_name) + 1,
585 		    EXT_STRING | EXD_TASK_ZONENAME);
586 		break;
587 	default:
588 		attached = 0;
589 	}
590 	return (attached);
591 }
592 
593 static ea_object_t *
594 exacct_assemble_task_record(task_t *tk, task_usage_t *tu, ulong_t *mask,
595     ea_catalog_t record_type)
596 {
597 	int res, count;
598 	ea_object_t *record;
599 
600 	/*
601 	 * Assemble usage values into group.
602 	 */
603 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
604 	for (res = 1, count = 0; res <= AC_TASK_MAX_RES; res++)
605 		if (BT_TEST(mask, res))
606 			count += exacct_attach_task_item(tk, tu, record, res);
607 	if (count == 0) {
608 		ea_free_object(record, EUP_ALLOC);
609 		record = NULL;
610 	}
611 	return (record);
612 }
613 
614 /*
615  * int exacct_assemble_task_usage(task_t *, int (*)(void *, size_t, void *,
616  *	size_t, size_t *), void *, size_t, size_t *, int)
617  *
618  * Overview
619  *   exacct_assemble_task_usage() builds the packed exacct buffer for the
620  *   indicated task, executes the given callback function, and free the packed
621  *   buffer.
622  *
623  * Return values
624  *   Returns 0 on success; otherwise the appropriate error code is returned.
625  *
626  * Caller's context
627  *   Suitable for KM_SLEEP allocations.
628  */
629 int
630 exacct_assemble_task_usage(ac_info_t *ac_task, task_t *tk,
631     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
632     void *ubuf, size_t ubufsize, size_t *actual, int flag)
633 {
634 	ulong_t mask[AC_MASK_SZ];
635 	ea_object_t *task_record;
636 	ea_catalog_t record_type;
637 	task_usage_t *tu;
638 	void *buf;
639 	size_t bufsize;
640 	int ret;
641 
642 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL || flag == EW_INTERVAL);
643 
644 	mutex_enter(&ac_task->ac_lock);
645 	if (ac_task->ac_state == AC_OFF) {
646 		mutex_exit(&ac_task->ac_lock);
647 		return (ENOTACTIVE);
648 	}
649 	bt_copy(ac_task->ac_mask, mask, AC_MASK_SZ);
650 	mutex_exit(&ac_task->ac_lock);
651 
652 	switch (flag) {
653 	case EW_FINAL:
654 		record_type = EXD_GROUP_TASK;
655 		break;
656 	case EW_PARTIAL:
657 		record_type = EXD_GROUP_TASK_PARTIAL;
658 		break;
659 	case EW_INTERVAL:
660 		record_type = EXD_GROUP_TASK_INTERVAL;
661 		break;
662 	}
663 
664 	/*
665 	 * Calculate task usage and assemble it into the task record.
666 	 */
667 	tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
668 	exacct_calculate_task_usage(tk, tu, flag);
669 	task_record = exacct_assemble_task_record(tk, tu, mask, record_type);
670 	if (task_record == NULL) {
671 		/*
672 		 * The current configuration of the accounting system has
673 		 * resulted in records with no data; accordingly, we don't write
674 		 * these, but we return success.
675 		 */
676 		kmem_free(tu, sizeof (task_usage_t));
677 		return (0);
678 	}
679 
680 	/*
681 	 * Pack object into buffer and run callback on it.
682 	 */
683 	bufsize = ea_pack_object(task_record, NULL, 0);
684 	buf = kmem_alloc(bufsize, KM_SLEEP);
685 	(void) ea_pack_object(task_record, buf, bufsize);
686 	ret = callback(ac_task, ubuf, ubufsize, buf, bufsize, actual);
687 
688 	/*
689 	 * Free all previously allocated structures.
690 	 */
691 	kmem_free(buf, bufsize);
692 	ea_free_object(task_record, EUP_ALLOC);
693 	kmem_free(tu, sizeof (task_usage_t));
694 	return (ret);
695 }
696 
697 /*
698  * void exacct_commit_task(void *)
699  *
700  * Overview
701  *   exacct_commit_task() calculates the final usage for a task, updating the
702  *   task usage if task accounting is active, and writing a task record if task
703  *   accounting is active.  exacct_commit_task() is intended for being called
704  *   from a task queue (taskq_t).
705  *
706  * Return values
707  *   None.
708  *
709  * Caller's context
710  *   Suitable for KM_SLEEP allocations.
711  */
712 
713 void
714 exacct_commit_task(void *arg)
715 {
716 	task_t *tk = (task_t *)arg;
717 	size_t size;
718 	zone_t *zone = tk->tk_zone;
719 	struct exacct_globals *acg;
720 
721 	ASSERT(tk != task0p);
722 	ASSERT(tk->tk_memb_list == NULL);
723 
724 	/*
725 	 * Don't do any extra work if the acctctl module isn't loaded.
726 	 */
727 	if (exacct_zone_key != ZONE_KEY_UNINITIALIZED) {
728 		acg = zone_getspecific(exacct_zone_key, zone);
729 		(void) exacct_assemble_task_usage(&acg->ac_task, tk,
730 		    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
731 		if (tk->tk_zone != global_zone) {
732 			acg = zone_getspecific(exacct_zone_key, global_zone);
733 			(void) exacct_assemble_task_usage(&acg->ac_task, tk,
734 			    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
735 		}
736 	}
737 	/*
738 	 * Release associated project and finalize task.
739 	 */
740 	task_end(tk);
741 }
742 
743 static int
744 exacct_attach_proc_item(proc_usage_t *pu, ea_object_t *record, int res)
745 {
746 	int attached = 1;
747 
748 	switch (res) {
749 	case AC_PROC_PID:
750 		(void) ea_attach_item(record, &pu->pu_pid,
751 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PID);
752 		break;
753 	case AC_PROC_UID:
754 		(void) ea_attach_item(record, &pu->pu_ruid,
755 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_UID);
756 		break;
757 	case AC_PROC_FLAG:
758 		(void) ea_attach_item(record, &pu->pu_acflag,
759 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ACCT_FLAGS);
760 		break;
761 	case AC_PROC_GID:
762 		(void) ea_attach_item(record, &pu->pu_rgid,
763 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_GID);
764 		break;
765 	case AC_PROC_PROJID:
766 		(void) ea_attach_item(record, &pu->pu_projid,
767 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PROJID);
768 		break;
769 	case AC_PROC_TASKID:
770 		(void) ea_attach_item(record, &pu->pu_taskid,
771 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TASKID);
772 		break;
773 	case AC_PROC_CPU:
774 		(void) ea_attach_item(record, &pu->pu_utimesec,
775 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_SEC);
776 		(void) ea_attach_item(record, &pu->pu_utimensec,
777 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_NSEC);
778 		(void) ea_attach_item(record, &pu->pu_stimesec,
779 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_SEC);
780 		(void) ea_attach_item(record, &pu->pu_stimensec,
781 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_NSEC);
782 		break;
783 	case AC_PROC_TIME:
784 		(void) ea_attach_item(record, &pu->pu_startsec,
785 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_SEC);
786 		(void) ea_attach_item(record, &pu->pu_startnsec,
787 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_NSEC);
788 		(void) ea_attach_item(record, &pu->pu_finishsec,
789 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_SEC);
790 		(void) ea_attach_item(record, &pu->pu_finishnsec,
791 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_NSEC);
792 		break;
793 	case AC_PROC_COMMAND:
794 		(void) ea_attach_item(record, pu->pu_command,
795 		    strlen(pu->pu_command) + 1, EXT_STRING | EXD_PROC_COMMAND);
796 		break;
797 	case AC_PROC_HOSTNAME:
798 		(void) ea_attach_item(record, pu->pu_nodename,
799 		    strlen(pu->pu_nodename) + 1,
800 		    EXT_STRING | EXD_PROC_HOSTNAME);
801 		break;
802 	case AC_PROC_TTY:
803 		(void) ea_attach_item(record, &pu->pu_major,
804 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MAJOR);
805 		(void) ea_attach_item(record, &pu->pu_minor,
806 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MINOR);
807 		break;
808 	case AC_PROC_MICROSTATE:
809 		(void) ea_attach_item(record, &pu->pu_majflt,
810 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MAJOR);
811 		(void) ea_attach_item(record, &pu->pu_minflt,
812 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MINOR);
813 		(void) ea_attach_item(record, &pu->pu_sndmsg,
814 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_SND);
815 		(void) ea_attach_item(record, &pu->pu_rcvmsg,
816 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_RCV);
817 		(void) ea_attach_item(record, &pu->pu_iblk,
818 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_IN);
819 		(void) ea_attach_item(record, &pu->pu_oblk,
820 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_OUT);
821 		(void) ea_attach_item(record, &pu->pu_ioch,
822 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CHARS_RDWR);
823 		(void) ea_attach_item(record, &pu->pu_vcsw,
824 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_VOL);
825 		(void) ea_attach_item(record, &pu->pu_icsw,
826 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_INV);
827 		(void) ea_attach_item(record, &pu->pu_nsig,
828 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SIGNALS);
829 		(void) ea_attach_item(record, &pu->pu_nswp,
830 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SWAPS);
831 		(void) ea_attach_item(record, &pu->pu_nscl,
832 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SYSCALLS);
833 		break;
834 	case AC_PROC_ANCPID:
835 		(void) ea_attach_item(record, &pu->pu_ancpid,
836 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ANCPID);
837 		break;
838 	case AC_PROC_WAIT_STATUS:
839 		(void) ea_attach_item(record, &pu->pu_wstat,
840 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_WAIT_STATUS);
841 		break;
842 	case AC_PROC_ZONENAME:
843 		(void) ea_attach_item(record, pu->pu_zonename,
844 		    strlen(pu->pu_zonename) + 1,
845 		    EXT_STRING | EXD_PROC_ZONENAME);
846 		break;
847 	case AC_PROC_MEM:
848 		(void) ea_attach_item(record, &pu->pu_mem_rss_avg,
849 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_AVG_K);
850 		(void) ea_attach_item(record, &pu->pu_mem_rss_max,
851 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_MAX_K);
852 		break;
853 	default:
854 		attached = 0;
855 	}
856 	return (attached);
857 }
858 
859 static ea_object_t *
860 exacct_assemble_proc_record(proc_usage_t *pu, ulong_t *mask,
861     ea_catalog_t record_type)
862 {
863 	int res, count;
864 	ea_object_t *record;
865 
866 	/*
867 	 * Assemble usage values into group.
868 	 */
869 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
870 	for (res = 1, count = 0; res <= AC_PROC_MAX_RES; res++)
871 		if (BT_TEST(mask, res))
872 			count += exacct_attach_proc_item(pu, record, res);
873 	if (count == 0) {
874 		ea_free_object(record, EUP_ALLOC);
875 		record = NULL;
876 	}
877 	return (record);
878 }
879 
880 /*
881  * The following two routines assume that process's p_lock is held or
882  * exacct_commit_proc has been called from exit() when all lwps are stopped.
883  */
884 static void
885 exacct_calculate_proc_mstate(proc_t *p, proc_usage_t *pu)
886 {
887 	kthread_t *t;
888 
889 	ASSERT(MUTEX_HELD(&p->p_lock));
890 	if ((t = p->p_tlist) == NULL)
891 		return;
892 
893 	do {
894 		pu->pu_minflt	+= t->t_lwp->lwp_ru.minflt;
895 		pu->pu_majflt	+= t->t_lwp->lwp_ru.majflt;
896 		pu->pu_sndmsg	+= t->t_lwp->lwp_ru.msgsnd;
897 		pu->pu_rcvmsg	+= t->t_lwp->lwp_ru.msgrcv;
898 		pu->pu_ioch	+= t->t_lwp->lwp_ru.ioch;
899 		pu->pu_iblk	+= t->t_lwp->lwp_ru.inblock;
900 		pu->pu_oblk	+= t->t_lwp->lwp_ru.oublock;
901 		pu->pu_vcsw	+= t->t_lwp->lwp_ru.nvcsw;
902 		pu->pu_icsw	+= t->t_lwp->lwp_ru.nivcsw;
903 		pu->pu_nsig	+= t->t_lwp->lwp_ru.nsignals;
904 		pu->pu_nswp	+= t->t_lwp->lwp_ru.nswap;
905 		pu->pu_nscl	+= t->t_lwp->lwp_ru.sysc;
906 	} while ((t = t->t_forw) != p->p_tlist);
907 }
908 
909 static void
910 exacct_copy_proc_mstate(proc_t *p, proc_usage_t *pu)
911 {
912 	pu->pu_minflt	= p->p_ru.minflt;
913 	pu->pu_majflt	= p->p_ru.majflt;
914 	pu->pu_sndmsg	= p->p_ru.msgsnd;
915 	pu->pu_rcvmsg	= p->p_ru.msgrcv;
916 	pu->pu_ioch	= p->p_ru.ioch;
917 	pu->pu_iblk	= p->p_ru.inblock;
918 	pu->pu_oblk	= p->p_ru.oublock;
919 	pu->pu_vcsw	= p->p_ru.nvcsw;
920 	pu->pu_icsw	= p->p_ru.nivcsw;
921 	pu->pu_nsig	= p->p_ru.nsignals;
922 	pu->pu_nswp	= p->p_ru.nswap;
923 	pu->pu_nscl	= p->p_ru.sysc;
924 }
925 
926 void
927 exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask,
928     int flag, int wstat)
929 {
930 	timestruc_t ts, ts_run;
931 
932 	ASSERT(MUTEX_HELD(&p->p_lock));
933 
934 	/*
935 	 * Convert CPU and execution times to sec/nsec format.
936 	 */
937 	if (BT_TEST(mask, AC_PROC_CPU)) {
938 		hrt2ts(mstate_aggr_state(p, LMS_USER), &ts);
939 		pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec;
940 		pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec;
941 		hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts);
942 		pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec;
943 		pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec;
944 	}
945 	if (BT_TEST(mask, AC_PROC_TIME)) {
946 		gethrestime(&ts);
947 		pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
948 		pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
949 		hrt2ts(gethrtime() - p->p_mstart, &ts_run);
950 		ts.tv_sec -= ts_run.tv_sec;
951 		ts.tv_nsec -= ts_run.tv_nsec;
952 		if (ts.tv_nsec < 0) {
953 			ts.tv_sec--;
954 			if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) {
955 				ts.tv_sec++;
956 				ts.tv_nsec -= NANOSEC;
957 			}
958 		}
959 		pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec;
960 		pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec;
961 	}
962 
963 	pu->pu_pid = p->p_pidp->pid_id;
964 	pu->pu_acflag = p->p_user.u_acflag;
965 	pu->pu_projid = p->p_task->tk_proj->kpj_id;
966 	pu->pu_taskid = p->p_task->tk_tkid;
967 	pu->pu_major = getmajor(p->p_sessp->s_dev);
968 	pu->pu_minor = getminor(p->p_sessp->s_dev);
969 	pu->pu_ancpid = p->p_ancpid;
970 	pu->pu_wstat = wstat;
971 	/*
972 	 * Compute average RSS in K.  The denominator is the number of
973 	 * samples:  the number of clock ticks plus the initial value.
974 	 */
975 	pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) *
976 	    (PAGESIZE / 1024);
977 	pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024);
978 
979 	mutex_enter(&p->p_crlock);
980 	pu->pu_ruid = crgetruid(p->p_cred);
981 	pu->pu_rgid = crgetrgid(p->p_cred);
982 	mutex_exit(&p->p_crlock);
983 
984 	bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1);
985 	bcopy(p->p_zone->zone_name, pu->pu_zonename,
986 	    strlen(p->p_zone->zone_name) + 1);
987 	bcopy(p->p_zone->zone_nodename, pu->pu_nodename,
988 	    strlen(p->p_zone->zone_nodename) + 1);
989 
990 	/*
991 	 * Calculate microstate accounting data for a process that is still
992 	 * running.  Presently, we explicitly collect all of the LWP usage into
993 	 * the proc usage structure here.
994 	 */
995 	if (flag & EW_PARTIAL)
996 		exacct_calculate_proc_mstate(p, pu);
997 	if (flag & EW_FINAL)
998 		exacct_copy_proc_mstate(p, pu);
999 }
1000 
1001 /*
1002  * int exacct_assemble_proc_usage(proc_usage_t *, int (*)(void *, size_t, void
1003  *	*, size_t, size_t *), void *, size_t, size_t *)
1004  *
1005  * Overview
1006  *   Assemble record with miscellaneous accounting information about the process
1007  *   and execute the callback on it. It is the callback's job to set "actual" to
1008  *   the size of record.
1009  *
1010  * Return values
1011  *   The result of the callback function, unless the extended process accounting
1012  *   feature is not active, in which case ENOTACTIVE is returned.
1013  *
1014  * Caller's context
1015  *   Suitable for KM_SLEEP allocations.
1016  */
1017 int
1018 exacct_assemble_proc_usage(ac_info_t *ac_proc, proc_usage_t *pu,
1019     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1020     void *ubuf, size_t ubufsize, size_t *actual, int flag)
1021 {
1022 	ulong_t mask[AC_MASK_SZ];
1023 	ea_object_t *proc_record;
1024 	ea_catalog_t record_type;
1025 	void *buf;
1026 	size_t bufsize;
1027 	int ret;
1028 
1029 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL);
1030 
1031 	mutex_enter(&ac_proc->ac_lock);
1032 	if (ac_proc->ac_state == AC_OFF) {
1033 		mutex_exit(&ac_proc->ac_lock);
1034 		return (ENOTACTIVE);
1035 	}
1036 	bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1037 	mutex_exit(&ac_proc->ac_lock);
1038 
1039 	switch (flag) {
1040 	case EW_FINAL:
1041 		record_type = EXD_GROUP_PROC;
1042 		break;
1043 	case EW_PARTIAL:
1044 		record_type = EXD_GROUP_PROC_PARTIAL;
1045 		break;
1046 	}
1047 
1048 	proc_record = exacct_assemble_proc_record(pu, mask, record_type);
1049 	if (proc_record == NULL)
1050 		return (0);
1051 
1052 	/*
1053 	 * Pack object into buffer and pass to callback.
1054 	 */
1055 	bufsize = ea_pack_object(proc_record, NULL, 0);
1056 	buf = kmem_alloc(bufsize, KM_SLEEP);
1057 	(void) ea_pack_object(proc_record, buf, bufsize);
1058 
1059 	ret = callback(ac_proc, ubuf, ubufsize, buf, bufsize, actual);
1060 
1061 	/*
1062 	 * Free all previously allocations.
1063 	 */
1064 	kmem_free(buf, bufsize);
1065 	ea_free_object(proc_record, EUP_ALLOC);
1066 	return (ret);
1067 }
1068 
1069 /*
1070  * int exacct_commit_callback(ac_info_t *, void *, size_t, void *, size_t,
1071  * 	size_t *)
1072  *
1073  * Overview
1074  *   exacct_commit_callback() writes the indicated buffer to the indicated
1075  *   extended accounting file.
1076  *
1077  * Return values
1078  *   The result of the write operation is returned.  "actual" is updated to
1079  *   contain the number of bytes actually written.
1080  *
1081  * Caller's context
1082  *   Suitable for a vn_rdwr() operation.
1083  */
1084 /*ARGSUSED*/
1085 int
1086 exacct_commit_callback(ac_info_t *info, void *ubuf, size_t ubufsize,
1087     void *buf, size_t bufsize, size_t *actual)
1088 {
1089 	int error = 0;
1090 
1091 	*actual = 0;
1092 	if ((error = exacct_vn_write(info, buf, bufsize)) == 0)
1093 		*actual = bufsize;
1094 	return (error);
1095 }
1096 
1097 static void
1098 exacct_do_commit_proc(ac_info_t *ac_proc, proc_t *p, int wstat)
1099 {
1100 	size_t size;
1101 	proc_usage_t *pu;
1102 	ulong_t mask[AC_MASK_SZ];
1103 
1104 	mutex_enter(&ac_proc->ac_lock);
1105 	if (ac_proc->ac_state == AC_ON) {
1106 		bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1107 		mutex_exit(&ac_proc->ac_lock);
1108 	} else {
1109 		mutex_exit(&ac_proc->ac_lock);
1110 		return;
1111 	}
1112 
1113 	mutex_enter(&p->p_lock);
1114 	size = strlen(p->p_user.u_comm) + 1;
1115 	mutex_exit(&p->p_lock);
1116 
1117 	pu = kmem_alloc(sizeof (proc_usage_t), KM_SLEEP);
1118 	pu->pu_command = kmem_alloc(size, KM_SLEEP);
1119 	mutex_enter(&p->p_lock);
1120 	exacct_calculate_proc_usage(p, pu, mask, EW_FINAL, wstat);
1121 	mutex_exit(&p->p_lock);
1122 
1123 	(void) exacct_assemble_proc_usage(ac_proc, pu,
1124 	    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
1125 
1126 	kmem_free(pu->pu_command, strlen(pu->pu_command) + 1);
1127 	kmem_free(pu, sizeof (proc_usage_t));
1128 }
1129 
1130 /*
1131  * void exacct_commit_proc(proc_t *, int)
1132  *
1133  * Overview
1134  *   exacct_commit_proc() calculates the final usage for a process, updating the
1135  *   task usage if task accounting is active, and writing a process record if
1136  *   process accounting is active.  exacct_commit_proc() is intended for being
1137  *   called from proc_exit().
1138  *
1139  * Return values
1140  *   None.
1141  *
1142  * Caller's context
1143  *   Suitable for KM_SLEEP allocations.  p_lock must not be held at entry.
1144  */
1145 void
1146 exacct_commit_proc(proc_t *p, int wstat)
1147 {
1148 	zone_t *zone = p->p_zone;
1149 	struct exacct_globals *acg, *gacg = NULL;
1150 
1151 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1152 		/*
1153 		 * acctctl module not loaded.  Nothing to do.
1154 		 */
1155 		return;
1156 	}
1157 	acg = zone_getspecific(exacct_zone_key, zone);
1158 	exacct_do_commit_proc(&acg->ac_proc, p, wstat);
1159 	if (zone != global_zone) {
1160 		gacg = zone_getspecific(exacct_zone_key, global_zone);
1161 		exacct_do_commit_proc(&gacg->ac_proc, p, wstat);
1162 	}
1163 }
1164 
1165 static int
1166 exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res)
1167 {
1168 	int attached = 1;
1169 
1170 	switch (res) {
1171 	case AC_FLOW_SADDR:
1172 		if (fu->fu_isv4) {
1173 			(void) ea_attach_item(record, &fu->fu_saddr[3],
1174 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4SADDR);
1175 		} else {
1176 			(void) ea_attach_item(record, &fu->fu_saddr,
1177 			    sizeof (fu->fu_saddr), EXT_RAW |
1178 			    EXD_FLOW_V6SADDR);
1179 		}
1180 		break;
1181 	case AC_FLOW_DADDR:
1182 		if (fu->fu_isv4) {
1183 			(void) ea_attach_item(record, &fu->fu_daddr[3],
1184 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4DADDR);
1185 		} else {
1186 			(void) ea_attach_item(record, &fu->fu_daddr,
1187 			    sizeof (fu->fu_daddr), EXT_RAW |
1188 			    EXD_FLOW_V6DADDR);
1189 		}
1190 		break;
1191 	case AC_FLOW_SPORT:
1192 		(void) ea_attach_item(record, &fu->fu_sport,
1193 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_SPORT);
1194 		break;
1195 	case AC_FLOW_DPORT:
1196 		(void) ea_attach_item(record, &fu->fu_dport,
1197 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_DPORT);
1198 		break;
1199 	case AC_FLOW_PROTOCOL:
1200 		(void) ea_attach_item(record, &fu->fu_protocol,
1201 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_PROTOCOL);
1202 		break;
1203 	case AC_FLOW_DSFIELD:
1204 		(void) ea_attach_item(record, &fu->fu_dsfield,
1205 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_DSFIELD);
1206 		break;
1207 	case AC_FLOW_CTIME:
1208 		(void) ea_attach_item(record, &fu->fu_ctime,
1209 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_CTIME);
1210 		break;
1211 	case AC_FLOW_LSEEN:
1212 		(void) ea_attach_item(record, &fu->fu_lseen,
1213 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_LSEEN);
1214 		break;
1215 	case AC_FLOW_NBYTES:
1216 		(void) ea_attach_item(record, &fu->fu_nbytes,
1217 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NBYTES);
1218 		break;
1219 	case AC_FLOW_NPKTS:
1220 		(void) ea_attach_item(record, &fu->fu_npackets,
1221 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NPKTS);
1222 		break;
1223 	case AC_FLOW_PROJID:
1224 		if (fu->fu_projid >= 0) {
1225 			(void) ea_attach_item(record, &fu->fu_projid,
1226 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_PROJID);
1227 		}
1228 		break;
1229 	case AC_FLOW_UID:
1230 		if (fu->fu_userid >= 0) {
1231 			(void) ea_attach_item(record, &fu->fu_userid,
1232 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID);
1233 		}
1234 		break;
1235 	case AC_FLOW_ANAME:
1236 		(void) ea_attach_item(record, fu->fu_aname,
1237 		    strlen(fu->fu_aname) + 1, EXT_STRING | EXD_FLOW_ANAME);
1238 		break;
1239 	default:
1240 		attached = 0;
1241 	}
1242 	return (attached);
1243 }
1244 
1245 static ea_object_t *
1246 exacct_assemble_flow_record(flow_usage_t *fu, ulong_t *mask,
1247     ea_catalog_t record_type)
1248 {
1249 	int res, count;
1250 	ea_object_t *record;
1251 
1252 	/*
1253 	 * Assemble usage values into group.
1254 	 */
1255 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1256 	for (res = 1, count = 0; res <= AC_FLOW_MAX_RES; res++)
1257 		if (BT_TEST(mask, res))
1258 			count += exacct_attach_flow_item(fu, record, res);
1259 	if (count == 0) {
1260 		ea_free_object(record, EUP_ALLOC);
1261 		record = NULL;
1262 	}
1263 	return (record);
1264 }
1265 
1266 int
1267 exacct_assemble_flow_usage(ac_info_t *ac_flow, flow_usage_t *fu,
1268     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1269     void *ubuf, size_t ubufsize, size_t *actual)
1270 {
1271 	ulong_t mask[AC_MASK_SZ];
1272 	ea_object_t *flow_usage;
1273 	ea_catalog_t record_type;
1274 	void *buf;
1275 	size_t bufsize;
1276 	int ret;
1277 
1278 	mutex_enter(&ac_flow->ac_lock);
1279 	if (ac_flow->ac_state == AC_OFF) {
1280 		mutex_exit(&ac_flow->ac_lock);
1281 		return (ENOTACTIVE);
1282 	}
1283 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1284 	mutex_exit(&ac_flow->ac_lock);
1285 
1286 	record_type = EXD_GROUP_FLOW;
1287 
1288 	flow_usage = exacct_assemble_flow_record(fu, mask, record_type);
1289 	if (flow_usage == NULL) {
1290 		return (0);
1291 	}
1292 
1293 	/*
1294 	 * Pack object into buffer and pass to callback.
1295 	 */
1296 	bufsize = ea_pack_object(flow_usage, NULL, 0);
1297 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1298 	if (buf == NULL) {
1299 		return (ENOMEM);
1300 	}
1301 
1302 	(void) ea_pack_object(flow_usage, buf, bufsize);
1303 
1304 	ret = callback(ac_flow, ubuf, ubufsize, buf, bufsize, actual);
1305 
1306 	/*
1307 	 * Free all previously allocations.
1308 	 */
1309 	kmem_free(buf, bufsize);
1310 	ea_free_object(flow_usage, EUP_ALLOC);
1311 	return (ret);
1312 }
1313 
1314 void
1315 exacct_commit_flow(void *arg)
1316 {
1317 	flow_usage_t *f = (flow_usage_t *)arg;
1318 	size_t size;
1319 	ulong_t mask[AC_MASK_SZ];
1320 	struct exacct_globals *acg;
1321 	ac_info_t *ac_flow;
1322 
1323 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1324 		/*
1325 		 * acctctl module not loaded. Nothing to do.
1326 		 */
1327 		return;
1328 	}
1329 
1330 	/*
1331 	 * Even though each zone nominally has its own flow accounting settings
1332 	 * (ac_flow), these are only maintained by and for the global zone.
1333 	 *
1334 	 * If this were to change in the future, this function should grow a
1335 	 * second zoneid (or zone) argument, and use the corresponding zone's
1336 	 * settings rather than always using those of the global zone.
1337 	 */
1338 	acg = zone_getspecific(exacct_zone_key, global_zone);
1339 	ac_flow = &acg->ac_flow;
1340 
1341 	mutex_enter(&ac_flow->ac_lock);
1342 	if (ac_flow->ac_state == AC_OFF) {
1343 		mutex_exit(&ac_flow->ac_lock);
1344 		return;
1345 	}
1346 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1347 	mutex_exit(&ac_flow->ac_lock);
1348 
1349 	(void) exacct_assemble_flow_usage(ac_flow, f, exacct_commit_callback,
1350 	    NULL, 0, &size);
1351 }
1352 
1353 /*
1354  * int exacct_tag_task(task_t *, void *, size_t, int)
1355  *
1356  * Overview
1357  *   exacct_tag_task() provides the exacct record construction and writing
1358  *   support required by putacct(2) for task entities.
1359  *
1360  * Return values
1361  *   The result of the write operation is returned, unless the extended
1362  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1363  *
1364  * Caller's context
1365  *   Suitable for KM_SLEEP allocations.
1366  */
1367 int
1368 exacct_tag_task(ac_info_t *ac_task, task_t *tk, void *ubuf, size_t ubufsz,
1369     int flags)
1370 {
1371 	int error = 0;
1372 	void *buf;
1373 	size_t bufsize;
1374 	ea_catalog_t cat;
1375 	ea_object_t *tag;
1376 
1377 	mutex_enter(&ac_task->ac_lock);
1378 	if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) {
1379 		mutex_exit(&ac_task->ac_lock);
1380 		return (ENOTACTIVE);
1381 	}
1382 	mutex_exit(&ac_task->ac_lock);
1383 
1384 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_TASK_TAG);
1385 	(void) ea_attach_item(tag, &tk->tk_tkid, 0,
1386 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1387 	(void) ea_attach_item(tag, tk->tk_zone->zone_nodename, 0,
1388 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1389 	if (flags == EP_RAW)
1390 		cat = EXT_RAW | EXC_DEFAULT | EXD_TASK_TAG;
1391 	else
1392 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_TASK_TAG;
1393 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1394 
1395 	bufsize = ea_pack_object(tag, NULL, 0);
1396 	buf = kmem_alloc(bufsize, KM_SLEEP);
1397 	(void) ea_pack_object(tag, buf, bufsize);
1398 	error = exacct_vn_write(ac_task, buf, bufsize);
1399 	kmem_free(buf, bufsize);
1400 	ea_free_object(tag, EUP_ALLOC);
1401 	return (error);
1402 }
1403 
1404 /*
1405  * exacct_tag_proc(pid_t, taskid_t, void *, size_t, int, char *)
1406  *
1407  * Overview
1408  *   exacct_tag_proc() provides the exacct record construction and writing
1409  *   support required by putacct(2) for processes.
1410  *
1411  * Return values
1412  *   The result of the write operation is returned, unless the extended
1413  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1414  *
1415  * Caller's context
1416  *   Suitable for KM_SLEEP allocations.
1417  */
1418 int
1419 exacct_tag_proc(ac_info_t *ac_proc, pid_t pid, taskid_t tkid, void *ubuf,
1420     size_t ubufsz, int flags, const char *hostname)
1421 {
1422 	int error = 0;
1423 	void *buf;
1424 	size_t bufsize;
1425 	ea_catalog_t cat;
1426 	ea_object_t *tag;
1427 
1428 	mutex_enter(&ac_proc->ac_lock);
1429 	if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) {
1430 		mutex_exit(&ac_proc->ac_lock);
1431 		return (ENOTACTIVE);
1432 	}
1433 	mutex_exit(&ac_proc->ac_lock);
1434 
1435 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_PROC_TAG);
1436 	(void) ea_attach_item(tag, &pid, sizeof (uint32_t),
1437 	    EXT_UINT32 | EXC_DEFAULT | EXD_PROC_PID);
1438 	(void) ea_attach_item(tag, &tkid, 0,
1439 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1440 	(void) ea_attach_item(tag, (void *)hostname, 0,
1441 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1442 	if (flags == EP_RAW)
1443 		cat = EXT_RAW | EXC_DEFAULT | EXD_PROC_TAG;
1444 	else
1445 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_PROC_TAG;
1446 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1447 
1448 	bufsize = ea_pack_object(tag, NULL, 0);
1449 	buf = kmem_alloc(bufsize, KM_SLEEP);
1450 	(void) ea_pack_object(tag, buf, bufsize);
1451 	error = exacct_vn_write(ac_proc, buf, bufsize);
1452 	kmem_free(buf, bufsize);
1453 	ea_free_object(tag, EUP_ALLOC);
1454 	return (error);
1455 }
1456 
1457 /*
1458  * void exacct_init(void)
1459  *
1460  * Overview
1461  *   Initialized the extended accounting subsystem.
1462  *
1463  * Return values
1464  *   None.
1465  *
1466  * Caller's context
1467  *   Suitable for KM_SLEEP allocations.
1468  */
1469 void
1470 exacct_init()
1471 {
1472 	exacct_queue = system_taskq;
1473 	exacct_object_cache = kmem_cache_create("exacct_object_cache",
1474 	    sizeof (ea_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1475 }
1476 
1477 /*
1478  * exacct_snapshot_proc_mstate() copies a process's microstate accounting data
1479  * and resource usage counters into a given task_usage_t. It differs from
1480  * exacct_copy_proc_mstate() in that here a) we are copying to a task_usage_t,
1481  * b) p_lock will have been acquired earlier in the call path and c) we
1482  * are here including the process's user and system times.
1483  */
1484 static void
1485 exacct_snapshot_proc_mstate(proc_t *p, task_usage_t *tu)
1486 {
1487 	tu->tu_utime  = mstate_aggr_state(p, LMS_USER);
1488 	tu->tu_stime  = mstate_aggr_state(p, LMS_SYSTEM);
1489 	tu->tu_minflt = p->p_ru.minflt;
1490 	tu->tu_majflt = p->p_ru.majflt;
1491 	tu->tu_sndmsg = p->p_ru.msgsnd;
1492 	tu->tu_rcvmsg = p->p_ru.msgrcv;
1493 	tu->tu_ioch   = p->p_ru.ioch;
1494 	tu->tu_iblk   = p->p_ru.inblock;
1495 	tu->tu_oblk   = p->p_ru.oublock;
1496 	tu->tu_vcsw   = p->p_ru.nvcsw;
1497 	tu->tu_icsw   = p->p_ru.nivcsw;
1498 	tu->tu_nsig   = p->p_ru.nsignals;
1499 	tu->tu_nswp   = p->p_ru.nswap;
1500 	tu->tu_nscl   = p->p_ru.sysc;
1501 }
1502 
1503 /*
1504  * void exacct_move_mstate(proc_t *, task_t *, task_t *)
1505  *
1506  * Overview
1507  *   exacct_move_mstate() is called by task_change() and accounts for
1508  *   a process's resource usage when it is moved from one task to another.
1509  *
1510  *   The process's usage at this point is recorded in the new task so
1511  *   that it can be excluded from the calculation of resources consumed
1512  *   by that task.
1513  *
1514  *   The resource usage inherited by the new task is also added to the
1515  *   aggregate maintained by the old task for processes that have exited.
1516  *
1517  * Return values
1518  *   None.
1519  *
1520  * Caller's context
1521  *   pidlock and p_lock held across exacct_move_mstate().
1522  */
1523 void
1524 exacct_move_mstate(proc_t *p, task_t *oldtk, task_t *newtk)
1525 {
1526 	task_usage_t tu;
1527 
1528 	/* Take a snapshot of this process's mstate and RU counters */
1529 	exacct_snapshot_proc_mstate(p, &tu);
1530 
1531 	/*
1532 	 * Use the snapshot to increment the aggregate usage of the old
1533 	 * task, and the inherited usage of the new one.
1534 	 */
1535 	mutex_enter(&oldtk->tk_usage_lock);
1536 	exacct_add_task_mstate(oldtk->tk_usage, &tu);
1537 	mutex_exit(&oldtk->tk_usage_lock);
1538 	mutex_enter(&newtk->tk_usage_lock);
1539 	exacct_add_task_mstate(newtk->tk_inherited, &tu);
1540 	mutex_exit(&newtk->tk_usage_lock);
1541 }
1542