xref: /titanic_44/usr/src/uts/common/os/exacct.c (revision ee5416c9d7e449233197d5d20bc6b81e4ff091b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/exacct.h>
29 #include <sys/exacct_catalog.h>
30 #include <sys/disp.h>
31 #include <sys/task.h>
32 #include <sys/proc.h>
33 #include <sys/cmn_err.h>
34 #include <sys/kmem.h>
35 #include <sys/project.h>
36 #include <sys/systm.h>
37 #include <sys/vnode.h>
38 #include <sys/file.h>
39 #include <sys/acctctl.h>
40 #include <sys/time.h>
41 #include <sys/utsname.h>
42 #include <sys/session.h>
43 #include <sys/sysmacros.h>
44 #include <sys/bitmap.h>
45 #include <sys/msacct.h>
46 
47 /*
48  * exacct usage and recording routines
49  *
50  * wracct(2), getacct(2), and the records written at process or task
51  * termination are constructed using the exacct_assemble_[task,proc]_usage()
52  * functions, which take a callback that takes the appropriate action on
53  * the packed exacct record for the task or process.  For the process-related
54  * actions, we partition the routines such that the data collecting component
55  * can be performed while holding p_lock, and all sleeping or blocking
56  * operations can be performed without acquiring p_lock.
57  *
58  * putacct(2), which allows an application to construct a customized record
59  * associated with an existing process or task, has its own entry points:
60  * exacct_tag_task() and exacct_tag_proc().
61  */
62 
/*
 * Task queue handle.  It is not referenced by the routines visible in this
 * part of the file.  NOTE(review): presumably the queue on which
 * exacct_commit_task() is dispatched -- confirm against the init code.
 */
taskq_t *exacct_queue;
/* kmem cache from which ea_alloc_item() and ea_alloc_group() allocate. */
kmem_cache_t *exacct_object_cache;

/*
 * Zone key passed to zone_getspecific() to obtain a zone's
 * struct exacct_globals.  exacct_commit_task() treats the value
 * ZONE_KEY_UNINITIALIZED as "the acctctl module isn't loaded".
 */
zone_key_t exacct_zone_key = ZONE_KEY_UNINITIALIZED;

/* Constant items placed in the file header by exacct_create_header(). */
static const uint32_t exacct_version = EXACCT_VERSION;
static const char exacct_header[] = "exacct";
static const char exacct_creator[] = "SunOS";
71 
72 ea_object_t *
73 ea_alloc_item(ea_catalog_t catalog, void *buf, size_t bufsz)
74 {
75 	ea_object_t *item;
76 
77 	item = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
78 	bzero(item, sizeof (ea_object_t));
79 	(void) ea_set_item(item, catalog, buf, bufsz);
80 	return (item);
81 }
82 
83 ea_object_t *
84 ea_alloc_group(ea_catalog_t catalog)
85 {
86 	ea_object_t *group;
87 
88 	group = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
89 	bzero(group, sizeof (ea_object_t));
90 	(void) ea_set_group(group, catalog);
91 	return (group);
92 }
93 
94 ea_object_t *
95 ea_attach_item(ea_object_t *grp, void *buf, size_t bufsz, ea_catalog_t catalog)
96 {
97 	ea_object_t *item;
98 
99 	item = ea_alloc_item(catalog, buf, bufsz);
100 	(void) ea_attach_to_group(grp, item);
101 	return (item);
102 }
103 
104 /*
105  * exacct_add_task_mstate() and exacct_sub_task_mstate() add and subtract
106  * microstate accounting data and resource usage counters from one task_usage_t
107  * from those supplied in another. These functions do not operate on *all*
108  * members of a task_usage_t: for some (e.g. tu_anctaskid) it would not make
109  * sense.
110  */
111 static void
112 exacct_add_task_mstate(task_usage_t *tu, task_usage_t *delta)
113 {
114 	tu->tu_utime  += delta->tu_utime;
115 	tu->tu_stime  += delta->tu_stime;
116 	tu->tu_minflt += delta->tu_minflt;
117 	tu->tu_majflt += delta->tu_majflt;
118 	tu->tu_sndmsg += delta->tu_sndmsg;
119 	tu->tu_rcvmsg += delta->tu_rcvmsg;
120 	tu->tu_ioch   += delta->tu_ioch;
121 	tu->tu_iblk   += delta->tu_iblk;
122 	tu->tu_oblk   += delta->tu_oblk;
123 	tu->tu_vcsw   += delta->tu_vcsw;
124 	tu->tu_icsw   += delta->tu_icsw;
125 	tu->tu_nsig   += delta->tu_nsig;
126 	tu->tu_nswp   += delta->tu_nswp;
127 	tu->tu_nscl   += delta->tu_nscl;
128 }
129 
130 /*
131  * See the comments for exacct_add_task_mstate(), above.
132  */
133 static void
134 exacct_sub_task_mstate(task_usage_t *tu, task_usage_t *delta)
135 {
136 	tu->tu_utime  -= delta->tu_utime;
137 	tu->tu_stime  -= delta->tu_stime;
138 	tu->tu_minflt -= delta->tu_minflt;
139 	tu->tu_majflt -= delta->tu_majflt;
140 	tu->tu_sndmsg -= delta->tu_sndmsg;
141 	tu->tu_rcvmsg -= delta->tu_rcvmsg;
142 	tu->tu_ioch   -= delta->tu_ioch;
143 	tu->tu_iblk   -= delta->tu_iblk;
144 	tu->tu_oblk   -= delta->tu_oblk;
145 	tu->tu_vcsw   -= delta->tu_vcsw;
146 	tu->tu_icsw   -= delta->tu_icsw;
147 	tu->tu_nsig   -= delta->tu_nsig;
148 	tu->tu_nswp   -= delta->tu_nswp;
149 	tu->tu_nscl   -= delta->tu_nscl;
150 }
151 
152 /*
153  * exacct_vn_write() is a vn_rdwr wrapper that protects us from corrupting the
154  * accounting file in case of an I/O or filesystem error.  acctctl() prevents
155  * the two accounting vnodes from being equal, and the appropriate ac_lock is
156  * held across the call, so we're single threaded through this code for each
157  * file.
158  */
159 static int
160 exacct_vn_write(ac_info_t *info, void *buf, ssize_t bufsize)
161 {
162 	int error = 0;
163 	ssize_t resid;
164 	struct vattr va;
165 
166 	if (info == NULL)
167 		return (0);
168 
169 	mutex_enter(&info->ac_lock);
170 
171 	/*
172 	 * Don't do anything unless accounting file is set.
173 	 */
174 	if (info->ac_vnode == NULL) {
175 		mutex_exit(&info->ac_lock);
176 		return (0);
177 	}
178 
179 	/*
180 	 * Save the size. If vn_rdwr fails, reset the size to avoid corrupting
181 	 * the present accounting file.
182 	 */
183 	va.va_mask = AT_SIZE;
184 	error = VOP_GETATTR(info->ac_vnode, &va, 0, kcred);
185 	if (error == 0) {
186 		error = vn_rdwr(UIO_WRITE, info->ac_vnode, (caddr_t)buf,
187 		    bufsize, 0LL, UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFFSET_T,
188 		    kcred, &resid);
189 		if (error) {
190 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
191 		} else if (resid != 0) {
192 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
193 			error = ENOSPC;
194 		}
195 	}
196 	mutex_exit(&info->ac_lock);
197 
198 	return (error);
199 }
200 
201 /*
202  * void *exacct_create_header(size_t *)
203  *
204  * Overview
205  *   exacct_create_header() constructs an exacct file header identifying the
206  *   accounting file as the output of the kernel.  exacct_create_header() and
207  *   the static write_header() and verify_header() routines in libexacct must
208  *   remain synchronized.
209  *
210  * Return values
211  *   A pointer to a packed exacct buffer containing the appropriate header is
212  *   returned; the size of the buffer is placed in the location indicated by
213  *   sizep.
214  *
215  * Caller's context
216  *   Suitable for KM_SLEEP allocations.
217  */
218 void *
219 exacct_create_header(size_t *sizep)
220 {
221 	ea_object_t *hdr_grp;
222 	uint32_t bskip;
223 	void *buf;
224 	size_t bufsize;
225 
226 	hdr_grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_HEADER);
227 	(void) ea_attach_item(hdr_grp, (void *)&exacct_version, 0,
228 	    EXT_UINT32 | EXC_DEFAULT | EXD_VERSION);
229 	(void) ea_attach_item(hdr_grp, (void *)exacct_header, 0,
230 	    EXT_STRING | EXC_DEFAULT | EXD_FILETYPE);
231 	(void) ea_attach_item(hdr_grp, (void *)exacct_creator, 0,
232 	    EXT_STRING | EXC_DEFAULT | EXD_CREATOR);
233 	(void) ea_attach_item(hdr_grp, uts_nodename(), 0,
234 	    EXT_STRING | EXC_DEFAULT | EXD_HOSTNAME);
235 
236 	bufsize = ea_pack_object(hdr_grp, NULL, 0);
237 	buf = kmem_alloc(bufsize, KM_SLEEP);
238 	(void) ea_pack_object(hdr_grp, buf, bufsize);
239 	ea_free_object(hdr_grp, EUP_ALLOC);
240 
241 	/*
242 	 * To prevent reading the header when reading the file backwards,
243 	 * set the large backskip of the header group to 0 (last 4 bytes).
244 	 */
245 	bskip = 0;
246 	exacct_order32(&bskip);
247 	bcopy(&bskip, (char *)buf + bufsize - sizeof (bskip),
248 	    sizeof (bskip));
249 
250 	*sizep = bufsize;
251 	return (buf);
252 }
253 
254 /*
255  * int exacct_write_header(ac_info_t *, void *, size_t)
256  *
257  * Overview
258  *   exacct_write_header() writes the given header buffer to the indicated
259  *   vnode, and frees the buffer.
260  *
261  * Return values
262  *   The result of the write operation is returned.
263  *
264  * Caller's context
265  *   Caller must not hold the ac_lock of the appropriate accounting file
266  *   information block (ac_info_t).
267  */
268 int
269 exacct_write_header(ac_info_t *info, void *hdr, size_t hdrsize)
270 {
271 	int error;
272 
273 	error = exacct_vn_write(info, hdr, hdrsize);
274 	kmem_free(hdr, hdrsize);
275 	return (error);
276 }
277 
278 static void
279 exacct_get_interval_task_usage(task_t *tk, task_usage_t *tu,
280     task_usage_t **tu_buf)
281 {
282 	task_usage_t *oldtu, *newtu;
283 	task_usage_t **prevusage;
284 
285 	ASSERT(MUTEX_HELD(&tk->tk_usage_lock));
286 	if (getzoneid() != GLOBAL_ZONEID) {
287 		prevusage = &tk->tk_zoneusage;
288 	} else {
289 		prevusage = &tk->tk_prevusage;
290 	}
291 	if ((oldtu = *prevusage) != NULL) {
292 		/*
293 		 * In case we have any accounting information
294 		 * saved from the previous interval record.
295 		 */
296 		newtu = *tu_buf;
297 		bcopy(tu, newtu, sizeof (task_usage_t));
298 		tu->tu_minflt	-= oldtu->tu_minflt;
299 		tu->tu_majflt	-= oldtu->tu_majflt;
300 		tu->tu_sndmsg	-= oldtu->tu_sndmsg;
301 		tu->tu_rcvmsg	-= oldtu->tu_rcvmsg;
302 		tu->tu_ioch	-= oldtu->tu_ioch;
303 		tu->tu_iblk	-= oldtu->tu_iblk;
304 		tu->tu_oblk	-= oldtu->tu_oblk;
305 		tu->tu_vcsw	-= oldtu->tu_vcsw;
306 		tu->tu_icsw	-= oldtu->tu_icsw;
307 		tu->tu_nsig	-= oldtu->tu_nsig;
308 		tu->tu_nswp	-= oldtu->tu_nswp;
309 		tu->tu_nscl	-= oldtu->tu_nscl;
310 		tu->tu_utime	-= oldtu->tu_utime;
311 		tu->tu_stime	-= oldtu->tu_stime;
312 
313 		tu->tu_startsec = oldtu->tu_finishsec;
314 		tu->tu_startnsec = oldtu->tu_finishnsec;
315 		/*
316 		 * Copy the data from our temporary storage to the task's
317 		 * previous interval usage structure for future reference.
318 		 */
319 		bcopy(newtu, oldtu, sizeof (task_usage_t));
320 	} else {
321 		/*
322 		 * Store current statistics in the task's previous interval
323 		 * usage structure for future references.
324 		 */
325 		*prevusage = *tu_buf;
326 		bcopy(tu, *prevusage, sizeof (task_usage_t));
327 		*tu_buf = NULL;
328 	}
329 }
330 
/*
 * Add to *tu the current CPU times and resource usage counters of every
 * process on tk's member list, subtract the usage recorded in
 * tk->tk_inherited, and stamp tu's finish time with the current time.
 *
 * pidlock must be held.  If the task has no member processes, tu is left
 * completely untouched (the finish time is not set either).
 */
static void
exacct_snapshot_task_usage(task_t *tk, task_usage_t *tu)
{
	timestruc_t ts;
	proc_t *p;

	ASSERT(MUTEX_HELD(&pidlock));

	if ((p = tk->tk_memb_list) == NULL)
		return;

	/*
	 * exacct_snapshot_task_usage() provides an approximate snapshot of the
	 * usage of the potentially many members of the task.  Since we don't
	 * guarantee exactness, each member's p_lock is held only around the
	 * mstate_aggr_state() calls; the p_ru counters are read without it.
	 */
	do {
		mutex_enter(&p->p_lock);
		tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
		tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
		mutex_exit(&p->p_lock);
		tu->tu_minflt	+= p->p_ru.minflt;
		tu->tu_majflt	+= p->p_ru.majflt;
		tu->tu_sndmsg	+= p->p_ru.msgsnd;
		tu->tu_rcvmsg	+= p->p_ru.msgrcv;
		tu->tu_ioch	+= p->p_ru.ioch;
		tu->tu_iblk	+= p->p_ru.inblock;
		tu->tu_oblk	+= p->p_ru.oublock;
		tu->tu_vcsw	+= p->p_ru.nvcsw;
		tu->tu_icsw	+= p->p_ru.nivcsw;
		tu->tu_nsig	+= p->p_ru.nsignals;
		tu->tu_nswp	+= p->p_ru.nswap;
		tu->tu_nscl	+= p->p_ru.sysc;
	} while ((p = p->p_tasknext) != tk->tk_memb_list);

	/*
	 * The resource usage accounted for so far will include that
	 * contributed by the task's first process. If this process
	 * came from another task, then its accumulated resource usage
	 * will include a contribution from work performed there.
	 * We must therefore subtract any resource usage that was
	 * inherited with the first process.
	 */
	exacct_sub_task_mstate(tu, tk->tk_inherited);

	/* Record the time of this snapshot as the finish time. */
	gethrestime(&ts);
	tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
	tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
}
381 
382 /*
383  * void exacct_update_task_mstate(proc_t *)
384  *
385  * Overview
386  *   exacct_update_task_mstate() updates the task usage; it is intended
387  *   to be called from proc_exit().
388  *
389  * Return values
390  *   None.
391  *
392  * Caller's context
393  *   p_lock must be held at entry.
394  */
395 void
396 exacct_update_task_mstate(proc_t *p)
397 {
398 	task_usage_t *tu;
399 
400 	mutex_enter(&p->p_task->tk_usage_lock);
401 	tu = p->p_task->tk_usage;
402 	tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
403 	tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
404 	tu->tu_minflt	+= p->p_ru.minflt;
405 	tu->tu_majflt	+= p->p_ru.majflt;
406 	tu->tu_sndmsg	+= p->p_ru.msgsnd;
407 	tu->tu_rcvmsg	+= p->p_ru.msgrcv;
408 	tu->tu_ioch	+= p->p_ru.ioch;
409 	tu->tu_iblk	+= p->p_ru.inblock;
410 	tu->tu_oblk	+= p->p_ru.oublock;
411 	tu->tu_vcsw	+= p->p_ru.nvcsw;
412 	tu->tu_icsw	+= p->p_ru.nivcsw;
413 	tu->tu_nsig	+= p->p_ru.nsignals;
414 	tu->tu_nswp	+= p->p_ru.nswap;
415 	tu->tu_nscl	+= p->p_ru.sysc;
416 	mutex_exit(&p->p_task->tk_usage_lock);
417 }
418 
419 static void
420 exacct_calculate_task_usage(task_t *tk, task_usage_t *tu, int flag)
421 {
422 	timestruc_t ts;
423 	task_usage_t *tu_buf;
424 
425 	switch (flag) {
426 	case EW_PARTIAL:
427 		/*
428 		 * For partial records we must report the sum of current
429 		 * accounting statistics with previously accumulated
430 		 * statistics.
431 		 */
432 		mutex_enter(&pidlock);
433 		mutex_enter(&tk->tk_usage_lock);
434 
435 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
436 		exacct_snapshot_task_usage(tk, tu);
437 
438 		mutex_exit(&tk->tk_usage_lock);
439 		mutex_exit(&pidlock);
440 		break;
441 	case EW_INTERVAL:
442 		/*
443 		 * We need to allocate spare task_usage_t buffer before
444 		 * grabbing pidlock because we might need it later in
445 		 * exacct_get_interval_task_usage().
446 		 */
447 		tu_buf = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
448 		mutex_enter(&pidlock);
449 		mutex_enter(&tk->tk_usage_lock);
450 
451 		/*
452 		 * For interval records, we deduct the previous microstate
453 		 * accounting data and cpu usage times from previously saved
454 		 * results and update the previous task usage structure.
455 		 */
456 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
457 		exacct_snapshot_task_usage(tk, tu);
458 		exacct_get_interval_task_usage(tk, tu, &tu_buf);
459 
460 		mutex_exit(&tk->tk_usage_lock);
461 		mutex_exit(&pidlock);
462 
463 		if (tu_buf != NULL)
464 			kmem_free(tu_buf, sizeof (task_usage_t));
465 		break;
466 	case EW_FINAL:
467 		/*
468 		 * For final records, we deduct, from the task's current
469 		 * usage, any usage that was inherited with the arrival
470 		 * of a process from a previous task. We then record
471 		 * the task's finish time.
472 		 */
473 		mutex_enter(&tk->tk_usage_lock);
474 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
475 		exacct_sub_task_mstate(tu, tk->tk_inherited);
476 		mutex_exit(&tk->tk_usage_lock);
477 
478 		gethrestime(&ts);
479 		tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
480 		tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
481 
482 		break;
483 	}
484 }
485 
/*
 * Attach to the group `record' the item or items that make up the task
 * accounting resource `res', taking values from the task tk and from the
 * usage structure tu.
 *
 * Returns 1 if res is a resource handled here (its items were attached), and
 * 0 if it is not, in which case nothing is attached.  The results of the
 * individual ea_attach_item() calls are ignored.  The order of the calls
 * within a case is the order in which the items are attached to the group.
 */
static int
exacct_attach_task_item(task_t *tk, task_usage_t *tu, ea_object_t *record,
    int res)
{
	int attached = 1;

	switch (res) {
	case AC_TASK_TASKID:
		(void) ea_attach_item(record, &tk->tk_tkid,
		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_TASKID);
		break;
	case AC_TASK_PROJID:
		(void) ea_attach_item(record, &tk->tk_proj->kpj_id,
		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_PROJID);
		break;
	case AC_TASK_CPU: {
			timestruc_t ts;
			uint64_t ui;

			/*
			 * tu_stime and tu_utime are converted with hrt2ts()
			 * and emitted as separate sec/nsec items.  The local
			 * `ui' is reused for every item; this assumes that
			 * ea_attach_item() copies the value rather than
			 * keeping the pointer -- TODO confirm in ea_set_item().
			 */
			hrt2ts(tu->tu_stime, &ts);
			ui = ts.tv_sec;
			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
			    EXT_UINT64 | EXD_TASK_CPU_SYS_SEC);
			ui = ts.tv_nsec;
			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
			    EXT_UINT64 | EXD_TASK_CPU_SYS_NSEC);

			hrt2ts(tu->tu_utime, &ts);
			ui = ts.tv_sec;
			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
			    EXT_UINT64 | EXD_TASK_CPU_USER_SEC);
			ui = ts.tv_nsec;
			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
			    EXT_UINT64 | EXD_TASK_CPU_USER_NSEC);
		}
		break;
	case AC_TASK_TIME:
		(void) ea_attach_item(record, &tu->tu_startsec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_SEC);
		(void) ea_attach_item(record, &tu->tu_startnsec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_NSEC);
		(void) ea_attach_item(record, &tu->tu_finishsec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_SEC);
		(void) ea_attach_item(record, &tu->tu_finishnsec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_NSEC);
		break;
	case AC_TASK_HOSTNAME:
		/* The node name of the task's zone, including the NUL. */
		(void) ea_attach_item(record, tk->tk_zone->zone_nodename,
		    strlen(tk->tk_zone->zone_nodename) + 1,
		    EXT_STRING | EXD_TASK_HOSTNAME);
			break;
	case AC_TASK_MICROSTATE:
		(void) ea_attach_item(record, &tu->tu_majflt,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MAJOR);
		(void) ea_attach_item(record, &tu->tu_minflt,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MINOR);
		(void) ea_attach_item(record, &tu->tu_sndmsg,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_SND);
		(void) ea_attach_item(record, &tu->tu_rcvmsg,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_RCV);
		(void) ea_attach_item(record, &tu->tu_iblk,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_IN);
		(void) ea_attach_item(record, &tu->tu_oblk,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_OUT);
		(void) ea_attach_item(record, &tu->tu_ioch,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CHARS_RDWR);
		(void) ea_attach_item(record, &tu->tu_vcsw,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_VOL);
		(void) ea_attach_item(record, &tu->tu_icsw,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_INV);
		(void) ea_attach_item(record, &tu->tu_nsig,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SIGNALS);
		(void) ea_attach_item(record, &tu->tu_nswp,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SWAPS);
		(void) ea_attach_item(record, &tu->tu_nscl,
		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SYSCALLS);
		break;
	case AC_TASK_ANCTASKID:
		(void) ea_attach_item(record, &tu->tu_anctaskid,
		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_ANCTASKID);
		break;
	case AC_TASK_ZONENAME:
		/* The name of the task's zone, including the NUL. */
		(void) ea_attach_item(record, tk->tk_zone->zone_name,
		    strlen(tk->tk_zone->zone_name) + 1,
		    EXT_STRING | EXD_TASK_ZONENAME);
		break;
	default:
		/* Not a resource this routine knows how to emit. */
		attached = 0;
	}
	return (attached);
}
577 
578 static ea_object_t *
579 exacct_assemble_task_record(task_t *tk, task_usage_t *tu, ulong_t *mask,
580     ea_catalog_t record_type)
581 {
582 	int res, count;
583 	ea_object_t *record;
584 
585 	/*
586 	 * Assemble usage values into group.
587 	 */
588 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
589 	for (res = 1, count = 0; res <= AC_TASK_MAX_RES; res++)
590 		if (BT_TEST(mask, res))
591 			count += exacct_attach_task_item(tk, tu, record, res);
592 	if (count == 0) {
593 		ea_free_object(record, EUP_ALLOC);
594 		record = NULL;
595 	}
596 	return (record);
597 }
598 
599 /*
600  * int exacct_assemble_task_usage(task_t *, int (*)(void *, size_t, void *,
601  *	size_t, size_t *), void *, size_t, size_t *, int)
602  *
603  * Overview
604  *   exacct_assemble_task_usage() builds the packed exacct buffer for the
605  *   indicated task, executes the given callback function, and free the packed
606  *   buffer.
607  *
608  * Return values
609  *   Returns 0 on success; otherwise the appropriate error code is returned.
610  *
611  * Caller's context
612  *   Suitable for KM_SLEEP allocations.
613  */
614 int
615 exacct_assemble_task_usage(ac_info_t *ac_task, task_t *tk,
616     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
617     void *ubuf, size_t ubufsize, size_t *actual, int flag)
618 {
619 	ulong_t mask[AC_MASK_SZ];
620 	ea_object_t *task_record;
621 	ea_catalog_t record_type;
622 	task_usage_t *tu;
623 	void *buf;
624 	size_t bufsize;
625 	int ret;
626 
627 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL || flag == EW_INTERVAL);
628 
629 	mutex_enter(&ac_task->ac_lock);
630 	if (ac_task->ac_state == AC_OFF) {
631 		mutex_exit(&ac_task->ac_lock);
632 		return (ENOTACTIVE);
633 	}
634 	bt_copy(ac_task->ac_mask, mask, AC_MASK_SZ);
635 	mutex_exit(&ac_task->ac_lock);
636 
637 	switch (flag) {
638 	case EW_FINAL:
639 		record_type = EXD_GROUP_TASK;
640 		break;
641 	case EW_PARTIAL:
642 		record_type = EXD_GROUP_TASK_PARTIAL;
643 		break;
644 	case EW_INTERVAL:
645 		record_type = EXD_GROUP_TASK_INTERVAL;
646 		break;
647 	}
648 
649 	/*
650 	 * Calculate task usage and assemble it into the task record.
651 	 */
652 	tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
653 	exacct_calculate_task_usage(tk, tu, flag);
654 	task_record = exacct_assemble_task_record(tk, tu, mask, record_type);
655 	if (task_record == NULL) {
656 		/*
657 		 * The current configuration of the accounting system has
658 		 * resulted in records with no data; accordingly, we don't write
659 		 * these, but we return success.
660 		 */
661 		kmem_free(tu, sizeof (task_usage_t));
662 		return (0);
663 	}
664 
665 	/*
666 	 * Pack object into buffer and run callback on it.
667 	 */
668 	bufsize = ea_pack_object(task_record, NULL, 0);
669 	buf = kmem_alloc(bufsize, KM_SLEEP);
670 	(void) ea_pack_object(task_record, buf, bufsize);
671 	ret = callback(ac_task, ubuf, ubufsize, buf, bufsize, actual);
672 
673 	/*
674 	 * Free all previously allocated structures.
675 	 */
676 	kmem_free(buf, bufsize);
677 	ea_free_object(task_record, EUP_ALLOC);
678 	kmem_free(tu, sizeof (task_usage_t));
679 	return (ret);
680 }
681 
682 /*
683  * void exacct_commit_task(void *)
684  *
685  * Overview
686  *   exacct_commit_task() calculates the final usage for a task, updating the
687  *   task usage if task accounting is active, and writing a task record if task
688  *   accounting is active.  exacct_commit_task() is intended for being called
689  *   from a task queue (taskq_t).
690  *
691  * Return values
692  *   None.
693  *
694  * Caller's context
695  *   Suitable for KM_SLEEP allocations.
696  */
697 
698 void
699 exacct_commit_task(void *arg)
700 {
701 	task_t *tk = (task_t *)arg;
702 	size_t size;
703 	zone_t *zone = tk->tk_zone;
704 	struct exacct_globals *acg;
705 
706 	ASSERT(tk != task0p);
707 	ASSERT(tk->tk_memb_list == NULL);
708 
709 	/*
710 	 * Don't do any extra work if the acctctl module isn't loaded.
711 	 */
712 	if (exacct_zone_key != ZONE_KEY_UNINITIALIZED) {
713 		acg = zone_getspecific(exacct_zone_key, zone);
714 		(void) exacct_assemble_task_usage(&acg->ac_task, tk,
715 		    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
716 		if (tk->tk_zone != global_zone) {
717 			acg = zone_getspecific(exacct_zone_key, global_zone);
718 			(void) exacct_assemble_task_usage(&acg->ac_task, tk,
719 			    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
720 		}
721 	}
722 	/*
723 	 * Release associated project and finalize task.
724 	 */
725 	task_end(tk);
726 }
727 
/*
 * Attach to the group `record' the item or items that make up the process
 * accounting resource `res', taking all values from the proc_usage_t pu.
 *
 * Returns 1 if res is a resource handled here (its items were attached), and
 * 0 if it is not, in which case nothing is attached.  The results of the
 * individual ea_attach_item() calls are ignored.  The order of the calls
 * within a case is the order in which the items are attached to the group.
 */
static int
exacct_attach_proc_item(proc_usage_t *pu, ea_object_t *record, int res)
{
	int attached = 1;

	switch (res) {
	case AC_PROC_PID:
		(void) ea_attach_item(record, &pu->pu_pid,
		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PID);
		break;
	case AC_PROC_UID:
		(void) ea_attach_item(record, &pu->pu_ruid,
		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_UID);
		break;
	case AC_PROC_FLAG:
		(void) ea_attach_item(record, &pu->pu_acflag,
		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ACCT_FLAGS);
		break;
	case AC_PROC_GID:
		(void) ea_attach_item(record, &pu->pu_rgid,
		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_GID);
		break;
	case AC_PROC_PROJID:
		(void) ea_attach_item(record, &pu->pu_projid,
		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PROJID);
		break;
	case AC_PROC_TASKID:
		(void) ea_attach_item(record, &pu->pu_taskid,
		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TASKID);
		break;
	case AC_PROC_CPU:
		/* User then system CPU time, each as a sec/nsec pair. */
		(void) ea_attach_item(record, &pu->pu_utimesec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_SEC);
		(void) ea_attach_item(record, &pu->pu_utimensec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_NSEC);
		(void) ea_attach_item(record, &pu->pu_stimesec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_SEC);
		(void) ea_attach_item(record, &pu->pu_stimensec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_NSEC);
		break;
	case AC_PROC_TIME:
		/* Start then finish time, each as a sec/nsec pair. */
		(void) ea_attach_item(record, &pu->pu_startsec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_SEC);
		(void) ea_attach_item(record, &pu->pu_startnsec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_NSEC);
		(void) ea_attach_item(record, &pu->pu_finishsec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_SEC);
		(void) ea_attach_item(record, &pu->pu_finishnsec,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_NSEC);
		break;
	case AC_PROC_COMMAND:
		/* String sizes below include the terminating NUL. */
		(void) ea_attach_item(record, pu->pu_command,
		    strlen(pu->pu_command) + 1, EXT_STRING | EXD_PROC_COMMAND);
		break;
	case AC_PROC_HOSTNAME:
		(void) ea_attach_item(record, pu->pu_nodename,
		    strlen(pu->pu_nodename) + 1,
		    EXT_STRING | EXD_PROC_HOSTNAME);
		break;
	case AC_PROC_TTY:
		(void) ea_attach_item(record, &pu->pu_major,
		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MAJOR);
		(void) ea_attach_item(record, &pu->pu_minor,
		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MINOR);
		break;
	case AC_PROC_MICROSTATE:
		(void) ea_attach_item(record, &pu->pu_majflt,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MAJOR);
		(void) ea_attach_item(record, &pu->pu_minflt,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MINOR);
		(void) ea_attach_item(record, &pu->pu_sndmsg,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_SND);
		(void) ea_attach_item(record, &pu->pu_rcvmsg,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_RCV);
		(void) ea_attach_item(record, &pu->pu_iblk,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_IN);
		(void) ea_attach_item(record, &pu->pu_oblk,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_OUT);
		(void) ea_attach_item(record, &pu->pu_ioch,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CHARS_RDWR);
		(void) ea_attach_item(record, &pu->pu_vcsw,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_VOL);
		(void) ea_attach_item(record, &pu->pu_icsw,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_INV);
		(void) ea_attach_item(record, &pu->pu_nsig,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SIGNALS);
		(void) ea_attach_item(record, &pu->pu_nswp,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SWAPS);
		(void) ea_attach_item(record, &pu->pu_nscl,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SYSCALLS);
		break;
	case AC_PROC_ANCPID:
		(void) ea_attach_item(record, &pu->pu_ancpid,
		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ANCPID);
		break;
	case AC_PROC_WAIT_STATUS:
		(void) ea_attach_item(record, &pu->pu_wstat,
		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_WAIT_STATUS);
		break;
	case AC_PROC_ZONENAME:
		(void) ea_attach_item(record, pu->pu_zonename,
		    strlen(pu->pu_zonename) + 1,
		    EXT_STRING | EXD_PROC_ZONENAME);
		break;
	case AC_PROC_MEM:
		/* Average then maximum RSS, both in kilobytes. */
		(void) ea_attach_item(record, &pu->pu_mem_rss_avg,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_AVG_K);
		(void) ea_attach_item(record, &pu->pu_mem_rss_max,
		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_MAX_K);
		break;
	default:
		/* Not a resource this routine knows how to emit. */
		attached = 0;
	}
	return (attached);
}
843 
844 static ea_object_t *
845 exacct_assemble_proc_record(proc_usage_t *pu, ulong_t *mask,
846     ea_catalog_t record_type)
847 {
848 	int res, count;
849 	ea_object_t *record;
850 
851 	/*
852 	 * Assemble usage values into group.
853 	 */
854 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
855 	for (res = 1, count = 0; res <= AC_PROC_MAX_RES; res++)
856 		if (BT_TEST(mask, res))
857 			count += exacct_attach_proc_item(pu, record, res);
858 	if (count == 0) {
859 		ea_free_object(record, EUP_ALLOC);
860 		record = NULL;
861 	}
862 	return (record);
863 }
864 
865 /*
866  * The following two routines assume that process's p_lock is held or
867  * exacct_commit_proc has been called from exit() when all lwps are stopped.
868  */
869 static void
870 exacct_calculate_proc_mstate(proc_t *p, proc_usage_t *pu)
871 {
872 	kthread_t *t;
873 
874 	ASSERT(MUTEX_HELD(&p->p_lock));
875 	if ((t = p->p_tlist) == NULL)
876 		return;
877 
878 	do {
879 		pu->pu_minflt	+= t->t_lwp->lwp_ru.minflt;
880 		pu->pu_majflt	+= t->t_lwp->lwp_ru.majflt;
881 		pu->pu_sndmsg	+= t->t_lwp->lwp_ru.msgsnd;
882 		pu->pu_rcvmsg	+= t->t_lwp->lwp_ru.msgrcv;
883 		pu->pu_ioch	+= t->t_lwp->lwp_ru.ioch;
884 		pu->pu_iblk	+= t->t_lwp->lwp_ru.inblock;
885 		pu->pu_oblk	+= t->t_lwp->lwp_ru.oublock;
886 		pu->pu_vcsw	+= t->t_lwp->lwp_ru.nvcsw;
887 		pu->pu_icsw	+= t->t_lwp->lwp_ru.nivcsw;
888 		pu->pu_nsig	+= t->t_lwp->lwp_ru.nsignals;
889 		pu->pu_nswp	+= t->t_lwp->lwp_ru.nswap;
890 		pu->pu_nscl	+= t->t_lwp->lwp_ru.sysc;
891 	} while ((t = t->t_forw) != p->p_tlist);
892 }
893 
894 static void
895 exacct_copy_proc_mstate(proc_t *p, proc_usage_t *pu)
896 {
897 	pu->pu_minflt	= p->p_ru.minflt;
898 	pu->pu_majflt	= p->p_ru.majflt;
899 	pu->pu_sndmsg	= p->p_ru.msgsnd;
900 	pu->pu_rcvmsg	= p->p_ru.msgrcv;
901 	pu->pu_ioch	= p->p_ru.ioch;
902 	pu->pu_iblk	= p->p_ru.inblock;
903 	pu->pu_oblk	= p->p_ru.oublock;
904 	pu->pu_vcsw	= p->p_ru.nvcsw;
905 	pu->pu_icsw	= p->p_ru.nivcsw;
906 	pu->pu_nsig	= p->p_ru.nsignals;
907 	pu->pu_nswp	= p->p_ru.nswap;
908 	pu->pu_nscl	= p->p_ru.sysc;
909 }
910 
911 void
912 exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask,
913     int flag, int wstat)
914 {
915 	timestruc_t ts, ts_run;
916 
917 	ASSERT(MUTEX_HELD(&p->p_lock));
918 
919 	/*
920 	 * Convert CPU and execution times to sec/nsec format.
921 	 */
922 	if (BT_TEST(mask, AC_PROC_CPU)) {
923 		hrt2ts(mstate_aggr_state(p, LMS_USER), &ts);
924 		pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec;
925 		pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec;
926 		hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts);
927 		pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec;
928 		pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec;
929 	}
930 	if (BT_TEST(mask, AC_PROC_TIME)) {
931 		gethrestime(&ts);
932 		pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
933 		pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
934 		hrt2ts(gethrtime() - p->p_mstart, &ts_run);
935 		ts.tv_sec -= ts_run.tv_sec;
936 		ts.tv_nsec -= ts_run.tv_nsec;
937 		if (ts.tv_nsec < 0) {
938 			ts.tv_sec--;
939 			if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) {
940 				ts.tv_sec++;
941 				ts.tv_nsec -= NANOSEC;
942 			}
943 		}
944 		pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec;
945 		pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec;
946 	}
947 
948 	pu->pu_pid = p->p_pidp->pid_id;
949 	pu->pu_acflag = p->p_user.u_acflag;
950 	pu->pu_projid = p->p_task->tk_proj->kpj_id;
951 	pu->pu_taskid = p->p_task->tk_tkid;
952 	pu->pu_major = getmajor(p->p_sessp->s_dev);
953 	pu->pu_minor = getminor(p->p_sessp->s_dev);
954 	pu->pu_ancpid = p->p_ancpid;
955 	pu->pu_wstat = wstat;
956 	/*
957 	 * Compute average RSS in K.  The denominator is the number of
958 	 * samples:  the number of clock ticks plus the initial value.
959 	 */
960 	pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) *
961 	    (PAGESIZE / 1024);
962 	pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024);
963 
964 	mutex_enter(&p->p_crlock);
965 	pu->pu_ruid = crgetruid(p->p_cred);
966 	pu->pu_rgid = crgetrgid(p->p_cred);
967 	mutex_exit(&p->p_crlock);
968 
969 	bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1);
970 	bcopy(p->p_zone->zone_name, pu->pu_zonename,
971 	    strlen(p->p_zone->zone_name) + 1);
972 	bcopy(p->p_zone->zone_nodename, pu->pu_nodename,
973 	    strlen(p->p_zone->zone_nodename) + 1);
974 
975 	/*
976 	 * Calculate microstate accounting data for a process that is still
977 	 * running.  Presently, we explicitly collect all of the LWP usage into
978 	 * the proc usage structure here.
979 	 */
980 	if (flag & EW_PARTIAL)
981 		exacct_calculate_proc_mstate(p, pu);
982 	if (flag & EW_FINAL)
983 		exacct_copy_proc_mstate(p, pu);
984 }
985 
986 /*
987  * int exacct_assemble_proc_usage(proc_usage_t *, int (*)(void *, size_t, void
988  *	*, size_t, size_t *), void *, size_t, size_t *)
989  *
990  * Overview
991  *   Assemble record with miscellaneous accounting information about the process
992  *   and execute the callback on it. It is the callback's job to set "actual" to
993  *   the size of record.
994  *
995  * Return values
996  *   The result of the callback function, unless the extended process accounting
997  *   feature is not active, in which case ENOTACTIVE is returned.
998  *
999  * Caller's context
1000  *   Suitable for KM_SLEEP allocations.
1001  */
1002 int
1003 exacct_assemble_proc_usage(ac_info_t *ac_proc, proc_usage_t *pu,
1004     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1005     void *ubuf, size_t ubufsize, size_t *actual, int flag)
1006 {
1007 	ulong_t mask[AC_MASK_SZ];
1008 	ea_object_t *proc_record;
1009 	ea_catalog_t record_type;
1010 	void *buf;
1011 	size_t bufsize;
1012 	int ret;
1013 
1014 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL);
1015 
1016 	mutex_enter(&ac_proc->ac_lock);
1017 	if (ac_proc->ac_state == AC_OFF) {
1018 		mutex_exit(&ac_proc->ac_lock);
1019 		return (ENOTACTIVE);
1020 	}
1021 	bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1022 	mutex_exit(&ac_proc->ac_lock);
1023 
1024 	switch (flag) {
1025 	case EW_FINAL:
1026 		record_type = EXD_GROUP_PROC;
1027 		break;
1028 	case EW_PARTIAL:
1029 		record_type = EXD_GROUP_PROC_PARTIAL;
1030 		break;
1031 	}
1032 
1033 	proc_record = exacct_assemble_proc_record(pu, mask, record_type);
1034 	if (proc_record == NULL)
1035 		return (0);
1036 
1037 	/*
1038 	 * Pack object into buffer and pass to callback.
1039 	 */
1040 	bufsize = ea_pack_object(proc_record, NULL, 0);
1041 	buf = kmem_alloc(bufsize, KM_SLEEP);
1042 	(void) ea_pack_object(proc_record, buf, bufsize);
1043 
1044 	ret = callback(ac_proc, ubuf, ubufsize, buf, bufsize, actual);
1045 
1046 	/*
1047 	 * Free all previously allocations.
1048 	 */
1049 	kmem_free(buf, bufsize);
1050 	ea_free_object(proc_record, EUP_ALLOC);
1051 	return (ret);
1052 }
1053 
1054 /*
1055  * int exacct_commit_callback(ac_info_t *, void *, size_t, void *, size_t,
1056  * 	size_t *)
1057  *
1058  * Overview
1059  *   exacct_commit_callback() writes the indicated buffer to the indicated
1060  *   extended accounting file.
1061  *
1062  * Return values
1063  *   The result of the write operation is returned.  "actual" is updated to
1064  *   contain the number of bytes actually written.
1065  *
1066  * Caller's context
1067  *   Suitable for a vn_rdwr() operation.
1068  */
1069 /*ARGSUSED*/
1070 int
1071 exacct_commit_callback(ac_info_t *info, void *ubuf, size_t ubufsize,
1072     void *buf, size_t bufsize, size_t *actual)
1073 {
1074 	int error = 0;
1075 
1076 	*actual = 0;
1077 	if ((error = exacct_vn_write(info, buf, bufsize)) == 0)
1078 		*actual = bufsize;
1079 	return (error);
1080 }
1081 
1082 static void
1083 exacct_do_commit_proc(ac_info_t *ac_proc, proc_t *p, int wstat)
1084 {
1085 	size_t size;
1086 	proc_usage_t *pu;
1087 	ulong_t mask[AC_MASK_SZ];
1088 
1089 	mutex_enter(&ac_proc->ac_lock);
1090 	if (ac_proc->ac_state == AC_ON) {
1091 		bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1092 		mutex_exit(&ac_proc->ac_lock);
1093 	} else {
1094 		mutex_exit(&ac_proc->ac_lock);
1095 		return;
1096 	}
1097 
1098 	mutex_enter(&p->p_lock);
1099 	size = strlen(p->p_user.u_comm) + 1;
1100 	mutex_exit(&p->p_lock);
1101 
1102 	pu = kmem_alloc(sizeof (proc_usage_t), KM_SLEEP);
1103 	pu->pu_command = kmem_alloc(size, KM_SLEEP);
1104 	mutex_enter(&p->p_lock);
1105 	exacct_calculate_proc_usage(p, pu, mask, EW_FINAL, wstat);
1106 	mutex_exit(&p->p_lock);
1107 
1108 	(void) exacct_assemble_proc_usage(ac_proc, pu,
1109 	    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
1110 
1111 	kmem_free(pu->pu_command, strlen(pu->pu_command) + 1);
1112 	kmem_free(pu, sizeof (proc_usage_t));
1113 }
1114 
1115 /*
1116  * void exacct_commit_proc(proc_t *, int)
1117  *
1118  * Overview
1119  *   exacct_commit_proc() calculates the final usage for a process, updating the
1120  *   task usage if task accounting is active, and writing a process record if
1121  *   process accounting is active.  exacct_commit_proc() is intended for being
1122  *   called from proc_exit().
1123  *
1124  * Return values
1125  *   None.
1126  *
1127  * Caller's context
1128  *   Suitable for KM_SLEEP allocations.  p_lock must not be held at entry.
1129  */
1130 void
1131 exacct_commit_proc(proc_t *p, int wstat)
1132 {
1133 	zone_t *zone = p->p_zone;
1134 	struct exacct_globals *acg, *gacg = NULL;
1135 
1136 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1137 		/*
1138 		 * acctctl module not loaded.  Nothing to do.
1139 		 */
1140 		return;
1141 	}
1142 	acg = zone_getspecific(exacct_zone_key, zone);
1143 	exacct_do_commit_proc(&acg->ac_proc, p, wstat);
1144 	if (zone != global_zone) {
1145 		gacg = zone_getspecific(exacct_zone_key, global_zone);
1146 		exacct_do_commit_proc(&gacg->ac_proc, p, wstat);
1147 	}
1148 }
1149 
1150 static int
1151 exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res)
1152 {
1153 	int attached = 1;
1154 
1155 	switch (res) {
1156 	case AC_FLOW_SADDR:
1157 		if (fu->fu_isv4) {
1158 			(void) ea_attach_item(record, &fu->fu_saddr[3],
1159 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4SADDR);
1160 		} else {
1161 			(void) ea_attach_item(record, &fu->fu_saddr,
1162 			    sizeof (fu->fu_saddr), EXT_RAW |
1163 			    EXD_FLOW_V6SADDR);
1164 		}
1165 		break;
1166 	case AC_FLOW_DADDR:
1167 		if (fu->fu_isv4) {
1168 			(void) ea_attach_item(record, &fu->fu_daddr[3],
1169 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4DADDR);
1170 		} else {
1171 			(void) ea_attach_item(record, &fu->fu_daddr,
1172 			    sizeof (fu->fu_daddr), EXT_RAW |
1173 			    EXD_FLOW_V6DADDR);
1174 		}
1175 		break;
1176 	case AC_FLOW_SPORT:
1177 		(void) ea_attach_item(record, &fu->fu_sport,
1178 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_SPORT);
1179 		break;
1180 	case AC_FLOW_DPORT:
1181 		(void) ea_attach_item(record, &fu->fu_dport,
1182 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_DPORT);
1183 		break;
1184 	case AC_FLOW_PROTOCOL:
1185 		(void) ea_attach_item(record, &fu->fu_protocol,
1186 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_PROTOCOL);
1187 		break;
1188 	case AC_FLOW_DSFIELD:
1189 		(void) ea_attach_item(record, &fu->fu_dsfield,
1190 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_DSFIELD);
1191 		break;
1192 	case AC_FLOW_CTIME:
1193 		(void) ea_attach_item(record, &fu->fu_ctime,
1194 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_CTIME);
1195 		break;
1196 	case AC_FLOW_LSEEN:
1197 		(void) ea_attach_item(record, &fu->fu_lseen,
1198 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_LSEEN);
1199 		break;
1200 	case AC_FLOW_NBYTES:
1201 		(void) ea_attach_item(record, &fu->fu_nbytes,
1202 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NBYTES);
1203 		break;
1204 	case AC_FLOW_NPKTS:
1205 		(void) ea_attach_item(record, &fu->fu_npackets,
1206 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NPKTS);
1207 		break;
1208 	case AC_FLOW_PROJID:
1209 		if (fu->fu_projid >= 0) {
1210 			(void) ea_attach_item(record, &fu->fu_projid,
1211 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_PROJID);
1212 		}
1213 		break;
1214 	case AC_FLOW_UID:
1215 		if (fu->fu_userid >= 0) {
1216 			(void) ea_attach_item(record, &fu->fu_userid,
1217 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID);
1218 		}
1219 		break;
1220 	case AC_FLOW_ANAME:
1221 		(void) ea_attach_item(record, fu->fu_aname,
1222 		    strlen(fu->fu_aname) + 1, EXT_STRING | EXD_FLOW_ANAME);
1223 		break;
1224 	default:
1225 		attached = 0;
1226 	}
1227 	return (attached);
1228 }
1229 
1230 static ea_object_t *
1231 exacct_assemble_flow_record(flow_usage_t *fu, ulong_t *mask,
1232     ea_catalog_t record_type)
1233 {
1234 	int res, count;
1235 	ea_object_t *record;
1236 
1237 	/*
1238 	 * Assemble usage values into group.
1239 	 */
1240 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1241 	for (res = 1, count = 0; res <= AC_FLOW_MAX_RES; res++)
1242 		if (BT_TEST(mask, res))
1243 			count += exacct_attach_flow_item(fu, record, res);
1244 	if (count == 0) {
1245 		ea_free_object(record, EUP_ALLOC);
1246 		record = NULL;
1247 	}
1248 	return (record);
1249 }
1250 
1251 int
1252 exacct_assemble_flow_usage(ac_info_t *ac_flow, flow_usage_t *fu,
1253     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1254     void *ubuf, size_t ubufsize, size_t *actual)
1255 {
1256 	ulong_t mask[AC_MASK_SZ];
1257 	ea_object_t *flow_usage;
1258 	ea_catalog_t record_type;
1259 	void *buf;
1260 	size_t bufsize;
1261 	int ret;
1262 
1263 	mutex_enter(&ac_flow->ac_lock);
1264 	if (ac_flow->ac_state == AC_OFF) {
1265 		mutex_exit(&ac_flow->ac_lock);
1266 		return (ENOTACTIVE);
1267 	}
1268 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1269 	mutex_exit(&ac_flow->ac_lock);
1270 
1271 	record_type = EXD_GROUP_FLOW;
1272 
1273 	flow_usage = exacct_assemble_flow_record(fu, mask, record_type);
1274 	if (flow_usage == NULL) {
1275 		return (0);
1276 	}
1277 
1278 	/*
1279 	 * Pack object into buffer and pass to callback.
1280 	 */
1281 	bufsize = ea_pack_object(flow_usage, NULL, 0);
1282 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1283 	if (buf == NULL) {
1284 		return (ENOMEM);
1285 	}
1286 
1287 	(void) ea_pack_object(flow_usage, buf, bufsize);
1288 
1289 	ret = callback(ac_flow, ubuf, ubufsize, buf, bufsize, actual);
1290 
1291 	/*
1292 	 * Free all previously allocations.
1293 	 */
1294 	kmem_free(buf, bufsize);
1295 	ea_free_object(flow_usage, EUP_ALLOC);
1296 	return (ret);
1297 }
1298 
1299 void
1300 exacct_commit_flow(void *arg)
1301 {
1302 	flow_usage_t *f = (flow_usage_t *)arg;
1303 	size_t size;
1304 	ulong_t mask[AC_MASK_SZ];
1305 	struct exacct_globals *acg;
1306 	ac_info_t *ac_flow;
1307 
1308 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1309 		/*
1310 		 * acctctl module not loaded. Nothing to do.
1311 		 */
1312 		return;
1313 	}
1314 
1315 	/*
1316 	 * Even though each zone nominally has its own flow accounting settings
1317 	 * (ac_flow), these are only maintained by and for the global zone.
1318 	 *
1319 	 * If this were to change in the future, this function should grow a
1320 	 * second zoneid (or zone) argument, and use the corresponding zone's
1321 	 * settings rather than always using those of the global zone.
1322 	 */
1323 	acg = zone_getspecific(exacct_zone_key, global_zone);
1324 	ac_flow = &acg->ac_flow;
1325 
1326 	mutex_enter(&ac_flow->ac_lock);
1327 	if (ac_flow->ac_state == AC_OFF) {
1328 		mutex_exit(&ac_flow->ac_lock);
1329 		return;
1330 	}
1331 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1332 	mutex_exit(&ac_flow->ac_lock);
1333 
1334 	(void) exacct_assemble_flow_usage(ac_flow, f, exacct_commit_callback,
1335 	    NULL, 0, &size);
1336 }
1337 
1338 /*
1339  * int exacct_tag_task(task_t *, void *, size_t, int)
1340  *
1341  * Overview
1342  *   exacct_tag_task() provides the exacct record construction and writing
1343  *   support required by putacct(2) for task entities.
1344  *
1345  * Return values
1346  *   The result of the write operation is returned, unless the extended
1347  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1348  *
1349  * Caller's context
1350  *   Suitable for KM_SLEEP allocations.
1351  */
1352 int
1353 exacct_tag_task(ac_info_t *ac_task, task_t *tk, void *ubuf, size_t ubufsz,
1354     int flags)
1355 {
1356 	int error = 0;
1357 	void *buf;
1358 	size_t bufsize;
1359 	ea_catalog_t cat;
1360 	ea_object_t *tag;
1361 
1362 	mutex_enter(&ac_task->ac_lock);
1363 	if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) {
1364 		mutex_exit(&ac_task->ac_lock);
1365 		return (ENOTACTIVE);
1366 	}
1367 	mutex_exit(&ac_task->ac_lock);
1368 
1369 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_TASK_TAG);
1370 	(void) ea_attach_item(tag, &tk->tk_tkid, 0,
1371 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1372 	(void) ea_attach_item(tag, tk->tk_zone->zone_nodename, 0,
1373 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1374 	if (flags == EP_RAW)
1375 		cat = EXT_RAW | EXC_DEFAULT | EXD_TASK_TAG;
1376 	else
1377 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_TASK_TAG;
1378 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1379 
1380 	bufsize = ea_pack_object(tag, NULL, 0);
1381 	buf = kmem_alloc(bufsize, KM_SLEEP);
1382 	(void) ea_pack_object(tag, buf, bufsize);
1383 	error = exacct_vn_write(ac_task, buf, bufsize);
1384 	kmem_free(buf, bufsize);
1385 	ea_free_object(tag, EUP_ALLOC);
1386 	return (error);
1387 }
1388 
1389 /*
1390  * exacct_tag_proc(pid_t, taskid_t, void *, size_t, int, char *)
1391  *
1392  * Overview
1393  *   exacct_tag_proc() provides the exacct record construction and writing
1394  *   support required by putacct(2) for processes.
1395  *
1396  * Return values
1397  *   The result of the write operation is returned, unless the extended
1398  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1399  *
1400  * Caller's context
1401  *   Suitable for KM_SLEEP allocations.
1402  */
1403 int
1404 exacct_tag_proc(ac_info_t *ac_proc, pid_t pid, taskid_t tkid, void *ubuf,
1405     size_t ubufsz, int flags, const char *hostname)
1406 {
1407 	int error = 0;
1408 	void *buf;
1409 	size_t bufsize;
1410 	ea_catalog_t cat;
1411 	ea_object_t *tag;
1412 
1413 	mutex_enter(&ac_proc->ac_lock);
1414 	if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) {
1415 		mutex_exit(&ac_proc->ac_lock);
1416 		return (ENOTACTIVE);
1417 	}
1418 	mutex_exit(&ac_proc->ac_lock);
1419 
1420 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_PROC_TAG);
1421 	(void) ea_attach_item(tag, &pid, sizeof (uint32_t),
1422 	    EXT_UINT32 | EXC_DEFAULT | EXD_PROC_PID);
1423 	(void) ea_attach_item(tag, &tkid, 0,
1424 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1425 	(void) ea_attach_item(tag, (void *)hostname, 0,
1426 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1427 	if (flags == EP_RAW)
1428 		cat = EXT_RAW | EXC_DEFAULT | EXD_PROC_TAG;
1429 	else
1430 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_PROC_TAG;
1431 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1432 
1433 	bufsize = ea_pack_object(tag, NULL, 0);
1434 	buf = kmem_alloc(bufsize, KM_SLEEP);
1435 	(void) ea_pack_object(tag, buf, bufsize);
1436 	error = exacct_vn_write(ac_proc, buf, bufsize);
1437 	kmem_free(buf, bufsize);
1438 	ea_free_object(tag, EUP_ALLOC);
1439 	return (error);
1440 }
1441 
1442 /*
1443  * void exacct_init(void)
1444  *
1445  * Overview
1446  *   Initialized the extended accounting subsystem.
1447  *
1448  * Return values
1449  *   None.
1450  *
1451  * Caller's context
1452  *   Suitable for KM_SLEEP allocations.
1453  */
1454 void
1455 exacct_init()
1456 {
1457 	exacct_queue = system_taskq;
1458 	exacct_object_cache = kmem_cache_create("exacct_object_cache",
1459 	    sizeof (ea_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1460 }
1461 
1462 /*
1463  * exacct_snapshot_proc_mstate() copies a process's microstate accounting data
1464  * and resource usage counters into a given task_usage_t. It differs from
1465  * exacct_copy_proc_mstate() in that here a) we are copying to a task_usage_t,
1466  * b) p_lock will have been acquired earlier in the call path and c) we
1467  * are here including the process's user and system times.
1468  */
1469 static void
1470 exacct_snapshot_proc_mstate(proc_t *p, task_usage_t *tu)
1471 {
1472 	tu->tu_utime  = mstate_aggr_state(p, LMS_USER);
1473 	tu->tu_stime  = mstate_aggr_state(p, LMS_SYSTEM);
1474 	tu->tu_minflt = p->p_ru.minflt;
1475 	tu->tu_majflt = p->p_ru.majflt;
1476 	tu->tu_sndmsg = p->p_ru.msgsnd;
1477 	tu->tu_rcvmsg = p->p_ru.msgrcv;
1478 	tu->tu_ioch   = p->p_ru.ioch;
1479 	tu->tu_iblk   = p->p_ru.inblock;
1480 	tu->tu_oblk   = p->p_ru.oublock;
1481 	tu->tu_vcsw   = p->p_ru.nvcsw;
1482 	tu->tu_icsw   = p->p_ru.nivcsw;
1483 	tu->tu_nsig   = p->p_ru.nsignals;
1484 	tu->tu_nswp   = p->p_ru.nswap;
1485 	tu->tu_nscl   = p->p_ru.sysc;
1486 }
1487 
1488 /*
1489  * void exacct_move_mstate(proc_t *, task_t *, task_t *)
1490  *
1491  * Overview
1492  *   exacct_move_mstate() is called by task_change() and accounts for
1493  *   a process's resource usage when it is moved from one task to another.
1494  *
1495  *   The process's usage at this point is recorded in the new task so
1496  *   that it can be excluded from the calculation of resources consumed
1497  *   by that task.
1498  *
1499  *   The resource usage inherited by the new task is also added to the
1500  *   aggregate maintained by the old task for processes that have exited.
1501  *
1502  * Return values
1503  *   None.
1504  *
1505  * Caller's context
1506  *   pidlock and p_lock held across exacct_move_mstate().
1507  */
1508 void
1509 exacct_move_mstate(proc_t *p, task_t *oldtk, task_t *newtk)
1510 {
1511 	task_usage_t tu;
1512 
1513 	/* Take a snapshot of this process's mstate and RU counters */
1514 	exacct_snapshot_proc_mstate(p, &tu);
1515 
1516 	/*
1517 	 * Use the snapshot to increment the aggregate usage of the old
1518 	 * task, and the inherited usage of the new one.
1519 	 */
1520 	mutex_enter(&oldtk->tk_usage_lock);
1521 	exacct_add_task_mstate(oldtk->tk_usage, &tu);
1522 	mutex_exit(&oldtk->tk_usage_lock);
1523 	mutex_enter(&newtk->tk_usage_lock);
1524 	exacct_add_task_mstate(newtk->tk_inherited, &tu);
1525 	mutex_exit(&newtk->tk_usage_lock);
1526 }
1527