xref: /illumos-gate/usr/src/uts/common/os/exacct.c (revision e86372a01d2d16a5dd4a64e144ed978ba17fe7dd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/exacct.h>
26 #include <sys/exacct_catalog.h>
27 #include <sys/disp.h>
28 #include <sys/task.h>
29 #include <sys/proc.h>
30 #include <sys/cmn_err.h>
31 #include <sys/kmem.h>
32 #include <sys/project.h>
33 #include <sys/systm.h>
34 #include <sys/vnode.h>
35 #include <sys/file.h>
36 #include <sys/acctctl.h>
37 #include <sys/time.h>
38 #include <sys/utsname.h>
39 #include <sys/session.h>
40 #include <sys/sysmacros.h>
41 #include <sys/bitmap.h>
42 #include <sys/msacct.h>
43 
44 /*
45  * exacct usage and recording routines
46  *
47  * wracct(2), getacct(2), and the records written at process or task
48  * termination are constructed using the exacct_assemble_[task,proc]_usage()
49  * functions, which take a callback that takes the appropriate action on
50  * the packed exacct record for the task or process.  For the process-related
51  * actions, we partition the routines such that the data collecting component
52  * can be performed while holding p_lock, and all sleeping or blocking
53  * operations can be performed without acquiring p_lock.
54  *
55  * putacct(2), which allows an application to construct a customized record
56  * associated with an existing process or task, has its own entry points:
57  * exacct_tag_task() and exacct_tag_proc().
58  */
59 
60 taskq_t *exacct_queue;
61 kmem_cache_t *exacct_object_cache;
62 
63 zone_key_t exacct_zone_key = ZONE_KEY_UNINITIALIZED;
64 
65 static const uint32_t exacct_version = EXACCT_VERSION;
66 static const char exacct_header[] = "exacct";
67 static const char exacct_creator[] = "SunOS";
68 
69 ea_object_t *
70 ea_alloc_item(ea_catalog_t catalog, void *buf, size_t bufsz)
71 {
72 	ea_object_t *item;
73 
74 	item = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
75 	bzero(item, sizeof (ea_object_t));
76 	(void) ea_set_item(item, catalog, buf, bufsz);
77 	return (item);
78 }
79 
80 ea_object_t *
81 ea_alloc_group(ea_catalog_t catalog)
82 {
83 	ea_object_t *group;
84 
85 	group = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
86 	bzero(group, sizeof (ea_object_t));
87 	(void) ea_set_group(group, catalog);
88 	return (group);
89 }
90 
91 ea_object_t *
92 ea_attach_item(ea_object_t *grp, void *buf, size_t bufsz, ea_catalog_t catalog)
93 {
94 	ea_object_t *item;
95 
96 	item = ea_alloc_item(catalog, buf, bufsz);
97 	(void) ea_attach_to_group(grp, item);
98 	return (item);
99 }
100 
101 /*
102  * exacct_add_task_mstate() and exacct_sub_task_mstate() add and subtract
103  * microstate accounting data and resource usage counters from one task_usage_t
104  * from those supplied in another. These functions do not operate on *all*
105  * members of a task_usage_t: for some (e.g. tu_anctaskid) it would not make
106  * sense.
107  */
108 static void
109 exacct_add_task_mstate(task_usage_t *tu, task_usage_t *delta)
110 {
111 	tu->tu_utime  += delta->tu_utime;
112 	tu->tu_stime  += delta->tu_stime;
113 	tu->tu_minflt += delta->tu_minflt;
114 	tu->tu_majflt += delta->tu_majflt;
115 	tu->tu_sndmsg += delta->tu_sndmsg;
116 	tu->tu_rcvmsg += delta->tu_rcvmsg;
117 	tu->tu_ioch   += delta->tu_ioch;
118 	tu->tu_iblk   += delta->tu_iblk;
119 	tu->tu_oblk   += delta->tu_oblk;
120 	tu->tu_vcsw   += delta->tu_vcsw;
121 	tu->tu_icsw   += delta->tu_icsw;
122 	tu->tu_nsig   += delta->tu_nsig;
123 	tu->tu_nswp   += delta->tu_nswp;
124 	tu->tu_nscl   += delta->tu_nscl;
125 }
126 
127 /*
128  * See the comments for exacct_add_task_mstate(), above.
129  */
130 static void
131 exacct_sub_task_mstate(task_usage_t *tu, task_usage_t *delta)
132 {
133 	tu->tu_utime  -= delta->tu_utime;
134 	tu->tu_stime  -= delta->tu_stime;
135 	tu->tu_minflt -= delta->tu_minflt;
136 	tu->tu_majflt -= delta->tu_majflt;
137 	tu->tu_sndmsg -= delta->tu_sndmsg;
138 	tu->tu_rcvmsg -= delta->tu_rcvmsg;
139 	tu->tu_ioch   -= delta->tu_ioch;
140 	tu->tu_iblk   -= delta->tu_iblk;
141 	tu->tu_oblk   -= delta->tu_oblk;
142 	tu->tu_vcsw   -= delta->tu_vcsw;
143 	tu->tu_icsw   -= delta->tu_icsw;
144 	tu->tu_nsig   -= delta->tu_nsig;
145 	tu->tu_nswp   -= delta->tu_nswp;
146 	tu->tu_nscl   -= delta->tu_nscl;
147 }
148 
149 /*
150  * Wrapper for vn_rdwr() used by exacct_vn_write() and exacct_write_header()
151  * to write to the accounting file without corrupting it in case of an I/O or
152  * filesystem error.
153  */
154 static int
155 exacct_vn_write_impl(ac_info_t *info, void *buf, ssize_t bufsize)
156 {
157 	int error;
158 	ssize_t resid;
159 	struct vattr va;
160 
161 	ASSERT(info != NULL);
162 	ASSERT(info->ac_vnode != NULL);
163 	ASSERT(MUTEX_HELD(&info->ac_lock));
164 
165 	/*
166 	 * Save the size. If vn_rdwr fails, reset the size to avoid corrupting
167 	 * the present accounting file.
168 	 */
169 	va.va_mask = AT_SIZE;
170 	error = VOP_GETATTR(info->ac_vnode, &va, 0, kcred, NULL);
171 	if (error == 0) {
172 		error = vn_rdwr(UIO_WRITE, info->ac_vnode, (caddr_t)buf,
173 		    bufsize, 0LL, UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFFSET_T,
174 		    kcred, &resid);
175 		if (error) {
176 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
177 		} else if (resid != 0) {
178 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
179 			error = ENOSPC;
180 		}
181 	}
182 	return (error);
183 }
184 
185 /*
186  * exacct_vn_write() safely writes to an accounting file.  acctctl() prevents
187  * the two accounting vnodes from being equal, and the appropriate ac_lock is
188  * held across the call, so we're single threaded through this code for each
189  * file.
190  */
191 static int
192 exacct_vn_write(ac_info_t *info, void *buf, ssize_t bufsize)
193 {
194 	int error;
195 
196 	if (info == NULL)
197 		return (0);
198 
199 	mutex_enter(&info->ac_lock);
200 
201 	/*
202 	 * Don't do anything unless accounting file is set.
203 	 */
204 	if (info->ac_vnode == NULL) {
205 		mutex_exit(&info->ac_lock);
206 		return (0);
207 	}
208 	error = exacct_vn_write_impl(info, buf, bufsize);
209 	mutex_exit(&info->ac_lock);
210 
211 	return (error);
212 }
213 
214 /*
215  * void *exacct_create_header(size_t *)
216  *
217  * Overview
218  *   exacct_create_header() constructs an exacct file header identifying the
219  *   accounting file as the output of the kernel.  exacct_create_header() and
220  *   the static write_header() and verify_header() routines in libexacct must
221  *   remain synchronized.
222  *
223  * Return values
224  *   A pointer to a packed exacct buffer containing the appropriate header is
225  *   returned; the size of the buffer is placed in the location indicated by
226  *   sizep.
227  *
228  * Caller's context
229  *   Suitable for KM_SLEEP allocations.
230  */
231 void *
232 exacct_create_header(size_t *sizep)
233 {
234 	ea_object_t *hdr_grp;
235 	uint32_t bskip;
236 	void *buf;
237 	size_t bufsize;
238 
239 	hdr_grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_HEADER);
240 	(void) ea_attach_item(hdr_grp, (void *)&exacct_version, 0,
241 	    EXT_UINT32 | EXC_DEFAULT | EXD_VERSION);
242 	(void) ea_attach_item(hdr_grp, (void *)exacct_header, 0,
243 	    EXT_STRING | EXC_DEFAULT | EXD_FILETYPE);
244 	(void) ea_attach_item(hdr_grp, (void *)exacct_creator, 0,
245 	    EXT_STRING | EXC_DEFAULT | EXD_CREATOR);
246 	(void) ea_attach_item(hdr_grp, uts_nodename(), 0,
247 	    EXT_STRING | EXC_DEFAULT | EXD_HOSTNAME);
248 
249 	bufsize = ea_pack_object(hdr_grp, NULL, 0);
250 	buf = kmem_alloc(bufsize, KM_SLEEP);
251 	(void) ea_pack_object(hdr_grp, buf, bufsize);
252 	ea_free_object(hdr_grp, EUP_ALLOC);
253 
254 	/*
255 	 * To prevent reading the header when reading the file backwards,
256 	 * set the large backskip of the header group to 0 (last 4 bytes).
257 	 */
258 	bskip = 0;
259 	exacct_order32(&bskip);
260 	bcopy(&bskip, (char *)buf + bufsize - sizeof (bskip),
261 	    sizeof (bskip));
262 
263 	*sizep = bufsize;
264 	return (buf);
265 }
266 
267 /*
268  * int exacct_write_header(ac_info_t *, void *, size_t)
269  *
270  * Overview
271  *   exacct_write_header() writes the given header buffer to the indicated
272  *   vnode.
273  *
274  * Return values
275  *   The result of the write operation is returned.
276  *
277  * Caller's context
278  *   Caller must hold the ac_lock of the appropriate accounting file
279  *   information block (ac_info_t).
280  */
281 int
282 exacct_write_header(ac_info_t *info, void *hdr, size_t hdrsize)
283 {
284 	if (info != NULL && info->ac_vnode != NULL)
285 		return (exacct_vn_write_impl(info, hdr, hdrsize));
286 
287 	return (0);
288 }
289 
290 static void
291 exacct_get_interval_task_usage(task_t *tk, task_usage_t *tu,
292     task_usage_t **tu_buf)
293 {
294 	task_usage_t *oldtu, *newtu;
295 	task_usage_t **prevusage;
296 
297 	ASSERT(MUTEX_HELD(&tk->tk_usage_lock));
298 	if (getzoneid() != GLOBAL_ZONEID) {
299 		prevusage = &tk->tk_zoneusage;
300 	} else {
301 		prevusage = &tk->tk_prevusage;
302 	}
303 	if ((oldtu = *prevusage) != NULL) {
304 		/*
305 		 * In case we have any accounting information
306 		 * saved from the previous interval record.
307 		 */
308 		newtu = *tu_buf;
309 		bcopy(tu, newtu, sizeof (task_usage_t));
310 		tu->tu_minflt	-= oldtu->tu_minflt;
311 		tu->tu_majflt	-= oldtu->tu_majflt;
312 		tu->tu_sndmsg	-= oldtu->tu_sndmsg;
313 		tu->tu_rcvmsg	-= oldtu->tu_rcvmsg;
314 		tu->tu_ioch	-= oldtu->tu_ioch;
315 		tu->tu_iblk	-= oldtu->tu_iblk;
316 		tu->tu_oblk	-= oldtu->tu_oblk;
317 		tu->tu_vcsw	-= oldtu->tu_vcsw;
318 		tu->tu_icsw	-= oldtu->tu_icsw;
319 		tu->tu_nsig	-= oldtu->tu_nsig;
320 		tu->tu_nswp	-= oldtu->tu_nswp;
321 		tu->tu_nscl	-= oldtu->tu_nscl;
322 		tu->tu_utime	-= oldtu->tu_utime;
323 		tu->tu_stime	-= oldtu->tu_stime;
324 
325 		tu->tu_startsec = oldtu->tu_finishsec;
326 		tu->tu_startnsec = oldtu->tu_finishnsec;
327 		/*
328 		 * Copy the data from our temporary storage to the task's
329 		 * previous interval usage structure for future reference.
330 		 */
331 		bcopy(newtu, oldtu, sizeof (task_usage_t));
332 	} else {
333 		/*
334 		 * Store current statistics in the task's previous interval
335 		 * usage structure for future references.
336 		 */
337 		*prevusage = *tu_buf;
338 		bcopy(tu, *prevusage, sizeof (task_usage_t));
339 		*tu_buf = NULL;
340 	}
341 }
342 
343 static void
344 exacct_snapshot_task_usage(task_t *tk, task_usage_t *tu)
345 {
346 	timestruc_t ts;
347 	proc_t *p;
348 
349 	ASSERT(MUTEX_HELD(&pidlock));
350 
351 	if ((p = tk->tk_memb_list) == NULL)
352 		return;
353 
354 	/*
355 	 * exacct_snapshot_task_usage() provides an approximate snapshot of the
356 	 * usage of the potentially many members of the task.  Since we don't
357 	 * guarantee exactness, we don't acquire the p_lock of any of the member
358 	 * processes.
359 	 */
360 	do {
361 		mutex_enter(&p->p_lock);
362 		tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
363 		tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
364 		mutex_exit(&p->p_lock);
365 		tu->tu_minflt	+= p->p_ru.minflt;
366 		tu->tu_majflt	+= p->p_ru.majflt;
367 		tu->tu_sndmsg	+= p->p_ru.msgsnd;
368 		tu->tu_rcvmsg	+= p->p_ru.msgrcv;
369 		tu->tu_ioch	+= p->p_ru.ioch;
370 		tu->tu_iblk	+= p->p_ru.inblock;
371 		tu->tu_oblk	+= p->p_ru.oublock;
372 		tu->tu_vcsw	+= p->p_ru.nvcsw;
373 		tu->tu_icsw	+= p->p_ru.nivcsw;
374 		tu->tu_nsig	+= p->p_ru.nsignals;
375 		tu->tu_nswp	+= p->p_ru.nswap;
376 		tu->tu_nscl	+= p->p_ru.sysc;
377 	} while ((p = p->p_tasknext) != tk->tk_memb_list);
378 
379 	/*
380 	 * The resource usage accounted for so far will include that
381 	 * contributed by the task's first process. If this process
382 	 * came from another task, then its accumulated resource usage
383 	 * will include a contribution from work performed there.
384 	 * We must therefore subtract any resource usage that was
385 	 * inherited with the first process.
386 	 */
387 	exacct_sub_task_mstate(tu, tk->tk_inherited);
388 
389 	gethrestime(&ts);
390 	tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
391 	tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
392 }
393 
394 /*
395  * void exacct_update_task_mstate(proc_t *)
396  *
397  * Overview
398  *   exacct_update_task_mstate() updates the task usage; it is intended
399  *   to be called from proc_exit().
400  *
401  * Return values
402  *   None.
403  *
404  * Caller's context
405  *   p_lock must be held at entry.
406  */
407 void
408 exacct_update_task_mstate(proc_t *p)
409 {
410 	task_usage_t *tu;
411 
412 	mutex_enter(&p->p_task->tk_usage_lock);
413 	tu = p->p_task->tk_usage;
414 	tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
415 	tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
416 	tu->tu_minflt	+= p->p_ru.minflt;
417 	tu->tu_majflt	+= p->p_ru.majflt;
418 	tu->tu_sndmsg	+= p->p_ru.msgsnd;
419 	tu->tu_rcvmsg	+= p->p_ru.msgrcv;
420 	tu->tu_ioch	+= p->p_ru.ioch;
421 	tu->tu_iblk	+= p->p_ru.inblock;
422 	tu->tu_oblk	+= p->p_ru.oublock;
423 	tu->tu_vcsw	+= p->p_ru.nvcsw;
424 	tu->tu_icsw	+= p->p_ru.nivcsw;
425 	tu->tu_nsig	+= p->p_ru.nsignals;
426 	tu->tu_nswp	+= p->p_ru.nswap;
427 	tu->tu_nscl	+= p->p_ru.sysc;
428 	mutex_exit(&p->p_task->tk_usage_lock);
429 }
430 
431 static void
432 exacct_calculate_task_usage(task_t *tk, task_usage_t *tu, int flag)
433 {
434 	timestruc_t ts;
435 	task_usage_t *tu_buf;
436 
437 	switch (flag) {
438 	case EW_PARTIAL:
439 		/*
440 		 * For partial records we must report the sum of current
441 		 * accounting statistics with previously accumulated
442 		 * statistics.
443 		 */
444 		mutex_enter(&pidlock);
445 		mutex_enter(&tk->tk_usage_lock);
446 
447 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
448 		exacct_snapshot_task_usage(tk, tu);
449 
450 		mutex_exit(&tk->tk_usage_lock);
451 		mutex_exit(&pidlock);
452 		break;
453 	case EW_INTERVAL:
454 		/*
455 		 * We need to allocate spare task_usage_t buffer before
456 		 * grabbing pidlock because we might need it later in
457 		 * exacct_get_interval_task_usage().
458 		 */
459 		tu_buf = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
460 		mutex_enter(&pidlock);
461 		mutex_enter(&tk->tk_usage_lock);
462 
463 		/*
464 		 * For interval records, we deduct the previous microstate
465 		 * accounting data and cpu usage times from previously saved
466 		 * results and update the previous task usage structure.
467 		 */
468 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
469 		exacct_snapshot_task_usage(tk, tu);
470 		exacct_get_interval_task_usage(tk, tu, &tu_buf);
471 
472 		mutex_exit(&tk->tk_usage_lock);
473 		mutex_exit(&pidlock);
474 
475 		if (tu_buf != NULL)
476 			kmem_free(tu_buf, sizeof (task_usage_t));
477 		break;
478 	case EW_FINAL:
479 		/*
480 		 * For final records, we deduct, from the task's current
481 		 * usage, any usage that was inherited with the arrival
482 		 * of a process from a previous task. We then record
483 		 * the task's finish time.
484 		 */
485 		mutex_enter(&tk->tk_usage_lock);
486 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
487 		exacct_sub_task_mstate(tu, tk->tk_inherited);
488 		mutex_exit(&tk->tk_usage_lock);
489 
490 		gethrestime(&ts);
491 		tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
492 		tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
493 
494 		break;
495 	}
496 }
497 
498 static int
499 exacct_attach_task_item(task_t *tk, task_usage_t *tu, ea_object_t *record,
500     int res)
501 {
502 	int attached = 1;
503 
504 	switch (res) {
505 	case AC_TASK_TASKID:
506 		(void) ea_attach_item(record, &tk->tk_tkid,
507 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_TASKID);
508 		break;
509 	case AC_TASK_PROJID:
510 		(void) ea_attach_item(record, &tk->tk_proj->kpj_id,
511 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_PROJID);
512 		break;
513 	case AC_TASK_CPU: {
514 			timestruc_t ts;
515 			uint64_t ui;
516 
517 			hrt2ts(tu->tu_stime, &ts);
518 			ui = ts.tv_sec;
519 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
520 			    EXT_UINT64 | EXD_TASK_CPU_SYS_SEC);
521 			ui = ts.tv_nsec;
522 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
523 			    EXT_UINT64 | EXD_TASK_CPU_SYS_NSEC);
524 
525 			hrt2ts(tu->tu_utime, &ts);
526 			ui = ts.tv_sec;
527 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
528 			    EXT_UINT64 | EXD_TASK_CPU_USER_SEC);
529 			ui = ts.tv_nsec;
530 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
531 			    EXT_UINT64 | EXD_TASK_CPU_USER_NSEC);
532 		}
533 		break;
534 	case AC_TASK_TIME:
535 		(void) ea_attach_item(record, &tu->tu_startsec,
536 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_SEC);
537 		(void) ea_attach_item(record, &tu->tu_startnsec,
538 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_NSEC);
539 		(void) ea_attach_item(record, &tu->tu_finishsec,
540 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_SEC);
541 		(void) ea_attach_item(record, &tu->tu_finishnsec,
542 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_NSEC);
543 		break;
544 	case AC_TASK_HOSTNAME:
545 		(void) ea_attach_item(record, tk->tk_zone->zone_nodename,
546 		    strlen(tk->tk_zone->zone_nodename) + 1,
547 		    EXT_STRING | EXD_TASK_HOSTNAME);
548 			break;
549 	case AC_TASK_MICROSTATE:
550 		(void) ea_attach_item(record, &tu->tu_majflt,
551 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MAJOR);
552 		(void) ea_attach_item(record, &tu->tu_minflt,
553 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MINOR);
554 		(void) ea_attach_item(record, &tu->tu_sndmsg,
555 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_SND);
556 		(void) ea_attach_item(record, &tu->tu_rcvmsg,
557 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_RCV);
558 		(void) ea_attach_item(record, &tu->tu_iblk,
559 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_IN);
560 		(void) ea_attach_item(record, &tu->tu_oblk,
561 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_OUT);
562 		(void) ea_attach_item(record, &tu->tu_ioch,
563 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CHARS_RDWR);
564 		(void) ea_attach_item(record, &tu->tu_vcsw,
565 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_VOL);
566 		(void) ea_attach_item(record, &tu->tu_icsw,
567 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_INV);
568 		(void) ea_attach_item(record, &tu->tu_nsig,
569 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SIGNALS);
570 		(void) ea_attach_item(record, &tu->tu_nswp,
571 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SWAPS);
572 		(void) ea_attach_item(record, &tu->tu_nscl,
573 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SYSCALLS);
574 		break;
575 	case AC_TASK_ANCTASKID:
576 		(void) ea_attach_item(record, &tu->tu_anctaskid,
577 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_ANCTASKID);
578 		break;
579 	case AC_TASK_ZONENAME:
580 		(void) ea_attach_item(record, tk->tk_zone->zone_name,
581 		    strlen(tk->tk_zone->zone_name) + 1,
582 		    EXT_STRING | EXD_TASK_ZONENAME);
583 		break;
584 	default:
585 		attached = 0;
586 	}
587 	return (attached);
588 }
589 
590 static ea_object_t *
591 exacct_assemble_task_record(task_t *tk, task_usage_t *tu, ulong_t *mask,
592     ea_catalog_t record_type)
593 {
594 	int res, count;
595 	ea_object_t *record;
596 
597 	/*
598 	 * Assemble usage values into group.
599 	 */
600 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
601 	for (res = 1, count = 0; res <= AC_TASK_MAX_RES; res++)
602 		if (BT_TEST(mask, res))
603 			count += exacct_attach_task_item(tk, tu, record, res);
604 	if (count == 0) {
605 		ea_free_object(record, EUP_ALLOC);
606 		record = NULL;
607 	}
608 	return (record);
609 }
610 
611 /*
612  * int exacct_assemble_task_usage(task_t *, int (*)(void *, size_t, void *,
613  *	size_t, size_t *), void *, size_t, size_t *, int)
614  *
615  * Overview
616  *   exacct_assemble_task_usage() builds the packed exacct buffer for the
617  *   indicated task, executes the given callback function, and free the packed
618  *   buffer.
619  *
620  * Return values
621  *   Returns 0 on success; otherwise the appropriate error code is returned.
622  *
623  * Caller's context
624  *   Suitable for KM_SLEEP allocations.
625  */
626 int
627 exacct_assemble_task_usage(ac_info_t *ac_task, task_t *tk,
628     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
629     void *ubuf, size_t ubufsize, size_t *actual, int flag)
630 {
631 	ulong_t mask[AC_MASK_SZ];
632 	ea_object_t *task_record;
633 	ea_catalog_t record_type;
634 	task_usage_t *tu;
635 	void *buf;
636 	size_t bufsize;
637 	int ret;
638 
639 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL || flag == EW_INTERVAL);
640 
641 	mutex_enter(&ac_task->ac_lock);
642 	if (ac_task->ac_state == AC_OFF) {
643 		mutex_exit(&ac_task->ac_lock);
644 		return (ENOTACTIVE);
645 	}
646 	bt_copy(ac_task->ac_mask, mask, AC_MASK_SZ);
647 	mutex_exit(&ac_task->ac_lock);
648 
649 	switch (flag) {
650 	case EW_FINAL:
651 		record_type = EXD_GROUP_TASK;
652 		break;
653 	case EW_PARTIAL:
654 		record_type = EXD_GROUP_TASK_PARTIAL;
655 		break;
656 	case EW_INTERVAL:
657 		record_type = EXD_GROUP_TASK_INTERVAL;
658 		break;
659 	default:
660 		return (0);
661 	}
662 
663 	/*
664 	 * Calculate task usage and assemble it into the task record.
665 	 */
666 	tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
667 	exacct_calculate_task_usage(tk, tu, flag);
668 	task_record = exacct_assemble_task_record(tk, tu, mask, record_type);
669 	if (task_record == NULL) {
670 		/*
671 		 * The current configuration of the accounting system has
672 		 * resulted in records with no data; accordingly, we don't write
673 		 * these, but we return success.
674 		 */
675 		kmem_free(tu, sizeof (task_usage_t));
676 		return (0);
677 	}
678 
679 	/*
680 	 * Pack object into buffer and run callback on it.
681 	 */
682 	bufsize = ea_pack_object(task_record, NULL, 0);
683 	buf = kmem_alloc(bufsize, KM_SLEEP);
684 	(void) ea_pack_object(task_record, buf, bufsize);
685 	ret = callback(ac_task, ubuf, ubufsize, buf, bufsize, actual);
686 
687 	/*
688 	 * Free all previously allocated structures.
689 	 */
690 	kmem_free(buf, bufsize);
691 	ea_free_object(task_record, EUP_ALLOC);
692 	kmem_free(tu, sizeof (task_usage_t));
693 	return (ret);
694 }
695 
696 /*
697  * void exacct_commit_task(void *)
698  *
699  * Overview
700  *   exacct_commit_task() calculates the final usage for a task, updating the
701  *   task usage if task accounting is active, and writing a task record if task
702  *   accounting is active.  exacct_commit_task() is intended for being called
703  *   from a task queue (taskq_t).
704  *
705  * Return values
706  *   None.
707  *
708  * Caller's context
709  *   Suitable for KM_SLEEP allocations.
710  */
711 
712 void
713 exacct_commit_task(void *arg)
714 {
715 	task_t *tk = (task_t *)arg;
716 	size_t size;
717 	zone_t *zone = tk->tk_zone;
718 	struct exacct_globals *acg;
719 
720 	ASSERT(tk != task0p);
721 	ASSERT(tk->tk_memb_list == NULL);
722 
723 	/*
724 	 * Don't do any extra work if the acctctl module isn't loaded.
725 	 * If acctctl module is loaded when zone is in down state then
726 	 * zone_getspecific can return NULL for that zone.
727 	 */
728 	if (exacct_zone_key != ZONE_KEY_UNINITIALIZED) {
729 		acg = zone_getspecific(exacct_zone_key, zone);
730 		if (acg == NULL)
731 			goto err;
732 		(void) exacct_assemble_task_usage(&acg->ac_task, tk,
733 		    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
734 		if (tk->tk_zone != global_zone) {
735 			acg = zone_getspecific(exacct_zone_key, global_zone);
736 			(void) exacct_assemble_task_usage(&acg->ac_task, tk,
737 			    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
738 		}
739 	}
740 	/*
741 	 * Release associated project and finalize task.
742 	 */
743 err:
744 	task_end(tk);
745 }
746 
747 static int
748 exacct_attach_proc_item(proc_usage_t *pu, ea_object_t *record, int res)
749 {
750 	int attached = 1;
751 
752 	switch (res) {
753 	case AC_PROC_PID:
754 		(void) ea_attach_item(record, &pu->pu_pid,
755 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PID);
756 		break;
757 	case AC_PROC_UID:
758 		(void) ea_attach_item(record, &pu->pu_ruid,
759 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_UID);
760 		break;
761 	case AC_PROC_FLAG:
762 		(void) ea_attach_item(record, &pu->pu_acflag,
763 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ACCT_FLAGS);
764 		break;
765 	case AC_PROC_GID:
766 		(void) ea_attach_item(record, &pu->pu_rgid,
767 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_GID);
768 		break;
769 	case AC_PROC_PROJID:
770 		(void) ea_attach_item(record, &pu->pu_projid,
771 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PROJID);
772 		break;
773 	case AC_PROC_TASKID:
774 		(void) ea_attach_item(record, &pu->pu_taskid,
775 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TASKID);
776 		break;
777 	case AC_PROC_CPU:
778 		(void) ea_attach_item(record, &pu->pu_utimesec,
779 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_SEC);
780 		(void) ea_attach_item(record, &pu->pu_utimensec,
781 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_NSEC);
782 		(void) ea_attach_item(record, &pu->pu_stimesec,
783 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_SEC);
784 		(void) ea_attach_item(record, &pu->pu_stimensec,
785 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_NSEC);
786 		break;
787 	case AC_PROC_TIME:
788 		(void) ea_attach_item(record, &pu->pu_startsec,
789 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_SEC);
790 		(void) ea_attach_item(record, &pu->pu_startnsec,
791 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_NSEC);
792 		(void) ea_attach_item(record, &pu->pu_finishsec,
793 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_SEC);
794 		(void) ea_attach_item(record, &pu->pu_finishnsec,
795 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_NSEC);
796 		break;
797 	case AC_PROC_COMMAND:
798 		(void) ea_attach_item(record, pu->pu_command,
799 		    strlen(pu->pu_command) + 1, EXT_STRING | EXD_PROC_COMMAND);
800 		break;
801 	case AC_PROC_HOSTNAME:
802 		(void) ea_attach_item(record, pu->pu_nodename,
803 		    strlen(pu->pu_nodename) + 1,
804 		    EXT_STRING | EXD_PROC_HOSTNAME);
805 		break;
806 	case AC_PROC_TTY:
807 		(void) ea_attach_item(record, &pu->pu_major,
808 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MAJOR);
809 		(void) ea_attach_item(record, &pu->pu_minor,
810 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MINOR);
811 		break;
812 	case AC_PROC_MICROSTATE:
813 		(void) ea_attach_item(record, &pu->pu_majflt,
814 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MAJOR);
815 		(void) ea_attach_item(record, &pu->pu_minflt,
816 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MINOR);
817 		(void) ea_attach_item(record, &pu->pu_sndmsg,
818 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_SND);
819 		(void) ea_attach_item(record, &pu->pu_rcvmsg,
820 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_RCV);
821 		(void) ea_attach_item(record, &pu->pu_iblk,
822 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_IN);
823 		(void) ea_attach_item(record, &pu->pu_oblk,
824 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_OUT);
825 		(void) ea_attach_item(record, &pu->pu_ioch,
826 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CHARS_RDWR);
827 		(void) ea_attach_item(record, &pu->pu_vcsw,
828 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_VOL);
829 		(void) ea_attach_item(record, &pu->pu_icsw,
830 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_INV);
831 		(void) ea_attach_item(record, &pu->pu_nsig,
832 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SIGNALS);
833 		(void) ea_attach_item(record, &pu->pu_nswp,
834 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SWAPS);
835 		(void) ea_attach_item(record, &pu->pu_nscl,
836 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SYSCALLS);
837 		break;
838 	case AC_PROC_ANCPID:
839 		(void) ea_attach_item(record, &pu->pu_ancpid,
840 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ANCPID);
841 		break;
842 	case AC_PROC_WAIT_STATUS:
843 		(void) ea_attach_item(record, &pu->pu_wstat,
844 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_WAIT_STATUS);
845 		break;
846 	case AC_PROC_ZONENAME:
847 		(void) ea_attach_item(record, pu->pu_zonename,
848 		    strlen(pu->pu_zonename) + 1,
849 		    EXT_STRING | EXD_PROC_ZONENAME);
850 		break;
851 	case AC_PROC_MEM:
852 		(void) ea_attach_item(record, &pu->pu_mem_rss_avg,
853 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_AVG_K);
854 		(void) ea_attach_item(record, &pu->pu_mem_rss_max,
855 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_MAX_K);
856 		break;
857 	default:
858 		attached = 0;
859 	}
860 	return (attached);
861 }
862 
863 static ea_object_t *
864 exacct_assemble_proc_record(proc_usage_t *pu, ulong_t *mask,
865     ea_catalog_t record_type)
866 {
867 	int res, count;
868 	ea_object_t *record;
869 
870 	/*
871 	 * Assemble usage values into group.
872 	 */
873 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
874 	for (res = 1, count = 0; res <= AC_PROC_MAX_RES; res++)
875 		if (BT_TEST(mask, res))
876 			count += exacct_attach_proc_item(pu, record, res);
877 	if (count == 0) {
878 		ea_free_object(record, EUP_ALLOC);
879 		record = NULL;
880 	}
881 	return (record);
882 }
883 
884 /*
885  * The following two routines assume that process's p_lock is held or
886  * exacct_commit_proc has been called from exit() when all lwps are stopped.
887  */
888 static void
889 exacct_calculate_proc_mstate(proc_t *p, proc_usage_t *pu)
890 {
891 	kthread_t *t;
892 
893 	ASSERT(MUTEX_HELD(&p->p_lock));
894 	if ((t = p->p_tlist) == NULL)
895 		return;
896 
897 	do {
898 		pu->pu_minflt	+= t->t_lwp->lwp_ru.minflt;
899 		pu->pu_majflt	+= t->t_lwp->lwp_ru.majflt;
900 		pu->pu_sndmsg	+= t->t_lwp->lwp_ru.msgsnd;
901 		pu->pu_rcvmsg	+= t->t_lwp->lwp_ru.msgrcv;
902 		pu->pu_ioch	+= t->t_lwp->lwp_ru.ioch;
903 		pu->pu_iblk	+= t->t_lwp->lwp_ru.inblock;
904 		pu->pu_oblk	+= t->t_lwp->lwp_ru.oublock;
905 		pu->pu_vcsw	+= t->t_lwp->lwp_ru.nvcsw;
906 		pu->pu_icsw	+= t->t_lwp->lwp_ru.nivcsw;
907 		pu->pu_nsig	+= t->t_lwp->lwp_ru.nsignals;
908 		pu->pu_nswp	+= t->t_lwp->lwp_ru.nswap;
909 		pu->pu_nscl	+= t->t_lwp->lwp_ru.sysc;
910 	} while ((t = t->t_forw) != p->p_tlist);
911 }
912 
913 static void
914 exacct_copy_proc_mstate(proc_t *p, proc_usage_t *pu)
915 {
916 	pu->pu_minflt	= p->p_ru.minflt;
917 	pu->pu_majflt	= p->p_ru.majflt;
918 	pu->pu_sndmsg	= p->p_ru.msgsnd;
919 	pu->pu_rcvmsg	= p->p_ru.msgrcv;
920 	pu->pu_ioch	= p->p_ru.ioch;
921 	pu->pu_iblk	= p->p_ru.inblock;
922 	pu->pu_oblk	= p->p_ru.oublock;
923 	pu->pu_vcsw	= p->p_ru.nvcsw;
924 	pu->pu_icsw	= p->p_ru.nivcsw;
925 	pu->pu_nsig	= p->p_ru.nsignals;
926 	pu->pu_nswp	= p->p_ru.nswap;
927 	pu->pu_nscl	= p->p_ru.sysc;
928 }
929 
930 void
931 exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask,
932     int flag, int wstat)
933 {
934 	timestruc_t ts, ts_run;
935 
936 	ASSERT(MUTEX_HELD(&p->p_lock));
937 
938 	/*
939 	 * Convert CPU and execution times to sec/nsec format.
940 	 */
941 	if (BT_TEST(mask, AC_PROC_CPU)) {
942 		hrt2ts(mstate_aggr_state(p, LMS_USER), &ts);
943 		pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec;
944 		pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec;
945 		hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts);
946 		pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec;
947 		pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec;
948 	}
949 	if (BT_TEST(mask, AC_PROC_TIME)) {
950 		gethrestime(&ts);
951 		pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
952 		pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
953 		hrt2ts(gethrtime() - p->p_mstart, &ts_run);
954 		ts.tv_sec -= ts_run.tv_sec;
955 		ts.tv_nsec -= ts_run.tv_nsec;
956 		if (ts.tv_nsec < 0) {
957 			ts.tv_sec--;
958 			if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) {
959 				ts.tv_sec++;
960 				ts.tv_nsec -= NANOSEC;
961 			}
962 		}
963 		pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec;
964 		pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec;
965 	}
966 
967 	pu->pu_pid = p->p_pidp->pid_id;
968 	pu->pu_acflag = p->p_user.u_acflag;
969 	pu->pu_projid = p->p_task->tk_proj->kpj_id;
970 	pu->pu_taskid = p->p_task->tk_tkid;
971 	pu->pu_major = getmajor(p->p_sessp->s_dev);
972 	pu->pu_minor = getminor(p->p_sessp->s_dev);
973 	pu->pu_ancpid = p->p_ancpid;
974 	pu->pu_wstat = wstat;
975 	/*
976 	 * Compute average RSS in K.  The denominator is the number of
977 	 * samples:  the number of clock ticks plus the initial value.
978 	 */
979 	pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) *
980 	    (PAGESIZE / 1024);
981 	pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024);
982 
983 	mutex_enter(&p->p_crlock);
984 	pu->pu_ruid = crgetruid(p->p_cred);
985 	pu->pu_rgid = crgetrgid(p->p_cred);
986 	mutex_exit(&p->p_crlock);
987 
988 	bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1);
989 	bcopy(p->p_zone->zone_name, pu->pu_zonename,
990 	    strlen(p->p_zone->zone_name) + 1);
991 	bcopy(p->p_zone->zone_nodename, pu->pu_nodename,
992 	    strlen(p->p_zone->zone_nodename) + 1);
993 
994 	/*
995 	 * Calculate microstate accounting data for a process that is still
996 	 * running.  Presently, we explicitly collect all of the LWP usage into
997 	 * the proc usage structure here.
998 	 */
999 	if (flag & EW_PARTIAL)
1000 		exacct_calculate_proc_mstate(p, pu);
1001 	if (flag & EW_FINAL)
1002 		exacct_copy_proc_mstate(p, pu);
1003 }
1004 
1005 /*
1006  * int exacct_assemble_proc_usage(proc_usage_t *, int (*)(void *, size_t, void
1007  *	*, size_t, size_t *), void *, size_t, size_t *)
1008  *
1009  * Overview
1010  *   Assemble record with miscellaneous accounting information about the process
1011  *   and execute the callback on it. It is the callback's job to set "actual" to
1012  *   the size of record.
1013  *
1014  * Return values
1015  *   The result of the callback function, unless the extended process accounting
1016  *   feature is not active, in which case ENOTACTIVE is returned.
1017  *
1018  * Caller's context
1019  *   Suitable for KM_SLEEP allocations.
1020  */
1021 int
1022 exacct_assemble_proc_usage(ac_info_t *ac_proc, proc_usage_t *pu,
1023     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1024     void *ubuf, size_t ubufsize, size_t *actual, int flag)
1025 {
1026 	ulong_t mask[AC_MASK_SZ];
1027 	ea_object_t *proc_record;
1028 	ea_catalog_t record_type;
1029 	void *buf;
1030 	size_t bufsize;
1031 	int ret;
1032 
1033 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL);
1034 
1035 	mutex_enter(&ac_proc->ac_lock);
1036 	if (ac_proc->ac_state == AC_OFF) {
1037 		mutex_exit(&ac_proc->ac_lock);
1038 		return (ENOTACTIVE);
1039 	}
1040 	bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1041 	mutex_exit(&ac_proc->ac_lock);
1042 
1043 	switch (flag) {
1044 	case EW_FINAL:
1045 		record_type = EXD_GROUP_PROC;
1046 		break;
1047 	case EW_PARTIAL:
1048 		record_type = EXD_GROUP_PROC_PARTIAL;
1049 		break;
1050 	default:
1051 		record_type = EXD_NONE;
1052 		break;
1053 	}
1054 
1055 	proc_record = exacct_assemble_proc_record(pu, mask, record_type);
1056 	if (proc_record == NULL)
1057 		return (0);
1058 
1059 	/*
1060 	 * Pack object into buffer and pass to callback.
1061 	 */
1062 	bufsize = ea_pack_object(proc_record, NULL, 0);
1063 	buf = kmem_alloc(bufsize, KM_SLEEP);
1064 	(void) ea_pack_object(proc_record, buf, bufsize);
1065 
1066 	ret = callback(ac_proc, ubuf, ubufsize, buf, bufsize, actual);
1067 
1068 	/*
1069 	 * Free all previously allocations.
1070 	 */
1071 	kmem_free(buf, bufsize);
1072 	ea_free_object(proc_record, EUP_ALLOC);
1073 	return (ret);
1074 }
1075 
1076 /*
1077  * int exacct_commit_callback(ac_info_t *, void *, size_t, void *, size_t,
1078  * 	size_t *)
1079  *
1080  * Overview
1081  *   exacct_commit_callback() writes the indicated buffer to the indicated
1082  *   extended accounting file.
1083  *
1084  * Return values
1085  *   The result of the write operation is returned.  "actual" is updated to
1086  *   contain the number of bytes actually written.
1087  *
1088  * Caller's context
1089  *   Suitable for a vn_rdwr() operation.
1090  */
1091 /*ARGSUSED*/
1092 int
1093 exacct_commit_callback(ac_info_t *info, void *ubuf, size_t ubufsize,
1094     void *buf, size_t bufsize, size_t *actual)
1095 {
1096 	int error = 0;
1097 
1098 	*actual = 0;
1099 	if ((error = exacct_vn_write(info, buf, bufsize)) == 0)
1100 		*actual = bufsize;
1101 	return (error);
1102 }
1103 
1104 static void
1105 exacct_do_commit_proc(ac_info_t *ac_proc, proc_t *p, int wstat)
1106 {
1107 	size_t size;
1108 	proc_usage_t *pu;
1109 	ulong_t mask[AC_MASK_SZ];
1110 
1111 	mutex_enter(&ac_proc->ac_lock);
1112 	if (ac_proc->ac_state == AC_ON) {
1113 		bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1114 		mutex_exit(&ac_proc->ac_lock);
1115 	} else {
1116 		mutex_exit(&ac_proc->ac_lock);
1117 		return;
1118 	}
1119 
1120 	mutex_enter(&p->p_lock);
1121 	size = strlen(p->p_user.u_comm) + 1;
1122 	mutex_exit(&p->p_lock);
1123 
1124 	pu = kmem_alloc(sizeof (proc_usage_t), KM_SLEEP);
1125 	pu->pu_command = kmem_alloc(size, KM_SLEEP);
1126 	mutex_enter(&p->p_lock);
1127 	exacct_calculate_proc_usage(p, pu, mask, EW_FINAL, wstat);
1128 	mutex_exit(&p->p_lock);
1129 
1130 	(void) exacct_assemble_proc_usage(ac_proc, pu,
1131 	    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
1132 
1133 	kmem_free(pu->pu_command, strlen(pu->pu_command) + 1);
1134 	kmem_free(pu, sizeof (proc_usage_t));
1135 }
1136 
1137 /*
1138  * void exacct_commit_proc(proc_t *, int)
1139  *
1140  * Overview
1141  *   exacct_commit_proc() calculates the final usage for a process, updating the
1142  *   task usage if task accounting is active, and writing a process record if
1143  *   process accounting is active.  exacct_commit_proc() is intended for being
1144  *   called from proc_exit().
1145  *
1146  * Return values
1147  *   None.
1148  *
1149  * Caller's context
1150  *   Suitable for KM_SLEEP allocations.  p_lock must not be held at entry.
1151  */
1152 void
1153 exacct_commit_proc(proc_t *p, int wstat)
1154 {
1155 	zone_t *zone = p->p_zone;
1156 	struct exacct_globals *acg, *gacg = NULL;
1157 
1158 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1159 		/*
1160 		 * acctctl module not loaded.  Nothing to do.
1161 		 */
1162 		return;
1163 	}
1164 
1165 	/*
1166 	 * If acctctl module is loaded when zone is in down state then
1167 	 * zone_getspecific can return NULL for that zone.
1168 	 */
1169 	acg = zone_getspecific(exacct_zone_key, zone);
1170 	if (acg == NULL)
1171 		return;
1172 	exacct_do_commit_proc(&acg->ac_proc, p, wstat);
1173 	if (zone != global_zone) {
1174 		gacg = zone_getspecific(exacct_zone_key, global_zone);
1175 		exacct_do_commit_proc(&gacg->ac_proc, p, wstat);
1176 	}
1177 }
1178 
1179 static int
1180 exacct_attach_netstat_item(net_stat_t *ns, ea_object_t *record, int res)
1181 {
1182 	int		attached = 1;
1183 
1184 	switch (res) {
1185 	case AC_NET_NAME:
1186 		(void) ea_attach_item(record, ns->ns_name,
1187 		    strlen(ns->ns_name) + 1, EXT_STRING | EXD_NET_STATS_NAME);
1188 		break;
1189 	case AC_NET_CURTIME:
1190 		{
1191 			uint64_t	now;
1192 			timestruc_t	ts;
1193 
1194 			gethrestime(&ts);
1195 			now = (uint64_t)(ulong_t)ts.tv_sec;
1196 			(void) ea_attach_item(record,  &now, sizeof (uint64_t),
1197 			    EXT_UINT64 | EXD_NET_STATS_CURTIME);
1198 		}
1199 		break;
1200 	case AC_NET_IBYTES:
1201 		(void) ea_attach_item(record, &ns->ns_ibytes,
1202 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IBYTES);
1203 		break;
1204 	case AC_NET_OBYTES:
1205 		(void) ea_attach_item(record, &ns->ns_obytes,
1206 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OBYTES);
1207 		break;
1208 	case AC_NET_IPKTS:
1209 		(void) ea_attach_item(record, &ns->ns_ipackets,
1210 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IPKTS);
1211 		break;
1212 	case AC_NET_OPKTS:
1213 		(void) ea_attach_item(record, &ns->ns_opackets,
1214 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OPKTS);
1215 		break;
1216 	case AC_NET_IERRPKTS:
1217 		(void) ea_attach_item(record, &ns->ns_ierrors,
1218 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IERRPKTS);
1219 		break;
1220 	case AC_NET_OERRPKTS:
1221 		(void) ea_attach_item(record, &ns->ns_oerrors,
1222 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OERRPKTS);
1223 		break;
1224 	default:
1225 		attached = 0;
1226 	}
1227 	return (attached);
1228 }
1229 
1230 static int
1231 exacct_attach_netdesc_item(net_desc_t *nd, ea_object_t *record, int res)
1232 {
1233 	int attached = 1;
1234 
1235 	switch (res) {
1236 	case AC_NET_NAME:
1237 		(void) ea_attach_item(record, nd->nd_name,
1238 		    strlen(nd->nd_name) + 1, EXT_STRING | EXD_NET_DESC_NAME);
1239 		break;
1240 	case AC_NET_DEVNAME:
1241 		(void) ea_attach_item(record, nd->nd_devname,
1242 		    strlen(nd->nd_devname) + 1, EXT_STRING |
1243 		    EXD_NET_DESC_DEVNAME);
1244 		break;
1245 	case AC_NET_EHOST:
1246 		(void) ea_attach_item(record, &nd->nd_ehost,
1247 		    sizeof (nd->nd_ehost), EXT_RAW | EXD_NET_DESC_EHOST);
1248 		break;
1249 	case AC_NET_EDEST:
1250 		(void) ea_attach_item(record, &nd->nd_edest,
1251 		    sizeof (nd->nd_edest), EXT_RAW | EXD_NET_DESC_EDEST);
1252 		break;
1253 	case AC_NET_VLAN_TPID:
1254 		(void) ea_attach_item(record, &nd->nd_vlan_tpid,
1255 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TPID);
1256 		break;
1257 	case AC_NET_VLAN_TCI:
1258 		(void) ea_attach_item(record, &nd->nd_vlan_tci,
1259 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TCI);
1260 		break;
1261 	case AC_NET_SAP:
1262 		(void) ea_attach_item(record, &nd->nd_sap,
1263 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_SAP);
1264 		break;
1265 	case AC_NET_PRIORITY:
1266 		(void) ea_attach_item(record, &nd->nd_priority,
1267 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_PRIORITY);
1268 		break;
1269 	case AC_NET_BWLIMIT:
1270 		(void) ea_attach_item(record, &nd->nd_bw_limit,
1271 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_DESC_BWLIMIT);
1272 		break;
1273 	case AC_NET_SADDR:
1274 		if (nd->nd_isv4) {
1275 			(void) ea_attach_item(record, &nd->nd_saddr[3],
1276 			    sizeof (uint32_t), EXT_UINT32 |
1277 			    EXD_NET_DESC_V4SADDR);
1278 		} else {
1279 			(void) ea_attach_item(record, &nd->nd_saddr,
1280 			    sizeof (nd->nd_saddr), EXT_RAW |
1281 			    EXD_NET_DESC_V6SADDR);
1282 		}
1283 		break;
1284 	case AC_NET_DADDR:
1285 		if (nd->nd_isv4) {
1286 			(void) ea_attach_item(record, &nd->nd_daddr[3],
1287 			    sizeof (uint32_t), EXT_UINT32 |
1288 			    EXD_NET_DESC_V4DADDR);
1289 		} else {
1290 			(void) ea_attach_item(record, &nd->nd_daddr,
1291 			    sizeof (nd->nd_daddr), EXT_RAW |
1292 			    EXD_NET_DESC_V6DADDR);
1293 		}
1294 		break;
1295 	case AC_NET_SPORT:
1296 		(void) ea_attach_item(record, &nd->nd_sport,
1297 		    sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_SPORT);
1298 		break;
1299 	case AC_NET_DPORT:
1300 		(void) ea_attach_item(record, &nd->nd_dport,
1301 		    sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_DPORT);
1302 		break;
1303 	case AC_NET_PROTOCOL:
1304 		(void) ea_attach_item(record, &nd->nd_protocol,
1305 		    sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_PROTOCOL);
1306 		break;
1307 	case AC_NET_DSFIELD:
1308 		(void) ea_attach_item(record, &nd->nd_dsfield,
1309 		    sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_DSFIELD);
1310 		break;
1311 	default:
1312 		attached = 0;
1313 	}
1314 	return (attached);
1315 }
1316 
1317 static ea_object_t *
1318 exacct_assemble_net_record(void *ninfo, ulong_t *mask, ea_catalog_t record_type,
1319     int what)
1320 {
1321 	int		res;
1322 	int		count;
1323 	ea_object_t	*record;
1324 
1325 	/*
1326 	 * Assemble usage values into group.
1327 	 */
1328 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1329 	for (res = 1, count = 0; res <= AC_NET_MAX_RES; res++)
1330 		if (BT_TEST(mask, res)) {
1331 			if (what == EX_NET_LNDESC_REC ||
1332 			    what == EX_NET_FLDESC_REC) {
1333 				count += exacct_attach_netdesc_item(
1334 				    (net_desc_t *)ninfo, record, res);
1335 			} else {
1336 				count += exacct_attach_netstat_item(
1337 				    (net_stat_t *)ninfo, record, res);
1338 			}
1339 		}
1340 	if (count == 0) {
1341 		ea_free_object(record, EUP_ALLOC);
1342 		record = NULL;
1343 	}
1344 	return (record);
1345 }
1346 
1347 int
1348 exacct_assemble_net_usage(ac_info_t *ac_net, void *ninfo,
1349     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1350     void *ubuf, size_t ubufsize, size_t *actual, int what)
1351 {
1352 	ulong_t		mask[AC_MASK_SZ];
1353 	ea_object_t	*net_desc;
1354 	ea_catalog_t	record_type;
1355 	void		*buf;
1356 	size_t		bufsize;
1357 	int		ret;
1358 
1359 	mutex_enter(&ac_net->ac_lock);
1360 	if (ac_net->ac_state == AC_OFF) {
1361 		mutex_exit(&ac_net->ac_lock);
1362 		return (ENOTACTIVE);
1363 	}
1364 	bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
1365 	mutex_exit(&ac_net->ac_lock);
1366 
1367 	switch (what) {
1368 	case EX_NET_LNDESC_REC:
1369 		record_type = EXD_GROUP_NET_LINK_DESC;
1370 		break;
1371 	case EX_NET_LNSTAT_REC:
1372 		record_type = EXD_GROUP_NET_LINK_STATS;
1373 		break;
1374 	case EX_NET_FLDESC_REC:
1375 		record_type = EXD_GROUP_NET_FLOW_DESC;
1376 		break;
1377 	case EX_NET_FLSTAT_REC:
1378 		record_type = EXD_GROUP_NET_FLOW_STATS;
1379 		break;
1380 	default:
1381 		return (0);
1382 	}
1383 
1384 	net_desc = exacct_assemble_net_record(ninfo, mask, record_type, what);
1385 	if (net_desc == NULL)
1386 		return (0);
1387 
1388 	/*
1389 	 * Pack object into buffer and pass to callback.
1390 	 */
1391 	bufsize = ea_pack_object(net_desc, NULL, 0);
1392 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1393 	if (buf == NULL)
1394 		return (ENOMEM);
1395 
1396 	(void) ea_pack_object(net_desc, buf, bufsize);
1397 
1398 	ret = callback(ac_net, ubuf, ubufsize, buf, bufsize, actual);
1399 
1400 	/*
1401 	 * Free all previously allocations.
1402 	 */
1403 	kmem_free(buf, bufsize);
1404 	ea_free_object(net_desc, EUP_ALLOC);
1405 	return (ret);
1406 }
1407 
1408 int
1409 exacct_commit_netinfo(void *arg, int what)
1410 {
1411 	size_t			size;
1412 	ulong_t			mask[AC_MASK_SZ];
1413 	struct exacct_globals	*acg;
1414 	ac_info_t		*ac_net;
1415 
1416 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1417 		/*
1418 		 * acctctl module not loaded. Nothing to do.
1419 		 */
1420 		return (ENOTACTIVE);
1421 	}
1422 
1423 	/*
1424 	 * Even though each zone nominally has its own flow accounting settings
1425 	 * (ac_flow), these are only maintained by and for the global zone.
1426 	 *
1427 	 * If this were to change in the future, this function should grow a
1428 	 * second zoneid (or zone) argument, and use the corresponding zone's
1429 	 * settings rather than always using those of the global zone.
1430 	 */
1431 	acg = zone_getspecific(exacct_zone_key, global_zone);
1432 	ac_net = &acg->ac_net;
1433 
1434 	mutex_enter(&ac_net->ac_lock);
1435 	if (ac_net->ac_state == AC_OFF) {
1436 		mutex_exit(&ac_net->ac_lock);
1437 		return (ENOTACTIVE);
1438 	}
1439 	bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
1440 	mutex_exit(&ac_net->ac_lock);
1441 
1442 	return (exacct_assemble_net_usage(ac_net, arg, exacct_commit_callback,
1443 	    NULL, 0, &size, what));
1444 }
1445 
1446 static int
1447 exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res)
1448 {
1449 	int attached = 1;
1450 
1451 	switch (res) {
1452 	case AC_FLOW_SADDR:
1453 		if (fu->fu_isv4) {
1454 			(void) ea_attach_item(record, &fu->fu_saddr[3],
1455 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4SADDR);
1456 		} else {
1457 			(void) ea_attach_item(record, &fu->fu_saddr,
1458 			    sizeof (fu->fu_saddr), EXT_RAW |
1459 			    EXD_FLOW_V6SADDR);
1460 		}
1461 		break;
1462 	case AC_FLOW_DADDR:
1463 		if (fu->fu_isv4) {
1464 			(void) ea_attach_item(record, &fu->fu_daddr[3],
1465 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4DADDR);
1466 		} else {
1467 			(void) ea_attach_item(record, &fu->fu_daddr,
1468 			    sizeof (fu->fu_daddr), EXT_RAW |
1469 			    EXD_FLOW_V6DADDR);
1470 		}
1471 		break;
1472 	case AC_FLOW_SPORT:
1473 		(void) ea_attach_item(record, &fu->fu_sport,
1474 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_SPORT);
1475 		break;
1476 	case AC_FLOW_DPORT:
1477 		(void) ea_attach_item(record, &fu->fu_dport,
1478 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_DPORT);
1479 		break;
1480 	case AC_FLOW_PROTOCOL:
1481 		(void) ea_attach_item(record, &fu->fu_protocol,
1482 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_PROTOCOL);
1483 		break;
1484 	case AC_FLOW_DSFIELD:
1485 		(void) ea_attach_item(record, &fu->fu_dsfield,
1486 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_DSFIELD);
1487 		break;
1488 	case AC_FLOW_CTIME:
1489 		(void) ea_attach_item(record, &fu->fu_ctime,
1490 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_CTIME);
1491 		break;
1492 	case AC_FLOW_LSEEN:
1493 		(void) ea_attach_item(record, &fu->fu_lseen,
1494 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_LSEEN);
1495 		break;
1496 	case AC_FLOW_NBYTES:
1497 		(void) ea_attach_item(record, &fu->fu_nbytes,
1498 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NBYTES);
1499 		break;
1500 	case AC_FLOW_NPKTS:
1501 		(void) ea_attach_item(record, &fu->fu_npackets,
1502 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NPKTS);
1503 		break;
1504 	case AC_FLOW_PROJID:
1505 		if (fu->fu_projid >= 0) {
1506 			(void) ea_attach_item(record, &fu->fu_projid,
1507 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_PROJID);
1508 		}
1509 		break;
1510 	case AC_FLOW_UID:
1511 		if (fu->fu_userid >= 0) {
1512 			(void) ea_attach_item(record, &fu->fu_userid,
1513 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID);
1514 		}
1515 		break;
1516 	case AC_FLOW_ANAME:
1517 		(void) ea_attach_item(record, fu->fu_aname,
1518 		    strlen(fu->fu_aname) + 1, EXT_STRING | EXD_FLOW_ANAME);
1519 		break;
1520 	default:
1521 		attached = 0;
1522 	}
1523 	return (attached);
1524 }
1525 
1526 static ea_object_t *
1527 exacct_assemble_flow_record(flow_usage_t *fu, ulong_t *mask,
1528     ea_catalog_t record_type)
1529 {
1530 	int res, count;
1531 	ea_object_t *record;
1532 
1533 	/*
1534 	 * Assemble usage values into group.
1535 	 */
1536 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1537 	for (res = 1, count = 0; res <= AC_FLOW_MAX_RES; res++)
1538 		if (BT_TEST(mask, res))
1539 			count += exacct_attach_flow_item(fu, record, res);
1540 	if (count == 0) {
1541 		ea_free_object(record, EUP_ALLOC);
1542 		record = NULL;
1543 	}
1544 	return (record);
1545 }
1546 
1547 int
1548 exacct_assemble_flow_usage(ac_info_t *ac_flow, flow_usage_t *fu,
1549     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1550     void *ubuf, size_t ubufsize, size_t *actual)
1551 {
1552 	ulong_t mask[AC_MASK_SZ];
1553 	ea_object_t *flow_usage;
1554 	ea_catalog_t record_type;
1555 	void *buf;
1556 	size_t bufsize;
1557 	int ret;
1558 
1559 	mutex_enter(&ac_flow->ac_lock);
1560 	if (ac_flow->ac_state == AC_OFF) {
1561 		mutex_exit(&ac_flow->ac_lock);
1562 		return (ENOTACTIVE);
1563 	}
1564 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1565 	mutex_exit(&ac_flow->ac_lock);
1566 
1567 	record_type = EXD_GROUP_FLOW;
1568 
1569 	flow_usage = exacct_assemble_flow_record(fu, mask, record_type);
1570 	if (flow_usage == NULL) {
1571 		return (0);
1572 	}
1573 
1574 	/*
1575 	 * Pack object into buffer and pass to callback.
1576 	 */
1577 	bufsize = ea_pack_object(flow_usage, NULL, 0);
1578 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1579 	if (buf == NULL) {
1580 		return (ENOMEM);
1581 	}
1582 
1583 	(void) ea_pack_object(flow_usage, buf, bufsize);
1584 
1585 	ret = callback(ac_flow, ubuf, ubufsize, buf, bufsize, actual);
1586 
1587 	/*
1588 	 * Free all previously allocations.
1589 	 */
1590 	kmem_free(buf, bufsize);
1591 	ea_free_object(flow_usage, EUP_ALLOC);
1592 	return (ret);
1593 }
1594 
1595 void
1596 exacct_commit_flow(void *arg)
1597 {
1598 	flow_usage_t *f = (flow_usage_t *)arg;
1599 	size_t size;
1600 	ulong_t mask[AC_MASK_SZ];
1601 	struct exacct_globals *acg;
1602 	ac_info_t *ac_flow;
1603 
1604 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1605 		/*
1606 		 * acctctl module not loaded. Nothing to do.
1607 		 */
1608 		return;
1609 	}
1610 
1611 	/*
1612 	 * Even though each zone nominally has its own flow accounting settings
1613 	 * (ac_flow), these are only maintained by and for the global zone.
1614 	 *
1615 	 * If this were to change in the future, this function should grow a
1616 	 * second zoneid (or zone) argument, and use the corresponding zone's
1617 	 * settings rather than always using those of the global zone.
1618 	 */
1619 	acg = zone_getspecific(exacct_zone_key, global_zone);
1620 	ac_flow = &acg->ac_flow;
1621 
1622 	mutex_enter(&ac_flow->ac_lock);
1623 	if (ac_flow->ac_state == AC_OFF) {
1624 		mutex_exit(&ac_flow->ac_lock);
1625 		return;
1626 	}
1627 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1628 	mutex_exit(&ac_flow->ac_lock);
1629 
1630 	(void) exacct_assemble_flow_usage(ac_flow, f, exacct_commit_callback,
1631 	    NULL, 0, &size);
1632 }
1633 
1634 /*
1635  * int exacct_tag_task(task_t *, void *, size_t, int)
1636  *
1637  * Overview
1638  *   exacct_tag_task() provides the exacct record construction and writing
1639  *   support required by putacct(2) for task entities.
1640  *
1641  * Return values
1642  *   The result of the write operation is returned, unless the extended
1643  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1644  *
1645  * Caller's context
1646  *   Suitable for KM_SLEEP allocations.
1647  */
1648 int
1649 exacct_tag_task(ac_info_t *ac_task, task_t *tk, void *ubuf, size_t ubufsz,
1650     int flags)
1651 {
1652 	int error = 0;
1653 	void *buf;
1654 	size_t bufsize;
1655 	ea_catalog_t cat;
1656 	ea_object_t *tag;
1657 
1658 	mutex_enter(&ac_task->ac_lock);
1659 	if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) {
1660 		mutex_exit(&ac_task->ac_lock);
1661 		return (ENOTACTIVE);
1662 	}
1663 	mutex_exit(&ac_task->ac_lock);
1664 
1665 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_TASK_TAG);
1666 	(void) ea_attach_item(tag, &tk->tk_tkid, 0,
1667 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1668 	(void) ea_attach_item(tag, tk->tk_zone->zone_nodename, 0,
1669 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1670 	if (flags == EP_RAW)
1671 		cat = EXT_RAW | EXC_DEFAULT | EXD_TASK_TAG;
1672 	else
1673 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_TASK_TAG;
1674 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1675 
1676 	bufsize = ea_pack_object(tag, NULL, 0);
1677 	buf = kmem_alloc(bufsize, KM_SLEEP);
1678 	(void) ea_pack_object(tag, buf, bufsize);
1679 	error = exacct_vn_write(ac_task, buf, bufsize);
1680 	kmem_free(buf, bufsize);
1681 	ea_free_object(tag, EUP_ALLOC);
1682 	return (error);
1683 }
1684 
1685 /*
1686  * exacct_tag_proc(pid_t, taskid_t, void *, size_t, int, char *)
1687  *
1688  * Overview
1689  *   exacct_tag_proc() provides the exacct record construction and writing
1690  *   support required by putacct(2) for processes.
1691  *
1692  * Return values
1693  *   The result of the write operation is returned, unless the extended
1694  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1695  *
1696  * Caller's context
1697  *   Suitable for KM_SLEEP allocations.
1698  */
1699 int
1700 exacct_tag_proc(ac_info_t *ac_proc, pid_t pid, taskid_t tkid, void *ubuf,
1701     size_t ubufsz, int flags, const char *hostname)
1702 {
1703 	int error = 0;
1704 	void *buf;
1705 	size_t bufsize;
1706 	ea_catalog_t cat;
1707 	ea_object_t *tag;
1708 
1709 	mutex_enter(&ac_proc->ac_lock);
1710 	if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) {
1711 		mutex_exit(&ac_proc->ac_lock);
1712 		return (ENOTACTIVE);
1713 	}
1714 	mutex_exit(&ac_proc->ac_lock);
1715 
1716 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_PROC_TAG);
1717 	(void) ea_attach_item(tag, &pid, sizeof (uint32_t),
1718 	    EXT_UINT32 | EXC_DEFAULT | EXD_PROC_PID);
1719 	(void) ea_attach_item(tag, &tkid, 0,
1720 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1721 	(void) ea_attach_item(tag, (void *)hostname, 0,
1722 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1723 	if (flags == EP_RAW)
1724 		cat = EXT_RAW | EXC_DEFAULT | EXD_PROC_TAG;
1725 	else
1726 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_PROC_TAG;
1727 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1728 
1729 	bufsize = ea_pack_object(tag, NULL, 0);
1730 	buf = kmem_alloc(bufsize, KM_SLEEP);
1731 	(void) ea_pack_object(tag, buf, bufsize);
1732 	error = exacct_vn_write(ac_proc, buf, bufsize);
1733 	kmem_free(buf, bufsize);
1734 	ea_free_object(tag, EUP_ALLOC);
1735 	return (error);
1736 }
1737 
1738 /*
1739  * void exacct_init(void)
1740  *
1741  * Overview
1742  *   Initialized the extended accounting subsystem.
1743  *
1744  * Return values
1745  *   None.
1746  *
1747  * Caller's context
1748  *   Suitable for KM_SLEEP allocations.
1749  */
1750 void
1751 exacct_init()
1752 {
1753 	exacct_queue = system_taskq;
1754 	exacct_object_cache = kmem_cache_create("exacct_object_cache",
1755 	    sizeof (ea_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1756 	task_commit_thread_init();
1757 }
1758 
1759 /*
1760  * exacct_snapshot_proc_mstate() copies a process's microstate accounting data
1761  * and resource usage counters into a given task_usage_t. It differs from
1762  * exacct_copy_proc_mstate() in that here a) we are copying to a task_usage_t,
1763  * b) p_lock will have been acquired earlier in the call path and c) we
1764  * are here including the process's user and system times.
1765  */
1766 static void
1767 exacct_snapshot_proc_mstate(proc_t *p, task_usage_t *tu)
1768 {
1769 	tu->tu_utime  = mstate_aggr_state(p, LMS_USER);
1770 	tu->tu_stime  = mstate_aggr_state(p, LMS_SYSTEM);
1771 	tu->tu_minflt = p->p_ru.minflt;
1772 	tu->tu_majflt = p->p_ru.majflt;
1773 	tu->tu_sndmsg = p->p_ru.msgsnd;
1774 	tu->tu_rcvmsg = p->p_ru.msgrcv;
1775 	tu->tu_ioch   = p->p_ru.ioch;
1776 	tu->tu_iblk   = p->p_ru.inblock;
1777 	tu->tu_oblk   = p->p_ru.oublock;
1778 	tu->tu_vcsw   = p->p_ru.nvcsw;
1779 	tu->tu_icsw   = p->p_ru.nivcsw;
1780 	tu->tu_nsig   = p->p_ru.nsignals;
1781 	tu->tu_nswp   = p->p_ru.nswap;
1782 	tu->tu_nscl   = p->p_ru.sysc;
1783 }
1784 
1785 /*
1786  * void exacct_move_mstate(proc_t *, task_t *, task_t *)
1787  *
1788  * Overview
1789  *   exacct_move_mstate() is called by task_change() and accounts for
1790  *   a process's resource usage when it is moved from one task to another.
1791  *
1792  *   The process's usage at this point is recorded in the new task so
1793  *   that it can be excluded from the calculation of resources consumed
1794  *   by that task.
1795  *
1796  *   The resource usage inherited by the new task is also added to the
1797  *   aggregate maintained by the old task for processes that have exited.
1798  *
1799  * Return values
1800  *   None.
1801  *
1802  * Caller's context
1803  *   pidlock and p_lock held across exacct_move_mstate().
1804  */
1805 void
1806 exacct_move_mstate(proc_t *p, task_t *oldtk, task_t *newtk)
1807 {
1808 	task_usage_t tu;
1809 
1810 	/* Take a snapshot of this process's mstate and RU counters */
1811 	exacct_snapshot_proc_mstate(p, &tu);
1812 
1813 	/*
1814 	 * Use the snapshot to increment the aggregate usage of the old
1815 	 * task, and the inherited usage of the new one.
1816 	 */
1817 	mutex_enter(&oldtk->tk_usage_lock);
1818 	exacct_add_task_mstate(oldtk->tk_usage, &tu);
1819 	mutex_exit(&oldtk->tk_usage_lock);
1820 	mutex_enter(&newtk->tk_usage_lock);
1821 	exacct_add_task_mstate(newtk->tk_inherited, &tu);
1822 	mutex_exit(&newtk->tk_usage_lock);
1823 }
1824