xref: /titanic_51/usr/src/uts/common/os/exacct.c (revision 67dbe2be0c0f1e2eb428b89088bb5667e8f0b9f6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/exacct.h>
27 #include <sys/exacct_catalog.h>
28 #include <sys/disp.h>
29 #include <sys/task.h>
30 #include <sys/proc.h>
31 #include <sys/cmn_err.h>
32 #include <sys/kmem.h>
33 #include <sys/project.h>
34 #include <sys/systm.h>
35 #include <sys/vnode.h>
36 #include <sys/file.h>
37 #include <sys/acctctl.h>
38 #include <sys/time.h>
39 #include <sys/utsname.h>
40 #include <sys/session.h>
41 #include <sys/sysmacros.h>
42 #include <sys/bitmap.h>
43 #include <sys/msacct.h>
44 
45 /*
46  * exacct usage and recording routines
47  *
48  * wracct(2), getacct(2), and the records written at process or task
49  * termination are constructed using the exacct_assemble_[task,proc]_usage()
50  * functions, which take a callback that takes the appropriate action on
51  * the packed exacct record for the task or process.  For the process-related
52  * actions, we partition the routines such that the data collecting component
53  * can be performed while holding p_lock, and all sleeping or blocking
54  * operations can be performed without acquiring p_lock.
55  *
56  * putacct(2), which allows an application to construct a customized record
57  * associated with an existing process or task, has its own entry points:
58  * exacct_tag_task() and exacct_tag_proc().
59  */
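
/*
 * As an illustrative sketch (not a new code path), the partition described
 * above is the sequence used by exacct_do_commit_proc() below: gather the
 * data under p_lock, then pack and write the record with no process locks
 * held.
 *
 *	mutex_enter(&p->p_lock);
 *	exacct_calculate_proc_usage(p, pu, mask, EW_FINAL, wstat);
 *	mutex_exit(&p->p_lock);
 *
 *	(void) exacct_assemble_proc_usage(ac_proc, pu,
 *	    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
 */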
60 
61 taskq_t *exacct_queue;
62 kmem_cache_t *exacct_object_cache;
63 
64 zone_key_t exacct_zone_key = ZONE_KEY_UNINITIALIZED;
65 
66 static const uint32_t exacct_version = EXACCT_VERSION;
67 static const char exacct_header[] = "exacct";
68 static const char exacct_creator[] = "SunOS";
69 
70 ea_object_t *
71 ea_alloc_item(ea_catalog_t catalog, void *buf, size_t bufsz)
72 {
73 	ea_object_t *item;
74 
75 	item = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
76 	bzero(item, sizeof (ea_object_t));
77 	(void) ea_set_item(item, catalog, buf, bufsz);
78 	return (item);
79 }
80 
81 ea_object_t *
82 ea_alloc_group(ea_catalog_t catalog)
83 {
84 	ea_object_t *group;
85 
86 	group = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
87 	bzero(group, sizeof (ea_object_t));
88 	(void) ea_set_group(group, catalog);
89 	return (group);
90 }
91 
92 ea_object_t *
93 ea_attach_item(ea_object_t *grp, void *buf, size_t bufsz, ea_catalog_t catalog)
94 {
95 	ea_object_t *item;
96 
97 	item = ea_alloc_item(catalog, buf, bufsz);
98 	(void) ea_attach_to_group(grp, item);
99 	return (item);
100 }
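
/*
 * Illustrative sketch only: the helpers above are typically combined with
 * ea_pack_object(), as shown below.  The catalog tags and value are examples
 * chosen from this file, not a record the kernel actually emits here.
 *
 *	ea_object_t *grp;
 *	uint64_t sec = 0;
 *	void *buf;
 *	size_t bufsize;
 *
 *	grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_TASK);
 *	(void) ea_attach_item(grp, &sec, sizeof (uint64_t),
 *	    EXT_UINT64 | EXD_TASK_CPU_SYS_SEC);
 *	bufsize = ea_pack_object(grp, NULL, 0);
 *	buf = kmem_alloc(bufsize, KM_SLEEP);
 *	(void) ea_pack_object(grp, buf, bufsize);
 *	...
 *	kmem_free(buf, bufsize);
 *	ea_free_object(grp, EUP_ALLOC);
 */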
101 
102 /*
103  * exacct_add_task_mstate() and exacct_sub_task_mstate() add and subtract
104  * microstate accounting data and resource usage counters from one task_usage_t
105  * from those supplied in another. These functions do not operate on *all*
106  * members of a task_usage_t: for some (e.g. tu_anctaskid) it would not make
107  * sense.
108  */
109 static void
110 exacct_add_task_mstate(task_usage_t *tu, task_usage_t *delta)
111 {
112 	tu->tu_utime  += delta->tu_utime;
113 	tu->tu_stime  += delta->tu_stime;
114 	tu->tu_minflt += delta->tu_minflt;
115 	tu->tu_majflt += delta->tu_majflt;
116 	tu->tu_sndmsg += delta->tu_sndmsg;
117 	tu->tu_rcvmsg += delta->tu_rcvmsg;
118 	tu->tu_ioch   += delta->tu_ioch;
119 	tu->tu_iblk   += delta->tu_iblk;
120 	tu->tu_oblk   += delta->tu_oblk;
121 	tu->tu_vcsw   += delta->tu_vcsw;
122 	tu->tu_icsw   += delta->tu_icsw;
123 	tu->tu_nsig   += delta->tu_nsig;
124 	tu->tu_nswp   += delta->tu_nswp;
125 	tu->tu_nscl   += delta->tu_nscl;
126 }
127 
128 /*
129  * See the comments for exacct_add_task_mstate(), above.
130  */
131 static void
132 exacct_sub_task_mstate(task_usage_t *tu, task_usage_t *delta)
133 {
134 	tu->tu_utime  -= delta->tu_utime;
135 	tu->tu_stime  -= delta->tu_stime;
136 	tu->tu_minflt -= delta->tu_minflt;
137 	tu->tu_majflt -= delta->tu_majflt;
138 	tu->tu_sndmsg -= delta->tu_sndmsg;
139 	tu->tu_rcvmsg -= delta->tu_rcvmsg;
140 	tu->tu_ioch   -= delta->tu_ioch;
141 	tu->tu_iblk   -= delta->tu_iblk;
142 	tu->tu_oblk   -= delta->tu_oblk;
143 	tu->tu_vcsw   -= delta->tu_vcsw;
144 	tu->tu_icsw   -= delta->tu_icsw;
145 	tu->tu_nsig   -= delta->tu_nsig;
146 	tu->tu_nswp   -= delta->tu_nswp;
147 	tu->tu_nscl   -= delta->tu_nscl;
148 }
149 
150 /*
151  * Wrapper for vn_rdwr() used by exacct_vn_write() and exacct_write_header()
152  * to write to the accounting file without corrupting it in case of an I/O or
153  * filesystem error.
154  */
155 static int
156 exacct_vn_write_impl(ac_info_t *info, void *buf, ssize_t bufsize)
157 {
158 	int error;
159 	ssize_t resid;
160 	struct vattr va;
161 
162 	ASSERT(info != NULL);
163 	ASSERT(info->ac_vnode != NULL);
164 	ASSERT(MUTEX_HELD(&info->ac_lock));
165 
166 	/*
167 	 * Save the size. If vn_rdwr fails, reset the size to avoid corrupting
168 	 * the present accounting file.
169 	 */
170 	va.va_mask = AT_SIZE;
171 	error = VOP_GETATTR(info->ac_vnode, &va, 0, kcred, NULL);
172 	if (error == 0) {
173 		error = vn_rdwr(UIO_WRITE, info->ac_vnode, (caddr_t)buf,
174 		    bufsize, 0LL, UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFFSET_T,
175 		    kcred, &resid);
176 		if (error) {
177 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
178 		} else if (resid != 0) {
179 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
180 			error = ENOSPC;
181 		}
182 	}
183 	return (error);
184 }
185 
186 /*
187  * exacct_vn_write() safely writes to an accounting file.  acctctl() prevents
188  * the two accounting vnodes from being equal, and the appropriate ac_lock is
189  * held across the call, so we're single threaded through this code for each
190  * file.
191  */
192 static int
193 exacct_vn_write(ac_info_t *info, void *buf, ssize_t bufsize)
194 {
195 	int error;
196 
197 	if (info == NULL)
198 		return (0);
199 
200 	mutex_enter(&info->ac_lock);
201 
202 	/*
203 	 * Don't do anything unless accounting file is set.
204 	 */
205 	if (info->ac_vnode == NULL) {
206 		mutex_exit(&info->ac_lock);
207 		return (0);
208 	}
209 	error = exacct_vn_write_impl(info, buf, bufsize);
210 	mutex_exit(&info->ac_lock);
211 
212 	return (error);
213 }
214 
215 /*
216  * void *exacct_create_header(size_t *)
217  *
218  * Overview
219  *   exacct_create_header() constructs an exacct file header identifying the
220  *   accounting file as the output of the kernel.  exacct_create_header() and
221  *   the static write_header() and verify_header() routines in libexacct must
222  *   remain synchronized.
223  *
224  * Return values
225  *   A pointer to a packed exacct buffer containing the appropriate header is
226  *   returned; the size of the buffer is placed in the location indicated by
227  *   sizep.
228  *
229  * Caller's context
230  *   Suitable for KM_SLEEP allocations.
231  */
232 void *
233 exacct_create_header(size_t *sizep)
234 {
235 	ea_object_t *hdr_grp;
236 	uint32_t bskip;
237 	void *buf;
238 	size_t bufsize;
239 
240 	hdr_grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_HEADER);
241 	(void) ea_attach_item(hdr_grp, (void *)&exacct_version, 0,
242 	    EXT_UINT32 | EXC_DEFAULT | EXD_VERSION);
243 	(void) ea_attach_item(hdr_grp, (void *)exacct_header, 0,
244 	    EXT_STRING | EXC_DEFAULT | EXD_FILETYPE);
245 	(void) ea_attach_item(hdr_grp, (void *)exacct_creator, 0,
246 	    EXT_STRING | EXC_DEFAULT | EXD_CREATOR);
247 	(void) ea_attach_item(hdr_grp, uts_nodename(), 0,
248 	    EXT_STRING | EXC_DEFAULT | EXD_HOSTNAME);
249 
250 	bufsize = ea_pack_object(hdr_grp, NULL, 0);
251 	buf = kmem_alloc(bufsize, KM_SLEEP);
252 	(void) ea_pack_object(hdr_grp, buf, bufsize);
253 	ea_free_object(hdr_grp, EUP_ALLOC);
254 
255 	/*
256 	 * To prevent reading the header when reading the file backwards,
257 	 * set the large backskip of the header group to 0 (last 4 bytes).
258 	 */
259 	bskip = 0;
260 	exacct_order32(&bskip);
261 	bcopy(&bskip, (char *)buf + bufsize - sizeof (bskip),
262 	    sizeof (bskip));
263 
264 	*sizep = bufsize;
265 	return (buf);
266 }
267 
268 /*
269  * int exacct_write_header(ac_info_t *, void *, size_t)
270  *
271  * Overview
272  *   exacct_write_header() writes the given header buffer to the indicated
273  *   vnode.
274  *
275  * Return values
276  *   The result of the write operation is returned.
277  *
278  * Caller's context
279  *   Caller must hold the ac_lock of the appropriate accounting file
280  *   information block (ac_info_t).
281  */
282 int
283 exacct_write_header(ac_info_t *info, void *hdr, size_t hdrsize)
284 {
285 	if (info != NULL && info->ac_vnode != NULL)
286 		return (exacct_vn_write_impl(info, hdr, hdrsize));
287 
288 	return (0);
289 }
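
/*
 * Illustrative sketch only ("info" stands for whichever ac_info_t the caller
 * manages, e.g. from acctctl when a new accounting file is installed):
 *
 *	size_t hdrsize;
 *	void *hdr = exacct_create_header(&hdrsize);
 *
 *	mutex_enter(&info->ac_lock);
 *	error = exacct_write_header(info, hdr, hdrsize);
 *	mutex_exit(&info->ac_lock);
 *	kmem_free(hdr, hdrsize);
 */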
290 
291 static void
292 exacct_get_interval_task_usage(task_t *tk, task_usage_t *tu,
293     task_usage_t **tu_buf)
294 {
295 	task_usage_t *oldtu, *newtu;
296 	task_usage_t **prevusage;
297 
298 	ASSERT(MUTEX_HELD(&tk->tk_usage_lock));
299 	if (getzoneid() != GLOBAL_ZONEID) {
300 		prevusage = &tk->tk_zoneusage;
301 	} else {
302 		prevusage = &tk->tk_prevusage;
303 	}
304 	if ((oldtu = *prevusage) != NULL) {
305 		/*
306 		 * We have accounting information saved from the previous
307 		 * interval record; compute this interval's deltas against it.
308 		 */
309 		newtu = *tu_buf;
310 		bcopy(tu, newtu, sizeof (task_usage_t));
311 		tu->tu_minflt	-= oldtu->tu_minflt;
312 		tu->tu_majflt	-= oldtu->tu_majflt;
313 		tu->tu_sndmsg	-= oldtu->tu_sndmsg;
314 		tu->tu_rcvmsg	-= oldtu->tu_rcvmsg;
315 		tu->tu_ioch	-= oldtu->tu_ioch;
316 		tu->tu_iblk	-= oldtu->tu_iblk;
317 		tu->tu_oblk	-= oldtu->tu_oblk;
318 		tu->tu_vcsw	-= oldtu->tu_vcsw;
319 		tu->tu_icsw	-= oldtu->tu_icsw;
320 		tu->tu_nsig	-= oldtu->tu_nsig;
321 		tu->tu_nswp	-= oldtu->tu_nswp;
322 		tu->tu_nscl	-= oldtu->tu_nscl;
323 		tu->tu_utime	-= oldtu->tu_utime;
324 		tu->tu_stime	-= oldtu->tu_stime;
325 
326 		tu->tu_startsec = oldtu->tu_finishsec;
327 		tu->tu_startnsec = oldtu->tu_finishnsec;
328 		/*
329 		 * Copy the data from our temporary storage to the task's
330 		 * previous interval usage structure for future reference.
331 		 */
332 		bcopy(newtu, oldtu, sizeof (task_usage_t));
333 	} else {
334 		/*
335 		 * Store current statistics in the task's previous interval
336 		 * usage structure for future reference.
337 		 */
338 		*prevusage = *tu_buf;
339 		bcopy(tu, *prevusage, sizeof (task_usage_t));
340 		*tu_buf = NULL;
341 	}
342 }
343 
344 static void
345 exacct_snapshot_task_usage(task_t *tk, task_usage_t *tu)
346 {
347 	timestruc_t ts;
348 	proc_t *p;
349 
350 	ASSERT(MUTEX_HELD(&pidlock));
351 
352 	if ((p = tk->tk_memb_list) == NULL)
353 		return;
354 
355 	/*
356 	 * exacct_snapshot_task_usage() provides an approximate snapshot of the
357 	 * usage of the potentially many members of the task.  Since we don't
358 	 * guarantee exactness, we read the resource usage counters without each
359 	 * member's p_lock; p_lock is held only for the microstate aggregation.
360 	 */
361 	do {
362 		mutex_enter(&p->p_lock);
363 		tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
364 		tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
365 		mutex_exit(&p->p_lock);
366 		tu->tu_minflt	+= p->p_ru.minflt;
367 		tu->tu_majflt	+= p->p_ru.majflt;
368 		tu->tu_sndmsg	+= p->p_ru.msgsnd;
369 		tu->tu_rcvmsg	+= p->p_ru.msgrcv;
370 		tu->tu_ioch	+= p->p_ru.ioch;
371 		tu->tu_iblk	+= p->p_ru.inblock;
372 		tu->tu_oblk	+= p->p_ru.oublock;
373 		tu->tu_vcsw	+= p->p_ru.nvcsw;
374 		tu->tu_icsw	+= p->p_ru.nivcsw;
375 		tu->tu_nsig	+= p->p_ru.nsignals;
376 		tu->tu_nswp	+= p->p_ru.nswap;
377 		tu->tu_nscl	+= p->p_ru.sysc;
378 	} while ((p = p->p_tasknext) != tk->tk_memb_list);
379 
380 	/*
381 	 * The resource usage accounted for so far will include that
382 	 * contributed by the task's first process. If this process
383 	 * came from another task, then its accumulated resource usage
384 	 * will include a contribution from work performed there.
385 	 * We must therefore subtract any resource usage that was
386 	 * inherited with the first process.
387 	 */
388 	exacct_sub_task_mstate(tu, tk->tk_inherited);
389 
390 	gethrestime(&ts);
391 	tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
392 	tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
393 }
394 
395 /*
396  * void exacct_update_task_mstate(proc_t *)
397  *
398  * Overview
399  *   exacct_update_task_mstate() folds a process's resource usage into its
400  *   task's cumulative usage; it is intended to be called from proc_exit().
401  *
402  * Return values
403  *   None.
404  *
405  * Caller's context
406  *   p_lock must be held at entry.
407  */
408 void
409 exacct_update_task_mstate(proc_t *p)
410 {
411 	task_usage_t *tu;
412 
413 	mutex_enter(&p->p_task->tk_usage_lock);
414 	tu = p->p_task->tk_usage;
415 	tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
416 	tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
417 	tu->tu_minflt	+= p->p_ru.minflt;
418 	tu->tu_majflt	+= p->p_ru.majflt;
419 	tu->tu_sndmsg	+= p->p_ru.msgsnd;
420 	tu->tu_rcvmsg	+= p->p_ru.msgrcv;
421 	tu->tu_ioch	+= p->p_ru.ioch;
422 	tu->tu_iblk	+= p->p_ru.inblock;
423 	tu->tu_oblk	+= p->p_ru.oublock;
424 	tu->tu_vcsw	+= p->p_ru.nvcsw;
425 	tu->tu_icsw	+= p->p_ru.nivcsw;
426 	tu->tu_nsig	+= p->p_ru.nsignals;
427 	tu->tu_nswp	+= p->p_ru.nswap;
428 	tu->tu_nscl	+= p->p_ru.sysc;
429 	mutex_exit(&p->p_task->tk_usage_lock);
430 }
431 
432 static void
433 exacct_calculate_task_usage(task_t *tk, task_usage_t *tu, int flag)
434 {
435 	timestruc_t ts;
436 	task_usage_t *tu_buf;
437 
438 	switch (flag) {
439 	case EW_PARTIAL:
440 		/*
441 		 * For partial records we must report the sum of the current
442 		 * accounting statistics and the previously accumulated
443 		 * statistics.
444 		 */
445 		mutex_enter(&pidlock);
446 		mutex_enter(&tk->tk_usage_lock);
447 
448 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
449 		exacct_snapshot_task_usage(tk, tu);
450 
451 		mutex_exit(&tk->tk_usage_lock);
452 		mutex_exit(&pidlock);
453 		break;
454 	case EW_INTERVAL:
455 		/*
456 		 * We need to allocate a spare task_usage_t buffer before
457 		 * grabbing pidlock because we might need it later in
458 		 * exacct_get_interval_task_usage().
459 		 */
460 		tu_buf = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
461 		mutex_enter(&pidlock);
462 		mutex_enter(&tk->tk_usage_lock);
463 
464 		/*
465 		 * For interval records, we deduct the previous microstate
466 		 * accounting data and cpu usage times from previously saved
467 		 * results and update the previous task usage structure.
468 		 */
469 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
470 		exacct_snapshot_task_usage(tk, tu);
471 		exacct_get_interval_task_usage(tk, tu, &tu_buf);
472 
473 		mutex_exit(&tk->tk_usage_lock);
474 		mutex_exit(&pidlock);
475 
476 		if (tu_buf != NULL)
477 			kmem_free(tu_buf, sizeof (task_usage_t));
478 		break;
479 	case EW_FINAL:
480 		/*
481 		 * For final records, we deduct, from the task's current
482 		 * usage, any usage that was inherited with the arrival
483 		 * of a process from a previous task. We then record
484 		 * the task's finish time.
485 		 */
486 		mutex_enter(&tk->tk_usage_lock);
487 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
488 		exacct_sub_task_mstate(tu, tk->tk_inherited);
489 		mutex_exit(&tk->tk_usage_lock);
490 
491 		gethrestime(&ts);
492 		tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
493 		tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
494 
495 		break;
496 	}
497 }
498 
499 static int
500 exacct_attach_task_item(task_t *tk, task_usage_t *tu, ea_object_t *record,
501     int res)
502 {
503 	int attached = 1;
504 
505 	switch (res) {
506 	case AC_TASK_TASKID:
507 		(void) ea_attach_item(record, &tk->tk_tkid,
508 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_TASKID);
509 		break;
510 	case AC_TASK_PROJID:
511 		(void) ea_attach_item(record, &tk->tk_proj->kpj_id,
512 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_PROJID);
513 		break;
514 	case AC_TASK_CPU: {
515 			timestruc_t ts;
516 			uint64_t ui;
517 
518 			hrt2ts(tu->tu_stime, &ts);
519 			ui = ts.tv_sec;
520 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
521 			    EXT_UINT64 | EXD_TASK_CPU_SYS_SEC);
522 			ui = ts.tv_nsec;
523 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
524 			    EXT_UINT64 | EXD_TASK_CPU_SYS_NSEC);
525 
526 			hrt2ts(tu->tu_utime, &ts);
527 			ui = ts.tv_sec;
528 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
529 			    EXT_UINT64 | EXD_TASK_CPU_USER_SEC);
530 			ui = ts.tv_nsec;
531 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
532 			    EXT_UINT64 | EXD_TASK_CPU_USER_NSEC);
533 		}
534 		break;
535 	case AC_TASK_TIME:
536 		(void) ea_attach_item(record, &tu->tu_startsec,
537 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_SEC);
538 		(void) ea_attach_item(record, &tu->tu_startnsec,
539 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_NSEC);
540 		(void) ea_attach_item(record, &tu->tu_finishsec,
541 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_SEC);
542 		(void) ea_attach_item(record, &tu->tu_finishnsec,
543 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_NSEC);
544 		break;
545 	case AC_TASK_HOSTNAME:
546 		(void) ea_attach_item(record, tk->tk_zone->zone_nodename,
547 		    strlen(tk->tk_zone->zone_nodename) + 1,
548 		    EXT_STRING | EXD_TASK_HOSTNAME);
549 			break;
550 	case AC_TASK_MICROSTATE:
551 		(void) ea_attach_item(record, &tu->tu_majflt,
552 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MAJOR);
553 		(void) ea_attach_item(record, &tu->tu_minflt,
554 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MINOR);
555 		(void) ea_attach_item(record, &tu->tu_sndmsg,
556 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_SND);
557 		(void) ea_attach_item(record, &tu->tu_rcvmsg,
558 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_RCV);
559 		(void) ea_attach_item(record, &tu->tu_iblk,
560 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_IN);
561 		(void) ea_attach_item(record, &tu->tu_oblk,
562 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_OUT);
563 		(void) ea_attach_item(record, &tu->tu_ioch,
564 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CHARS_RDWR);
565 		(void) ea_attach_item(record, &tu->tu_vcsw,
566 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_VOL);
567 		(void) ea_attach_item(record, &tu->tu_icsw,
568 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_INV);
569 		(void) ea_attach_item(record, &tu->tu_nsig,
570 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SIGNALS);
571 		(void) ea_attach_item(record, &tu->tu_nswp,
572 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SWAPS);
573 		(void) ea_attach_item(record, &tu->tu_nscl,
574 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SYSCALLS);
575 		break;
576 	case AC_TASK_ANCTASKID:
577 		(void) ea_attach_item(record, &tu->tu_anctaskid,
578 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_ANCTASKID);
579 		break;
580 	case AC_TASK_ZONENAME:
581 		(void) ea_attach_item(record, tk->tk_zone->zone_name,
582 		    strlen(tk->tk_zone->zone_name) + 1,
583 		    EXT_STRING | EXD_TASK_ZONENAME);
584 		break;
585 	default:
586 		attached = 0;
587 	}
588 	return (attached);
589 }
590 
591 static ea_object_t *
592 exacct_assemble_task_record(task_t *tk, task_usage_t *tu, ulong_t *mask,
593     ea_catalog_t record_type)
594 {
595 	int res, count;
596 	ea_object_t *record;
597 
598 	/*
599 	 * Assemble usage values into group.
600 	 */
601 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
602 	for (res = 1, count = 0; res <= AC_TASK_MAX_RES; res++)
603 		if (BT_TEST(mask, res))
604 			count += exacct_attach_task_item(tk, tu, record, res);
605 	if (count == 0) {
606 		ea_free_object(record, EUP_ALLOC);
607 		record = NULL;
608 	}
609 	return (record);
610 }
611 
612 /*
613  * int exacct_assemble_task_usage(ac_info_t *, task_t *, int (*)(ac_info_t *,
614  *	void *, size_t, void *, size_t, size_t *), void *, size_t, size_t *, int)
615  *
616  * Overview
617  *   exacct_assemble_task_usage() builds the packed exacct buffer for the
618  *   indicated task, executes the given callback function, and free the packed
619  *   buffer.
620  *
621  * Return values
622  *   Returns 0 on success; otherwise the appropriate error code is returned.
623  *
624  * Caller's context
625  *   Suitable for KM_SLEEP allocations.
626  */
627 int
628 exacct_assemble_task_usage(ac_info_t *ac_task, task_t *tk,
629     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
630     void *ubuf, size_t ubufsize, size_t *actual, int flag)
631 {
632 	ulong_t mask[AC_MASK_SZ];
633 	ea_object_t *task_record;
634 	ea_catalog_t record_type;
635 	task_usage_t *tu;
636 	void *buf;
637 	size_t bufsize;
638 	int ret;
639 
640 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL || flag == EW_INTERVAL);
641 
642 	mutex_enter(&ac_task->ac_lock);
643 	if (ac_task->ac_state == AC_OFF) {
644 		mutex_exit(&ac_task->ac_lock);
645 		return (ENOTACTIVE);
646 	}
647 	bt_copy(ac_task->ac_mask, mask, AC_MASK_SZ);
648 	mutex_exit(&ac_task->ac_lock);
649 
650 	switch (flag) {
651 	case EW_FINAL:
652 		record_type = EXD_GROUP_TASK;
653 		break;
654 	case EW_PARTIAL:
655 		record_type = EXD_GROUP_TASK_PARTIAL;
656 		break;
657 	case EW_INTERVAL:
658 		record_type = EXD_GROUP_TASK_INTERVAL;
659 		break;
660 	}
661 
662 	/*
663 	 * Calculate task usage and assemble it into the task record.
664 	 */
665 	tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
666 	exacct_calculate_task_usage(tk, tu, flag);
667 	task_record = exacct_assemble_task_record(tk, tu, mask, record_type);
668 	if (task_record == NULL) {
669 		/*
670 		 * The current configuration of the accounting system has
671 		 * resulted in records with no data; accordingly, we don't write
672 		 * these, but we return success.
673 		 */
674 		kmem_free(tu, sizeof (task_usage_t));
675 		return (0);
676 	}
677 
678 	/*
679 	 * Pack object into buffer and run callback on it.
680 	 */
681 	bufsize = ea_pack_object(task_record, NULL, 0);
682 	buf = kmem_alloc(bufsize, KM_SLEEP);
683 	(void) ea_pack_object(task_record, buf, bufsize);
684 	ret = callback(ac_task, ubuf, ubufsize, buf, bufsize, actual);
685 
686 	/*
687 	 * Free all previously allocated structures.
688 	 */
689 	kmem_free(buf, bufsize);
690 	ea_free_object(task_record, EUP_ALLOC);
691 	kmem_free(tu, sizeof (task_usage_t));
692 	return (ret);
693 }
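
/*
 * For example (sketch; error handling omitted), a caller wanting a partial
 * task record appended to the zone's task accounting file can do:
 *
 *	struct exacct_globals *acg;
 *	size_t actual;
 *
 *	acg = zone_getspecific(exacct_zone_key, tk->tk_zone);
 *	(void) exacct_assemble_task_usage(&acg->ac_task, tk,
 *	    exacct_commit_callback, NULL, 0, &actual, EW_PARTIAL);
 */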
694 
695 /*
696  * void exacct_commit_task(void *)
697  *
698  * Overview
699  *   exacct_commit_task() calculates the final usage for a task and, if task
700  *   accounting is active, writes a task record to the accounting file.
701  *   exacct_commit_task() is intended to be called from a task queue
702  *   (taskq_t).
703  *
704  * Return values
705  *   None.
706  *
707  * Caller's context
708  *   Suitable for KM_SLEEP allocations.
709  */
710 
711 void
712 exacct_commit_task(void *arg)
713 {
714 	task_t *tk = (task_t *)arg;
715 	size_t size;
716 	zone_t *zone = tk->tk_zone;
717 	struct exacct_globals *acg;
718 
719 	ASSERT(tk != task0p);
720 	ASSERT(tk->tk_memb_list == NULL);
721 
722 	/*
723 	 * Don't do any extra work if the acctctl module isn't loaded.
724 	 */
725 	if (exacct_zone_key != ZONE_KEY_UNINITIALIZED) {
726 		acg = zone_getspecific(exacct_zone_key, zone);
727 		(void) exacct_assemble_task_usage(&acg->ac_task, tk,
728 		    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
729 		if (tk->tk_zone != global_zone) {
730 			acg = zone_getspecific(exacct_zone_key, global_zone);
731 			(void) exacct_assemble_task_usage(&acg->ac_task, tk,
732 			    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
733 		}
734 	}
735 	/*
736 	 * Release associated project and finalize task.
737 	 */
738 	task_end(tk);
739 }
740 
741 static int
742 exacct_attach_proc_item(proc_usage_t *pu, ea_object_t *record, int res)
743 {
744 	int attached = 1;
745 
746 	switch (res) {
747 	case AC_PROC_PID:
748 		(void) ea_attach_item(record, &pu->pu_pid,
749 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PID);
750 		break;
751 	case AC_PROC_UID:
752 		(void) ea_attach_item(record, &pu->pu_ruid,
753 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_UID);
754 		break;
755 	case AC_PROC_FLAG:
756 		(void) ea_attach_item(record, &pu->pu_acflag,
757 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ACCT_FLAGS);
758 		break;
759 	case AC_PROC_GID:
760 		(void) ea_attach_item(record, &pu->pu_rgid,
761 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_GID);
762 		break;
763 	case AC_PROC_PROJID:
764 		(void) ea_attach_item(record, &pu->pu_projid,
765 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PROJID);
766 		break;
767 	case AC_PROC_TASKID:
768 		(void) ea_attach_item(record, &pu->pu_taskid,
769 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TASKID);
770 		break;
771 	case AC_PROC_CPU:
772 		(void) ea_attach_item(record, &pu->pu_utimesec,
773 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_SEC);
774 		(void) ea_attach_item(record, &pu->pu_utimensec,
775 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_NSEC);
776 		(void) ea_attach_item(record, &pu->pu_stimesec,
777 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_SEC);
778 		(void) ea_attach_item(record, &pu->pu_stimensec,
779 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_NSEC);
780 		break;
781 	case AC_PROC_TIME:
782 		(void) ea_attach_item(record, &pu->pu_startsec,
783 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_SEC);
784 		(void) ea_attach_item(record, &pu->pu_startnsec,
785 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_NSEC);
786 		(void) ea_attach_item(record, &pu->pu_finishsec,
787 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_SEC);
788 		(void) ea_attach_item(record, &pu->pu_finishnsec,
789 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_NSEC);
790 		break;
791 	case AC_PROC_COMMAND:
792 		(void) ea_attach_item(record, pu->pu_command,
793 		    strlen(pu->pu_command) + 1, EXT_STRING | EXD_PROC_COMMAND);
794 		break;
795 	case AC_PROC_HOSTNAME:
796 		(void) ea_attach_item(record, pu->pu_nodename,
797 		    strlen(pu->pu_nodename) + 1,
798 		    EXT_STRING | EXD_PROC_HOSTNAME);
799 		break;
800 	case AC_PROC_TTY:
801 		(void) ea_attach_item(record, &pu->pu_major,
802 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MAJOR);
803 		(void) ea_attach_item(record, &pu->pu_minor,
804 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MINOR);
805 		break;
806 	case AC_PROC_MICROSTATE:
807 		(void) ea_attach_item(record, &pu->pu_majflt,
808 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MAJOR);
809 		(void) ea_attach_item(record, &pu->pu_minflt,
810 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MINOR);
811 		(void) ea_attach_item(record, &pu->pu_sndmsg,
812 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_SND);
813 		(void) ea_attach_item(record, &pu->pu_rcvmsg,
814 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_RCV);
815 		(void) ea_attach_item(record, &pu->pu_iblk,
816 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_IN);
817 		(void) ea_attach_item(record, &pu->pu_oblk,
818 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_OUT);
819 		(void) ea_attach_item(record, &pu->pu_ioch,
820 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CHARS_RDWR);
821 		(void) ea_attach_item(record, &pu->pu_vcsw,
822 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_VOL);
823 		(void) ea_attach_item(record, &pu->pu_icsw,
824 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_INV);
825 		(void) ea_attach_item(record, &pu->pu_nsig,
826 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SIGNALS);
827 		(void) ea_attach_item(record, &pu->pu_nswp,
828 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SWAPS);
829 		(void) ea_attach_item(record, &pu->pu_nscl,
830 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SYSCALLS);
831 		break;
832 	case AC_PROC_ANCPID:
833 		(void) ea_attach_item(record, &pu->pu_ancpid,
834 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ANCPID);
835 		break;
836 	case AC_PROC_WAIT_STATUS:
837 		(void) ea_attach_item(record, &pu->pu_wstat,
838 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_WAIT_STATUS);
839 		break;
840 	case AC_PROC_ZONENAME:
841 		(void) ea_attach_item(record, pu->pu_zonename,
842 		    strlen(pu->pu_zonename) + 1,
843 		    EXT_STRING | EXD_PROC_ZONENAME);
844 		break;
845 	case AC_PROC_MEM:
846 		(void) ea_attach_item(record, &pu->pu_mem_rss_avg,
847 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_AVG_K);
848 		(void) ea_attach_item(record, &pu->pu_mem_rss_max,
849 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_MAX_K);
850 		break;
851 	default:
852 		attached = 0;
853 	}
854 	return (attached);
855 }
856 
857 static ea_object_t *
858 exacct_assemble_proc_record(proc_usage_t *pu, ulong_t *mask,
859     ea_catalog_t record_type)
860 {
861 	int res, count;
862 	ea_object_t *record;
863 
864 	/*
865 	 * Assemble usage values into group.
866 	 */
867 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
868 	for (res = 1, count = 0; res <= AC_PROC_MAX_RES; res++)
869 		if (BT_TEST(mask, res))
870 			count += exacct_attach_proc_item(pu, record, res);
871 	if (count == 0) {
872 		ea_free_object(record, EUP_ALLOC);
873 		record = NULL;
874 	}
875 	return (record);
876 }
877 
878 /*
879  * The following two routines assume that the process's p_lock is held or
880  * exacct_commit_proc has been called from exit() when all lwps are stopped.
881  */
882 static void
883 exacct_calculate_proc_mstate(proc_t *p, proc_usage_t *pu)
884 {
885 	kthread_t *t;
886 
887 	ASSERT(MUTEX_HELD(&p->p_lock));
888 	if ((t = p->p_tlist) == NULL)
889 		return;
890 
891 	do {
892 		pu->pu_minflt	+= t->t_lwp->lwp_ru.minflt;
893 		pu->pu_majflt	+= t->t_lwp->lwp_ru.majflt;
894 		pu->pu_sndmsg	+= t->t_lwp->lwp_ru.msgsnd;
895 		pu->pu_rcvmsg	+= t->t_lwp->lwp_ru.msgrcv;
896 		pu->pu_ioch	+= t->t_lwp->lwp_ru.ioch;
897 		pu->pu_iblk	+= t->t_lwp->lwp_ru.inblock;
898 		pu->pu_oblk	+= t->t_lwp->lwp_ru.oublock;
899 		pu->pu_vcsw	+= t->t_lwp->lwp_ru.nvcsw;
900 		pu->pu_icsw	+= t->t_lwp->lwp_ru.nivcsw;
901 		pu->pu_nsig	+= t->t_lwp->lwp_ru.nsignals;
902 		pu->pu_nswp	+= t->t_lwp->lwp_ru.nswap;
903 		pu->pu_nscl	+= t->t_lwp->lwp_ru.sysc;
904 	} while ((t = t->t_forw) != p->p_tlist);
905 }
906 
907 static void
908 exacct_copy_proc_mstate(proc_t *p, proc_usage_t *pu)
909 {
910 	pu->pu_minflt	= p->p_ru.minflt;
911 	pu->pu_majflt	= p->p_ru.majflt;
912 	pu->pu_sndmsg	= p->p_ru.msgsnd;
913 	pu->pu_rcvmsg	= p->p_ru.msgrcv;
914 	pu->pu_ioch	= p->p_ru.ioch;
915 	pu->pu_iblk	= p->p_ru.inblock;
916 	pu->pu_oblk	= p->p_ru.oublock;
917 	pu->pu_vcsw	= p->p_ru.nvcsw;
918 	pu->pu_icsw	= p->p_ru.nivcsw;
919 	pu->pu_nsig	= p->p_ru.nsignals;
920 	pu->pu_nswp	= p->p_ru.nswap;
921 	pu->pu_nscl	= p->p_ru.sysc;
922 }
923 
924 void
925 exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask,
926     int flag, int wstat)
927 {
928 	timestruc_t ts, ts_run;
929 
930 	ASSERT(MUTEX_HELD(&p->p_lock));
931 
932 	/*
933 	 * Convert CPU and execution times to sec/nsec format.
934 	 */
935 	if (BT_TEST(mask, AC_PROC_CPU)) {
936 		hrt2ts(mstate_aggr_state(p, LMS_USER), &ts);
937 		pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec;
938 		pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec;
939 		hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts);
940 		pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec;
941 		pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec;
942 	}
943 	if (BT_TEST(mask, AC_PROC_TIME)) {
944 		gethrestime(&ts);
945 		pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
946 		pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
947 		hrt2ts(gethrtime() - p->p_mstart, &ts_run);
948 		ts.tv_sec -= ts_run.tv_sec;
949 		ts.tv_nsec -= ts_run.tv_nsec;
950 		if (ts.tv_nsec < 0) {
951 			ts.tv_sec--;
952 			if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) {
953 				ts.tv_sec++;
954 				ts.tv_nsec -= NANOSEC;
955 			}
956 		}
957 		pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec;
958 		pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec;
959 	}
960 
961 	pu->pu_pid = p->p_pidp->pid_id;
962 	pu->pu_acflag = p->p_user.u_acflag;
963 	pu->pu_projid = p->p_task->tk_proj->kpj_id;
964 	pu->pu_taskid = p->p_task->tk_tkid;
965 	pu->pu_major = getmajor(p->p_sessp->s_dev);
966 	pu->pu_minor = getminor(p->p_sessp->s_dev);
967 	pu->pu_ancpid = p->p_ancpid;
968 	pu->pu_wstat = wstat;
969 	/*
970 	 * Compute average RSS in K.  The denominator is the number of
971 	 * samples:  the number of clock ticks plus one for the initial sample.
972 	 */
973 	pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) *
974 	    (PAGESIZE / 1024);
975 	pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024);
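	/*
	 * Worked example with illustrative numbers: on 4K pages, if u_mem has
	 * accumulated 25,600 page-samples over p_stime + p_utime = 99 ticks,
	 * the average is 25600 / (99 + 1) = 256 pages, reported as 1024K.
	 */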
976 
977 	mutex_enter(&p->p_crlock);
978 	pu->pu_ruid = crgetruid(p->p_cred);
979 	pu->pu_rgid = crgetrgid(p->p_cred);
980 	mutex_exit(&p->p_crlock);
981 
982 	bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1);
983 	bcopy(p->p_zone->zone_name, pu->pu_zonename,
984 	    strlen(p->p_zone->zone_name) + 1);
985 	bcopy(p->p_zone->zone_nodename, pu->pu_nodename,
986 	    strlen(p->p_zone->zone_nodename) + 1);
987 
988 	/*
989 	 * Calculate microstate accounting data for a process that is still
990 	 * running.  Presently, we explicitly collect all of the LWP usage into
991 	 * the proc usage structure here.
992 	 */
993 	if (flag & EW_PARTIAL)
994 		exacct_calculate_proc_mstate(p, pu);
995 	if (flag & EW_FINAL)
996 		exacct_copy_proc_mstate(p, pu);
997 }
998 
999 /*
1000  * int exacct_assemble_proc_usage(ac_info_t *, proc_usage_t *, int (*)(ac_info_t *,
1001  *	void *, size_t, void *, size_t, size_t *), void *, size_t, size_t *, int)
1002  *
1003  * Overview
1004  *   Assemble a record with miscellaneous accounting information about the
1005  *   process and execute the callback on it.  It is the callback's job to set
1006  *   "actual" to the size of the record.
1007  *
1008  * Return values
1009  *   The result of the callback function, unless the extended process accounting
1010  *   feature is not active, in which case ENOTACTIVE is returned.
1011  *
1012  * Caller's context
1013  *   Suitable for KM_SLEEP allocations.
1014  */
1015 int
1016 exacct_assemble_proc_usage(ac_info_t *ac_proc, proc_usage_t *pu,
1017     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1018     void *ubuf, size_t ubufsize, size_t *actual, int flag)
1019 {
1020 	ulong_t mask[AC_MASK_SZ];
1021 	ea_object_t *proc_record;
1022 	ea_catalog_t record_type;
1023 	void *buf;
1024 	size_t bufsize;
1025 	int ret;
1026 
1027 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL);
1028 
1029 	mutex_enter(&ac_proc->ac_lock);
1030 	if (ac_proc->ac_state == AC_OFF) {
1031 		mutex_exit(&ac_proc->ac_lock);
1032 		return (ENOTACTIVE);
1033 	}
1034 	bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1035 	mutex_exit(&ac_proc->ac_lock);
1036 
1037 	switch (flag) {
1038 	case EW_FINAL:
1039 		record_type = EXD_GROUP_PROC;
1040 		break;
1041 	case EW_PARTIAL:
1042 		record_type = EXD_GROUP_PROC_PARTIAL;
1043 		break;
1044 	}
1045 
1046 	proc_record = exacct_assemble_proc_record(pu, mask, record_type);
1047 	if (proc_record == NULL)
1048 		return (0);
1049 
1050 	/*
1051 	 * Pack object into buffer and pass to callback.
1052 	 */
1053 	bufsize = ea_pack_object(proc_record, NULL, 0);
1054 	buf = kmem_alloc(bufsize, KM_SLEEP);
1055 	(void) ea_pack_object(proc_record, buf, bufsize);
1056 
1057 	ret = callback(ac_proc, ubuf, ubufsize, buf, bufsize, actual);
1058 
1059 	/*
1060 	 * Free all previously allocated structures.
1061 	 */
1062 	kmem_free(buf, bufsize);
1063 	ea_free_object(proc_record, EUP_ALLOC);
1064 	return (ret);
1065 }
1066 
1067 /*
1068  * int exacct_commit_callback(ac_info_t *, void *, size_t, void *, size_t,
1069  * 	size_t *)
1070  *
1071  * Overview
1072  *   exacct_commit_callback() writes the indicated buffer to the indicated
1073  *   extended accounting file.
1074  *
1075  * Return values
1076  *   The result of the write operation is returned.  "actual" is updated to
1077  *   contain the number of bytes actually written.
1078  *
1079  * Caller's context
1080  *   Suitable for a vn_rdwr() operation.
1081  */
1082 /*ARGSUSED*/
1083 int
1084 exacct_commit_callback(ac_info_t *info, void *ubuf, size_t ubufsize,
1085     void *buf, size_t bufsize, size_t *actual)
1086 {
1087 	int error = 0;
1088 
1089 	*actual = 0;
1090 	if ((error = exacct_vn_write(info, buf, bufsize)) == 0)
1091 		*actual = bufsize;
1092 	return (error);
1093 }
1094 
1095 static void
1096 exacct_do_commit_proc(ac_info_t *ac_proc, proc_t *p, int wstat)
1097 {
1098 	size_t size;
1099 	proc_usage_t *pu;
1100 	ulong_t mask[AC_MASK_SZ];
1101 
1102 	mutex_enter(&ac_proc->ac_lock);
1103 	if (ac_proc->ac_state == AC_ON) {
1104 		bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1105 		mutex_exit(&ac_proc->ac_lock);
1106 	} else {
1107 		mutex_exit(&ac_proc->ac_lock);
1108 		return;
1109 	}
1110 
1111 	mutex_enter(&p->p_lock);
1112 	size = strlen(p->p_user.u_comm) + 1;
1113 	mutex_exit(&p->p_lock);
1114 
1115 	pu = kmem_alloc(sizeof (proc_usage_t), KM_SLEEP);
1116 	pu->pu_command = kmem_alloc(size, KM_SLEEP);
1117 	mutex_enter(&p->p_lock);
1118 	exacct_calculate_proc_usage(p, pu, mask, EW_FINAL, wstat);
1119 	mutex_exit(&p->p_lock);
1120 
1121 	(void) exacct_assemble_proc_usage(ac_proc, pu,
1122 	    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
1123 
1124 	kmem_free(pu->pu_command, strlen(pu->pu_command) + 1);
1125 	kmem_free(pu, sizeof (proc_usage_t));
1126 }
1127 
1128 /*
1129  * void exacct_commit_proc(proc_t *, int)
1130  *
1131  * Overview
1132  *   exacct_commit_proc() calculates the final usage for a process, updating the
1133  *   task usage if task accounting is active, and writing a process record if
1134  *   process accounting is active.  exacct_commit_proc() is intended to be
1135  *   called from proc_exit().
1136  *
1137  * Return values
1138  *   None.
1139  *
1140  * Caller's context
1141  *   Suitable for KM_SLEEP allocations.  p_lock must not be held at entry.
1142  */
1143 void
1144 exacct_commit_proc(proc_t *p, int wstat)
1145 {
1146 	zone_t *zone = p->p_zone;
1147 	struct exacct_globals *acg, *gacg = NULL;
1148 
1149 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1150 		/*
1151 		 * acctctl module not loaded.  Nothing to do.
1152 		 */
1153 		return;
1154 	}
1155 	acg = zone_getspecific(exacct_zone_key, zone);
1156 	exacct_do_commit_proc(&acg->ac_proc, p, wstat);
1157 	if (zone != global_zone) {
1158 		gacg = zone_getspecific(exacct_zone_key, global_zone);
1159 		exacct_do_commit_proc(&gacg->ac_proc, p, wstat);
1160 	}
1161 }
1162 
1163 static int
1164 exacct_attach_netstat_item(net_stat_t *ns, ea_object_t *record, int res)
1165 {
1166 	int		attached = 1;
1167 
1168 	switch (res) {
1169 	case AC_NET_NAME:
1170 		(void) ea_attach_item(record, ns->ns_name,
1171 		    strlen(ns->ns_name) + 1, EXT_STRING | EXD_NET_STATS_NAME);
1172 		break;
1173 	case AC_NET_CURTIME:
1174 		{
1175 			uint64_t	now;
1176 			timestruc_t	ts;
1177 
1178 			gethrestime(&ts);
1179 			now = (uint64_t)(ulong_t)ts.tv_sec;
1180 			(void) ea_attach_item(record,  &now, sizeof (uint64_t),
1181 			    EXT_UINT64 | EXD_NET_STATS_CURTIME);
1182 		}
1183 		break;
1184 	case AC_NET_IBYTES:
1185 		(void) ea_attach_item(record, &ns->ns_ibytes,
1186 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IBYTES);
1187 		break;
1188 	case AC_NET_OBYTES:
1189 		(void) ea_attach_item(record, &ns->ns_obytes,
1190 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OBYTES);
1191 		break;
1192 	case AC_NET_IPKTS:
1193 		(void) ea_attach_item(record, &ns->ns_ipackets,
1194 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IPKTS);
1195 		break;
1196 	case AC_NET_OPKTS:
1197 		(void) ea_attach_item(record, &ns->ns_opackets,
1198 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OPKTS);
1199 		break;
1200 	case AC_NET_IERRPKTS:
1201 		(void) ea_attach_item(record, &ns->ns_ierrors,
1202 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IERRPKTS);
1203 		break;
1204 	case AC_NET_OERRPKTS:
1205 		(void) ea_attach_item(record, &ns->ns_oerrors,
1206 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OERRPKTS);
1207 		break;
1208 	default:
1209 		attached = 0;
1210 	}
1211 	return (attached);
1212 }
1213 
1214 static int
1215 exacct_attach_netdesc_item(net_desc_t *nd, ea_object_t *record, int res)
1216 {
1217 	int attached = 1;
1218 
1219 	switch (res) {
1220 	case AC_NET_NAME:
1221 		(void) ea_attach_item(record, nd->nd_name,
1222 		    strlen(nd->nd_name) + 1, EXT_STRING | EXD_NET_DESC_NAME);
1223 		break;
1224 	case AC_NET_DEVNAME:
1225 		(void) ea_attach_item(record, nd->nd_devname,
1226 		    strlen(nd->nd_devname) + 1, EXT_STRING |
1227 		    EXD_NET_DESC_DEVNAME);
1228 		break;
1229 	case AC_NET_EHOST:
1230 		(void) ea_attach_item(record, &nd->nd_ehost,
1231 		    sizeof (nd->nd_ehost), EXT_RAW | EXD_NET_DESC_EHOST);
1232 		break;
1233 	case AC_NET_EDEST:
1234 		(void) ea_attach_item(record, &nd->nd_edest,
1235 		    sizeof (nd->nd_edest), EXT_RAW | EXD_NET_DESC_EDEST);
1236 		break;
1237 	case AC_NET_VLAN_TPID:
1238 		(void) ea_attach_item(record, &nd->nd_vlan_tpid,
1239 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TPID);
1240 		break;
1241 	case AC_NET_VLAN_TCI:
1242 		(void) ea_attach_item(record, &nd->nd_vlan_tci,
1243 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TCI);
1244 		break;
1245 	case AC_NET_SAP:
1246 		(void) ea_attach_item(record, &nd->nd_sap,
1247 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_SAP);
1248 		break;
1249 	case AC_NET_PRIORITY:
1250 		(void) ea_attach_item(record, &nd->nd_priority,
1251 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_PRIORITY);
1252 		break;
1253 	case AC_NET_BWLIMIT:
1254 		(void) ea_attach_item(record, &nd->nd_bw_limit,
1255 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_DESC_BWLIMIT);
1256 		break;
1257 	case AC_NET_SADDR:
1258 		if (nd->nd_isv4) {
1259 			(void) ea_attach_item(record, &nd->nd_saddr[3],
1260 			    sizeof (uint32_t), EXT_UINT32 |
1261 			    EXD_NET_DESC_V4SADDR);
1262 		} else {
1263 			(void) ea_attach_item(record, &nd->nd_saddr,
1264 			    sizeof (nd->nd_saddr), EXT_RAW |
1265 			    EXD_NET_DESC_V6SADDR);
1266 		}
1267 		break;
1268 	case AC_NET_DADDR:
1269 		if (nd->nd_isv4) {
1270 			(void) ea_attach_item(record, &nd->nd_daddr[3],
1271 			    sizeof (uint32_t), EXT_UINT32 |
1272 			    EXD_NET_DESC_V4DADDR);
1273 		} else {
1274 			(void) ea_attach_item(record, &nd->nd_daddr,
1275 			    sizeof (nd->nd_daddr), EXT_RAW |
1276 			    EXD_NET_DESC_V6DADDR);
1277 		}
1278 		break;
1279 	case AC_NET_SPORT:
1280 		(void) ea_attach_item(record, &nd->nd_sport,
1281 		    sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_SPORT);
1282 		break;
1283 	case AC_NET_DPORT:
1284 		(void) ea_attach_item(record, &nd->nd_dport,
1285 		    sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_DPORT);
1286 		break;
1287 	case AC_NET_PROTOCOL:
1288 		(void) ea_attach_item(record, &nd->nd_protocol,
1289 		    sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_PROTOCOL);
1290 		break;
1291 	case AC_NET_DSFIELD:
1292 		(void) ea_attach_item(record, &nd->nd_dsfield,
1293 		    sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_DSFIELD);
1294 		break;
1295 	default:
1296 		attached = 0;
1297 	}
1298 	return (attached);
1299 }
1300 
1301 static ea_object_t *
1302 exacct_assemble_net_record(void *ninfo, ulong_t *mask, ea_catalog_t record_type,
1303     int what)
1304 {
1305 	int		res;
1306 	int		count;
1307 	ea_object_t	*record;
1308 
1309 	/*
1310 	 * Assemble usage values into group.
1311 	 */
1312 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1313 	for (res = 1, count = 0; res <= AC_NET_MAX_RES; res++)
1314 		if (BT_TEST(mask, res)) {
1315 			if (what == EX_NET_LNDESC_REC ||
1316 			    what == EX_NET_FLDESC_REC) {
1317 				count += exacct_attach_netdesc_item(
1318 				    (net_desc_t *)ninfo, record, res);
1319 			} else {
1320 				count += exacct_attach_netstat_item(
1321 				    (net_stat_t *)ninfo, record, res);
1322 			}
1323 		}
1324 	if (count == 0) {
1325 		ea_free_object(record, EUP_ALLOC);
1326 		record = NULL;
1327 	}
1328 	return (record);
1329 }
1330 
1331 int
1332 exacct_assemble_net_usage(ac_info_t *ac_net, void *ninfo,
1333     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1334     void *ubuf, size_t ubufsize, size_t *actual, int what)
1335 {
1336 	ulong_t		mask[AC_MASK_SZ];
1337 	ea_object_t	*net_desc;
1338 	ea_catalog_t	record_type;
1339 	void		*buf;
1340 	size_t		bufsize;
1341 	int		ret;
1342 
1343 	mutex_enter(&ac_net->ac_lock);
1344 	if (ac_net->ac_state == AC_OFF) {
1345 		mutex_exit(&ac_net->ac_lock);
1346 		return (ENOTACTIVE);
1347 	}
1348 	bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
1349 	mutex_exit(&ac_net->ac_lock);
1350 
1351 	switch (what) {
1352 	case EX_NET_LNDESC_REC:
1353 		record_type = EXD_GROUP_NET_LINK_DESC;
1354 		break;
1355 	case EX_NET_LNSTAT_REC:
1356 		record_type = EXD_GROUP_NET_LINK_STATS;
1357 		break;
1358 	case EX_NET_FLDESC_REC:
1359 		record_type = EXD_GROUP_NET_FLOW_DESC;
1360 		break;
1361 	case EX_NET_FLSTAT_REC:
1362 		record_type = EXD_GROUP_NET_FLOW_STATS;
1363 		break;
1364 	}
1365 
1366 	net_desc = exacct_assemble_net_record(ninfo, mask, record_type, what);
1367 	if (net_desc == NULL)
1368 		return (0);
1369 
1370 	/*
1371 	 * Pack object into buffer and pass to callback.
1372 	 */
1373 	bufsize = ea_pack_object(net_desc, NULL, 0);
1374 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1375 	if (buf == NULL)
1376 		return (ENOMEM);
1377 
1378 	(void) ea_pack_object(net_desc, buf, bufsize);
1379 
1380 	ret = callback(ac_net, ubuf, ubufsize, buf, bufsize, actual);
1381 
1382 	/*
1383 	 * Free all previously allocated structures.
1384 	 */
1385 	kmem_free(buf, bufsize);
1386 	ea_free_object(net_desc, EUP_ALLOC);
1387 	return (ret);
1388 }
1389 
1390 int
1391 exacct_commit_netinfo(void *arg, int what)
1392 {
1393 	size_t			size;
1394 	ulong_t			mask[AC_MASK_SZ];
1395 	struct exacct_globals	*acg;
1396 	ac_info_t		*ac_net;
1397 
1398 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1399 		/*
1400 		 * acctctl module not loaded. Nothing to do.
1401 		 */
1402 		return (ENOTACTIVE);
1403 	}
1404 
1405 	/*
1406 	 * Even though each zone nominally has its own flow accounting settings
1407 	 * (ac_flow), these are only maintained by and for the global zone.
1408 	 *
1409 	 * If this were to change in the future, this function should grow a
1410 	 * second zoneid (or zone) argument, and use the corresponding zone's
1411 	 * settings rather than always using those of the global zone.
1412 	 */
1413 	acg = zone_getspecific(exacct_zone_key, global_zone);
1414 	ac_net = &acg->ac_net;
1415 
1416 	mutex_enter(&ac_net->ac_lock);
1417 	if (ac_net->ac_state == AC_OFF) {
1418 		mutex_exit(&ac_net->ac_lock);
1419 		return (ENOTACTIVE);
1420 	}
1421 	bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
1422 	mutex_exit(&ac_net->ac_lock);
1423 
1424 	return (exacct_assemble_net_usage(ac_net, arg, exacct_commit_callback,
1425 	    NULL, 0, &size, what));
1426 }
1427 
1428 static int
1429 exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res)
1430 {
1431 	int attached = 1;
1432 
1433 	switch (res) {
1434 	case AC_FLOW_SADDR:
1435 		if (fu->fu_isv4) {
1436 			(void) ea_attach_item(record, &fu->fu_saddr[3],
1437 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4SADDR);
1438 		} else {
1439 			(void) ea_attach_item(record, &fu->fu_saddr,
1440 			    sizeof (fu->fu_saddr), EXT_RAW |
1441 			    EXD_FLOW_V6SADDR);
1442 		}
1443 		break;
1444 	case AC_FLOW_DADDR:
1445 		if (fu->fu_isv4) {
1446 			(void) ea_attach_item(record, &fu->fu_daddr[3],
1447 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4DADDR);
1448 		} else {
1449 			(void) ea_attach_item(record, &fu->fu_daddr,
1450 			    sizeof (fu->fu_daddr), EXT_RAW |
1451 			    EXD_FLOW_V6DADDR);
1452 		}
1453 		break;
1454 	case AC_FLOW_SPORT:
1455 		(void) ea_attach_item(record, &fu->fu_sport,
1456 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_SPORT);
1457 		break;
1458 	case AC_FLOW_DPORT:
1459 		(void) ea_attach_item(record, &fu->fu_dport,
1460 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_DPORT);
1461 		break;
1462 	case AC_FLOW_PROTOCOL:
1463 		(void) ea_attach_item(record, &fu->fu_protocol,
1464 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_PROTOCOL);
1465 		break;
1466 	case AC_FLOW_DSFIELD:
1467 		(void) ea_attach_item(record, &fu->fu_dsfield,
1468 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_DSFIELD);
1469 		break;
1470 	case AC_FLOW_CTIME:
1471 		(void) ea_attach_item(record, &fu->fu_ctime,
1472 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_CTIME);
1473 		break;
1474 	case AC_FLOW_LSEEN:
1475 		(void) ea_attach_item(record, &fu->fu_lseen,
1476 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_LSEEN);
1477 		break;
1478 	case AC_FLOW_NBYTES:
1479 		(void) ea_attach_item(record, &fu->fu_nbytes,
1480 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NBYTES);
1481 		break;
1482 	case AC_FLOW_NPKTS:
1483 		(void) ea_attach_item(record, &fu->fu_npackets,
1484 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NPKTS);
1485 		break;
1486 	case AC_FLOW_PROJID:
1487 		if (fu->fu_projid >= 0) {
1488 			(void) ea_attach_item(record, &fu->fu_projid,
1489 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_PROJID);
1490 		}
1491 		break;
1492 	case AC_FLOW_UID:
1493 		if (fu->fu_userid >= 0) {
1494 			(void) ea_attach_item(record, &fu->fu_userid,
1495 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID);
1496 		}
1497 		break;
1498 	case AC_FLOW_ANAME:
1499 		(void) ea_attach_item(record, fu->fu_aname,
1500 		    strlen(fu->fu_aname) + 1, EXT_STRING | EXD_FLOW_ANAME);
1501 		break;
1502 	default:
1503 		attached = 0;
1504 	}
1505 	return (attached);
1506 }
1507 
1508 static ea_object_t *
1509 exacct_assemble_flow_record(flow_usage_t *fu, ulong_t *mask,
1510     ea_catalog_t record_type)
1511 {
1512 	int res, count;
1513 	ea_object_t *record;
1514 
1515 	/*
1516 	 * Assemble usage values into group.
1517 	 */
1518 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1519 	for (res = 1, count = 0; res <= AC_FLOW_MAX_RES; res++)
1520 		if (BT_TEST(mask, res))
1521 			count += exacct_attach_flow_item(fu, record, res);
1522 	if (count == 0) {
1523 		ea_free_object(record, EUP_ALLOC);
1524 		record = NULL;
1525 	}
1526 	return (record);
1527 }
1528 
1529 int
1530 exacct_assemble_flow_usage(ac_info_t *ac_flow, flow_usage_t *fu,
1531     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1532     void *ubuf, size_t ubufsize, size_t *actual)
1533 {
1534 	ulong_t mask[AC_MASK_SZ];
1535 	ea_object_t *flow_usage;
1536 	ea_catalog_t record_type;
1537 	void *buf;
1538 	size_t bufsize;
1539 	int ret;
1540 
1541 	mutex_enter(&ac_flow->ac_lock);
1542 	if (ac_flow->ac_state == AC_OFF) {
1543 		mutex_exit(&ac_flow->ac_lock);
1544 		return (ENOTACTIVE);
1545 	}
1546 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1547 	mutex_exit(&ac_flow->ac_lock);
1548 
1549 	record_type = EXD_GROUP_FLOW;
1550 
1551 	flow_usage = exacct_assemble_flow_record(fu, mask, record_type);
1552 	if (flow_usage == NULL) {
1553 		return (0);
1554 	}
1555 
1556 	/*
1557 	 * Pack object into buffer and pass to callback.
1558 	 */
1559 	bufsize = ea_pack_object(flow_usage, NULL, 0);
1560 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1561 	if (buf == NULL) {
1562 		return (ENOMEM);
1563 	}
1564 
1565 	(void) ea_pack_object(flow_usage, buf, bufsize);
1566 
1567 	ret = callback(ac_flow, ubuf, ubufsize, buf, bufsize, actual);
1568 
1569 	/*
1570 	 * Free all previously allocated structures.
1571 	 */
1572 	kmem_free(buf, bufsize);
1573 	ea_free_object(flow_usage, EUP_ALLOC);
1574 	return (ret);
1575 }
1576 
1577 void
1578 exacct_commit_flow(void *arg)
1579 {
1580 	flow_usage_t *f = (flow_usage_t *)arg;
1581 	size_t size;
1582 	ulong_t mask[AC_MASK_SZ];
1583 	struct exacct_globals *acg;
1584 	ac_info_t *ac_flow;
1585 
1586 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1587 		/*
1588 		 * acctctl module not loaded. Nothing to do.
1589 		 */
1590 		return;
1591 	}
1592 
1593 	/*
1594 	 * Even though each zone nominally has its own flow accounting settings
1595 	 * (ac_flow), these are only maintained by and for the global zone.
1596 	 *
1597 	 * If this were to change in the future, this function should grow a
1598 	 * second zoneid (or zone) argument, and use the corresponding zone's
1599 	 * settings rather than always using those of the global zone.
1600 	 */
1601 	acg = zone_getspecific(exacct_zone_key, global_zone);
1602 	ac_flow = &acg->ac_flow;
1603 
1604 	mutex_enter(&ac_flow->ac_lock);
1605 	if (ac_flow->ac_state == AC_OFF) {
1606 		mutex_exit(&ac_flow->ac_lock);
1607 		return;
1608 	}
1609 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1610 	mutex_exit(&ac_flow->ac_lock);
1611 
1612 	(void) exacct_assemble_flow_usage(ac_flow, f, exacct_commit_callback,
1613 	    NULL, 0, &size);
1614 }
1615 
1616 /*
1617  * int exacct_tag_task(task_t *, void *, size_t, int)
1618  *
1619  * Overview
1620  *   exacct_tag_task() provides the exacct record construction and writing
1621  *   support required by putacct(2) for task entities.
1622  *
1623  * Return values
1624  *   The result of the write operation is returned, unless the extended
1625  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1626  *
1627  * Caller's context
1628  *   Suitable for KM_SLEEP allocations.
1629  */
1630 int
1631 exacct_tag_task(ac_info_t *ac_task, task_t *tk, void *ubuf, size_t ubufsz,
1632     int flags)
1633 {
1634 	int error = 0;
1635 	void *buf;
1636 	size_t bufsize;
1637 	ea_catalog_t cat;
1638 	ea_object_t *tag;
1639 
1640 	mutex_enter(&ac_task->ac_lock);
1641 	if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) {
1642 		mutex_exit(&ac_task->ac_lock);
1643 		return (ENOTACTIVE);
1644 	}
1645 	mutex_exit(&ac_task->ac_lock);
1646 
1647 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_TASK_TAG);
1648 	(void) ea_attach_item(tag, &tk->tk_tkid, 0,
1649 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1650 	(void) ea_attach_item(tag, tk->tk_zone->zone_nodename, 0,
1651 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1652 	if (flags == EP_RAW)
1653 		cat = EXT_RAW | EXC_DEFAULT | EXD_TASK_TAG;
1654 	else
1655 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_TASK_TAG;
1656 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1657 
1658 	bufsize = ea_pack_object(tag, NULL, 0);
1659 	buf = kmem_alloc(bufsize, KM_SLEEP);
1660 	(void) ea_pack_object(tag, buf, bufsize);
1661 	error = exacct_vn_write(ac_task, buf, bufsize);
1662 	kmem_free(buf, bufsize);
1663 	ea_free_object(tag, EUP_ALLOC);
1664 	return (error);
1665 }
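
/*
 * For example (sketch; "payload" and "payloadsz" are hypothetical and would
 * come from the putacct(2) caller):
 *
 *	acg = zone_getspecific(exacct_zone_key, tk->tk_zone);
 *	error = exacct_tag_task(&acg->ac_task, tk, payload, payloadsz, EP_RAW);
 */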
1666 
1667 /*
1668  * exacct_tag_proc(pid_t, taskid_t, void *, size_t, int, char *)
1669  *
1670  * Overview
1671  *   exacct_tag_proc() provides the exacct record construction and writing
1672  *   support required by putacct(2) for processes.
1673  *
1674  * Return values
1675  *   The result of the write operation is returned, unless the extended
1676  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1677  *
1678  * Caller's context
1679  *   Suitable for KM_SLEEP allocations.
1680  */
1681 int
1682 exacct_tag_proc(ac_info_t *ac_proc, pid_t pid, taskid_t tkid, void *ubuf,
1683     size_t ubufsz, int flags, const char *hostname)
1684 {
1685 	int error = 0;
1686 	void *buf;
1687 	size_t bufsize;
1688 	ea_catalog_t cat;
1689 	ea_object_t *tag;
1690 
1691 	mutex_enter(&ac_proc->ac_lock);
1692 	if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) {
1693 		mutex_exit(&ac_proc->ac_lock);
1694 		return (ENOTACTIVE);
1695 	}
1696 	mutex_exit(&ac_proc->ac_lock);
1697 
1698 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_PROC_TAG);
1699 	(void) ea_attach_item(tag, &pid, sizeof (uint32_t),
1700 	    EXT_UINT32 | EXC_DEFAULT | EXD_PROC_PID);
1701 	(void) ea_attach_item(tag, &tkid, 0,
1702 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1703 	(void) ea_attach_item(tag, (void *)hostname, 0,
1704 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1705 	if (flags == EP_RAW)
1706 		cat = EXT_RAW | EXC_DEFAULT | EXD_PROC_TAG;
1707 	else
1708 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_PROC_TAG;
1709 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1710 
1711 	bufsize = ea_pack_object(tag, NULL, 0);
1712 	buf = kmem_alloc(bufsize, KM_SLEEP);
1713 	(void) ea_pack_object(tag, buf, bufsize);
1714 	error = exacct_vn_write(ac_proc, buf, bufsize);
1715 	kmem_free(buf, bufsize);
1716 	ea_free_object(tag, EUP_ALLOC);
1717 	return (error);
1718 }
1719 
1720 /*
1721  * void exacct_init(void)
1722  *
1723  * Overview
1724  *   Initializes the extended accounting subsystem.
1725  *
1726  * Return values
1727  *   None.
1728  *
1729  * Caller's context
1730  *   Suitable for KM_SLEEP allocations.
1731  */
1732 void
1733 exacct_init()
1734 {
1735 	exacct_queue = system_taskq;
1736 	exacct_object_cache = kmem_cache_create("exacct_object_cache",
1737 	    sizeof (ea_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1738 }
1739 
1740 /*
1741  * exacct_snapshot_proc_mstate() copies a process's microstate accounting data
1742  * and resource usage counters into a given task_usage_t. It differs from
1743  * exacct_copy_proc_mstate() in that here a) we are copying to a task_usage_t,
1744  * b) p_lock will have been acquired earlier in the call path and c) we
1745  * also include the process's user and system times.
1746  */
1747 static void
1748 exacct_snapshot_proc_mstate(proc_t *p, task_usage_t *tu)
1749 {
1750 	tu->tu_utime  = mstate_aggr_state(p, LMS_USER);
1751 	tu->tu_stime  = mstate_aggr_state(p, LMS_SYSTEM);
1752 	tu->tu_minflt = p->p_ru.minflt;
1753 	tu->tu_majflt = p->p_ru.majflt;
1754 	tu->tu_sndmsg = p->p_ru.msgsnd;
1755 	tu->tu_rcvmsg = p->p_ru.msgrcv;
1756 	tu->tu_ioch   = p->p_ru.ioch;
1757 	tu->tu_iblk   = p->p_ru.inblock;
1758 	tu->tu_oblk   = p->p_ru.oublock;
1759 	tu->tu_vcsw   = p->p_ru.nvcsw;
1760 	tu->tu_icsw   = p->p_ru.nivcsw;
1761 	tu->tu_nsig   = p->p_ru.nsignals;
1762 	tu->tu_nswp   = p->p_ru.nswap;
1763 	tu->tu_nscl   = p->p_ru.sysc;
1764 }
1765 
1766 /*
1767  * void exacct_move_mstate(proc_t *, task_t *, task_t *)
1768  *
1769  * Overview
1770  *   exacct_move_mstate() is called by task_change() and accounts for
1771  *   a process's resource usage when it is moved from one task to another.
1772  *
1773  *   The process's usage at this point is recorded in the new task so
1774  *   that it can be excluded from the calculation of resources consumed
1775  *   by that task.
1776  *
1777  *   The resource usage inherited by the new task is also added to the
1778  *   aggregate maintained by the old task for processes that have exited.
1779  *
1780  * Return values
1781  *   None.
1782  *
1783  * Caller's context
1784  *   pidlock and p_lock held across exacct_move_mstate().
1785  */
1786 void
1787 exacct_move_mstate(proc_t *p, task_t *oldtk, task_t *newtk)
1788 {
1789 	task_usage_t tu;
1790 
1791 	/* Take a snapshot of this process's mstate and RU counters */
1792 	exacct_snapshot_proc_mstate(p, &tu);
1793 
1794 	/*
1795 	 * Use the snapshot to increment the aggregate usage of the old
1796 	 * task, and the inherited usage of the new one.
1797 	 */
1798 	mutex_enter(&oldtk->tk_usage_lock);
1799 	exacct_add_task_mstate(oldtk->tk_usage, &tu);
1800 	mutex_exit(&oldtk->tk_usage_lock);
1801 	mutex_enter(&newtk->tk_usage_lock);
1802 	exacct_add_task_mstate(newtk->tk_inherited, &tu);
1803 	mutex_exit(&newtk->tk_usage_lock);
1804 }
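
/*
 * Worked example with illustrative numbers: if a process has performed 1,000
 * voluntary context switches when it moves from task A to task B, A's
 * tk_usage gains 1,000 (A keeps the work done while the process was a
 * member), and B's tk_inherited gains 1,000 (so those switches are later
 * excluded from B's totals via exacct_sub_task_mstate()).
 */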
1805