1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/exacct.h> 29 #include <sys/exacct_catalog.h> 30 #include <sys/disp.h> 31 #include <sys/task.h> 32 #include <sys/proc.h> 33 #include <sys/cmn_err.h> 34 #include <sys/kmem.h> 35 #include <sys/project.h> 36 #include <sys/systm.h> 37 #include <sys/vnode.h> 38 #include <sys/file.h> 39 #include <sys/acctctl.h> 40 #include <sys/time.h> 41 #include <sys/utsname.h> 42 #include <sys/session.h> 43 #include <sys/sysmacros.h> 44 #include <sys/bitmap.h> 45 #include <sys/msacct.h> 46 47 /* 48 * exacct usage and recording routines 49 * 50 * wracct(2), getacct(2), and the records written at process or task 51 * termination are constructed using the exacct_assemble_[task,proc]_usage() 52 * functions, which take a callback that takes the appropriate action on 53 * the packed exacct record for the task or process. For the process-related 54 * actions, we partition the routines such that the data collecting component 55 * can be performed while holding p_lock, and all sleeping or blocking 56 * operations can be performed without acquiring p_lock. 57 * 58 * putacct(2), which allows an application to construct a customized record 59 * associated with an existing process or task, has its own entry points: 60 * exacct_tag_task() and exacct_tag_proc(). 61 */ 62 63 taskq_t *exacct_queue; 64 kmem_cache_t *exacct_object_cache; 65 66 zone_key_t exacct_zone_key = ZONE_KEY_UNINITIALIZED; 67 68 static const uint32_t exacct_version = EXACCT_VERSION; 69 static const char exacct_header[] = "exacct"; 70 static const char exacct_creator[] = "SunOS"; 71 72 ea_object_t * 73 ea_alloc_item(ea_catalog_t catalog, void *buf, size_t bufsz) 74 { 75 ea_object_t *item; 76 77 item = kmem_cache_alloc(exacct_object_cache, KM_SLEEP); 78 bzero(item, sizeof (ea_object_t)); 79 (void) ea_set_item(item, catalog, buf, bufsz); 80 return (item); 81 } 82 83 ea_object_t * 84 ea_alloc_group(ea_catalog_t catalog) 85 { 86 ea_object_t *group; 87 88 group = kmem_cache_alloc(exacct_object_cache, KM_SLEEP); 89 bzero(group, sizeof (ea_object_t)); 90 (void) ea_set_group(group, catalog); 91 return (group); 92 } 93 94 ea_object_t * 95 ea_attach_item(ea_object_t *grp, void *buf, size_t bufsz, ea_catalog_t catalog) 96 { 97 ea_object_t *item; 98 99 item = ea_alloc_item(catalog, buf, bufsz); 100 (void) ea_attach_to_group(grp, item); 101 return (item); 102 } 103 104 /* 105 * exacct_add_task_mstate() and exacct_sub_task_mstate() add and subtract 106 * microstate accounting data and resource usage counters from one task_usage_t 107 * from those supplied in another. These functions do not operate on *all* 108 * members of a task_usage_t: for some (e.g. tu_anctaskid) it would not make 109 * sense. 110 */ 111 static void 112 exacct_add_task_mstate(task_usage_t *tu, task_usage_t *delta) 113 { 114 tu->tu_utime += delta->tu_utime; 115 tu->tu_stime += delta->tu_stime; 116 tu->tu_minflt += delta->tu_minflt; 117 tu->tu_majflt += delta->tu_majflt; 118 tu->tu_sndmsg += delta->tu_sndmsg; 119 tu->tu_rcvmsg += delta->tu_rcvmsg; 120 tu->tu_ioch += delta->tu_ioch; 121 tu->tu_iblk += delta->tu_iblk; 122 tu->tu_oblk += delta->tu_oblk; 123 tu->tu_vcsw += delta->tu_vcsw; 124 tu->tu_icsw += delta->tu_icsw; 125 tu->tu_nsig += delta->tu_nsig; 126 tu->tu_nswp += delta->tu_nswp; 127 tu->tu_nscl += delta->tu_nscl; 128 } 129 130 /* 131 * See the comments for exacct_add_task_mstate(), above. 132 */ 133 static void 134 exacct_sub_task_mstate(task_usage_t *tu, task_usage_t *delta) 135 { 136 tu->tu_utime -= delta->tu_utime; 137 tu->tu_stime -= delta->tu_stime; 138 tu->tu_minflt -= delta->tu_minflt; 139 tu->tu_majflt -= delta->tu_majflt; 140 tu->tu_sndmsg -= delta->tu_sndmsg; 141 tu->tu_rcvmsg -= delta->tu_rcvmsg; 142 tu->tu_ioch -= delta->tu_ioch; 143 tu->tu_iblk -= delta->tu_iblk; 144 tu->tu_oblk -= delta->tu_oblk; 145 tu->tu_vcsw -= delta->tu_vcsw; 146 tu->tu_icsw -= delta->tu_icsw; 147 tu->tu_nsig -= delta->tu_nsig; 148 tu->tu_nswp -= delta->tu_nswp; 149 tu->tu_nscl -= delta->tu_nscl; 150 } 151 152 /* 153 * Wrapper for vn_rdwr() used by exacct_vn_write() and exacct_write_header() 154 * to write to the accounting file without corrupting it in case of an I/O or 155 * filesystem error. 156 */ 157 static int 158 exacct_vn_write_impl(ac_info_t *info, void *buf, ssize_t bufsize) 159 { 160 int error; 161 ssize_t resid; 162 struct vattr va; 163 164 ASSERT(info != NULL); 165 ASSERT(info->ac_vnode != NULL); 166 ASSERT(MUTEX_HELD(&info->ac_lock)); 167 168 /* 169 * Save the size. If vn_rdwr fails, reset the size to avoid corrupting 170 * the present accounting file. 171 */ 172 va.va_mask = AT_SIZE; 173 error = VOP_GETATTR(info->ac_vnode, &va, 0, kcred, NULL); 174 if (error == 0) { 175 error = vn_rdwr(UIO_WRITE, info->ac_vnode, (caddr_t)buf, 176 bufsize, 0LL, UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFFSET_T, 177 kcred, &resid); 178 if (error) { 179 (void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL); 180 } else if (resid != 0) { 181 (void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL); 182 error = ENOSPC; 183 } 184 } 185 return (error); 186 } 187 188 /* 189 * exacct_vn_write() safely writes to an accounting file. acctctl() prevents 190 * the two accounting vnodes from being equal, and the appropriate ac_lock is 191 * held across the call, so we're single threaded through this code for each 192 * file. 193 */ 194 static int 195 exacct_vn_write(ac_info_t *info, void *buf, ssize_t bufsize) 196 { 197 int error; 198 199 if (info == NULL) 200 return (0); 201 202 mutex_enter(&info->ac_lock); 203 204 /* 205 * Don't do anything unless accounting file is set. 206 */ 207 if (info->ac_vnode == NULL) { 208 mutex_exit(&info->ac_lock); 209 return (0); 210 } 211 error = exacct_vn_write_impl(info, buf, bufsize); 212 mutex_exit(&info->ac_lock); 213 214 return (error); 215 } 216 217 /* 218 * void *exacct_create_header(size_t *) 219 * 220 * Overview 221 * exacct_create_header() constructs an exacct file header identifying the 222 * accounting file as the output of the kernel. exacct_create_header() and 223 * the static write_header() and verify_header() routines in libexacct must 224 * remain synchronized. 225 * 226 * Return values 227 * A pointer to a packed exacct buffer containing the appropriate header is 228 * returned; the size of the buffer is placed in the location indicated by 229 * sizep. 230 * 231 * Caller's context 232 * Suitable for KM_SLEEP allocations. 233 */ 234 void * 235 exacct_create_header(size_t *sizep) 236 { 237 ea_object_t *hdr_grp; 238 uint32_t bskip; 239 void *buf; 240 size_t bufsize; 241 242 hdr_grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_HEADER); 243 (void) ea_attach_item(hdr_grp, (void *)&exacct_version, 0, 244 EXT_UINT32 | EXC_DEFAULT | EXD_VERSION); 245 (void) ea_attach_item(hdr_grp, (void *)exacct_header, 0, 246 EXT_STRING | EXC_DEFAULT | EXD_FILETYPE); 247 (void) ea_attach_item(hdr_grp, (void *)exacct_creator, 0, 248 EXT_STRING | EXC_DEFAULT | EXD_CREATOR); 249 (void) ea_attach_item(hdr_grp, uts_nodename(), 0, 250 EXT_STRING | EXC_DEFAULT | EXD_HOSTNAME); 251 252 bufsize = ea_pack_object(hdr_grp, NULL, 0); 253 buf = kmem_alloc(bufsize, KM_SLEEP); 254 (void) ea_pack_object(hdr_grp, buf, bufsize); 255 ea_free_object(hdr_grp, EUP_ALLOC); 256 257 /* 258 * To prevent reading the header when reading the file backwards, 259 * set the large backskip of the header group to 0 (last 4 bytes). 260 */ 261 bskip = 0; 262 exacct_order32(&bskip); 263 bcopy(&bskip, (char *)buf + bufsize - sizeof (bskip), 264 sizeof (bskip)); 265 266 *sizep = bufsize; 267 return (buf); 268 } 269 270 /* 271 * int exacct_write_header(ac_info_t *, void *, size_t) 272 * 273 * Overview 274 * exacct_write_header() writes the given header buffer to the indicated 275 * vnode. 276 * 277 * Return values 278 * The result of the write operation is returned. 279 * 280 * Caller's context 281 * Caller must hold the ac_lock of the appropriate accounting file 282 * information block (ac_info_t). 283 */ 284 int 285 exacct_write_header(ac_info_t *info, void *hdr, size_t hdrsize) 286 { 287 if (info != NULL && info->ac_vnode != NULL) 288 return (exacct_vn_write_impl(info, hdr, hdrsize)); 289 290 return (0); 291 } 292 293 static void 294 exacct_get_interval_task_usage(task_t *tk, task_usage_t *tu, 295 task_usage_t **tu_buf) 296 { 297 task_usage_t *oldtu, *newtu; 298 task_usage_t **prevusage; 299 300 ASSERT(MUTEX_HELD(&tk->tk_usage_lock)); 301 if (getzoneid() != GLOBAL_ZONEID) { 302 prevusage = &tk->tk_zoneusage; 303 } else { 304 prevusage = &tk->tk_prevusage; 305 } 306 if ((oldtu = *prevusage) != NULL) { 307 /* 308 * In case we have any accounting information 309 * saved from the previous interval record. 310 */ 311 newtu = *tu_buf; 312 bcopy(tu, newtu, sizeof (task_usage_t)); 313 tu->tu_minflt -= oldtu->tu_minflt; 314 tu->tu_majflt -= oldtu->tu_majflt; 315 tu->tu_sndmsg -= oldtu->tu_sndmsg; 316 tu->tu_rcvmsg -= oldtu->tu_rcvmsg; 317 tu->tu_ioch -= oldtu->tu_ioch; 318 tu->tu_iblk -= oldtu->tu_iblk; 319 tu->tu_oblk -= oldtu->tu_oblk; 320 tu->tu_vcsw -= oldtu->tu_vcsw; 321 tu->tu_icsw -= oldtu->tu_icsw; 322 tu->tu_nsig -= oldtu->tu_nsig; 323 tu->tu_nswp -= oldtu->tu_nswp; 324 tu->tu_nscl -= oldtu->tu_nscl; 325 tu->tu_utime -= oldtu->tu_utime; 326 tu->tu_stime -= oldtu->tu_stime; 327 328 tu->tu_startsec = oldtu->tu_finishsec; 329 tu->tu_startnsec = oldtu->tu_finishnsec; 330 /* 331 * Copy the data from our temporary storage to the task's 332 * previous interval usage structure for future reference. 333 */ 334 bcopy(newtu, oldtu, sizeof (task_usage_t)); 335 } else { 336 /* 337 * Store current statistics in the task's previous interval 338 * usage structure for future references. 339 */ 340 *prevusage = *tu_buf; 341 bcopy(tu, *prevusage, sizeof (task_usage_t)); 342 *tu_buf = NULL; 343 } 344 } 345 346 static void 347 exacct_snapshot_task_usage(task_t *tk, task_usage_t *tu) 348 { 349 timestruc_t ts; 350 proc_t *p; 351 352 ASSERT(MUTEX_HELD(&pidlock)); 353 354 if ((p = tk->tk_memb_list) == NULL) 355 return; 356 357 /* 358 * exacct_snapshot_task_usage() provides an approximate snapshot of the 359 * usage of the potentially many members of the task. Since we don't 360 * guarantee exactness, we don't acquire the p_lock of any of the member 361 * processes. 362 */ 363 do { 364 mutex_enter(&p->p_lock); 365 tu->tu_utime += mstate_aggr_state(p, LMS_USER); 366 tu->tu_stime += mstate_aggr_state(p, LMS_SYSTEM); 367 mutex_exit(&p->p_lock); 368 tu->tu_minflt += p->p_ru.minflt; 369 tu->tu_majflt += p->p_ru.majflt; 370 tu->tu_sndmsg += p->p_ru.msgsnd; 371 tu->tu_rcvmsg += p->p_ru.msgrcv; 372 tu->tu_ioch += p->p_ru.ioch; 373 tu->tu_iblk += p->p_ru.inblock; 374 tu->tu_oblk += p->p_ru.oublock; 375 tu->tu_vcsw += p->p_ru.nvcsw; 376 tu->tu_icsw += p->p_ru.nivcsw; 377 tu->tu_nsig += p->p_ru.nsignals; 378 tu->tu_nswp += p->p_ru.nswap; 379 tu->tu_nscl += p->p_ru.sysc; 380 } while ((p = p->p_tasknext) != tk->tk_memb_list); 381 382 /* 383 * The resource usage accounted for so far will include that 384 * contributed by the task's first process. If this process 385 * came from another task, then its accumulated resource usage 386 * will include a contribution from work performed there. 387 * We must therefore subtract any resource usage that was 388 * inherited with the first process. 389 */ 390 exacct_sub_task_mstate(tu, tk->tk_inherited); 391 392 gethrestime(&ts); 393 tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec; 394 tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec; 395 } 396 397 /* 398 * void exacct_update_task_mstate(proc_t *) 399 * 400 * Overview 401 * exacct_update_task_mstate() updates the task usage; it is intended 402 * to be called from proc_exit(). 403 * 404 * Return values 405 * None. 406 * 407 * Caller's context 408 * p_lock must be held at entry. 409 */ 410 void 411 exacct_update_task_mstate(proc_t *p) 412 { 413 task_usage_t *tu; 414 415 mutex_enter(&p->p_task->tk_usage_lock); 416 tu = p->p_task->tk_usage; 417 tu->tu_utime += mstate_aggr_state(p, LMS_USER); 418 tu->tu_stime += mstate_aggr_state(p, LMS_SYSTEM); 419 tu->tu_minflt += p->p_ru.minflt; 420 tu->tu_majflt += p->p_ru.majflt; 421 tu->tu_sndmsg += p->p_ru.msgsnd; 422 tu->tu_rcvmsg += p->p_ru.msgrcv; 423 tu->tu_ioch += p->p_ru.ioch; 424 tu->tu_iblk += p->p_ru.inblock; 425 tu->tu_oblk += p->p_ru.oublock; 426 tu->tu_vcsw += p->p_ru.nvcsw; 427 tu->tu_icsw += p->p_ru.nivcsw; 428 tu->tu_nsig += p->p_ru.nsignals; 429 tu->tu_nswp += p->p_ru.nswap; 430 tu->tu_nscl += p->p_ru.sysc; 431 mutex_exit(&p->p_task->tk_usage_lock); 432 } 433 434 static void 435 exacct_calculate_task_usage(task_t *tk, task_usage_t *tu, int flag) 436 { 437 timestruc_t ts; 438 task_usage_t *tu_buf; 439 440 switch (flag) { 441 case EW_PARTIAL: 442 /* 443 * For partial records we must report the sum of current 444 * accounting statistics with previously accumulated 445 * statistics. 446 */ 447 mutex_enter(&pidlock); 448 mutex_enter(&tk->tk_usage_lock); 449 450 (void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t)); 451 exacct_snapshot_task_usage(tk, tu); 452 453 mutex_exit(&tk->tk_usage_lock); 454 mutex_exit(&pidlock); 455 break; 456 case EW_INTERVAL: 457 /* 458 * We need to allocate spare task_usage_t buffer before 459 * grabbing pidlock because we might need it later in 460 * exacct_get_interval_task_usage(). 461 */ 462 tu_buf = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP); 463 mutex_enter(&pidlock); 464 mutex_enter(&tk->tk_usage_lock); 465 466 /* 467 * For interval records, we deduct the previous microstate 468 * accounting data and cpu usage times from previously saved 469 * results and update the previous task usage structure. 470 */ 471 (void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t)); 472 exacct_snapshot_task_usage(tk, tu); 473 exacct_get_interval_task_usage(tk, tu, &tu_buf); 474 475 mutex_exit(&tk->tk_usage_lock); 476 mutex_exit(&pidlock); 477 478 if (tu_buf != NULL) 479 kmem_free(tu_buf, sizeof (task_usage_t)); 480 break; 481 case EW_FINAL: 482 /* 483 * For final records, we deduct, from the task's current 484 * usage, any usage that was inherited with the arrival 485 * of a process from a previous task. We then record 486 * the task's finish time. 487 */ 488 mutex_enter(&tk->tk_usage_lock); 489 (void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t)); 490 exacct_sub_task_mstate(tu, tk->tk_inherited); 491 mutex_exit(&tk->tk_usage_lock); 492 493 gethrestime(&ts); 494 tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec; 495 tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec; 496 497 break; 498 } 499 } 500 501 static int 502 exacct_attach_task_item(task_t *tk, task_usage_t *tu, ea_object_t *record, 503 int res) 504 { 505 int attached = 1; 506 507 switch (res) { 508 case AC_TASK_TASKID: 509 (void) ea_attach_item(record, &tk->tk_tkid, 510 sizeof (uint32_t), EXT_UINT32 | EXD_TASK_TASKID); 511 break; 512 case AC_TASK_PROJID: 513 (void) ea_attach_item(record, &tk->tk_proj->kpj_id, 514 sizeof (uint32_t), EXT_UINT32 | EXD_TASK_PROJID); 515 break; 516 case AC_TASK_CPU: { 517 timestruc_t ts; 518 uint64_t ui; 519 520 hrt2ts(tu->tu_stime, &ts); 521 ui = ts.tv_sec; 522 (void) ea_attach_item(record, &ui, sizeof (uint64_t), 523 EXT_UINT64 | EXD_TASK_CPU_SYS_SEC); 524 ui = ts.tv_nsec; 525 (void) ea_attach_item(record, &ui, sizeof (uint64_t), 526 EXT_UINT64 | EXD_TASK_CPU_SYS_NSEC); 527 528 hrt2ts(tu->tu_utime, &ts); 529 ui = ts.tv_sec; 530 (void) ea_attach_item(record, &ui, sizeof (uint64_t), 531 EXT_UINT64 | EXD_TASK_CPU_USER_SEC); 532 ui = ts.tv_nsec; 533 (void) ea_attach_item(record, &ui, sizeof (uint64_t), 534 EXT_UINT64 | EXD_TASK_CPU_USER_NSEC); 535 } 536 break; 537 case AC_TASK_TIME: 538 (void) ea_attach_item(record, &tu->tu_startsec, 539 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_SEC); 540 (void) ea_attach_item(record, &tu->tu_startnsec, 541 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_NSEC); 542 (void) ea_attach_item(record, &tu->tu_finishsec, 543 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_SEC); 544 (void) ea_attach_item(record, &tu->tu_finishnsec, 545 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_NSEC); 546 break; 547 case AC_TASK_HOSTNAME: 548 (void) ea_attach_item(record, tk->tk_zone->zone_nodename, 549 strlen(tk->tk_zone->zone_nodename) + 1, 550 EXT_STRING | EXD_TASK_HOSTNAME); 551 break; 552 case AC_TASK_MICROSTATE: 553 (void) ea_attach_item(record, &tu->tu_majflt, 554 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MAJOR); 555 (void) ea_attach_item(record, &tu->tu_minflt, 556 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MINOR); 557 (void) ea_attach_item(record, &tu->tu_sndmsg, 558 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_SND); 559 (void) ea_attach_item(record, &tu->tu_rcvmsg, 560 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_RCV); 561 (void) ea_attach_item(record, &tu->tu_iblk, 562 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_IN); 563 (void) ea_attach_item(record, &tu->tu_oblk, 564 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_OUT); 565 (void) ea_attach_item(record, &tu->tu_ioch, 566 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CHARS_RDWR); 567 (void) ea_attach_item(record, &tu->tu_vcsw, 568 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_VOL); 569 (void) ea_attach_item(record, &tu->tu_icsw, 570 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_INV); 571 (void) ea_attach_item(record, &tu->tu_nsig, 572 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SIGNALS); 573 (void) ea_attach_item(record, &tu->tu_nswp, 574 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SWAPS); 575 (void) ea_attach_item(record, &tu->tu_nscl, 576 sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SYSCALLS); 577 break; 578 case AC_TASK_ANCTASKID: 579 (void) ea_attach_item(record, &tu->tu_anctaskid, 580 sizeof (uint32_t), EXT_UINT32 | EXD_TASK_ANCTASKID); 581 break; 582 case AC_TASK_ZONENAME: 583 (void) ea_attach_item(record, tk->tk_zone->zone_name, 584 strlen(tk->tk_zone->zone_name) + 1, 585 EXT_STRING | EXD_TASK_ZONENAME); 586 break; 587 default: 588 attached = 0; 589 } 590 return (attached); 591 } 592 593 static ea_object_t * 594 exacct_assemble_task_record(task_t *tk, task_usage_t *tu, ulong_t *mask, 595 ea_catalog_t record_type) 596 { 597 int res, count; 598 ea_object_t *record; 599 600 /* 601 * Assemble usage values into group. 602 */ 603 record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type); 604 for (res = 1, count = 0; res <= AC_TASK_MAX_RES; res++) 605 if (BT_TEST(mask, res)) 606 count += exacct_attach_task_item(tk, tu, record, res); 607 if (count == 0) { 608 ea_free_object(record, EUP_ALLOC); 609 record = NULL; 610 } 611 return (record); 612 } 613 614 /* 615 * int exacct_assemble_task_usage(task_t *, int (*)(void *, size_t, void *, 616 * size_t, size_t *), void *, size_t, size_t *, int) 617 * 618 * Overview 619 * exacct_assemble_task_usage() builds the packed exacct buffer for the 620 * indicated task, executes the given callback function, and free the packed 621 * buffer. 622 * 623 * Return values 624 * Returns 0 on success; otherwise the appropriate error code is returned. 625 * 626 * Caller's context 627 * Suitable for KM_SLEEP allocations. 628 */ 629 int 630 exacct_assemble_task_usage(ac_info_t *ac_task, task_t *tk, 631 int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *), 632 void *ubuf, size_t ubufsize, size_t *actual, int flag) 633 { 634 ulong_t mask[AC_MASK_SZ]; 635 ea_object_t *task_record; 636 ea_catalog_t record_type; 637 task_usage_t *tu; 638 void *buf; 639 size_t bufsize; 640 int ret; 641 642 ASSERT(flag == EW_FINAL || flag == EW_PARTIAL || flag == EW_INTERVAL); 643 644 mutex_enter(&ac_task->ac_lock); 645 if (ac_task->ac_state == AC_OFF) { 646 mutex_exit(&ac_task->ac_lock); 647 return (ENOTACTIVE); 648 } 649 bt_copy(ac_task->ac_mask, mask, AC_MASK_SZ); 650 mutex_exit(&ac_task->ac_lock); 651 652 switch (flag) { 653 case EW_FINAL: 654 record_type = EXD_GROUP_TASK; 655 break; 656 case EW_PARTIAL: 657 record_type = EXD_GROUP_TASK_PARTIAL; 658 break; 659 case EW_INTERVAL: 660 record_type = EXD_GROUP_TASK_INTERVAL; 661 break; 662 } 663 664 /* 665 * Calculate task usage and assemble it into the task record. 666 */ 667 tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP); 668 exacct_calculate_task_usage(tk, tu, flag); 669 task_record = exacct_assemble_task_record(tk, tu, mask, record_type); 670 if (task_record == NULL) { 671 /* 672 * The current configuration of the accounting system has 673 * resulted in records with no data; accordingly, we don't write 674 * these, but we return success. 675 */ 676 kmem_free(tu, sizeof (task_usage_t)); 677 return (0); 678 } 679 680 /* 681 * Pack object into buffer and run callback on it. 682 */ 683 bufsize = ea_pack_object(task_record, NULL, 0); 684 buf = kmem_alloc(bufsize, KM_SLEEP); 685 (void) ea_pack_object(task_record, buf, bufsize); 686 ret = callback(ac_task, ubuf, ubufsize, buf, bufsize, actual); 687 688 /* 689 * Free all previously allocated structures. 690 */ 691 kmem_free(buf, bufsize); 692 ea_free_object(task_record, EUP_ALLOC); 693 kmem_free(tu, sizeof (task_usage_t)); 694 return (ret); 695 } 696 697 /* 698 * void exacct_commit_task(void *) 699 * 700 * Overview 701 * exacct_commit_task() calculates the final usage for a task, updating the 702 * task usage if task accounting is active, and writing a task record if task 703 * accounting is active. exacct_commit_task() is intended for being called 704 * from a task queue (taskq_t). 705 * 706 * Return values 707 * None. 708 * 709 * Caller's context 710 * Suitable for KM_SLEEP allocations. 711 */ 712 713 void 714 exacct_commit_task(void *arg) 715 { 716 task_t *tk = (task_t *)arg; 717 size_t size; 718 zone_t *zone = tk->tk_zone; 719 struct exacct_globals *acg; 720 721 ASSERT(tk != task0p); 722 ASSERT(tk->tk_memb_list == NULL); 723 724 /* 725 * Don't do any extra work if the acctctl module isn't loaded. 726 */ 727 if (exacct_zone_key != ZONE_KEY_UNINITIALIZED) { 728 acg = zone_getspecific(exacct_zone_key, zone); 729 (void) exacct_assemble_task_usage(&acg->ac_task, tk, 730 exacct_commit_callback, NULL, 0, &size, EW_FINAL); 731 if (tk->tk_zone != global_zone) { 732 acg = zone_getspecific(exacct_zone_key, global_zone); 733 (void) exacct_assemble_task_usage(&acg->ac_task, tk, 734 exacct_commit_callback, NULL, 0, &size, EW_FINAL); 735 } 736 } 737 /* 738 * Release associated project and finalize task. 739 */ 740 task_end(tk); 741 } 742 743 static int 744 exacct_attach_proc_item(proc_usage_t *pu, ea_object_t *record, int res) 745 { 746 int attached = 1; 747 748 switch (res) { 749 case AC_PROC_PID: 750 (void) ea_attach_item(record, &pu->pu_pid, 751 sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PID); 752 break; 753 case AC_PROC_UID: 754 (void) ea_attach_item(record, &pu->pu_ruid, 755 sizeof (uint32_t), EXT_UINT32 | EXD_PROC_UID); 756 break; 757 case AC_PROC_FLAG: 758 (void) ea_attach_item(record, &pu->pu_acflag, 759 sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ACCT_FLAGS); 760 break; 761 case AC_PROC_GID: 762 (void) ea_attach_item(record, &pu->pu_rgid, 763 sizeof (uint32_t), EXT_UINT32 | EXD_PROC_GID); 764 break; 765 case AC_PROC_PROJID: 766 (void) ea_attach_item(record, &pu->pu_projid, 767 sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PROJID); 768 break; 769 case AC_PROC_TASKID: 770 (void) ea_attach_item(record, &pu->pu_taskid, 771 sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TASKID); 772 break; 773 case AC_PROC_CPU: 774 (void) ea_attach_item(record, &pu->pu_utimesec, 775 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_SEC); 776 (void) ea_attach_item(record, &pu->pu_utimensec, 777 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_NSEC); 778 (void) ea_attach_item(record, &pu->pu_stimesec, 779 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_SEC); 780 (void) ea_attach_item(record, &pu->pu_stimensec, 781 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_NSEC); 782 break; 783 case AC_PROC_TIME: 784 (void) ea_attach_item(record, &pu->pu_startsec, 785 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_SEC); 786 (void) ea_attach_item(record, &pu->pu_startnsec, 787 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_NSEC); 788 (void) ea_attach_item(record, &pu->pu_finishsec, 789 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_SEC); 790 (void) ea_attach_item(record, &pu->pu_finishnsec, 791 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_NSEC); 792 break; 793 case AC_PROC_COMMAND: 794 (void) ea_attach_item(record, pu->pu_command, 795 strlen(pu->pu_command) + 1, EXT_STRING | EXD_PROC_COMMAND); 796 break; 797 case AC_PROC_HOSTNAME: 798 (void) ea_attach_item(record, pu->pu_nodename, 799 strlen(pu->pu_nodename) + 1, 800 EXT_STRING | EXD_PROC_HOSTNAME); 801 break; 802 case AC_PROC_TTY: 803 (void) ea_attach_item(record, &pu->pu_major, 804 sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MAJOR); 805 (void) ea_attach_item(record, &pu->pu_minor, 806 sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MINOR); 807 break; 808 case AC_PROC_MICROSTATE: 809 (void) ea_attach_item(record, &pu->pu_majflt, 810 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MAJOR); 811 (void) ea_attach_item(record, &pu->pu_minflt, 812 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MINOR); 813 (void) ea_attach_item(record, &pu->pu_sndmsg, 814 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_SND); 815 (void) ea_attach_item(record, &pu->pu_rcvmsg, 816 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_RCV); 817 (void) ea_attach_item(record, &pu->pu_iblk, 818 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_IN); 819 (void) ea_attach_item(record, &pu->pu_oblk, 820 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_OUT); 821 (void) ea_attach_item(record, &pu->pu_ioch, 822 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CHARS_RDWR); 823 (void) ea_attach_item(record, &pu->pu_vcsw, 824 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_VOL); 825 (void) ea_attach_item(record, &pu->pu_icsw, 826 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_INV); 827 (void) ea_attach_item(record, &pu->pu_nsig, 828 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SIGNALS); 829 (void) ea_attach_item(record, &pu->pu_nswp, 830 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SWAPS); 831 (void) ea_attach_item(record, &pu->pu_nscl, 832 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SYSCALLS); 833 break; 834 case AC_PROC_ANCPID: 835 (void) ea_attach_item(record, &pu->pu_ancpid, 836 sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ANCPID); 837 break; 838 case AC_PROC_WAIT_STATUS: 839 (void) ea_attach_item(record, &pu->pu_wstat, 840 sizeof (uint32_t), EXT_UINT32 | EXD_PROC_WAIT_STATUS); 841 break; 842 case AC_PROC_ZONENAME: 843 (void) ea_attach_item(record, pu->pu_zonename, 844 strlen(pu->pu_zonename) + 1, 845 EXT_STRING | EXD_PROC_ZONENAME); 846 break; 847 case AC_PROC_MEM: 848 (void) ea_attach_item(record, &pu->pu_mem_rss_avg, 849 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_AVG_K); 850 (void) ea_attach_item(record, &pu->pu_mem_rss_max, 851 sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_MAX_K); 852 break; 853 default: 854 attached = 0; 855 } 856 return (attached); 857 } 858 859 static ea_object_t * 860 exacct_assemble_proc_record(proc_usage_t *pu, ulong_t *mask, 861 ea_catalog_t record_type) 862 { 863 int res, count; 864 ea_object_t *record; 865 866 /* 867 * Assemble usage values into group. 868 */ 869 record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type); 870 for (res = 1, count = 0; res <= AC_PROC_MAX_RES; res++) 871 if (BT_TEST(mask, res)) 872 count += exacct_attach_proc_item(pu, record, res); 873 if (count == 0) { 874 ea_free_object(record, EUP_ALLOC); 875 record = NULL; 876 } 877 return (record); 878 } 879 880 /* 881 * The following two routines assume that process's p_lock is held or 882 * exacct_commit_proc has been called from exit() when all lwps are stopped. 883 */ 884 static void 885 exacct_calculate_proc_mstate(proc_t *p, proc_usage_t *pu) 886 { 887 kthread_t *t; 888 889 ASSERT(MUTEX_HELD(&p->p_lock)); 890 if ((t = p->p_tlist) == NULL) 891 return; 892 893 do { 894 pu->pu_minflt += t->t_lwp->lwp_ru.minflt; 895 pu->pu_majflt += t->t_lwp->lwp_ru.majflt; 896 pu->pu_sndmsg += t->t_lwp->lwp_ru.msgsnd; 897 pu->pu_rcvmsg += t->t_lwp->lwp_ru.msgrcv; 898 pu->pu_ioch += t->t_lwp->lwp_ru.ioch; 899 pu->pu_iblk += t->t_lwp->lwp_ru.inblock; 900 pu->pu_oblk += t->t_lwp->lwp_ru.oublock; 901 pu->pu_vcsw += t->t_lwp->lwp_ru.nvcsw; 902 pu->pu_icsw += t->t_lwp->lwp_ru.nivcsw; 903 pu->pu_nsig += t->t_lwp->lwp_ru.nsignals; 904 pu->pu_nswp += t->t_lwp->lwp_ru.nswap; 905 pu->pu_nscl += t->t_lwp->lwp_ru.sysc; 906 } while ((t = t->t_forw) != p->p_tlist); 907 } 908 909 static void 910 exacct_copy_proc_mstate(proc_t *p, proc_usage_t *pu) 911 { 912 pu->pu_minflt = p->p_ru.minflt; 913 pu->pu_majflt = p->p_ru.majflt; 914 pu->pu_sndmsg = p->p_ru.msgsnd; 915 pu->pu_rcvmsg = p->p_ru.msgrcv; 916 pu->pu_ioch = p->p_ru.ioch; 917 pu->pu_iblk = p->p_ru.inblock; 918 pu->pu_oblk = p->p_ru.oublock; 919 pu->pu_vcsw = p->p_ru.nvcsw; 920 pu->pu_icsw = p->p_ru.nivcsw; 921 pu->pu_nsig = p->p_ru.nsignals; 922 pu->pu_nswp = p->p_ru.nswap; 923 pu->pu_nscl = p->p_ru.sysc; 924 } 925 926 void 927 exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask, 928 int flag, int wstat) 929 { 930 timestruc_t ts, ts_run; 931 932 ASSERT(MUTEX_HELD(&p->p_lock)); 933 934 /* 935 * Convert CPU and execution times to sec/nsec format. 936 */ 937 if (BT_TEST(mask, AC_PROC_CPU)) { 938 hrt2ts(mstate_aggr_state(p, LMS_USER), &ts); 939 pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec; 940 pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec; 941 hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts); 942 pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec; 943 pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec; 944 } 945 if (BT_TEST(mask, AC_PROC_TIME)) { 946 gethrestime(&ts); 947 pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec; 948 pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec; 949 hrt2ts(gethrtime() - p->p_mstart, &ts_run); 950 ts.tv_sec -= ts_run.tv_sec; 951 ts.tv_nsec -= ts_run.tv_nsec; 952 if (ts.tv_nsec < 0) { 953 ts.tv_sec--; 954 if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) { 955 ts.tv_sec++; 956 ts.tv_nsec -= NANOSEC; 957 } 958 } 959 pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec; 960 pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec; 961 } 962 963 pu->pu_pid = p->p_pidp->pid_id; 964 pu->pu_acflag = p->p_user.u_acflag; 965 pu->pu_projid = p->p_task->tk_proj->kpj_id; 966 pu->pu_taskid = p->p_task->tk_tkid; 967 pu->pu_major = getmajor(p->p_sessp->s_dev); 968 pu->pu_minor = getminor(p->p_sessp->s_dev); 969 pu->pu_ancpid = p->p_ancpid; 970 pu->pu_wstat = wstat; 971 /* 972 * Compute average RSS in K. The denominator is the number of 973 * samples: the number of clock ticks plus the initial value. 974 */ 975 pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) * 976 (PAGESIZE / 1024); 977 pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024); 978 979 mutex_enter(&p->p_crlock); 980 pu->pu_ruid = crgetruid(p->p_cred); 981 pu->pu_rgid = crgetrgid(p->p_cred); 982 mutex_exit(&p->p_crlock); 983 984 bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1); 985 bcopy(p->p_zone->zone_name, pu->pu_zonename, 986 strlen(p->p_zone->zone_name) + 1); 987 bcopy(p->p_zone->zone_nodename, pu->pu_nodename, 988 strlen(p->p_zone->zone_nodename) + 1); 989 990 /* 991 * Calculate microstate accounting data for a process that is still 992 * running. Presently, we explicitly collect all of the LWP usage into 993 * the proc usage structure here. 994 */ 995 if (flag & EW_PARTIAL) 996 exacct_calculate_proc_mstate(p, pu); 997 if (flag & EW_FINAL) 998 exacct_copy_proc_mstate(p, pu); 999 } 1000 1001 /* 1002 * int exacct_assemble_proc_usage(proc_usage_t *, int (*)(void *, size_t, void 1003 * *, size_t, size_t *), void *, size_t, size_t *) 1004 * 1005 * Overview 1006 * Assemble record with miscellaneous accounting information about the process 1007 * and execute the callback on it. It is the callback's job to set "actual" to 1008 * the size of record. 1009 * 1010 * Return values 1011 * The result of the callback function, unless the extended process accounting 1012 * feature is not active, in which case ENOTACTIVE is returned. 1013 * 1014 * Caller's context 1015 * Suitable for KM_SLEEP allocations. 1016 */ 1017 int 1018 exacct_assemble_proc_usage(ac_info_t *ac_proc, proc_usage_t *pu, 1019 int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *), 1020 void *ubuf, size_t ubufsize, size_t *actual, int flag) 1021 { 1022 ulong_t mask[AC_MASK_SZ]; 1023 ea_object_t *proc_record; 1024 ea_catalog_t record_type; 1025 void *buf; 1026 size_t bufsize; 1027 int ret; 1028 1029 ASSERT(flag == EW_FINAL || flag == EW_PARTIAL); 1030 1031 mutex_enter(&ac_proc->ac_lock); 1032 if (ac_proc->ac_state == AC_OFF) { 1033 mutex_exit(&ac_proc->ac_lock); 1034 return (ENOTACTIVE); 1035 } 1036 bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ); 1037 mutex_exit(&ac_proc->ac_lock); 1038 1039 switch (flag) { 1040 case EW_FINAL: 1041 record_type = EXD_GROUP_PROC; 1042 break; 1043 case EW_PARTIAL: 1044 record_type = EXD_GROUP_PROC_PARTIAL; 1045 break; 1046 } 1047 1048 proc_record = exacct_assemble_proc_record(pu, mask, record_type); 1049 if (proc_record == NULL) 1050 return (0); 1051 1052 /* 1053 * Pack object into buffer and pass to callback. 1054 */ 1055 bufsize = ea_pack_object(proc_record, NULL, 0); 1056 buf = kmem_alloc(bufsize, KM_SLEEP); 1057 (void) ea_pack_object(proc_record, buf, bufsize); 1058 1059 ret = callback(ac_proc, ubuf, ubufsize, buf, bufsize, actual); 1060 1061 /* 1062 * Free all previously allocations. 1063 */ 1064 kmem_free(buf, bufsize); 1065 ea_free_object(proc_record, EUP_ALLOC); 1066 return (ret); 1067 } 1068 1069 /* 1070 * int exacct_commit_callback(ac_info_t *, void *, size_t, void *, size_t, 1071 * size_t *) 1072 * 1073 * Overview 1074 * exacct_commit_callback() writes the indicated buffer to the indicated 1075 * extended accounting file. 1076 * 1077 * Return values 1078 * The result of the write operation is returned. "actual" is updated to 1079 * contain the number of bytes actually written. 1080 * 1081 * Caller's context 1082 * Suitable for a vn_rdwr() operation. 1083 */ 1084 /*ARGSUSED*/ 1085 int 1086 exacct_commit_callback(ac_info_t *info, void *ubuf, size_t ubufsize, 1087 void *buf, size_t bufsize, size_t *actual) 1088 { 1089 int error = 0; 1090 1091 *actual = 0; 1092 if ((error = exacct_vn_write(info, buf, bufsize)) == 0) 1093 *actual = bufsize; 1094 return (error); 1095 } 1096 1097 static void 1098 exacct_do_commit_proc(ac_info_t *ac_proc, proc_t *p, int wstat) 1099 { 1100 size_t size; 1101 proc_usage_t *pu; 1102 ulong_t mask[AC_MASK_SZ]; 1103 1104 mutex_enter(&ac_proc->ac_lock); 1105 if (ac_proc->ac_state == AC_ON) { 1106 bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ); 1107 mutex_exit(&ac_proc->ac_lock); 1108 } else { 1109 mutex_exit(&ac_proc->ac_lock); 1110 return; 1111 } 1112 1113 mutex_enter(&p->p_lock); 1114 size = strlen(p->p_user.u_comm) + 1; 1115 mutex_exit(&p->p_lock); 1116 1117 pu = kmem_alloc(sizeof (proc_usage_t), KM_SLEEP); 1118 pu->pu_command = kmem_alloc(size, KM_SLEEP); 1119 mutex_enter(&p->p_lock); 1120 exacct_calculate_proc_usage(p, pu, mask, EW_FINAL, wstat); 1121 mutex_exit(&p->p_lock); 1122 1123 (void) exacct_assemble_proc_usage(ac_proc, pu, 1124 exacct_commit_callback, NULL, 0, &size, EW_FINAL); 1125 1126 kmem_free(pu->pu_command, strlen(pu->pu_command) + 1); 1127 kmem_free(pu, sizeof (proc_usage_t)); 1128 } 1129 1130 /* 1131 * void exacct_commit_proc(proc_t *, int) 1132 * 1133 * Overview 1134 * exacct_commit_proc() calculates the final usage for a process, updating the 1135 * task usage if task accounting is active, and writing a process record if 1136 * process accounting is active. exacct_commit_proc() is intended for being 1137 * called from proc_exit(). 1138 * 1139 * Return values 1140 * None. 1141 * 1142 * Caller's context 1143 * Suitable for KM_SLEEP allocations. p_lock must not be held at entry. 1144 */ 1145 void 1146 exacct_commit_proc(proc_t *p, int wstat) 1147 { 1148 zone_t *zone = p->p_zone; 1149 struct exacct_globals *acg, *gacg = NULL; 1150 1151 if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) { 1152 /* 1153 * acctctl module not loaded. Nothing to do. 1154 */ 1155 return; 1156 } 1157 acg = zone_getspecific(exacct_zone_key, zone); 1158 exacct_do_commit_proc(&acg->ac_proc, p, wstat); 1159 if (zone != global_zone) { 1160 gacg = zone_getspecific(exacct_zone_key, global_zone); 1161 exacct_do_commit_proc(&gacg->ac_proc, p, wstat); 1162 } 1163 } 1164 1165 static int 1166 exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res) 1167 { 1168 int attached = 1; 1169 1170 switch (res) { 1171 case AC_FLOW_SADDR: 1172 if (fu->fu_isv4) { 1173 (void) ea_attach_item(record, &fu->fu_saddr[3], 1174 sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4SADDR); 1175 } else { 1176 (void) ea_attach_item(record, &fu->fu_saddr, 1177 sizeof (fu->fu_saddr), EXT_RAW | 1178 EXD_FLOW_V6SADDR); 1179 } 1180 break; 1181 case AC_FLOW_DADDR: 1182 if (fu->fu_isv4) { 1183 (void) ea_attach_item(record, &fu->fu_daddr[3], 1184 sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4DADDR); 1185 } else { 1186 (void) ea_attach_item(record, &fu->fu_daddr, 1187 sizeof (fu->fu_daddr), EXT_RAW | 1188 EXD_FLOW_V6DADDR); 1189 } 1190 break; 1191 case AC_FLOW_SPORT: 1192 (void) ea_attach_item(record, &fu->fu_sport, 1193 sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_SPORT); 1194 break; 1195 case AC_FLOW_DPORT: 1196 (void) ea_attach_item(record, &fu->fu_dport, 1197 sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_DPORT); 1198 break; 1199 case AC_FLOW_PROTOCOL: 1200 (void) ea_attach_item(record, &fu->fu_protocol, 1201 sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_PROTOCOL); 1202 break; 1203 case AC_FLOW_DSFIELD: 1204 (void) ea_attach_item(record, &fu->fu_dsfield, 1205 sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_DSFIELD); 1206 break; 1207 case AC_FLOW_CTIME: 1208 (void) ea_attach_item(record, &fu->fu_ctime, 1209 sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_CTIME); 1210 break; 1211 case AC_FLOW_LSEEN: 1212 (void) ea_attach_item(record, &fu->fu_lseen, 1213 sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_LSEEN); 1214 break; 1215 case AC_FLOW_NBYTES: 1216 (void) ea_attach_item(record, &fu->fu_nbytes, 1217 sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NBYTES); 1218 break; 1219 case AC_FLOW_NPKTS: 1220 (void) ea_attach_item(record, &fu->fu_npackets, 1221 sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NPKTS); 1222 break; 1223 case AC_FLOW_PROJID: 1224 if (fu->fu_projid >= 0) { 1225 (void) ea_attach_item(record, &fu->fu_projid, 1226 sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_PROJID); 1227 } 1228 break; 1229 case AC_FLOW_UID: 1230 if (fu->fu_userid >= 0) { 1231 (void) ea_attach_item(record, &fu->fu_userid, 1232 sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID); 1233 } 1234 break; 1235 case AC_FLOW_ANAME: 1236 (void) ea_attach_item(record, fu->fu_aname, 1237 strlen(fu->fu_aname) + 1, EXT_STRING | EXD_FLOW_ANAME); 1238 break; 1239 default: 1240 attached = 0; 1241 } 1242 return (attached); 1243 } 1244 1245 static ea_object_t * 1246 exacct_assemble_flow_record(flow_usage_t *fu, ulong_t *mask, 1247 ea_catalog_t record_type) 1248 { 1249 int res, count; 1250 ea_object_t *record; 1251 1252 /* 1253 * Assemble usage values into group. 1254 */ 1255 record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type); 1256 for (res = 1, count = 0; res <= AC_FLOW_MAX_RES; res++) 1257 if (BT_TEST(mask, res)) 1258 count += exacct_attach_flow_item(fu, record, res); 1259 if (count == 0) { 1260 ea_free_object(record, EUP_ALLOC); 1261 record = NULL; 1262 } 1263 return (record); 1264 } 1265 1266 int 1267 exacct_assemble_flow_usage(ac_info_t *ac_flow, flow_usage_t *fu, 1268 int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *), 1269 void *ubuf, size_t ubufsize, size_t *actual) 1270 { 1271 ulong_t mask[AC_MASK_SZ]; 1272 ea_object_t *flow_usage; 1273 ea_catalog_t record_type; 1274 void *buf; 1275 size_t bufsize; 1276 int ret; 1277 1278 mutex_enter(&ac_flow->ac_lock); 1279 if (ac_flow->ac_state == AC_OFF) { 1280 mutex_exit(&ac_flow->ac_lock); 1281 return (ENOTACTIVE); 1282 } 1283 bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ); 1284 mutex_exit(&ac_flow->ac_lock); 1285 1286 record_type = EXD_GROUP_FLOW; 1287 1288 flow_usage = exacct_assemble_flow_record(fu, mask, record_type); 1289 if (flow_usage == NULL) { 1290 return (0); 1291 } 1292 1293 /* 1294 * Pack object into buffer and pass to callback. 1295 */ 1296 bufsize = ea_pack_object(flow_usage, NULL, 0); 1297 buf = kmem_alloc(bufsize, KM_NOSLEEP); 1298 if (buf == NULL) { 1299 return (ENOMEM); 1300 } 1301 1302 (void) ea_pack_object(flow_usage, buf, bufsize); 1303 1304 ret = callback(ac_flow, ubuf, ubufsize, buf, bufsize, actual); 1305 1306 /* 1307 * Free all previously allocations. 1308 */ 1309 kmem_free(buf, bufsize); 1310 ea_free_object(flow_usage, EUP_ALLOC); 1311 return (ret); 1312 } 1313 1314 void 1315 exacct_commit_flow(void *arg) 1316 { 1317 flow_usage_t *f = (flow_usage_t *)arg; 1318 size_t size; 1319 ulong_t mask[AC_MASK_SZ]; 1320 struct exacct_globals *acg; 1321 ac_info_t *ac_flow; 1322 1323 if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) { 1324 /* 1325 * acctctl module not loaded. Nothing to do. 1326 */ 1327 return; 1328 } 1329 1330 /* 1331 * Even though each zone nominally has its own flow accounting settings 1332 * (ac_flow), these are only maintained by and for the global zone. 1333 * 1334 * If this were to change in the future, this function should grow a 1335 * second zoneid (or zone) argument, and use the corresponding zone's 1336 * settings rather than always using those of the global zone. 1337 */ 1338 acg = zone_getspecific(exacct_zone_key, global_zone); 1339 ac_flow = &acg->ac_flow; 1340 1341 mutex_enter(&ac_flow->ac_lock); 1342 if (ac_flow->ac_state == AC_OFF) { 1343 mutex_exit(&ac_flow->ac_lock); 1344 return; 1345 } 1346 bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ); 1347 mutex_exit(&ac_flow->ac_lock); 1348 1349 (void) exacct_assemble_flow_usage(ac_flow, f, exacct_commit_callback, 1350 NULL, 0, &size); 1351 } 1352 1353 /* 1354 * int exacct_tag_task(task_t *, void *, size_t, int) 1355 * 1356 * Overview 1357 * exacct_tag_task() provides the exacct record construction and writing 1358 * support required by putacct(2) for task entities. 1359 * 1360 * Return values 1361 * The result of the write operation is returned, unless the extended 1362 * accounting facility is not active, in which case ENOTACTIVE is returned. 1363 * 1364 * Caller's context 1365 * Suitable for KM_SLEEP allocations. 1366 */ 1367 int 1368 exacct_tag_task(ac_info_t *ac_task, task_t *tk, void *ubuf, size_t ubufsz, 1369 int flags) 1370 { 1371 int error = 0; 1372 void *buf; 1373 size_t bufsize; 1374 ea_catalog_t cat; 1375 ea_object_t *tag; 1376 1377 mutex_enter(&ac_task->ac_lock); 1378 if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) { 1379 mutex_exit(&ac_task->ac_lock); 1380 return (ENOTACTIVE); 1381 } 1382 mutex_exit(&ac_task->ac_lock); 1383 1384 tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_TASK_TAG); 1385 (void) ea_attach_item(tag, &tk->tk_tkid, 0, 1386 EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID); 1387 (void) ea_attach_item(tag, tk->tk_zone->zone_nodename, 0, 1388 EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME); 1389 if (flags == EP_RAW) 1390 cat = EXT_RAW | EXC_DEFAULT | EXD_TASK_TAG; 1391 else 1392 cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_TASK_TAG; 1393 (void) ea_attach_item(tag, ubuf, ubufsz, cat); 1394 1395 bufsize = ea_pack_object(tag, NULL, 0); 1396 buf = kmem_alloc(bufsize, KM_SLEEP); 1397 (void) ea_pack_object(tag, buf, bufsize); 1398 error = exacct_vn_write(ac_task, buf, bufsize); 1399 kmem_free(buf, bufsize); 1400 ea_free_object(tag, EUP_ALLOC); 1401 return (error); 1402 } 1403 1404 /* 1405 * exacct_tag_proc(pid_t, taskid_t, void *, size_t, int, char *) 1406 * 1407 * Overview 1408 * exacct_tag_proc() provides the exacct record construction and writing 1409 * support required by putacct(2) for processes. 1410 * 1411 * Return values 1412 * The result of the write operation is returned, unless the extended 1413 * accounting facility is not active, in which case ENOTACTIVE is returned. 1414 * 1415 * Caller's context 1416 * Suitable for KM_SLEEP allocations. 1417 */ 1418 int 1419 exacct_tag_proc(ac_info_t *ac_proc, pid_t pid, taskid_t tkid, void *ubuf, 1420 size_t ubufsz, int flags, const char *hostname) 1421 { 1422 int error = 0; 1423 void *buf; 1424 size_t bufsize; 1425 ea_catalog_t cat; 1426 ea_object_t *tag; 1427 1428 mutex_enter(&ac_proc->ac_lock); 1429 if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) { 1430 mutex_exit(&ac_proc->ac_lock); 1431 return (ENOTACTIVE); 1432 } 1433 mutex_exit(&ac_proc->ac_lock); 1434 1435 tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_PROC_TAG); 1436 (void) ea_attach_item(tag, &pid, sizeof (uint32_t), 1437 EXT_UINT32 | EXC_DEFAULT | EXD_PROC_PID); 1438 (void) ea_attach_item(tag, &tkid, 0, 1439 EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID); 1440 (void) ea_attach_item(tag, (void *)hostname, 0, 1441 EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME); 1442 if (flags == EP_RAW) 1443 cat = EXT_RAW | EXC_DEFAULT | EXD_PROC_TAG; 1444 else 1445 cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_PROC_TAG; 1446 (void) ea_attach_item(tag, ubuf, ubufsz, cat); 1447 1448 bufsize = ea_pack_object(tag, NULL, 0); 1449 buf = kmem_alloc(bufsize, KM_SLEEP); 1450 (void) ea_pack_object(tag, buf, bufsize); 1451 error = exacct_vn_write(ac_proc, buf, bufsize); 1452 kmem_free(buf, bufsize); 1453 ea_free_object(tag, EUP_ALLOC); 1454 return (error); 1455 } 1456 1457 /* 1458 * void exacct_init(void) 1459 * 1460 * Overview 1461 * Initialized the extended accounting subsystem. 1462 * 1463 * Return values 1464 * None. 1465 * 1466 * Caller's context 1467 * Suitable for KM_SLEEP allocations. 1468 */ 1469 void 1470 exacct_init() 1471 { 1472 exacct_queue = system_taskq; 1473 exacct_object_cache = kmem_cache_create("exacct_object_cache", 1474 sizeof (ea_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 1475 } 1476 1477 /* 1478 * exacct_snapshot_proc_mstate() copies a process's microstate accounting data 1479 * and resource usage counters into a given task_usage_t. It differs from 1480 * exacct_copy_proc_mstate() in that here a) we are copying to a task_usage_t, 1481 * b) p_lock will have been acquired earlier in the call path and c) we 1482 * are here including the process's user and system times. 1483 */ 1484 static void 1485 exacct_snapshot_proc_mstate(proc_t *p, task_usage_t *tu) 1486 { 1487 tu->tu_utime = mstate_aggr_state(p, LMS_USER); 1488 tu->tu_stime = mstate_aggr_state(p, LMS_SYSTEM); 1489 tu->tu_minflt = p->p_ru.minflt; 1490 tu->tu_majflt = p->p_ru.majflt; 1491 tu->tu_sndmsg = p->p_ru.msgsnd; 1492 tu->tu_rcvmsg = p->p_ru.msgrcv; 1493 tu->tu_ioch = p->p_ru.ioch; 1494 tu->tu_iblk = p->p_ru.inblock; 1495 tu->tu_oblk = p->p_ru.oublock; 1496 tu->tu_vcsw = p->p_ru.nvcsw; 1497 tu->tu_icsw = p->p_ru.nivcsw; 1498 tu->tu_nsig = p->p_ru.nsignals; 1499 tu->tu_nswp = p->p_ru.nswap; 1500 tu->tu_nscl = p->p_ru.sysc; 1501 } 1502 1503 /* 1504 * void exacct_move_mstate(proc_t *, task_t *, task_t *) 1505 * 1506 * Overview 1507 * exacct_move_mstate() is called by task_change() and accounts for 1508 * a process's resource usage when it is moved from one task to another. 1509 * 1510 * The process's usage at this point is recorded in the new task so 1511 * that it can be excluded from the calculation of resources consumed 1512 * by that task. 1513 * 1514 * The resource usage inherited by the new task is also added to the 1515 * aggregate maintained by the old task for processes that have exited. 1516 * 1517 * Return values 1518 * None. 1519 * 1520 * Caller's context 1521 * pidlock and p_lock held across exacct_move_mstate(). 1522 */ 1523 void 1524 exacct_move_mstate(proc_t *p, task_t *oldtk, task_t *newtk) 1525 { 1526 task_usage_t tu; 1527 1528 /* Take a snapshot of this process's mstate and RU counters */ 1529 exacct_snapshot_proc_mstate(p, &tu); 1530 1531 /* 1532 * Use the snapshot to increment the aggregate usage of the old 1533 * task, and the inherited usage of the new one. 1534 */ 1535 mutex_enter(&oldtk->tk_usage_lock); 1536 exacct_add_task_mstate(oldtk->tk_usage, &tu); 1537 mutex_exit(&oldtk->tk_usage_lock); 1538 mutex_enter(&newtk->tk_usage_lock); 1539 exacct_add_task_mstate(newtk->tk_inherited, &tu); 1540 mutex_exit(&newtk->tk_usage_lock); 1541 } 1542