/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * vm_usage
 *
 * This file implements the getvmusage() private system call.
 * getvmusage() counts the amount of resident memory pages and swap
 * reserved by the specified process collective.  A "process collective" is
 * the set of processes owned by a particular zone, project, task, or user.
 *
 * rss and swap are counted so that for a given process collective, a page is
 * only counted once.  For example, this means that if multiple processes in
 * the same project map the same page, then the project will only be charged
 * once for that page.  On the other hand, if two processes in different
 * projects map the same page, then both projects will be charged
 * for the page.
 *
 * The vm_getusage() calculation is implemented so that the first thread
 * performs the rss/swap counting.  Other callers will wait for that thread
 * to finish and then copy the results.  This enables multiple rcapds and
 * prstats to consume data from the same calculation.  The results are also
 * cached so that a caller interested in recent results can just copy them
 * instead of starting a new calculation.  The caller passes the maximum age
 * (in seconds) of the data.  If the cached data is young enough, the cache
 * is copied; otherwise, a new calculation is executed and the cache is
 * replaced with the new data.
 *
 * The rss calculation for each process collective is as follows:
 *
 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *	- Figure out proc's collectives (zone, project, task, and/or user).
 *	- For each seg in proc's address space:
 *		- If seg is private:
 *			- Lookup anons in the amp.
 *			- For incore pages not previously visited for each
 *			  of the proc's collectives, add the incore pagesize
 *			  to each collective.
 *			  Anons with a refcnt of 1 can be assumed to be not
 *			  previously visited.
 *			- For address ranges without anons in the amp:
 *				- Lookup pages in underlying vnode.
 *				- For incore pages not previously visited for
 *				  each of the proc's collectives, add incore
 *				  pagesize to each collective.
 *		- If seg is shared:
 *			- Lookup pages in the shared amp or vnode.
 *			- For incore pages not previously visited for each of
 *			  the proc's collectives, add incore pagesize to each
 *			  collective.
 *
 * Swap is reserved by private segments and shared anonymous segments.
 * The only shared anon segments which do not reserve swap are ISM segments
 * and schedctl segments, both of which can be identified by having
 * amp->swresv == 0.
 *
 * The swap calculation for each collective is as follows:
 *
 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *	- Figure out proc's collectives (zone, project, task, and/or user).
 *	- For each seg in proc's address space:
 *		- If seg is private:
 *			- Add svd->swresv pages to swap count for each of the
 *			  proc's collectives.
 *		- If seg is anon, shared, and amp->swresv != 0:
 *			- For address ranges in amp not previously visited for
 *			  each of the proc's collectives, add size of address
 *			  range to the swap count for each collective.
 *
 * These two calculations are done simultaneously, with most of the work
 * being done in vmu_calculate_seg().  The results of the calculation are
 * copied into "vmu_data.vmu_cache_results".
 *
 * To perform the calculation, various things are tracked and cached:
 *
 *	- incore/not-incore page ranges for all vnodes.
 *	  (vmu_data.vmu_all_vnodes_hash)
 *	  This eliminates looking up the same page more than once.
 *
 *	- incore/not-incore page ranges for all shared amps.
 *	  (vmu_data.vmu_all_amps_hash)
 *	  This eliminates looking up the same page more than once.
 *
 *	- visited page ranges for each collective.
 *		- per vnode (entity->vme_vnode_hash)
 *		- per shared amp (entity->vme_amp_hash)
 *	  For accurate counting of map-shared and cow-shared pages.
 *
 *	- visited private anons (refcnt > 1) for each collective.
 *	  (entity->vme_anon_hash)
 *	  For accurate counting of cow-shared pages.
 *
 * The common accounting structure is the vmu_entity_t, which represents
 * collectives:
 *
 *	- A zone.
 *	- A project, task, or user within a zone.
 *	- The entire system (vmu_data.vmu_system).
 *	- Each collapsed (col) project and user.  This means a given projid
 *	  or uid, regardless of which zone the process is in.  For instance,
 *	  project 0 in the global zone and project 0 in a non-global zone are
 *	  the same collapsed project.
 *
 * Each entity structure tracks which pages have already been visited for
 * that entity (via previously inspected processes) so that these pages are
 * not double counted.
 */

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/zone.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/thread.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/modhash.h>
#include <sys/modhash_impl.h>
#include <sys/shm.h>
#include <sys/swap.h>
#include <sys/synch.h>
#include <sys/systm.h>
#include <sys/var.h>
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/seg_vn.h>
#include <vm/seg_spt.h>

#define	VMUSAGE_HASH_SIZE		512

#define	VMUSAGE_TYPE_VNODE		1
#define	VMUSAGE_TYPE_AMP		2
#define	VMUSAGE_TYPE_ANON		3

#define	VMUSAGE_BOUND_UNKNOWN		0
#define	VMUSAGE_BOUND_INCORE		1
#define	VMUSAGE_BOUND_NOT_INCORE	2

/*
 * Bounds for vnodes and shared amps.
 * Each bound is either entirely incore, entirely not in core, or
 * entirely unknown.  Bounds are stored in order by offset.
 */
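
/*
 * Illustrative example (not part of the original interface): for a vnode
 * whose pages 0-3 are resident and whose pages 4-9 are not, a fully
 * resolved bound list for that object would look like:
 *
 *	{ vmb_start = 0, vmb_end = 3, vmb_type = VMUSAGE_BOUND_INCORE }
 *	{ vmb_start = 4, vmb_end = 9, vmb_type = VMUSAGE_BOUND_NOT_INCORE }
 *
 * A range that has been claimed but not yet inspected is represented by a
 * single VMUSAGE_BOUND_UNKNOWN bound; the update routines later in this
 * file split and retype such bounds as residency is discovered.
 */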
typedef struct vmu_bound {
	struct vmu_bound *vmb_next;
	pgcnt_t vmb_start;	/* page offset in vnode/amp on which bound starts */
	pgcnt_t	vmb_end;	/* page offset in vnode/amp on which bound ends */
	char	vmb_type;	/* One of VMUSAGE_BOUND_* */
} vmu_bound_t;

/*
 * hash of visited objects (vnodes or shared amps)
 * key is address of vnode or amp.  Bounds lists known incore/non-incore
 * bounds for vnode/amp.
 */
typedef struct vmu_object {
	struct vmu_object *vmo_next;	/* free list */
	caddr_t		vmo_key;
	short		vmo_type;
	vmu_bound_t	*vmo_bounds;
} vmu_object_t;

/*
 * Entity by which to count results.
 *
 * The entity structure keeps the current rss/swap counts for each entity
 * (zone, project, etc), and hashes of vm structures that have already
 * been visited for the entity.
 *
 * vme_next:	links the list of all entities currently being counted by
 *		vmu_calculate().
 *
 * vme_next_calc: links the list of entities related to the current process
 *		being counted by vmu_calculate_proc().
 *
 * vmu_calculate_proc() walks all processes.  For each process, it makes a
 * list of the entities related to that process using vme_next_calc.  This
 * list changes each time vmu_calculate_proc() is called.
 *
 */
typedef struct vmu_entity {
	struct vmu_entity *vme_next;
	struct vmu_entity *vme_next_calc;
	mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
	mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
	mod_hash_t	*vme_anon_hash;	 /* cow anons visited for entity */
	vmusage_t	vme_result;	 /* identifies entity and results */
} vmu_entity_t;

/*
 * Hash of entities visited within a zone, and an entity for the zone
 * itself.
 */
typedef struct vmu_zone {
	struct vmu_zone	*vmz_next;	/* free list */
	id_t		vmz_id;
	vmu_entity_t	*vmz_zone;
	mod_hash_t	*vmz_projects_hash;
	mod_hash_t	*vmz_tasks_hash;
	mod_hash_t	*vmz_rusers_hash;
	mod_hash_t	*vmz_eusers_hash;
} vmu_zone_t;

/*
 * Cache of results from last calculation
 */
typedef struct vmu_cache {
	vmusage_t	*vmc_results;	/* Results from last call to */
					/* vm_getusage(). */
	uint64_t	vmc_nresults;	/* Count of cached results */
	uint64_t	vmc_refcnt;	/* refcnt for free */
	uint_t		vmc_flags;	/* Flags for vm_getusage() */
	hrtime_t	vmc_timestamp;	/* when cache was created */
} vmu_cache_t;

/*
 * top level rss info for the system
 */
typedef struct vmu_data {
	kmutex_t	vmu_lock;		/* Protects vmu_data */
	kcondvar_t	vmu_cv;			/* Used to signal threads */
						/* waiting for */
						/* rss_calc_thread to finish */
	vmu_entity_t	*vmu_system;		/* Entity for tracking */
						/* rss/swap for all processes */
						/* in all zones */
	mod_hash_t	*vmu_zones_hash;	/* Zones visited */
	mod_hash_t	*vmu_projects_col_hash;	/* These *_col_hash hashes */
	mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
	mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
						/* to implement VMUSAGE_COL_* */
						/* flags, which aggregate by */
						/* project or user regardless */
						/* of zoneid. */
	mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
						/* to track incore/not-incore */
	mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
						/* amps to track incore/not- */
						/* incore */
	vmu_entity_t	*vmu_entities;		/* Linked list of entities */
	size_t		vmu_nentities;		/* Count of entities in list */
	vmu_cache_t	*vmu_cache;		/* Cached results */
	kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
						/* vmu_calculate() */
	uint_t		vmu_calc_flags;		/* Flags being used by */
						/* currently running calc */
						/* thread */
	uint_t		vmu_pending_flags;	/* Flags of vm_getusage() */
						/* threads waiting for */
						/* calc thread to finish */
	uint_t		vmu_pending_waiters;	/* Number of threads waiting */
						/* for calc thread */
	vmu_bound_t	*vmu_free_bounds;
	vmu_object_t	*vmu_free_objects;
	vmu_entity_t	*vmu_free_entities;
	vmu_zone_t	*vmu_free_zones;
} vmu_data_t;

extern struct as kas;
extern proc_t *practive;
extern zone_t *global_zone;
extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

static vmu_data_t vmu_data;
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;

/*
 * Save a bound on the free list
 */
static void
vmu_free_bound(vmu_bound_t *bound)
{
	bound->vmb_next = vmu_data.vmu_free_bounds;
	vmu_data.vmu_free_bounds = bound;
}

/*
 * Free an object, and all visited bound info.
 */
static void
vmu_free_object(mod_hash_val_t val)
{
	vmu_object_t *obj = (vmu_object_t *)val;
	vmu_bound_t *bound = obj->vmo_bounds;
	vmu_bound_t *tmp;

	while (bound != NULL) {
		tmp = bound;
		bound = bound->vmb_next;
		vmu_free_bound(tmp);
	}
	obj->vmo_next = vmu_data.vmu_free_objects;
	vmu_data.vmu_free_objects = obj;
}

/*
 * Free an entity, and hashes of visited objects for that entity.
 */
static void
vmu_free_entity(mod_hash_val_t val)
{
	vmu_entity_t *entity = (vmu_entity_t *)val;

	if (entity->vme_vnode_hash != NULL)
		i_mod_hash_clear_nosync(entity->vme_vnode_hash);
	if (entity->vme_amp_hash != NULL)
		i_mod_hash_clear_nosync(entity->vme_amp_hash);
	if (entity->vme_anon_hash != NULL)
		i_mod_hash_clear_nosync(entity->vme_anon_hash);

	entity->vme_next = vmu_data.vmu_free_entities;
	vmu_data.vmu_free_entities = entity;
}

/*
 * Free zone entity, and all hashes of entities inside that zone,
 * which are projects, tasks, and users.
 */
static void
vmu_free_zone(mod_hash_val_t val)
{
	vmu_zone_t *zone = (vmu_zone_t *)val;

	if (zone->vmz_zone != NULL) {
		vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
		zone->vmz_zone = NULL;
	}
	if (zone->vmz_projects_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_projects_hash);
	if (zone->vmz_tasks_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
	if (zone->vmz_rusers_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
	if (zone->vmz_eusers_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
	zone->vmz_next = vmu_data.vmu_free_zones;
	vmu_data.vmu_free_zones = zone;
}

/*
 * Initialize synchronization primitives and hashes for system-wide tracking
 * of visited vnodes and shared amps.  Initialize results cache.
 */
void
vm_usage_init()
{
	mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);

	vmu_data.vmu_system = NULL;
	vmu_data.vmu_zones_hash = NULL;
	vmu_data.vmu_projects_col_hash = NULL;
	vmu_data.vmu_rusers_col_hash = NULL;
	vmu_data.vmu_eusers_col_hash = NULL;

	vmu_data.vmu_free_bounds = NULL;
	vmu_data.vmu_free_objects = NULL;
	vmu_data.vmu_free_entities = NULL;
	vmu_data.vmu_free_zones = NULL;

	vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
	    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
	    sizeof (vnode_t));
	vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
	    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
	    sizeof (struct anon_map));
	vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
	    "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
	    vmu_free_entity);
	vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
	    "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
	    vmu_free_entity);
	vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
	    "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
	    vmu_free_entity);
	vmu_data.vmu_zones_hash = mod_hash_create_idhash(
	    "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);

	vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
	    sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	vmu_object_cache = kmem_cache_create("vmu_object_cache",
	    sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	vmu_data.vmu_entities = NULL;
	vmu_data.vmu_nentities = 0;

	vmu_data.vmu_cache = NULL;
	vmu_data.vmu_calc_thread = NULL;
	vmu_data.vmu_calc_flags = 0;
	vmu_data.vmu_pending_flags = 0;
	vmu_data.vmu_pending_waiters = 0;
}

/*
 * Allocate hashes for tracking vm objects visited for an entity.
 * Update list of entities.
 */
static vmu_entity_t *
vmu_alloc_entity(id_t id, int type, id_t zoneid)
{
	vmu_entity_t *entity;

	if (vmu_data.vmu_free_entities != NULL) {
		entity = vmu_data.vmu_free_entities;
		vmu_data.vmu_free_entities =
		    vmu_data.vmu_free_entities->vme_next;
		bzero(&entity->vme_result, sizeof (vmusage_t));
	} else {
		entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
	}
	entity->vme_result.vmu_id = id;
	entity->vme_result.vmu_zoneid = zoneid;
	entity->vme_result.vmu_type = type;

	if (entity->vme_vnode_hash == NULL)
		entity->vme_vnode_hash = mod_hash_create_ptrhash(
		    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
		    sizeof (vnode_t));

	if (entity->vme_amp_hash == NULL)
		entity->vme_amp_hash = mod_hash_create_ptrhash(
		    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
		    sizeof (struct anon_map));

	if (entity->vme_anon_hash == NULL)
		entity->vme_anon_hash = mod_hash_create_ptrhash(
		    "vmusage anon hash", VMUSAGE_HASH_SIZE,
		    mod_hash_null_valdtor, sizeof (struct anon));

	entity->vme_next = vmu_data.vmu_entities;
	vmu_data.vmu_entities = entity;
	vmu_data.vmu_nentities++;

	return (entity);
}

/*
 * Allocate a zone entity, and hashes for tracking visited vm objects
 * for projects, tasks, and users within that zone.
 */
static vmu_zone_t *
vmu_alloc_zone(id_t id)
{
	vmu_zone_t *zone;

	if (vmu_data.vmu_free_zones != NULL) {
		zone = vmu_data.vmu_free_zones;
		vmu_data.vmu_free_zones =
		    vmu_data.vmu_free_zones->vmz_next;
		zone->vmz_next = NULL;
		zone->vmz_zone = NULL;
	} else {
		zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
	}

	zone->vmz_id = id;

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
	    VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
		zone->vmz_projects_hash = mod_hash_create_idhash(
		    "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
	    != 0 && zone->vmz_tasks_hash == NULL)
		zone->vmz_tasks_hash = mod_hash_create_idhash(
		    "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
	    != 0 && zone->vmz_rusers_hash == NULL)
		zone->vmz_rusers_hash = mod_hash_create_idhash(
		    "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
	    != 0 && zone->vmz_eusers_hash == NULL)
		zone->vmz_eusers_hash = mod_hash_create_idhash(
		    "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	return (zone);
}

/*
 * Allocate a structure for tracking visited bounds for a vm object.
 */
static vmu_object_t *
vmu_alloc_object(caddr_t key, int type)
{
	vmu_object_t *object;

	if (vmu_data.vmu_free_objects != NULL) {
		object = vmu_data.vmu_free_objects;
		vmu_data.vmu_free_objects =
		    vmu_data.vmu_free_objects->vmo_next;
	} else {
		object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
	}

	object->vmo_key = key;
	object->vmo_type = type;
	object->vmo_bounds = NULL;

	return (object);
}

/*
 * Allocate and return a bound structure.
 */
static vmu_bound_t *
vmu_alloc_bound()
{
	vmu_bound_t *bound;

	if (vmu_data.vmu_free_bounds != NULL) {
		bound = vmu_data.vmu_free_bounds;
		vmu_data.vmu_free_bounds =
		    vmu_data.vmu_free_bounds->vmb_next;
		bzero(bound, sizeof (vmu_bound_t));
	} else {
		bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
		bzero(bound, sizeof (vmu_bound_t));
	}
	return (bound);
}

/*
 * vmu_find_insert_* functions implement hash lookup or allocate and
 * insert operations.
 */
static vmu_object_t *
vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
{
	int ret;
	vmu_object_t *object;

	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&object);
	if (ret != 0) {
		object = vmu_alloc_object(key, type);
		ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
		    (mod_hash_val_t)object, (mod_hash_hndl_t)0);
		ASSERT(ret == 0);
	}
	return (object);
}

static int
vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
{
	int ret;
	caddr_t val;

	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&val);

	if (ret == 0)
		return (0);

	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
	    (mod_hash_val_t)key, (mod_hash_hndl_t)0);

	ASSERT(ret == 0);

	return (1);
}

static vmu_entity_t *
vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
{
	int ret;
	vmu_entity_t *entity;

	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
	    (mod_hash_val_t *)&entity);
	if (ret != 0) {
		entity = vmu_alloc_entity(id, type, zoneid);
		ret = i_mod_hash_insert_nosync(hash,
		    (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
		    (mod_hash_hndl_t)0);
		ASSERT(ret == 0);
	}
	return (entity);
}

/*
 * Returns list of object bounds between start and end.  New bounds inserted
 * by this call are given type.
 *
 * Returns the number of pages covered if new bounds are created.  Returns 0
 * if region between start/end consists of all existing bounds.
 */
static pgcnt_t
vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
    end, char type, vmu_bound_t **first, vmu_bound_t **last)
{
	vmu_bound_t *next;
	vmu_bound_t *prev = NULL;
	vmu_bound_t *tmp = NULL;
	pgcnt_t ret = 0;

	*first = *last = NULL;

	for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) {
		/*
		 * Find bounds overlapping or overlapped by range [start,end].
		 */
		if (start > next->vmb_end) {
			/* bound is before new bound */
			prev = next;
			continue;
		}
		if (next->vmb_start > end) {
			/* bound is after new bound */
			break;
		}
		if (*first == NULL)
			*first = next;
		*last = next;
	}

	if (*first == NULL) {
		ASSERT(*last == NULL);
		/*
		 * No bounds overlapping range [start,end], so create new
		 * bound
		 */
		tmp = vmu_alloc_bound();
		tmp->vmb_start = start;
		tmp->vmb_end = end;
		tmp->vmb_type = type;
		if (prev == NULL) {
			tmp->vmb_next = ro->vmo_bounds;
			ro->vmo_bounds = tmp;
		} else {
			tmp->vmb_next = prev->vmb_next;
			prev->vmb_next = tmp;
		}
		*first = tmp;
		*last = tmp;
		ASSERT(tmp->vmb_end >= tmp->vmb_start);
		ret = tmp->vmb_end - tmp->vmb_start + 1;
		return (ret);
	}

	/* Check to see if start is before first known bound */
	ASSERT(first != NULL && last != NULL);
	next = (*first);
	if (start < (*first)->vmb_start) {
		/* Create new bound before first bound */
		tmp = vmu_alloc_bound();
		tmp->vmb_start = start;
		tmp->vmb_end = (*first)->vmb_start - 1;
		tmp->vmb_type = type;
		tmp->vmb_next = *first;
		if (*first == ro->vmo_bounds)
			ro->vmo_bounds = tmp;
		if (prev != NULL)
			prev->vmb_next = tmp;
		ASSERT(tmp->vmb_end >= tmp->vmb_start);
		ret += tmp->vmb_end - tmp->vmb_start + 1;
		*first = tmp;
	}
	/*
	 * Between start and end, search for gaps between and after existing
	 * bounds.  Create new bounds to fill gaps if they exist.
	 */
	while (end > next->vmb_end) {
		/*
		 * Check for gap between bound and next bound.  If no gap,
		 * continue.
		 */
		if ((next != *last) &&
		    ((next->vmb_end + 1) == next->vmb_next->vmb_start)) {
			next = next->vmb_next;
			continue;
		}
		/*
		 * Insert new bound in gap after bound, and before next
		 * bound if next bound exists.
		 */
		tmp = vmu_alloc_bound();
		tmp->vmb_type = type;
		tmp->vmb_next = next->vmb_next;
		tmp->vmb_start = next->vmb_end + 1;

		if (next != *last) {
			tmp->vmb_end = next->vmb_next->vmb_start - 1;
			ASSERT(tmp->vmb_end >= tmp->vmb_start);
			ret += tmp->vmb_end - tmp->vmb_start + 1;
			next->vmb_next = tmp;
			next = tmp->vmb_next;
		} else {
			tmp->vmb_end = end;
			ASSERT(tmp->vmb_end >= tmp->vmb_start);
			ret += tmp->vmb_end - tmp->vmb_start + 1;
			next->vmb_next = tmp;
			*last = tmp;
			break;
		}
	}
	return (ret);
}
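
/*
 * Illustrative example (hypothetical values, not part of the original
 * code): suppose an object already has the single bound
 *
 *	[5, 10] VMUSAGE_BOUND_INCORE
 *
 * and vmu_insert_lookup_object_bounds(ro, 0, 20, VMUSAGE_BOUND_UNKNOWN,
 * &first, &last) is called.  The routine above creates [0, 4] and [11, 20]
 * as VMUSAGE_BOUND_UNKNOWN bounds to fill the gaps, leaving the list
 *
 *	[0, 4] UNKNOWN -> [5, 10] INCORE -> [11, 20] UNKNOWN
 *
 * with *first = [0, 4], *last = [11, 20], and a return value of 15, the
 * number of pages covered by the newly created bounds.
 */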

/*
 * vmu_update_bounds()
 *
 * first, last:	list of continuous bounds, of which zero or more are of
 *		type VMUSAGE_BOUND_UNKNOWN.
 *
 * new_first, new_last:	list of continuous bounds, of which none are of
 *			type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
 *			update the types of bounds in (first,last) with
 *			type VMUSAGE_BOUND_UNKNOWN.
 *
 * For the list of bounds (first,last), this function updates any bounds
 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound
 * in the list (new_first, new_last).
 *
 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
 * (new_first, new_last), it will be split into multiple bounds.
 *
 * Return value:
 *	The number of pages in the list of bounds (first,last) that were of
 *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
 *	VMUSAGE_BOUND_INCORE.
 *
 */
static pgcnt_t
vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last,
    vmu_bound_t *new_first, vmu_bound_t *new_last)
{
	vmu_bound_t *next, *new_next, *tmp;
	pgcnt_t rss = 0;

	next = *first;
	new_next = new_first;

	/*
	 * Verify first and last bound are covered by new bounds if they
	 * have unknown type.
	 */
	ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
	    (*first)->vmb_start >= new_next->vmb_start);
	ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
	    (*last)->vmb_end <= new_last->vmb_end);
	for (;;) {
		/* If bound already has type, proceed to next bound */
		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
			if (next == *last)
				break;
			next = next->vmb_next;
			continue;
		}
		while (new_next->vmb_end < next->vmb_start)
			new_next = new_next->vmb_next;
		ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
		next->vmb_type = new_next->vmb_type;
		if (new_next->vmb_end < next->vmb_end) {
			/* need to split bound */
			tmp = vmu_alloc_bound();
			tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
			tmp->vmb_start = new_next->vmb_end + 1;
			tmp->vmb_end = next->vmb_end;
			tmp->vmb_next = next->vmb_next;
			next->vmb_end = new_next->vmb_end;
			next->vmb_next = tmp;
			if (*last == next)
				*last = tmp;
			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
				rss += next->vmb_end - next->vmb_start + 1;
			next = tmp;
		} else {
			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
				rss += next->vmb_end - next->vmb_start + 1;
			if (next == *last)
				break;
			next = next->vmb_next;
		}
	}
	return (rss);
}

/*
 * Merges adjacent bounds with the same type between the first and last
 * bound.  After merge, the last pointer is no longer valid, as the last
 * bound may be merged away.
 */
static void
vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last)
{
	vmu_bound_t *next;
	vmu_bound_t *tmp;

	ASSERT(*first != NULL);
	ASSERT(*last != NULL);

	next = *first;
	while (next != *last) {

		/* If bounds are adjacent and have same type, merge them */
		if (((next->vmb_end + 1) == next->vmb_next->vmb_start) &&
		    (next->vmb_type == next->vmb_next->vmb_type)) {
			tmp = next->vmb_next;
			next->vmb_end = tmp->vmb_end;
			next->vmb_next = tmp->vmb_next;
			vmu_free_bound(tmp);
			if (tmp == *last)
				*last = next;
		} else {
			next = next->vmb_next;
		}
	}
}
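
/*
 * Illustrative example (hypothetical values): if an entity has the single
 * bound [0, 20] VMUSAGE_BOUND_UNKNOWN and the already-resolved list for the
 * underlying object is
 *
 *	[0, 9] INCORE -> [10, 20] NOT_INCORE
 *
 * then vmu_update_bounds() splits the entity bound into [0, 9] INCORE and
 * [10, 20] NOT_INCORE and returns 10, the number of pages found incore.
 * A later vmu_merge_bounds() pass collapses any adjacent bounds of the
 * same type (for example [0, 4] INCORE -> [5, 9] INCORE becomes
 * [0, 9] INCORE), keeping the per-object lists short.
 */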

/*
 * Given an amp and a list of bounds, updates each bound's type with
 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
 *
 * If a bound is partially incore, it will be split into two bounds.
 * first and last may be modified, as bounds may be split into multiple
 * bounds if they are partially incore/not-incore.
 *
 * Set incore to non-zero if bounds are already known to be incore.
 *
 */
static void
vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first,
    vmu_bound_t **last, boolean_t incore)
{
	vmu_bound_t *next;
	vmu_bound_t *tmp;
	pgcnt_t index;
	short bound_type;
	short page_type;
	vnode_t *vn;
	anoff_t off;
	struct anon *ap;

	next = *first;
	/* Shared anon slots don't change once set */
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (;;) {
		if (incore == B_TRUE)
			next->vmb_type = VMUSAGE_BOUND_INCORE;

		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
			if (next == *last)
				break;
			next = next->vmb_next;
			continue;
		}
		bound_type = next->vmb_type;
		index = next->vmb_start;
		while (index <= next->vmb_end) {

			/*
			 * These are used to determine how much to increment
			 * index when a large page is found.
			 */
			page_t *page;
			pgcnt_t pgcnt = 1;
			uint_t pgshft;
			pgcnt_t pgmsk;

			ap = anon_get_ptr(amp->ahp, index);
			if (ap != NULL)
				swap_xlate(ap, &vn, &off);

			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
			    (page = page_exists(vn, off)) != NULL) {
				page_type = VMUSAGE_BOUND_INCORE;
				if (page->p_szc > 0) {
					pgcnt = page_get_pagecnt(page->p_szc);
					pgshft = page_get_shift(page->p_szc);
					pgmsk = (0x1 << (pgshft - PAGESHIFT))
					    - 1;
				}
			} else {
				page_type = VMUSAGE_BOUND_NOT_INCORE;
			}
			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
				next->vmb_type = page_type;
			} else if (next->vmb_type != page_type) {
				/*
				 * If current bound type does not match page
				 * type, need to split off new bound.
				 */
				tmp = vmu_alloc_bound();
				tmp->vmb_type = page_type;
				tmp->vmb_start = index;
				tmp->vmb_end = next->vmb_end;
				tmp->vmb_next = next->vmb_next;
				next->vmb_end = index - 1;
				next->vmb_next = tmp;
				if (*last == next)
					*last = tmp;
				next = tmp;
			}
			if (pgcnt > 1) {
				/*
				 * If inside large page, jump to next large
				 * page
				 */
				index = (index & ~pgmsk) + pgcnt;
			} else {
				index++;
			}
		}
		if (next == *last) {
			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
			break;
		} else
			next = next->vmb_next;
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
}
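
/*
 * Worked example of the large-page skip above (example values only: 4K
 * base pages and a 2MB large page, as on x86): page_get_pagecnt() gives
 * pgcnt = 512 and page_get_shift() gives pgshft = 21, so
 * pgmsk = (1 << (21 - 12)) - 1 = 511.  For index = 700, the update
 * index = (index & ~pgmsk) + pgcnt = 512 + 512 = 1024 jumps to the first
 * page past the large page, so each large page is inspected only once.
 */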

/*
 * Same as vmu_amp_update_incore_bounds(), except for tracking
 * incore-/not-incore for vnodes.
 */
static void
vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first,
    vmu_bound_t **last)
{
	vmu_bound_t *next;
	vmu_bound_t *tmp;
	pgcnt_t index;
	short bound_type;
	short page_type;

	next = *first;
	for (;;) {
		if (vnode->v_pages == NULL)
			next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;

		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
			if (next == *last)
				break;
			next = next->vmb_next;
			continue;
		}

		bound_type = next->vmb_type;
		index = next->vmb_start;
		while (index <= next->vmb_end) {

			/*
			 * These are used to determine how much to increment
			 * index when a large page is found.
			 */
			page_t *page;
			pgcnt_t pgcnt = 1;
			uint_t pgshft;
			pgcnt_t pgmsk;

			if (vnode->v_pages != NULL &&
			    (page = page_exists(vnode, ptob(index))) != NULL) {
				page_type = VMUSAGE_BOUND_INCORE;
				if (page->p_szc > 0) {
					pgcnt = page_get_pagecnt(page->p_szc);
					pgshft = page_get_shift(page->p_szc);
					pgmsk = (0x1 << (pgshft - PAGESHIFT))
					    - 1;
				}
			} else {
				page_type = VMUSAGE_BOUND_NOT_INCORE;
			}
			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
				next->vmb_type = page_type;
			} else if (next->vmb_type != page_type) {
				/*
				 * If current bound type does not match page
				 * type, need to split off new bound.
				 */
				tmp = vmu_alloc_bound();
				tmp->vmb_type = page_type;
				tmp->vmb_start = index;
				tmp->vmb_end = next->vmb_end;
				tmp->vmb_next = next->vmb_next;
				next->vmb_end = index - 1;
				next->vmb_next = tmp;
				if (*last == next)
					*last = tmp;
				next = tmp;
			}
			if (pgcnt > 1) {
				/*
				 * If inside large page, jump to next large
				 * page
				 */
				index = (index & ~pgmsk) + pgcnt;
			} else {
				index++;
			}
		}
		if (next == *last) {
			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
			break;
		} else
			next = next->vmb_next;
	}
}

/*
 * Calculate the rss and swap consumed by a segment.  vmu_entities is the
 * list of entities to visit.  For shared segments, the vnode or amp
 * is looked up in each entity to see if it has already been counted.
 * Private anon pages are checked per entity to ensure that cow pages are
 * not double counted.
 *
 * For private mapped files, first the amp is checked for private pages.
 * Bounds not backed by the amp are looked up in the vnode for each entity
 * to avoid double counting of private COW vnode pages.
 */
static void
vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
{
	struct segvn_data *svd;
	struct shm_data *shmd;
	struct spt_data *sptd;
	vmu_object_t *shared_object = NULL;
	vmu_object_t *entity_object = NULL;
	vmu_entity_t *entity;
	vmusage_t *result;
	vmu_bound_t *first = NULL;
	vmu_bound_t *last = NULL;
	vmu_bound_t *cur = NULL;
	vmu_bound_t *e_first = NULL;
	vmu_bound_t *e_last = NULL;
	vmu_bound_t *tmp;
	pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
	struct anon_map *private_amp = NULL;
	boolean_t incore = B_FALSE;
	boolean_t shared = B_FALSE;
	int file = 0;
	pgcnt_t swresv = 0;
	pgcnt_t panon = 0;

	/* Can zero-length segments exist?  Not sure, so paranoia. */
	if (seg->s_size <= 0)
		return;

	/*
	 * Figure out if there is a shared object (such as a named vnode or
	 * a shared amp), then figure out if there is a private amp, which
	 * identifies private pages.
	 */
	if (seg->s_ops == &segvn_ops) {
		svd = (struct segvn_data *)seg->s_data;
		if (svd->type == MAP_SHARED)
			shared = B_TRUE;
		else
			swresv = svd->swresv;

		if (svd->vp != NULL) {
			file = 1;
			shared_object = vmu_find_insert_object(
			    vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
			    VMUSAGE_TYPE_VNODE);
			s_start = btop(svd->offset);
			s_end = btop(svd->offset + seg->s_size) - 1;
		}
		if (svd->amp != NULL && svd->type == MAP_SHARED) {
			ASSERT(shared_object == NULL);
			shared_object = vmu_find_insert_object(
			    vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
			    VMUSAGE_TYPE_AMP);
			s_start = svd->anon_index;
			s_end = svd->anon_index + btop(seg->s_size) - 1;
			/* schedctl mappings are always in core */
			if (svd->amp->swresv == 0)
				incore = B_TRUE;
		}
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
		/*
		 * Text replication anon maps can be shared across all zones.
		 * Space used for text replication is typically capped as a
		 * small % of memory.  To keep it simple for now we don't
		 * account for swap and memory space used for text replication.
		 */
		if (svd->tr_state == SEGVN_TR_OFF && svd->amp != NULL &&
		    svd->type == MAP_PRIVATE) {
			private_amp = svd->amp;
			p_start = svd->anon_index;
			p_end = svd->anon_index + btop(seg->s_size) - 1;
		}
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	} else if (seg->s_ops == &segspt_shmops) {
		shared = B_TRUE;
		shmd = (struct shm_data *)seg->s_data;
		shared_object = vmu_find_insert_object(
		    vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
		    VMUSAGE_TYPE_AMP);
		s_start = 0;
		s_end = btop(seg->s_size) - 1;
		sptd = shmd->shm_sptseg->s_data;

		/* ism segments are always incore and do not reserve swap */
		if (sptd->spt_flags & SHM_SHARE_MMU)
			incore = B_TRUE;

	} else {
		return;
	}

	/*
	 * If there is a private amp, count anon pages that exist.  If an
	 * anon has a refcnt > 1 (cow sharing), then save the anon in a
	 * hash so that it is not double counted.
	 *
	 * If there is also a shared object, then figure out the bounds
	 * which are not mapped by the private amp.
	 */
	if (private_amp != NULL) {

		/* Enter as writer to prevent cow anons from being freed */
		ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);

		p_index = p_start;
		s_index = s_start;

		while (p_index <= p_end) {

			pgcnt_t p_index_next;
			pgcnt_t p_bound_size;
			int cnt;
			anoff_t off;
			struct vnode *vn;
			struct anon *ap;
			page_t *page;		/* For handling of large */
			pgcnt_t pgcnt = 1;	/* pages */
			pgcnt_t pgstart;
			pgcnt_t pgend;
			uint_t pgshft;
			pgcnt_t pgmsk;

			p_index_next = p_index;
			ap = anon_get_next_ptr(private_amp->ahp,
			    &p_index_next);

			/*
			 * If next anon is past end of mapping, simulate
			 * end of anon so loop terminates.
			 */
			if (p_index_next > p_end) {
				p_index_next = p_end + 1;
				ap = NULL;
			}
			/*
			 * For cow segments, keep track of bounds not
			 * backed by private amp so they can be looked
			 * up in the backing vnode
			 */
			if (p_index_next != p_index) {

				/*
				 * Compute index difference between anon and
				 * previous anon.
				 */
				p_bound_size = p_index_next - p_index - 1;

				if (shared_object != NULL) {
					cur = vmu_alloc_bound();
					cur->vmb_next = NULL;
					cur->vmb_start = s_index;
					cur->vmb_end = s_index + p_bound_size;
					cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
					if (first == NULL) {
						first = cur;
						last = cur;
					} else {
						last->vmb_next = cur;
						last = cur;
					}
				}
				p_index = p_index + p_bound_size + 1;
				s_index = s_index + p_bound_size + 1;
			}

			/* Detect end of anons in amp */
			if (ap == NULL)
				break;

			cnt = ap->an_refcnt;
			swap_xlate(ap, &vn, &off);

			if (vn == NULL || vn->v_pages == NULL ||
			    (page = page_exists(vn, off)) == NULL) {
				p_index++;
				s_index++;
				continue;
			}

			/*
			 * If a large page is found, compute the portion of
			 * the large page in the mapping, and increment the
			 * indices to the next large page.
			 */
			if (page->p_szc > 0) {

				pgcnt = page_get_pagecnt(page->p_szc);
				pgshft = page_get_shift(page->p_szc);
				pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;

				/* First page in large page */
				pgstart = p_index & ~pgmsk;
				/* Last page in large page */
				pgend = pgstart + pgcnt - 1;
				/*
				 * Artificially end page if page extends past
				 * end of mapping.
				 */
				if (pgend > p_end)
					pgend = p_end;

				/*
				 * Compute number of pages from large page
				 * which are mapped.
				 */
				pgcnt = pgend - p_index + 1;

				/*
				 * Point indices at page after large page,
				 * or at page after end of mapping.
				 */
				p_index += pgcnt;
				s_index += pgcnt;
			} else {
				p_index++;
				s_index++;
			}

			/*
			 * Assume anon structs with a refcnt
			 * of 1 are not cow shared, so there
			 * is no reason to track them per entity.
			 */
			if (cnt == 1) {
				panon += pgcnt;
				continue;
			}
			for (entity = vmu_entities; entity != NULL;
			    entity = entity->vme_next_calc) {

				result = &entity->vme_result;
				/*
				 * Track cow anons per entity so
				 * they are not double counted.
				 */
				if (vmu_find_insert_anon(entity->vme_anon_hash,
				    (caddr_t)ap) == 0)
					continue;

				result->vmu_rss_all += (pgcnt << PAGESHIFT);
				result->vmu_rss_private +=
				    (pgcnt << PAGESHIFT);
			}
		}
		ANON_LOCK_EXIT(&private_amp->a_rwlock);
	}

	/* Add up resident anon and swap reserved for private mappings */
	if (swresv > 0 || panon > 0) {
		for (entity = vmu_entities; entity != NULL;
		    entity = entity->vme_next_calc) {
			result = &entity->vme_result;
			result->vmu_swap_all += swresv;
			result->vmu_swap_private += swresv;
			result->vmu_rss_all += (panon << PAGESHIFT);
			result->vmu_rss_private += (panon << PAGESHIFT);
		}
	}

	/* Compute resident pages backing shared amp or named vnode */
	if (shared_object != NULL) {
		if (first == NULL) {
			/*
			 * No private amp, or private amp has no anon
			 * structs.  This means the entire segment is backed
			 * by the shared object.
			 */
			first = vmu_alloc_bound();
			first->vmb_next = NULL;
			first->vmb_start = s_start;
			first->vmb_end = s_end;
			first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
		}
		/*
		 * Iterate bounds not backed by private amp, and compute
		 * resident pages.
		 */
		cur = first;
		while (cur != NULL) {

			if (vmu_insert_lookup_object_bounds(shared_object,
			    cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
			    &first, &last) > 0) {
				/* new bounds, find incore/not-incore */
				if (shared_object->vmo_type ==
				    VMUSAGE_TYPE_VNODE)
					vmu_vnode_update_incore_bounds(
					    (vnode_t *)
					    shared_object->vmo_key, &first,
					    &last);
				else
					vmu_amp_update_incore_bounds(
					    (struct anon_map *)
					    shared_object->vmo_key, &first,
					    &last, incore);
				vmu_merge_bounds(&first, &last);
			}
			for (entity = vmu_entities; entity != NULL;
			    entity = entity->vme_next_calc) {

				result = &entity->vme_result;

				entity_object = vmu_find_insert_object(
				    shared_object->vmo_type ==
				    VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
				    entity->vme_amp_hash,
				    shared_object->vmo_key,
				    shared_object->vmo_type);

				virt = vmu_insert_lookup_object_bounds(
				    entity_object, cur->vmb_start, cur->vmb_end,
				    VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);

				if (virt == 0)
					continue;
				/*
				 * Range visited for this entity
				 */
				rss = vmu_update_bounds(&e_first,
				    &e_last, first, last);
				result->vmu_rss_all += (rss << PAGESHIFT);
				if (shared == B_TRUE && file == B_FALSE) {
					/* shared anon mapping */
					result->vmu_swap_all +=
					    (virt << PAGESHIFT);
					result->vmu_swap_shared +=
					    (virt << PAGESHIFT);
					result->vmu_rss_shared +=
					    (rss << PAGESHIFT);
				} else if (shared == B_TRUE && file == B_TRUE) {
					/* shared file mapping */
					result->vmu_rss_shared +=
					    (rss << PAGESHIFT);
				} else if (shared == B_FALSE &&
				    file == B_TRUE) {
					/* private file mapping */
					result->vmu_rss_private +=
					    (rss << PAGESHIFT);
				}
				vmu_merge_bounds(&e_first, &e_last);
			}
			tmp = cur;
			cur = cur->vmb_next;
			vmu_free_bound(tmp);
		}
	}
}

/*
 * Based on the current calculation flags, find the entities which are
 * relevant to the process.  Then calculate each segment in the process's
 * address space for each relevant entity.
 */
static void
vmu_calculate_proc(proc_t *p)
{
	vmu_entity_t *entities = NULL;
	vmu_zone_t *zone;
	vmu_entity_t *tmp;
	struct as *as;
	struct seg *seg;
	int ret;

	/* Figure out which entities are being computed */
	if ((vmu_data.vmu_system) != NULL) {
		tmp = vmu_data.vmu_system;
		tmp->vme_next_calc = entities;
		entities = tmp;
	}
	if (vmu_data.vmu_calc_flags &
	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
	    VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
	    VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
	    VMUSAGE_ALL_EUSERS)) {
		ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
		    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
		    (mod_hash_val_t *)&zone);
		if (ret != 0) {
			zone = vmu_alloc_zone(p->p_zone->zone_id);
			ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
			    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
			    (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
			ASSERT(ret == 0);
		}
		if (zone->vmz_zone != NULL) {
			tmp = zone->vmz_zone;
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
			tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
			    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
			    zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
			tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
			    p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
			tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
			    crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
			tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
			    crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
	}
	/* Entities which collapse projects and users for all zones */
	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
		tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
		    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
		tmp->vme_next_calc = entities;
		entities = tmp;
	}
	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
		tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
		    crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
		tmp->vme_next_calc = entities;
		entities = tmp;
	}
	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
		tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
		    crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
		tmp->vme_next_calc = entities;
		entities = tmp;
	}

	ASSERT(entities != NULL);
	/* process all segs in the process's address space */
	as = p->p_as;
	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg != NULL;
	    seg = AS_SEGNEXT(as, seg)) {
		vmu_calculate_seg(entities, seg);
	}
	AS_LOCK_EXIT(as, &as->a_lock);
}
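
/*
 * Illustrative example (hypothetical ids): for a process in zone 2,
 * project 10, run with
 *
 *	vmu_data.vmu_calc_flags = VMUSAGE_ALL_ZONES | VMUSAGE_ALL_PROJECTS |
 *	    VMUSAGE_COL_PROJECTS;
 *
 * vmu_calculate_proc() above links three entities through vme_next_calc:
 * the zone 2 entity, the (project 10, zone 2) entity, and the collapsed
 * project 10 entity (zoneid ALL_ZONES).  Every segment of the process is
 * then charged against each entity in that chain by vmu_calculate_seg().
 */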

/*
 * Free data created by previous call to vmu_calculate().
 */
static void
vmu_clear_calc()
{
	if (vmu_data.vmu_system != NULL)
		vmu_free_entity(vmu_data.vmu_system);
	vmu_data.vmu_system = NULL;
	if (vmu_data.vmu_zones_hash != NULL)
		i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
	if (vmu_data.vmu_projects_col_hash != NULL)
		i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
	if (vmu_data.vmu_rusers_col_hash != NULL)
		i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
	if (vmu_data.vmu_eusers_col_hash != NULL)
		i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);

	i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
	i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
}

/*
 * Free unused data structures.  These can result if the system workload
 * decreases between calculations.
 */
static void
vmu_free_extra()
{
	vmu_bound_t *tb;
	vmu_object_t *to;
	vmu_entity_t *te;
	vmu_zone_t *tz;

	while (vmu_data.vmu_free_bounds != NULL) {
		tb = vmu_data.vmu_free_bounds;
		vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
		kmem_cache_free(vmu_bound_cache, tb);
	}
	while (vmu_data.vmu_free_objects != NULL) {
		to = vmu_data.vmu_free_objects;
		vmu_data.vmu_free_objects =
		    vmu_data.vmu_free_objects->vmo_next;
		kmem_cache_free(vmu_object_cache, to);
	}
	while (vmu_data.vmu_free_entities != NULL) {
		te = vmu_data.vmu_free_entities;
		vmu_data.vmu_free_entities =
		    vmu_data.vmu_free_entities->vme_next;
		if (te->vme_vnode_hash != NULL)
			mod_hash_destroy_hash(te->vme_vnode_hash);
		if (te->vme_amp_hash != NULL)
			mod_hash_destroy_hash(te->vme_amp_hash);
		if (te->vme_anon_hash != NULL)
			mod_hash_destroy_hash(te->vme_anon_hash);
		kmem_free(te, sizeof (vmu_entity_t));
	}
	while (vmu_data.vmu_free_zones != NULL) {
		tz = vmu_data.vmu_free_zones;
		vmu_data.vmu_free_zones =
		    vmu_data.vmu_free_zones->vmz_next;
		if (tz->vmz_projects_hash != NULL)
			mod_hash_destroy_hash(tz->vmz_projects_hash);
		if (tz->vmz_tasks_hash != NULL)
			mod_hash_destroy_hash(tz->vmz_tasks_hash);
		if (tz->vmz_rusers_hash != NULL)
			mod_hash_destroy_hash(tz->vmz_rusers_hash);
		if (tz->vmz_eusers_hash != NULL)
			mod_hash_destroy_hash(tz->vmz_eusers_hash);
		kmem_free(tz, sizeof (vmu_zone_t));
	}
}

extern kcondvar_t *pr_pid_cv;

/*
 * Determine which entity types are relevant and allocate the hashes to
 * track them.  Then walk the process table and count rss and swap
 * for each process's address space.  Address space objects such as
 * vnodes, amps and anons are tracked per entity, so that they are
 * not double counted in the results.
 */
static void
vmu_calculate()
{
	int i = 0;
	int ret;
	proc_t *p;

	vmu_clear_calc();

	if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
		vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
		    ALL_ZONES);

	/*
	 * Walk process table and calculate rss of each proc.
	 *
	 * Pidlock and p_lock cannot be held while doing the rss calculation.
	 * This is because:
	 * 1.  The calculation allocates using KM_SLEEP.
	 * 2.  The calculation grabs a_lock, which cannot be grabbed
	 *     after p_lock.
	 *
	 * Since pidlock must be dropped, we cannot simply just walk the
	 * practive list.  Instead, we walk the process table, and sprlock
	 * each process to ensure that it does not exit during the
	 * calculation.
	 */

	mutex_enter(&pidlock);
	for (i = 0; i < v.v_proc; i++) {
again:
		p = pid_entry(i);
		if (p == NULL)
			continue;

		mutex_enter(&p->p_lock);
		mutex_exit(&pidlock);

		if (panicstr) {
			mutex_exit(&p->p_lock);
			return;
		}

		/* Try to set P_PR_LOCK */
		ret = sprtrylock_proc(p);
		if (ret == -1) {
			/* Process in invalid state */
			mutex_exit(&p->p_lock);
			mutex_enter(&pidlock);
			continue;
		} else if (ret == 1) {
			/*
			 * P_PR_LOCK is already set.  Wait and try again.
			 * This also drops p_lock.
			 */
			sprwaitlock_proc(p);
			mutex_enter(&pidlock);
			goto again;
		}
		mutex_exit(&p->p_lock);

		vmu_calculate_proc(p);

		mutex_enter(&p->p_lock);
		sprunlock(p);
		mutex_enter(&pidlock);
	}
	mutex_exit(&pidlock);

	vmu_free_extra();
}

/*
 * allocate a new cache for N results satisfying flags
 */
vmu_cache_t *
vmu_cache_alloc(size_t nres, uint_t flags)
{
	vmu_cache_t *cache;

	cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
	cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
	cache->vmc_nresults = nres;
	cache->vmc_flags = flags;
	cache->vmc_refcnt = 1;
	return (cache);
}

/*
 * Make sure cached results are not freed
 */
static void
vmu_cache_hold(vmu_cache_t *cache)
{
	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
	cache->vmc_refcnt++;
}

/*
 * free cache data
 */
static void
vmu_cache_rele(vmu_cache_t *cache)
{
	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
	ASSERT(cache->vmc_refcnt > 0);
	cache->vmc_refcnt--;
	if (cache->vmc_refcnt == 0) {
		kmem_free(cache->vmc_results, sizeof (vmusage_t) *
		    cache->vmc_nresults);
		kmem_free(cache, sizeof (vmu_cache_t));
	}
}

/*
 * Copy out the cached results to a caller.  Inspect the caller's flags
 * and zone to determine which cached results should be copied.
 */
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
    uint_t flags)
{
	vmusage_t *result, *out_result;
	vmusage_t dummy;
	size_t i, count = 0;
	size_t bufsize;
	int ret = 0;
	uint_t types = 0;

	if (nres != NULL) {
		if (copyin((caddr_t)nres, &bufsize, sizeof (size_t)))
			return (set_errno(EFAULT));
	} else {
		bufsize = 0;
	}

	/* Figure out what results the caller is interested in. */
	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
		types |= VMUSAGE_SYSTEM;
	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
		types |= VMUSAGE_ZONE;
	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
	    VMUSAGE_COL_PROJECTS))
		types |= VMUSAGE_PROJECTS;
	if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
		types |= VMUSAGE_TASKS;
	if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
		types |= VMUSAGE_RUSERS;
	if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
		types |= VMUSAGE_EUSERS;

	/* count results for current zone */
	out_result = buf;
	for (result = cache->vmc_results, i = 0;
	    i < cache->vmc_nresults; result++, i++) {

		/* Do not return "other-zone" results to non-global zones */
		if (curproc->p_zone != global_zone &&
		    curproc->p_zone->zone_id != result->vmu_zoneid)
			continue;

		/*
		 * If non-global zone requests VMUSAGE_SYSTEM, fake
		 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
		 */
		if (curproc->p_zone != global_zone &&
		    (flags & VMUSAGE_SYSTEM) != 0 &&
		    result->vmu_type == VMUSAGE_ZONE) {
			count++;
			if (out_result != NULL) {
				if (bufsize < count) {
					ret = set_errno(EOVERFLOW);
				} else {
					dummy = *result;
					dummy.vmu_zoneid = ALL_ZONES;
					dummy.vmu_id = 0;
					dummy.vmu_type = VMUSAGE_SYSTEM;
					if (copyout(&dummy, out_result,
					    sizeof (vmusage_t)))
						return (set_errno(
						    EFAULT));
					out_result++;
				}
			}
		}

		/* Skip results that do not match requested type */
		if ((result->vmu_type & types) == 0)
			continue;

		/* Skip collated results if not requested */
		if (result->vmu_zoneid == ALL_ZONES) {
			if (result->vmu_type == VMUSAGE_PROJECTS &&
			    (flags & VMUSAGE_COL_PROJECTS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_EUSERS &&
			    (flags & VMUSAGE_COL_EUSERS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_RUSERS &&
			    (flags & VMUSAGE_COL_RUSERS) == 0)
				continue;
		}

		/* Skip "other zone" results if not requested */
		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
			if (result->vmu_type == VMUSAGE_ZONE &&
			    (flags & VMUSAGE_ALL_ZONES) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_PROJECTS &&
			    (flags & (VMUSAGE_ALL_PROJECTS |
			    VMUSAGE_COL_PROJECTS)) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_TASKS &&
			    (flags & VMUSAGE_ALL_TASKS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_RUSERS &&
			    (flags & (VMUSAGE_ALL_RUSERS |
			    VMUSAGE_COL_RUSERS)) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_EUSERS &&
			    (flags & (VMUSAGE_ALL_EUSERS |
			    VMUSAGE_COL_EUSERS)) == 0)
				continue;
		}
		count++;
		if (out_result != NULL) {
			if (bufsize < count) {
				ret = set_errno(EOVERFLOW);
			} else {
				if (copyout(result, out_result,
				    sizeof (vmusage_t)))
					return (set_errno(EFAULT));
				out_result++;
			}
		}
	}
	if (nres != NULL)
		if (copyout(&count, (void *)nres, sizeof (size_t)))
			return (set_errno(EFAULT));

	return (ret);
}

/*
 * vm_getusage()
 *
 * Counts rss and swap by zone, project, task, and/or user.  The flags
 * argument determines the type of results structures returned.  Flags
 * requesting results from more than one zone are "flattened" to the local
 * zone if the caller is not the global zone.
 *
 * args:
 *	flags:	bitmap consisting of one or more of VMUSAGE_*.
 *	age:	maximum allowable age (time since counting was done) in
 *		seconds of the results.  Results from previous callers are
 *		cached in the kernel.
 *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only
 *		nres is set on success.
 *	nres:	Set to number of vmusage_t structures pointed to by buf
 *		before calling vm_getusage().
 *		On return 0 (success) or EOVERFLOW, nres is set to the number
 *		of result structures returned or attempted to return.
 *
 * returns 0 on success, -1 on failure:
 *	EINTR (interrupted)
 *	EOVERFLOW (nres too small for results, nres set to needed value for
 *	    success)
 *	EINVAL (flags invalid)
 *	EFAULT (bad address for buf or nres)
 */
int
vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
{
	vmu_entity_t *entity;
	vmusage_t *result;
	int ret = 0;
	int cacherecent = 0;
	hrtime_t now;
	uint_t flags_orig;

	/*
	 * Non-global zones cannot request system wide and/or collated
	 * results, or the system result, so munge the flags accordingly.
	 */
	flags_orig = flags;
	if (curproc->p_zone != global_zone) {
		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
			flags |= VMUSAGE_PROJECTS;
		}
		if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
			flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
			flags |= VMUSAGE_RUSERS;
		}
		if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
			flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
			flags |= VMUSAGE_EUSERS;
		}
		if (flags & VMUSAGE_SYSTEM) {
			flags &= ~VMUSAGE_SYSTEM;
			flags |= VMUSAGE_ZONE;
		}
	}

	/* Check for unknown flags */
	if ((flags & (~VMUSAGE_MASK)) != 0)
		return (set_errno(EINVAL));

	/* Check for no flags */
	if ((flags & VMUSAGE_MASK) == 0)
		return (set_errno(EINVAL));

	mutex_enter(&vmu_data.vmu_lock);
	now = gethrtime();

start:
	if (vmu_data.vmu_cache != NULL) {

		vmu_cache_t *cache;

		if ((vmu_data.vmu_cache->vmc_timestamp +
		    ((hrtime_t)age * NANOSEC)) > now)
			cacherecent = 1;

		if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
		    cacherecent == 1) {
			cache = vmu_data.vmu_cache;
			vmu_cache_hold(cache);
			mutex_exit(&vmu_data.vmu_lock);

			ret = vmu_copyout_results(cache, buf, nres, flags_orig);
			mutex_enter(&vmu_data.vmu_lock);
			vmu_cache_rele(cache);
			if (vmu_data.vmu_pending_waiters > 0)
				cv_broadcast(&vmu_data.vmu_cv);
			mutex_exit(&vmu_data.vmu_lock);
			return (ret);
		}
		/*
		 * If the cache is recent, it is likely that there are other
		 * consumers of vm_getusage running, so add their flags to the
		 * desired flags for the calculation.
		 */
		if (cacherecent == 1)
			flags = vmu_data.vmu_cache->vmc_flags | flags;
	}
	if (vmu_data.vmu_calc_thread == NULL) {

		vmu_cache_t *cache;

		vmu_data.vmu_calc_thread = curthread;
		vmu_data.vmu_calc_flags = flags;
		vmu_data.vmu_entities = NULL;
		vmu_data.vmu_nentities = 0;
		if (vmu_data.vmu_pending_waiters > 0)
			vmu_data.vmu_calc_flags |=
			    vmu_data.vmu_pending_flags;

		vmu_data.vmu_pending_flags = 0;
		mutex_exit(&vmu_data.vmu_lock);
		vmu_calculate();
		mutex_enter(&vmu_data.vmu_lock);
		/* copy results to cache */
		if (vmu_data.vmu_cache != NULL)
			vmu_cache_rele(vmu_data.vmu_cache);
		cache = vmu_data.vmu_cache =
		    vmu_cache_alloc(vmu_data.vmu_nentities,
		    vmu_data.vmu_calc_flags);

		result = cache->vmc_results;
		for (entity = vmu_data.vmu_entities; entity != NULL;
		    entity = entity->vme_next) {
			*result = entity->vme_result;
			result++;
		}
		cache->vmc_timestamp = gethrtime();
		vmu_cache_hold(cache);

		vmu_data.vmu_calc_flags = 0;
		vmu_data.vmu_calc_thread = NULL;

		if (vmu_data.vmu_pending_waiters > 0)
			cv_broadcast(&vmu_data.vmu_cv);

		mutex_exit(&vmu_data.vmu_lock);

		/* copy cache */
		ret = vmu_copyout_results(cache, buf, nres, flags_orig);
		mutex_enter(&vmu_data.vmu_lock);
		vmu_cache_rele(cache);
		mutex_exit(&vmu_data.vmu_lock);

		return (ret);
	}
	vmu_data.vmu_pending_flags |= flags;
	vmu_data.vmu_pending_waiters++;
	while (vmu_data.vmu_calc_thread != NULL) {
		if (cv_wait_sig(&vmu_data.vmu_cv,
		    &vmu_data.vmu_lock) == 0) {
			vmu_data.vmu_pending_waiters--;
			mutex_exit(&vmu_data.vmu_lock);
			return (set_errno(EINTR));
		}
	}
	vmu_data.vmu_pending_waiters--;
	goto start;
}
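
/*
 * Illustrative userland sketch (not part of this file; see getvmusage(2)
 * for the authoritative interface).  A consumer such as prstat or rcapd
 * might fetch per-zone usage no older than 30 seconds roughly as follows:
 *
 *	#include <sys/vm_usage.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	size_t nres = 0;
 *	vmusage_t *buf;
 *
 *	(void) getvmusage(VMUSAGE_ALL_ZONES, 30, NULL, &nres);
 *	buf = calloc(nres, sizeof (vmusage_t));
 *	if (buf != NULL &&
 *	    getvmusage(VMUSAGE_ALL_ZONES, 30, buf, &nres) == 0) {
 *		for (size_t i = 0; i < nres; i++)
 *			(void) printf("zone %d rss %llu swap %llu\n",
 *			    (int)buf[i].vmu_zoneid,
 *			    (u_longlong_t)buf[i].vmu_rss_all,
 *			    (u_longlong_t)buf[i].vmu_swap_all);
 *	}
 *
 * The first call sizes the buffer; the second copies out the cached or
 * freshly calculated results via vmu_copyout_results() above.
 */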