/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * vm_usage
 *
 * This file implements the getvmusage() private system call.
 * getvmusage() counts the amount of resident memory pages and swap
 * reserved by the specified process collective.  A "process collective" is
 * the set of processes owned by a particular zone, project, task, or user.
 *
 * rss and swap are counted so that for a given process collective, a page is
 * only counted once.  For example, this means that if multiple processes in
 * the same project map the same page, then the project will only be charged
 * once for that page.  On the other hand, if two processes in different
 * projects map the same page, then both projects will be charged
 * for the page.
 *
 * The vm_getusage() calculation is implemented so that the first thread
 * performs the rss/swap counting.  Other callers will wait for that thread
 * to finish, copying the results.  This enables multiple rcapds and prstats
 * to consume data from the same calculation.  The results are also cached
 * so that a caller interested in recent results can just copy them instead
 * of starting a new calculation.  The caller passes the maximum age (in
 * seconds) of the data.  If the cached data is young enough, the cache is
 * copied; otherwise, a new calculation is executed and the cache is
 * replaced with the new data.
 *
 * The rss calculation for each process collective is as follows:
 *
 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *     - Figure out proc's collectives (zone, project, task, and/or user).
 *     - For each seg in proc's address space:
 *       - If seg is private:
 *         - Lookup anons in the amp.
 *         - For incore pages not previously visited for each of the
 *           proc's collectives, add incore pagesize to each collective.
 *           Anons with a refcnt of 1 can be assumed to be not previously
 *           visited.
 *         - For address ranges without anons in the amp:
 *           - Lookup pages in underlying vnode.
 *           - For incore pages not previously visited for each of the
 *             proc's collectives, add incore pagesize to each collective.
 *       - If seg is shared:
 *         - Lookup pages in the shared amp or vnode.
 *         - For incore pages not previously visited for each of the
 *           proc's collectives, add incore pagesize to each collective.
 *
 * Swap is reserved by private segments, and shared anonymous segments.
 * The only shared anon segments which do not reserve swap are ISM segments
 * and schedctl segments, both of which can be identified by having
 * amp->swresv == 0.
 *
 * The swap calculation for each collective is as follows:
 *
 *   - Inspect flags, determine if counting swap for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *     - Figure out proc's collectives (zone, project, task, and/or user).
 *     - For each seg in proc's address space:
 *       - If seg is private:
 *         - Add svd->swresv pages to swap count for each of the
 *           proc's collectives.
 *       - If seg is anon, shared, and amp->swresv != 0:
 *         - For address ranges in amp not previously visited for
 *           each of the proc's collectives, add size of address
 *           range to the swap count for each collective.
 *
 * These two calculations are done simultaneously, with most of the work
 * being done in vmu_calculate_seg().  The results of the calculation are
 * copied into "vmu_data.vmu_cache_results".
 *
 * To perform the calculation, various things are tracked and cached:
 *
 *   - incore/not-incore page ranges for all vnodes.
 *     (vmu_data.vmu_all_vnodes_hash)
 *     This eliminates looking up the same page more than once.
 *
 *   - incore/not-incore page ranges for all shared amps.
 *     (vmu_data.vmu_all_amps_hash)
 *     This eliminates looking up the same page more than once.
 *
 *   - visited page ranges for each collective.
 *     - per vnode (entity->vme_vnode_hash)
 *     - per shared amp (entity->vme_amp_hash)
 *     For accurate counting of map-shared and cow-shared pages.
 *
 *   - visited private anons (refcnt > 1) for each collective.
 *     (entity->vme_anon_hash)
 *     For accurate counting of cow-shared pages.
 *
 * The common accounting structure is the vmu_entity_t, which represents
 * collectives:
 *
 *   - A zone.
 *   - A project, task, or user within a zone.
 *   - The entire system (vmu_data.vmu_system).
 *   - Each collapsed (col) project and user.  This means a given projid or
 *     uid, regardless of which zone the process is in.  For instance,
 *     project 0 in the global zone and project 0 in a non-global zone are
 *     the same collapsed project.
 *
 * Each entity structure tracks which pages have been already visited for
 * that entity (via previously inspected processes) so that these pages are
 * not double counted.
 */
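
/*
 * Illustrative sketch (not part of the kernel build): a consumer such as
 * prstat or rcapd reaches this code through the getvmusage(2) wrapper
 * declared in <sys/vm_usage.h>.  The flags, age, and buffer size below are
 * assumptions for illustration only.
 *
 *	vmusage_t results[32];
 *	size_t nres = 32;
 *
 *	// rss/swap per zone, accepting cached data up to 5 seconds old
 *	if (getvmusage(VMUSAGE_ALL_ZONES, 5, results, &nres) == 0) {
 *		for (size_t i = 0; i < nres; i++)
 *			(void) printf("zone %d: rss %llu swap %llu\n",
 *			    (int)results[i].vmu_id,
 *			    (u_longlong_t)results[i].vmu_rss_all,
 *			    (u_longlong_t)results[i].vmu_swap_all);
 *	}
 *
 * On success, nres is set to the number of result structures copied out.
 */
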
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/zone.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/thread.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/modhash.h>
#include <sys/modhash_impl.h>
#include <sys/shm.h>
#include <sys/swap.h>
#include <sys/synch.h>
#include <sys/systm.h>
#include <sys/var.h>
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <sys/sunddi.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/seg_vn.h>
#include <vm/seg_spt.h>

#define VMUSAGE_HASH_SIZE		512

#define VMUSAGE_TYPE_VNODE		1
#define VMUSAGE_TYPE_AMP		2
#define VMUSAGE_TYPE_ANON		3

#define VMUSAGE_BOUND_UNKNOWN		0
#define VMUSAGE_BOUND_INCORE		1
#define VMUSAGE_BOUND_NOT_INCORE	2

/*
 * bounds for vnodes and shared amps
 * Each bound is either entirely incore, entirely not in core, or
 * entirely unknown.  Bounds are stored in order by offset.
 */
typedef struct vmu_bound {
    struct vmu_bound *vmb_next;
    pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
    pgcnt_t vmb_end;    /* page offset in vnode/amp on which bound ends */
    char vmb_type;      /* One of VMUSAGE_BOUND_* */
} vmu_bound_t;

/*
 * hash of visited objects (vnodes or shared amps)
 * key is address of vnode or amp.  Bounds lists known incore/non-incore
 * bounds for vnode/amp.
 */
typedef struct vmu_object {
    struct vmu_object *vmo_next;	/* free list */
    caddr_t vmo_key;
    short vmo_type;
    vmu_bound_t *vmo_bounds;
} vmu_object_t;
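
/*
 * Illustrative sketch (values assumed for illustration): after inspecting
 * the first 25 pages of a vnode, its vmu_object_t might carry the
 * following bound list:
 *
 *	vmo_bounds -> { vmb_start = 0,  vmb_end = 9,
 *	                vmb_type = VMUSAGE_BOUND_INCORE }
 *	           -> { vmb_start = 10, vmb_end = 24,
 *	                vmb_type = VMUSAGE_BOUND_NOT_INCORE }
 *
 * Pages 25 and beyond have no bound yet; a later lookup over that range
 * would insert a VMUSAGE_BOUND_UNKNOWN bound to be resolved against the
 * page cache.
 */
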
/*
 * Entity by which to count results.
 *
 * The entity structure keeps the current rss/swap counts for each entity
 * (zone, project, etc), and hashes of vm structures that have already
 * been visited for the entity.
 *
 * vme_next:	links the list of all entities currently being counted by
 *		vmu_calculate().
 *
 * vme_next_calc: links the list of entities related to the current process
 *		being counted by vmu_calculate_proc().
 *
 * vmu_calculate_proc() walks all processes.  For each process, it makes a
 * list of the entities related to that process using vme_next_calc.  This
 * list changes each time vmu_calculate_proc() is called.
 *
 */
typedef struct vmu_entity {
    struct vmu_entity *vme_next;
    struct vmu_entity *vme_next_calc;
    mod_hash_t *vme_vnode_hash;	/* vnodes visited for entity */
    mod_hash_t *vme_amp_hash;	/* shared amps visited for entity */
    mod_hash_t *vme_anon_hash;	/* cow anons visited for entity */
    vmusage_t vme_result;	/* identifies entity and results */
} vmu_entity_t;

/*
 * Hash of entities visited within a zone, and an entity for the zone
 * itself.
 */
typedef struct vmu_zone {
    struct vmu_zone *vmz_next;	/* free list */
    id_t vmz_id;
    vmu_entity_t *vmz_zone;
    mod_hash_t *vmz_projects_hash;
    mod_hash_t *vmz_tasks_hash;
    mod_hash_t *vmz_rusers_hash;
    mod_hash_t *vmz_eusers_hash;
} vmu_zone_t;

/*
 * Cache of results from last calculation
 */
typedef struct vmu_cache {
    vmusage_t *vmc_results;	/* Results from last call to */
				/* vm_getusage(). */
    uint64_t vmc_nresults;	/* Count of cached results */
    uint64_t vmc_refcnt;	/* refcnt for free */
    uint_t vmc_flags;		/* Flags for vm_getusage() */
    hrtime_t vmc_timestamp;	/* when cache was created */
} vmu_cache_t;

/*
 * top level rss info for the system
 */
typedef struct vmu_data {
    kmutex_t vmu_lock;			/* Protects vmu_data */
    kcondvar_t vmu_cv;			/* Used to signal threads */
					/* waiting for the rss calc */
					/* thread to finish */
    vmu_entity_t *vmu_system;		/* Entity for tracking */
					/* rss/swap for all processes */
					/* in all zones */
    mod_hash_t *vmu_zones_hash;		/* Zones visited */
    mod_hash_t *vmu_projects_col_hash;	/* These *_col_hash hashes */
    mod_hash_t *vmu_rusers_col_hash;	/* keep track of entities, */
    mod_hash_t *vmu_eusers_col_hash;	/* ignoring zoneid, in order */
					/* to implement VMUSAGE_COL_* */
					/* flags, which aggregate by */
					/* project or user regardless */
					/* of zoneid. */
    mod_hash_t *vmu_all_vnodes_hash;	/* System wide visited vnodes */
					/* to track incore/not-incore */
    mod_hash_t *vmu_all_amps_hash;	/* System wide visited shared */
					/* amps to track incore/not- */
					/* incore */
    vmu_entity_t *vmu_entities;		/* Linked list of entities */
    size_t vmu_nentities;		/* Count of entities in list */
    vmu_cache_t *vmu_cache;		/* Cached results */
    kthread_t *vmu_calc_thread;		/* NULL, or thread running */
					/* vmu_calculate() */
    uint_t vmu_calc_flags;		/* Flags being used by */
					/* currently running calc */
					/* thread */
    uint_t vmu_pending_flags;		/* Flags of vm_getusage() */
					/* threads waiting for */
					/* calc thread to finish */
    uint_t vmu_pending_waiters;		/* Number of threads waiting */
					/* for calc thread */
    vmu_bound_t *vmu_free_bounds;
    vmu_object_t *vmu_free_objects;
    vmu_entity_t *vmu_free_entities;
    vmu_zone_t *vmu_free_zones;
} vmu_data_t;

extern struct as kas;
extern proc_t *practive;
extern zone_t *global_zone;
extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

static vmu_data_t vmu_data;
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;

/*
 * Save a bound on the free list
 */
static void
vmu_free_bound(vmu_bound_t *bound)
{
    bound->vmb_next = vmu_data.vmu_free_bounds;
    vmu_data.vmu_free_bounds = bound;
}

/*
 * Free an object, and all visited bound info.
 */
static void
vmu_free_object(mod_hash_val_t val)
{
    vmu_object_t *obj = (vmu_object_t *)val;
    vmu_bound_t *bound = obj->vmo_bounds;
    vmu_bound_t *tmp;

    while (bound != NULL) {
        tmp = bound;
        bound = bound->vmb_next;
        vmu_free_bound(tmp);
    }
    obj->vmo_next = vmu_data.vmu_free_objects;
    vmu_data.vmu_free_objects = obj;
}

/*
 * Free an entity, and hashes of visited objects for that entity.
 */
static void
vmu_free_entity(mod_hash_val_t val)
{
    vmu_entity_t *entity = (vmu_entity_t *)val;

    if (entity->vme_vnode_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_vnode_hash);
    if (entity->vme_amp_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_amp_hash);
    if (entity->vme_anon_hash != NULL)
        i_mod_hash_clear_nosync(entity->vme_anon_hash);

    entity->vme_next = vmu_data.vmu_free_entities;
    vmu_data.vmu_free_entities = entity;
}

/*
 * Free zone entity, and all hashes of entities inside that zone,
 * which are projects, tasks, and users.
 */
static void
vmu_free_zone(mod_hash_val_t val)
{
    vmu_zone_t *zone = (vmu_zone_t *)val;

    if (zone->vmz_zone != NULL) {
        vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
        zone->vmz_zone = NULL;
    }
    if (zone->vmz_projects_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_projects_hash);
    if (zone->vmz_tasks_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
    if (zone->vmz_rusers_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
    if (zone->vmz_eusers_hash != NULL)
        i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
    zone->vmz_next = vmu_data.vmu_free_zones;
    vmu_data.vmu_free_zones = zone;
}

/*
 * Initialize synchronization primitives and hashes for system-wide tracking
 * of visited vnodes and shared amps.  Initialize results cache.
 */
376 */ 377 void 378 vm_usage_init() 379 { 380 mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL); 381 cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL); 382 383 vmu_data.vmu_system = NULL; 384 vmu_data.vmu_zones_hash = NULL; 385 vmu_data.vmu_projects_col_hash = NULL; 386 vmu_data.vmu_rusers_col_hash = NULL; 387 vmu_data.vmu_eusers_col_hash = NULL; 388 389 vmu_data.vmu_free_bounds = NULL; 390 vmu_data.vmu_free_objects = NULL; 391 vmu_data.vmu_free_entities = NULL; 392 vmu_data.vmu_free_zones = NULL; 393 394 vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash( 395 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, 396 sizeof (vnode_t)); 397 vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash( 398 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, 399 sizeof (struct anon_map)); 400 vmu_data.vmu_projects_col_hash = mod_hash_create_idhash( 401 "vmusage collapsed project hash", VMUSAGE_HASH_SIZE, 402 vmu_free_entity); 403 vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash( 404 "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE, 405 vmu_free_entity); 406 vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash( 407 "vmusage collpased euser hash", VMUSAGE_HASH_SIZE, 408 vmu_free_entity); 409 vmu_data.vmu_zones_hash = mod_hash_create_idhash( 410 "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone); 411 412 vmu_bound_cache = kmem_cache_create("vmu_bound_cache", 413 sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 414 vmu_object_cache = kmem_cache_create("vmu_object_cache", 415 sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 416 417 vmu_data.vmu_entities = NULL; 418 vmu_data.vmu_nentities = 0; 419 420 vmu_data.vmu_cache = NULL; 421 vmu_data.vmu_calc_thread = NULL; 422 vmu_data.vmu_calc_flags = 0; 423 vmu_data.vmu_pending_flags = 0; 424 vmu_data.vmu_pending_waiters = 0; 425 } 426 427 /* 428 * Allocate hashes for tracking vm objects visited for an entity. 429 * Update list of entities. 430 */ 431 static vmu_entity_t * 432 vmu_alloc_entity(id_t id, int type, id_t zoneid) 433 { 434 vmu_entity_t *entity; 435 436 if (vmu_data.vmu_free_entities != NULL) { 437 entity = vmu_data.vmu_free_entities; 438 vmu_data.vmu_free_entities = 439 vmu_data.vmu_free_entities->vme_next; 440 bzero(&entity->vme_result, sizeof (vmusage_t)); 441 } else { 442 entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP); 443 } 444 entity->vme_result.vmu_id = id; 445 entity->vme_result.vmu_zoneid = zoneid; 446 entity->vme_result.vmu_type = type; 447 448 if (entity->vme_vnode_hash == NULL) 449 entity->vme_vnode_hash = mod_hash_create_ptrhash( 450 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, 451 sizeof (vnode_t)); 452 453 if (entity->vme_amp_hash == NULL) 454 entity->vme_amp_hash = mod_hash_create_ptrhash( 455 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, 456 sizeof (struct anon_map)); 457 458 if (entity->vme_anon_hash == NULL) 459 entity->vme_anon_hash = mod_hash_create_ptrhash( 460 "vmusage anon hash", VMUSAGE_HASH_SIZE, 461 mod_hash_null_valdtor, sizeof (struct anon)); 462 463 entity->vme_next = vmu_data.vmu_entities; 464 vmu_data.vmu_entities = entity; 465 vmu_data.vmu_nentities++; 466 467 return (entity); 468 } 469 470 /* 471 * Allocate a zone entity, and hashes for tracking visited vm objects 472 * for projects, tasks, and users within that zone. 
473 */ 474 static vmu_zone_t * 475 vmu_alloc_zone(id_t id) 476 { 477 vmu_zone_t *zone; 478 479 if (vmu_data.vmu_free_zones != NULL) { 480 zone = vmu_data.vmu_free_zones; 481 vmu_data.vmu_free_zones = 482 vmu_data.vmu_free_zones->vmz_next; 483 zone->vmz_next = NULL; 484 zone->vmz_zone = NULL; 485 } else { 486 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP); 487 } 488 489 zone->vmz_id = id; 490 491 if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) 492 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); 493 494 if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | 495 VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL) 496 zone->vmz_projects_hash = mod_hash_create_idhash( 497 "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity); 498 499 if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) 500 != 0 && zone->vmz_tasks_hash == NULL) 501 zone->vmz_tasks_hash = mod_hash_create_idhash( 502 "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity); 503 504 if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) 505 != 0 && zone->vmz_rusers_hash == NULL) 506 zone->vmz_rusers_hash = mod_hash_create_idhash( 507 "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); 508 509 if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) 510 != 0 && zone->vmz_eusers_hash == NULL) 511 zone->vmz_eusers_hash = mod_hash_create_idhash( 512 "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); 513 514 return (zone); 515 } 516 517 /* 518 * Allocate a structure for tracking visited bounds for a vm object. 519 */ 520 static vmu_object_t * 521 vmu_alloc_object(caddr_t key, int type) 522 { 523 vmu_object_t *object; 524 525 if (vmu_data.vmu_free_objects != NULL) { 526 object = vmu_data.vmu_free_objects; 527 vmu_data.vmu_free_objects = 528 vmu_data.vmu_free_objects->vmo_next; 529 } else { 530 object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP); 531 } 532 533 object->vmo_key = key; 534 object->vmo_type = type; 535 object->vmo_bounds = NULL; 536 537 return (object); 538 } 539 540 /* 541 * Allocate and return a bound structure. 542 */ 543 static vmu_bound_t * 544 vmu_alloc_bound() 545 { 546 vmu_bound_t *bound; 547 548 if (vmu_data.vmu_free_bounds != NULL) { 549 bound = vmu_data.vmu_free_bounds; 550 vmu_data.vmu_free_bounds = 551 vmu_data.vmu_free_bounds->vmb_next; 552 bzero(bound, sizeof (vmu_bound_t)); 553 } else { 554 bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP); 555 bzero(bound, sizeof (vmu_bound_t)); 556 } 557 return (bound); 558 } 559 560 /* 561 * vmu_find_insert_* functions implement hash lookup or allocate and 562 * insert operations. 
563 */ 564 static vmu_object_t * 565 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) 566 { 567 int ret; 568 vmu_object_t *object; 569 570 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, 571 (mod_hash_val_t *)&object); 572 if (ret != 0) { 573 object = vmu_alloc_object(key, type); 574 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, 575 (mod_hash_val_t)object, (mod_hash_hndl_t)0); 576 ASSERT(ret == 0); 577 } 578 return (object); 579 } 580 581 static int 582 vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) 583 { 584 int ret; 585 caddr_t val; 586 587 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, 588 (mod_hash_val_t *)&val); 589 590 if (ret == 0) 591 return (0); 592 593 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, 594 (mod_hash_val_t)key, (mod_hash_hndl_t)0); 595 596 ASSERT(ret == 0); 597 598 return (1); 599 } 600 601 static vmu_entity_t * 602 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid) 603 { 604 int ret; 605 vmu_entity_t *entity; 606 607 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id, 608 (mod_hash_val_t *)&entity); 609 if (ret != 0) { 610 entity = vmu_alloc_entity(id, type, zoneid); 611 ret = i_mod_hash_insert_nosync(hash, 612 (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity, 613 (mod_hash_hndl_t)0); 614 ASSERT(ret == 0); 615 } 616 return (entity); 617 } 618 619 620 621 622 /* 623 * Returns list of object bounds between start and end. New bounds inserted 624 * by this call are given type. 625 * 626 * Returns the number of pages covered if new bounds are created. Returns 0 627 * if region between start/end consists of all existing bounds. 628 */ 629 static pgcnt_t 630 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t 631 end, char type, vmu_bound_t **first, vmu_bound_t **last) 632 { 633 vmu_bound_t *next; 634 vmu_bound_t *prev = NULL; 635 vmu_bound_t *tmp = NULL; 636 pgcnt_t ret = 0; 637 638 *first = *last = NULL; 639 640 for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) { 641 /* 642 * Find bounds overlapping or overlapped by range [start,end]. 
643 */ 644 if (start > next->vmb_end) { 645 /* bound is before new bound */ 646 prev = next; 647 continue; 648 } 649 if (next->vmb_start > end) { 650 /* bound is after new bound */ 651 break; 652 } 653 if (*first == NULL) 654 *first = next; 655 *last = next; 656 } 657 658 if (*first == NULL) { 659 ASSERT(*last == NULL); 660 /* 661 * No bounds overlapping range [start,end], so create new 662 * bound 663 */ 664 tmp = vmu_alloc_bound(); 665 tmp->vmb_start = start; 666 tmp->vmb_end = end; 667 tmp->vmb_type = type; 668 if (prev == NULL) { 669 tmp->vmb_next = ro->vmo_bounds; 670 ro->vmo_bounds = tmp; 671 } else { 672 tmp->vmb_next = prev->vmb_next; 673 prev->vmb_next = tmp; 674 } 675 *first = tmp; 676 *last = tmp; 677 ASSERT(tmp->vmb_end >= tmp->vmb_start); 678 ret = tmp->vmb_end - tmp->vmb_start + 1; 679 return (ret); 680 } 681 682 /* Check to see if start is before first known bound */ 683 ASSERT(first != NULL && last != NULL); 684 next = (*first); 685 if (start < (*first)->vmb_start) { 686 /* Create new bound before first bound */ 687 tmp = vmu_alloc_bound(); 688 tmp->vmb_start = start; 689 tmp->vmb_end = (*first)->vmb_start - 1; 690 tmp->vmb_type = type; 691 tmp->vmb_next = *first; 692 if (*first == ro->vmo_bounds) 693 ro->vmo_bounds = tmp; 694 if (prev != NULL) 695 prev->vmb_next = tmp; 696 ASSERT(tmp->vmb_end >= tmp->vmb_start); 697 ret += tmp->vmb_end - tmp->vmb_start + 1; 698 *first = tmp; 699 } 700 /* 701 * Between start and end, search for gaps between and after existing 702 * bounds. Create new bounds to fill gaps if they exist. 703 */ 704 while (end > next->vmb_end) { 705 /* 706 * Check for gap between bound and next bound. if no gap, 707 * continue. 708 */ 709 if ((next != *last) && 710 ((next->vmb_end + 1) == next->vmb_next->vmb_start)) { 711 next = next->vmb_next; 712 continue; 713 } 714 /* 715 * Insert new bound in gap after bound, and before next 716 * bound if next bound exists. 717 */ 718 tmp = vmu_alloc_bound(); 719 tmp->vmb_type = type; 720 tmp->vmb_next = next->vmb_next; 721 tmp->vmb_start = next->vmb_end + 1; 722 723 if (next != *last) { 724 tmp->vmb_end = next->vmb_next->vmb_start - 1; 725 ASSERT(tmp->vmb_end >= tmp->vmb_start); 726 ret += tmp->vmb_end - tmp->vmb_start + 1; 727 next->vmb_next = tmp; 728 next = tmp->vmb_next; 729 } else { 730 tmp->vmb_end = end; 731 ASSERT(tmp->vmb_end >= tmp->vmb_start); 732 ret += tmp->vmb_end - tmp->vmb_start + 1; 733 next->vmb_next = tmp; 734 *last = tmp; 735 break; 736 } 737 } 738 return (ret); 739 } 740 741 /* 742 * vmu_update_bounds() 743 * 744 * first, last: list of continuous bounds, of which zero or more are of 745 * type VMUSAGE_BOUND_UNKNOWN. 746 * 747 * new_first, new_last: list of continuous bounds, of which none are of 748 * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to 749 * update the types of bounds in (first,last) with 750 * type VMUSAGE_BOUND_UNKNOWN. 751 * 752 * For the list of bounds (first,last), this function updates any bounds 753 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in 754 * the list (new_first, new_last). 755 * 756 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list 757 * (new_first, new_last), it will be split into multiple bounds. 758 * 759 * Return value: 760 * The number of pages in the list of bounds (first,last) that were of 761 * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type 762 * VMUSAGE_BOUND_INCORE. 
763 * 764 */ 765 static pgcnt_t 766 vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last, 767 vmu_bound_t *new_first, vmu_bound_t *new_last) 768 { 769 vmu_bound_t *next, *new_next, *tmp; 770 pgcnt_t rss = 0; 771 772 next = *first; 773 new_next = new_first; 774 775 /* 776 * Verify first and last bound are covered by new bounds if they 777 * have unknown type. 778 */ 779 ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN || 780 (*first)->vmb_start >= new_next->vmb_start); 781 ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN || 782 (*last)->vmb_end <= new_last->vmb_end); 783 for (;;) { 784 /* If bound already has type, proceed to next bound */ 785 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { 786 if (next == *last) 787 break; 788 next = next->vmb_next; 789 continue; 790 } 791 while (new_next->vmb_end < next->vmb_start) 792 new_next = new_next->vmb_next; 793 ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN); 794 next->vmb_type = new_next->vmb_type; 795 if (new_next->vmb_end < next->vmb_end) { 796 /* need to split bound */ 797 tmp = vmu_alloc_bound(); 798 tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN; 799 tmp->vmb_start = new_next->vmb_end + 1; 800 tmp->vmb_end = next->vmb_end; 801 tmp->vmb_next = next->vmb_next; 802 next->vmb_end = new_next->vmb_end; 803 next->vmb_next = tmp; 804 if (*last == next) 805 *last = tmp; 806 if (next->vmb_type == VMUSAGE_BOUND_INCORE) 807 rss += next->vmb_end - next->vmb_start + 1; 808 next = tmp; 809 } else { 810 if (next->vmb_type == VMUSAGE_BOUND_INCORE) 811 rss += next->vmb_end - next->vmb_start + 1; 812 if (next == *last) 813 break; 814 next = next->vmb_next; 815 } 816 } 817 return (rss); 818 } 819 820 /* 821 * merges adjacent bounds with same type between first and last bound. 822 * After merge, last pointer is no longer valid, as last bound may be 823 * merged away. 824 */ 825 static void 826 vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last) 827 { 828 vmu_bound_t *next; 829 vmu_bound_t *tmp; 830 831 ASSERT(*first != NULL); 832 ASSERT(*last != NULL); 833 834 next = *first; 835 while (next != *last) { 836 837 /* If bounds are adjacent and have same type, merge them */ 838 if (((next->vmb_end + 1) == next->vmb_next->vmb_start) && 839 (next->vmb_type == next->vmb_next->vmb_type)) { 840 tmp = next->vmb_next; 841 next->vmb_end = tmp->vmb_end; 842 next->vmb_next = tmp->vmb_next; 843 vmu_free_bound(tmp); 844 if (tmp == *last) 845 *last = next; 846 } else { 847 next = next->vmb_next; 848 } 849 } 850 } 851 852 /* 853 * Given an amp and a list of bounds, updates each bound's type with 854 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE. 855 * 856 * If a bound is partially incore, it will be split into two bounds. 857 * first and last may be modified, as bounds may be split into multiple 858 * bounds if the are partially incore/not-incore. 
859 * 860 * Set incore to non-zero if bounds are already known to be incore 861 * 862 */ 863 static void 864 vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first, 865 vmu_bound_t **last, boolean_t incore) 866 { 867 vmu_bound_t *next; 868 vmu_bound_t *tmp; 869 pgcnt_t index; 870 short bound_type; 871 short page_type; 872 vnode_t *vn; 873 anoff_t off; 874 struct anon *ap; 875 876 next = *first; 877 /* Shared anon slots don't change once set */ 878 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 879 for (;;) { 880 if (incore == B_TRUE) 881 next->vmb_type = VMUSAGE_BOUND_INCORE; 882 883 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { 884 if (next == *last) 885 break; 886 next = next->vmb_next; 887 continue; 888 } 889 bound_type = next->vmb_type; 890 index = next->vmb_start; 891 while (index <= next->vmb_end) { 892 893 /* 894 * These are used to determine how much to increment 895 * index when a large page is found. 896 */ 897 page_t *page; 898 pgcnt_t pgcnt = 1; 899 uint_t pgshft; 900 pgcnt_t pgmsk; 901 902 ap = anon_get_ptr(amp->ahp, index); 903 if (ap != NULL) 904 swap_xlate(ap, &vn, &off); 905 906 if (ap != NULL && vn != NULL && vn->v_pages != NULL && 907 (page = page_exists(vn, off)) != NULL) { 908 page_type = VMUSAGE_BOUND_INCORE; 909 if (page->p_szc > 0) { 910 pgcnt = page_get_pagecnt(page->p_szc); 911 pgshft = page_get_shift(page->p_szc); 912 pgmsk = (0x1 << (pgshft - PAGESHIFT)) 913 - 1; 914 } 915 } else { 916 page_type = VMUSAGE_BOUND_NOT_INCORE; 917 } 918 if (bound_type == VMUSAGE_BOUND_UNKNOWN) { 919 next->vmb_type = page_type; 920 } else if (next->vmb_type != page_type) { 921 /* 922 * if current bound type does not match page 923 * type, need to split off new bound. 924 */ 925 tmp = vmu_alloc_bound(); 926 tmp->vmb_type = page_type; 927 tmp->vmb_start = index; 928 tmp->vmb_end = next->vmb_end; 929 tmp->vmb_next = next->vmb_next; 930 next->vmb_end = index - 1; 931 next->vmb_next = tmp; 932 if (*last == next) 933 *last = tmp; 934 next = tmp; 935 } 936 if (pgcnt > 1) { 937 /* 938 * If inside large page, jump to next large 939 * page 940 */ 941 index = (index & ~pgmsk) + pgcnt; 942 } else { 943 index++; 944 } 945 } 946 if (next == *last) { 947 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); 948 break; 949 } else 950 next = next->vmb_next; 951 } 952 ANON_LOCK_EXIT(&->a_rwlock); 953 } 954 955 /* 956 * Same as vmu_amp_update_incore_bounds(), except for tracking 957 * incore-/not-incore for vnodes. 958 */ 959 static void 960 vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first, 961 vmu_bound_t **last) 962 { 963 vmu_bound_t *next; 964 vmu_bound_t *tmp; 965 pgcnt_t index; 966 short bound_type; 967 short page_type; 968 969 next = *first; 970 for (;;) { 971 if (vnode->v_pages == NULL) 972 next->vmb_type = VMUSAGE_BOUND_NOT_INCORE; 973 974 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { 975 if (next == *last) 976 break; 977 next = next->vmb_next; 978 continue; 979 } 980 981 bound_type = next->vmb_type; 982 index = next->vmb_start; 983 while (index <= next->vmb_end) { 984 985 /* 986 * These are used to determine how much to increment 987 * index when a large page is found. 
988 */ 989 page_t *page; 990 pgcnt_t pgcnt = 1; 991 uint_t pgshft; 992 pgcnt_t pgmsk; 993 994 if (vnode->v_pages != NULL && 995 (page = page_exists(vnode, ptob(index))) != NULL) { 996 page_type = VMUSAGE_BOUND_INCORE; 997 if (page->p_szc > 0) { 998 pgcnt = page_get_pagecnt(page->p_szc); 999 pgshft = page_get_shift(page->p_szc); 1000 pgmsk = (0x1 << (pgshft - PAGESHIFT)) 1001 - 1; 1002 } 1003 } else { 1004 page_type = VMUSAGE_BOUND_NOT_INCORE; 1005 } 1006 if (bound_type == VMUSAGE_BOUND_UNKNOWN) { 1007 next->vmb_type = page_type; 1008 } else if (next->vmb_type != page_type) { 1009 /* 1010 * if current bound type does not match page 1011 * type, need to split off new bound. 1012 */ 1013 tmp = vmu_alloc_bound(); 1014 tmp->vmb_type = page_type; 1015 tmp->vmb_start = index; 1016 tmp->vmb_end = next->vmb_end; 1017 tmp->vmb_next = next->vmb_next; 1018 next->vmb_end = index - 1; 1019 next->vmb_next = tmp; 1020 if (*last == next) 1021 *last = tmp; 1022 next = tmp; 1023 } 1024 if (pgcnt > 1) { 1025 /* 1026 * If inside large page, jump to next large 1027 * page 1028 */ 1029 index = (index & ~pgmsk) + pgcnt; 1030 } else { 1031 index++; 1032 } 1033 } 1034 if (next == *last) { 1035 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); 1036 break; 1037 } else 1038 next = next->vmb_next; 1039 } 1040 } 1041 1042 /* 1043 * Calculate the rss and swap consumed by a segment. vmu_entities is the 1044 * list of entities to visit. For shared segments, the vnode or amp 1045 * is looked up in each entity to see if has been already counted. Private 1046 * anon pages are checked per entity to ensure that cow pages are not 1047 * double counted. 1048 * 1049 * For private mapped files, first the amp is checked for private pages. 1050 * Bounds not backed by the amp are looked up in the vnode for each entity 1051 * to avoid double counting of private COW vnode pages. 1052 */ 1053 static void 1054 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) 1055 { 1056 struct segvn_data *svd; 1057 struct shm_data *shmd; 1058 struct spt_data *sptd; 1059 vmu_object_t *shared_object = NULL; 1060 vmu_object_t *entity_object = NULL; 1061 vmu_entity_t *entity; 1062 vmusage_t *result; 1063 vmu_bound_t *first = NULL; 1064 vmu_bound_t *last = NULL; 1065 vmu_bound_t *cur = NULL; 1066 vmu_bound_t *e_first = NULL; 1067 vmu_bound_t *e_last = NULL; 1068 vmu_bound_t *tmp; 1069 pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt; 1070 struct anon_map *private_amp = NULL; 1071 boolean_t incore = B_FALSE; 1072 boolean_t shared = B_FALSE; 1073 int file = 0; 1074 pgcnt_t swresv = 0; 1075 pgcnt_t panon = 0; 1076 1077 /* Can zero-length segments exist? Not sure, so parenoia */ 1078 if (seg->s_size <= 0) 1079 return; 1080 1081 /* 1082 * Figure out if there is a shared object (such as a named vnode or 1083 * a shared amp, then figure out if there is a private amp, which 1084 * identifies private pages. 
1085 */ 1086 if (seg->s_ops == &segvn_ops) { 1087 svd = (struct segvn_data *)seg->s_data; 1088 if (svd->type == MAP_SHARED) 1089 shared = B_TRUE; 1090 else 1091 swresv = svd->swresv; 1092 1093 if (svd->vp != NULL) { 1094 file = 1; 1095 shared_object = vmu_find_insert_object( 1096 vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp, 1097 VMUSAGE_TYPE_VNODE); 1098 s_start = btop(svd->offset); 1099 s_end = btop(svd->offset + seg->s_size) - 1; 1100 } 1101 if (svd->amp != NULL && svd->type == MAP_SHARED) { 1102 ASSERT(shared_object == NULL); 1103 shared_object = vmu_find_insert_object( 1104 vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp, 1105 VMUSAGE_TYPE_AMP); 1106 s_start = svd->anon_index; 1107 s_end = svd->anon_index + btop(seg->s_size) - 1; 1108 /* schedctl mappings are always in core */ 1109 if (svd->amp->swresv == 0) 1110 incore = B_TRUE; 1111 } 1112 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 1113 /* 1114 * Text replication anon maps can be shared across all zones. 1115 * Space used for text replication is typically capped as 1116 * small % of memory. To keep it simple for now we don't 1117 * account for swap and memory space used for text replication. 1118 */ 1119 if (svd->tr_state == SEGVN_TR_OFF && svd->amp != NULL && 1120 svd->type == MAP_PRIVATE) { 1121 private_amp = svd->amp; 1122 p_start = svd->anon_index; 1123 p_end = svd->anon_index + btop(seg->s_size) - 1; 1124 } 1125 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 1126 } else if (seg->s_ops == &segspt_shmops) { 1127 shared = B_TRUE; 1128 shmd = (struct shm_data *)seg->s_data; 1129 shared_object = vmu_find_insert_object( 1130 vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp, 1131 VMUSAGE_TYPE_AMP); 1132 s_start = 0; 1133 s_end = btop(seg->s_size) - 1; 1134 sptd = shmd->shm_sptseg->s_data; 1135 1136 /* ism segments are always incore and do not reserve swap */ 1137 if (sptd->spt_flags & SHM_SHARE_MMU) 1138 incore = B_TRUE; 1139 1140 } else { 1141 return; 1142 } 1143 1144 /* 1145 * If there is a private amp, count anon pages that exist. If an 1146 * anon has a refcnt > 1 (cow sharing), then save the anon in a 1147 * hash so that it is not double counted. 1148 * 1149 * If there is also a shared object, they figure out the bounds 1150 * which are not mapped by the private amp. 1151 */ 1152 if (private_amp != NULL) { 1153 1154 /* Enter as writer to prevent cow anons from being freed */ 1155 ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER); 1156 1157 p_index = p_start; 1158 s_index = s_start; 1159 1160 while (p_index <= p_end) { 1161 1162 pgcnt_t p_index_next; 1163 pgcnt_t p_bound_size; 1164 int cnt; 1165 anoff_t off; 1166 struct vnode *vn; 1167 struct anon *ap; 1168 page_t *page; /* For handling of large */ 1169 pgcnt_t pgcnt = 1; /* pages */ 1170 pgcnt_t pgstart; 1171 pgcnt_t pgend; 1172 uint_t pgshft; 1173 pgcnt_t pgmsk; 1174 1175 p_index_next = p_index; 1176 ap = anon_get_next_ptr(private_amp->ahp, 1177 &p_index_next); 1178 1179 /* 1180 * If next anon is past end of mapping, simulate 1181 * end of anon so loop terminates. 1182 */ 1183 if (p_index_next > p_end) { 1184 p_index_next = p_end + 1; 1185 ap = NULL; 1186 } 1187 /* 1188 * For cow segments, keep track of bounds not 1189 * backed by private amp so they can be looked 1190 * up in the backing vnode 1191 */ 1192 if (p_index_next != p_index) { 1193 1194 /* 1195 * Compute index difference between anon and 1196 * previous anon. 
1197 */ 1198 p_bound_size = p_index_next - p_index - 1; 1199 1200 if (shared_object != NULL) { 1201 cur = vmu_alloc_bound(); 1202 cur->vmb_next = NULL; 1203 cur->vmb_start = s_index; 1204 cur->vmb_end = s_index + p_bound_size; 1205 cur->vmb_type = VMUSAGE_BOUND_UNKNOWN; 1206 if (first == NULL) { 1207 first = cur; 1208 last = cur; 1209 } else { 1210 last->vmb_next = cur; 1211 last = cur; 1212 } 1213 } 1214 p_index = p_index + p_bound_size + 1; 1215 s_index = s_index + p_bound_size + 1; 1216 } 1217 1218 /* Detect end of anons in amp */ 1219 if (ap == NULL) 1220 break; 1221 1222 cnt = ap->an_refcnt; 1223 swap_xlate(ap, &vn, &off); 1224 1225 if (vn == NULL || vn->v_pages == NULL || 1226 (page = page_exists(vn, off)) == NULL) { 1227 p_index++; 1228 s_index++; 1229 continue; 1230 } 1231 1232 /* 1233 * If large page is found, compute portion of large 1234 * page in mapping, and increment indicies to the next 1235 * large page. 1236 */ 1237 if (page->p_szc > 0) { 1238 1239 pgcnt = page_get_pagecnt(page->p_szc); 1240 pgshft = page_get_shift(page->p_szc); 1241 pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1; 1242 1243 /* First page in large page */ 1244 pgstart = p_index & ~pgmsk; 1245 /* Last page in large page */ 1246 pgend = pgstart + pgcnt - 1; 1247 /* 1248 * Artifically end page if page extends past 1249 * end of mapping. 1250 */ 1251 if (pgend > p_end) 1252 pgend = p_end; 1253 1254 /* 1255 * Compute number of pages from large page 1256 * which are mapped. 1257 */ 1258 pgcnt = pgend - p_index + 1; 1259 1260 /* 1261 * Point indicies at page after large page, 1262 * or at page after end of mapping. 1263 */ 1264 p_index += pgcnt; 1265 s_index += pgcnt; 1266 } else { 1267 p_index++; 1268 s_index++; 1269 } 1270 1271 /* 1272 * Assume anon structs with a refcnt 1273 * of 1 are not cow shared, so there 1274 * is no reason to track them per entity. 1275 */ 1276 if (cnt == 1) { 1277 panon += pgcnt; 1278 continue; 1279 } 1280 for (entity = vmu_entities; entity != NULL; 1281 entity = entity->vme_next_calc) { 1282 1283 result = &entity->vme_result; 1284 /* 1285 * Track cow anons per entity so 1286 * they are not double counted. 1287 */ 1288 if (vmu_find_insert_anon(entity->vme_anon_hash, 1289 (caddr_t)ap) == 0) 1290 continue; 1291 1292 result->vmu_rss_all += (pgcnt << PAGESHIFT); 1293 result->vmu_rss_private += 1294 (pgcnt << PAGESHIFT); 1295 } 1296 } 1297 ANON_LOCK_EXIT(&private_amp->a_rwlock); 1298 } 1299 1300 /* Add up resident anon and swap reserved for private mappings */ 1301 if (swresv > 0 || panon > 0) { 1302 for (entity = vmu_entities; entity != NULL; 1303 entity = entity->vme_next_calc) { 1304 result = &entity->vme_result; 1305 result->vmu_swap_all += swresv; 1306 result->vmu_swap_private += swresv; 1307 result->vmu_rss_all += (panon << PAGESHIFT); 1308 result->vmu_rss_private += (panon << PAGESHIFT); 1309 } 1310 } 1311 1312 /* Compute resident pages backing shared amp or named vnode */ 1313 if (shared_object != NULL) { 1314 if (first == NULL) { 1315 /* 1316 * No private amp, or private amp has no anon 1317 * structs. This means entire segment is backed by 1318 * the shared object. 1319 */ 1320 first = vmu_alloc_bound(); 1321 first->vmb_next = NULL; 1322 first->vmb_start = s_start; 1323 first->vmb_end = s_end; 1324 first->vmb_type = VMUSAGE_BOUND_UNKNOWN; 1325 } 1326 /* 1327 * Iterate bounds not backed by private amp, and compute 1328 * resident pages. 
1329 */ 1330 cur = first; 1331 while (cur != NULL) { 1332 1333 if (vmu_insert_lookup_object_bounds(shared_object, 1334 cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN, 1335 &first, &last) > 0) { 1336 /* new bounds, find incore/not-incore */ 1337 if (shared_object->vmo_type == 1338 VMUSAGE_TYPE_VNODE) 1339 vmu_vnode_update_incore_bounds( 1340 (vnode_t *) 1341 shared_object->vmo_key, &first, 1342 &last); 1343 else 1344 vmu_amp_update_incore_bounds( 1345 (struct anon_map *) 1346 shared_object->vmo_key, &first, 1347 &last, incore); 1348 vmu_merge_bounds(&first, &last); 1349 } 1350 for (entity = vmu_entities; entity != NULL; 1351 entity = entity->vme_next_calc) { 1352 1353 result = &entity->vme_result; 1354 1355 entity_object = vmu_find_insert_object( 1356 shared_object->vmo_type == 1357 VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash: 1358 entity->vme_amp_hash, 1359 shared_object->vmo_key, 1360 shared_object->vmo_type); 1361 1362 virt = vmu_insert_lookup_object_bounds( 1363 entity_object, cur->vmb_start, cur->vmb_end, 1364 VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last); 1365 1366 if (virt == 0) 1367 continue; 1368 /* 1369 * Range visited for this entity 1370 */ 1371 rss = vmu_update_bounds(&e_first, 1372 &e_last, first, last); 1373 result->vmu_rss_all += (rss << PAGESHIFT); 1374 if (shared == B_TRUE && file == B_FALSE) { 1375 /* shared anon mapping */ 1376 result->vmu_swap_all += 1377 (virt << PAGESHIFT); 1378 result->vmu_swap_shared += 1379 (virt << PAGESHIFT); 1380 result->vmu_rss_shared += 1381 (rss << PAGESHIFT); 1382 } else if (shared == B_TRUE && file == B_TRUE) { 1383 /* shared file mapping */ 1384 result->vmu_rss_shared += 1385 (rss << PAGESHIFT); 1386 } else if (shared == B_FALSE && 1387 file == B_TRUE) { 1388 /* private file mapping */ 1389 result->vmu_rss_private += 1390 (rss << PAGESHIFT); 1391 } 1392 vmu_merge_bounds(&e_first, &e_last); 1393 } 1394 tmp = cur; 1395 cur = cur->vmb_next; 1396 vmu_free_bound(tmp); 1397 } 1398 } 1399 } 1400 1401 /* 1402 * Based on the current calculation flags, find the relevant entities 1403 * which are relative to the process. Then calculate each segment 1404 * in the process'es address space for each relevant entity. 
1405 */ 1406 static void 1407 vmu_calculate_proc(proc_t *p) 1408 { 1409 vmu_entity_t *entities = NULL; 1410 vmu_zone_t *zone; 1411 vmu_entity_t *tmp; 1412 struct as *as; 1413 struct seg *seg; 1414 int ret; 1415 1416 /* Figure out which entities are being computed */ 1417 if ((vmu_data.vmu_system) != NULL) { 1418 tmp = vmu_data.vmu_system; 1419 tmp->vme_next_calc = entities; 1420 entities = tmp; 1421 } 1422 if (vmu_data.vmu_calc_flags & 1423 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | 1424 VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | 1425 VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | 1426 VMUSAGE_ALL_EUSERS)) { 1427 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, 1428 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, 1429 (mod_hash_val_t *)&zone); 1430 if (ret != 0) { 1431 zone = vmu_alloc_zone(p->p_zone->zone_id); 1432 ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, 1433 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, 1434 (mod_hash_val_t)zone, (mod_hash_hndl_t)0); 1435 ASSERT(ret == 0); 1436 } 1437 if (zone->vmz_zone != NULL) { 1438 tmp = zone->vmz_zone; 1439 tmp->vme_next_calc = entities; 1440 entities = tmp; 1441 } 1442 if (vmu_data.vmu_calc_flags & 1443 (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) { 1444 tmp = vmu_find_insert_entity(zone->vmz_projects_hash, 1445 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, 1446 zone->vmz_id); 1447 tmp->vme_next_calc = entities; 1448 entities = tmp; 1449 } 1450 if (vmu_data.vmu_calc_flags & 1451 (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) { 1452 tmp = vmu_find_insert_entity(zone->vmz_tasks_hash, 1453 p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id); 1454 tmp->vme_next_calc = entities; 1455 entities = tmp; 1456 } 1457 if (vmu_data.vmu_calc_flags & 1458 (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) { 1459 tmp = vmu_find_insert_entity(zone->vmz_rusers_hash, 1460 crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id); 1461 tmp->vme_next_calc = entities; 1462 entities = tmp; 1463 } 1464 if (vmu_data.vmu_calc_flags & 1465 (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { 1466 tmp = vmu_find_insert_entity(zone->vmz_eusers_hash, 1467 crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id); 1468 tmp->vme_next_calc = entities; 1469 entities = tmp; 1470 } 1471 } 1472 /* Entities which collapse projects and users for all zones */ 1473 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) { 1474 tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash, 1475 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES); 1476 tmp->vme_next_calc = entities; 1477 entities = tmp; 1478 } 1479 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) { 1480 tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash, 1481 crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES); 1482 tmp->vme_next_calc = entities; 1483 entities = tmp; 1484 } 1485 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) { 1486 tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash, 1487 crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES); 1488 tmp->vme_next_calc = entities; 1489 entities = tmp; 1490 } 1491 1492 ASSERT(entities != NULL); 1493 /* process all segs in process's address space */ 1494 as = p->p_as; 1495 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1496 for (seg = AS_SEGFIRST(as); seg != NULL; 1497 seg = AS_SEGNEXT(as, seg)) { 1498 vmu_calculate_seg(entities, seg); 1499 } 1500 AS_LOCK_EXIT(as, &as->a_lock); 1501 } 1502 1503 /* 1504 * Free data created by previous call to vmu_calculate(). 
1505 */ 1506 static void 1507 vmu_clear_calc() 1508 { 1509 if (vmu_data.vmu_system != NULL) 1510 vmu_free_entity(vmu_data.vmu_system); 1511 vmu_data.vmu_system = NULL; 1512 if (vmu_data.vmu_zones_hash != NULL) 1513 i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash); 1514 if (vmu_data.vmu_projects_col_hash != NULL) 1515 i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash); 1516 if (vmu_data.vmu_rusers_col_hash != NULL) 1517 i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash); 1518 if (vmu_data.vmu_eusers_col_hash != NULL) 1519 i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash); 1520 1521 i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash); 1522 i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash); 1523 } 1524 1525 /* 1526 * Free unused data structures. These can result if the system workload 1527 * decreases between calculations. 1528 */ 1529 static void 1530 vmu_free_extra() 1531 { 1532 vmu_bound_t *tb; 1533 vmu_object_t *to; 1534 vmu_entity_t *te; 1535 vmu_zone_t *tz; 1536 1537 while (vmu_data.vmu_free_bounds != NULL) { 1538 tb = vmu_data.vmu_free_bounds; 1539 vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next; 1540 kmem_cache_free(vmu_bound_cache, tb); 1541 } 1542 while (vmu_data.vmu_free_objects != NULL) { 1543 to = vmu_data.vmu_free_objects; 1544 vmu_data.vmu_free_objects = 1545 vmu_data.vmu_free_objects->vmo_next; 1546 kmem_cache_free(vmu_object_cache, to); 1547 } 1548 while (vmu_data.vmu_free_entities != NULL) { 1549 te = vmu_data.vmu_free_entities; 1550 vmu_data.vmu_free_entities = 1551 vmu_data.vmu_free_entities->vme_next; 1552 if (te->vme_vnode_hash != NULL) 1553 mod_hash_destroy_hash(te->vme_vnode_hash); 1554 if (te->vme_amp_hash != NULL) 1555 mod_hash_destroy_hash(te->vme_amp_hash); 1556 if (te->vme_anon_hash != NULL) 1557 mod_hash_destroy_hash(te->vme_anon_hash); 1558 kmem_free(te, sizeof (vmu_entity_t)); 1559 } 1560 while (vmu_data.vmu_free_zones != NULL) { 1561 tz = vmu_data.vmu_free_zones; 1562 vmu_data.vmu_free_zones = 1563 vmu_data.vmu_free_zones->vmz_next; 1564 if (tz->vmz_projects_hash != NULL) 1565 mod_hash_destroy_hash(tz->vmz_projects_hash); 1566 if (tz->vmz_tasks_hash != NULL) 1567 mod_hash_destroy_hash(tz->vmz_tasks_hash); 1568 if (tz->vmz_rusers_hash != NULL) 1569 mod_hash_destroy_hash(tz->vmz_rusers_hash); 1570 if (tz->vmz_eusers_hash != NULL) 1571 mod_hash_destroy_hash(tz->vmz_eusers_hash); 1572 kmem_free(tz, sizeof (vmu_zone_t)); 1573 } 1574 } 1575 1576 extern kcondvar_t *pr_pid_cv; 1577 1578 /* 1579 * Determine which entity types are relevant and allocate the hashes to 1580 * track them. Then walk the process table and count rss and swap 1581 * for each process'es address space. Address space object such as 1582 * vnodes, amps and anons are tracked per entity, so that they are 1583 * not double counted in the results. 1584 * 1585 */ 1586 static void 1587 vmu_calculate() 1588 { 1589 int i = 0; 1590 int ret; 1591 proc_t *p; 1592 1593 vmu_clear_calc(); 1594 1595 if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM) 1596 vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, 1597 ALL_ZONES); 1598 1599 /* 1600 * Walk process table and calculate rss of each proc. 1601 * 1602 * Pidlock and p_lock cannot be held while doing the rss calculation. 1603 * This is because: 1604 * 1. The calculation allocates using KM_SLEEP. 1605 * 2. The calculation grabs a_lock, which cannot be grabbed 1606 * after p_lock. 1607 * 1608 * Since pidlock must be dropped, we cannot simply just walk the 1609 * practive list. 

    mutex_enter(&pidlock);
    for (i = 0; i < v.v_proc; i++) {
again:
        p = pid_entry(i);
        if (p == NULL)
            continue;

        mutex_enter(&p->p_lock);
        mutex_exit(&pidlock);

        if (panicstr) {
            mutex_exit(&p->p_lock);
            return;
        }

        /* Try to set P_PR_LOCK */
        ret = sprtrylock_proc(p);
        if (ret == -1) {
            /* Process in invalid state */
            mutex_exit(&p->p_lock);
            mutex_enter(&pidlock);
            continue;
        } else if (ret == 1) {
            /*
             * P_PR_LOCK is already set.  Wait and try again.
             * This also drops p_lock.
             */
            sprwaitlock_proc(p);
            mutex_enter(&pidlock);
            goto again;
        }
        mutex_exit(&p->p_lock);

        vmu_calculate_proc(p);

        mutex_enter(&p->p_lock);
        sprunlock(p);
        mutex_enter(&pidlock);
    }
    mutex_exit(&pidlock);

    vmu_free_extra();
}

/*
 * Allocate a new cache for N results satisfying flags.
 */
vmu_cache_t *
vmu_cache_alloc(size_t nres, uint_t flags)
{
    vmu_cache_t *cache;

    cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
    cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
    cache->vmc_nresults = nres;
    cache->vmc_flags = flags;
    cache->vmc_refcnt = 1;
    return (cache);
}

/*
 * Make sure cached results are not freed
 */
static void
vmu_cache_hold(vmu_cache_t *cache)
{
    ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
    cache->vmc_refcnt++;
}

/*
 * free cache data
 */
static void
vmu_cache_rele(vmu_cache_t *cache)
{
    ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
    ASSERT(cache->vmc_refcnt > 0);
    cache->vmc_refcnt--;
    if (cache->vmc_refcnt == 0) {
        kmem_free(cache->vmc_results, sizeof (vmusage_t) *
            cache->vmc_nresults);
        kmem_free(cache, sizeof (vmu_cache_t));
    }
}

/*
 * Copy out the cached results to a caller.  Inspect the caller's flags
 * and zone to determine which cached results should be copied.
 */
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
    uint_t flags, int cpflg)
{
    vmusage_t *result, *out_result;
    vmusage_t dummy;
    size_t i, count = 0;
    size_t bufsize;
    int ret = 0;
    uint_t types = 0;

    if (nres != NULL) {
        if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
            return (set_errno(EFAULT));
    } else {
        bufsize = 0;
    }

    /* figure out what results the caller is interested in */
    if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
        types |= VMUSAGE_SYSTEM;
    if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
        types |= VMUSAGE_ZONE;
    if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
        VMUSAGE_COL_PROJECTS))
        types |= VMUSAGE_PROJECTS;
    if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
        types |= VMUSAGE_TASKS;
    if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
        types |= VMUSAGE_RUSERS;
    if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
        types |= VMUSAGE_EUSERS;

    /* count results for current zone */
    out_result = buf;
    for (result = cache->vmc_results, i = 0;
        i < cache->vmc_nresults; result++, i++) {

        /* Do not return "other-zone" results to non-global zones */
        if (curproc->p_zone != global_zone &&
            curproc->p_zone->zone_id != result->vmu_zoneid)
            continue;

        /*
         * If non-global zone requests VMUSAGE_SYSTEM, fake
         * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
         */
        if (curproc->p_zone != global_zone &&
            (flags & VMUSAGE_SYSTEM) != 0 &&
            result->vmu_type == VMUSAGE_ZONE) {
            count++;
            if (out_result != NULL) {
                if (bufsize < count) {
                    ret = set_errno(EOVERFLOW);
                } else {
                    dummy = *result;
                    dummy.vmu_zoneid = ALL_ZONES;
                    dummy.vmu_id = 0;
                    dummy.vmu_type = VMUSAGE_SYSTEM;
                    if (ddi_copyout(&dummy, out_result,
                        sizeof (vmusage_t), cpflg))
                        return (set_errno(EFAULT));
                    out_result++;
                }
            }
        }

        /* Skip results that do not match requested type */
        if ((result->vmu_type & types) == 0)
            continue;

        /* Skip collated results if not requested */
        if (result->vmu_zoneid == ALL_ZONES) {
            if (result->vmu_type == VMUSAGE_PROJECTS &&
                (flags & VMUSAGE_COL_PROJECTS) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_EUSERS &&
                (flags & VMUSAGE_COL_EUSERS) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_RUSERS &&
                (flags & VMUSAGE_COL_RUSERS) == 0)
                continue;
        }

        /* Skip "other zone" results if not requested */
        if (result->vmu_zoneid != curproc->p_zone->zone_id) {
            if (result->vmu_type == VMUSAGE_ZONE &&
                (flags & VMUSAGE_ALL_ZONES) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_PROJECTS &&
                (flags & (VMUSAGE_ALL_PROJECTS |
                VMUSAGE_COL_PROJECTS)) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_TASKS &&
                (flags & VMUSAGE_ALL_TASKS) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_RUSERS &&
                (flags & (VMUSAGE_ALL_RUSERS |
                VMUSAGE_COL_RUSERS)) == 0)
                continue;
            if (result->vmu_type == VMUSAGE_EUSERS &&
                (flags & (VMUSAGE_ALL_EUSERS |
                VMUSAGE_COL_EUSERS)) == 0)
                continue;
        }
        count++;
        if (out_result != NULL) {
            if (bufsize < count) {
                ret = set_errno(EOVERFLOW);
            } else {
                if (ddi_copyout(result, out_result,
                    sizeof (vmusage_t), cpflg))
                    return (set_errno(EFAULT));
                out_result++;
            }
        }
    }
    if (nres != NULL)
        if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
            return (set_errno(EFAULT));

    return (ret);
}

/*
 * vm_getusage()
 *
 * Counts rss and swap by zone, project, task, and/or user.  The flags
 * argument determines the type of results structures returned.  Flags
 * requesting results from more than one zone are "flattened" to the local
 * zone if the caller is not the global zone.
 *
 * args:
 *	flags:	bitmap consisting of one or more of VMUSAGE_*.
 *	age:	maximum allowable age (time since counting was done) in
 *		seconds of the results.  Results from previous callers are
 *		cached in kernel.
 *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only
 *		nres is set on success.
 *	nres:	Set to number of vmusage_t structures pointed to by buf
 *		before calling vm_getusage().
 *		On return 0 (success) or EOVERFLOW, is set to the number of
 *		result structures returned or attempted to return.
 *
 * returns 0 on success, -1 on failure:
 *	EINTR (interrupted)
 *	EOVERFLOW (nres too small for results, nres set to needed value
 *	for success)
 *	EINVAL (flags invalid)
 *	EFAULT (bad address for buf or nres)
 */
int
vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
{
    vmu_entity_t *entity;
    vmusage_t *result;
    int ret = 0;
    int cacherecent = 0;
    hrtime_t now;
    uint_t flags_orig;

    /*
     * Non-global zones cannot request system wide and/or collated
     * results, or the system result, so munge the flags accordingly.
     */
    flags_orig = flags;
    if (curproc->p_zone != global_zone) {
        if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
            flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
            flags |= VMUSAGE_PROJECTS;
        }
        if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
            flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
            flags |= VMUSAGE_RUSERS;
        }
        if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
            flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
            flags |= VMUSAGE_EUSERS;
        }
        if (flags & VMUSAGE_SYSTEM) {
            flags &= ~VMUSAGE_SYSTEM;
            flags |= VMUSAGE_ZONE;
        }
    }

    /* Check for unknown flags */
    if ((flags & (~VMUSAGE_MASK)) != 0)
        return (set_errno(EINVAL));

    /* Check for no flags */
    if ((flags & VMUSAGE_MASK) == 0)
        return (set_errno(EINVAL));

    mutex_enter(&vmu_data.vmu_lock);
    now = gethrtime();

start:
    if (vmu_data.vmu_cache != NULL) {

        vmu_cache_t *cache;

        if ((vmu_data.vmu_cache->vmc_timestamp +
            ((hrtime_t)age * NANOSEC)) > now)
            cacherecent = 1;

        if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
            cacherecent == 1) {
            cache = vmu_data.vmu_cache;
            vmu_cache_hold(cache);
            mutex_exit(&vmu_data.vmu_lock);

            ret = vmu_copyout_results(cache, buf, nres, flags_orig,
                cpflg);
            mutex_enter(&vmu_data.vmu_lock);
            vmu_cache_rele(cache);
            if (vmu_data.vmu_pending_waiters > 0)
                cv_broadcast(&vmu_data.vmu_cv);
            mutex_exit(&vmu_data.vmu_lock);
            return (ret);
        }
        /*
         * If the cache is recent, it is likely that there are other
         * consumers of vm_getusage running, so add their flags to the
         * desired flags for the calculation.
         */
        if (cacherecent == 1)
            flags = vmu_data.vmu_cache->vmc_flags | flags;
    }
    if (vmu_data.vmu_calc_thread == NULL) {

        vmu_cache_t *cache;

        vmu_data.vmu_calc_thread = curthread;
        vmu_data.vmu_calc_flags = flags;
        vmu_data.vmu_entities = NULL;
        vmu_data.vmu_nentities = 0;
        if (vmu_data.vmu_pending_waiters > 0)
            vmu_data.vmu_calc_flags |=
                vmu_data.vmu_pending_flags;

        vmu_data.vmu_pending_flags = 0;
        mutex_exit(&vmu_data.vmu_lock);
        vmu_calculate();
        mutex_enter(&vmu_data.vmu_lock);
        /* copy results to cache */
        if (vmu_data.vmu_cache != NULL)
            vmu_cache_rele(vmu_data.vmu_cache);
        cache = vmu_data.vmu_cache =
            vmu_cache_alloc(vmu_data.vmu_nentities,
            vmu_data.vmu_calc_flags);

        result = cache->vmc_results;
        for (entity = vmu_data.vmu_entities; entity != NULL;
            entity = entity->vme_next) {
            *result = entity->vme_result;
            result++;
        }
        cache->vmc_timestamp = gethrtime();
        vmu_cache_hold(cache);

        vmu_data.vmu_calc_flags = 0;
        vmu_data.vmu_calc_thread = NULL;

        if (vmu_data.vmu_pending_waiters > 0)
            cv_broadcast(&vmu_data.vmu_cv);

        mutex_exit(&vmu_data.vmu_lock);

        /* copy cache */
        ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
        mutex_enter(&vmu_data.vmu_lock);
        vmu_cache_rele(cache);
        mutex_exit(&vmu_data.vmu_lock);

        return (ret);
    }
    vmu_data.vmu_pending_flags |= flags;
    vmu_data.vmu_pending_waiters++;
    while (vmu_data.vmu_calc_thread != NULL) {
        if (cv_wait_sig(&vmu_data.vmu_cv,
            &vmu_data.vmu_lock) == 0) {
            vmu_data.vmu_pending_waiters--;
            mutex_exit(&vmu_data.vmu_lock);
            return (set_errno(EINTR));
        }
    }
    vmu_data.vmu_pending_waiters--;
    goto start;
}