/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * vm_usage * * This file implements the getvmusage() private system call. * getvmusage() counts the amount of resident memory pages and swap * reserved by the specified process collective. A "process collective" is * the set of processes owned by a particular zone, project, task, or user. * * rss and swap are counted so that for a given process collective, a page is * only counted once. For example, this means that if multiple processes in * the same project map the same page, then the project will only be charged * once for that page. On the other hand, if two processes in different * projects map the same page, then both projects will be charged * for the page. * * The vm_getusage() calculation is implemented so that the first thread * performs the rss/swap counting. Other callers will wait for that thread to * finish, copying the results. This enables multiple rcapds and prstats to * consume data from the same calculation. The results are also cached so that * a caller interested in recent results can just copy them instead of starting * a new calculation. The caller passes the maximum age (in seconds) of the * data. If the cached data is young enough, the cache is copied, otherwise, * a new calculation is executed and the cache is replaced with the new * data. * * The rss calculation for each process collective is as follows: * * - Inspect flags, determine if counting rss for zones, projects, tasks, * and/or users. * - For each proc: * - Figure out proc's collectives (zone, project, task, and/or user). * - For each seg in proc's address space: * - If seg is private: * - Lookup anons in the amp. * - For incore pages not previously visited for each of the * proc's collectives, add incore pagesize to each * collective. * Anons with a refcnt of 1 can be assumed to be not * previously visited. * - For address ranges without anons in the amp: * - Lookup pages in underlying vnode. * - For incore pages not previously visited for * each of the proc's collectives, add incore * pagesize to each collective. * - If seg is shared: * - Lookup pages in the shared amp or vnode. * - For incore pages not previously visited for each of * the proc's collectives, add incore pagesize to each * collective. * * Swap is reserved by private segments, and shared anonymous segments. * The only shared anon segments which do not reserve swap are ISM segments * and schedctl segments, both of which can be identified by having * amp->swresv == 0. * * The swap calculation for each collective is as follows: * * - Inspect flags, determine if counting swap for zones, projects, tasks, * and/or users.
* - For each proc: * - Figure out proc's collectives (zone, project, task, and/or user). * - For each seg in proc's address space: * - If seg is private: * - Add svd->swresv pages to swap count for each of the * proc's collectives. * - If seg is anon, shared, and amp->swresv != 0 * - For address ranges in amp not previously visited for * each of the proc's collectives, add size of address * range to the swap count for each collective. * * These two calculations are done simultaneously, with most of the work * being done in vmu_calculate_seg(). The results of the calculation are * copied into "vmu_data.vmu_cache_results". * * To perform the calculation, various things are tracked and cached: * * - incore/not-incore page ranges for all vnodes. * (vmu_data.vmu_all_vnodes_hash) * This eliminates looking up the same page more than once. * * - incore/not-incore page ranges for all shared amps. * (vmu_data.vmu_all_amps_hash) * This eliminates looking up the same page more than once. * * - visited page ranges for each collective. * - per vnode (entity->vme_vnode_hash) * - per shared amp (entity->vme_amp_hash) * For accurate counting of map-shared and COW-shared pages. * * - visited private anons (refcnt > 1) for each collective. * (entity->vme_anon_hash) * For accurate counting of COW-shared pages. * * The common accounting structure is the vmu_entity_t, which represents * collectives: * * - A zone. * - A project, task, or user within a zone. * - The entire system (vmu_data.vmu_system). * - Each collapsed (col) project and user. This means a given projid or * uid, regardless of which zone the process is in. For instance, * project 0 in the global zone and project 0 in a non-global zone are * the same collapsed project. * * Each entity structure tracks which pages have already been visited for * that entity (via previously inspected processes) so that these pages are * not double counted. */ #include <sys/errno.h> #include <sys/types.h> #include <sys/zone.h> #include <sys/proc.h> #include <sys/project.h> #include <sys/task.h> #include <sys/thread.h> #include <sys/time.h> #include <sys/mman.h> #include <sys/modhash.h> #include <sys/modhash_impl.h> #include <sys/shm.h> #include <sys/swap.h> #include <sys/synch.h> #include <sys/systm.h> #include <sys/var.h> #include <sys/vm_usage.h> #include <sys/zone.h> #include <sys/sunddi.h> #include <sys/avl.h> #include <vm/anon.h> #include <vm/as.h> #include <vm/seg_vn.h> #include <vm/seg_spt.h> #define VMUSAGE_HASH_SIZE 512 #define VMUSAGE_TYPE_VNODE 1 #define VMUSAGE_TYPE_AMP 2 #define VMUSAGE_TYPE_ANON 3 #define VMUSAGE_BOUND_UNKNOWN 0 #define VMUSAGE_BOUND_INCORE 1 #define VMUSAGE_BOUND_NOT_INCORE 2 #define ISWITHIN(node, addr) ((node)->vmb_start <= addr && \ (node)->vmb_end >= addr ? 1 : 0) /* * bounds for vnodes and shared amps * Each bound is either entirely incore, entirely not in core, or * entirely unknown. bounds are stored in an avl tree sorted by start member * when in use, otherwise (free or temporary lists) they're strung * together off of vmb_next. */ typedef struct vmu_bound { avl_node_t vmb_node; struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */ pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */ pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */ char vmb_type; /* One of VMUSAGE_BOUND_* */ } vmu_bound_t; /* * hash of visited objects (vnodes or shared amps) * key is address of vnode or amp. Bounds lists known incore/non-incore * bounds for vnode/amp. */ typedef struct vmu_object { struct vmu_object *vmo_next; /* free list */ caddr_t vmo_key; short vmo_type; avl_tree_t vmo_bounds; } vmu_object_t; /* * Entity by which to count results. * * The entity structure keeps the current rss/swap counts for each entity * (zone, project, etc), and hashes of vm structures that have already * been visited for the entity.
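 *
 * As an illustration of the two lists described below (all IDs are
 * hypothetical): while vmu_calculate_proc() is inspecting a process in
 * zone 1, project 10, the vme_next_calc chain might be
 *
 *	system -> zone 1 -> project 10 -> task 42 -> ruser 100
 *
 * while vme_next links every entity allocated so far for the whole
 * calculation, across all processes already visited.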
* * vme_next: links the list of all entities currently being counted by * vmu_calculate(). * * vme_next_calc: links the list of entities related to the current process * being counted by vmu_calculate_proc(). * * vmu_calculate_proc() walks all processes. For each process, it makes a * list of the entities related to that process using vme_next_calc. This * list changes each time vmu_calculate_proc() is called. * */ typedef struct vmu_entity { struct vmu_entity *vme_next; struct vmu_entity *vme_next_calc; mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ mod_hash_t *vme_anon_hash; /* COW anons visited for entity */ vmusage_t vme_result; /* identifies entity and results */ } vmu_entity_t; /* * Hash of entities visited within a zone, and an entity for the zone * itself. */ typedef struct vmu_zone { struct vmu_zone *vmz_next; /* free list */ id_t vmz_id; vmu_entity_t *vmz_zone; mod_hash_t *vmz_projects_hash; mod_hash_t *vmz_tasks_hash; mod_hash_t *vmz_rusers_hash; mod_hash_t *vmz_eusers_hash; } vmu_zone_t; /* * Cache of results from last calculation */ typedef struct vmu_cache { vmusage_t *vmc_results; /* Results from last call to */ /* vm_getusage(). */ uint64_t vmc_nresults; /* Count of cached results */ uint64_t vmc_refcnt; /* refcnt for free */ uint_t vmc_flags; /* Flags for vm_getusage() */ hrtime_t vmc_timestamp; /* when cache was created */ } vmu_cache_t; /* * top level rss info for the system */ typedef struct vmu_data { kmutex_t vmu_lock; /* Protects vmu_data */ kcondvar_t vmu_cv; /* Used to signal threads */ /* Waiting for */ /* Rss_calc_thread to finish */ vmu_entity_t *vmu_system; /* Entity for tracking */ /* rss/swap for all processes */ /* in all zones */ mod_hash_t *vmu_zones_hash; /* Zones visited */ mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */ mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */ mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */ /* to implement VMUSAGE_COL_* */ /* flags, which aggregate by */ /* project or user regardless */ /* of zoneid. */ mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */ /* to track incore/not-incore */ mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */ /* amps to track incore/not- */ /* incore */ vmu_entity_t *vmu_entities; /* Linked list of entities */ size_t vmu_nentities; /* Count of entities in list */ vmu_cache_t *vmu_cache; /* Cached results */ kthread_t *vmu_calc_thread; /* NULL, or thread running */ /* vmu_calculate() */ uint_t vmu_calc_flags; /* Flags being used by */ /* currently running calc */ /* thread */ uint_t vmu_pending_flags; /* Flags of vm_getusage() */ /* threads waiting for */ /* calc thread to finish */ uint_t vmu_pending_waiters; /* Number of threads waiting */ /* for calc thread */ vmu_bound_t *vmu_free_bounds; vmu_object_t *vmu_free_objects; vmu_entity_t *vmu_free_entities; vmu_zone_t *vmu_free_zones; } vmu_data_t; extern struct as kas; extern proc_t *practive; extern zone_t *global_zone; extern struct seg_ops segvn_ops; extern struct seg_ops segspt_shmops; static vmu_data_t vmu_data; static kmem_cache_t *vmu_bound_cache; static kmem_cache_t *vmu_object_cache; /* * Comparison routine for AVL tree. We base our comparison on vmb_start.
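 * Because in-tree bounds never overlap, comparing vmb_start alone gives
 * a total order; e.g. [0,4] sorts before [5,9]. avl_find() only matches
 * an exact vmb_start, so a lookup that falls inside an existing bound
 * returns NULL, and vmu_insert_lookup_object_bounds() covers that case
 * by testing the AVL_BEFORE neighbor with ISWITHIN().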
*/ static int bounds_cmp(const void *bnd1, const void *bnd2) { const vmu_bound_t *bound1 = bnd1; const vmu_bound_t *bound2 = bnd2; if (bound1->vmb_start == bound2->vmb_start) { return (0); } if (bound1->vmb_start < bound2->vmb_start) { return (-1); } return (1); } /* * Save a bound on the free list. */ static void vmu_free_bound(vmu_bound_t *bound) { bound->vmb_next = vmu_data.vmu_free_bounds; bound->vmb_start = 0; bound->vmb_end = 0; bound->vmb_type = 0; vmu_data.vmu_free_bounds = bound; } /* * Free an object, and all visited bound info. */ static void vmu_free_object(mod_hash_val_t val) { vmu_object_t *obj = (vmu_object_t *)val; avl_tree_t *tree = &(obj->vmo_bounds); vmu_bound_t *bound; void *cookie = NULL; while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL) vmu_free_bound(bound); avl_destroy(tree); obj->vmo_type = 0; obj->vmo_next = vmu_data.vmu_free_objects; vmu_data.vmu_free_objects = obj; } /* * Free an entity, and hashes of visited objects for that entity. */ static void vmu_free_entity(mod_hash_val_t val) { vmu_entity_t *entity = (vmu_entity_t *)val; if (entity->vme_vnode_hash != NULL) i_mod_hash_clear_nosync(entity->vme_vnode_hash); if (entity->vme_amp_hash != NULL) i_mod_hash_clear_nosync(entity->vme_amp_hash); if (entity->vme_anon_hash != NULL) i_mod_hash_clear_nosync(entity->vme_anon_hash); entity->vme_next = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = entity; } /* * Free zone entity, and all hashes of entities inside that zone, * which are projects, tasks, and users. */ static void vmu_free_zone(mod_hash_val_t val) { vmu_zone_t *zone = (vmu_zone_t *)val; if (zone->vmz_zone != NULL) { vmu_free_entity((mod_hash_val_t)zone->vmz_zone); zone->vmz_zone = NULL; } if (zone->vmz_projects_hash != NULL) i_mod_hash_clear_nosync(zone->vmz_projects_hash); if (zone->vmz_tasks_hash != NULL) i_mod_hash_clear_nosync(zone->vmz_tasks_hash); if (zone->vmz_rusers_hash != NULL) i_mod_hash_clear_nosync(zone->vmz_rusers_hash); if (zone->vmz_eusers_hash != NULL) i_mod_hash_clear_nosync(zone->vmz_eusers_hash); zone->vmz_next = vmu_data.vmu_free_zones; vmu_data.vmu_free_zones = zone; } /* * Initialize synchronization primitives and hashes for system-wide tracking * of visited vnodes and shared amps. Initialize results cache. 
*/ void vm_usage_init() { mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL); vmu_data.vmu_system = NULL; vmu_data.vmu_zones_hash = NULL; vmu_data.vmu_projects_col_hash = NULL; vmu_data.vmu_rusers_col_hash = NULL; vmu_data.vmu_eusers_col_hash = NULL; vmu_data.vmu_free_bounds = NULL; vmu_data.vmu_free_objects = NULL; vmu_data.vmu_free_entities = NULL; vmu_data.vmu_free_zones = NULL; vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash( "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (vnode_t)); vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash( "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (struct anon_map)); vmu_data.vmu_projects_col_hash = mod_hash_create_idhash( "vmusage collapsed project hash", VMUSAGE_HASH_SIZE, vmu_free_entity); vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash( "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash( "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); vmu_data.vmu_zones_hash = mod_hash_create_idhash( "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone); vmu_bound_cache = kmem_cache_create("vmu_bound_cache", sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0); vmu_object_cache = kmem_cache_create("vmu_object_cache", sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0); vmu_data.vmu_entities = NULL; vmu_data.vmu_nentities = 0; vmu_data.vmu_cache = NULL; vmu_data.vmu_calc_thread = NULL; vmu_data.vmu_calc_flags = 0; vmu_data.vmu_pending_flags = 0; vmu_data.vmu_pending_waiters = 0; } /* * Allocate hashes for tracking vm objects visited for an entity. * Update list of entities. */ static vmu_entity_t * vmu_alloc_entity(id_t id, int type, id_t zoneid) { vmu_entity_t *entity; if (vmu_data.vmu_free_entities != NULL) { entity = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = vmu_data.vmu_free_entities->vme_next; bzero(&entity->vme_result, sizeof (vmusage_t)); } else { entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP); } entity->vme_result.vmu_id = id; entity->vme_result.vmu_zoneid = zoneid; entity->vme_result.vmu_type = type; if (entity->vme_vnode_hash == NULL) entity->vme_vnode_hash = mod_hash_create_ptrhash( "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (vnode_t)); if (entity->vme_amp_hash == NULL) entity->vme_amp_hash = mod_hash_create_ptrhash( "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (struct anon_map)); if (entity->vme_anon_hash == NULL) entity->vme_anon_hash = mod_hash_create_ptrhash( "vmusage anon hash", VMUSAGE_HASH_SIZE, mod_hash_null_valdtor, sizeof (struct anon)); entity->vme_next = vmu_data.vmu_entities; vmu_data.vmu_entities = entity; vmu_data.vmu_nentities++; return (entity); } /* * Allocate a zone entity, and hashes for tracking visited vm objects * for projects, tasks, and users within that zone.
*/ static vmu_zone_t * vmu_alloc_zone(id_t id) { vmu_zone_t *zone; if (vmu_data.vmu_free_zones != NULL) { zone = vmu_data.vmu_free_zones; vmu_data.vmu_free_zones = vmu_data.vmu_free_zones->vmz_next; zone->vmz_next = NULL; zone->vmz_zone = NULL; } else { zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP); } zone->vmz_id = id; if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL) zone->vmz_projects_hash = mod_hash_create_idhash( "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity); if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) != 0 && zone->vmz_tasks_hash == NULL) zone->vmz_tasks_hash = mod_hash_create_idhash( "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity); if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) != 0 && zone->vmz_rusers_hash == NULL) zone->vmz_rusers_hash = mod_hash_create_idhash( "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) != 0 && zone->vmz_eusers_hash == NULL) zone->vmz_eusers_hash = mod_hash_create_idhash( "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity); return (zone); } /* * Allocate a structure for tracking visited bounds for a vm object. */ static vmu_object_t * vmu_alloc_object(caddr_t key, int type) { vmu_object_t *object; if (vmu_data.vmu_free_objects != NULL) { object = vmu_data.vmu_free_objects; vmu_data.vmu_free_objects = vmu_data.vmu_free_objects->vmo_next; } else { object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP); } object->vmo_next = NULL; object->vmo_key = key; object->vmo_type = type; avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0); return (object); } /* * Allocate and return a bound structure. */ static vmu_bound_t * vmu_alloc_bound() { vmu_bound_t *bound; if (vmu_data.vmu_free_bounds != NULL) { bound = vmu_data.vmu_free_bounds; vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next; } else { bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP); } bound->vmb_next = NULL; bound->vmb_start = 0; bound->vmb_end = 0; bound->vmb_type = 0; return (bound); } /* * vmu_find_insert_* functions implement hash lookup or allocate and * insert operations. 
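 *
 * A minimal sketch of the find-or-insert pattern (the key and type here
 * are illustrative): the _nosync hash variants are safe because only a
 * single calc thread runs the calculation at a time, serialized through
 * vmu_data.vmu_lock.
 *
 *	vmu_object_t *obj;
 *
 *	obj = vmu_find_insert_object(vmu_data.vmu_all_vnodes_hash,
 *	    (caddr_t)vp, VMUSAGE_TYPE_VNODE);
 *
 * obj is either the object previously inserted for vp, or a freshly
 * allocated one that is now present in the hash.
 */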
static vmu_object_t * vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) { int ret; vmu_object_t *object; ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, (mod_hash_val_t *)&object); if (ret != 0) { object = vmu_alloc_object(key, type); ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, (mod_hash_val_t)object, (mod_hash_hndl_t)0); ASSERT(ret == 0); } return (object); } static int vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) { int ret; caddr_t val; ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, (mod_hash_val_t *)&val); if (ret == 0) return (0); ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, (mod_hash_val_t)key, (mod_hash_hndl_t)0); ASSERT(ret == 0); return (1); } static vmu_entity_t * vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid) { int ret; vmu_entity_t *entity; ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t *)&entity); if (ret != 0) { entity = vmu_alloc_entity(id, type, zoneid); ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity, (mod_hash_hndl_t)0); ASSERT(ret == 0); } return (entity); } /* * Returns list of object bounds between start and end. New bounds inserted * by this call are given type. * * Returns the number of pages covered if new bounds are created. Returns 0 * if region between start/end consists of all existing bounds. */ static pgcnt_t vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t end, char type, vmu_bound_t **first, vmu_bound_t **last) { avl_tree_t *tree = &(ro->vmo_bounds); avl_index_t where; vmu_bound_t *walker, *tmp; pgcnt_t ret = 0; ASSERT(start <= end); *first = *last = NULL; tmp = vmu_alloc_bound(); tmp->vmb_start = start; tmp->vmb_type = type; /* Hopelessly optimistic case. */ if ((walker = avl_find(tree, tmp, &where)) != NULL) { /* We got lucky. */ vmu_free_bound(tmp); *first = walker; } if (walker == NULL) { /* Is start in the previous node? */ walker = avl_nearest(tree, where, AVL_BEFORE); if (walker != NULL) { if (ISWITHIN(walker, start)) { /* We found start. */ vmu_free_bound(tmp); *first = walker; } } } /* * At this point, if *first is still NULL, then we * didn't get a direct hit and start isn't covered * by the previous node. We know that the next node * must have a greater start value than we require * because avl_find tells us where the AVL routines would * insert our new node. We have some gap between the * start we want and the next node. */ if (*first == NULL) { walker = avl_nearest(tree, where, AVL_AFTER); if (walker != NULL && walker->vmb_start <= end) { /* Fill the gap. */ tmp->vmb_end = walker->vmb_start - 1; *first = tmp; } else { /* We have a gap over [start, end]. */ tmp->vmb_end = end; *first = *last = tmp; } ret += tmp->vmb_end - tmp->vmb_start + 1; avl_insert(tree, tmp, where); } ASSERT(*first != NULL); if (*last != NULL) { /* We're done. */ return (ret); } /* * If we are here we still need to set *last and * that may involve filling in some gaps. */ *last = *first; for (;;) { if (ISWITHIN(*last, end)) { /* We're done. */ break; } walker = AVL_NEXT(tree, *last); if (walker == NULL || walker->vmb_start > end) { /* Bottom or mid tree with gap. */ tmp = vmu_alloc_bound(); tmp->vmb_start = (*last)->vmb_end + 1; tmp->vmb_end = end; tmp->vmb_type = type; ret += tmp->vmb_end - tmp->vmb_start + 1; avl_insert_here(tree, tmp, *last, AVL_AFTER); *last = tmp; break; } else { if ((*last)->vmb_end + 1 != walker->vmb_start) { /* Non-contiguous.
*/ tmp = vmu_alloc_bound(); tmp->vmb_start = (*last)->vmb_end + 1; tmp->vmb_end = walker->vmb_start - 1; tmp->vmb_type = type; ret += tmp->vmb_end - tmp->vmb_start + 1; avl_insert_here(tree, tmp, *last, AVL_AFTER); *last = tmp; } else { *last = walker; } } } return (ret); } /* * vmu_update_bounds() * * tree: avl_tree in which first and last hang. * * first, last: list of continuous bounds, of which zero or more are of * type VMUSAGE_BOUND_UNKNOWN. * * new_tree: avl_tree in which new_first and new_last hang. * * new_first, new_last: list of continuous bounds, of which none are of * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to * update the types of bounds in (first,last) with * type VMUSAGE_BOUND_UNKNOWN. * * For the list of bounds (first,last), this function updates any bounds * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in * the list (new_first, new_last). * * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list * (new_first, new_last), it will be split into multiple bounds. * * Return value: * The number of pages in the list of bounds (first,last) that were of * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type * VMUSAGE_BOUND_INCORE. * */ static pgcnt_t vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last, avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last) { vmu_bound_t *next, *new_next, *tmp; pgcnt_t rss = 0; next = *first; new_next = new_first; /* * Verify first and last bound are covered by new bounds if they * have unknown type. */ ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN || (*first)->vmb_start >= new_first->vmb_start); ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN || (*last)->vmb_end <= new_last->vmb_end); for (;;) { /* If bound already has type, proceed to next bound. */ if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { if (next == *last) break; next = AVL_NEXT(tree, next); continue; } while (new_next->vmb_end < next->vmb_start) new_next = AVL_NEXT(new_tree, new_next); ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN); next->vmb_type = new_next->vmb_type; if (new_next->vmb_end < next->vmb_end) { /* need to split bound */ tmp = vmu_alloc_bound(); tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN; tmp->vmb_start = new_next->vmb_end + 1; tmp->vmb_end = next->vmb_end; avl_insert_here(tree, tmp, next, AVL_AFTER); next->vmb_end = new_next->vmb_end; if (*last == next) *last = tmp; if (next->vmb_type == VMUSAGE_BOUND_INCORE) rss += next->vmb_end - next->vmb_start + 1; next = tmp; } else { if (next->vmb_type == VMUSAGE_BOUND_INCORE) rss += next->vmb_end - next->vmb_start + 1; if (next == *last) break; next = AVL_NEXT(tree, next); } } return (rss); } /* * Merges adjacent bounds with same type between first and last bound. * After merge, last pointer may point to a different bound, as (incoming) * last bound may have been merged away. 
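 *
 * For example (hypothetical bounds): [0,4]/INCORE, [5,9]/INCORE,
 * [10,12]/NOT_INCORE merge to [0,9]/INCORE, [10,12]/NOT_INCORE; a
 * caller whose *last pointed at [5,9] gets it updated to the merged
 * [0,9] bound.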
*/ static void vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last) { vmu_bound_t *current; vmu_bound_t *next; ASSERT(tree != NULL); ASSERT(*first != NULL); ASSERT(*last != NULL); current = *first; while (current != *last) { next = AVL_NEXT(tree, current); if ((current->vmb_end + 1) == next->vmb_start && current->vmb_type == next->vmb_type) { current->vmb_end = next->vmb_end; avl_remove(tree, next); vmu_free_bound(next); if (next == *last) { *last = current; } } else { current = AVL_NEXT(tree, current); } } } /* * Given an amp and a list of bounds, updates each bound's type with * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE. * * If a bound is partially incore, it will be split into two bounds. * first and last may be modified, as bounds may be split into multiple * bounds if they are partially incore/not-incore. * * Set incore to non-zero if bounds are already known to be incore. * */ static void vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, vmu_bound_t **first, vmu_bound_t **last, boolean_t incore) { vmu_bound_t *next; vmu_bound_t *tmp; pgcnt_t index; short bound_type; short page_type; vnode_t *vn; anoff_t off; struct anon *ap; next = *first; /* Shared anon slots don't change once set. */ ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER); for (;;) { if (incore == B_TRUE) next->vmb_type = VMUSAGE_BOUND_INCORE; if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { if (next == *last) break; next = AVL_NEXT(tree, next); continue; } bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { /* * These are used to determine how much to increment * index when a large page is found. */ page_t *page; pgcnt_t pgcnt = 1; uint_t pgshft; pgcnt_t pgmsk; ap = anon_get_ptr(amp->ahp, index); if (ap != NULL) swap_xlate(ap, &vn, &off); if (ap != NULL && vn != NULL && vn->v_pages != NULL && (page = page_exists(vn, off)) != NULL) { page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1; } } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page * type, need to split off new bound. */ tmp = vmu_alloc_bound(); tmp->vmb_type = page_type; tmp->vmb_start = index; tmp->vmb_end = next->vmb_end; avl_insert_here(tree, tmp, next, AVL_AFTER); next->vmb_end = index - 1; if (*last == next) *last = tmp; next = tmp; } if (pgcnt > 1) { /* * If inside large page, jump to next large * page */ index = (index & ~pgmsk) + pgcnt; } else { index++; } } if (next == *last) { ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); break; } else next = AVL_NEXT(tree, next); } ANON_LOCK_EXIT(&amp->a_rwlock); } /* * Same as vmu_amp_update_incore_bounds(), except for tracking * incore-/not-incore for vnodes. */ static void vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, vmu_bound_t **first, vmu_bound_t **last) { vmu_bound_t *next; vmu_bound_t *tmp; pgcnt_t index; short bound_type; short page_type; next = *first; for (;;) { if (vnode->v_pages == NULL) next->vmb_type = VMUSAGE_BOUND_NOT_INCORE; if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) { if (next == *last) break; next = AVL_NEXT(tree, next); continue; } bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { /* * These are used to determine how much to increment * index when a large page is found.
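 * For example, with 4K base pages and a 2M large page, pgcnt is 512 and
 * pgmsk is 0x1ff, so the "index = (index & ~pgmsk) + pgcnt" step below
 * jumps from any page within the large page to the first page of the
 * next large page (example sizes; actual sizes are platform-dependent).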
*/ page_t *page; pgcnt_t pgcnt = 1; uint_t pgshft; pgcnt_t pgmsk; if (vnode->v_pages != NULL && (page = page_exists(vnode, ptob(index))) != NULL) { page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1; } } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page * type, need to split off new bound. */ tmp = vmu_alloc_bound(); tmp->vmb_type = page_type; tmp->vmb_start = index; tmp->vmb_end = next->vmb_end; avl_insert_here(tree, tmp, next, AVL_AFTER); next->vmb_end = index - 1; if (*last == next) *last = tmp; next = tmp; } if (pgcnt > 1) { /* * If inside large page, jump to next large * page */ index = (index & ~pgmsk) + pgcnt; } else { index++; } } if (next == *last) { ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN); break; } else next = AVL_NEXT(tree, next); } } /* * Calculate the rss and swap consumed by a segment. vmu_entities is the * list of entities to visit. For shared segments, the vnode or amp * is looked up in each entity to see if it has already been counted. Private * anon pages are checked per entity to ensure that COW pages are not * double counted. * * For private mapped files, first the amp is checked for private pages. * Bounds not backed by the amp are looked up in the vnode for each entity * to avoid double counting of private COW vnode pages. */ static void vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) { struct segvn_data *svd; struct shm_data *shmd; struct spt_data *sptd; vmu_object_t *shared_object = NULL; vmu_object_t *entity_object = NULL; vmu_entity_t *entity; vmusage_t *result; vmu_bound_t *first = NULL; vmu_bound_t *last = NULL; vmu_bound_t *cur = NULL; vmu_bound_t *e_first = NULL; vmu_bound_t *e_last = NULL; vmu_bound_t *tmp; pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt; struct anon_map *private_amp = NULL; boolean_t incore = B_FALSE; boolean_t shared = B_FALSE; int file = 0; pgcnt_t swresv = 0; pgcnt_t panon = 0; /* Can zero-length segments exist? Not sure, so paranoia. */ if (seg->s_size <= 0) return; /* * Figure out if there is a shared object (such as a named vnode or * a shared amp), then figure out if there is a private amp, which * identifies private pages. */ if (seg->s_ops == &segvn_ops) { svd = (struct segvn_data *)seg->s_data; if (svd->type == MAP_SHARED) { shared = B_TRUE; } else { swresv = svd->swresv; if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_READER) != 0) { /* * Text replication anon maps can be shared * across all zones. Space used for text * replication is typically capped as a small % * of memory. To keep it simple for now we * don't account for swap and memory space used * for text replication.
*/ if (svd->tr_state == SEGVN_TR_OFF && svd->amp != NULL) { private_amp = svd->amp; p_start = svd->anon_index; p_end = svd->anon_index + btop(seg->s_size) - 1; } SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); } } if (svd->vp != NULL) { file = 1; shared_object = vmu_find_insert_object( vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp, VMUSAGE_TYPE_VNODE); s_start = btop(svd->offset); s_end = btop(svd->offset + seg->s_size) - 1; } if (svd->amp != NULL && svd->type == MAP_SHARED) { ASSERT(shared_object == NULL); shared_object = vmu_find_insert_object( vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp, VMUSAGE_TYPE_AMP); s_start = svd->anon_index; s_end = svd->anon_index + btop(seg->s_size) - 1; /* schedctl mappings are always in core */ if (svd->amp->swresv == 0) incore = B_TRUE; } } else if (seg->s_ops == &segspt_shmops) { shared = B_TRUE; shmd = (struct shm_data *)seg->s_data; shared_object = vmu_find_insert_object( vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp, VMUSAGE_TYPE_AMP); s_start = 0; s_end = btop(seg->s_size) - 1; sptd = shmd->shm_sptseg->s_data; /* ism segments are always incore and do not reserve swap */ if (sptd->spt_flags & SHM_SHARE_MMU) incore = B_TRUE; } else { return; } /* * If there is a private amp, count anon pages that exist. If an * anon has a refcnt > 1 (COW sharing), then save the anon in a * hash so that it is not double counted. * * If there is also a shared object, then figure out the bounds * which are not mapped by the private amp. */ if (private_amp != NULL) { /* Enter as writer to prevent COW anons from being freed */ ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER); p_index = p_start; s_index = s_start; while (p_index <= p_end) { pgcnt_t p_index_next; pgcnt_t p_bound_size; int cnt; anoff_t off; struct vnode *vn; struct anon *ap; page_t *page; /* For handling of large */ pgcnt_t pgcnt = 1; /* pages */ pgcnt_t pgstart; pgcnt_t pgend; uint_t pgshft; pgcnt_t pgmsk; p_index_next = p_index; ap = anon_get_next_ptr(private_amp->ahp, &p_index_next); /* * If next anon is past end of mapping, simulate * end of anon so loop terminates. */ if (p_index_next > p_end) { p_index_next = p_end + 1; ap = NULL; } /* * For COW segments, keep track of bounds not * backed by private amp so they can be looked * up in the backing vnode */ if (p_index_next != p_index) { /* * Compute index difference between anon and * previous anon. */ p_bound_size = p_index_next - p_index - 1; if (shared_object != NULL) { cur = vmu_alloc_bound(); cur->vmb_start = s_index; cur->vmb_end = s_index + p_bound_size; cur->vmb_type = VMUSAGE_BOUND_UNKNOWN; if (first == NULL) { first = cur; last = cur; } else { last->vmb_next = cur; last = cur; } } p_index = p_index + p_bound_size + 1; s_index = s_index + p_bound_size + 1; } /* Detect end of anons in amp */ if (ap == NULL) break; cnt = ap->an_refcnt; swap_xlate(ap, &vn, &off); if (vn == NULL || vn->v_pages == NULL || (page = page_exists(vn, off)) == NULL) { p_index++; s_index++; continue; } /* * If large page is found, compute portion of large * page in mapping, and increment indices to the next * large page. */ if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1; /* First page in large page */ pgstart = p_index & ~pgmsk; /* Last page in large page */ pgend = pgstart + pgcnt - 1; /* * Artificially end page if page extends past * end of mapping. */ if (pgend > p_end) pgend = p_end; /* * Compute number of pages from large page * which are mapped.
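 * E.g. with hypothetical 512-page large pages: p_index = 700 gives
 * pgstart = 512 and pgend = 1023; if the mapping ends at p_end = 900,
 * pgend is clamped to 900 and the count below is 900 - 700 + 1 = 201.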
*/ pgcnt = pgend - p_index + 1; /* * Point indices at page after large page, * or at page after end of mapping. */ p_index += pgcnt; s_index += pgcnt; } else { p_index++; s_index++; } /* * Assume anon structs with a refcnt * of 1 are not COW shared, so there * is no reason to track them per entity. */ if (cnt == 1) { panon += pgcnt; continue; } for (entity = vmu_entities; entity != NULL; entity = entity->vme_next_calc) { result = &entity->vme_result; /* * Track COW anons per entity so * they are not double counted. */ if (vmu_find_insert_anon(entity->vme_anon_hash, (caddr_t)ap) == 0) continue; result->vmu_rss_all += (pgcnt << PAGESHIFT); result->vmu_rss_private += (pgcnt << PAGESHIFT); } } ANON_LOCK_EXIT(&private_amp->a_rwlock); } /* Add up resident anon and swap reserved for private mappings */ if (swresv > 0 || panon > 0) { for (entity = vmu_entities; entity != NULL; entity = entity->vme_next_calc) { result = &entity->vme_result; result->vmu_swap_all += swresv; result->vmu_swap_private += swresv; result->vmu_rss_all += (panon << PAGESHIFT); result->vmu_rss_private += (panon << PAGESHIFT); } } /* Compute resident pages backing shared amp or named vnode */ if (shared_object != NULL) { avl_tree_t *tree = &(shared_object->vmo_bounds); if (first == NULL) { /* * No private amp, or private amp has no anon * structs. This means entire segment is backed by * the shared object. */ first = vmu_alloc_bound(); first->vmb_start = s_start; first->vmb_end = s_end; first->vmb_type = VMUSAGE_BOUND_UNKNOWN; } /* * Iterate bounds not backed by private amp, and compute * resident pages. */ cur = first; while (cur != NULL) { if (vmu_insert_lookup_object_bounds(shared_object, cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN, &first, &last) > 0) { /* new bounds, find incore/not-incore */ if (shared_object->vmo_type == VMUSAGE_TYPE_VNODE) { vmu_vnode_update_incore_bounds( tree, (vnode_t *) shared_object->vmo_key, &first, &last); } else { vmu_amp_update_incore_bounds( tree, (struct anon_map *) shared_object->vmo_key, &first, &last, incore); } vmu_merge_bounds(tree, &first, &last); } for (entity = vmu_entities; entity != NULL; entity = entity->vme_next_calc) { avl_tree_t *e_tree; result = &entity->vme_result; entity_object = vmu_find_insert_object( shared_object->vmo_type == VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash: entity->vme_amp_hash, shared_object->vmo_key, shared_object->vmo_type); virt = vmu_insert_lookup_object_bounds( entity_object, cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last); if (virt == 0) continue; /* * Range visited for this entity */ e_tree = &(entity_object->vmo_bounds); rss = vmu_update_bounds(e_tree, &e_first, &e_last, tree, first, last); result->vmu_rss_all += (rss << PAGESHIFT); if (shared == B_TRUE && file == B_FALSE) { /* shared anon mapping */ result->vmu_swap_all += (virt << PAGESHIFT); result->vmu_swap_shared += (virt << PAGESHIFT); result->vmu_rss_shared += (rss << PAGESHIFT); } else if (shared == B_TRUE && file == B_TRUE) { /* shared file mapping */ result->vmu_rss_shared += (rss << PAGESHIFT); } else if (shared == B_FALSE && file == B_TRUE) { /* private file mapping */ result->vmu_rss_private += (rss << PAGESHIFT); } vmu_merge_bounds(e_tree, &e_first, &e_last); } tmp = cur; cur = cur->vmb_next; vmu_free_bound(tmp); } } } /* * Based on the current calculation flags, find the entities that are * relevant to the process. Then calculate each segment * in the process's address space for each relevant entity.
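 *
 * For instance (hypothetical flags and IDs), with VMUSAGE_SYSTEM,
 * VMUSAGE_ALL_ZONES and VMUSAGE_COL_RUSERS in effect, a process in
 * zone 2 running as ruser 100 is counted against three entities: the
 * system entity, the zone 2 entity, and the collapsed ruser 100 entity
 * (zoneid ALL_ZONES).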
*/ static void vmu_calculate_proc(proc_t *p) { vmu_entity_t *entities = NULL; vmu_zone_t *zone; vmu_entity_t *tmp; struct as *as; struct seg *seg; int ret; /* Figure out which entities are being computed */ if ((vmu_data.vmu_system) != NULL) { tmp = vmu_data.vmu_system; tmp->vme_next_calc = entities; entities = tmp; } if (vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, (mod_hash_val_t *)&zone); if (ret != 0) { zone = vmu_alloc_zone(p->p_zone->zone_id); ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id, (mod_hash_val_t)zone, (mod_hash_hndl_t)0); ASSERT(ret == 0); } if (zone->vmz_zone != NULL) { tmp = zone->vmz_zone; tmp->vme_next_calc = entities; entities = tmp; } if (vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) { tmp = vmu_find_insert_entity(zone->vmz_projects_hash, p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, zone->vmz_id); tmp->vme_next_calc = entities; entities = tmp; } if (vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) { tmp = vmu_find_insert_entity(zone->vmz_tasks_hash, p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id); tmp->vme_next_calc = entities; entities = tmp; } if (vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) { tmp = vmu_find_insert_entity(zone->vmz_rusers_hash, crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id); tmp->vme_next_calc = entities; entities = tmp; } if (vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { tmp = vmu_find_insert_entity(zone->vmz_eusers_hash, crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id); tmp->vme_next_calc = entities; entities = tmp; } } /* Entities which collapse projects and users for all zones */ if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) { tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash, p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES); tmp->vme_next_calc = entities; entities = tmp; } if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) { tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash, crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES); tmp->vme_next_calc = entities; entities = tmp; } if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) { tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash, crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES); tmp->vme_next_calc = entities; entities = tmp; } ASSERT(entities != NULL); /* process all segs in process's address space */ as = p->p_as; AS_LOCK_ENTER(as, RW_READER); for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { vmu_calculate_seg(entities, seg); } AS_LOCK_EXIT(as); } /* * Free data created by previous call to vmu_calculate(). 
*/ static void vmu_clear_calc() { if (vmu_data.vmu_system != NULL) { vmu_free_entity(vmu_data.vmu_system); vmu_data.vmu_system = NULL; } if (vmu_data.vmu_zones_hash != NULL) i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash); if (vmu_data.vmu_projects_col_hash != NULL) i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash); if (vmu_data.vmu_rusers_col_hash != NULL) i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash); if (vmu_data.vmu_eusers_col_hash != NULL) i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash); i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash); i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash); } /* * Free unused data structures. These can result if the system workload * decreases between calculations. */ static void vmu_free_extra() { vmu_bound_t *tb; vmu_object_t *to; vmu_entity_t *te; vmu_zone_t *tz; while (vmu_data.vmu_free_bounds != NULL) { tb = vmu_data.vmu_free_bounds; vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next; kmem_cache_free(vmu_bound_cache, tb); } while (vmu_data.vmu_free_objects != NULL) { to = vmu_data.vmu_free_objects; vmu_data.vmu_free_objects = vmu_data.vmu_free_objects->vmo_next; kmem_cache_free(vmu_object_cache, to); } while (vmu_data.vmu_free_entities != NULL) { te = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = vmu_data.vmu_free_entities->vme_next; if (te->vme_vnode_hash != NULL) mod_hash_destroy_hash(te->vme_vnode_hash); if (te->vme_amp_hash != NULL) mod_hash_destroy_hash(te->vme_amp_hash); if (te->vme_anon_hash != NULL) mod_hash_destroy_hash(te->vme_anon_hash); kmem_free(te, sizeof (vmu_entity_t)); } while (vmu_data.vmu_free_zones != NULL) { tz = vmu_data.vmu_free_zones; vmu_data.vmu_free_zones = vmu_data.vmu_free_zones->vmz_next; if (tz->vmz_projects_hash != NULL) mod_hash_destroy_hash(tz->vmz_projects_hash); if (tz->vmz_tasks_hash != NULL) mod_hash_destroy_hash(tz->vmz_tasks_hash); if (tz->vmz_rusers_hash != NULL) mod_hash_destroy_hash(tz->vmz_rusers_hash); if (tz->vmz_eusers_hash != NULL) mod_hash_destroy_hash(tz->vmz_eusers_hash); kmem_free(tz, sizeof (vmu_zone_t)); } } extern kcondvar_t *pr_pid_cv; /* * Determine which entity types are relevant and allocate the hashes to * track them. Then walk the process table and count rss and swap * for each process's address space. Address space objects such as * vnodes, amps and anons are tracked per entity, so that they are * not double counted in the results. * */ static void vmu_calculate() { int i = 0; int ret; proc_t *p; vmu_clear_calc(); if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM) vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, ALL_ZONES); /* * Walk process table and calculate rss of each proc. * * Pidlock and p_lock cannot be held while doing the rss calculation. * This is because: * 1. The calculation allocates using KM_SLEEP. * 2. The calculation grabs a_lock, which cannot be grabbed * after p_lock. * * Since pidlock must be dropped, we cannot simply just walk the * practive list. Instead, we walk the process table, and sprlock * each process to ensure that it does not exit during the * calculation. */ mutex_enter(&pidlock); for (i = 0; i < v.v_proc; i++) { again: p = pid_entry(i); if (p == NULL) continue; mutex_enter(&p->p_lock); mutex_exit(&pidlock); if (panicstr) { mutex_exit(&p->p_lock); return; } /* Try to set P_PR_LOCK */ ret = sprtrylock_proc(p); if (ret == -1) { /* Process in invalid state */ mutex_exit(&p->p_lock); mutex_enter(&pidlock); continue; } else if (ret == 1) { /* * P_PR_LOCK is already set. Wait and try again.
* This also drops p_lock. */ sprwaitlock_proc(p); mutex_enter(&pidlock); goto again; } mutex_exit(&p->p_lock); vmu_calculate_proc(p); mutex_enter(&p->p_lock); sprunlock(p); mutex_enter(&pidlock); } mutex_exit(&pidlock); vmu_free_extra(); } /* * allocate a new cache for N results satisfying flags */ vmu_cache_t * vmu_cache_alloc(size_t nres, uint_t flags) { vmu_cache_t *cache; cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP); cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP); cache->vmc_nresults = nres; cache->vmc_flags = flags; cache->vmc_refcnt = 1; return (cache); } /* * Make sure cached results are not freed */ static void vmu_cache_hold(vmu_cache_t *cache) { ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); cache->vmc_refcnt++; } /* * free cache data */ static void vmu_cache_rele(vmu_cache_t *cache) { ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); ASSERT(cache->vmc_refcnt > 0); cache->vmc_refcnt--; if (cache->vmc_refcnt == 0) { kmem_free(cache->vmc_results, sizeof (vmusage_t) * cache->vmc_nresults); kmem_free(cache, sizeof (vmu_cache_t)); } } /* * Copy out the cached results to a caller. Inspect the caller's flags * and zone to determine which cached results should be copied. */ static int vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, uint_t flags, int cpflg) { vmusage_t *result, *out_result; vmusage_t dummy; size_t i, count = 0; size_t bufsize; int ret = 0; uint_t types = 0; if (nres != NULL) { if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg)) return (set_errno(EFAULT)); } else { bufsize = 0; } /* figure out what results the caller is interested in. */ if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) types |= VMUSAGE_SYSTEM; if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) types |= VMUSAGE_ZONE; if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) types |= VMUSAGE_PROJECTS; if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) types |= VMUSAGE_TASKS; if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) types |= VMUSAGE_RUSERS; if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) types |= VMUSAGE_EUSERS; /* count results for current zone */ out_result = buf; for (result = cache->vmc_results, i = 0; i < cache->vmc_nresults; result++, i++) { /* Do not return "other-zone" results to non-global zones */ if (curproc->p_zone != global_zone && curproc->p_zone->zone_id != result->vmu_zoneid) continue; /* * If non-global zone requests VMUSAGE_SYSTEM, fake * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
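 * E.g. a caller in (hypothetical) zone 3 passing VMUSAGE_SYSTEM gets its
 * own zone's totals back as a result with vmu_type = VMUSAGE_SYSTEM,
 * vmu_id = 0 and vmu_zoneid = ALL_ZONES, built in "dummy" below.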
*/ if (curproc->p_zone != global_zone && (flags & VMUSAGE_SYSTEM) != 0 && result->vmu_type == VMUSAGE_ZONE) { count++; if (out_result != NULL) { if (bufsize < count) { ret = set_errno(EOVERFLOW); } else { dummy = *result; dummy.vmu_zoneid = ALL_ZONES; dummy.vmu_id = 0; dummy.vmu_type = VMUSAGE_SYSTEM; if (ddi_copyout(&dummy, out_result, sizeof (vmusage_t), cpflg)) return (set_errno(EFAULT)); out_result++; } } } /* Skip results that do not match requested type */ if ((result->vmu_type & types) == 0) continue; /* Skip collated results if not requested */ if (result->vmu_zoneid == ALL_ZONES) { if (result->vmu_type == VMUSAGE_PROJECTS && (flags & VMUSAGE_COL_PROJECTS) == 0) continue; if (result->vmu_type == VMUSAGE_EUSERS && (flags & VMUSAGE_COL_EUSERS) == 0) continue; if (result->vmu_type == VMUSAGE_RUSERS && (flags & VMUSAGE_COL_RUSERS) == 0) continue; } /* Skip "other zone" results if not requested */ if (result->vmu_zoneid != curproc->p_zone->zone_id) { if (result->vmu_type == VMUSAGE_ZONE && (flags & VMUSAGE_ALL_ZONES) == 0) continue; if (result->vmu_type == VMUSAGE_PROJECTS && (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) == 0) continue; if (result->vmu_type == VMUSAGE_TASKS && (flags & VMUSAGE_ALL_TASKS) == 0) continue; if (result->vmu_type == VMUSAGE_RUSERS && (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) == 0) continue; if (result->vmu_type == VMUSAGE_EUSERS && (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) == 0) continue; } count++; if (out_result != NULL) { if (bufsize < count) { ret = set_errno(EOVERFLOW); } else { if (ddi_copyout(result, out_result, sizeof (vmusage_t), cpflg)) return (set_errno(EFAULT)); out_result++; } } } if (nres != NULL) if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg)) return (set_errno(EFAULT)); return (ret); } /* * vm_getusage() * * Counts rss and swap by zone, project, task, and/or user. The flags argument * determines the type of results structures returned. Flags requesting * results from more than one zone are "flattened" to the local zone if the * caller is not the global zone. * * args: * flags: bitmap consisting of one or more of VMUSAGE_*. * age: maximum allowable age (time since counting was done) in * seconds of the results. Results from previous callers are * cached in kernel. * buf: pointer to buffer array of vmusage_t. If NULL, then only *nres * is set on success. * nres: Set to number of vmusage_t structures pointed to by buf * before calling vm_getusage(). * On return 0 (success) or EOVERFLOW, it is set to the number of result * structures returned or attempted to return. * * returns 0 on success, -1 on failure: * EINTR (interrupted) * EOVERFLOW (nres too small for results, nres set to needed value for success) * EINVAL (flags invalid) * EFAULT (bad address for buf or nres) */ int vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) { vmu_entity_t *entity; vmusage_t *result; int ret = 0; int cacherecent = 0; hrtime_t now; uint_t flags_orig; /* * Non-global zones cannot request system wide and/or collated * results, or the system result, so munge the flags accordingly.
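 * For example, a non-global-zone caller passing
 * (VMUSAGE_ALL_PROJECTS | VMUSAGE_SYSTEM) proceeds as if it had passed
 * (VMUSAGE_PROJECTS | VMUSAGE_ZONE); flags_orig below preserves the
 * original request for vmu_copyout_results().
 */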
*/ flags_orig = flags; if (curproc->p_zone != global_zone) { if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) { flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS); flags |= VMUSAGE_PROJECTS; } if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) { flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS); flags |= VMUSAGE_RUSERS; } if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) { flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS); flags |= VMUSAGE_EUSERS; } if (flags & VMUSAGE_SYSTEM) { flags &= ~VMUSAGE_SYSTEM; flags |= VMUSAGE_ZONE; } } /* Check for unknown flags */ if ((flags & (~VMUSAGE_MASK)) != 0) return (set_errno(EINVAL)); /* Check for no flags */ if ((flags & VMUSAGE_MASK) == 0) return (set_errno(EINVAL)); mutex_enter(&vmu_data.vmu_lock); now = gethrtime(); start: if (vmu_data.vmu_cache != NULL) { vmu_cache_t *cache; if ((vmu_data.vmu_cache->vmc_timestamp + ((hrtime_t)age * NANOSEC)) > now) cacherecent = 1; if ((vmu_data.vmu_cache->vmc_flags & flags) == flags && cacherecent == 1) { cache = vmu_data.vmu_cache; vmu_cache_hold(cache); mutex_exit(&vmu_data.vmu_lock); ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); if (vmu_data.vmu_pending_waiters > 0) cv_broadcast(&vmu_data.vmu_cv); mutex_exit(&vmu_data.vmu_lock); return (ret); } /* * If the cache is recent, it is likely that there are other * consumers of vm_getusage running, so add their flags to the * desired flags for the calculation. */ if (cacherecent == 1) flags = vmu_data.vmu_cache->vmc_flags | flags; } if (vmu_data.vmu_calc_thread == NULL) { vmu_cache_t *cache; vmu_data.vmu_calc_thread = curthread; vmu_data.vmu_calc_flags = flags; vmu_data.vmu_entities = NULL; vmu_data.vmu_nentities = 0; if (vmu_data.vmu_pending_waiters > 0) vmu_data.vmu_calc_flags |= vmu_data.vmu_pending_flags; vmu_data.vmu_pending_flags = 0; mutex_exit(&vmu_data.vmu_lock); vmu_calculate(); mutex_enter(&vmu_data.vmu_lock); /* copy results to cache */ if (vmu_data.vmu_cache != NULL) vmu_cache_rele(vmu_data.vmu_cache); cache = vmu_data.vmu_cache = vmu_cache_alloc(vmu_data.vmu_nentities, vmu_data.vmu_calc_flags); result = cache->vmc_results; for (entity = vmu_data.vmu_entities; entity != NULL; entity = entity->vme_next) { *result = entity->vme_result; result++; } cache->vmc_timestamp = gethrtime(); vmu_cache_hold(cache); vmu_data.vmu_calc_flags = 0; vmu_data.vmu_calc_thread = NULL; if (vmu_data.vmu_pending_waiters > 0) cv_broadcast(&vmu_data.vmu_cv); mutex_exit(&vmu_data.vmu_lock); /* copy cache */ ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); mutex_exit(&vmu_data.vmu_lock); return (ret); } vmu_data.vmu_pending_flags |= flags; vmu_data.vmu_pending_waiters++; while (vmu_data.vmu_calc_thread != NULL) { if (cv_wait_sig(&vmu_data.vmu_cv, &vmu_data.vmu_lock) == 0) { vmu_data.vmu_pending_waiters--; mutex_exit(&vmu_data.vmu_lock); return (set_errno(EINTR)); } } vmu_data.vmu_pending_waiters--; goto start; }
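/*
 * For reference, a minimal userland sketch of driving this system call
 * through its getvmusage(2) libc wrapper (not part of this file; flag
 * choice and error handling are illustrative only). It shows the
 * two-step sizing protocol: with buf == NULL only *nres is set, to the
 * number of available results, and a second call copies them out. An
 * age of 5 seconds lets either call be satisfied from results cached by
 * another recent caller.
 *
 *	size_t nres = 0;
 *	vmusage_t *buf;
 *
 *	if (getvmusage(VMUSAGE_ALL_ZONES, 5, NULL, &nres) != 0)
 *		return (-1);
 *	if ((buf = malloc(nres * sizeof (vmusage_t))) == NULL)
 *		return (-1);
 *	if (getvmusage(VMUSAGE_ALL_ZONES, 5, buf, &nres) != 0)
 *		return (-1);
 */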