common/vm/vm_usage.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * vm_usage
 *
 * This file implements the getvmusage() private system call.
 * getvmusage() counts the amount of resident memory pages and swap
 * reserved by the specified process collective. A "process collective" is
 * the set of processes owned by a particular, zone, project, task, or user.
 *
 * rss and swap are counted so that for a given process collective, a page is
 * only counted once.  For example, this means that if multiple processes in
 * the same project map the same page, then the project will only be charged
 * once for that page.  On the other hand, if two processes in different
 * projects map the same page, then both projects will be charged
 * for the page.
 *
 * The vm_getusage() calculation is implemented so that the first thread
 * performs the rss/swap counting. Other callers will wait for that thread to
 * finish, copying the results.  This enables multiple rcapds and prstats to
 * consume data from the same calculation.  The results are also cached so that
 * a caller interested in recent results can just copy them instead of starting
 * a new calculation. The caller passes the maximium age (in seconds) of the
 * data.  If the cached data is young enough, the cache is copied, otherwise,
 * a new calculation is executed and the cache is replaced with the new
 * data.
 *
 * The rss calculation for each process collective is as follows:
 *
 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *	- Figure out proc's collectives (zone, project, task, and/or user).
 *	- For each seg in proc's address space:
 *		- If seg is private:
 *			- Lookup anons in the amp.
 *			- For incore pages not previously visited each of the
 *			  proc's collectives, add incore pagesize to each.
 *			  collective.
 *			  Anon's with a refcnt of 1 can be assummed to be not
 *			  previously visited.
 *			- For address ranges without anons in the amp:
 *				- Lookup pages in underlying vnode.
 *				- For incore pages not previously visiting for
 *				  each of the proc's collectives, add incore
 *				  pagesize to each collective.
 *		- If seg is shared:
 *			- Lookup pages in the shared amp or vnode.
 *			- For incore pages not previously visited for each of
 *			  the proc's collectives, add incore pagesize to each
 *			  collective.
 *
 * Swap is reserved by private segments, and shared anonymous segments.
 * The only shared anon segments which do not reserve swap are ISM segments
 * and schedctl segments, both of which can be identified by having
 * amp->swresv == 0.
 *
 * The swap calculation for each collective is as follows:
 *
 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *	- Figure out proc's collectives (zone, project, task, and/or user).
 *	- For each seg in proc's address space:
 *		- If seg is private:
 *			- Add svd->swresv pages to swap count for each of the
 *			  proc's collectives.
 *		- If seg is anon, shared, and amp->swresv != 0
 *			- For address ranges in amp not previously visited for
 *			  each of the proc's collectives, add size of address
 *			  range to the swap count for each collective.
 *
 * These two calculations are done simultaneously, with most of the work
 * being done in vmu_calculate_seg().  The results of the calculation are
 * copied into "vmu_data.vmu_cache_results".
 *
 * To perform the calculation, various things are tracked and cached:
 *
 *    - incore/not-incore page ranges for all vnodes.
 *	(vmu_data.vmu_all_vnodes_hash)
 *	This eliminates looking up the same page more than once.
 *
 *    - incore/not-incore page ranges for all shared amps.
 *	(vmu_data.vmu_all_amps_hash)
 *	This eliminates looking up the same page more than once.
 *
 *    - visited page ranges for each collective.
 *	   - per vnode (entity->vme_vnode_hash)
 *	   - per shared amp (entity->vme_amp_hash)
 *	For accurate counting of map-shared and COW-shared pages.
 *
 *    - visited private anons (refcnt > 1) for each collective.
 *	(entity->vme_anon_hash)
 *	For accurate counting of COW-shared pages.
 *
 * The common accounting structure is the vmu_entity_t, which represents
 * collectives:
 *
 *    - A zone.
 *    - A project, task, or user within a zone.
 *    - The entire system (vmu_data.vmu_system).
 *    - Each collapsed (col) project and user.  This means a given projid or
 *	uid, regardless of which zone the process is in.  For instance,
 *      project 0 in the global zone and project 0 in a non global zone are
 *	the same collapsed project.
 *
 *  Each entity structure tracks which pages have been already visited for
 *  that entity (via previously inspected processes) so that these pages are
 *  not double counted.
 */

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/zone.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/thread.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/modhash.h>
#include <sys/modhash_impl.h>
#include <sys/shm.h>
#include <sys/swap.h>
#include <sys/synch.h>
#include <sys/systm.h>
#include <sys/var.h>
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <sys/sunddi.h>
#include <sys/avl.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/seg_vn.h>
#include <vm/seg_spt.h>

#define	VMUSAGE_HASH_SIZE		512

#define	VMUSAGE_TYPE_VNODE		1
#define	VMUSAGE_TYPE_AMP		2
#define	VMUSAGE_TYPE_ANON		3

#define	VMUSAGE_BOUND_UNKNOWN		0
#define	VMUSAGE_BOUND_INCORE		1
#define	VMUSAGE_BOUND_NOT_INCORE	2

#define	ISWITHIN(node, addr)	((node)->vmb_start <= addr && \
				    (node)->vmb_end >= addr ? 1 : 0)

/*
 * bounds for vnodes and shared amps
 * Each bound is either entirely incore, entirely not in core, or
 * entirely unknown.  bounds are stored in an avl tree sorted by start member
 * when in use, otherwise (free or temporary lists) they're strung
 * together off of vmb_next.
 */
typedef struct vmu_bound {
	avl_node_t vmb_node;
	struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
	pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
	pgcnt_t	vmb_end;    /* page offset in vnode/amp on which bound ends */
	char	vmb_type;   /* One of VMUSAGE_BOUND_* */
} vmu_bound_t;

/*
 * hash of visited objects (vnodes or shared amps)
 * key is address of vnode or amp.  Bounds lists known incore/non-incore
 * bounds for vnode/amp.
 */
typedef struct vmu_object {
	struct vmu_object	*vmo_next;	/* free list */
	caddr_t		vmo_key;
	short		vmo_type;
	avl_tree_t	vmo_bounds;
} vmu_object_t;

/*
 * Entity by which to count results.
 *
 * The entity structure keeps the current rss/swap counts for each entity
 * (zone, project, etc), and hashes of vm structures that have already
 * been visited for the entity.
 *
 * vme_next:	links the list of all entities currently being counted by
 *		vmu_calculate().
 *
 * vme_next_calc: links the list of entities related to the current process
 *		 being counted by vmu_calculate_proc().
 *
 * vmu_calculate_proc() walks all processes.  For each process, it makes a
 * list of the entities related to that process using vme_next_calc.  This
 * list changes each time vmu_calculate_proc() is called.
 *
 */
typedef struct vmu_entity {
	struct vmu_entity *vme_next;
	struct vmu_entity *vme_next_calc;
	mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
	mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
	mod_hash_t	*vme_anon_hash;	 /* COW anons visited for entity */
	vmusage_t	vme_result;	 /* identifies entity and results */
} vmu_entity_t;

/*
 * Hash of entities visited within a zone, and an entity for the zone
 * itself.
 */
typedef struct vmu_zone {
	struct vmu_zone	*vmz_next;	/* free list */
	id_t		vmz_id;
	vmu_entity_t	*vmz_zone;
	mod_hash_t	*vmz_projects_hash;
	mod_hash_t	*vmz_tasks_hash;
	mod_hash_t	*vmz_rusers_hash;
	mod_hash_t	*vmz_eusers_hash;
} vmu_zone_t;

/*
 * Cache of results from last calculation
 */
typedef struct vmu_cache {
	vmusage_t	*vmc_results;	/* Results from last call to */
					/* vm_getusage(). */
	uint64_t	vmc_nresults;	/* Count of cached results */
	uint64_t	vmc_refcnt;	/* refcnt for free */
	uint_t		vmc_flags;	/* Flags for vm_getusage() */
	hrtime_t	vmc_timestamp;	/* when cache was created */
} vmu_cache_t;

/*
 * top level rss info for the system
 */
typedef struct vmu_data {
	kmutex_t	vmu_lock;		/* Protects vmu_data */
	kcondvar_t	vmu_cv;			/* Used to signal threads */
						/* Waiting for */
						/* Rss_calc_thread to finish */
	vmu_entity_t	*vmu_system;		/* Entity for tracking */
						/* rss/swap for all processes */
						/* in all zones */
	mod_hash_t	*vmu_zones_hash;	/* Zones visited */
	mod_hash_t	*vmu_projects_col_hash; /* These *_col_hash hashes */
	mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
	mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
						/* to implement VMUSAGE_COL_* */
						/* flags, which aggregate by */
						/* project or user regardless */
						/* of zoneid. */
	mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
						/* to track incore/not-incore */
	mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
						/* amps to track incore/not- */
						/* incore */
	vmu_entity_t	*vmu_entities;		/* Linked list of entities */
	size_t		vmu_nentities;		/* Count of entities in list */
	vmu_cache_t	*vmu_cache;		/* Cached results */
	kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
						/* vmu_calculate() */
	uint_t		vmu_calc_flags;		/* Flags being using by */
						/* currently running calc */
						/* thread */
	uint_t		vmu_pending_flags;	/* Flags of vm_getusage() */
						/* threads waiting for */
						/* calc thread to finish */
	uint_t		vmu_pending_waiters;	/* Number of threads waiting */
						/* for calc thread */
	vmu_bound_t	*vmu_free_bounds;
	vmu_object_t	*vmu_free_objects;
	vmu_entity_t	*vmu_free_entities;
	vmu_zone_t	*vmu_free_zones;
} vmu_data_t;

extern struct as kas;
extern proc_t *practive;
extern zone_t *global_zone;
extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

static vmu_data_t vmu_data;
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;

/*
 * Comparison routine for AVL tree. We base our comparison on vmb_start.
 */
static int
bounds_cmp(const void *bnd1, const void *bnd2)
{
	const vmu_bound_t *bound1 = bnd1;
	const vmu_bound_t *bound2 = bnd2;

	if (bound1->vmb_start == bound2->vmb_start) {
		return (0);
	}
	if (bound1->vmb_start < bound2->vmb_start) {
		return (-1);
	}

	return (1);
}

/*
 * Save a bound on the free list.
 */
static void
vmu_free_bound(vmu_bound_t *bound)
{
	bound->vmb_next = vmu_data.vmu_free_bounds;
	bound->vmb_start = 0;
	bound->vmb_end = 0;
	bound->vmb_type = 0;
	vmu_data.vmu_free_bounds = bound;
}

/*
 * Free an object, and all visited bound info.
 */
static void
vmu_free_object(mod_hash_val_t val)
{
	vmu_object_t *obj = (vmu_object_t *)val;
	avl_tree_t *tree = &(obj->vmo_bounds);
	vmu_bound_t *bound;
	void *cookie = NULL;

	while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
		vmu_free_bound(bound);
	avl_destroy(tree);

	obj->vmo_type = 0;
	obj->vmo_next = vmu_data.vmu_free_objects;
	vmu_data.vmu_free_objects = obj;
}

/*
 * Free an entity, and hashes of visited objects for that entity.
 */
static void
vmu_free_entity(mod_hash_val_t val)
{
	vmu_entity_t *entity = (vmu_entity_t *)val;

	if (entity->vme_vnode_hash != NULL)
		i_mod_hash_clear_nosync(entity->vme_vnode_hash);
	if (entity->vme_amp_hash != NULL)
		i_mod_hash_clear_nosync(entity->vme_amp_hash);
	if (entity->vme_anon_hash != NULL)
		i_mod_hash_clear_nosync(entity->vme_anon_hash);

	entity->vme_next = vmu_data.vmu_free_entities;
	vmu_data.vmu_free_entities = entity;
}

/*
 * Free zone entity, and all hashes of entities inside that zone,
 * which are projects, tasks, and users.
 */
static void
vmu_free_zone(mod_hash_val_t val)
{
	vmu_zone_t *zone = (vmu_zone_t *)val;

	if (zone->vmz_zone != NULL) {
		vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
		zone->vmz_zone = NULL;
	}
	if (zone->vmz_projects_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_projects_hash);
	if (zone->vmz_tasks_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
	if (zone->vmz_rusers_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
	if (zone->vmz_eusers_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
	zone->vmz_next = vmu_data.vmu_free_zones;
	vmu_data.vmu_free_zones = zone;
}

/*
 * Initialize synchronization primitives and hashes for system-wide tracking
 * of visited vnodes and shared amps.  Initialize results cache.
 */
void
vm_usage_init()
{
	mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);

	vmu_data.vmu_system = NULL;
	vmu_data.vmu_zones_hash = NULL;
	vmu_data.vmu_projects_col_hash = NULL;
	vmu_data.vmu_rusers_col_hash = NULL;
	vmu_data.vmu_eusers_col_hash = NULL;

	vmu_data.vmu_free_bounds = NULL;
	vmu_data.vmu_free_objects = NULL;
	vmu_data.vmu_free_entities = NULL;
	vmu_data.vmu_free_zones = NULL;

	vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
	    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
	    sizeof (vnode_t));
	vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
	    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
	    sizeof (struct anon_map));
	vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
	    "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
	    vmu_free_entity);
	vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
	    "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
	    vmu_free_entity);
	vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
	    "vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
	    vmu_free_entity);
	vmu_data.vmu_zones_hash = mod_hash_create_idhash(
	    "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);

	vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
	    sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	vmu_object_cache = kmem_cache_create("vmu_object_cache",
	    sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	vmu_data.vmu_entities = NULL;
	vmu_data.vmu_nentities = 0;

	vmu_data.vmu_cache = NULL;
	vmu_data.vmu_calc_thread = NULL;
	vmu_data.vmu_calc_flags = 0;
	vmu_data.vmu_pending_flags = 0;
	vmu_data.vmu_pending_waiters = 0;
}

/*
 * Allocate hashes for tracking vm objects visited for an entity.
 * Update list of entities.
 */
static vmu_entity_t *
vmu_alloc_entity(id_t id, int type, id_t zoneid)
{
	vmu_entity_t *entity;

	if (vmu_data.vmu_free_entities != NULL) {
		entity = vmu_data.vmu_free_entities;
		vmu_data.vmu_free_entities =
		    vmu_data.vmu_free_entities->vme_next;
		bzero(&entity->vme_result, sizeof (vmusage_t));
	} else {
		entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
	}
	entity->vme_result.vmu_id = id;
	entity->vme_result.vmu_zoneid = zoneid;
	entity->vme_result.vmu_type = type;

	if (entity->vme_vnode_hash == NULL)
		entity->vme_vnode_hash = mod_hash_create_ptrhash(
		    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
		    sizeof (vnode_t));

	if (entity->vme_amp_hash == NULL)
		entity->vme_amp_hash = mod_hash_create_ptrhash(
		    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
		    sizeof (struct anon_map));

	if (entity->vme_anon_hash == NULL)
		entity->vme_anon_hash = mod_hash_create_ptrhash(
		    "vmusage anon hash", VMUSAGE_HASH_SIZE,
		    mod_hash_null_valdtor, sizeof (struct anon));

	entity->vme_next = vmu_data.vmu_entities;
	vmu_data.vmu_entities = entity;
	vmu_data.vmu_nentities++;

	return (entity);
}

/*
 * Allocate a zone entity, and hashes for tracking visited vm objects
 * for projects, tasks, and users within that zone.
 */
static vmu_zone_t *
vmu_alloc_zone(id_t id)
{
	vmu_zone_t *zone;

	if (vmu_data.vmu_free_zones != NULL) {
		zone = vmu_data.vmu_free_zones;
		vmu_data.vmu_free_zones =
		    vmu_data.vmu_free_zones->vmz_next;
		zone->vmz_next = NULL;
		zone->vmz_zone = NULL;
	} else {
		zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
	}

	zone->vmz_id = id;

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
	    VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
		zone->vmz_projects_hash = mod_hash_create_idhash(
		    "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
	    != 0 && zone->vmz_tasks_hash == NULL)
		zone->vmz_tasks_hash = mod_hash_create_idhash(
		    "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
	    != 0 && zone->vmz_rusers_hash == NULL)
		zone->vmz_rusers_hash = mod_hash_create_idhash(
		    "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
	    != 0 && zone->vmz_eusers_hash == NULL)
		zone->vmz_eusers_hash = mod_hash_create_idhash(
		    "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	return (zone);
}

/*
 * Allocate a structure for tracking visited bounds for a vm object.
 */
static vmu_object_t *
vmu_alloc_object(caddr_t key, int type)
{
	vmu_object_t *object;

	if (vmu_data.vmu_free_objects != NULL) {
		object = vmu_data.vmu_free_objects;
		vmu_data.vmu_free_objects =
		    vmu_data.vmu_free_objects->vmo_next;
	} else {
		object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
	}

	object->vmo_next = NULL;
	object->vmo_key = key;
	object->vmo_type = type;
	avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);

	return (object);
}

/*
 * Allocate and return a bound structure.
 */
static vmu_bound_t *
vmu_alloc_bound()
{
	vmu_bound_t *bound;

	if (vmu_data.vmu_free_bounds != NULL) {
		bound = vmu_data.vmu_free_bounds;
		vmu_data.vmu_free_bounds =
		    vmu_data.vmu_free_bounds->vmb_next;
	} else {
		bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
	}

	bound->vmb_next = NULL;
	bound->vmb_start = 0;
	bound->vmb_end = 0;
	bound->vmb_type = 0;
	return (bound);
}

/*
 * vmu_find_insert_* functions implement hash lookup or allocate and
 * insert operations.
 */
static vmu_object_t *
vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
{
	int ret;
	vmu_object_t *object;

	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&object);
	if (ret != 0) {
		object = vmu_alloc_object(key, type);
		ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
		    (mod_hash_val_t)object, (mod_hash_hndl_t)0);
		ASSERT(ret == 0);
	}
	return (object);
}

static int
vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
{
	int ret;
	caddr_t val;

	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&val);

	if (ret == 0)
		return (0);

	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
	    (mod_hash_val_t)key, (mod_hash_hndl_t)0);

	ASSERT(ret == 0);

	return (1);
}

static vmu_entity_t *
vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
{
	int ret;
	vmu_entity_t *entity;

	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
	    (mod_hash_val_t *)&entity);
	if (ret != 0) {
		entity = vmu_alloc_entity(id, type, zoneid);
		ret = i_mod_hash_insert_nosync(hash,
		    (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
		    (mod_hash_hndl_t)0);
		ASSERT(ret == 0);
	}
	return (entity);
}


/*
 * Returns list of object bounds between start and end.  New bounds inserted
 * by this call are given type.
 *
 * Returns the number of pages covered if new bounds are created.  Returns 0
 * if region between start/end consists of all existing bounds.
 */
static pgcnt_t
vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
    end, char type, vmu_bound_t **first, vmu_bound_t **last)
{
	avl_tree_t	*tree = &(ro->vmo_bounds);
	avl_index_t	where;
	vmu_bound_t	*walker, *tmp;
	pgcnt_t		ret = 0;

	ASSERT(start <= end);

	*first = *last = NULL;

	tmp = vmu_alloc_bound();
	tmp->vmb_start = start;
	tmp->vmb_type = type;

	/* Hopelessly optimistic case. */
	if (walker = avl_find(tree, tmp, &where)) {
		/* We got lucky. */
		vmu_free_bound(tmp);
		*first = walker;
	}

	if (walker == NULL) {
		/* Is start in the previous node? */
		walker = avl_nearest(tree, where, AVL_BEFORE);
		if (walker != NULL) {
			if (ISWITHIN(walker, start)) {
				/* We found start. */
				vmu_free_bound(tmp);
				*first = walker;
			}
		}
	}

	/*
	 * At this point, if *first is still NULL, then we
	 * didn't get a direct hit and start isn't covered
	 * by the previous node. We know that the next node
	 * must have a greater start value than we require
	 * because avl_find tells us where the AVL routines would
	 * insert our new node. We have some gap between the
	 * start we want and the next node.
	 */
	if (*first == NULL) {
		walker = avl_nearest(tree, where, AVL_AFTER);
		if (walker != NULL && walker->vmb_start <= end) {
			/* Fill the gap. */
			tmp->vmb_end = walker->vmb_start - 1;
			*first = tmp;
		} else {
			/* We have a gap over [start, end]. */
			tmp->vmb_end = end;
			*first = *last = tmp;
		}
		ret += tmp->vmb_end - tmp->vmb_start + 1;
		avl_insert(tree, tmp, where);
	}

	ASSERT(*first != NULL);

	if (*last != NULL) {
		/* We're done. */
		return (ret);
	}

	/*
	 * If we are here we still need to set *last and
	 * that may involve filling in some gaps.
	 */
	*last = *first;
	for (;;) {
		if (ISWITHIN(*last, end)) {
			/* We're done. */
			break;
		}
		walker = AVL_NEXT(tree, *last);
		if (walker == NULL || walker->vmb_start > end) {
			/* Bottom or mid tree with gap. */
			tmp = vmu_alloc_bound();
			tmp->vmb_start = (*last)->vmb_end + 1;
			tmp->vmb_end = end;
			tmp->vmb_type = type;
			ret += tmp->vmb_end - tmp->vmb_start + 1;
			avl_insert_here(tree, tmp, *last, AVL_AFTER);
			*last = tmp;
			break;
		} else {
			if ((*last)->vmb_end + 1 != walker->vmb_start) {
				/* Non-contiguous. */
				tmp = vmu_alloc_bound();
				tmp->vmb_start = (*last)->vmb_end + 1;
				tmp->vmb_end = walker->vmb_start - 1;
				tmp->vmb_type = type;
				ret += tmp->vmb_end - tmp->vmb_start + 1;
				avl_insert_here(tree, tmp, *last, AVL_AFTER);
				*last = tmp;
			} else {
				*last = walker;
			}
		}
	}

	return (ret);
}

/*
 * vmu_update_bounds()
 *
 * tree: avl_tree in which first and last hang.
 *
 * first, last:	list of continuous bounds, of which zero or more are of
 * 		type VMUSAGE_BOUND_UNKNOWN.
 *
 * new_tree: avl_tree in which new_first and new_last hang.
 *
 * new_first, new_last:	list of continuous bounds, of which none are of
 *			type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
 *			update the types of bounds in (first,last) with
 *			type VMUSAGE_BOUND_UNKNOWN.
 *
 * For the list of bounds (first,last), this function updates any bounds
 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
 * the list (new_first, new_last).
 *
 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
 * (new_first, new_last), it will be split into multiple bounds.
 *
 * Return value:
 * 	The number of pages in the list of bounds (first,last) that were of
 *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
 *	VMUSAGE_BOUND_INCORE.
 *
 */
static pgcnt_t
vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
    avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
{
	vmu_bound_t *next, *new_next, *tmp;
	pgcnt_t rss = 0;

	next = *first;
	new_next = new_first;

	/*
	 * Verify first and last bound are covered by new bounds if they
	 * have unknown type.
	 */
	ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
	    (*first)->vmb_start >= new_first->vmb_start);
	ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
	    (*last)->vmb_end <= new_last->vmb_end);
	for (;;) {
		/* If bound already has type, proceed to next bound. */
		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
			if (next == *last)
				break;
			next = AVL_NEXT(tree, next);
			continue;
		}
		while (new_next->vmb_end < next->vmb_start)
			new_next = AVL_NEXT(new_tree, new_next);
		ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
		next->vmb_type = new_next->vmb_type;
		if (new_next->vmb_end < next->vmb_end) {
			/* need to split bound */
			tmp = vmu_alloc_bound();
			tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
			tmp->vmb_start = new_next->vmb_end + 1;
			tmp->vmb_end = next->vmb_end;
			avl_insert_here(tree, tmp, next, AVL_AFTER);
			next->vmb_end = new_next->vmb_end;
			if (*last == next)
				*last = tmp;
			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
				rss += next->vmb_end - next->vmb_start + 1;
			next = tmp;
		} else {
			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
				rss += next->vmb_end - next->vmb_start + 1;
			if (next == *last)
				break;
			next = AVL_NEXT(tree, next);
		}
	}
	return (rss);
}

/*
 * Merges adjacent bounds with same type between first and last bound.
 * After merge, last pointer may point to a different bound, as (incoming)
 * last bound may have been merged away.
 */
static void
vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
{
	vmu_bound_t *current;
	vmu_bound_t *next;

	ASSERT(tree != NULL);
	ASSERT(*first != NULL);
	ASSERT(*last != NULL);

	current = *first;
	while (current != *last) {
		next = AVL_NEXT(tree, current);
		if ((current->vmb_end + 1) == next->vmb_start &&
		    current->vmb_type == next->vmb_type) {
			current->vmb_end = next->vmb_end;
			avl_remove(tree, next);
			vmu_free_bound(next);
			if (next == *last) {
				*last = current;
			}
		} else {
			current = AVL_NEXT(tree, current);
		}
	}
}

/*
 * Given an amp and a list of bounds, updates each bound's type with
 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
 *
 * If a bound is partially incore, it will be split into two bounds.
 * first and last may be modified, as bounds may be split into multiple
 * bounds if they are partially incore/not-incore.
 *
 * Set incore to non-zero if bounds are already known to be incore.
 *
 */
static void
vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
    vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
{
	vmu_bound_t *next;
	vmu_bound_t *tmp;
	pgcnt_t index;
	short bound_type;
	short page_type;
	vnode_t *vn;
	anoff_t off;
	struct anon *ap;

	next = *first;
	/* Shared anon slots don't change once set. */
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (;;) {
		if (incore == B_TRUE)
			next->vmb_type = VMUSAGE_BOUND_INCORE;

		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
			if (next == *last)
				break;
			next = AVL_NEXT(tree, next);
			continue;
		}
		bound_type = next->vmb_type;
		index = next->vmb_start;
		while (index <= next->vmb_end) {

			/*
			 * These are used to determine how much to increment
			 * index when a large page is found.
			 */
			page_t *page;
			pgcnt_t pgcnt = 1;
			uint_t pgshft;
			pgcnt_t pgmsk;

			ap = anon_get_ptr(amp->ahp, index);
			if (ap != NULL)
				swap_xlate(ap, &vn, &off);

			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
			    (page = page_exists(vn, off)) != NULL) {
				page_type = VMUSAGE_BOUND_INCORE;
				if (page->p_szc > 0) {
					pgcnt = page_get_pagecnt(page->p_szc);
					pgshft = page_get_shift(page->p_szc);
					pgmsk = (0x1 << (pgshft - PAGESHIFT))
					    - 1;
				}
			} else {
				page_type = VMUSAGE_BOUND_NOT_INCORE;
			}
			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
				next->vmb_type = page_type;
			} else if (next->vmb_type != page_type) {
				/*
				 * If current bound type does not match page
				 * type, need to split off new bound.
				 */
				tmp = vmu_alloc_bound();
				tmp->vmb_type = page_type;
				tmp->vmb_start = index;
				tmp->vmb_end = next->vmb_end;
				avl_insert_here(tree, tmp, next, AVL_AFTER);
				next->vmb_end = index - 1;
				if (*last == next)
					*last = tmp;
				next = tmp;
			}
			if (pgcnt > 1) {
				/*
				 * If inside large page, jump to next large
				 * page
				 */
				index = (index & ~pgmsk) + pgcnt;
			} else {
				index++;
			}
		}
		if (next == *last) {
			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
			break;
		} else
			next = AVL_NEXT(tree, next);
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
}

/*
 * Same as vmu_amp_update_incore_bounds(), except for tracking
 * incore-/not-incore for vnodes.
 */
static void
vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
    vmu_bound_t **first, vmu_bound_t **last)
{
	vmu_bound_t *next;
	vmu_bound_t *tmp;
	pgcnt_t index;
	short bound_type;
	short page_type;

	next = *first;
	for (;;) {
		if (vnode->v_pages == NULL)
			next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;

		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
			if (next == *last)
				break;
			next = AVL_NEXT(tree, next);
			continue;
		}

		bound_type = next->vmb_type;
		index = next->vmb_start;
		while (index <= next->vmb_end) {

			/*
			 * These are used to determine how much to increment
			 * index when a large page is found.
			 */
			page_t *page;
			pgcnt_t pgcnt = 1;
			uint_t pgshft;
			pgcnt_t pgmsk;

			if (vnode->v_pages != NULL &&
			    (page = page_exists(vnode, ptob(index))) != NULL) {
				page_type = VMUSAGE_BOUND_INCORE;
				if (page->p_szc > 0) {
					pgcnt = page_get_pagecnt(page->p_szc);
					pgshft = page_get_shift(page->p_szc);
					pgmsk = (0x1 << (pgshft - PAGESHIFT))
					    - 1;
				}
			} else {
				page_type = VMUSAGE_BOUND_NOT_INCORE;
			}
			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
				next->vmb_type = page_type;
			} else if (next->vmb_type != page_type) {
				/*
				 * If current bound type does not match page
				 * type, need to split off new bound.
				 */
				tmp = vmu_alloc_bound();
				tmp->vmb_type = page_type;
				tmp->vmb_start = index;
				tmp->vmb_end = next->vmb_end;
				avl_insert_here(tree, tmp, next, AVL_AFTER);
				next->vmb_end = index - 1;
				if (*last == next)
					*last = tmp;
				next = tmp;
			}
			if (pgcnt > 1) {
				/*
				 * If inside large page, jump to next large
				 * page
				 */
				index = (index & ~pgmsk) + pgcnt;
			} else {
				index++;
			}
		}
		if (next == *last) {
			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
			break;
		} else
			next = AVL_NEXT(tree, next);
	}
}

/*
 * Calculate the rss and swap consumed by a segment.  vmu_entities is the
 * list of entities to visit.  For shared segments, the vnode or amp
 * is looked up in each entity to see if it has been already counted.  Private
 * anon pages are checked per entity to ensure that COW pages are not
 * double counted.
 *
 * For private mapped files, first the amp is checked for private pages.
 * Bounds not backed by the amp are looked up in the vnode for each entity
 * to avoid double counting of private COW vnode pages.
 */
static void
vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
{
	struct segvn_data *svd;
	struct shm_data *shmd;
	struct spt_data *sptd;
	vmu_object_t *shared_object = NULL;
	vmu_object_t *entity_object = NULL;
	vmu_entity_t *entity;
	vmusage_t *result;
	vmu_bound_t *first = NULL;
	vmu_bound_t *last = NULL;
	vmu_bound_t *cur = NULL;
	vmu_bound_t *e_first = NULL;
	vmu_bound_t *e_last = NULL;
	vmu_bound_t *tmp;
	pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
	struct anon_map *private_amp = NULL;
	boolean_t incore = B_FALSE;
	boolean_t shared = B_FALSE;
	int file = 0;
	pgcnt_t swresv = 0;
	pgcnt_t panon = 0;

	/* Can zero-length segments exist?  Not sure, so paranoia. */
	if (seg->s_size <= 0)
		return;

	/*
	 * Figure out if there is a shared object (such as a named vnode or
	 * a shared amp, then figure out if there is a private amp, which
	 * identifies private pages.
	 */
	if (seg->s_ops == &segvn_ops) {
		svd = (struct segvn_data *)seg->s_data;
		if (svd->type == MAP_SHARED) {
			shared = B_TRUE;
		} else {
			swresv = svd->swresv;

			if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
			    RW_READER) != 0) {
				/*
				 * Text replication anon maps can be shared
				 * across all zones. Space used for text
				 * replication is typically capped as a small %
				 * of memory.  To keep it simple for now we
				 * don't account for swap and memory space used
				 * for text replication.
				 */
				if (svd->tr_state == SEGVN_TR_OFF &&
				    svd->amp != NULL) {
					private_amp = svd->amp;
					p_start = svd->anon_index;
					p_end = svd->anon_index +
					    btop(seg->s_size) - 1;
				}
				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			}
		}
		if (svd->vp != NULL) {
			file = 1;
			shared_object = vmu_find_insert_object(
			    vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
			    VMUSAGE_TYPE_VNODE);
			s_start = btop(svd->offset);
			s_end = btop(svd->offset + seg->s_size) - 1;
		}
		if (svd->amp != NULL && svd->type == MAP_SHARED) {
			ASSERT(shared_object == NULL);
			shared_object = vmu_find_insert_object(
			    vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
			    VMUSAGE_TYPE_AMP);
			s_start = svd->anon_index;
			s_end = svd->anon_index + btop(seg->s_size) - 1;
			/* schedctl mappings are always in core */
			if (svd->amp->swresv == 0)
				incore = B_TRUE;
		}
	} else if (seg->s_ops == &segspt_shmops) {
		shared = B_TRUE;
		shmd = (struct shm_data *)seg->s_data;
		shared_object = vmu_find_insert_object(
		    vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
		    VMUSAGE_TYPE_AMP);
		s_start = 0;
		s_end = btop(seg->s_size) - 1;
		sptd = shmd->shm_sptseg->s_data;

		/* ism segments are always incore and do not reserve swap */
		if (sptd->spt_flags & SHM_SHARE_MMU)
			incore = B_TRUE;

	} else {
		return;
	}

	/*
	 * If there is a private amp, count anon pages that exist.  If an
	 * anon has a refcnt > 1 (COW sharing), then save the anon in a
	 * hash so that it is not double counted.
	 *
	 * If there is also a shared object, then figure out the bounds
	 * which are not mapped by the private amp.
	 */
	if (private_amp != NULL) {

		/* Enter as writer to prevent COW anons from being freed */
		ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);

		p_index = p_start;
		s_index = s_start;

		while (p_index <= p_end) {

			pgcnt_t p_index_next;
			pgcnt_t p_bound_size;
			int cnt;
			anoff_t off;
			struct vnode *vn;
			struct anon *ap;
			page_t *page;		/* For handling of large */
			pgcnt_t pgcnt = 1;	/* pages */
			pgcnt_t pgstart;
			pgcnt_t pgend;
			uint_t pgshft;
			pgcnt_t pgmsk;

			p_index_next = p_index;
			ap = anon_get_next_ptr(private_amp->ahp,
			    &p_index_next);

			/*
			 * If next anon is past end of mapping, simulate
			 * end of anon so loop terminates.
			 */
			if (p_index_next > p_end) {
				p_index_next = p_end + 1;
				ap = NULL;
			}
			/*
			 * For COW segments, keep track of bounds not
			 * backed by private amp so they can be looked
			 * up in the backing vnode
			 */
			if (p_index_next != p_index) {

				/*
				 * Compute index difference between anon and
				 * previous anon.
				 */
				p_bound_size = p_index_next - p_index - 1;

				if (shared_object != NULL) {
					cur = vmu_alloc_bound();
					cur->vmb_start = s_index;
					cur->vmb_end = s_index + p_bound_size;
					cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
					if (first == NULL) {
						first = cur;
						last = cur;
					} else {
						last->vmb_next = cur;
						last = cur;
					}
				}
				p_index = p_index + p_bound_size + 1;
				s_index = s_index + p_bound_size + 1;
			}

			/* Detect end of anons in amp */
			if (ap == NULL)
				break;

			cnt = ap->an_refcnt;
			swap_xlate(ap, &vn, &off);

			if (vn == NULL || vn->v_pages == NULL ||
			    (page = page_exists(vn, off)) == NULL) {
				p_index++;
				s_index++;
				continue;
			}

			/*
			 * If large page is found, compute portion of large
			 * page in mapping, and increment indicies to the next
			 * large page.
			 */
			if (page->p_szc > 0) {

				pgcnt = page_get_pagecnt(page->p_szc);
				pgshft = page_get_shift(page->p_szc);
				pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;

				/* First page in large page */
				pgstart = p_index & ~pgmsk;
				/* Last page in large page */
				pgend = pgstart + pgcnt - 1;
				/*
				 * Artifically end page if page extends past
				 * end of mapping.
				 */
				if (pgend > p_end)
					pgend = p_end;

				/*
				 * Compute number of pages from large page
				 * which are mapped.
				 */
				pgcnt = pgend - p_index + 1;

				/*
				 * Point indicies at page after large page,
				 * or at page after end of mapping.
				 */
				p_index += pgcnt;
				s_index += pgcnt;
			} else {
				p_index++;
				s_index++;
			}

			/*
			 * Assume anon structs with a refcnt
			 * of 1 are not COW shared, so there
			 * is no reason to track them per entity.
			 */
			if (cnt == 1) {
				panon += pgcnt;
				continue;
			}
			for (entity = vmu_entities; entity != NULL;
			    entity = entity->vme_next_calc) {

				result = &entity->vme_result;
				/*
				 * Track COW anons per entity so
				 * they are not double counted.
				 */
				if (vmu_find_insert_anon(entity->vme_anon_hash,
				    (caddr_t)ap) == 0)
					continue;

				result->vmu_rss_all += (pgcnt << PAGESHIFT);
				result->vmu_rss_private +=
				    (pgcnt << PAGESHIFT);
			}
		}
		ANON_LOCK_EXIT(&private_amp->a_rwlock);
	}

	/* Add up resident anon and swap reserved for private mappings */
	if (swresv > 0 || panon > 0) {
		for (entity = vmu_entities; entity != NULL;
		    entity = entity->vme_next_calc) {
			result = &entity->vme_result;
			result->vmu_swap_all += swresv;
			result->vmu_swap_private += swresv;
			result->vmu_rss_all += (panon << PAGESHIFT);
			result->vmu_rss_private += (panon << PAGESHIFT);
		}
	}

	/* Compute resident pages backing shared amp or named vnode */
	if (shared_object != NULL) {
		avl_tree_t *tree = &(shared_object->vmo_bounds);

		if (first == NULL) {
			/*
			 * No private amp, or private amp has no anon
			 * structs.  This means entire segment is backed by
			 * the shared object.
			 */
			first = vmu_alloc_bound();
			first->vmb_start = s_start;
			first->vmb_end = s_end;
			first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
		}
		/*
		 * Iterate bounds not backed by private amp, and compute
		 * resident pages.
		 */
		cur = first;
		while (cur != NULL) {

			if (vmu_insert_lookup_object_bounds(shared_object,
			    cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
			    &first, &last) > 0) {
				/* new bounds, find incore/not-incore */
				if (shared_object->vmo_type ==
				    VMUSAGE_TYPE_VNODE) {
					vmu_vnode_update_incore_bounds(
					    tree,
					    (vnode_t *)
					    shared_object->vmo_key, &first,
					    &last);
				} else {
					vmu_amp_update_incore_bounds(
					    tree,
					    (struct anon_map *)
					    shared_object->vmo_key, &first,
					    &last, incore);
				}
				vmu_merge_bounds(tree, &first, &last);
			}
			for (entity = vmu_entities; entity != NULL;
			    entity = entity->vme_next_calc) {
				avl_tree_t *e_tree;

				result = &entity->vme_result;

				entity_object = vmu_find_insert_object(
				    shared_object->vmo_type ==
				    VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
				    entity->vme_amp_hash,
				    shared_object->vmo_key,
				    shared_object->vmo_type);

				virt = vmu_insert_lookup_object_bounds(
				    entity_object, cur->vmb_start, cur->vmb_end,
				    VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);

				if (virt == 0)
					continue;
				/*
				 * Range visited for this entity
				 */
				e_tree = &(entity_object->vmo_bounds);
				rss = vmu_update_bounds(e_tree, &e_first,
				    &e_last, tree, first, last);
				result->vmu_rss_all += (rss << PAGESHIFT);
				if (shared == B_TRUE && file == B_FALSE) {
					/* shared anon mapping */
					result->vmu_swap_all +=
					    (virt << PAGESHIFT);
					result->vmu_swap_shared +=
					    (virt << PAGESHIFT);
					result->vmu_rss_shared +=
					    (rss << PAGESHIFT);
				} else if (shared == B_TRUE && file == B_TRUE) {
					/* shared file mapping */
					result->vmu_rss_shared +=
					    (rss << PAGESHIFT);
				} else if (shared == B_FALSE &&
				    file == B_TRUE) {
					/* private file mapping */
					result->vmu_rss_private +=
					    (rss << PAGESHIFT);
				}
				vmu_merge_bounds(e_tree, &e_first, &e_last);
			}
			tmp = cur;
			cur = cur->vmb_next;
			vmu_free_bound(tmp);
		}
	}
}

/*
 * Based on the current calculation flags, find the relevant entities
 * which are relative to the process.  Then calculate each segment
 * in the process'es address space for each relevant entity.
 */
static void
vmu_calculate_proc(proc_t *p)
{
	vmu_entity_t *entities = NULL;
	vmu_zone_t *zone;
	vmu_entity_t *tmp;
	struct as *as;
	struct seg *seg;
	int ret;

	/* Figure out which entities are being computed */
	if ((vmu_data.vmu_system) != NULL) {
		tmp = vmu_data.vmu_system;
		tmp->vme_next_calc = entities;
		entities = tmp;
	}
	if (vmu_data.vmu_calc_flags &
	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
	    VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
	    VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
	    VMUSAGE_ALL_EUSERS)) {
		ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
		    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
		    (mod_hash_val_t *)&zone);
		if (ret != 0) {
			zone = vmu_alloc_zone(p->p_zone->zone_id);
			ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
			    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
			    (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
			ASSERT(ret == 0);
		}
		if (zone->vmz_zone != NULL) {
			tmp = zone->vmz_zone;
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
			tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
			    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
			    zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
			tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
			    p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
			tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
			    crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
			tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
			    crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
	}
	/* Entities which collapse projects and users for all zones */
	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
		tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
		    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
		tmp->vme_next_calc = entities;
		entities = tmp;
	}
	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
		tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
		    crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
		tmp->vme_next_calc = entities;
		entities = tmp;
	}
	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
		tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
		    crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
		tmp->vme_next_calc = entities;
		entities = tmp;
	}

	ASSERT(entities != NULL);
	/* process all segs in process's address space */
	as = p->p_as;
	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg != NULL;
	    seg = AS_SEGNEXT(as, seg)) {
		vmu_calculate_seg(entities, seg);
	}
	AS_LOCK_EXIT(as, &as->a_lock);
}

/*
 * Free data created by previous call to vmu_calculate().
 */
static void
vmu_clear_calc()
{
	if (vmu_data.vmu_system != NULL)
		vmu_free_entity(vmu_data.vmu_system);
		vmu_data.vmu_system = NULL;
	if (vmu_data.vmu_zones_hash != NULL)
		i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
	if (vmu_data.vmu_projects_col_hash != NULL)
		i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
	if (vmu_data.vmu_rusers_col_hash != NULL)
		i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
	if (vmu_data.vmu_eusers_col_hash != NULL)
		i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);

	i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
	i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
}

/*
 * Free unused data structures.  These can result if the system workload
 * decreases between calculations.
 */
static void
vmu_free_extra()
{
	vmu_bound_t *tb;
	vmu_object_t *to;
	vmu_entity_t *te;
	vmu_zone_t *tz;

	while (vmu_data.vmu_free_bounds != NULL) {
		tb = vmu_data.vmu_free_bounds;
		vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
		kmem_cache_free(vmu_bound_cache, tb);
	}
	while (vmu_data.vmu_free_objects != NULL) {
		to = vmu_data.vmu_free_objects;
		vmu_data.vmu_free_objects =
		    vmu_data.vmu_free_objects->vmo_next;
		kmem_cache_free(vmu_object_cache, to);
	}
	while (vmu_data.vmu_free_entities != NULL) {
		te = vmu_data.vmu_free_entities;
		vmu_data.vmu_free_entities =
		    vmu_data.vmu_free_entities->vme_next;
		if (te->vme_vnode_hash != NULL)
			mod_hash_destroy_hash(te->vme_vnode_hash);
		if (te->vme_amp_hash != NULL)
			mod_hash_destroy_hash(te->vme_amp_hash);
		if (te->vme_anon_hash != NULL)
			mod_hash_destroy_hash(te->vme_anon_hash);
		kmem_free(te, sizeof (vmu_entity_t));
	}
	while (vmu_data.vmu_free_zones != NULL) {
		tz = vmu_data.vmu_free_zones;
		vmu_data.vmu_free_zones =
		    vmu_data.vmu_free_zones->vmz_next;
		if (tz->vmz_projects_hash != NULL)
			mod_hash_destroy_hash(tz->vmz_projects_hash);
		if (tz->vmz_tasks_hash != NULL)
			mod_hash_destroy_hash(tz->vmz_tasks_hash);
		if (tz->vmz_rusers_hash != NULL)
			mod_hash_destroy_hash(tz->vmz_rusers_hash);
		if (tz->vmz_eusers_hash != NULL)
			mod_hash_destroy_hash(tz->vmz_eusers_hash);
		kmem_free(tz, sizeof (vmu_zone_t));
	}
}

extern kcondvar_t *pr_pid_cv;

/*
 * Determine which entity types are relevant and allocate the hashes to
 * track them.  Then walk the process table and count rss and swap
 * for each process'es address space.  Address space object such as
 * vnodes, amps and anons are tracked per entity, so that they are
 * not double counted in the results.
 *
 */
static void
vmu_calculate()
{
	int i = 0;
	int ret;
	proc_t *p;

	vmu_clear_calc();

	if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
		vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
		    ALL_ZONES);

	/*
	 * Walk process table and calculate rss of each proc.
	 *
	 * Pidlock and p_lock cannot be held while doing the rss calculation.
	 * This is because:
	 *	1.  The calculation allocates using KM_SLEEP.
	 *	2.  The calculation grabs a_lock, which cannot be grabbed
	 *	    after p_lock.
	 *
	 * Since pidlock must be dropped, we cannot simply just walk the
	 * practive list.  Instead, we walk the process table, and sprlock
	 * each process to ensure that it does not exit during the
	 * calculation.
	 */

	mutex_enter(&pidlock);
	for (i = 0; i < v.v_proc; i++) {
again:
		p = pid_entry(i);
		if (p == NULL)
			continue;

		mutex_enter(&p->p_lock);
		mutex_exit(&pidlock);

		if (panicstr) {
			mutex_exit(&p->p_lock);
			return;
		}

		/* Try to set P_PR_LOCK */
		ret = sprtrylock_proc(p);
		if (ret == -1) {
			/* Process in invalid state */
			mutex_exit(&p->p_lock);
			mutex_enter(&pidlock);
			continue;
		} else if (ret == 1) {
			/*
			 * P_PR_LOCK is already set.  Wait and try again.
			 * This also drops p_lock.
			 */
			sprwaitlock_proc(p);
			mutex_enter(&pidlock);
			goto again;
		}
		mutex_exit(&p->p_lock);

		vmu_calculate_proc(p);

		mutex_enter(&p->p_lock);
		sprunlock(p);
		mutex_enter(&pidlock);
	}
	mutex_exit(&pidlock);

	vmu_free_extra();
}

/*
 * allocate a new cache for N results satisfying flags
 */
vmu_cache_t *
vmu_cache_alloc(size_t nres, uint_t flags)
{
	vmu_cache_t *cache;

	cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
	cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
	cache->vmc_nresults = nres;
	cache->vmc_flags = flags;
	cache->vmc_refcnt = 1;
	return (cache);
}

/*
 * Make sure cached results are not freed
 */
static void
vmu_cache_hold(vmu_cache_t *cache)
{
	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
	cache->vmc_refcnt++;
}

/*
 * free cache data
 */
static void
vmu_cache_rele(vmu_cache_t *cache)
{
	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
	ASSERT(cache->vmc_refcnt > 0);
	cache->vmc_refcnt--;
	if (cache->vmc_refcnt == 0) {
		kmem_free(cache->vmc_results, sizeof (vmusage_t) *
		    cache->vmc_nresults);
		kmem_free(cache, sizeof (vmu_cache_t));
	}
}

/*
 * Copy out the cached results to a caller.  Inspect the callers flags
 * and zone to determine which cached results should be copied.
 */
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
    uint_t flags, int cpflg)
{
	vmusage_t *result, *out_result;
	vmusage_t dummy;
	size_t i, count = 0;
	size_t bufsize;
	int ret = 0;
	uint_t types = 0;

	if (nres != NULL) {
		if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
			return (set_errno(EFAULT));
	} else {
		bufsize = 0;
	}

	/* figure out what results the caller is interested in. */
	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
		types |= VMUSAGE_SYSTEM;
	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
		types |= VMUSAGE_ZONE;
	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
	    VMUSAGE_COL_PROJECTS))
		types |= VMUSAGE_PROJECTS;
	if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
		types |= VMUSAGE_TASKS;
	if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
		types |= VMUSAGE_RUSERS;
	if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
		types |= VMUSAGE_EUSERS;

	/* count results for current zone */
	out_result = buf;
	for (result = cache->vmc_results, i = 0;
	    i < cache->vmc_nresults; result++, i++) {

		/* Do not return "other-zone" results to non-global zones */
		if (curproc->p_zone != global_zone &&
		    curproc->p_zone->zone_id != result->vmu_zoneid)
			continue;

		/*
		 * If non-global zone requests VMUSAGE_SYSTEM, fake
		 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
		 */
		if (curproc->p_zone != global_zone &&
		    (flags & VMUSAGE_SYSTEM) != 0 &&
		    result->vmu_type == VMUSAGE_ZONE) {
			count++;
			if (out_result != NULL) {
				if (bufsize < count) {
					ret = set_errno(EOVERFLOW);
				} else {
					dummy = *result;
					dummy.vmu_zoneid = ALL_ZONES;
					dummy.vmu_id = 0;
					dummy.vmu_type = VMUSAGE_SYSTEM;
					if (ddi_copyout(&dummy, out_result,
					    sizeof (vmusage_t), cpflg))
						return (set_errno(EFAULT));
					out_result++;
				}
			}
		}

		/* Skip results that do not match requested type */
		if ((result->vmu_type & types) == 0)
			continue;

		/* Skip collated results if not requested */
		if (result->vmu_zoneid == ALL_ZONES) {
			if (result->vmu_type == VMUSAGE_PROJECTS &&
			    (flags & VMUSAGE_COL_PROJECTS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_EUSERS &&
			    (flags & VMUSAGE_COL_EUSERS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_RUSERS &&
			    (flags & VMUSAGE_COL_RUSERS) == 0)
				continue;
		}

		/* Skip "other zone" results if not requested */
		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
			if (result->vmu_type == VMUSAGE_ZONE &&
			    (flags & VMUSAGE_ALL_ZONES) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_PROJECTS &&
			    (flags & (VMUSAGE_ALL_PROJECTS |
			    VMUSAGE_COL_PROJECTS)) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_TASKS &&
			    (flags & VMUSAGE_ALL_TASKS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_RUSERS &&
			    (flags & (VMUSAGE_ALL_RUSERS |
			    VMUSAGE_COL_RUSERS)) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_EUSERS &&
			    (flags & (VMUSAGE_ALL_EUSERS |
			    VMUSAGE_COL_EUSERS)) == 0)
				continue;
		}
		count++;
		if (out_result != NULL) {
			if (bufsize < count) {
				ret = set_errno(EOVERFLOW);
			} else {
				if (ddi_copyout(result, out_result,
				    sizeof (vmusage_t), cpflg))
					return (set_errno(EFAULT));
				out_result++;
			}
		}
	}
	if (nres != NULL)
		if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
			return (set_errno(EFAULT));

	return (ret);
}

/*
 * vm_getusage()
 *
 * Counts rss and swap by zone, project, task, and/or user.  The flags argument
 * determines the type of results structures returned.  Flags requesting
 * results from more than one zone are "flattened" to the local zone if the
 * caller is not the global zone.
 *
 * args:
 *	flags:	bitmap consisting of one or more of VMUSAGE_*.
 *	age:	maximum allowable age (time since counting was done) in
 *		seconds of the results.  Results from previous callers are
 *		cached in kernel.
 *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only nres
 *		set on success.
 *	nres:	Set to number of vmusage_t structures pointed to by buf
 *		before calling vm_getusage().
 *		On return 0 (success) or ENOSPC, is set to the number of result
 *		structures returned or attempted to return.
 *
 * returns 0 on success, -1 on failure:
 *	EINTR (interrupted)
 *	ENOSPC (nres to small for results, nres set to needed value for success)
 *	EINVAL (flags invalid)
 *	EFAULT (bad address for buf or nres)
 */
int
vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
{
	vmu_entity_t *entity;
	vmusage_t *result;
	int ret = 0;
	int cacherecent = 0;
	hrtime_t now;
	uint_t flags_orig;

	/*
	 * Non-global zones cannot request system wide and/or collated
	 * results, or the system result, so munge the flags accordingly.
	 */
	flags_orig = flags;
	if (curproc->p_zone != global_zone) {
		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
			flags |= VMUSAGE_PROJECTS;
		}
		if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
			flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
			flags |= VMUSAGE_RUSERS;
		}
		if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
			flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
			flags |= VMUSAGE_EUSERS;
		}
		if (flags & VMUSAGE_SYSTEM) {
			flags &= ~VMUSAGE_SYSTEM;
			flags |= VMUSAGE_ZONE;
		}
	}

	/* Check for unknown flags */
	if ((flags & (~VMUSAGE_MASK)) != 0)
		return (set_errno(EINVAL));

	/* Check for no flags */
	if ((flags & VMUSAGE_MASK) == 0)
		return (set_errno(EINVAL));

	mutex_enter(&vmu_data.vmu_lock);
	now = gethrtime();

start:
	if (vmu_data.vmu_cache != NULL) {

		vmu_cache_t *cache;

		if ((vmu_data.vmu_cache->vmc_timestamp +
		    ((hrtime_t)age * NANOSEC)) > now)
			cacherecent = 1;

		if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
		    cacherecent == 1) {
			cache = vmu_data.vmu_cache;
			vmu_cache_hold(cache);
			mutex_exit(&vmu_data.vmu_lock);

			ret = vmu_copyout_results(cache, buf, nres, flags_orig,
			    cpflg);
			mutex_enter(&vmu_data.vmu_lock);
			vmu_cache_rele(cache);
			if (vmu_data.vmu_pending_waiters > 0)
				cv_broadcast(&vmu_data.vmu_cv);
			mutex_exit(&vmu_data.vmu_lock);
			return (ret);
		}
		/*
		 * If the cache is recent, it is likely that there are other
		 * consumers of vm_getusage running, so add their flags to the
		 * desired flags for the calculation.
		 */
		if (cacherecent == 1)
			flags = vmu_data.vmu_cache->vmc_flags | flags;
	}
	if (vmu_data.vmu_calc_thread == NULL) {

		vmu_cache_t *cache;

		vmu_data.vmu_calc_thread = curthread;
		vmu_data.vmu_calc_flags = flags;
		vmu_data.vmu_entities = NULL;
		vmu_data.vmu_nentities = 0;
		if (vmu_data.vmu_pending_waiters > 0)
			vmu_data.vmu_calc_flags |=
			    vmu_data.vmu_pending_flags;

		vmu_data.vmu_pending_flags = 0;
		mutex_exit(&vmu_data.vmu_lock);
		vmu_calculate();
		mutex_enter(&vmu_data.vmu_lock);
		/* copy results to cache */
		if (vmu_data.vmu_cache != NULL)
			vmu_cache_rele(vmu_data.vmu_cache);
		cache = vmu_data.vmu_cache =
		    vmu_cache_alloc(vmu_data.vmu_nentities,
		    vmu_data.vmu_calc_flags);

		result = cache->vmc_results;
		for (entity = vmu_data.vmu_entities; entity != NULL;
		    entity = entity->vme_next) {
			*result = entity->vme_result;
			result++;
		}
		cache->vmc_timestamp = gethrtime();
		vmu_cache_hold(cache);

		vmu_data.vmu_calc_flags = 0;
		vmu_data.vmu_calc_thread = NULL;

		if (vmu_data.vmu_pending_waiters > 0)
			cv_broadcast(&vmu_data.vmu_cv);

		mutex_exit(&vmu_data.vmu_lock);

		/* copy cache */
		ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
		mutex_enter(&vmu_data.vmu_lock);
		vmu_cache_rele(cache);
		mutex_exit(&vmu_data.vmu_lock);

		return (ret);
	}
	vmu_data.vmu_pending_flags |= flags;
	vmu_data.vmu_pending_waiters++;
	while (vmu_data.vmu_calc_thread != NULL) {
		if (cv_wait_sig(&vmu_data.vmu_cv,
		    &vmu_data.vmu_lock) == 0) {
			vmu_data.vmu_pending_waiters--;
			mutex_exit(&vmu_data.vmu_lock);
			return (set_errno(EINTR));
		}
	}
	vmu_data.vmu_pending_waiters--;
	goto start;
}