xref: /titanic_50/usr/src/uts/common/vm/vm_usage.c (revision 1747673f150798edc5519c1e6d968616fff11b69)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * vm_usage
29  *
30  * This file implements the getvmusage() private system call.
31  * getvmusage() counts the amount of resident memory pages and swap
32  * reserved by the specified process collective. A "process collective" is
33  * the set of processes owned by a particular zone, project, task, or user.
34  *
35  * rss and swap are counted so that for a given process collective, a page is
36  * only counted once.  For example, this means that if multiple processes in
37  * the same project map the same page, then the project will only be charged
38  * once for that page.  On the other hand, if two processes in different
39  * projects map the same page, then both projects will be charged
40  * for the page.
41  *
42  * The vm_getusage() calculation is implemented so that the first thread
43  * performs the rss/swap counting. Other callers will wait for that thread to
44  * finish, copying the results.  This enables multiple rcapds and prstats to
45  * consume data from the same calculation.  The results are also cached so that
46  * a caller interested in recent results can just copy them instead of starting
47  * a new calculation. The caller passes the maximum age (in seconds) of the
48  * data.  If the cached data is young enough, the cache is copied, otherwise,
49  * a new calculation is executed and the cache is replaced with the new
50  * data.
51  *
52  * The rss calculation for each process collective is as follows:
53  *
54  *   - Inspect flags, determine if counting rss for zones, projects, tasks,
55  *     and/or users.
56  *   - For each proc:
57  *	- Figure out proc's collectives (zone, project, task, and/or user).
58  *	- For each seg in proc's address space:
59  *		- If seg is private:
60  *			- Lookup anons in the amp.
61  *			- For incore pages not previously visited for each of
62  *			  the proc's collectives, add incore pagesize to each
63  *			  collective.
64  *			  Anons with a refcnt of 1 can be assumed to be not
65  *			  previously visited.
66  *			- For address ranges without anons in the amp:
67  *				- Lookup pages in underlying vnode.
68  *				- For incore pages not previously visited for
69  *				  each of the proc's collectives, add incore
70  *				  pagesize to each collective.
71  *		- If seg is shared:
72  *			- Lookup pages in the shared amp or vnode.
73  *			- For incore pages not previously visited for each of
74  *			  the proc's collectives, add incore pagesize to each
75  *			  collective.
76  *
77  * Swap is reserved by private segments, and shared anonymous segments.
78  * The only shared anon segments which do not reserve swap are ISM segments
79  * and schedctl segments, both of which can be identified by having
80  * amp->swresv == 0.
81  *
82  * The swap calculation for each collective is as follows:
83  *
84  *   - Inspect flags, determine if counting swap for zones, projects, tasks,
85  *     and/or users.
86  *   - For each proc:
87  *	- Figure out proc's collectives (zone, project, task, and/or user).
88  *	- For each seg in proc's address space:
89  *		- If seg is private:
90  *			- Add svd->swresv pages to swap count for each of the
91  *			  proc's collectives.
92  *		- If seg is anon, shared, and amp->swresv != 0
93  *			- For address ranges in amp not previously visited for
94  *			  each of the proc's collectives, add size of address
95  *			  range to the swap count for each collective.
96  *
97  * These two calculations are done simultaneously, with most of the work
98  * being done in vmu_calculate_seg().  The results of the calculation are
99  * copied into "vmu_data.vmu_cache_results".
100  *
101  * To perform the calculation, various things are tracked and cached:
102  *
103  *    - incore/not-incore page ranges for all vnodes.
104  *	(vmu_data.vmu_all_vnodes_hash)
105  *	This eliminates looking up the same page more than once.
106  *
107  *    - incore/not-incore page ranges for all shared amps.
108  *	(vmu_data.vmu_all_amps_hash)
109  *	This eliminates looking up the same page more than once.
110  *
111  *    - visited page ranges for each collective.
112  *	   - per vnode (entity->vme_vnode_hash)
113  *	   - per shared amp (entity->vme_amp_hash)
114  *	For accurate counting of map-shared and cow-shared pages.
115  *
116  *    - visited private anons (refcnt > 1) for each collective.
117  *	(entity->vme_anon_hash)
118  *	For accurate counting of cow-shared pages.
119  *
120  * The common accounting structure is the vmu_entity_t, which represents
121  * collectives:
122  *
123  *    - A zone.
124  *    - A project, task, or user within a zone.
125  *    - The entire system (vmu_data.vmu_system).
126  *    - Each collapsed (col) project and user.  This means a given projid or
127  *	uid, regardless of which zone the process is in.  For instance,
128  *      project 0 in the global zone and project 0 in a non global zone are
129  *	the same collapsed project.
130  *
131  *  Each entity structure tracks which pages have been already visited for
132  *  that entity (via previously inspected processes) so that these pages are
133  *  not double counted.
134  */
135 
136 #include <sys/errno.h>
137 #include <sys/types.h>
138 #include <sys/zone.h>
139 #include <sys/proc.h>
140 #include <sys/project.h>
141 #include <sys/task.h>
142 #include <sys/thread.h>
143 #include <sys/time.h>
144 #include <sys/mman.h>
145 #include <sys/modhash.h>
146 #include <sys/modhash_impl.h>
147 #include <sys/shm.h>
148 #include <sys/swap.h>
149 #include <sys/synch.h>
150 #include <sys/systm.h>
151 #include <sys/var.h>
152 #include <sys/vm_usage.h>
153 #include <sys/zone.h>
154 #include <sys/sunddi.h>
155 #include <vm/anon.h>
156 #include <vm/as.h>
157 #include <vm/seg_vn.h>
158 #include <vm/seg_spt.h>
159 
/* Size argument passed to every mod_hash_create_*() call in this file. */
#define	VMUSAGE_HASH_SIZE		512

/*
 * Kinds of vm object that can be tracked.
 * NOTE(review): presumably the values stored in vmu_object_t.vmo_type;
 * the callers that pass them are not in this part of the file.
 */
#define	VMUSAGE_TYPE_VNODE		1
#define	VMUSAGE_TYPE_AMP		2
#define	VMUSAGE_TYPE_ANON		3

/* Residency state recorded in vmu_bound_t.vmb_type. */
#define	VMUSAGE_BOUND_UNKNOWN		0
#define	VMUSAGE_BOUND_INCORE		1
#define	VMUSAGE_BOUND_NOT_INCORE	2
169 
170 /*
171  * bounds for vnodes and shared amps
172  * Each bound is either entirely incore, entirely not in core, or
173  * entirely unknown.  bounds are stored in order by offset.
174  */
175 typedef struct vmu_bound {
176 	struct  vmu_bound *vmb_next;
177 	pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
178 	pgcnt_t	vmb_end;    /* page offset in vnode/amp on which bound ends */
179 	char	vmb_type;   /* One of VMUSAGE_BOUND_* */
180 } vmu_bound_t;
181 
182 /*
183  * hash of visited objects (vnodes or shared amps)
184  * key is address of vnode or amp.  Bounds lists known incore/non-incore
185  * bounds for vnode/amp.
186  */
187 typedef struct vmu_object {
188 	struct vmu_object	*vmo_next;	/* free list */
189 	caddr_t		vmo_key;
190 	short		vmo_type;
191 	vmu_bound_t	*vmo_bounds;
192 } vmu_object_t;
193 
194 /*
195  * Entity by which to count results.
196  *
197  * The entity structure keeps the current rss/swap counts for each entity
198  * (zone, project, etc), and hashes of vm structures that have already
199  * been visited for the entity.
200  *
201  * vme_next:	links the list of all entities currently being counted by
202  *		vmu_calculate().
203  *
204  * vme_next_calc: links the list of entities related to the current process
205  *		 being counted by vmu_calculate_proc().
206  *
207  * vmu_calculate_proc() walks all processes.  For each process, it makes a
208  * list of the entities related to that process using vme_next_calc.  This
209  * list changes each time vmu_calculate_proc() is called.
210  *
211  */
212 typedef struct vmu_entity {
213 	struct vmu_entity *vme_next;
214 	struct vmu_entity *vme_next_calc;
215 	mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
216 	mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
217 	mod_hash_t	*vme_anon_hash;	 /* cow anons visited for entity */
218 	vmusage_t	vme_result;	 /* identifies entity and results */
219 } vmu_entity_t;
220 
221 /*
222  * Hash of entities visited within a zone, and an entity for the zone
223  * itself.
224  */
225 typedef struct vmu_zone {
226 	struct vmu_zone	*vmz_next;	/* free list */
227 	id_t		vmz_id;
228 	vmu_entity_t	*vmz_zone;
229 	mod_hash_t	*vmz_projects_hash;
230 	mod_hash_t	*vmz_tasks_hash;
231 	mod_hash_t	*vmz_rusers_hash;
232 	mod_hash_t	*vmz_eusers_hash;
233 } vmu_zone_t;
234 
235 /*
236  * Cache of results from last calculation
237  */
238 typedef struct vmu_cache {
239 	vmusage_t	*vmc_results;	/* Results from last call to */
240 					/* vm_getusage(). */
241 	uint64_t	vmc_nresults;	/* Count of cached results */
242 	uint64_t	vmc_refcnt;	/* refcnt for free */
243 	uint_t		vmc_flags;	/* Flags for vm_getusage() */
244 	hrtime_t	vmc_timestamp;	/* when cache was created */
245 } vmu_cache_t;
246 
247 /*
248  * top level rss info for the system
249  */
250 typedef struct vmu_data {
251 	kmutex_t	vmu_lock;		/* Protects vmu_data */
252 	kcondvar_t	vmu_cv;			/* Used to signal threads */
253 						/* Waiting for */
254 						/* Rss_calc_thread to finish */
255 	vmu_entity_t	*vmu_system;		/* Entity for tracking */
256 						/* rss/swap for all processes */
257 						/* in all zones */
258 	mod_hash_t	*vmu_zones_hash;	/* Zones visited */
259 	mod_hash_t	*vmu_projects_col_hash; /* These *_col_hash hashes */
260 	mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
261 	mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
262 						/* to implement VMUSAGE_COL_* */
263 						/* flags, which aggregate by */
264 						/* project or user regardless */
265 						/* of zoneid. */
266 	mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
267 						/* to track incore/not-incore */
268 	mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
269 						/* amps to track incore/not- */
270 						/* incore */
271 	vmu_entity_t	*vmu_entities;		/* Linked list of entities */
272 	size_t		vmu_nentities;		/* Count of entities in list */
273 	vmu_cache_t	*vmu_cache;		/* Cached results */
274 	kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
275 						/* vmu_calculate() */
276 	uint_t		vmu_calc_flags;		/* Flags being using by */
277 						/* currently running calc */
278 						/* thread */
279 	uint_t		vmu_pending_flags;	/* Flags of vm_getusage() */
280 						/* threads waiting for */
281 						/* calc thread to finish */
282 	uint_t		vmu_pending_waiters;	/* Number of threads waiting */
283 						/* for calc thread */
284 	vmu_bound_t	*vmu_free_bounds;
285 	vmu_object_t	*vmu_free_objects;
286 	vmu_entity_t	*vmu_free_entities;
287 	vmu_zone_t	*vmu_free_zones;
288 } vmu_data_t;
289 
/* Objects defined elsewhere in the kernel that this file refers to. */
extern struct as kas;
extern proc_t *practive;
extern zone_t *global_zone;
extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

/* All vmusage state; initialized by vm_usage_init(). */
static vmu_data_t vmu_data;
/* kmem caches that vmu_bound_t and vmu_object_t structures come from. */
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;
299 
300 /*
301  * Save a bound on the free list
302  */
303 static void
304 vmu_free_bound(vmu_bound_t *bound)
305 {
306 	bound->vmb_next = vmu_data.vmu_free_bounds;
307 	vmu_data.vmu_free_bounds = bound;
308 }
309 
310 /*
311  * Free an object, and all visited bound info.
312  */
313 static void
314 vmu_free_object(mod_hash_val_t val)
315 {
316 	vmu_object_t *obj = (vmu_object_t *)val;
317 	vmu_bound_t *bound = obj->vmo_bounds;
318 	vmu_bound_t *tmp;
319 
320 	while (bound != NULL) {
321 		tmp = bound;
322 		bound = bound->vmb_next;
323 		vmu_free_bound(tmp);
324 	}
325 	obj->vmo_next = vmu_data.vmu_free_objects;
326 	vmu_data.vmu_free_objects = obj;
327 }
328 
329 /*
330  * Free an entity, and hashes of visited objects for that entity.
331  */
332 static void
333 vmu_free_entity(mod_hash_val_t val)
334 {
335 	vmu_entity_t *entity = (vmu_entity_t *)val;
336 
337 	if (entity->vme_vnode_hash != NULL)
338 		i_mod_hash_clear_nosync(entity->vme_vnode_hash);
339 	if (entity->vme_amp_hash != NULL)
340 		i_mod_hash_clear_nosync(entity->vme_amp_hash);
341 	if (entity->vme_anon_hash != NULL)
342 		i_mod_hash_clear_nosync(entity->vme_anon_hash);
343 
344 	entity->vme_next = vmu_data.vmu_free_entities;
345 	vmu_data.vmu_free_entities = entity;
346 }
347 
348 /*
349  * Free zone entity, and all hashes of entities inside that zone,
350  * which are projects, tasks, and users.
351  */
352 static void
353 vmu_free_zone(mod_hash_val_t val)
354 {
355 	vmu_zone_t *zone = (vmu_zone_t *)val;
356 
357 	if (zone->vmz_zone != NULL) {
358 		vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
359 		zone->vmz_zone = NULL;
360 	}
361 	if (zone->vmz_projects_hash != NULL)
362 		i_mod_hash_clear_nosync(zone->vmz_projects_hash);
363 	if (zone->vmz_tasks_hash != NULL)
364 		i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
365 	if (zone->vmz_rusers_hash != NULL)
366 		i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
367 	if (zone->vmz_eusers_hash != NULL)
368 		i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
369 	zone->vmz_next = vmu_data.vmu_free_zones;
370 	vmu_data.vmu_free_zones = zone;
371 }
372 
373 /*
374  * Initialize synchronization primitives and hashes for system-wide tracking
375  * of visited vnodes and shared amps.  Initialize results cache.
376  */
377 void
378 vm_usage_init()
379 {
380 	mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
381 	cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
382 
383 	vmu_data.vmu_system = NULL;
384 	vmu_data.vmu_zones_hash = NULL;
385 	vmu_data.vmu_projects_col_hash = NULL;
386 	vmu_data.vmu_rusers_col_hash = NULL;
387 	vmu_data.vmu_eusers_col_hash = NULL;
388 
389 	vmu_data.vmu_free_bounds = NULL;
390 	vmu_data.vmu_free_objects = NULL;
391 	vmu_data.vmu_free_entities = NULL;
392 	vmu_data.vmu_free_zones = NULL;
393 
394 	vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
395 	    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
396 	    sizeof (vnode_t));
397 	vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
398 	    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
399 	    sizeof (struct anon_map));
400 	vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
401 	    "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
402 	    vmu_free_entity);
403 	vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
404 	    "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
405 	    vmu_free_entity);
406 	vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
407 	    "vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
408 	    vmu_free_entity);
409 	vmu_data.vmu_zones_hash = mod_hash_create_idhash(
410 	    "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
411 
412 	vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
413 	    sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
414 	vmu_object_cache = kmem_cache_create("vmu_object_cache",
415 	    sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
416 
417 	vmu_data.vmu_entities = NULL;
418 	vmu_data.vmu_nentities = 0;
419 
420 	vmu_data.vmu_cache = NULL;
421 	vmu_data.vmu_calc_thread = NULL;
422 	vmu_data.vmu_calc_flags = 0;
423 	vmu_data.vmu_pending_flags = 0;
424 	vmu_data.vmu_pending_waiters = 0;
425 }
426 
427 /*
428  * Allocate hashes for tracking vm objects visited for an entity.
429  * Update list of entities.
430  */
431 static vmu_entity_t *
432 vmu_alloc_entity(id_t id, int type, id_t zoneid)
433 {
434 	vmu_entity_t *entity;
435 
436 	if (vmu_data.vmu_free_entities != NULL) {
437 		entity = vmu_data.vmu_free_entities;
438 		vmu_data.vmu_free_entities =
439 		    vmu_data.vmu_free_entities->vme_next;
440 		bzero(&entity->vme_result, sizeof (vmusage_t));
441 	} else {
442 		entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
443 	}
444 	entity->vme_result.vmu_id = id;
445 	entity->vme_result.vmu_zoneid = zoneid;
446 	entity->vme_result.vmu_type = type;
447 
448 	if (entity->vme_vnode_hash == NULL)
449 		entity->vme_vnode_hash = mod_hash_create_ptrhash(
450 		    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
451 		    sizeof (vnode_t));
452 
453 	if (entity->vme_amp_hash == NULL)
454 		entity->vme_amp_hash = mod_hash_create_ptrhash(
455 		    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
456 		    sizeof (struct anon_map));
457 
458 	if (entity->vme_anon_hash == NULL)
459 		entity->vme_anon_hash = mod_hash_create_ptrhash(
460 		    "vmusage anon hash", VMUSAGE_HASH_SIZE,
461 		    mod_hash_null_valdtor, sizeof (struct anon));
462 
463 	entity->vme_next = vmu_data.vmu_entities;
464 	vmu_data.vmu_entities = entity;
465 	vmu_data.vmu_nentities++;
466 
467 	return (entity);
468 }
469 
470 /*
471  * Allocate a zone entity, and hashes for tracking visited vm objects
472  * for projects, tasks, and users within that zone.
473  */
474 static vmu_zone_t *
475 vmu_alloc_zone(id_t id)
476 {
477 	vmu_zone_t *zone;
478 
479 	if (vmu_data.vmu_free_zones != NULL) {
480 		zone = vmu_data.vmu_free_zones;
481 		vmu_data.vmu_free_zones =
482 		    vmu_data.vmu_free_zones->vmz_next;
483 		zone->vmz_next = NULL;
484 		zone->vmz_zone = NULL;
485 	} else {
486 		zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
487 	}
488 
489 	zone->vmz_id = id;
490 
491 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
492 		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
493 
494 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
495 	    VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
496 		zone->vmz_projects_hash = mod_hash_create_idhash(
497 		    "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
498 
499 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
500 	    != 0 && zone->vmz_tasks_hash == NULL)
501 		zone->vmz_tasks_hash = mod_hash_create_idhash(
502 		    "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
503 
504 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
505 	    != 0 && zone->vmz_rusers_hash == NULL)
506 		zone->vmz_rusers_hash = mod_hash_create_idhash(
507 		    "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
508 
509 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
510 	    != 0 && zone->vmz_eusers_hash == NULL)
511 		zone->vmz_eusers_hash = mod_hash_create_idhash(
512 		    "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
513 
514 	return (zone);
515 }
516 
517 /*
518  * Allocate a structure for tracking visited bounds for a vm object.
519  */
520 static vmu_object_t *
521 vmu_alloc_object(caddr_t key, int type)
522 {
523 	vmu_object_t *object;
524 
525 	if (vmu_data.vmu_free_objects != NULL) {
526 		object = vmu_data.vmu_free_objects;
527 		vmu_data.vmu_free_objects =
528 		    vmu_data.vmu_free_objects->vmo_next;
529 	} else {
530 		object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
531 	}
532 
533 	object->vmo_key = key;
534 	object->vmo_type = type;
535 	object->vmo_bounds = NULL;
536 
537 	return (object);
538 }
539 
540 /*
541  * Allocate and return a bound structure.
542  */
543 static vmu_bound_t *
544 vmu_alloc_bound()
545 {
546 	vmu_bound_t *bound;
547 
548 	if (vmu_data.vmu_free_bounds != NULL) {
549 		bound = vmu_data.vmu_free_bounds;
550 		vmu_data.vmu_free_bounds =
551 		    vmu_data.vmu_free_bounds->vmb_next;
552 		bzero(bound, sizeof (vmu_bound_t));
553 	} else {
554 		bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
555 		bzero(bound, sizeof (vmu_bound_t));
556 	}
557 	return (bound);
558 }
559 
560 /*
561  * vmu_find_insert_* functions implement hash lookup or allocate and
562  * insert operations.
563  */
564 static vmu_object_t *
565 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
566 {
567 	int ret;
568 	vmu_object_t *object;
569 
570 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
571 	    (mod_hash_val_t *)&object);
572 	if (ret != 0) {
573 		object = vmu_alloc_object(key, type);
574 		ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
575 		    (mod_hash_val_t)object, (mod_hash_hndl_t)0);
576 		ASSERT(ret == 0);
577 	}
578 	return (object);
579 }
580 
581 static int
582 vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
583 {
584 	int ret;
585 	caddr_t val;
586 
587 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
588 	    (mod_hash_val_t *)&val);
589 
590 	if (ret == 0)
591 		return (0);
592 
593 	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
594 	    (mod_hash_val_t)key, (mod_hash_hndl_t)0);
595 
596 	ASSERT(ret == 0);
597 
598 	return (1);
599 }
600 
601 static vmu_entity_t *
602 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
603 {
604 	int ret;
605 	vmu_entity_t *entity;
606 
607 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
608 	    (mod_hash_val_t *)&entity);
609 	if (ret != 0) {
610 		entity = vmu_alloc_entity(id, type, zoneid);
611 		ret = i_mod_hash_insert_nosync(hash,
612 		    (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
613 		    (mod_hash_hndl_t)0);
614 		ASSERT(ret == 0);
615 	}
616 	return (entity);
617 }
618 
619 
620 
621 
622 /*
623  * Returns list of object bounds between start and end.  New bounds inserted
624  * by this call are given type.
625  *
626  * Returns the number of pages covered if new bounds are created.  Returns 0
627  * if region between start/end consists of all existing bounds.
628  */
629 static pgcnt_t
630 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
631     end, char type, vmu_bound_t **first, vmu_bound_t **last)
632 {
633 	vmu_bound_t *next;
634 	vmu_bound_t *prev = NULL;
635 	vmu_bound_t *tmp = NULL;
636 	pgcnt_t ret = 0;
637 
638 	*first = *last = NULL;
639 
640 	for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) {
641 		/*
642 		 * Find bounds overlapping or overlapped by range [start,end].
643 		 */
644 		if (start > next->vmb_end) {
645 			/* bound is before new bound */
646 			prev = next;
647 			continue;
648 		}
649 		if (next->vmb_start > end) {
650 			/* bound is after new bound */
651 			break;
652 		}
653 		if (*first == NULL)
654 			*first = next;
655 		*last = next;
656 	}
657 
658 	if (*first == NULL) {
659 		ASSERT(*last == NULL);
660 		/*
661 		 * No bounds overlapping range [start,end], so create new
662 		 * bound
663 		 */
664 		tmp = vmu_alloc_bound();
665 		tmp->vmb_start = start;
666 		tmp->vmb_end = end;
667 		tmp->vmb_type = type;
668 		if (prev == NULL) {
669 			tmp->vmb_next = ro->vmo_bounds;
670 			ro->vmo_bounds = tmp;
671 		} else {
672 			tmp->vmb_next = prev->vmb_next;
673 			prev->vmb_next = tmp;
674 		}
675 		*first = tmp;
676 		*last = tmp;
677 		ASSERT(tmp->vmb_end >= tmp->vmb_start);
678 		ret = tmp->vmb_end - tmp->vmb_start + 1;
679 		return (ret);
680 	}
681 
682 	/* Check to see if start is before first known bound */
683 	ASSERT(first != NULL && last != NULL);
684 	next = (*first);
685 	if (start < (*first)->vmb_start) {
686 		/* Create new bound before first bound */
687 		tmp = vmu_alloc_bound();
688 		tmp->vmb_start = start;
689 		tmp->vmb_end = (*first)->vmb_start - 1;
690 		tmp->vmb_type = type;
691 		tmp->vmb_next = *first;
692 		if (*first == ro->vmo_bounds)
693 			ro->vmo_bounds = tmp;
694 		if (prev != NULL)
695 			prev->vmb_next = tmp;
696 		ASSERT(tmp->vmb_end >= tmp->vmb_start);
697 		ret += tmp->vmb_end - tmp->vmb_start + 1;
698 		*first = tmp;
699 	}
700 	/*
701 	 * Between start and end, search for gaps between and after existing
702 	 * bounds.  Create new bounds to fill gaps if they exist.
703 	 */
704 	while (end > next->vmb_end) {
705 		/*
706 		 * Check for gap between bound and next bound. if no gap,
707 		 * continue.
708 		 */
709 		if ((next != *last) &&
710 		    ((next->vmb_end + 1) == next->vmb_next->vmb_start)) {
711 			next = next->vmb_next;
712 			continue;
713 		}
714 		/*
715 		 * Insert new bound in gap after bound, and before next
716 		 * bound if next bound exists.
717 		 */
718 		tmp = vmu_alloc_bound();
719 		tmp->vmb_type = type;
720 		tmp->vmb_next = next->vmb_next;
721 		tmp->vmb_start = next->vmb_end + 1;
722 
723 		if (next != *last) {
724 			tmp->vmb_end = next->vmb_next->vmb_start - 1;
725 			ASSERT(tmp->vmb_end >= tmp->vmb_start);
726 			ret += tmp->vmb_end - tmp->vmb_start + 1;
727 			next->vmb_next = tmp;
728 			next = tmp->vmb_next;
729 		} else {
730 			tmp->vmb_end = end;
731 			ASSERT(tmp->vmb_end >= tmp->vmb_start);
732 			ret += tmp->vmb_end - tmp->vmb_start + 1;
733 			next->vmb_next = tmp;
734 			*last = tmp;
735 			break;
736 		}
737 	}
738 	return (ret);
739 }
740 
741 /*
742  * vmu_update_bounds()
743  *
744  * first, last:	list of continuous bounds, of which zero or more are of
745  * 		type VMUSAGE_BOUND_UNKNOWN.
746  *
747  * new_first, new_last:	list of continuous bounds, of which none are of
748  *			type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
749  *			update the types of bounds in (first,last) with
750  *			type VMUSAGE_BOUND_UNKNOWN.
751  *
752  * For the list of bounds (first,last), this function updates any bounds
753  * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
754  * the list (new_first, new_last).
755  *
756  * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
757  * (new_first, new_last), it will be split into multiple bounds.
758  *
759  * Return value:
760  * 	The number of pages in the list of bounds (first,last) that were of
761  *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
762  *	VMUSAGE_BOUND_INCORE.
763  *
764  */
765 static pgcnt_t
766 vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last,
767     vmu_bound_t *new_first, vmu_bound_t *new_last)
768 {
769 	vmu_bound_t *next, *new_next, *tmp;
770 	pgcnt_t rss = 0;
771 
772 	next = *first;
773 	new_next = new_first;
774 
775 	/*
776 	 * Verify first and last bound are covered by new bounds if they
777 	 * have unknown type.
778 	 */
779 	ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
780 	    (*first)->vmb_start >= new_next->vmb_start);
781 	ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
782 	    (*last)->vmb_end <= new_last->vmb_end);
783 	for (;;) {
784 		/* If bound already has type, proceed to next bound */
785 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
786 			if (next == *last)
787 				break;
788 			next = next->vmb_next;
789 			continue;
790 		}
791 		while (new_next->vmb_end < next->vmb_start)
792 			new_next = new_next->vmb_next;
793 		ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
794 		next->vmb_type = new_next->vmb_type;
795 		if (new_next->vmb_end < next->vmb_end) {
796 			/* need to split bound */
797 			tmp = vmu_alloc_bound();
798 			tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
799 			tmp->vmb_start = new_next->vmb_end + 1;
800 			tmp->vmb_end = next->vmb_end;
801 			tmp->vmb_next = next->vmb_next;
802 			next->vmb_end = new_next->vmb_end;
803 			next->vmb_next = tmp;
804 			if (*last == next)
805 				*last = tmp;
806 			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
807 				rss += next->vmb_end - next->vmb_start + 1;
808 			next = tmp;
809 		} else {
810 			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
811 				rss += next->vmb_end - next->vmb_start + 1;
812 			if (next == *last)
813 				break;
814 			next = next->vmb_next;
815 		}
816 	}
817 	return (rss);
818 }
819 
820 /*
821  * merges adjacent bounds with same type between first and last bound.
822  * After merge, last pointer is no longer valid, as last bound may be
823  * merged away.
824  */
825 static void
826 vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last)
827 {
828 	vmu_bound_t *next;
829 	vmu_bound_t *tmp;
830 
831 	ASSERT(*first != NULL);
832 	ASSERT(*last != NULL);
833 
834 	next = *first;
835 	while (next != *last) {
836 
837 		/* If bounds are adjacent and have same type, merge them */
838 		if (((next->vmb_end + 1) == next->vmb_next->vmb_start) &&
839 		    (next->vmb_type == next->vmb_next->vmb_type)) {
840 			tmp = next->vmb_next;
841 			next->vmb_end = tmp->vmb_end;
842 			next->vmb_next = tmp->vmb_next;
843 			vmu_free_bound(tmp);
844 			if (tmp == *last)
845 				*last = next;
846 		} else {
847 			next = next->vmb_next;
848 		}
849 	}
850 }
851 
852 /*
853  * Given an amp and a list of bounds, updates each bound's type with
854  * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
855  *
856  * If a bound is partially incore, it will be split into two bounds.
857  * first and last may be modified, as bounds may be split into multiple
858  * bounds if the are partially incore/not-incore.
859  *
860  * Set incore to non-zero if bounds are already known to be incore
861  *
862  */
863 static void
864 vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first,
865     vmu_bound_t **last, boolean_t incore)
866 {
867 	vmu_bound_t *next;
868 	vmu_bound_t *tmp;
869 	pgcnt_t index;
870 	short bound_type;
871 	short page_type;
872 	vnode_t *vn;
873 	anoff_t off;
874 	struct anon *ap;
875 
876 	next = *first;
877 	/* Shared anon slots don't change once set */
878 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
879 	for (;;) {
880 		if (incore == B_TRUE)
881 			next->vmb_type = VMUSAGE_BOUND_INCORE;
882 
883 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
884 			if (next == *last)
885 				break;
886 			next = next->vmb_next;
887 			continue;
888 		}
889 		bound_type = next->vmb_type;
890 		index = next->vmb_start;
891 		while (index <= next->vmb_end) {
892 
893 			/*
894 			 * These are used to determine how much to increment
895 			 * index when a large page is found.
896 			 */
897 			page_t *page;
898 			pgcnt_t pgcnt = 1;
899 			uint_t pgshft;
900 			pgcnt_t pgmsk;
901 
902 			ap = anon_get_ptr(amp->ahp, index);
903 			if (ap != NULL)
904 				swap_xlate(ap, &vn, &off);
905 
906 			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
907 			    (page = page_exists(vn, off)) != NULL) {
908 				page_type = VMUSAGE_BOUND_INCORE;
909 				if (page->p_szc > 0) {
910 					pgcnt = page_get_pagecnt(page->p_szc);
911 					pgshft = page_get_shift(page->p_szc);
912 					pgmsk = (0x1 << (pgshft - PAGESHIFT))
913 					    - 1;
914 				}
915 			} else {
916 				page_type = VMUSAGE_BOUND_NOT_INCORE;
917 			}
918 			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
919 				next->vmb_type = page_type;
920 			} else if (next->vmb_type != page_type) {
921 				/*
922 				 * if current bound type does not match page
923 				 * type, need to split off new bound.
924 				 */
925 				tmp = vmu_alloc_bound();
926 				tmp->vmb_type = page_type;
927 				tmp->vmb_start = index;
928 				tmp->vmb_end = next->vmb_end;
929 				tmp->vmb_next = next->vmb_next;
930 				next->vmb_end = index - 1;
931 				next->vmb_next = tmp;
932 				if (*last == next)
933 					*last = tmp;
934 				next = tmp;
935 			}
936 			if (pgcnt > 1) {
937 				/*
938 				 * If inside large page, jump to next large
939 				 * page
940 				 */
941 				index = (index & ~pgmsk) + pgcnt;
942 			} else {
943 				index++;
944 			}
945 		}
946 		if (next == *last) {
947 			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
948 			break;
949 		} else
950 			next = next->vmb_next;
951 	}
952 	ANON_LOCK_EXIT(&amp->a_rwlock);
953 }
954 
955 /*
956  * Same as vmu_amp_update_incore_bounds(), except for tracking
957  * incore-/not-incore for vnodes.
958  */
959 static void
960 vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first,
961     vmu_bound_t **last)
962 {
963 	vmu_bound_t *next;
964 	vmu_bound_t *tmp;
965 	pgcnt_t index;
966 	short bound_type;
967 	short page_type;
968 
969 	next = *first;
970 	for (;;) {
971 		if (vnode->v_pages == NULL)
972 			next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
973 
974 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
975 			if (next == *last)
976 				break;
977 			next = next->vmb_next;
978 			continue;
979 		}
980 
981 		bound_type = next->vmb_type;
982 		index = next->vmb_start;
983 		while (index <= next->vmb_end) {
984 
985 			/*
986 			 * These are used to determine how much to increment
987 			 * index when a large page is found.
988 			 */
989 			page_t *page;
990 			pgcnt_t pgcnt = 1;
991 			uint_t pgshft;
992 			pgcnt_t pgmsk;
993 
994 			if (vnode->v_pages != NULL &&
995 			    (page = page_exists(vnode, ptob(index))) != NULL) {
996 				page_type = VMUSAGE_BOUND_INCORE;
997 				if (page->p_szc > 0) {
998 					pgcnt = page_get_pagecnt(page->p_szc);
999 					pgshft = page_get_shift(page->p_szc);
1000 					pgmsk = (0x1 << (pgshft - PAGESHIFT))
1001 					    - 1;
1002 				}
1003 			} else {
1004 				page_type = VMUSAGE_BOUND_NOT_INCORE;
1005 			}
1006 			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1007 				next->vmb_type = page_type;
1008 			} else if (next->vmb_type != page_type) {
1009 				/*
1010 				 * if current bound type does not match page
1011 				 * type, need to split off new bound.
1012 				 */
1013 				tmp = vmu_alloc_bound();
1014 				tmp->vmb_type = page_type;
1015 				tmp->vmb_start = index;
1016 				tmp->vmb_end = next->vmb_end;
1017 				tmp->vmb_next = next->vmb_next;
1018 				next->vmb_end = index - 1;
1019 				next->vmb_next = tmp;
1020 				if (*last == next)
1021 					*last = tmp;
1022 				next = tmp;
1023 			}
1024 			if (pgcnt > 1) {
1025 				/*
1026 				 * If inside large page, jump to next large
1027 				 * page
1028 				 */
1029 				index = (index & ~pgmsk) + pgcnt;
1030 			} else {
1031 				index++;
1032 			}
1033 		}
1034 		if (next == *last) {
1035 			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1036 			break;
1037 		} else
1038 			next = next->vmb_next;
1039 	}
1040 }
1041 
1042 /*
1043  * Calculate the rss and swap consumed by a segment.  vmu_entities is the
1044  * list of entities to visit.  For shared segments, the vnode or amp
1045  * is looked up in each entity to see if has been already counted.  Private
1046  * anon pages are checked per entity to ensure that cow pages are not
1047  * double counted.
1048  *
1049  * For private mapped files, first the amp is checked for private pages.
1050  * Bounds not backed by the amp are looked up in the vnode for each entity
1051  * to avoid double counting of private COW vnode pages.
1052  */
1053 static void
1054 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1055 {
1056 	struct segvn_data *svd;
1057 	struct shm_data *shmd;
1058 	struct spt_data *sptd;
1059 	vmu_object_t *shared_object = NULL;
1060 	vmu_object_t *entity_object = NULL;
1061 	vmu_entity_t *entity;
1062 	vmusage_t *result;
1063 	vmu_bound_t *first = NULL;
1064 	vmu_bound_t *last = NULL;
1065 	vmu_bound_t *cur = NULL;
1066 	vmu_bound_t *e_first = NULL;
1067 	vmu_bound_t *e_last = NULL;
1068 	vmu_bound_t *tmp;
1069 	pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1070 	struct anon_map *private_amp = NULL;
1071 	boolean_t incore = B_FALSE;
1072 	boolean_t shared = B_FALSE;
1073 	int file = 0;
1074 	pgcnt_t swresv = 0;
1075 	pgcnt_t panon = 0;
1076 
1077 	/* Can zero-length segments exist?  Not sure, so parenoia */
1078 	if (seg->s_size <= 0)
1079 		return;
1080 
1081 	/*
1082 	 * Figure out if there is a shared object (such as a named vnode or
1083 	 * a shared amp, then figure out if there is a private amp, which
1084 	 * identifies private pages.
1085 	 */
1086 	if (seg->s_ops == &segvn_ops) {
1087 		svd = (struct segvn_data *)seg->s_data;
1088 		if (svd->type == MAP_SHARED)
1089 			shared = B_TRUE;
1090 		else
1091 			swresv = svd->swresv;
1092 
1093 		if (svd->vp != NULL) {
1094 			file = 1;
1095 			shared_object = vmu_find_insert_object(
1096 			    vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1097 			    VMUSAGE_TYPE_VNODE);
1098 			s_start = btop(svd->offset);
1099 			s_end = btop(svd->offset + seg->s_size) - 1;
1100 		}
1101 		if (svd->amp != NULL && svd->type == MAP_SHARED) {
1102 			ASSERT(shared_object == NULL);
1103 			shared_object = vmu_find_insert_object(
1104 			    vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1105 			    VMUSAGE_TYPE_AMP);
1106 			s_start = svd->anon_index;
1107 			s_end = svd->anon_index + btop(seg->s_size) - 1;
1108 			/* schedctl mappings are always in core */
1109 			if (svd->amp->swresv == 0)
1110 				incore = B_TRUE;
1111 		}
1112 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
1113 		/*
1114 		 * Text replication anon maps can be shared across all zones.
1115 		 * Space used for text replication is typically capped as
1116 		 * small % of memory.  To keep it simple for now we don't
1117 		 * account for swap and memory space used for text replication.
1118 		 */
1119 		if (svd->tr_state == SEGVN_TR_OFF && svd->amp != NULL &&
1120 		    svd->type == MAP_PRIVATE) {
1121 			private_amp = svd->amp;
1122 			p_start = svd->anon_index;
1123 			p_end = svd->anon_index + btop(seg->s_size) - 1;
1124 		}
1125 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1126 	} else if (seg->s_ops == &segspt_shmops) {
1127 		shared = B_TRUE;
1128 		shmd = (struct shm_data *)seg->s_data;
1129 		shared_object = vmu_find_insert_object(
1130 		    vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1131 		    VMUSAGE_TYPE_AMP);
1132 		s_start = 0;
1133 		s_end = btop(seg->s_size) - 1;
1134 		sptd = shmd->shm_sptseg->s_data;
1135 
1136 		/* ism segments are always incore and do not reserve swap */
1137 		if (sptd->spt_flags & SHM_SHARE_MMU)
1138 			incore = B_TRUE;
1139 
1140 	} else {
1141 		return;
1142 	}
1143 
1144 	/*
1145 	 * If there is a private amp, count anon pages that exist.  If an
1146 	 * anon has a refcnt > 1 (cow sharing), then save the anon in a
1147 	 * hash so that it is not double counted.
1148 	 *
1149 	 * If there is also a shared object, they figure out the bounds
1150 	 * which are not mapped by the private amp.
1151 	 */
1152 	if (private_amp != NULL) {
1153 
1154 		/* Enter as writer to prevent cow anons from being freed */
1155 		ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1156 
1157 		p_index = p_start;
1158 		s_index = s_start;
1159 
1160 		while (p_index <= p_end) {
1161 
1162 			pgcnt_t p_index_next;
1163 			pgcnt_t p_bound_size;
1164 			int cnt;
1165 			anoff_t off;
1166 			struct vnode *vn;
1167 			struct anon *ap;
1168 			page_t *page;		/* For handling of large */
1169 			pgcnt_t pgcnt = 1;	/* pages */
1170 			pgcnt_t pgstart;
1171 			pgcnt_t pgend;
1172 			uint_t pgshft;
1173 			pgcnt_t pgmsk;
1174 
1175 			p_index_next = p_index;
1176 			ap = anon_get_next_ptr(private_amp->ahp,
1177 			    &p_index_next);
1178 
1179 			/*
1180 			 * If next anon is past end of mapping, simulate
1181 			 * end of anon so loop terminates.
1182 			 */
1183 			if (p_index_next > p_end) {
1184 				p_index_next = p_end + 1;
1185 				ap = NULL;
1186 			}
1187 			/*
1188 			 * For cow segments, keep track of bounds not
1189 			 * backed by private amp so they can be looked
1190 			 * up in the backing vnode
1191 			 */
1192 			if (p_index_next != p_index) {
1193 
1194 				/*
1195 				 * Compute index difference between anon and
1196 				 * previous anon.
1197 				 */
1198 				p_bound_size = p_index_next - p_index - 1;
1199 
1200 				if (shared_object != NULL) {
1201 					cur = vmu_alloc_bound();
1202 					cur->vmb_next = NULL;
1203 					cur->vmb_start = s_index;
1204 					cur->vmb_end = s_index + p_bound_size;
1205 					cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1206 					if (first == NULL) {
1207 						first = cur;
1208 						last = cur;
1209 					} else {
1210 						last->vmb_next = cur;
1211 						last = cur;
1212 					}
1213 				}
1214 				p_index = p_index + p_bound_size + 1;
1215 				s_index = s_index + p_bound_size + 1;
1216 			}
1217 
1218 			/* Detect end of anons in amp */
1219 			if (ap == NULL)
1220 				break;
1221 
1222 			cnt = ap->an_refcnt;
1223 			swap_xlate(ap, &vn, &off);
1224 
1225 			if (vn == NULL || vn->v_pages == NULL ||
1226 			    (page = page_exists(vn, off)) == NULL) {
1227 				p_index++;
1228 				s_index++;
1229 				continue;
1230 			}
1231 
1232 			/*
1233 			 * If large page is found, compute portion of large
1234 			 * page in mapping, and increment indicies to the next
1235 			 * large page.
1236 			 */
1237 			if (page->p_szc > 0) {
1238 
1239 				pgcnt = page_get_pagecnt(page->p_szc);
1240 				pgshft = page_get_shift(page->p_szc);
1241 				pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
1242 
1243 				/* First page in large page */
1244 				pgstart = p_index & ~pgmsk;
1245 				/* Last page in large page */
1246 				pgend = pgstart + pgcnt - 1;
1247 				/*
1248 				 * Artifically end page if page extends past
1249 				 * end of mapping.
1250 				 */
1251 				if (pgend > p_end)
1252 					pgend = p_end;
1253 
1254 				/*
1255 				 * Compute number of pages from large page
1256 				 * which are mapped.
1257 				 */
1258 				pgcnt = pgend - p_index + 1;
1259 
1260 				/*
1261 				 * Point indicies at page after large page,
1262 				 * or at page after end of mapping.
1263 				 */
1264 				p_index += pgcnt;
1265 				s_index += pgcnt;
1266 			} else {
1267 				p_index++;
1268 				s_index++;
1269 			}
1270 
1271 			/*
1272 			 * Assume anon structs with a refcnt
1273 			 * of 1 are not cow shared, so there
1274 			 * is no reason to track them per entity.
1275 			 */
1276 			if (cnt == 1) {
1277 				panon += pgcnt;
1278 				continue;
1279 			}
1280 			for (entity = vmu_entities; entity != NULL;
1281 			    entity = entity->vme_next_calc) {
1282 
1283 				result = &entity->vme_result;
1284 				/*
1285 				 * Track cow anons per entity so
1286 				 * they are not double counted.
1287 				 */
1288 				if (vmu_find_insert_anon(entity->vme_anon_hash,
1289 				    (caddr_t)ap) == 0)
1290 					continue;
1291 
1292 				result->vmu_rss_all += (pgcnt << PAGESHIFT);
1293 				result->vmu_rss_private +=
1294 				    (pgcnt << PAGESHIFT);
1295 			}
1296 		}
1297 		ANON_LOCK_EXIT(&private_amp->a_rwlock);
1298 	}
1299 
1300 	/* Add up resident anon and swap reserved for private mappings */
1301 	if (swresv > 0 || panon > 0) {
1302 		for (entity = vmu_entities; entity != NULL;
1303 		    entity = entity->vme_next_calc) {
1304 			result = &entity->vme_result;
1305 			result->vmu_swap_all += swresv;
1306 			result->vmu_swap_private += swresv;
1307 			result->vmu_rss_all += (panon << PAGESHIFT);
1308 			result->vmu_rss_private += (panon << PAGESHIFT);
1309 		}
1310 	}
1311 
1312 	/* Compute resident pages backing shared amp or named vnode */
1313 	if (shared_object != NULL) {
1314 		if (first == NULL) {
1315 			/*
1316 			 * No private amp, or private amp has no anon
1317 			 * structs.  This means entire segment is backed by
1318 			 * the shared object.
1319 			 */
1320 			first = vmu_alloc_bound();
1321 			first->vmb_next = NULL;
1322 			first->vmb_start = s_start;
1323 			first->vmb_end = s_end;
1324 			first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1325 		}
1326 		/*
1327 		 * Iterate bounds not backed by private amp, and compute
1328 		 * resident pages.
1329 		 */
1330 		cur = first;
1331 		while (cur != NULL) {
1332 
1333 			if (vmu_insert_lookup_object_bounds(shared_object,
1334 			    cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1335 			    &first, &last) > 0) {
1336 				/* new bounds, find incore/not-incore */
1337 				if (shared_object->vmo_type ==
1338 				    VMUSAGE_TYPE_VNODE)
1339 					vmu_vnode_update_incore_bounds(
1340 					    (vnode_t *)
1341 					    shared_object->vmo_key, &first,
1342 					    &last);
1343 				else
1344 					vmu_amp_update_incore_bounds(
1345 					    (struct anon_map *)
1346 					    shared_object->vmo_key, &first,
1347 					    &last, incore);
1348 				vmu_merge_bounds(&first, &last);
1349 			}
1350 			for (entity = vmu_entities; entity != NULL;
1351 			    entity = entity->vme_next_calc) {
1352 
1353 				result = &entity->vme_result;
1354 
1355 				entity_object = vmu_find_insert_object(
1356 				    shared_object->vmo_type ==
1357 				    VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1358 				    entity->vme_amp_hash,
1359 				    shared_object->vmo_key,
1360 				    shared_object->vmo_type);
1361 
1362 				virt = vmu_insert_lookup_object_bounds(
1363 				    entity_object, cur->vmb_start, cur->vmb_end,
1364 				    VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1365 
1366 				if (virt == 0)
1367 					continue;
1368 				/*
1369 				 * Range visited for this entity
1370 				 */
1371 				rss = vmu_update_bounds(&e_first,
1372 				    &e_last, first, last);
1373 				result->vmu_rss_all += (rss << PAGESHIFT);
1374 				if (shared == B_TRUE && file == B_FALSE) {
1375 					/* shared anon mapping */
1376 					result->vmu_swap_all +=
1377 					    (virt << PAGESHIFT);
1378 					result->vmu_swap_shared +=
1379 					    (virt << PAGESHIFT);
1380 					result->vmu_rss_shared +=
1381 					    (rss << PAGESHIFT);
1382 				} else if (shared == B_TRUE && file == B_TRUE) {
1383 					/* shared file mapping */
1384 					result->vmu_rss_shared +=
1385 					    (rss << PAGESHIFT);
1386 				} else if (shared == B_FALSE &&
1387 				    file == B_TRUE) {
1388 					/* private file mapping */
1389 					result->vmu_rss_private +=
1390 					    (rss << PAGESHIFT);
1391 				}
1392 				vmu_merge_bounds(&e_first, &e_last);
1393 			}
1394 			tmp = cur;
1395 			cur = cur->vmb_next;
1396 			vmu_free_bound(tmp);
1397 		}
1398 	}
1399 }
1400 
1401 /*
1402  * Based on the current calculation flags, find the relevant entities
1403  * which are relative to the process.  Then calculate each segment
1404  * in the process'es address space for each relevant entity.
1405  */
1406 static void
1407 vmu_calculate_proc(proc_t *p)
1408 {
1409 	vmu_entity_t *entities = NULL;
1410 	vmu_zone_t *zone;
1411 	vmu_entity_t *tmp;
1412 	struct as *as;
1413 	struct seg *seg;
1414 	int ret;
1415 
1416 	/* Figure out which entities are being computed */
1417 	if ((vmu_data.vmu_system) != NULL) {
1418 		tmp = vmu_data.vmu_system;
1419 		tmp->vme_next_calc = entities;
1420 		entities = tmp;
1421 	}
1422 	if (vmu_data.vmu_calc_flags &
1423 	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
1424 	    VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1425 	    VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1426 	    VMUSAGE_ALL_EUSERS)) {
1427 		ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1428 		    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1429 		    (mod_hash_val_t *)&zone);
1430 		if (ret != 0) {
1431 			zone = vmu_alloc_zone(p->p_zone->zone_id);
1432 			ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1433 			    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1434 			    (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1435 			ASSERT(ret == 0);
1436 		}
1437 		if (zone->vmz_zone != NULL) {
1438 			tmp = zone->vmz_zone;
1439 			tmp->vme_next_calc = entities;
1440 			entities = tmp;
1441 		}
1442 		if (vmu_data.vmu_calc_flags &
1443 		    (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1444 			tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1445 			    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1446 			    zone->vmz_id);
1447 			tmp->vme_next_calc = entities;
1448 			entities = tmp;
1449 		}
1450 		if (vmu_data.vmu_calc_flags &
1451 		    (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1452 			tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1453 			    p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1454 			tmp->vme_next_calc = entities;
1455 			entities = tmp;
1456 		}
1457 		if (vmu_data.vmu_calc_flags &
1458 		    (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1459 			tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1460 			    crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1461 			tmp->vme_next_calc = entities;
1462 			entities = tmp;
1463 		}
1464 		if (vmu_data.vmu_calc_flags &
1465 		    (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1466 			tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1467 			    crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1468 			tmp->vme_next_calc = entities;
1469 			entities = tmp;
1470 		}
1471 	}
1472 	/* Entities which collapse projects and users for all zones */
1473 	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1474 		tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1475 		    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1476 		tmp->vme_next_calc = entities;
1477 		entities = tmp;
1478 	}
1479 	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1480 		tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1481 		    crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1482 		tmp->vme_next_calc = entities;
1483 		entities = tmp;
1484 	}
1485 	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1486 		tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1487 		    crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1488 		tmp->vme_next_calc = entities;
1489 		entities = tmp;
1490 	}
1491 
1492 	ASSERT(entities != NULL);
1493 	/* process all segs in process's address space */
1494 	as = p->p_as;
1495 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1496 	for (seg = AS_SEGFIRST(as); seg != NULL;
1497 	    seg = AS_SEGNEXT(as, seg)) {
1498 		vmu_calculate_seg(entities, seg);
1499 	}
1500 	AS_LOCK_EXIT(as, &as->a_lock);
1501 }
1502 
1503 /*
1504  * Free data created by previous call to vmu_calculate().
1505  */
1506 static void
1507 vmu_clear_calc()
1508 {
1509 	if (vmu_data.vmu_system != NULL)
1510 		vmu_free_entity(vmu_data.vmu_system);
1511 		vmu_data.vmu_system = NULL;
1512 	if (vmu_data.vmu_zones_hash != NULL)
1513 		i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1514 	if (vmu_data.vmu_projects_col_hash != NULL)
1515 		i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1516 	if (vmu_data.vmu_rusers_col_hash != NULL)
1517 		i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1518 	if (vmu_data.vmu_eusers_col_hash != NULL)
1519 		i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1520 
1521 	i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1522 	i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1523 }
1524 
1525 /*
1526  * Free unused data structures.  These can result if the system workload
1527  * decreases between calculations.
1528  */
1529 static void
1530 vmu_free_extra()
1531 {
1532 	vmu_bound_t *tb;
1533 	vmu_object_t *to;
1534 	vmu_entity_t *te;
1535 	vmu_zone_t *tz;
1536 
1537 	while (vmu_data.vmu_free_bounds != NULL) {
1538 		tb = vmu_data.vmu_free_bounds;
1539 		vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1540 		kmem_cache_free(vmu_bound_cache, tb);
1541 	}
1542 	while (vmu_data.vmu_free_objects != NULL) {
1543 		to = vmu_data.vmu_free_objects;
1544 		vmu_data.vmu_free_objects =
1545 		    vmu_data.vmu_free_objects->vmo_next;
1546 		kmem_cache_free(vmu_object_cache, to);
1547 	}
1548 	while (vmu_data.vmu_free_entities != NULL) {
1549 		te = vmu_data.vmu_free_entities;
1550 		vmu_data.vmu_free_entities =
1551 		    vmu_data.vmu_free_entities->vme_next;
1552 		if (te->vme_vnode_hash != NULL)
1553 			mod_hash_destroy_hash(te->vme_vnode_hash);
1554 		if (te->vme_amp_hash != NULL)
1555 			mod_hash_destroy_hash(te->vme_amp_hash);
1556 		if (te->vme_anon_hash != NULL)
1557 			mod_hash_destroy_hash(te->vme_anon_hash);
1558 		kmem_free(te, sizeof (vmu_entity_t));
1559 	}
1560 	while (vmu_data.vmu_free_zones != NULL) {
1561 		tz = vmu_data.vmu_free_zones;
1562 		vmu_data.vmu_free_zones =
1563 		    vmu_data.vmu_free_zones->vmz_next;
1564 		if (tz->vmz_projects_hash != NULL)
1565 			mod_hash_destroy_hash(tz->vmz_projects_hash);
1566 		if (tz->vmz_tasks_hash != NULL)
1567 			mod_hash_destroy_hash(tz->vmz_tasks_hash);
1568 		if (tz->vmz_rusers_hash != NULL)
1569 			mod_hash_destroy_hash(tz->vmz_rusers_hash);
1570 		if (tz->vmz_eusers_hash != NULL)
1571 			mod_hash_destroy_hash(tz->vmz_eusers_hash);
1572 		kmem_free(tz, sizeof (vmu_zone_t));
1573 	}
1574 }
1575 
1576 extern kcondvar_t *pr_pid_cv;
1577 
1578 /*
1579  * Determine which entity types are relevant and allocate the hashes to
1580  * track them.  Then walk the process table and count rss and swap
1581  * for each process'es address space.  Address space object such as
1582  * vnodes, amps and anons are tracked per entity, so that they are
1583  * not double counted in the results.
1584  *
1585  */
1586 static void
1587 vmu_calculate()
1588 {
1589 	int i = 0;
1590 	int ret;
1591 	proc_t *p;
1592 
1593 	vmu_clear_calc();
1594 
1595 	if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1596 		vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1597 		    ALL_ZONES);
1598 
1599 	/*
1600 	 * Walk process table and calculate rss of each proc.
1601 	 *
1602 	 * Pidlock and p_lock cannot be held while doing the rss calculation.
1603 	 * This is because:
1604 	 *	1.  The calculation allocates using KM_SLEEP.
1605 	 *	2.  The calculation grabs a_lock, which cannot be grabbed
1606 	 *	    after p_lock.
1607 	 *
1608 	 * Since pidlock must be dropped, we cannot simply just walk the
1609 	 * practive list.  Instead, we walk the process table, and sprlock
1610 	 * each process to ensure that it does not exit during the
1611 	 * calculation.
1612 	 */
1613 
1614 	mutex_enter(&pidlock);
1615 	for (i = 0; i < v.v_proc; i++) {
1616 again:
1617 		p = pid_entry(i);
1618 		if (p == NULL)
1619 			continue;
1620 
1621 		mutex_enter(&p->p_lock);
1622 		mutex_exit(&pidlock);
1623 
1624 		if (panicstr) {
1625 			mutex_exit(&p->p_lock);
1626 			return;
1627 		}
1628 
1629 		/* Try to set P_PR_LOCK */
1630 		ret = sprtrylock_proc(p);
1631 		if (ret == -1) {
1632 			/* Process in invalid state */
1633 			mutex_exit(&p->p_lock);
1634 			mutex_enter(&pidlock);
1635 			continue;
1636 		} else if (ret == 1) {
1637 			/*
1638 			 * P_PR_LOCK is already set.  Wait and try again.
1639 			 * This also drops p_lock.
1640 			 */
1641 			sprwaitlock_proc(p);
1642 			mutex_enter(&pidlock);
1643 			goto again;
1644 		}
1645 		mutex_exit(&p->p_lock);
1646 
1647 		vmu_calculate_proc(p);
1648 
1649 		mutex_enter(&p->p_lock);
1650 		sprunlock(p);
1651 		mutex_enter(&pidlock);
1652 	}
1653 	mutex_exit(&pidlock);
1654 
1655 	vmu_free_extra();
1656 }
1657 
1658 /*
1659  * allocate a new cache for N results satisfying flags
1660  */
1661 vmu_cache_t *
1662 vmu_cache_alloc(size_t nres, uint_t flags)
1663 {
1664 	vmu_cache_t *cache;
1665 
1666 	cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1667 	cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1668 	cache->vmc_nresults = nres;
1669 	cache->vmc_flags = flags;
1670 	cache->vmc_refcnt = 1;
1671 	return (cache);
1672 }
1673 
1674 /*
1675  * Make sure cached results are not freed
1676  */
1677 static void
1678 vmu_cache_hold(vmu_cache_t *cache)
1679 {
1680 	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1681 	cache->vmc_refcnt++;
1682 }
1683 
1684 /*
1685  * free cache data
1686  */
1687 static void
1688 vmu_cache_rele(vmu_cache_t *cache)
1689 {
1690 	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1691 	ASSERT(cache->vmc_refcnt > 0);
1692 	cache->vmc_refcnt--;
1693 	if (cache->vmc_refcnt == 0) {
1694 		kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1695 		    cache->vmc_nresults);
1696 		kmem_free(cache, sizeof (vmu_cache_t));
1697 	}
1698 }
1699 
1700 /*
1701  * Copy out the cached results to a caller.  Inspect the callers flags
1702  * and zone to determine which cached results should be copied.
1703  */
1704 static int
1705 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1706     uint_t flags, int cpflg)
1707 {
1708 	vmusage_t *result, *out_result;
1709 	vmusage_t dummy;
1710 	size_t i, count = 0;
1711 	size_t bufsize;
1712 	int ret = 0;
1713 	uint_t types = 0;
1714 
1715 	if (nres != NULL) {
1716 		if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1717 			return (set_errno(EFAULT));
1718 	} else {
1719 		bufsize = 0;
1720 	}
1721 
1722 	/* figure out what results the caller is interested in. */
1723 	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1724 		types |= VMUSAGE_SYSTEM;
1725 	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
1726 		types |= VMUSAGE_ZONE;
1727 	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1728 	    VMUSAGE_COL_PROJECTS))
1729 		types |= VMUSAGE_PROJECTS;
1730 	if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1731 		types |= VMUSAGE_TASKS;
1732 	if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1733 		types |= VMUSAGE_RUSERS;
1734 	if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1735 		types |= VMUSAGE_EUSERS;
1736 
1737 	/* count results for current zone */
1738 	out_result = buf;
1739 	for (result = cache->vmc_results, i = 0;
1740 	    i < cache->vmc_nresults; result++, i++) {
1741 
1742 		/* Do not return "other-zone" results to non-global zones */
1743 		if (curproc->p_zone != global_zone &&
1744 		    curproc->p_zone->zone_id != result->vmu_zoneid)
1745 			continue;
1746 
1747 		/*
1748 		 * If non-global zone requests VMUSAGE_SYSTEM, fake
1749 		 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
1750 		 */
1751 		if (curproc->p_zone != global_zone &&
1752 		    (flags & VMUSAGE_SYSTEM) != 0 &&
1753 		    result->vmu_type == VMUSAGE_ZONE) {
1754 			count++;
1755 			if (out_result != NULL) {
1756 				if (bufsize < count) {
1757 					ret = set_errno(EOVERFLOW);
1758 				} else {
1759 					dummy = *result;
1760 					dummy.vmu_zoneid = ALL_ZONES;
1761 					dummy.vmu_id = 0;
1762 					dummy.vmu_type = VMUSAGE_SYSTEM;
1763 					if (ddi_copyout(&dummy, out_result,
1764 					    sizeof (vmusage_t), cpflg))
1765 						return (set_errno(EFAULT));
1766 					out_result++;
1767 				}
1768 			}
1769 		}
1770 
1771 		/* Skip results that do not match requested type */
1772 		if ((result->vmu_type & types) == 0)
1773 			continue;
1774 
1775 		/* Skip collated results if not requested */
1776 		if (result->vmu_zoneid == ALL_ZONES) {
1777 			if (result->vmu_type == VMUSAGE_PROJECTS &&
1778 			    (flags & VMUSAGE_COL_PROJECTS) == 0)
1779 				continue;
1780 			if (result->vmu_type == VMUSAGE_EUSERS &&
1781 			    (flags & VMUSAGE_COL_EUSERS) == 0)
1782 				continue;
1783 			if (result->vmu_type == VMUSAGE_RUSERS &&
1784 			    (flags & VMUSAGE_COL_RUSERS) == 0)
1785 				continue;
1786 		}
1787 
1788 		/* Skip "other zone" results if not requested */
1789 		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1790 			if (result->vmu_type == VMUSAGE_ZONE &&
1791 			    (flags & VMUSAGE_ALL_ZONES) == 0)
1792 				continue;
1793 			if (result->vmu_type == VMUSAGE_PROJECTS &&
1794 			    (flags & (VMUSAGE_ALL_PROJECTS |
1795 			    VMUSAGE_COL_PROJECTS)) == 0)
1796 				continue;
1797 			if (result->vmu_type == VMUSAGE_TASKS &&
1798 			    (flags & VMUSAGE_ALL_TASKS) == 0)
1799 				continue;
1800 			if (result->vmu_type == VMUSAGE_RUSERS &&
1801 			    (flags & (VMUSAGE_ALL_RUSERS |
1802 			    VMUSAGE_COL_RUSERS)) == 0)
1803 				continue;
1804 			if (result->vmu_type == VMUSAGE_EUSERS &&
1805 			    (flags & (VMUSAGE_ALL_EUSERS |
1806 			    VMUSAGE_COL_EUSERS)) == 0)
1807 				continue;
1808 		}
1809 		count++;
1810 		if (out_result != NULL) {
1811 			if (bufsize < count) {
1812 				ret = set_errno(EOVERFLOW);
1813 			} else {
1814 				if (ddi_copyout(result, out_result,
1815 				    sizeof (vmusage_t), cpflg))
1816 					return (set_errno(EFAULT));
1817 				out_result++;
1818 			}
1819 		}
1820 	}
1821 	if (nres != NULL)
1822 		if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1823 			return (set_errno(EFAULT));
1824 
1825 	return (ret);
1826 }
1827 
1828 /*
1829  * vm_getusage()
1830  *
1831  * Counts rss and swap by zone, project, task, and/or user.  The flags argument
1832  * determines the type of results structures returned.  Flags requesting
1833  * results from more than one zone are "flattened" to the local zone if the
1834  * caller is not the global zone.
1835  *
1836  * args:
1837  *	flags:	bitmap consisting of one or more of VMUSAGE_*.
1838  *	age:	maximum allowable age (time since counting was done) in
1839  *		seconds of the results.  Results from previous callers are
1840  *		cached in kernel.
1841  *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only nres
1842  *		set on success.
1843  *	nres:	Set to number of vmusage_t structures pointed to by buf
1844  *		before calling vm_getusage().
1845  *		On return 0 (success) or ENOSPC, is set to the number of result
1846  *		structures returned or attempted to return.
1847  *
1848  * returns 0 on success, -1 on failure:
1849  *	EINTR (interrupted)
1850  *	ENOSPC (nres to small for results, nres set to needed value for success)
1851  *	EINVAL (flags invalid)
1852  *	EFAULT (bad address for buf or nres)
1853  */
1854 int
1855 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1856 {
1857 	vmu_entity_t *entity;
1858 	vmusage_t *result;
1859 	int ret = 0;
1860 	int cacherecent = 0;
1861 	hrtime_t now;
1862 	uint_t flags_orig;
1863 
1864 	/*
1865 	 * Non-global zones cannot request system wide and/or collated
1866 	 * results, or the system result, so munge the flags accordingly.
1867 	 */
1868 	flags_orig = flags;
1869 	if (curproc->p_zone != global_zone) {
1870 		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1871 			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1872 			flags |= VMUSAGE_PROJECTS;
1873 		}
1874 		if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1875 			flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1876 			flags |= VMUSAGE_RUSERS;
1877 		}
1878 		if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1879 			flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1880 			flags |= VMUSAGE_EUSERS;
1881 		}
1882 		if (flags & VMUSAGE_SYSTEM) {
1883 			flags &= ~VMUSAGE_SYSTEM;
1884 			flags |= VMUSAGE_ZONE;
1885 		}
1886 	}
1887 
1888 	/* Check for unknown flags */
1889 	if ((flags & (~VMUSAGE_MASK)) != 0)
1890 		return (set_errno(EINVAL));
1891 
1892 	/* Check for no flags */
1893 	if ((flags & VMUSAGE_MASK) == 0)
1894 		return (set_errno(EINVAL));
1895 
1896 	mutex_enter(&vmu_data.vmu_lock);
1897 	now = gethrtime();
1898 
1899 start:
1900 	if (vmu_data.vmu_cache != NULL) {
1901 
1902 		vmu_cache_t *cache;
1903 
1904 		if ((vmu_data.vmu_cache->vmc_timestamp +
1905 		    ((hrtime_t)age * NANOSEC)) > now)
1906 			cacherecent = 1;
1907 
1908 		if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
1909 		    cacherecent == 1) {
1910 			cache = vmu_data.vmu_cache;
1911 			vmu_cache_hold(cache);
1912 			mutex_exit(&vmu_data.vmu_lock);
1913 
1914 			ret = vmu_copyout_results(cache, buf, nres, flags_orig,
1915 			    cpflg);
1916 			mutex_enter(&vmu_data.vmu_lock);
1917 			vmu_cache_rele(cache);
1918 			if (vmu_data.vmu_pending_waiters > 0)
1919 				cv_broadcast(&vmu_data.vmu_cv);
1920 			mutex_exit(&vmu_data.vmu_lock);
1921 			return (ret);
1922 		}
1923 		/*
1924 		 * If the cache is recent, it is likely that there are other
1925 		 * consumers of vm_getusage running, so add their flags to the
1926 		 * desired flags for the calculation.
1927 		 */
1928 		if (cacherecent == 1)
1929 			flags = vmu_data.vmu_cache->vmc_flags | flags;
1930 	}
1931 	if (vmu_data.vmu_calc_thread == NULL) {
1932 
1933 		vmu_cache_t *cache;
1934 
1935 		vmu_data.vmu_calc_thread = curthread;
1936 		vmu_data.vmu_calc_flags = flags;
1937 		vmu_data.vmu_entities = NULL;
1938 		vmu_data.vmu_nentities = 0;
1939 		if (vmu_data.vmu_pending_waiters > 0)
1940 			vmu_data.vmu_calc_flags |=
1941 			    vmu_data.vmu_pending_flags;
1942 
1943 		vmu_data.vmu_pending_flags = 0;
1944 		mutex_exit(&vmu_data.vmu_lock);
1945 		vmu_calculate();
1946 		mutex_enter(&vmu_data.vmu_lock);
1947 		/* copy results to cache */
1948 		if (vmu_data.vmu_cache != NULL)
1949 			vmu_cache_rele(vmu_data.vmu_cache);
1950 		cache = vmu_data.vmu_cache =
1951 		    vmu_cache_alloc(vmu_data.vmu_nentities,
1952 		    vmu_data.vmu_calc_flags);
1953 
1954 		result = cache->vmc_results;
1955 		for (entity = vmu_data.vmu_entities; entity != NULL;
1956 		    entity = entity->vme_next) {
1957 			*result = entity->vme_result;
1958 			result++;
1959 		}
1960 		cache->vmc_timestamp = gethrtime();
1961 		vmu_cache_hold(cache);
1962 
1963 		vmu_data.vmu_calc_flags = 0;
1964 		vmu_data.vmu_calc_thread = NULL;
1965 
1966 		if (vmu_data.vmu_pending_waiters > 0)
1967 			cv_broadcast(&vmu_data.vmu_cv);
1968 
1969 		mutex_exit(&vmu_data.vmu_lock);
1970 
1971 		/* copy cache */
1972 		ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
1973 		mutex_enter(&vmu_data.vmu_lock);
1974 		vmu_cache_rele(cache);
1975 		mutex_exit(&vmu_data.vmu_lock);
1976 
1977 		return (ret);
1978 	}
1979 	vmu_data.vmu_pending_flags |= flags;
1980 	vmu_data.vmu_pending_waiters++;
1981 	while (vmu_data.vmu_calc_thread != NULL) {
1982 		if (cv_wait_sig(&vmu_data.vmu_cv,
1983 		    &vmu_data.vmu_lock) == 0) {
1984 			vmu_data.vmu_pending_waiters--;
1985 			mutex_exit(&vmu_data.vmu_lock);
1986 			return (set_errno(EINTR));
1987 		}
1988 	}
1989 	vmu_data.vmu_pending_waiters--;
1990 	goto start;
1991 }
1992