xref: /titanic_50/usr/src/uts/common/vm/vm_usage.c (revision 1e1ddd6cc98ab5af8293f7ebd132be62900730fd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * vm_usage
31  *
32  * This file implements the getvmusage() private system call.
33  * getvmusage() counts the amount of resident memory pages and swap
34  * reserved by the specified process collective. A "process collective" is
35  * the set of processes owned by a particular zone, project, task, or user.
36  *
37  * rss and swap are counted so that for a given process collective, a page is
38  * only counted once.  For example, this means that if multiple processes in
39  * the same project map the same page, then the project will only be charged
40  * once for that page.  On the other hand, if two processes in different
41  * projects map the same page, then both projects will be charged
42  * for the page.
43  *
44  * The vm_getusage() calculation is implemented so that the first thread
45  * performs the rss/swap counting. Other callers will wait for that thread to
46  * finish, copying the results.  This enables multiple rcapds and prstats to
47  * consume data from the same calculation.  The results are also cached so that
48  * a caller interested in recent results can just copy them instead of starting
49  * a new calculation. The caller passes the maximum age (in seconds) of the
50  * data.  If the cached data is young enough, the cache is copied, otherwise,
51  * a new calculation is executed and the cache is replaced with the new
52  * data.
53  *
54  * The rss calculation for each process collective is as follows:
55  *
56  *   - Inspect flags, determine if counting rss for zones, projects, tasks,
57  *     and/or users.
58  *   - For each proc:
59  *	- Figure out proc's collectives (zone, project, task, and/or user).
60  *	- For each seg in proc's address space:
61  *		- If seg is private:
62  *			- Lookup anons in the amp.
63  *			- For incore pages not previously visited for each of
64  *			  the proc's collectives, add incore pagesize to each
65  *			  collective.
66  *			  Anons with a refcnt of 1 can be assumed to be not
67  *			  previously visited.
68  *			- For address ranges without anons in the amp:
69  *				- Lookup pages in underlying vnode.
70  *				- For incore pages not previously visited for
71  *				  each of the proc's collectives, add incore
72  *				  pagesize to each collective.
73  *		- If seg is shared:
74  *			- Lookup pages in the shared amp or vnode.
75  *			- For incore pages not previously visited for each of
76  *			  the proc's collectives, add incore pagesize to each
77  *			  collective.
78  *
79  * Swap is reserved by private segments, and shared anonymous segments.
80  * The only shared anon segments which do not reserve swap are ISM segments
81  * and schedctl segments, both of which can be identified by having
82  * amp->swresv == 0.
83  *
84  * The swap calculation for each collective is as follows:
85  *
86  *   - Inspect flags, determine if counting rss for zones, projects, tasks,
87  *     and/or users.
88  *   - For each proc:
89  *	- Figure out proc's collectives (zone, project, task, and/or user).
90  *	- For each seg in proc's address space:
91  *		- If seg is private:
92  *			- Add svd->swresv pages to swap count for each of the
93  *			  proc's collectives.
94  *		- If seg is anon, shared, and amp->swresv != 0
95  *			- For address ranges in amp not previously visited for
96  *			  each of the proc's collectives, add size of address
97  *			  range to the swap count for each collective.
98  *
99  * These two calculations are done simultaneously, with most of the work
100  * being done in vmu_calculate_seg().  The results of the calculation are
101  * copied into "vmu_data.vmu_cache_results".
102  *
103  * To perform the calculation, various things are tracked and cached:
104  *
105  *    - incore/not-incore page ranges for all vnodes.
106  *	(vmu_data.vmu_all_vnodes_hash)
107  *	This eliminates looking up the same page more than once.
108  *
109  *    - incore/not-incore page ranges for all shared amps.
110  *	(vmu_data.vmu_all_amps_hash)
111  *	This eliminates looking up the same page more than once.
112  *
113  *    - visited page ranges for each collective.
114  *	   - per vnode (entity->vme_vnode_hash)
115  *	   - per shared amp (entity->vme_amp_hash)
116  *	For accurate counting of map-shared and cow-shared pages.
117  *
118  *    - visited private anons (refcnt > 1) for each collective.
119  *	(entity->vme_anon_hash)
120  *	For accurate counting of cow-shared pages.
121  *
122  * The common accounting structure is the vmu_entity_t, which represents
123  * collectives:
124  *
125  *    - A zone.
126  *    - A project, task, or user within a zone.
127  *    - The entire system (vmu_data.vmu_system).
128  *    - Each collapsed (col) project and user.  This means a given projid or
129  *	uid, regardless of which zone the process is in.  For instance,
130  *      project 0 in the global zone and project 0 in a non global zone are
131  *	the same collapsed project.
132  *
133  *  Each entity structure tracks which pages have been already visited for
134  *  that entity (via previously inspected processes) so that these pages are
135  *  not double counted.
136  */
137 
138 #include <sys/errno.h>
139 #include <sys/types.h>
140 #include <sys/zone.h>
141 #include <sys/proc.h>
142 #include <sys/project.h>
143 #include <sys/task.h>
144 #include <sys/thread.h>
145 #include <sys/time.h>
146 #include <sys/mman.h>
147 #include <sys/modhash.h>
148 #include <sys/modhash_impl.h>
149 #include <sys/shm.h>
150 #include <sys/swap.h>
151 #include <sys/synch.h>
152 #include <sys/systm.h>
153 #include <sys/var.h>
154 #include <sys/vm_usage.h>
155 #include <sys/zone.h>
156 #include <vm/anon.h>
157 #include <vm/as.h>
158 #include <vm/seg_vn.h>
159 #include <vm/seg_spt.h>
160 
/* Size passed to mod_hash_create_*() for the hashes created in this file. */
#define	VMUSAGE_HASH_SIZE		512

/* Type tags for the vm objects being tracked. */
#define	VMUSAGE_TYPE_VNODE		1
#define	VMUSAGE_TYPE_AMP		2
#define	VMUSAGE_TYPE_ANON		3

/* Residency state recorded in a bound (vmu_bound_t's vmb_type). */
#define	VMUSAGE_BOUND_UNKNOWN		0
#define	VMUSAGE_BOUND_INCORE		1
#define	VMUSAGE_BOUND_NOT_INCORE	2
170 
171 /*
172  * bounds for vnodes and shared amps
173  * Each bound is either entirely incore, entirely not in core, or
174  * entirely unknown.  bounds are stored in order by offset.
175  */
176 typedef struct vmu_bound {
177 	struct  vmu_bound *vmb_next;
178 	pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
179 	pgcnt_t	vmb_end;    /* page offset in vnode/amp on which bound ends */
180 	char	vmb_type;   /* One of VMUSAGE_BOUND_* */
181 } vmu_bound_t;
182 
183 /*
184  * hash of visited objects (vnodes or shared amps)
185  * key is address of vnode or amp.  Bounds lists known incore/non-incore
186  * bounds for vnode/amp.
187  */
188 typedef struct vmu_object {
189 	struct vmu_object	*vmo_next;	/* free list */
190 	caddr_t		vmo_key;
191 	short		vmo_type;
192 	vmu_bound_t	*vmo_bounds;
193 } vmu_object_t;
194 
195 /*
196  * Entity by which to count results.
197  *
198  * The entity structure keeps the current rss/swap counts for each entity
199  * (zone, project, etc), and hashes of vm structures that have already
200  * been visited for the entity.
201  *
202  * vme_next:	links the list of all entities currently being counted by
203  *		vmu_calculate().
204  *
205  * vme_next_calc: links the list of entities related to the current process
206  *		 being counted by vmu_calculate_proc().
207  *
208  * vmu_calculate_proc() walks all processes.  For each process, it makes a
209  * list of the entities related to that process using vme_next_calc.  This
210  * list changes each time vmu_calculate_proc() is called.
211  *
212  */
213 typedef struct vmu_entity {
214 	struct vmu_entity *vme_next;
215 	struct vmu_entity *vme_next_calc;
216 	mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
217 	mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
218 	mod_hash_t	*vme_anon_hash;	 /* cow anons visited for entity */
219 	vmusage_t	vme_result;	 /* identifies entity and results */
220 } vmu_entity_t;
221 
222 /*
223  * Hash of entities visited within a zone, and an entity for the zone
224  * itself.
225  */
226 typedef struct vmu_zone {
227 	struct vmu_zone	*vmz_next;	/* free list */
228 	id_t		vmz_id;
229 	vmu_entity_t	*vmz_zone;
230 	mod_hash_t	*vmz_projects_hash;
231 	mod_hash_t	*vmz_tasks_hash;
232 	mod_hash_t	*vmz_rusers_hash;
233 	mod_hash_t	*vmz_eusers_hash;
234 } vmu_zone_t;
235 
236 /*
237  * Cache of results from last calculation
238  */
239 typedef struct vmu_cache {
240 	vmusage_t	*vmc_results;	/* Results from last call to */
241 					/* vm_getusage(). */
242 	uint64_t	vmc_nresults;	/* Count of cached results */
243 	uint64_t	vmc_refcnt;	/* refcnt for free */
244 	uint_t		vmc_flags;	/* Flags for vm_getusage() */
245 	hrtime_t	vmc_timestamp;	/* when cache was created */
246 } vmu_cache_t;
247 
248 /*
249  * top level rss info for the system
250  */
251 typedef struct vmu_data {
252 	kmutex_t	vmu_lock;		/* Protects vmu_data */
253 	kcondvar_t	vmu_cv;			/* Used to signal threads */
254 						/* Waiting for */
255 						/* Rss_calc_thread to finish */
256 	vmu_entity_t	*vmu_system;		/* Entity for tracking */
257 						/* rss/swap for all processes */
258 						/* in all zones */
259 	mod_hash_t	*vmu_zones_hash;	/* Zones visited */
260 	mod_hash_t	*vmu_projects_col_hash; /* These *_col_hash hashes */
261 	mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
262 	mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
263 						/* to implement VMUSAGE_COL_* */
264 						/* flags, which aggregate by */
265 						/* project or user regardless */
266 						/* of zoneid. */
267 	mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
268 						/* to track incore/not-incore */
269 	mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
270 						/* amps to track incore/not- */
271 						/* incore */
272 	vmu_entity_t	*vmu_entities;		/* Linked list of entities */
273 	size_t		vmu_nentities;		/* Count of entities in list */
274 	vmu_cache_t	*vmu_cache;		/* Cached results */
275 	kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
276 						/* vmu_calculate() */
277 	uint_t		vmu_calc_flags;		/* Flags being using by */
278 						/* currently running calc */
279 						/* thread */
280 	uint_t		vmu_pending_flags;	/* Flags of vm_getusage() */
281 						/* threads waiting for */
282 						/* calc thread to finish */
283 	uint_t		vmu_pending_waiters;	/* Number of threads waiting */
284 						/* for calc thread */
285 	vmu_bound_t	*vmu_free_bounds;
286 	vmu_object_t	*vmu_free_objects;
287 	vmu_entity_t	*vmu_free_entities;
288 	vmu_zone_t	*vmu_free_zones;
289 } vmu_data_t;
290 
291 extern struct as kas;
292 extern proc_t *practive;
293 extern zone_t *global_zone;
294 extern struct seg_ops segvn_ops;
295 extern struct seg_ops segspt_shmops;
296 
297 static vmu_data_t vmu_data;
298 static kmem_cache_t *vmu_bound_cache;
299 static kmem_cache_t *vmu_object_cache;
300 
301 /*
302  * Save a bound on the free list
303  */
304 static void
305 vmu_free_bound(vmu_bound_t *bound)
306 {
307 	bound->vmb_next = vmu_data.vmu_free_bounds;
308 	vmu_data.vmu_free_bounds = bound;
309 }
310 
311 /*
312  * Free an object, and all visited bound info.
313  */
314 static void
315 vmu_free_object(mod_hash_val_t val)
316 {
317 	vmu_object_t *obj = (vmu_object_t *)val;
318 	vmu_bound_t *bound = obj->vmo_bounds;
319 	vmu_bound_t *tmp;
320 
321 	while (bound != NULL) {
322 		tmp = bound;
323 		bound = bound->vmb_next;
324 		vmu_free_bound(tmp);
325 	}
326 	obj->vmo_next = vmu_data.vmu_free_objects;
327 	vmu_data.vmu_free_objects = obj;
328 }
329 
330 /*
331  * Free an entity, and hashes of visited objects for that entity.
332  */
333 static void
334 vmu_free_entity(mod_hash_val_t val)
335 {
336 	vmu_entity_t *entity = (vmu_entity_t *)val;
337 
338 	if (entity->vme_vnode_hash != NULL)
339 		i_mod_hash_clear_nosync(entity->vme_vnode_hash);
340 	if (entity->vme_amp_hash != NULL)
341 		i_mod_hash_clear_nosync(entity->vme_amp_hash);
342 	if (entity->vme_anon_hash != NULL)
343 		i_mod_hash_clear_nosync(entity->vme_anon_hash);
344 
345 	entity->vme_next = vmu_data.vmu_free_entities;
346 	vmu_data.vmu_free_entities = entity;
347 }
348 
349 /*
350  * Free zone entity, and all hashes of entities inside that zone,
351  * which are projects, tasks, and users.
352  */
353 static void
354 vmu_free_zone(mod_hash_val_t val)
355 {
356 	vmu_zone_t *zone = (vmu_zone_t *)val;
357 
358 	if (zone->vmz_zone != NULL) {
359 		vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
360 		zone->vmz_zone = NULL;
361 	}
362 	if (zone->vmz_projects_hash != NULL)
363 		i_mod_hash_clear_nosync(zone->vmz_projects_hash);
364 	if (zone->vmz_tasks_hash != NULL)
365 		i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
366 	if (zone->vmz_rusers_hash != NULL)
367 		i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
368 	if (zone->vmz_eusers_hash != NULL)
369 		i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
370 	zone->vmz_next = vmu_data.vmu_free_zones;
371 	vmu_data.vmu_free_zones = zone;
372 }
373 
374 /*
375  * Initialize synchronization primitives and hashes for system-wide tracking
376  * of visited vnodes and shared amps.  Initialize results cache.
377  */
378 void
379 vm_usage_init()
380 {
381 	mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
382 	cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
383 
384 	vmu_data.vmu_system = NULL;
385 	vmu_data.vmu_zones_hash = NULL;
386 	vmu_data.vmu_projects_col_hash = NULL;
387 	vmu_data.vmu_rusers_col_hash = NULL;
388 	vmu_data.vmu_eusers_col_hash = NULL;
389 
390 	vmu_data.vmu_free_bounds = NULL;
391 	vmu_data.vmu_free_objects = NULL;
392 	vmu_data.vmu_free_entities = NULL;
393 	vmu_data.vmu_free_zones = NULL;
394 
395 	vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
396 	    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
397 	    sizeof (vnode_t));
398 	vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
399 	    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
400 	    sizeof (struct anon_map));
401 	vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
402 	    "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
403 	    vmu_free_entity);
404 	vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
405 	    "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
406 	    vmu_free_entity);
407 	vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
408 	    "vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
409 	    vmu_free_entity);
410 	vmu_data.vmu_zones_hash = mod_hash_create_idhash(
411 	    "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
412 
413 	vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
414 	    sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
415 	vmu_object_cache = kmem_cache_create("vmu_object_cache",
416 	    sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
417 
418 	vmu_data.vmu_entities = NULL;
419 	vmu_data.vmu_nentities = 0;
420 
421 	vmu_data.vmu_cache = NULL;
422 	vmu_data.vmu_calc_thread = NULL;
423 	vmu_data.vmu_calc_flags = 0;
424 	vmu_data.vmu_pending_flags = 0;
425 	vmu_data.vmu_pending_waiters = 0;
426 }
427 
428 /*
429  * Allocate hashes for tracking vm objects visited for an entity.
430  * Update list of entities.
431  */
432 static vmu_entity_t *
433 vmu_alloc_entity(id_t id, int type, id_t zoneid)
434 {
435 	vmu_entity_t *entity;
436 
437 	if (vmu_data.vmu_free_entities != NULL) {
438 		entity = vmu_data.vmu_free_entities;
439 		vmu_data.vmu_free_entities =
440 		    vmu_data.vmu_free_entities->vme_next;
441 		bzero(&entity->vme_result, sizeof (vmusage_t));
442 	} else {
443 		entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
444 	}
445 	entity->vme_result.vmu_id = id;
446 	entity->vme_result.vmu_zoneid = zoneid;
447 	entity->vme_result.vmu_type = type;
448 
449 	if (entity->vme_vnode_hash == NULL)
450 		entity->vme_vnode_hash = mod_hash_create_ptrhash(
451 		    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
452 		    sizeof (vnode_t));
453 
454 	if (entity->vme_amp_hash == NULL)
455 		entity->vme_amp_hash = mod_hash_create_ptrhash(
456 		    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
457 		    sizeof (struct anon_map));
458 
459 	if (entity->vme_anon_hash == NULL)
460 		entity->vme_anon_hash = mod_hash_create_ptrhash(
461 		    "vmusage anon hash", VMUSAGE_HASH_SIZE,
462 		    mod_hash_null_valdtor, sizeof (struct anon));
463 
464 	entity->vme_next = vmu_data.vmu_entities;
465 	vmu_data.vmu_entities = entity;
466 	vmu_data.vmu_nentities++;
467 
468 	return (entity);
469 }
470 
471 /*
472  * Allocate a zone entity, and hashes for tracking visited vm objects
473  * for projects, tasks, and users within that zone.
474  */
475 static vmu_zone_t *
476 vmu_alloc_zone(id_t id)
477 {
478 	vmu_zone_t *zone;
479 
480 	if (vmu_data.vmu_free_zones != NULL) {
481 		zone = vmu_data.vmu_free_zones;
482 		vmu_data.vmu_free_zones =
483 		    vmu_data.vmu_free_zones->vmz_next;
484 		zone->vmz_next = NULL;
485 		zone->vmz_zone = NULL;
486 	} else {
487 		zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
488 	}
489 
490 	zone->vmz_id = id;
491 
492 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
493 		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
494 
495 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
496 	    VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
497 		zone->vmz_projects_hash = mod_hash_create_idhash(
498 		    "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
499 
500 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
501 	    != 0 && zone->vmz_tasks_hash == NULL)
502 		zone->vmz_tasks_hash = mod_hash_create_idhash(
503 		    "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
504 
505 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
506 	    != 0 && zone->vmz_rusers_hash == NULL)
507 		zone->vmz_rusers_hash = mod_hash_create_idhash(
508 		    "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
509 
510 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
511 	    != 0 && zone->vmz_eusers_hash == NULL)
512 		zone->vmz_eusers_hash = mod_hash_create_idhash(
513 		    "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
514 
515 	return (zone);
516 }
517 
518 /*
519  * Allocate a structure for tracking visited bounds for a vm object.
520  */
521 static vmu_object_t *
522 vmu_alloc_object(caddr_t key, int type)
523 {
524 	vmu_object_t *object;
525 
526 	if (vmu_data.vmu_free_objects != NULL) {
527 		object = vmu_data.vmu_free_objects;
528 		vmu_data.vmu_free_objects =
529 		    vmu_data.vmu_free_objects->vmo_next;
530 	} else {
531 		object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
532 	}
533 
534 	object->vmo_key = key;
535 	object->vmo_type = type;
536 	object->vmo_bounds = NULL;
537 
538 	return (object);
539 }
540 
541 /*
542  * Allocate and return a bound structure.
543  */
544 static vmu_bound_t *
545 vmu_alloc_bound()
546 {
547 	vmu_bound_t *bound;
548 
549 	if (vmu_data.vmu_free_bounds != NULL) {
550 		bound = vmu_data.vmu_free_bounds;
551 		vmu_data.vmu_free_bounds =
552 		    vmu_data.vmu_free_bounds->vmb_next;
553 		bzero(bound, sizeof (vmu_bound_t));
554 	} else {
555 		bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
556 		bzero(bound, sizeof (vmu_bound_t));
557 	}
558 	return (bound);
559 }
560 
561 /*
562  * vmu_find_insert_* functions implement hash lookup or allocate and
563  * insert operations.
564  */
565 static vmu_object_t *
566 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
567 {
568 	int ret;
569 	vmu_object_t *object;
570 
571 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
572 	    (mod_hash_val_t *)&object);
573 	if (ret != 0) {
574 		object = vmu_alloc_object(key, type);
575 		ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
576 		    (mod_hash_val_t)object, (mod_hash_hndl_t)0);
577 		ASSERT(ret == 0);
578 	}
579 	return (object);
580 }
581 
582 static int
583 vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
584 {
585 	int ret;
586 	caddr_t val;
587 
588 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
589 	    (mod_hash_val_t *)&val);
590 
591 	if (ret == 0)
592 		return (0);
593 
594 	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
595 	    (mod_hash_val_t)key, (mod_hash_hndl_t)0);
596 
597 	ASSERT(ret == 0);
598 
599 	return (1);
600 }
601 
602 static vmu_entity_t *
603 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
604 {
605 	int ret;
606 	vmu_entity_t *entity;
607 
608 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
609 	    (mod_hash_val_t *)&entity);
610 	if (ret != 0) {
611 		entity = vmu_alloc_entity(id, type, zoneid);
612 		ret = i_mod_hash_insert_nosync(hash,
613 		    (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
614 		    (mod_hash_hndl_t)0);
615 		ASSERT(ret == 0);
616 	}
617 	return (entity);
618 }
619 
620 
621 
622 
623 /*
624  * Returns list of object bounds between start and end.  New bounds inserted
625  * by this call are given type.
626  *
627  * Returns the number of pages covered if new bounds are created.  Returns 0
628  * if region between start/end consists of all existing bounds.
629  */
630 static pgcnt_t
631 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
632     end, char type, vmu_bound_t **first, vmu_bound_t **last)
633 {
634 	vmu_bound_t *next;
635 	vmu_bound_t *prev = NULL;
636 	vmu_bound_t *tmp = NULL;
637 	pgcnt_t ret = 0;
638 
639 	*first = *last = NULL;
640 
641 	for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) {
642 		/*
643 		 * Find bounds overlapping or overlapped by range [start,end].
644 		 */
645 		if (start > next->vmb_end) {
646 			/* bound is before new bound */
647 			prev = next;
648 			continue;
649 		}
650 		if (next->vmb_start > end) {
651 			/* bound is after new bound */
652 			break;
653 		}
654 		if (*first == NULL)
655 			*first = next;
656 		*last = next;
657 	}
658 
659 	if (*first == NULL) {
660 		ASSERT(*last == NULL);
661 		/*
662 		 * No bounds overlapping range [start,end], so create new
663 		 * bound
664 		 */
665 		tmp = vmu_alloc_bound();
666 		tmp->vmb_start = start;
667 		tmp->vmb_end = end;
668 		tmp->vmb_type = type;
669 		if (prev == NULL) {
670 			tmp->vmb_next = ro->vmo_bounds;
671 			ro->vmo_bounds = tmp;
672 		} else {
673 			tmp->vmb_next = prev->vmb_next;
674 			prev->vmb_next = tmp;
675 		}
676 		*first = tmp;
677 		*last = tmp;
678 		ASSERT(tmp->vmb_end >= tmp->vmb_start);
679 		ret = tmp->vmb_end - tmp->vmb_start + 1;
680 		return (ret);
681 	}
682 
683 	/* Check to see if start is before first known bound */
684 	ASSERT(first != NULL && last != NULL);
685 	next = (*first);
686 	if (start < (*first)->vmb_start) {
687 		/* Create new bound before first bound */
688 		tmp = vmu_alloc_bound();
689 		tmp->vmb_start = start;
690 		tmp->vmb_end = (*first)->vmb_start - 1;
691 		tmp->vmb_type = type;
692 		tmp->vmb_next = *first;
693 		if (*first == ro->vmo_bounds)
694 			ro->vmo_bounds = tmp;
695 		if (prev != NULL)
696 			prev->vmb_next = tmp;
697 		ASSERT(tmp->vmb_end >= tmp->vmb_start);
698 		ret += tmp->vmb_end - tmp->vmb_start + 1;
699 		*first = tmp;
700 	}
701 	/*
702 	 * Between start and end, search for gaps between and after existing
703 	 * bounds.  Create new bounds to fill gaps if they exist.
704 	 */
705 	while (end > next->vmb_end) {
706 		/*
707 		 * Check for gap between bound and next bound. if no gap,
708 		 * continue.
709 		 */
710 		if ((next != *last) &&
711 		    ((next->vmb_end + 1) == next->vmb_next->vmb_start)) {
712 			next = next->vmb_next;
713 			continue;
714 		}
715 		/*
716 		 * Insert new bound in gap after bound, and before next
717 		 * bound if next bound exists.
718 		 */
719 		tmp = vmu_alloc_bound();
720 		tmp->vmb_type = type;
721 		tmp->vmb_next = next->vmb_next;
722 		tmp->vmb_start = next->vmb_end + 1;
723 
724 		if (next != *last) {
725 			tmp->vmb_end = next->vmb_next->vmb_start - 1;
726 			ASSERT(tmp->vmb_end >= tmp->vmb_start);
727 			ret += tmp->vmb_end - tmp->vmb_start + 1;
728 			next->vmb_next = tmp;
729 			next = tmp->vmb_next;
730 		} else {
731 			tmp->vmb_end = end;
732 			ASSERT(tmp->vmb_end >= tmp->vmb_start);
733 			ret += tmp->vmb_end - tmp->vmb_start + 1;
734 			next->vmb_next = tmp;
735 			*last = tmp;
736 			break;
737 		}
738 	}
739 	return (ret);
740 }
741 
742 /*
743  * vmu_update_bounds()
744  *
745  * first, last:	list of continuous bounds, of which zero or more are of
746  * 		type VMUSAGE_BOUND_UNKNOWN.
747  *
748  * new_first, new_last:	list of continuous bounds, of which none are of
749  *			type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
750  *			update the types of bounds in (first,last) with
751  *			type VMUSAGE_BOUND_UNKNOWN.
752  *
753  * For the list of bounds (first,last), this function updates any bounds
754  * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
755  * the list (new_first, new_last).
756  *
757  * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
758  * (new_first, new_last), it will be split into multiple bounds.
759  *
760  * Return value:
761  * 	The number of pages in the list of bounds (first,last) that were of
762  *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
763  *	VMUSAGE_BOUND_INCORE.
764  *
765  */
766 static pgcnt_t
767 vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last,
768     vmu_bound_t *new_first, vmu_bound_t *new_last)
769 {
770 	vmu_bound_t *next, *new_next, *tmp;
771 	pgcnt_t rss = 0;
772 
773 	next = *first;
774 	new_next = new_first;
775 
776 	/*
777 	 * Verify first and last bound are covered by new bounds if they
778 	 * have unknown type.
779 	 */
780 	ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
781 	    (*first)->vmb_start >= new_next->vmb_start);
782 	ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
783 	    (*last)->vmb_end <= new_last->vmb_end);
784 	for (;;) {
785 		/* If bound already has type, proceed to next bound */
786 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
787 			if (next == *last)
788 				break;
789 			next = next->vmb_next;
790 			continue;
791 		}
792 		while (new_next->vmb_end < next->vmb_start)
793 			new_next = new_next->vmb_next;
794 		ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
795 		next->vmb_type = new_next->vmb_type;
796 		if (new_next->vmb_end < next->vmb_end) {
797 			/* need to split bound */
798 			tmp = vmu_alloc_bound();
799 			tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
800 			tmp->vmb_start = new_next->vmb_end + 1;
801 			tmp->vmb_end = next->vmb_end;
802 			tmp->vmb_next = next->vmb_next;
803 			next->vmb_end = new_next->vmb_end;
804 			next->vmb_next = tmp;
805 			if (*last == next)
806 				*last = tmp;
807 			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
808 				rss += next->vmb_end - next->vmb_start + 1;
809 			next = tmp;
810 		} else {
811 			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
812 				rss += next->vmb_end - next->vmb_start + 1;
813 			if (next == *last)
814 				break;
815 			next = next->vmb_next;
816 		}
817 	}
818 	return (rss);
819 }
820 
821 /*
822  * merges adjacent bounds with same type between first and last bound.
823  * After merge, last pointer is no longer valid, as last bound may be
824  * merged away.
825  */
826 static void
827 vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last)
828 {
829 	vmu_bound_t *next;
830 	vmu_bound_t *tmp;
831 
832 	ASSERT(*first != NULL);
833 	ASSERT(*last != NULL);
834 
835 	next = *first;
836 	while (next != *last) {
837 
838 		/* If bounds are adjacent and have same type, merge them */
839 		if (((next->vmb_end + 1) == next->vmb_next->vmb_start) &&
840 		    (next->vmb_type == next->vmb_next->vmb_type)) {
841 			tmp = next->vmb_next;
842 			next->vmb_end = tmp->vmb_end;
843 			next->vmb_next = tmp->vmb_next;
844 			vmu_free_bound(tmp);
845 			if (tmp == *last)
846 				*last = next;
847 		} else {
848 			next = next->vmb_next;
849 		}
850 	}
851 }
852 
853 /*
854  * Given an amp and a list of bounds, updates each bound's type with
855  * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
856  *
857  * If a bound is partially incore, it will be split into two bounds.
858  * first and last may be modified, as bounds may be split into multiple
859  * bounds if the are partially incore/not-incore.
860  *
861  * Set incore to non-zero if bounds are already known to be incore
862  *
863  */
864 static void
865 vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first,
866     vmu_bound_t **last, boolean_t incore)
867 {
868 	vmu_bound_t *next;
869 	vmu_bound_t *tmp;
870 	pgcnt_t index;
871 	short bound_type;
872 	short page_type;
873 	vnode_t *vn;
874 	anoff_t off;
875 	struct anon *ap;
876 
877 	next = *first;
878 	/* Shared anon slots don't change once set */
879 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
880 	for (;;) {
881 		if (incore == B_TRUE)
882 			next->vmb_type = VMUSAGE_BOUND_INCORE;
883 
884 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
885 			if (next == *last)
886 				break;
887 			next = next->vmb_next;
888 			continue;
889 		}
890 		bound_type = next->vmb_type;
891 		index = next->vmb_start;
892 		while (index <= next->vmb_end) {
893 
894 			/*
895 			 * These are used to determine how much to increment
896 			 * index when a large page is found.
897 			 */
898 			page_t *page;
899 			pgcnt_t pgcnt = 1;
900 			uint_t pgshft;
901 			pgcnt_t pgmsk;
902 
903 			ap = anon_get_ptr(amp->ahp, index);
904 			if (ap != NULL)
905 				swap_xlate(ap, &vn, &off);
906 
907 			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
908 			    (page = page_exists(vn, off)) != NULL) {
909 				page_type = VMUSAGE_BOUND_INCORE;
910 				if (page->p_szc > 0) {
911 					pgcnt = page_get_pagecnt(page->p_szc);
912 					pgshft = page_get_shift(page->p_szc);
913 					pgmsk = (0x1 << (pgshft - PAGESHIFT))
914 					    - 1;
915 				}
916 			} else {
917 				page_type = VMUSAGE_BOUND_NOT_INCORE;
918 			}
919 			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
920 				next->vmb_type = page_type;
921 			} else if (next->vmb_type != page_type) {
922 				/*
923 				 * if current bound type does not match page
924 				 * type, need to split off new bound.
925 				 */
926 				tmp = vmu_alloc_bound();
927 				tmp->vmb_type = page_type;
928 				tmp->vmb_start = index;
929 				tmp->vmb_end = next->vmb_end;
930 				tmp->vmb_next = next->vmb_next;
931 				next->vmb_end = index - 1;
932 				next->vmb_next = tmp;
933 				if (*last == next)
934 					*last = tmp;
935 				next = tmp;
936 			}
937 			if (pgcnt > 1) {
938 				/*
939 				 * If inside large page, jump to next large
940 				 * page
941 				 */
942 				index = (index & ~pgmsk) + pgcnt;
943 			} else {
944 				index++;
945 			}
946 		}
947 		if (next == *last) {
948 			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
949 			break;
950 		} else
951 			next = next->vmb_next;
952 	}
953 	ANON_LOCK_EXIT(&amp->a_rwlock);
954 }
955 
956 /*
957  * Same as vmu_amp_update_incore_bounds(), except for tracking
958  * incore-/not-incore for vnodes.
959  */
960 static void
961 vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first,
962     vmu_bound_t **last)
963 {
964 	vmu_bound_t *next;
965 	vmu_bound_t *tmp;
966 	pgcnt_t index;
967 	short bound_type;
968 	short page_type;
969 
970 	next = *first;
971 	for (;;) {
972 		if (vnode->v_pages == NULL)
973 			next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
974 
975 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
976 			if (next == *last)
977 				break;
978 			next = next->vmb_next;
979 			continue;
980 		}
981 
982 		bound_type = next->vmb_type;
983 		index = next->vmb_start;
984 		while (index <= next->vmb_end) {
985 
986 			/*
987 			 * These are used to determine how much to increment
988 			 * index when a large page is found.
989 			 */
990 			page_t *page;
991 			pgcnt_t pgcnt = 1;
992 			uint_t pgshft;
993 			pgcnt_t pgmsk;
994 
995 			if (vnode->v_pages != NULL &&
996 			    (page = page_exists(vnode, ptob(index))) != NULL) {
997 				page_type = VMUSAGE_BOUND_INCORE;
998 				if (page->p_szc > 0) {
999 					pgcnt = page_get_pagecnt(page->p_szc);
1000 					pgshft = page_get_shift(page->p_szc);
1001 					pgmsk = (0x1 << (pgshft - PAGESHIFT))
1002 					    - 1;
1003 				}
1004 			} else {
1005 				page_type = VMUSAGE_BOUND_NOT_INCORE;
1006 			}
1007 			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1008 				next->vmb_type = page_type;
1009 			} else if (next->vmb_type != page_type) {
1010 				/*
1011 				 * if current bound type does not match page
1012 				 * type, need to split off new bound.
1013 				 */
1014 				tmp = vmu_alloc_bound();
1015 				tmp->vmb_type = page_type;
1016 				tmp->vmb_start = index;
1017 				tmp->vmb_end = next->vmb_end;
1018 				tmp->vmb_next = next->vmb_next;
1019 				next->vmb_end = index - 1;
1020 				next->vmb_next = tmp;
1021 				if (*last == next)
1022 					*last = tmp;
1023 				next = tmp;
1024 			}
1025 			if (pgcnt > 1) {
1026 				/*
1027 				 * If inside large page, jump to next large
1028 				 * page
1029 				 */
1030 				index = (index & ~pgmsk) + pgcnt;
1031 			} else {
1032 				index++;
1033 			}
1034 		}
1035 		if (next == *last) {
1036 			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1037 			break;
1038 		} else
1039 			next = next->vmb_next;
1040 	}
1041 }
1042 
/*
 * Calculate the rss and swap consumed by a segment.  vmu_entities is the
 * list of entities to visit.  For shared segments, the vnode or amp
 * is looked up in each entity to see if it has been already counted.  Private
 * anon pages are checked per entity to ensure that cow pages are not
 * double counted.
 *
 * For private mapped files, first the amp is checked for private pages.
 * Bounds not backed by the amp are looked up in the vnode for each entity
 * to avoid double counting of private COW vnode pages.
 *
 * Only segments whose s_ops is segvn_ops or segspt_shmops are examined;
 * any other segment (and any zero-length segment) returns without touching
 * the entities.  Page counts are converted with << PAGESHIFT before being
 * added to the per-entity vme_result fields.
 */
static void
vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
{
	struct segvn_data *svd;
	struct shm_data *shmd;
	struct spt_data *sptd;
	vmu_object_t *shared_object = NULL;
	vmu_object_t *entity_object = NULL;
	vmu_entity_t *entity;
	vmusage_t *result;
	vmu_bound_t *first = NULL;
	vmu_bound_t *last = NULL;
	vmu_bound_t *cur = NULL;
	vmu_bound_t *e_first = NULL;
	vmu_bound_t *e_last = NULL;
	vmu_bound_t *tmp;
	pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
	struct anon_map *private_amp = NULL;
	boolean_t incore = B_FALSE;
	boolean_t shared = B_FALSE;
	int file = 0;
	pgcnt_t swresv = 0;
	pgcnt_t panon = 0;

	/* Can zero-length segments exist?  Not sure, so paranoia */
	if (seg->s_size <= 0)
		return;

	/*
	 * Figure out if there is a shared object (such as a named vnode or
	 * a shared amp), then figure out if there is a private amp, which
	 * identifies private pages.
	 */
	if (seg->s_ops == &segvn_ops) {
		svd = (struct segvn_data *)seg->s_data;
		if (svd->type == MAP_SHARED)
			shared = B_TRUE;
		else
			swresv = svd->swresv;

		if (svd->vp != NULL) {
			/* Named vnode: bounds are page offsets into the file */
			file = 1;
			shared_object = vmu_find_insert_object(
			    vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
			    VMUSAGE_TYPE_VNODE);
			s_start = btop(svd->offset);
			s_end = btop(svd->offset + seg->s_size) - 1;
		}
		if (svd->amp != NULL && svd->type == MAP_SHARED) {
			/* Shared amp: bounds are anon indices */
			ASSERT(shared_object == NULL);
			shared_object = vmu_find_insert_object(
			    vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
			    VMUSAGE_TYPE_AMP);
			s_start = svd->anon_index;
			s_end = svd->anon_index + btop(seg->s_size) - 1;
			/* schedctl mappings are always in core */
			if (svd->amp->swresv == 0)
				incore = B_TRUE;
		}
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
		/*
		 * Text replication anon maps can be shared across all zones.
		 * Space used for text replication is typically capped as
		 * small % of memory.  To keep it simple for now we don't
		 * account for swap and memory space used for text replication.
		 */
		if (svd->tr_state == SEGVN_TR_OFF && svd->amp != NULL &&
		    svd->type == MAP_PRIVATE) {
			private_amp = svd->amp;
			p_start = svd->anon_index;
			p_end = svd->anon_index + btop(seg->s_size) - 1;
		}
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	} else if (seg->s_ops == &segspt_shmops) {
		shared = B_TRUE;
		shmd = (struct shm_data *)seg->s_data;
		shared_object = vmu_find_insert_object(
		    vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
		    VMUSAGE_TYPE_AMP);
		s_start = 0;
		s_end = btop(seg->s_size) - 1;
		sptd = shmd->shm_sptseg->s_data;

		/* ism segments are always incore and do not reserve swap */
		if (sptd->spt_flags & SHM_SHARE_MMU)
			incore = B_TRUE;

	} else {
		/* Segment type is not accounted for */
		return;
	}

	/*
	 * If there is a private amp, count anon pages that exist.  If an
	 * anon has a refcnt > 1 (cow sharing), then save the anon in a
	 * hash so that it is not double counted.
	 *
	 * If there is also a shared object, then figure out the bounds
	 * which are not mapped by the private amp.
	 */
	if (private_amp != NULL) {

		/* Enter as writer to prevent cow anons from being freed */
		ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);

		/*
		 * p_index walks the private amp and s_index walks the
		 * shared object; the two are always advanced by the same
		 * amount.
		 * NOTE(review): s_start is only assigned when a shared
		 * object was found, so s_index is meaningful only when
		 * shared_object != NULL (the only case in which it is
		 * stored into a bound below).
		 */
		p_index = p_start;
		s_index = s_start;

		while (p_index <= p_end) {

			pgcnt_t p_index_next;
			pgcnt_t p_bound_size;
			int cnt;
			anoff_t off;
			struct vnode *vn;
			struct anon *ap;
			page_t *page;		/* For handling of large */
			pgcnt_t pgcnt = 1;	/* pages */
			pgcnt_t pgstart;
			pgcnt_t pgend;
			uint_t pgshft;
			pgcnt_t pgmsk;

			p_index_next = p_index;
			ap = anon_get_next_ptr(private_amp->ahp,
			    &p_index_next);

			/*
			 * If next anon is past end of mapping, simulate
			 * end of anon so loop terminates.
			 */
			if (p_index_next > p_end) {
				p_index_next = p_end + 1;
				ap = NULL;
			}
			/*
			 * For cow segments, keep track of bounds not
			 * backed by private amp so they can be looked
			 * up in the backing vnode
			 */
			if (p_index_next != p_index) {

				/*
				 * Compute index difference between anon and
				 * previous anon.
				 */
				p_bound_size = p_index_next - p_index - 1;

				if (shared_object != NULL) {
					/* Append the gap to the bound list */
					cur = vmu_alloc_bound();
					cur->vmb_next = NULL;
					cur->vmb_start = s_index;
					cur->vmb_end = s_index + p_bound_size;
					cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
					if (first == NULL) {
						first = cur;
						last = cur;
					} else {
						last->vmb_next = cur;
						last = cur;
					}
				}
				/* Skip both indices past the gap */
				p_index = p_index + p_bound_size + 1;
				s_index = s_index + p_bound_size + 1;
			}

			/* Detect end of anons in amp */
			if (ap == NULL)
				break;

			cnt = ap->an_refcnt;
			swap_xlate(ap, &vn, &off);

			/* Anon slot exists but has no resident page */
			if (vn == NULL || vn->v_pages == NULL ||
			    (page = page_exists(vn, off)) == NULL) {
				p_index++;
				s_index++;
				continue;
			}

			/*
			 * If large page is found, compute portion of large
			 * page in mapping, and increment indices to the next
			 * large page.
			 */
			if (page->p_szc > 0) {

				pgcnt = page_get_pagecnt(page->p_szc);
				pgshft = page_get_shift(page->p_szc);
				pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;

				/* First page in large page */
				pgstart = p_index & ~pgmsk;
				/* Last page in large page */
				pgend = pgstart + pgcnt - 1;
				/*
				 * Artificially end page if page extends past
				 * end of mapping.
				 */
				if (pgend > p_end)
					pgend = p_end;

				/*
				 * Compute number of pages from large page
				 * which are mapped.
				 */
				pgcnt = pgend - p_index + 1;

				/*
				 * Point indices at page after large page,
				 * or at page after end of mapping.
				 */
				p_index += pgcnt;
				s_index += pgcnt;
			} else {
				p_index++;
				s_index++;
			}

			/*
			 * Assume anon structs with a refcnt
			 * of 1 are not cow shared, so there
			 * is no reason to track them per entity.
			 */
			if (cnt == 1) {
				panon += pgcnt;
				continue;
			}
			for (entity = vmu_entities; entity != NULL;
			    entity = entity->vme_next_calc) {

				result = &entity->vme_result;
				/*
				 * Track cow anons per entity so
				 * they are not double counted.
				 */
				if (vmu_find_insert_anon(entity->vme_anon_hash,
				    (caddr_t)ap) == 0)
					continue;

				result->vmu_rss_all += (pgcnt << PAGESHIFT);
				result->vmu_rss_private +=
				    (pgcnt << PAGESHIFT);
			}
		}
		ANON_LOCK_EXIT(&private_amp->a_rwlock);
	}

	/*
	 * Add up resident anon and swap reserved for private mappings.
	 * NOTE(review): swresv is added without a PAGESHIFT conversion,
	 * unlike panon -- presumably svd->swresv is already in bytes; confirm.
	 */
	if (swresv > 0 || panon > 0) {
		for (entity = vmu_entities; entity != NULL;
		    entity = entity->vme_next_calc) {
			result = &entity->vme_result;
			result->vmu_swap_all += swresv;
			result->vmu_swap_private += swresv;
			result->vmu_rss_all += (panon << PAGESHIFT);
			result->vmu_rss_private += (panon << PAGESHIFT);
		}
	}

	/* Compute resident pages backing shared amp or named vnode */
	if (shared_object != NULL) {
		if (first == NULL) {
			/*
			 * No private amp, or private amp has no anon
			 * structs.  This means entire segment is backed by
			 * the shared object.
			 */
			first = vmu_alloc_bound();
			first->vmb_next = NULL;
			first->vmb_start = s_start;
			first->vmb_end = s_end;
			first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
		}
		/*
		 * Iterate bounds not backed by private amp, and compute
		 * resident pages.
		 *
		 * NOTE(review): first and last are reused below as output
		 * arguments of vmu_insert_lookup_object_bounds(); the list
		 * built above is still reachable through cur, which is the
		 * only pointer used to walk and free it.
		 */
		cur = first;
		while (cur != NULL) {

			if (vmu_insert_lookup_object_bounds(shared_object,
			    cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
			    &first, &last) > 0) {
				/* new bounds, find incore/not-incore */
				if (shared_object->vmo_type ==
				    VMUSAGE_TYPE_VNODE)
					vmu_vnode_update_incore_bounds(
					    (vnode_t *)
					    shared_object->vmo_key, &first,
					    &last);
				else
					vmu_amp_update_incore_bounds(
					    (struct anon_map *)
					    shared_object->vmo_key, &first,
					    &last, incore);
				vmu_merge_bounds(&first, &last);
			}
			for (entity = vmu_entities; entity != NULL;
			    entity = entity->vme_next_calc) {

				result = &entity->vme_result;

				/* Per-entity record of this vnode or amp */
				entity_object = vmu_find_insert_object(
				    shared_object->vmo_type ==
				    VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
					entity->vme_amp_hash,
					shared_object->vmo_key,
					shared_object->vmo_type);

				virt = vmu_insert_lookup_object_bounds(
				    entity_object, cur->vmb_start, cur->vmb_end,
				    VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);

				/* Nothing new in this range for this entity */
				if (virt == 0)
					continue;
				/*
				 * Range visited for this entity
				 */
				rss = vmu_update_bounds(&e_first,
				    &e_last, first, last);
				result->vmu_rss_all += (rss << PAGESHIFT);
				if (shared == B_TRUE && file == B_FALSE) {
					/* shared anon mapping */
					result->vmu_swap_all +=
					    (virt << PAGESHIFT);
					result->vmu_swap_shared +=
					    (virt << PAGESHIFT);
					result->vmu_rss_shared +=
					    (rss << PAGESHIFT);
				} else if (shared == B_TRUE && file == B_TRUE) {
					/* shared file mapping */
					result->vmu_rss_shared +=
					    (rss << PAGESHIFT);
				} else if (shared == B_FALSE &&
				    file == B_TRUE) {
					/* private file mapping */
					result->vmu_rss_private +=
					    (rss << PAGESHIFT);
				}
				vmu_merge_bounds(&e_first, &e_last);
			}
			/* Done with this bound; release it and move on */
			tmp = cur;
			cur = cur->vmb_next;
			vmu_free_bound(tmp);
		}
	}
}
1401 
1402 /*
1403  * Based on the current calculation flags, find the relevant entities
1404  * which are relative to the process.  Then calculate each segment
1405  * in the process'es address space for each relevant entity.
1406  */
1407 static void
1408 vmu_calculate_proc(proc_t *p)
1409 {
1410 	vmu_entity_t *entities = NULL;
1411 	vmu_zone_t *zone;
1412 	vmu_entity_t *tmp;
1413 	struct as *as;
1414 	struct seg *seg;
1415 	int ret;
1416 
1417 	/* Figure out which entities are being computed */
1418 	if ((vmu_data.vmu_system) != NULL) {
1419 		tmp = vmu_data.vmu_system;
1420 		tmp->vme_next_calc = entities;
1421 		entities = tmp;
1422 	}
1423 	if (vmu_data.vmu_calc_flags &
1424 	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
1425 	    VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1426 	    VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1427 	    VMUSAGE_ALL_EUSERS)) {
1428 		ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1429 		    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1430 		    (mod_hash_val_t *)&zone);
1431 		if (ret != 0) {
1432 			zone = vmu_alloc_zone(p->p_zone->zone_id);
1433 			ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1434 			    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1435 			    (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1436 			ASSERT(ret == 0);
1437 		}
1438 		if (zone->vmz_zone != NULL) {
1439 			tmp = zone->vmz_zone;
1440 			tmp->vme_next_calc = entities;
1441 			entities = tmp;
1442 		}
1443 		if (vmu_data.vmu_calc_flags &
1444 		    (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1445 			tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1446 			    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1447 			    zone->vmz_id);
1448 			tmp->vme_next_calc = entities;
1449 			entities = tmp;
1450 		}
1451 		if (vmu_data.vmu_calc_flags &
1452 		    (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1453 			tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1454 			    p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1455 			tmp->vme_next_calc = entities;
1456 			entities = tmp;
1457 		}
1458 		if (vmu_data.vmu_calc_flags &
1459 		    (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1460 			tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1461 			    crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1462 			tmp->vme_next_calc = entities;
1463 			entities = tmp;
1464 		}
1465 		if (vmu_data.vmu_calc_flags &
1466 		    (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1467 			tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1468 			    crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1469 			tmp->vme_next_calc = entities;
1470 			entities = tmp;
1471 		}
1472 	}
1473 	/* Entities which collapse projects and users for all zones */
1474 	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1475 		tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1476 		    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1477 		tmp->vme_next_calc = entities;
1478 		entities = tmp;
1479 	}
1480 	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1481 		tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1482 		    crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1483 		tmp->vme_next_calc = entities;
1484 		entities = tmp;
1485 	}
1486 	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1487 		tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1488 		    crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1489 		tmp->vme_next_calc = entities;
1490 		entities = tmp;
1491 	}
1492 
1493 	ASSERT(entities != NULL);
1494 	/* process all segs in process's address space */
1495 	as = p->p_as;
1496 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1497 	for (seg = AS_SEGFIRST(as); seg != NULL;
1498 	    seg = AS_SEGNEXT(as, seg)) {
1499 		vmu_calculate_seg(entities, seg);
1500 	}
1501 	AS_LOCK_EXIT(as, &as->a_lock);
1502 }
1503 
1504 /*
1505  * Free data created by previous call to vmu_calculate().
1506  */
1507 static void
1508 vmu_clear_calc()
1509 {
1510 	if (vmu_data.vmu_system != NULL)
1511 		vmu_free_entity(vmu_data.vmu_system);
1512 		vmu_data.vmu_system = NULL;
1513 	if (vmu_data.vmu_zones_hash != NULL)
1514 		i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1515 	if (vmu_data.vmu_projects_col_hash != NULL)
1516 		i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1517 	if (vmu_data.vmu_rusers_col_hash != NULL)
1518 		i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1519 	if (vmu_data.vmu_eusers_col_hash != NULL)
1520 		i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1521 
1522 	i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1523 	i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1524 }
1525 
1526 /*
1527  * Free unused data structures.  These can result if the system workload
1528  * decreases between calculations.
1529  */
1530 static void
1531 vmu_free_extra()
1532 {
1533 	vmu_bound_t *tb;
1534 	vmu_object_t *to;
1535 	vmu_entity_t *te;
1536 	vmu_zone_t *tz;
1537 
1538 	while (vmu_data.vmu_free_bounds != NULL) {
1539 		tb = vmu_data.vmu_free_bounds;
1540 		vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1541 		kmem_cache_free(vmu_bound_cache, tb);
1542 	}
1543 	while (vmu_data.vmu_free_objects != NULL) {
1544 		to = vmu_data.vmu_free_objects;
1545 		vmu_data.vmu_free_objects =
1546 		    vmu_data.vmu_free_objects->vmo_next;
1547 		kmem_cache_free(vmu_object_cache, to);
1548 	}
1549 	while (vmu_data.vmu_free_entities != NULL) {
1550 		te = vmu_data.vmu_free_entities;
1551 		vmu_data.vmu_free_entities =
1552 		    vmu_data.vmu_free_entities->vme_next;
1553 		if (te->vme_vnode_hash != NULL)
1554 			mod_hash_destroy_hash(te->vme_vnode_hash);
1555 		if (te->vme_amp_hash != NULL)
1556 			mod_hash_destroy_hash(te->vme_amp_hash);
1557 		if (te->vme_anon_hash != NULL)
1558 			mod_hash_destroy_hash(te->vme_anon_hash);
1559 		kmem_free(te, sizeof (vmu_entity_t));
1560 	}
1561 	while (vmu_data.vmu_free_zones != NULL) {
1562 		tz = vmu_data.vmu_free_zones;
1563 		vmu_data.vmu_free_zones =
1564 		    vmu_data.vmu_free_zones->vmz_next;
1565 		if (tz->vmz_projects_hash != NULL)
1566 			mod_hash_destroy_hash(tz->vmz_projects_hash);
1567 		if (tz->vmz_tasks_hash != NULL)
1568 			mod_hash_destroy_hash(tz->vmz_tasks_hash);
1569 		if (tz->vmz_rusers_hash != NULL)
1570 			mod_hash_destroy_hash(tz->vmz_rusers_hash);
1571 		if (tz->vmz_eusers_hash != NULL)
1572 			mod_hash_destroy_hash(tz->vmz_eusers_hash);
1573 		kmem_free(tz, sizeof (vmu_zone_t));
1574 	}
1575 }
1576 
/*
 * External declaration of pr_pid_cv.
 * NOTE(review): no reference to pr_pid_cv is visible in the functions that
 * follow this declaration -- confirm whether it is still needed here.
 */
extern kcondvar_t *pr_pid_cv;
1578 
1579 /*
1580  * Determine which entity types are relevant and allocate the hashes to
1581  * track them.  Then walk the process table and count rss and swap
1582  * for each process'es address space.  Address space object such as
1583  * vnodes, amps and anons are tracked per entity, so that they are
1584  * not double counted in the results.
1585  *
1586  */
1587 static void
1588 vmu_calculate()
1589 {
1590 	int i = 0;
1591 	int ret;
1592 	proc_t *p;
1593 
1594 	vmu_clear_calc();
1595 
1596 	if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1597 		vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1598 		    ALL_ZONES);
1599 
1600 	/*
1601 	 * Walk process table and calculate rss of each proc.
1602 	 *
1603 	 * Pidlock and p_lock cannot be held while doing the rss calculation.
1604 	 * This is because:
1605 	 *	1.  The calculation allocates using KM_SLEEP.
1606 	 *	2.  The calculation grabs a_lock, which cannot be grabbed
1607 	 *	    after p_lock.
1608 	 *
1609 	 * Since pidlock must be dropped, we cannot simply just walk the
1610 	 * practive list.  Instead, we walk the process table, and sprlock
1611 	 * each process to ensure that it does not exit during the
1612 	 * calculation.
1613 	 */
1614 
1615 	mutex_enter(&pidlock);
1616 	for (i = 0; i < v.v_proc; i++) {
1617 again:
1618 		p = pid_entry(i);
1619 		if (p == NULL)
1620 			continue;
1621 
1622 		mutex_enter(&p->p_lock);
1623 		mutex_exit(&pidlock);
1624 
1625 		if (panicstr) {
1626 			mutex_exit(&p->p_lock);
1627 			return;
1628 		}
1629 
1630 		/* Try to set P_PR_LOCK */
1631 		ret = sprtrylock_proc(p);
1632 		if (ret == -1) {
1633 			/* Process in invalid state */
1634 			mutex_exit(&p->p_lock);
1635 			mutex_enter(&pidlock);
1636 			continue;
1637 		} else if (ret == 1) {
1638 			/*
1639 			 * P_PR_LOCK is already set.  Wait and try again.
1640 			 * This also drops p_lock.
1641 			 */
1642 			sprwaitlock_proc(p);
1643 			mutex_enter(&pidlock);
1644 			goto again;
1645 		}
1646 		mutex_exit(&p->p_lock);
1647 
1648 		vmu_calculate_proc(p);
1649 
1650 		mutex_enter(&p->p_lock);
1651 		sprunlock(p);
1652 		mutex_enter(&pidlock);
1653 	}
1654 	mutex_exit(&pidlock);
1655 
1656 	vmu_free_extra();
1657 }
1658 
1659 /*
1660  * allocate a new cache for N results satisfying flags
1661  */
1662 vmu_cache_t *
1663 vmu_cache_alloc(size_t nres, uint_t flags)
1664 {
1665 	vmu_cache_t *cache;
1666 
1667 	cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1668 	cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1669 	cache->vmc_nresults = nres;
1670 	cache->vmc_flags = flags;
1671 	cache->vmc_refcnt = 1;
1672 	return (cache);
1673 }
1674 
1675 /*
1676  * Make sure cached results are not freed
1677  */
1678 static void
1679 vmu_cache_hold(vmu_cache_t *cache)
1680 {
1681 	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1682 	cache->vmc_refcnt++;
1683 }
1684 
1685 /*
1686  * free cache data
1687  */
1688 static void
1689 vmu_cache_rele(vmu_cache_t *cache)
1690 {
1691 	ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1692 	ASSERT(cache->vmc_refcnt > 0);
1693 	cache->vmc_refcnt--;
1694 	if (cache->vmc_refcnt == 0) {
1695 		kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1696 			cache->vmc_nresults);
1697 		kmem_free(cache, sizeof (vmu_cache_t));
1698 	}
1699 }
1700 
1701 /*
1702  * Copy out the cached results to a caller.  Inspect the callers flags
1703  * and zone to determine which cached results should be copied.
1704  */
1705 static int
1706 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1707     uint_t flags)
1708 {
1709 	vmusage_t *result, *out_result;
1710 	vmusage_t dummy;
1711 	size_t i, count = 0;
1712 	size_t bufsize;
1713 	int ret = 0;
1714 	uint_t types = 0;
1715 
1716 	if (nres != NULL) {
1717 		if (copyin((caddr_t)nres, &bufsize, sizeof (size_t)))
1718 			return (set_errno(EFAULT));
1719 	} else {
1720 		bufsize = 0;
1721 	}
1722 
1723 	/* figure out what results the caller is interested in. */
1724 	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1725 		types |= VMUSAGE_SYSTEM;
1726 	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
1727 		types |= VMUSAGE_ZONE;
1728 	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1729 	    VMUSAGE_COL_PROJECTS))
1730 		types |= VMUSAGE_PROJECTS;
1731 	if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1732 		types |= VMUSAGE_TASKS;
1733 	if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1734 		types |= VMUSAGE_RUSERS;
1735 	if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1736 		types |= VMUSAGE_EUSERS;
1737 
1738 	/* count results for current zone */
1739 	out_result = buf;
1740 	for (result = cache->vmc_results, i = 0;
1741 	    i < cache->vmc_nresults; result++, i++) {
1742 
1743 		/* Do not return "other-zone" results to non-global zones */
1744 		if (curproc->p_zone != global_zone &&
1745 		    curproc->p_zone->zone_id != result->vmu_zoneid)
1746 			continue;
1747 
1748 		/*
1749 		 * If non-global zone requests VMUSAGE_SYSTEM, fake
1750 		 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
1751 		 */
1752 		if (curproc->p_zone != global_zone &&
1753 		    (flags & VMUSAGE_SYSTEM) != 0 &&
1754 		    result->vmu_type == VMUSAGE_ZONE) {
1755 			count++;
1756 			if (out_result != NULL) {
1757 				if (bufsize < count) {
1758 					ret = set_errno(EOVERFLOW);
1759 				} else {
1760 					dummy = *result;
1761 					dummy.vmu_zoneid = ALL_ZONES;
1762 					dummy.vmu_id = 0;
1763 					dummy.vmu_type = VMUSAGE_SYSTEM;
1764 					if (copyout(&dummy, out_result,
1765 					    sizeof (vmusage_t)))
1766 						return (set_errno(
1767 						    EFAULT));
1768 					out_result++;
1769 				}
1770 			}
1771 		}
1772 
1773 		/* Skip results that do not match requested type */
1774 		if ((result->vmu_type & types) == 0)
1775 			continue;
1776 
1777 		/* Skip collated results if not requested */
1778 		if (result->vmu_zoneid == ALL_ZONES) {
1779 			if (result->vmu_type == VMUSAGE_PROJECTS &&
1780 			    (flags & VMUSAGE_COL_PROJECTS) == 0)
1781 				continue;
1782 			if (result->vmu_type == VMUSAGE_EUSERS &&
1783 			    (flags & VMUSAGE_COL_EUSERS) == 0)
1784 				continue;
1785 			if (result->vmu_type == VMUSAGE_RUSERS &&
1786 			    (flags & VMUSAGE_COL_RUSERS) == 0)
1787 				continue;
1788 		}
1789 
1790 		/* Skip "other zone" results if not requested */
1791 		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1792 			if (result->vmu_type == VMUSAGE_ZONE &&
1793 			    (flags & VMUSAGE_ALL_ZONES) == 0)
1794 				continue;
1795 			if (result->vmu_type == VMUSAGE_PROJECTS &&
1796 			    (flags & (VMUSAGE_ALL_PROJECTS |
1797 			    VMUSAGE_COL_PROJECTS)) == 0)
1798 				continue;
1799 			if (result->vmu_type == VMUSAGE_TASKS &&
1800 			    (flags & VMUSAGE_ALL_TASKS) == 0)
1801 				continue;
1802 			if (result->vmu_type == VMUSAGE_RUSERS &&
1803 			    (flags & (VMUSAGE_ALL_RUSERS |
1804 			    VMUSAGE_COL_RUSERS)) == 0)
1805 				continue;
1806 			if (result->vmu_type == VMUSAGE_EUSERS &&
1807 			    (flags & (VMUSAGE_ALL_EUSERS |
1808 			    VMUSAGE_COL_EUSERS)) == 0)
1809 				continue;
1810 		}
1811 		count++;
1812 		if (out_result != NULL) {
1813 			if (bufsize < count) {
1814 				ret = set_errno(EOVERFLOW);
1815 			} else {
1816 				if (copyout(result, out_result,
1817 				    sizeof (vmusage_t)))
1818 					return (set_errno(EFAULT));
1819 				out_result++;
1820 			}
1821 		}
1822 	}
1823 	if (nres != NULL)
1824 		if (copyout(&count, (void *)nres, sizeof (size_t)))
1825 			return (set_errno(EFAULT));
1826 
1827 	return (ret);
1828 }
1829 
/*
 * vm_getusage()
 *
 * Counts rss and swap by zone, project, task, and/or user.  The flags argument
 * determines the type of results structures returned.  Flags requesting
 * results from more than one zone are "flattened" to the local zone if the
 * caller is not the global zone.
 *
 * If a sufficiently recent cached result covering the requested flags
 * exists, it is copied out directly.  Otherwise, if no calculation is in
 * progress, the calling thread performs the calculation, publishes a new
 * cache and copies out from it.  If another thread is already calculating,
 * the caller registers its flags as pending, waits for that thread to
 * finish, and then starts over.
 *
 * args:
 *	flags:	bitmap consisting of one or more of VMUSAGE_*.
 *	age:	maximum allowable age (time since counting was done) in
 *		seconds of the results.  Results from previous callers are
 *		cached in kernel.
 *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only nres
 *		set on success.
 *	nres:	Set to number of vmusage_t structures pointed to by buf
 *		before calling vm_getusage().
 *		On return 0 (success) or EOVERFLOW, is set to the number of
 *		result structures returned or attempted to return.
 *
 * returns 0 on success, -1 on failure:
 *	EINTR (interrupted)
 *	EOVERFLOW (nres too small for results, nres set to needed value for
 *	    success; this is the errno set by vmu_copyout_results())
 *	EINVAL (flags invalid)
 *	EFAULT (bad address for buf or nres)
 */
int
vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres)
{
	vmu_entity_t *entity;
	vmusage_t *result;
	int ret = 0;
	int cacherecent = 0;
	hrtime_t now;
	uint_t flags_orig;

	/*
	 * Non-global zones cannot request system wide and/or collated
	 * results, or the system result, so munge the flags accordingly.
	 * The caller's original flags are kept in flags_orig and are what
	 * vmu_copyout_results() filters on.
	 */
	flags_orig = flags;
	if (curproc->p_zone != global_zone) {
		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
			flags |= VMUSAGE_PROJECTS;
		}
		if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
			flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
			flags |= VMUSAGE_RUSERS;
		}
		if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
			flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
			flags |= VMUSAGE_EUSERS;
		}
		if (flags & VMUSAGE_SYSTEM) {
			flags &= ~VMUSAGE_SYSTEM;
			flags |= VMUSAGE_ZONE;
		}
	}

	/* Check for unknown flags */
	if ((flags & (~VMUSAGE_MASK)) != 0)
		return (set_errno(EINVAL));

	/* Check for no flags */
	if ((flags & VMUSAGE_MASK) == 0)
		return (set_errno(EINVAL));

	mutex_enter(&vmu_data.vmu_lock);
	/*
	 * NOTE(review): now is sampled once here and is not refreshed when
	 * the code jumps back to start after waiting; likewise cacherecent
	 * is never reset once it has been set to 1.
	 */
	now = gethrtime();

start:
	/* vmu_lock is held whenever control reaches this label */
	if (vmu_data.vmu_cache != NULL) {

		vmu_cache_t *cache;

		/* Cache counts as recent if younger than age seconds */
		if ((vmu_data.vmu_cache->vmc_timestamp +
		    ((hrtime_t)age * NANOSEC)) > now)
			cacherecent = 1;

		if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
		    cacherecent == 1) {
			/*
			 * Cache satisfies the request.  Hold it so it cannot
			 * be freed while vmu_lock is dropped for the copyout.
			 */
			cache = vmu_data.vmu_cache;
			vmu_cache_hold(cache);
			mutex_exit(&vmu_data.vmu_lock);

			ret = vmu_copyout_results(cache, buf, nres, flags_orig);
			mutex_enter(&vmu_data.vmu_lock);
			vmu_cache_rele(cache);
			if (vmu_data.vmu_pending_waiters > 0)
				cv_broadcast(&vmu_data.vmu_cv);
			mutex_exit(&vmu_data.vmu_lock);
			return (ret);
		}
		/*
		 * If the cache is recent, it is likely that there are other
		 * consumers of vm_getusage running, so add their flags to the
		 * desired flags for the calculation.
		 */
		if (cacherecent == 1)
			flags = vmu_data.vmu_cache->vmc_flags | flags;
	}
	if (vmu_data.vmu_calc_thread == NULL) {

		vmu_cache_t *cache;

		/* No calculation in progress: this thread becomes the calculator */
		vmu_data.vmu_calc_thread = curthread;
		vmu_data.vmu_calc_flags = flags;
		vmu_data.vmu_entities = NULL;
		vmu_data.vmu_nentities = 0;
		/* Fold in the flags of threads that are already waiting */
		if (vmu_data.vmu_pending_waiters > 0)
			vmu_data.vmu_calc_flags |=
			    vmu_data.vmu_pending_flags;

		vmu_data.vmu_pending_flags = 0;
		/* The calculation itself runs without vmu_lock held */
		mutex_exit(&vmu_data.vmu_lock);
		vmu_calculate();
		mutex_enter(&vmu_data.vmu_lock);
		/* copy results to cache */
		if (vmu_data.vmu_cache != NULL)
			vmu_cache_rele(vmu_data.vmu_cache);
		cache = vmu_data.vmu_cache =
		    vmu_cache_alloc(vmu_data.vmu_nentities,
			vmu_data.vmu_calc_flags);

		result = cache->vmc_results;
		for (entity = vmu_data.vmu_entities; entity != NULL;
		    entity = entity->vme_next) {
			*result = entity->vme_result;
			result++;
		}
		cache->vmc_timestamp = gethrtime();
		/*
		 * Second reference for this thread's own copyout below; the
		 * reference from vmu_cache_alloc() belongs to vmu_data.
		 */
		vmu_cache_hold(cache);

		vmu_data.vmu_calc_flags = 0;
		vmu_data.vmu_calc_thread = NULL;

		/* Wake threads that were waiting for this calculation */
		if (vmu_data.vmu_pending_waiters > 0)
			cv_broadcast(&vmu_data.vmu_cv);

		mutex_exit(&vmu_data.vmu_lock);

		/* copy cache */
		ret = vmu_copyout_results(cache, buf, nres, flags_orig);
		mutex_enter(&vmu_data.vmu_lock);
		vmu_cache_rele(cache);
		mutex_exit(&vmu_data.vmu_lock);

		return (ret);
	}
	/*
	 * Another thread is calculating.  Register our flags so that the
	 * next calculation includes them, then wait until no calculation is
	 * in progress and start over.
	 */
	vmu_data.vmu_pending_flags |= flags;
	vmu_data.vmu_pending_waiters++;
	while (vmu_data.vmu_calc_thread != NULL) {
		/* A return of 0 from cv_wait_sig() is treated as an interrupt */
		if (cv_wait_sig(&vmu_data.vmu_cv,
		    &vmu_data.vmu_lock) == 0) {
			vmu_data.vmu_pending_waiters--;
			mutex_exit(&vmu_data.vmu_lock);
			return (set_errno(EINTR));
		}
	}
	vmu_data.vmu_pending_waiters--;
	goto start;
}
1993