xref: /titanic_51/usr/src/uts/i86xpv/os/balloon.c (revision 81f63062a60a29358c252e0d10807f8a8547fbb5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/balloon_impl.h>
30 #include <sys/hypervisor.h>
31 #include <xen/sys/xenbus_impl.h>
32 #include <sys/atomic.h>
33 #include <sys/cmn_err.h>
34 #include <sys/disp.h>
35 #include <sys/callb.h>
36 #include <xen/public/memory.h>
37 #include <vm/hat.h>
38 #include <sys/promif.h>
39 #include <vm/seg_kmem.h>
40 #include <sys/memnode.h>
41 #include <sys/param.h>
42 #include <vm/vm_dep.h>
43 #include <sys/mman.h>
44 #include <sys/memlist.h>
45 #include <sys/sysmacros.h>
46 #include <sys/machsystm.h>
47 #include <sys/sdt.h>
48 
49 /*
50  * This file implements a balloon thread, which controls a domain's memory
51  * reservation, or the amount of memory a domain is currently allocated.
52  * The hypervisor provides the current memory reservation through xenbus,
53  * so we register a watch on this.  We will then be signalled when the
54  * reservation changes.  If it goes up, we map the new mfn's to our pfn's
55  * (allocating page_t's if necessary), and release them into the system.
56  * If the reservation goes down, we grab pages and release them back to
57  * the hypervisor, saving the page_t's for later use.
58  */
59 
/*
 * Various structures needed by the balloon thread
 */
/* Page counters (current, target, low/high water marks, max, hard limit) */
static bln_stats_t bln_stats;
/* The worker thread; created once by balloon_config_watch() */
static kthread_t *bln_thread;
/* Held by the handler and the worker while they touch bln_stats */
static kmutex_t bln_mutex;
/* Signalled by balloon_handler(); the worker thread waits on it */
static kcondvar_t bln_cv;
/* xenbus watch registered on the "memory/target" node */
static struct xenbus_watch bln_watch;
/*
 * Highest mfn handed to us by balloon_inc_reservation(); passed as a hint
 * to page_get_high_mfn() by balloon_dec_reservation(), which then resets it.
 */
static mfn_t new_high_mfn;
69 
70 /*
71  * For holding spare page_t structures - keep a singly-linked list.
72  * The list may hold both valid (pagenum < mfn_count) and invalid
73  * (pagenum >= mfn_count) page_t's.  Valid page_t's should be inserted
74  * at the front, and invalid page_t's at the back.  Removal should
75  * always be from the front.  This is a singly-linked list using
76  * p_next, so p_prev is always NULL.
77  */
78 static page_t *bln_spare_list_front, *bln_spare_list_back;
79 
/* When non-zero, balloon_free_pages() zeroes pages before giving them back */
int balloon_zero_memory = 1;
/* balloon_dec_reservation() gives up if kmem_avail() is below this (8MB) */
size_t balloon_minkmem = (8 * 1024 * 1024);
/* One page of kernel VA, allocated in balloon_init(), used for zeroing */
static caddr_t balloon_kva;
/* Serializes use of balloon_kva in balloon_zero_page() */
static kmutex_t balloon_kva_mutex;
static void balloon_zero_page(pfn_t pfn);
85 
86 /*
87  * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
88  * slowdown when calling multiple times.  If we're reassigning less than the
89  * quota defined here, we just accept the slowdown.  If the count is greater
90  * than the quota, we tell the contig alloc code to stop its accounting until
91  * we're done.  Setting the quota to less than 2 is not supported.
92  *
93  * Note that we define our own wrapper around the external
94  * clear_and_lock_contig_pfnlist(), but we just use the version of
95  * unlock_contig_pfnlist() in vm_machdep.c.
96  */
97 uint_t bln_contig_list_quota = 50;
98 
99 extern void clear_and_lock_contig_pfnlist(void);
100 extern void unlock_contig_pfnlist(void);
101 
102 /*
103  * Lock the pfnlist if necessary (see above), and return whether we locked it.
104  */
105 static int
106 balloon_lock_contig_pfnlist(int count) {
107 	if (count > bln_contig_list_quota) {
108 		clear_and_lock_contig_pfnlist();
109 		return (1);
110 	} else {
111 		return (0);
112 	}
113 }
114 
115 /*
116  * The page represented by pp is being given back to the hypervisor.
117  * Add the page_t structure to our spare list.
118  */
119 static void
120 balloon_page_add(page_t *pp)
121 {
122 	/*
123 	 * We need to keep the page exclusively locked
124 	 * to prevent swrand from grabbing it.
125 	 */
126 	ASSERT(PAGE_EXCL(pp));
127 	ASSERT(MUTEX_HELD(&bln_mutex));
128 
129 	pp->p_prev = NULL;
130 	if (bln_spare_list_front == NULL) {
131 		bln_spare_list_front = bln_spare_list_back = pp;
132 		pp->p_next = NULL;
133 	} else if (pp->p_pagenum >= mfn_count) {
134 		/*
135 		 * The pfn is invalid, so add at the end of list.  Since these
136 		 * adds should *only* be done by balloon_init_new_pages(), and
137 		 * that does adds in order, the following ASSERT should
138 		 * never trigger.
139 		 */
140 		ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
141 		bln_spare_list_back->p_next = pp;
142 		pp->p_next = NULL;
143 		bln_spare_list_back = pp;
144 	} else {
145 		/* Add at beginning of list */
146 		pp->p_next = bln_spare_list_front;
147 		bln_spare_list_front = pp;
148 	}
149 }
150 
151 /*
152  * Return a page_t structure from our spare list, or NULL if none are available.
153  */
154 static page_t *
155 balloon_page_sub(void)
156 {
157 	page_t *pp;
158 
159 	ASSERT(MUTEX_HELD(&bln_mutex));
160 	if (bln_spare_list_front == NULL) {
161 		return (NULL);
162 	}
163 
164 	pp = bln_spare_list_front;
165 	ASSERT(PAGE_EXCL(pp));
166 	ASSERT(pp->p_pagenum <= mfn_count);
167 	if (pp->p_pagenum == mfn_count) {
168 		return (NULL);
169 	}
170 
171 	bln_spare_list_front = pp->p_next;
172 	if (bln_spare_list_front == NULL)
173 		bln_spare_list_back = NULL;
174 	pp->p_next = NULL;
175 	return (pp);
176 }
177 
178 /*
179  * NOTE: We currently do not support growing beyond the boot memory size,
180  * so the following function will not be called.  It is left in here with
181  * the hope that someday this restriction can be lifted, and this code can
182  * be used.
183  */
184 
/*
 * This structure is placed at the start of every block of new pages.
 * balloon_init_new_pages() overlays it on the VA it allocates for the meta
 * pages: the memseg and memlist come first, and pages[] is used as the base
 * of the page_t array that follows them.
 */
typedef struct {
	struct memseg	memseg;		/* linked onto the global memsegs list */
	struct memlist	memlist;	/* handed to memlist_add() */
	page_t		pages[1];	/* first of the page_t structures */
} mem_structs_t;
193 
194 /*
195  * To make the math below slightly less confusing, we calculate the first
196  * two parts here.  page_t's are handled separately, so they are not included.
197  */
198 #define	MEM_STRUCT_SIZE	(sizeof (struct memseg) + sizeof (struct memlist))
199 
/*
 * We want to add memory, but have no spare page_t structures.  Use some of
 * our new memory for the page_t structures.
 *
 * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
 *
 * framelist holds the mfns of the newly obtained pages and count is the
 * number of entries in it.  The first metapgs entries are consumed as
 * "meta" pages that hold the memseg, memlist and page_t structures.
 *
 * Returns the number of framelist entries that were used as meta pages.
 *
 * balloon_page_add() asserts that bln_mutex is held, so the caller must
 * hold it.
 */
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
	pgcnt_t	metapgs, totalpgs, num_pages;
	paddr_t	metasz;
	pfn_t	meta_start;
	page_t	*page_array;
	caddr_t	va;
	int	i, rv, locked;
	mem_structs_t *mem;
	struct memseg *segp;

	/* Calculate the number of pages we're going to add */
	totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;

	/*
	 * The following calculates the number of "meta" pages -- the pages
	 * that will be required to hold page_t structures for all new pages.
	 * Proof of this calculation is left up to the reader.
	 */
	metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	/*
	 * Given the number of page_t structures we need, is there also
	 * room in our meta pages for a memseg and memlist struct?
	 * If not, we'll need one more meta page.
	 */
	if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
	    MEM_STRUCT_SIZE))
		metapgs++;

	/*
	 * metapgs is calculated from totalpgs, which may be much larger than
	 * count.  If we don't have enough pages, all of the pages in this
	 * batch will be made meta pages, and a future trip through
	 * balloon_inc_reservation() will add the rest of the meta pages.
	 */
	if (metapgs > count)
		metapgs = count;

	/*
	 * Figure out the number of page_t structures that can fit in metapgs
	 *
	 * This will cause us to initialize more page_t structures than we
	 * need - these may be used in future memory increases.
	 */
	metasz = pfn_to_pa(metapgs);
	num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);

	DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
	    num_pages, pgcnt_t, metapgs);

	/*
	 * We only increment mfn_count by count, not num_pages, to keep the
	 * space of all valid pfns contiguous.  This means we create page_t
	 * structures with invalid pagenums -- we deal with this situation
	 * in balloon_page_sub.
	 */
	mfn_count += count;

	/*
	 * Get a VA for the pages that will hold page_t and other structures.
	 * The memseg and memlist structures will go at the beginning, with
	 * the page_t structures following.
	 */
	va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
	/* LINTED: improper alignment */
	mem = (mem_structs_t *)va;
	page_array = mem->pages;

	/* The new pfns start right after the current highest pfn */
	meta_start = bln_stats.bln_max_pages;

	/*
	 * Set the mfn to pfn mapping for the meta pages.
	 */
	locked = balloon_lock_contig_pfnlist(metapgs);
	for (i = 0; i < metapgs; i++) {
		reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * For our meta pages, map them in and zero the page.
	 * This will be the first time touching the new pages.
	 */
	hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
	    PROT_READ | PROT_WRITE,
	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
	bzero(va, metasz);

	/*
	 * Initialize the page array for the new pages.
	 * These first metapgs entries describe the meta pages themselves;
	 * they are locked exclusively and are not put on the spare list.
	 */
	for (i = 0; i < metapgs; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
	}

	/*
	 * For the rest of the pages, initialize the page_t struct and
	 * add them to the free list
	 */
	for (i = metapgs; i < num_pages; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
		balloon_page_add(&page_array[i]);
	}

	/*
	 * Remember where I said that we don't call this function?  The missing
	 * code right here is why.  We need to set up kpm mappings for any new
	 * pages coming in.  However, if someone starts up a domain with small
	 * memory, then greatly increases it, we could get in some horrible
	 * deadlock situations as we steal page tables for kpm use, and
	 * userland applications take them right back before we can use them
	 * to set up our new memory.  Once a way around that is found, and a
	 * few other changes are made, we'll be able to enable this code.
	 */

	/*
	 * Update kernel structures, part 1: memsegs list
	 */
	mem->memseg.pages_base = meta_start;
	mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
	mem->memseg.pages = &page_array[0];
	mem->memseg.epages = &page_array[num_pages - 1];
	mem->memseg.next = NULL;
	memsegs_lock(1);
	/* Walk to the tail of the memsegs list and append the new segment */
	for (segp = memsegs; segp->next != NULL; segp = segp->next)
		;
	segp->next = &mem->memseg;
	memsegs_unlock(1);

	/*
	 * Update kernel structures, part 2: mem_node array
	 */
	mem_node_add_slice(meta_start, bln_stats.bln_max_pages);

	/*
	 * Update kernel structures, part 3: phys_install array
	 * (*sigh* how many of these things do we need?)
	 *
	 * NOTE(review): the start argument is a byte address from
	 * pfn_to_pa(), but the length passed is num_pages (a page count).
	 * Confirm the units memlist_add() expects before enabling this code.
	 */
	memlist_write_lock();
	memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
	    &phys_install);
	memlist_write_unlock();

	build_pfn_hash();

	return (metapgs);
}
365 
366 /* How many ulong_t's can we fit on a page? */
367 #define	FRAME_ARRAY_SIZE	(PAGESIZE / sizeof (ulong_t))
368 
369 /*
370  * These are too large to declare on the stack, so we make them static instead
371  */
372 static ulong_t	mfn_frames[FRAME_ARRAY_SIZE];
373 static pfn_t	pfn_frames[FRAME_ARRAY_SIZE];
374 
/*
 * This function is called when our reservation is increasing.  Make a
 * hypervisor call to get our new pages, then integrate them into the system.
 *
 * credit is the number of pages we would like to gain; at most
 * FRAME_ARRAY_SIZE pages are requested per call.
 *
 * Returns the number of pages obtained from the hypervisor and kept, or 0 if
 * the hypervisor call reported an error.  Migration is blocked for the
 * duration of the operation.
 *
 * balloon_page_sub() asserts that bln_mutex is held, so the caller must
 * hold it.
 */
static spgcnt_t
balloon_inc_reservation(ulong_t credit)
{
	int	i, cnt, locked;
	int	meta_pg_start, meta_pg_end;
	long	rv;
	page_t	*pp;
	page_t	*new_list_front, *new_list_back;

	rv = 0;
	new_list_front = new_list_back = NULL;
	meta_pg_start = meta_pg_end = 0;
	bzero(mfn_frames, PAGESIZE);

	if (credit > FRAME_ARRAY_SIZE)
		credit = FRAME_ARRAY_SIZE;

	xen_block_migrate();
	rv = balloon_alloc_pages(credit, mfn_frames);

	if (rv < 0) {
		xen_allow_migrate();
		return (0);
	}
	/*
	 * Pair each new mfn with a spare page_t, building a private list
	 * (linked through p_next) of the page_t's we will free below.
	 */
	for (i = 0; i < rv; i++) {
		/* Remember the highest mfn seen for balloon_dec_reservation */
		if (mfn_frames[i] > new_high_mfn)
			new_high_mfn = mfn_frames[i];

		pp = balloon_page_sub();
		if (pp == NULL) {
			/*
			 * We pass the index into the current mfn array,
			 * then move the counter past the mfns we used
			 */
			meta_pg_start = i;
			cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
			i += cnt;
			meta_pg_end = i;
			if (i < rv) {
				pp = balloon_page_sub();
			} else {
				ASSERT(i == rv);
			}
		}
		if (pp == NULL) {
			break;
		}

		if (new_list_back == NULL) {
			new_list_front = new_list_back = pp;
		} else {
			new_list_back->p_next = pp;
			new_list_back = pp;
		}
		pp->p_next = NULL;
	}
	/* cnt is the number of mfn_frames entries consumed, meta included */
	cnt = i;
	locked = balloon_lock_contig_pfnlist(cnt);
	/*
	 * Assign mfns to pfns in two passes, skipping the range
	 * [meta_pg_start, meta_pg_end), which was already assigned by
	 * balloon_init_new_pages().
	 */
	for (i = 0, pp = new_list_front; (i < meta_pg_start) && (pp != NULL);
	    i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	for (i = meta_pg_end; (i < cnt) && (pp != NULL); i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	if (locked)
		unlock_contig_pfnlist();
	/* Release the newly backed pages into the system */
	while (new_list_front != NULL) {
		pp = new_list_front;
		new_list_front = pp->p_next;
		page_free(pp, 1);
	}
	/* Meta pages are not returned to the system, so don't unreserve them */
	page_unresv(cnt - (meta_pg_end - meta_pg_start));

	if (cnt < rv) {
		/*
		 * We couldn't get page structures.
		 *
		 * This shouldn't happen, but causes no real harm if it does.
		 * On debug kernels, we'll flag it.  On all kernels, we'll
		 * give back the pages we couldn't assign.
		 *
		 * NOTE(review): i was reused by the reassign loops above;
		 * the code below presumably expects i == cnt at this point.
		 * Confirm that holds on every path.
		 */
#ifdef DEBUG
		cmn_err(CE_WARN, "Could only assign %d of %ld pages", i, rv);
#endif	/* DEBUG */

		(void) balloon_free_pages(rv - i, &mfn_frames[i], NULL, NULL);

		rv = i;
	}

	xen_allow_migrate();
	return (rv);
}
473 
474 /*
475  * This function is called when we want to decrease the memory reservation
476  * of our domain.  Allocate the memory and make a hypervisor call to give
477  * it back.
478  */
479 static spgcnt_t
480 balloon_dec_reservation(ulong_t debit)
481 {
482 	int	i, locked;
483 	long	rv;
484 	page_t	*pp;
485 
486 	bzero(mfn_frames, sizeof (mfn_frames));
487 	bzero(pfn_frames, sizeof (pfn_frames));
488 
489 	if (debit > FRAME_ARRAY_SIZE) {
490 		debit = FRAME_ARRAY_SIZE;
491 	}
492 
493 	/*
494 	 * Don't bother if there isn't a safe amount of kmem left.
495 	 */
496 	if (kmem_avail() < balloon_minkmem) {
497 		kmem_reap();
498 		if (kmem_avail() < balloon_minkmem)
499 			return (0);
500 	}
501 
502 	if (page_resv(debit, KM_NOSLEEP) == 0) {
503 		return (0);
504 	}
505 	xen_block_migrate();
506 	for (i = 0; i < debit; i++) {
507 		pp = page_get_high_mfn(new_high_mfn);
508 		new_high_mfn = 0;
509 		if (pp == NULL) {
510 			/*
511 			 * Call kmem_reap(), then try once more,
512 			 * but only if there is a safe amount of
513 			 * kmem left.
514 			 */
515 			kmem_reap();
516 			if (kmem_avail() < balloon_minkmem ||
517 			    (pp = page_get_high_mfn(0)) == NULL) {
518 				debit = i;
519 				break;
520 			}
521 		}
522 		ASSERT(PAGE_EXCL(pp));
523 		ASSERT(!hat_page_is_mapped(pp));
524 
525 		balloon_page_add(pp);
526 		pfn_frames[i] = pp->p_pagenum;
527 		mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
528 	}
529 	if (debit == 0) {
530 		xen_allow_migrate();
531 		return (0);
532 	}
533 
534 	/*
535 	 * Remove all mappings for the pfns from the system
536 	 */
537 	locked = balloon_lock_contig_pfnlist(debit);
538 	for (i = 0; i < debit; i++) {
539 		reassign_pfn(pfn_frames[i], MFN_INVALID);
540 	}
541 	if (locked)
542 		unlock_contig_pfnlist();
543 
544 	rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);
545 
546 	if (rv < 0) {
547 		cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
548 		    "failed - up to %lu pages lost (error = %ld)", debit, rv);
549 		rv = 0;
550 	} else if (rv != debit) {
551 		panic("Unexpected return value (%ld) from decrease reservation "
552 		    "hypervisor call", rv);
553 	}
554 
555 	xen_allow_migrate();
556 	return (rv);
557 }
558 
559 /*
560  * This function is the callback which is called when the memory/target
561  * node is changed.  When it is fired, we will read a new reservation
562  * target for our domain and signal the worker thread to make the change.
563  *
564  * If the reservation is larger than we can handle, we issue a warning.  dom0
565  * does this automatically every boot, so we skip the first warning on dom0.
566  */
567 /*ARGSUSED*/
568 static void
569 balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
570 {
571 	ulong_t new_target_kb;
572 	pgcnt_t	new_target_pages;
573 	int rv;
574 	static uchar_t warning_cnt = 0;
575 
576 	rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb);
577 	if (rv != 0) {
578 		return;
579 	}
580 
581 	/* new_target is in kB - change this to pages */
582 	new_target_pages = kbtop(new_target_kb);
583 
584 	DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);
585 
586 	/*
587 	 * Unfortunately, dom0 may give us a target that is larger than
588 	 * our max limit.  Re-check the limit, and, if the new target is
589 	 * too large, adjust it downwards.
590 	 */
591 	mutex_enter(&bln_mutex);
592 	if (new_target_pages > bln_stats.bln_max_pages) {
593 		DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
594 		    new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
595 		if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
596 			cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
597 			    "larger than original memory size (0x%lx pages). "
598 			    "Ballooning beyond original memory size is not "
599 			    "allowed.",
600 			    new_target_pages, bln_stats.bln_max_pages);
601 		}
602 		warning_cnt = 1;
603 		bln_stats.bln_new_target = bln_stats.bln_max_pages;
604 	} else {
605 		bln_stats.bln_new_target = new_target_pages;
606 	}
607 
608 	mutex_exit(&bln_mutex);
609 	cv_signal(&bln_cv);
610 }
611 
612 /*
613  * bln_wait_sec can be used to throttle the hv calls, but by default it's
614  * turned off.  If a balloon attempt fails, the wait time is forced on, and
615  * then is exponentially increased as further attempts fail.
616  */
617 uint_t bln_wait_sec = 0;
618 uint_t bln_wait_shift = 1;
619 
620 /*
621  * This is the main balloon thread.  Wait on the cv.  When woken, if our
622  * reservation has changed, call the appropriate function to adjust the
623  * reservation.
624  */
625 static void
626 balloon_worker_thread(void)
627 {
628 	uint_t		bln_wait;
629 	callb_cpr_t	cprinfo;
630 	spgcnt_t	rv;
631 
632 	bln_wait = bln_wait_sec;
633 
634 	CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
635 	for (;;) {
636 		rv = 0;
637 
638 		mutex_enter(&bln_mutex);
639 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
640 		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
641 			/*
642 			 * We weren't able to fully complete the request
643 			 * last time through, so try again.
644 			 */
645 			(void) cv_timedwait(&bln_cv, &bln_mutex,
646 			    lbolt + (bln_wait * hz));
647 		} else {
648 			cv_wait(&bln_cv, &bln_mutex);
649 		}
650 		CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);
651 
652 		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
653 			if (bln_stats.bln_new_target <
654 			    bln_stats.bln_current_pages) {
655 				/* reservation shrunk */
656 				rv = -balloon_dec_reservation(
657 				    bln_stats.bln_current_pages -
658 				    bln_stats.bln_new_target);
659 			} else if (bln_stats.bln_new_target >
660 			    bln_stats.bln_current_pages) {
661 				/* reservation grew */
662 				rv = balloon_inc_reservation(
663 				    bln_stats.bln_new_target -
664 				    bln_stats.bln_current_pages);
665 			}
666 		}
667 		if (rv == 0) {
668 			if (bln_wait == 0) {
669 				bln_wait = 1;
670 			} else {
671 				bln_wait <<= bln_wait_shift;
672 			}
673 		} else {
674 			bln_stats.bln_current_pages += rv;
675 			bln_wait = bln_wait_sec;
676 		}
677 		if (bln_stats.bln_current_pages < bln_stats.bln_low)
678 			bln_stats.bln_low = bln_stats.bln_current_pages;
679 		else if (bln_stats.bln_current_pages > bln_stats.bln_high)
680 			bln_stats.bln_high = bln_stats.bln_current_pages;
681 		mutex_exit(&bln_mutex);
682 	}
683 }
684 
685 /*
686  * Called after balloon_init(), which is below.  The xenbus thread is up
687  * and running, so we can register our watch and create the balloon thread.
688  */
689 static void
690 balloon_config_watch(int state)
691 {
692 	if (state != XENSTORE_UP)
693 		return;
694 
695 	bln_watch.node = "memory/target";
696 	bln_watch.callback = balloon_handler;
697 	if (register_xenbus_watch(&bln_watch)) {
698 		cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
699 		    "thread will be disabled");
700 		return;
701 	}
702 
703 	if (bln_thread == NULL)
704 		bln_thread = thread_create(NULL, 0, balloon_worker_thread,
705 		    NULL, 0, &p0, TS_RUN, minclsyspri);
706 }
707 
708 /*
709  * Basic initialization of the balloon thread.  Set all of our variables,
710  * and register a callback for later when we can register a xenbus watch.
711  */
712 void
713 balloon_init(pgcnt_t nr_pages)
714 {
715 	domid_t domid = DOMID_SELF;
716 
717 	bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
718 	bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
719 	bln_stats.bln_max_pages = nr_pages;
720 	cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);
721 
722 	/* init balloon zero logic */
723 	balloon_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
724 	mutex_init(&balloon_kva_mutex, NULL, MUTEX_DRIVER, NULL);
725 
726 	bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
727 	    XENMEM_maximum_reservation, &domid);
728 
729 	(void) xs_register_xenbus_callback(balloon_config_watch);
730 }
731 
732 /*
733  * These functions are called from the network drivers when they gain a page
734  * or give one away.  We simply update our count.  Note that the counter
735  * tracks the number of pages we give away, so we need to subtract any
736  * amount passed to balloon_drv_added.
737  */
738 void
739 balloon_drv_added(int64_t delta)
740 {
741 	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
742 }
743 
744 void
745 balloon_drv_subtracted(int64_t delta)
746 {
747 	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
748 }
749 
750 /*
751  * balloon_alloc_pages()
752  *	Allocate page_cnt mfns.  mfns storage provided by the caller.  Returns
753  *	the number of pages allocated, which could be less than page_cnt, or
754  *	a negative number if an error occurred.
755  */
756 long
757 balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
758 {
759 	xen_memory_reservation_t memres;
760 	long rv;
761 
762 	bzero(&memres, sizeof (memres));
763 	/*LINTED: constant in conditional context*/
764 	set_xen_guest_handle(memres.extent_start, mfns);
765 	memres.domid = DOMID_SELF;
766 	memres.nr_extents = page_cnt;
767 
768 	rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
769 	if (rv > 0)
770 		atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
771 	return (rv);
772 }
773 
/*
 * balloon_free_pages()
 *    free page_cnt pages, using any combination of mfns, pfns, and kva as long
 *    as they refer to the same mapping. We need to zero the pages before
 *    giving them back to the hypervisor. kva space is not free'd up in case
 *    the caller wants to re-use it.
 *
 *    page_cnt - number of pages to give back
 *    mfns     - array of machine frame numbers, or NULL
 *    kva      - page-aligned kernel VA mapping the pages, or NULL
 *    pfns     - array of pfns, or NULL
 *
 *    Zeroing is skipped entirely when balloon_zero_memory is 0.
 *
 *    Returns page_cnt.  A failed decrease_reservation hypercall results in a
 *    cmn_err(CE_PANIC) rather than an error return.
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	xen_memory_reservation_t memdec;
	mfn_t mfn;
	pfn_t pfn;
	uint_t i;
	long e;


#if DEBUG
	/* make sure kva is page aligned and maps to first pfn */
	if (kva != NULL) {
		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
		if (pfns != NULL) {
			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
		}
	}
#endif

	/* if we have a kva, we can clean all pages with just one bzero */
	if ((kva != NULL) && balloon_zero_memory) {
		bzero(kva, (page_cnt * PAGESIZE));
	}

	/* if we were given a kva and/or a pfn */
	if ((kva != NULL) || (pfns != NULL)) {

		/*
		 * All the current callers only pass 1 page when using kva or
		 * pfns, and use mfns when passing multiple pages.  If that
		 * assumption is changed, the following code will need some
		 * work.  The following ASSERT() guarantees we're respecting
		 * the io locking quota.
		 */
		ASSERT(page_cnt < bln_contig_list_quota);

		/* go through all the pages */
		for (i = 0; i < page_cnt; i++) {

			/* get the next pfn */
			if (pfns == NULL) {
				pfn = hat_getpfnum(kas.a_hat,
				    (kva + (PAGESIZE * i)));
			} else {
				pfn = pfns[i];
			}

			/*
			 * if we didn't already zero this page, do it now. we
			 * need to do this *before* we give back the MFN
			 */
			if ((kva == NULL) && (balloon_zero_memory)) {
				balloon_zero_page(pfn);
			}

			/*
			 * unmap the pfn. We don't free up the kva vmem space
			 * so the caller can re-use it. The page must be
			 * unmapped before it is given back to the hypervisor.
			 */
			if (kva != NULL) {
				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
				    PAGESIZE, HAT_UNLOAD_UNMAP);
			}

			/* grab the mfn before the pfn is marked as invalid */
			mfn = pfn_to_mfn(pfn);

			/* mark the pfn as invalid */
			reassign_pfn(pfn, MFN_INVALID);

			/*
			 * if we weren't given an array of MFNs, we need to
			 * free them up one at a time. Otherwise, we'll wait
			 * until later and do it in one hypercall
			 */
			if (mfns == NULL) {
				bzero(&memdec, sizeof (memdec));
				/*LINTED: constant in conditional context*/
				set_xen_guest_handle(memdec.extent_start, &mfn);
				memdec.domid = DOMID_SELF;
				memdec.nr_extents = 1;
				e = HYPERVISOR_memory_op(
				    XENMEM_decrease_reservation, &memdec);
				if (e != 1) {
					cmn_err(CE_PANIC, "balloon: unable to "
					    "give a page back to the "
					    "hypervisor.\n");
				}
			}
		}

	/*
	 * if all we were given was an array of MFN's, we only need to zero out
	 * each page. The MFNs will be free'd up below.
	 */
	} else if (balloon_zero_memory) {
		ASSERT(mfns != NULL);
		for (i = 0; i < page_cnt; i++) {
			/* borrow a pfn for the mfn just long enough to zero it */
			pfn = xen_assign_pfn(mfns[i]);
			balloon_zero_page(pfn);
			xen_release_pfn(pfn);
		}
	}

	/*
	 * if we were passed in MFNs, we haven't free'd them up yet. We can
	 * do it with one call.
	 */
	if (mfns != NULL) {
		bzero(&memdec, sizeof (memdec));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memdec.extent_start, mfns);
		memdec.domid = DOMID_SELF;
		memdec.nr_extents = page_cnt;
		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
		if (e != page_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to give pages back "
			    "to the hypervisor.\n");
		}
	}

	/* the hypervisor now holds page_cnt more of our pages */
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
	return (page_cnt);
}
907 
908 
909 /*
910  * balloon_replace_pages()
911  *	Try to replace nextexts blocks of 2^order pages.  addr_bits specifies
912  *	how many bits of address the pages must be within (i.e. 16 would mean
913  *	that the pages cannot have an address > 64k).  The constrints are on
914  *	what the hypervisor gives us -- we are free to give any pages in
915  *	exchange.  The array pp is the pages we are giving away.  The caller
916  *	provides storage space for mfns, which hold the new physical pages.
917  */
918 long
919 balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
920     uint_t order, mfn_t *mfns)
921 {
922 	xen_memory_reservation_t memres;
923 	long fallback_cnt;
924 	long cnt;
925 	uint_t i, j, page_cnt, extlen;
926 	long e;
927 	int locked;
928 
929 
930 	/*
931 	 * we shouldn't be allocating constrained pages on a guest. It doesn't
932 	 * make any sense. They won't be constrained after a migration.
933 	 */
934 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
935 
936 	extlen = 1 << order;
937 	page_cnt = nextents * extlen;
938 	/* Give back the current pages to the hypervisor */
939 	for (i = 0; i < page_cnt; i++) {
940 		cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
941 		if (cnt != 1) {
942 			cmn_err(CE_PANIC, "balloon: unable to give a page back "
943 			    "to the hypervisor.\n");
944 		}
945 	}
946 
947 	/*
948 	 * try to allocate the new pages using addr_bits and order. If we can't
949 	 * get all of the pages, try to get the remaining pages with no
950 	 * constraints and, if that was successful, return the number of
951 	 * constrained pages we did allocate.
952 	 */
953 	bzero(&memres, sizeof (memres));
954 	/*LINTED: constant in conditional context*/
955 	set_xen_guest_handle(memres.extent_start, mfns);
956 	memres.domid = DOMID_SELF;
957 	memres.nr_extents = nextents;
958 	memres.address_bits = addr_bits;
959 	memres.extent_order = order;
960 	cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
961 	/* assign the new MFNs to the current PFNs */
962 	locked = balloon_lock_contig_pfnlist(cnt * extlen);
963 	for (i = 0; i < cnt; i++) {
964 		for (j = 0; j < extlen; j++) {
965 			reassign_pfn(pp[i * extlen + j]->p_pagenum,
966 			    mfns[i] + j);
967 		}
968 	}
969 	if (locked)
970 		unlock_contig_pfnlist();
971 	if (cnt != nextents) {
972 		if (cnt < 0) {
973 			cnt = 0;
974 		}
975 
976 		/*
977 		 * We couldn't get enough memory to satisfy our requirements.
978 		 * The above loop will assign the parts of the request that
979 		 * were successful (this part may be 0).  We need to fill
980 		 * in the rest.  The bzero below clears out extent_order and
981 		 * address_bits, so we'll take anything from the hypervisor
982 		 * to replace the pages we gave away.
983 		 */
984 		fallback_cnt = page_cnt - cnt * extlen;
985 		bzero(&memres, sizeof (memres));
986 		/*LINTED: constant in conditional context*/
987 		set_xen_guest_handle(memres.extent_start, mfns);
988 		memres.domid = DOMID_SELF;
989 		memres.nr_extents = fallback_cnt;
990 		e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
991 		if (e != fallback_cnt) {
992 			cmn_err(CE_PANIC, "balloon: unable to recover from "
993 			    "failed increase_reservation.\n");
994 		}
995 		locked = balloon_lock_contig_pfnlist(fallback_cnt);
996 		for (i = 0; i < fallback_cnt; i++) {
997 			uint_t offset = page_cnt - fallback_cnt;
998 
999 			/*
1000 			 * We already used pp[0...(cnt * extlen)] before,
1001 			 * so start at the next entry in the pp array.
1002 			 */
1003 			reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
1004 		}
1005 		if (locked)
1006 			unlock_contig_pfnlist();
1007 	}
1008 
1009 	/*
1010 	 * balloon_free_pages increments our counter.  Decrement it here.
1011 	 */
1012 	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);
1013 
1014 	/*
1015 	 * return the number of extents we were able to replace. If we got
1016 	 * this far, we know all the pp's are valid.
1017 	 */
1018 	return (cnt);
1019 }
1020 
1021 
1022 /*
1023  * balloon_zero_page()
1024  *    zero out the page.
1025  */
1026 static void
1027 balloon_zero_page(pfn_t pfn)
1028 {
1029 	/* balloon_init() should have been called first */
1030 	ASSERT(balloon_kva != NULL);
1031 
1032 	mutex_enter(&balloon_kva_mutex);
1033 
1034 	/* map the pfn into kva, zero the page, then unmap the pfn */
1035 	hat_devload(kas.a_hat, balloon_kva, PAGESIZE, pfn,
1036 	    HAT_STORECACHING_OK | PROT_READ | PROT_WRITE | HAT_NOSYNC,
1037 	    HAT_LOAD_LOCK);
1038 	bzero(balloon_kva, PAGESIZE);
1039 	hat_unload(kas.a_hat, balloon_kva, PAGESIZE, HAT_UNLOAD);
1040 
1041 	mutex_exit(&balloon_kva_mutex);
1042 }
1043 
1044 /*
1045  * Called from the driver - return the requested stat.
1046  */
1047 size_t
1048 balloon_values(int cmd)
1049 {
1050 	switch (cmd) {
1051 	case BLN_IOCTL_CURRENT:
1052 		return (ptokb(bln_stats.bln_current_pages));
1053 	case BLN_IOCTL_TARGET:
1054 		return (ptokb(bln_stats.bln_new_target));
1055 	case BLN_IOCTL_LOW:
1056 		return (ptokb(bln_stats.bln_low));
1057 	case BLN_IOCTL_HIGH:
1058 		return (ptokb(bln_stats.bln_high));
1059 	case BLN_IOCTL_LIMIT:
1060 		return (ptokb(bln_stats.bln_hard_limit));
1061 	default:
1062 		panic("Unexpected cmd %d in balloon_values()\n", cmd);
1063 	}
1064 	/*NOTREACHED*/
1065 }
1066