xref: /titanic_51/usr/src/uts/i86xpv/os/balloon.c (revision 61dc244f52b4962ea505abec7e6cbade980d3c53)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/balloon_impl.h>
#include <sys/hypervisor.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/callb.h>
#include <xen/public/memory.h>
#include <vm/hat.h>
#include <sys/promif.h>
#include <vm/seg_kmem.h>
#include <sys/memnode.h>
#include <sys/param.h>
#include <vm/vm_dep.h>
#include <sys/mman.h>
#include <sys/memlist.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/sdt.h>

/*
 * This file implements a balloon thread, which controls a domain's memory
 * reservation, or the amount of memory a domain is currently allocated.
 * The hypervisor provides the current memory reservation through xenbus,
 * so we register a watch on this.  We will then be signalled when the
 * reservation changes.  If it goes up, we map the new mfn's to our pfn's
 * (allocating page_t's if necessary), and release them into the system.
 * If the reservation goes down, we grab pages and release them back to
 * the hypervisor, saving the page_t's for later use.
 */

/*
 * Various structures needed by the balloon thread
 */
static bln_stats_t bln_stats;
static kthread_t *bln_thread;
static kmutex_t bln_mutex;
static kcondvar_t bln_cv;
static struct xenbus_watch bln_watch;
static mfn_t new_high_mfn;

/*
 * For holding spare page_t structures - keep a singly-linked list.
 * The list may hold both valid (pagenum < mfn_count) and invalid
 * (pagenum >= mfn_count) page_t's.  Valid page_t's should be inserted
 * at the front, and invalid page_t's at the back.  Removal should
 * always be from the front.  This is a singly-linked list using
 * p_next, so p_prev is always NULL.
 */
static page_t *bln_spare_list_front, *bln_spare_list_back;

int balloon_zero_memory = 1;
size_t balloon_minkmem = (8 * 1024 * 1024);

/*
 * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
 * slowdown when called multiple times.  If we're reassigning less than the
 * quota defined here, we just accept the slowdown.  If the count is greater
 * than the quota, we tell the contig alloc code to stop its accounting until
 * we're done.  Setting the quota to less than 2 is not supported.
 *
 * Note that we define our own wrapper around the external
 * clear_and_lock_contig_pfnlist(), but we just use the version of
 * unlock_contig_pfnlist() in vm_machdep.c.
 */
uint_t bln_contig_list_quota = 50;

extern void clear_and_lock_contig_pfnlist(void);
extern void unlock_contig_pfnlist(void);

/*
 * Lock the pfnlist if necessary (see above), and return whether we locked it.
 */
static int
balloon_lock_contig_pfnlist(int count)
{
	if (count > bln_contig_list_quota) {
		clear_and_lock_contig_pfnlist();
		return (1);
	} else {
		return (0);
	}
}

/*
 * The page represented by pp is being given back to the hypervisor.
 * Add the page_t structure to our spare list.
 */
static void
balloon_page_add(page_t *pp)
{
	/*
	 * We need to keep the page exclusively locked
	 * to prevent swrand from grabbing it.
	 */
	ASSERT(PAGE_EXCL(pp));
	ASSERT(MUTEX_HELD(&bln_mutex));

	pp->p_prev = NULL;
	if (bln_spare_list_front == NULL) {
		bln_spare_list_front = bln_spare_list_back = pp;
		pp->p_next = NULL;
	} else if (pp->p_pagenum >= mfn_count) {
		/*
		 * The pfn is invalid, so add it at the end of the list.
		 * Since these adds should *only* be done by
		 * balloon_init_new_pages(), and that does adds in order,
		 * the following ASSERT should never trigger.
		 */
		ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
		bln_spare_list_back->p_next = pp;
		pp->p_next = NULL;
		bln_spare_list_back = pp;
	} else {
		/* Add at the beginning of the list */
		pp->p_next = bln_spare_list_front;
		bln_spare_list_front = pp;
	}
}

/*
 * Return a page_t structure from our spare list, or NULL if none are available.
 */
static page_t *
balloon_page_sub(void)
{
	page_t *pp;

	ASSERT(MUTEX_HELD(&bln_mutex));
	if (bln_spare_list_front == NULL) {
		return (NULL);
	}

	pp = bln_spare_list_front;
	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_pagenum <= mfn_count);
	if (pp->p_pagenum == mfn_count) {
		return (NULL);
	}

	bln_spare_list_front = pp->p_next;
	if (bln_spare_list_front == NULL)
		bln_spare_list_back = NULL;
	pp->p_next = NULL;
	return (pp);
}

/*
 * NOTE: We currently do not support growing beyond the boot memory size,
 * so the following function will not be called.  It is left in here with
 * the hope that someday this restriction can be lifted, and this code can
 * be used.
 */

/*
 * This structure is placed at the start of every block of new pages
 */
typedef struct {
	struct memseg	memseg;
	struct memlist	memlist;
	page_t		pages[1];
} mem_structs_t;

/*
 * To make the math below slightly less confusing, we calculate the first
 * two parts here.  page_t's are handled separately, so they are not included.
 */
#define	MEM_STRUCT_SIZE	(sizeof (struct memseg) + sizeof (struct memlist))

/*
 * We want to add memory, but have no spare page_t structures.  Use some of
 * our new memory for the page_t structures.
 *
 * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
 */
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
	pgcnt_t	metapgs, totalpgs, num_pages;
	paddr_t	metasz;
	pfn_t	meta_start;
	page_t	*page_array;
	caddr_t	va;
	int	i, rv, locked;
	mem_structs_t *mem;
	struct memseg *segp;

	/* Calculate the number of pages we're going to add */
	totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;

	/*
	 * The following calculates the number of "meta" pages -- the pages
	 * that will be required to hold page_t structures for all new pages.
	 * Proof of this calculation is left up to the reader.
	 */
	metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

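	/*
	 * One way to see this: split totalpgs into d "data" pages and m
	 * meta pages.  Choosing d = totalpgs * PAGESIZE /
	 * (PAGESIZE + sizeof (page_t)) leaves m = totalpgs - d, and
	 * m * PAGESIZE is then approximately d * sizeof (page_t) -- one
	 * page_t per data page.  The check below bumps metapgs when the
	 * result cannot also hold page_t's for the meta pages themselves
	 * plus the memseg and memlist structures.
	 */
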
	/*
	 * Given the number of page_t structures we need, is there also
	 * room in our meta pages for a memseg and memlist struct?
	 * If not, we'll need one more meta page.
	 */
	if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
	    MEM_STRUCT_SIZE))
		metapgs++;

	/*
	 * metapgs is calculated from totalpgs, which may be much larger than
	 * count.  If we don't have enough pages, all of the pages in this
	 * batch will be made meta pages, and a future trip through
	 * balloon_inc_reservation() will add the rest of the meta pages.
	 */
	if (metapgs > count)
		metapgs = count;

	/*
	 * Figure out the number of page_t structures that can fit in metapgs
	 *
	 * This will cause us to initialize more page_t structures than we
	 * need - these may be used in future memory increases.
	 */
	metasz = pfn_to_pa(metapgs);
	num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);

	DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
	    num_pages, pgcnt_t, metapgs);

	/*
	 * We only increment mfn_count by count, not num_pages, to keep the
	 * space of all valid pfns contiguous.  This means we create page_t
	 * structures with invalid pagenums -- we deal with this situation
	 * in balloon_page_sub.
	 */
	mfn_count += count;

	/*
	 * Get a VA for the pages that will hold page_t and other structures.
	 * The memseg and memlist structures will go at the beginning, with
	 * the page_t structures following.
	 */
	va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
	/* LINTED: improper alignment */
	mem = (mem_structs_t *)va;
	page_array = mem->pages;

	meta_start = bln_stats.bln_max_pages;

	/*
	 * Set the mfn to pfn mapping for the meta pages.
	 */
	locked = balloon_lock_contig_pfnlist(metapgs);
	for (i = 0; i < metapgs; i++) {
		reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * For our meta pages, map them in and zero the page.
	 * This will be the first time touching the new pages.
	 */
	hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
	    PROT_READ | PROT_WRITE,
	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
	bzero(va, metasz);

	/*
	 * Initialize the page array for the new pages.
	 */
	for (i = 0; i < metapgs; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
	}

	/*
	 * For the rest of the pages, initialize the page_t struct and
	 * add them to the free list
	 */
	for (i = metapgs; i < num_pages; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
		balloon_page_add(&page_array[i]);
	}

	/*
	 * Remember where I said that we don't call this function?  The missing
	 * code right here is why.  We need to set up kpm mappings for any new
	 * pages coming in.  However, if someone starts up a domain with small
	 * memory, then greatly increases it, we could get in some horrible
	 * deadlock situations as we steal page tables for kpm use, and
	 * userland applications take them right back before we can use them
	 * to set up our new memory.  Once a way around that is found, and a
	 * few other changes are made, we'll be able to enable this code.
	 */

	/*
	 * Update kernel structures, part 1: memsegs list
	 */
	mem->memseg.pages_base = meta_start;
	mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
	mem->memseg.pages = &page_array[0];
	mem->memseg.epages = &page_array[num_pages - 1];
	mem->memseg.next = NULL;
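	/* Append the new memseg to the tail of the global memsegs list. */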
	memsegs_lock(1);
	for (segp = memsegs; segp->next != NULL; segp = segp->next)
		;
	segp->next = &mem->memseg;
	memsegs_unlock(1);

	/*
	 * Update kernel structures, part 2: mem_node array
	 */
	mem_node_add_slice(meta_start, bln_stats.bln_max_pages);

	/*
	 * Update kernel structures, part 3: phys_install array
	 * (*sigh* how many of these things do we need?)
	 */
	memlist_write_lock();
	memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
	    &phys_install);
	memlist_write_unlock();

	build_pfn_hash();

	return (metapgs);
}

/* How many ulong_t's can we fit on a page? */
#define	FRAME_ARRAY_SIZE	(PAGESIZE / sizeof (ulong_t))

/*
 * These are too large to declare on the stack, so we make them static instead
 */
static ulong_t	mfn_frames[FRAME_ARRAY_SIZE];
static pfn_t	pfn_frames[FRAME_ARRAY_SIZE];

/*
 * This function is called when our reservation is increasing.  Make a
 * hypervisor call to get our new pages, then integrate them into the system.
 */
static spgcnt_t
balloon_inc_reservation(ulong_t credit)
{
	int	i, cnt, locked;
	int	meta_pg_start, meta_pg_end;
	long	rv;
	page_t	*pp;
	page_t	*new_list_front, *new_list_back;

	/* Make sure we're single-threaded. */
	ASSERT(MUTEX_HELD(&bln_mutex));

	rv = 0;
	new_list_front = new_list_back = NULL;
	meta_pg_start = meta_pg_end = 0;
	bzero(mfn_frames, PAGESIZE);

	if (credit > FRAME_ARRAY_SIZE)
		credit = FRAME_ARRAY_SIZE;

	xen_block_migrate();
	rv = balloon_alloc_pages(credit, mfn_frames);

	if (rv < 0) {
		xen_allow_migrate();
		return (0);
	}
	for (i = 0; i < rv; i++) {
		if (mfn_frames[i] > new_high_mfn)
			new_high_mfn = mfn_frames[i];

		pp = balloon_page_sub();
		if (pp == NULL) {
			/*
			 * We pass the index into the current mfn array,
			 * then move the counter past the mfns we used
			 */
			meta_pg_start = i;
			cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
			i += cnt;
			meta_pg_end = i;
			if (i < rv) {
				pp = balloon_page_sub();
			} else {
				ASSERT(i == rv);
			}
		}
		if (pp == NULL) {
			break;
		}

		if (new_list_back == NULL) {
			new_list_front = new_list_back = pp;
		} else {
			new_list_back->p_next = pp;
			new_list_back = pp;
		}
		pp->p_next = NULL;
	}
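	/*
	 * Assign the new mfns to the page_t's gathered above.  The index
	 * range [meta_pg_start, meta_pg_end) is skipped because those mfns
	 * were consumed by balloon_init_new_pages() for the metadata pages.
	 */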
	cnt = i;
	locked = balloon_lock_contig_pfnlist(cnt);
	for (i = 0, pp = new_list_front; i < meta_pg_start;
	    i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * Make sure we don't allow pages without pfn->mfn mappings
	 * into the system.
	 */
	ASSERT(pp == NULL);

	while (new_list_front != NULL) {
		pp = new_list_front;
		new_list_front = pp->p_next;
		page_free(pp, 1);
	}

	/*
	 * Variable review: at this point, rv contains the number of pages
	 * the hypervisor gave us.  cnt contains the number of pages for which
	 * we had page_t structures.  i contains the number of pages
	 * where we set up pfn <-> mfn mappings.  If this ASSERT trips, that
	 * means we somehow lost page_t's from our local list.
	 */
	ASSERT(cnt == i);
	if (cnt < rv) {
		/*
		 * We couldn't get page structures.
		 *
		 * This shouldn't happen, but causes no real harm if it does.
		 * On debug kernels, we'll flag it.  On all kernels, we'll
		 * give back the pages we couldn't assign.
		 *
		 * Since these pages are new to the system and haven't been
		 * used, we don't bother zeroing them.
		 */
#ifdef DEBUG
		cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv);
#endif	/* DEBUG */

		(void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL, NULL);

		rv = cnt;
	}

	xen_allow_migrate();
	page_unresv(rv - (meta_pg_end - meta_pg_start));
	return (rv);
}

/*
 * This function is called when we want to decrease the memory reservation
 * of our domain.  Allocate the memory and make a hypervisor call to give
 * it back.
 */
static spgcnt_t
balloon_dec_reservation(ulong_t debit)
{
	int	i, locked;
	long	rv;
	ulong_t	request;
	page_t	*pp;

	bzero(mfn_frames, sizeof (mfn_frames));
	bzero(pfn_frames, sizeof (pfn_frames));

	if (debit > FRAME_ARRAY_SIZE) {
		debit = FRAME_ARRAY_SIZE;
	}
	request = debit;

	/*
	 * Don't bother if there isn't a safe amount of kmem left.
	 */
	if (kmem_avail() < balloon_minkmem) {
		kmem_reap();
		if (kmem_avail() < balloon_minkmem)
			return (0);
	}

	if (page_resv(request, KM_NOSLEEP) == 0) {
		return (0);
	}
	xen_block_migrate();
	for (i = 0; i < debit; i++) {
		pp = page_get_high_mfn(new_high_mfn);
		new_high_mfn = 0;
		if (pp == NULL) {
			/*
			 * Call kmem_reap(), then try once more,
			 * but only if there is a safe amount of
			 * kmem left.
			 */
			kmem_reap();
			if (kmem_avail() < balloon_minkmem ||
			    (pp = page_get_high_mfn(0)) == NULL) {
				debit = i;
				break;
			}
		}
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!hat_page_is_mapped(pp));

		balloon_page_add(pp);
		pfn_frames[i] = pp->p_pagenum;
		mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
	}
	if (debit == 0) {
		xen_allow_migrate();
		page_unresv(request);
		return (0);
	}

	/*
	 * We zero all the pages before we start reassigning them in order to
	 * minimize the time spent holding the lock on the contig pfn list.
	 */
	if (balloon_zero_memory) {
		for (i = 0; i < debit; i++) {
			pfnzero(pfn_frames[i], 0, PAGESIZE);
		}
	}

	/*
	 * Remove all mappings for the pfns from the system
	 */
	locked = balloon_lock_contig_pfnlist(debit);
	for (i = 0; i < debit; i++) {
		reassign_pfn(pfn_frames[i], MFN_INVALID);
	}
	if (locked)
		unlock_contig_pfnlist();

	rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);

	if (rv < 0) {
		cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
		    "failed - up to %lu pages lost (error = %ld)", debit, rv);
		rv = 0;
	} else if (rv != debit) {
		panic("Unexpected return value (%ld) from decrease reservation "
		    "hypervisor call", rv);
	}

	xen_allow_migrate();
	if (debit != request)
		page_unresv(request - debit);
	return (rv);
}

/*
 * This function is the callback that is invoked when the memory/target
 * node changes.  When the watch fires, we read the new reservation
 * target for our domain and signal the worker thread to make the change.
 *
 * If the reservation is larger than we can handle, we issue a warning.  dom0
 * does this automatically every boot, so we skip the first warning on dom0.
 */
/*ARGSUSED*/
static void
balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	ulong_t new_target_kb;
	pgcnt_t	new_target_pages;
	int rv;
	static uchar_t warning_cnt = 0;

	rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb);
	if (rv != 0) {
		return;
	}

	/* new_target is in kB - change this to pages */
	new_target_pages = kbtop(new_target_kb);

	DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);

	/*
	 * Unfortunately, dom0 may give us a target that is larger than
	 * our max limit.  Re-check the limit, and, if the new target is
	 * too large, adjust it downwards.
	 */
	mutex_enter(&bln_mutex);
	if (new_target_pages > bln_stats.bln_max_pages) {
		DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
		    new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
		if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
			cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
			    "larger than original memory size (0x%lx pages). "
			    "Ballooning beyond original memory size is not "
			    "allowed.",
			    new_target_pages, bln_stats.bln_max_pages);
		}
		warning_cnt = 1;
		bln_stats.bln_new_target = bln_stats.bln_max_pages;
	} else {
		bln_stats.bln_new_target = new_target_pages;
	}

	mutex_exit(&bln_mutex);
	cv_signal(&bln_cv);
}

/*
 * bln_wait_sec can be used to throttle the hv calls, but by default it's
 * turned off.  If a balloon attempt fails, the wait time is forced on, and
 * then is exponentially increased as further attempts fail.
 */
uint_t bln_wait_sec = 0;
uint_t bln_wait_shift = 1;

/*
 * This is the main balloon thread.  Wait on the cv.  When woken, if our
 * reservation has changed, call the appropriate function to adjust the
 * reservation.
 */
static void
balloon_worker_thread(void)
{
	uint_t		bln_wait;
	callb_cpr_t	cprinfo;
	spgcnt_t	rv;

	bln_wait = bln_wait_sec;

	CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
	for (;;) {
		rv = 0;

		mutex_enter(&bln_mutex);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
			/*
			 * We weren't able to fully complete the request
			 * last time through, so try again.
			 */
			(void) cv_reltimedwait(&bln_cv, &bln_mutex,
			    (bln_wait * hz), TR_CLOCK_TICK);
		} else {
			cv_wait(&bln_cv, &bln_mutex);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);

		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
			if (bln_stats.bln_new_target <
			    bln_stats.bln_current_pages) {
				/* reservation shrunk */
				rv = -balloon_dec_reservation(
				    bln_stats.bln_current_pages -
				    bln_stats.bln_new_target);
			} else if (bln_stats.bln_new_target >
			    bln_stats.bln_current_pages) {
				/* reservation grew */
				rv = balloon_inc_reservation(
				    bln_stats.bln_new_target -
				    bln_stats.bln_current_pages);
			}
		}
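		/*
		 * If nothing changed on this pass, start or exponentially
		 * extend the retry delay; otherwise record the adjustment
		 * and reset the delay to its default.
		 */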
		if (rv == 0) {
			if (bln_wait == 0) {
				bln_wait = 1;
			} else {
				bln_wait <<= bln_wait_shift;
			}
		} else {
			bln_stats.bln_current_pages += rv;
			bln_wait = bln_wait_sec;
		}
		if (bln_stats.bln_current_pages < bln_stats.bln_low)
			bln_stats.bln_low = bln_stats.bln_current_pages;
		else if (bln_stats.bln_current_pages > bln_stats.bln_high)
			bln_stats.bln_high = bln_stats.bln_current_pages;
		mutex_exit(&bln_mutex);
	}
}

/*
 * Called after balloon_init(), which is below.  The xenbus thread is up
 * and running, so we can register our watch and create the balloon thread.
 */
static void
balloon_config_watch(int state)
{
	if (state != XENSTORE_UP)
		return;

	bln_watch.node = "memory/target";
	bln_watch.callback = balloon_handler;
	if (register_xenbus_watch(&bln_watch)) {
		cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
		    "thread will be disabled");
		return;
	}

	if (bln_thread == NULL)
		bln_thread = thread_create(NULL, 0, balloon_worker_thread,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * Basic initialization of the balloon thread.  Set all of our variables,
 * and register a callback for later when we can register a xenbus watch.
 */
void
balloon_init(pgcnt_t nr_pages)
{
	domid_t domid = DOMID_SELF;

	bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
	bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
	bln_stats.bln_max_pages = nr_pages;
	cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);

	bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
	    XENMEM_maximum_reservation, &domid);

	(void) xs_register_xenbus_callback(balloon_config_watch);
}

/*
 * These functions are called from the network drivers when they gain a page
 * or give one away.  We simply update our count.  Note that the counter
 * tracks the number of pages we give away, so we need to subtract any
 * amount passed to balloon_drv_added.
 */
void
balloon_drv_added(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
}

void
balloon_drv_subtracted(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
}

/*
 * balloon_alloc_pages()
 *	Allocate page_cnt mfns.  mfns storage provided by the caller.  Returns
 *	the number of pages allocated, which could be less than page_cnt, or
 *	a negative number if an error occurred.
 */
long
balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long rv;

	bzero(&memres, sizeof (memres));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = page_cnt;

	rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	if (rv > 0)
		atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
	return (rv);
}

/*
 * balloon_free_pages()
 *    free page_cnt pages, using any combination of mfns, pfns, and kva as long
 *    as they refer to the same mapping.  If an array of mfns is passed in, we
 *    assume they were already cleared.  Otherwise, we need to zero the pages
 *    before giving them back to the hypervisor. kva space is not free'd up in
 *    case the caller wants to re-use it.
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	xen_memory_reservation_t memdec;
	mfn_t mfn;
	pfn_t pfn;
	uint_t i;
	long e;


#if DEBUG
	/* make sure kva is page aligned and maps to first pfn */
	if (kva != NULL) {
		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
		if (pfns != NULL) {
			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
		}
	}
#endif

	/* if we have a kva, we can clean all pages with just one bzero */
	if ((kva != NULL) && balloon_zero_memory) {
		bzero(kva, (page_cnt * PAGESIZE));
	}

	/* if we were given a kva and/or a pfn */
	if ((kva != NULL) || (pfns != NULL)) {

		/*
		 * All the current callers only pass 1 page when using kva or
		 * pfns, and use mfns when passing multiple pages.  If that
		 * assumption is changed, the following code will need some
		 * work.  The following ASSERT() guarantees we're respecting
		 * the contig pfnlist locking quota.
		 */
		ASSERT(page_cnt < bln_contig_list_quota);

		/* go through all the pages */
		for (i = 0; i < page_cnt; i++) {

			/* get the next pfn */
			if (pfns == NULL) {
				pfn = hat_getpfnum(kas.a_hat,
				    (kva + (PAGESIZE * i)));
			} else {
				pfn = pfns[i];
			}

			/*
			 * if we didn't already zero this page, do it now. we
			 * need to do this *before* we give back the MFN
			 */
			if ((kva == NULL) && (balloon_zero_memory)) {
				pfnzero(pfn, 0, PAGESIZE);
			}

			/*
			 * unmap the pfn. We don't free up the kva vmem space
			 * so the caller can re-use it. The page must be
			 * unmapped before it is given back to the hypervisor.
			 */
			if (kva != NULL) {
				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
				    PAGESIZE, HAT_UNLOAD_UNMAP);
			}

			/* grab the mfn before the pfn is marked as invalid */
			mfn = pfn_to_mfn(pfn);

			/* mark the pfn as invalid */
			reassign_pfn(pfn, MFN_INVALID);

			/*
			 * if we weren't given an array of MFNs, we need to
			 * free them up one at a time. Otherwise, we'll wait
			 * until later and do it in one hypercall
			 */
			if (mfns == NULL) {
				bzero(&memdec, sizeof (memdec));
				/*LINTED: constant in conditional context*/
				set_xen_guest_handle(memdec.extent_start, &mfn);
				memdec.domid = DOMID_SELF;
				memdec.nr_extents = 1;
				e = HYPERVISOR_memory_op(
				    XENMEM_decrease_reservation, &memdec);
				if (e != 1) {
					cmn_err(CE_PANIC, "balloon: unable to "
					    "give a page back to the "
					    "hypervisor.\n");
				}
			}
		}
	}

	/*
	 * if we were passed in MFNs, we haven't free'd them up yet. We can
	 * do it with one call.
	 */
	if (mfns != NULL) {
		bzero(&memdec, sizeof (memdec));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memdec.extent_start, mfns);
		memdec.domid = DOMID_SELF;
		memdec.nr_extents = page_cnt;
		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
		if (e != page_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to give pages back "
			    "to the hypervisor.\n");
		}
	}

	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
	return (page_cnt);
}


/*
 * balloon_replace_pages()
 *	Try to replace nextents blocks of 2^order pages.  addr_bits specifies
 *	how many bits of address the pages must be within (e.g. 16 would mean
 *	that the pages cannot have an address > 64k).  The constraints are on
 *	what the hypervisor gives us -- we are free to give any pages in
 *	exchange.  The array pp is the pages we are giving away.  The caller
 *	provides storage space for mfns, which hold the new physical pages.
 */
long
balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
    uint_t order, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long fallback_cnt;
	long cnt;
	uint_t i, j, page_cnt, extlen;
	long e;
	int locked;


	/*
	 * we shouldn't be allocating constrained pages on a guest. It doesn't
	 * make any sense. They won't be constrained after a migration.
	 */
	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));

	extlen = 1 << order;
	page_cnt = nextents * extlen;
	/* Give back the current pages to the hypervisor */
	for (i = 0; i < page_cnt; i++) {
		cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
		if (cnt != 1) {
			cmn_err(CE_PANIC, "balloon: unable to give a page back "
			    "to the hypervisor.\n");
		}
	}

	/*
	 * try to allocate the new pages using addr_bits and order. If we can't
	 * get all of the pages, try to get the remaining pages with no
	 * constraints and, if that was successful, return the number of
	 * constrained pages we did allocate.
	 */
	bzero(&memres, sizeof (memres));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = nextents;
	memres.mem_flags = XENMEMF_address_bits(addr_bits);
	memres.extent_order = order;
	cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	/* assign the new MFNs to the current PFNs */
	locked = balloon_lock_contig_pfnlist(cnt * extlen);
	for (i = 0; i < cnt; i++) {
		for (j = 0; j < extlen; j++) {
			reassign_pfn(pp[i * extlen + j]->p_pagenum,
			    mfns[i] + j);
		}
	}
	if (locked)
		unlock_contig_pfnlist();
	if (cnt != nextents) {
		if (cnt < 0) {
			cnt = 0;
		}

		/*
		 * We couldn't get enough memory to satisfy our requirements.
		 * The above loop will assign the parts of the request that
		 * were successful (this part may be 0).  We need to fill
		 * in the rest.  The bzero below clears out extent_order and
		 * address_bits, so we'll take anything from the hypervisor
		 * to replace the pages we gave away.
		 */
		fallback_cnt = page_cnt - cnt * extlen;
		bzero(&memres, sizeof (memres));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memres.extent_start, mfns);
		memres.domid = DOMID_SELF;
		memres.nr_extents = fallback_cnt;
		e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
		if (e != fallback_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to recover from "
			    "failed increase_reservation.\n");
		}
		locked = balloon_lock_contig_pfnlist(fallback_cnt);
		for (i = 0; i < fallback_cnt; i++) {
			uint_t offset = page_cnt - fallback_cnt;

			/*
			 * We already used pp[0...(cnt * extlen)] before,
			 * so start at the next entry in the pp array.
			 */
			reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
		}
		if (locked)
			unlock_contig_pfnlist();
	}

	/*
	 * balloon_free_pages increments our counter.  Decrement it here.
	 */
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);

	/*
	 * return the number of extents we were able to replace. If we got
	 * this far, we know all the pp's are valid.
	 */
	return (cnt);
}


/*
 * Called from the driver - return the requested stat.
 */
size_t
balloon_values(int cmd)
{
	switch (cmd) {
	case BLN_IOCTL_CURRENT:
		return (ptokb(bln_stats.bln_current_pages));
	case BLN_IOCTL_TARGET:
		return (ptokb(bln_stats.bln_new_target));
	case BLN_IOCTL_LOW:
		return (ptokb(bln_stats.bln_low));
	case BLN_IOCTL_HIGH:
		return (ptokb(bln_stats.bln_high));
	case BLN_IOCTL_LIMIT:
		return (ptokb(bln_stats.bln_hard_limit));
	default:
		panic("Unexpected cmd %d in balloon_values()\n", cmd);
	}
	/*NOTREACHED*/
}
1060