xref: /titanic_51/usr/src/uts/i86xpv/os/balloon.c (revision d3d50737e566cade9a08d73d2af95105ac7cd960)
1843e1988Sjohnlev /*
2843e1988Sjohnlev  * CDDL HEADER START
3843e1988Sjohnlev  *
4843e1988Sjohnlev  * The contents of this file are subject to the terms of the
5843e1988Sjohnlev  * Common Development and Distribution License (the "License").
6843e1988Sjohnlev  * You may not use this file except in compliance with the License.
7843e1988Sjohnlev  *
8843e1988Sjohnlev  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9843e1988Sjohnlev  * or http://www.opensolaris.org/os/licensing.
10843e1988Sjohnlev  * See the License for the specific language governing permissions
11843e1988Sjohnlev  * and limitations under the License.
12843e1988Sjohnlev  *
13843e1988Sjohnlev  * When distributing Covered Code, include this CDDL HEADER in each
14843e1988Sjohnlev  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15843e1988Sjohnlev  * If applicable, add the following below this CDDL HEADER, with the
16843e1988Sjohnlev  * fields enclosed by brackets "[]" replaced with your own identifying
17843e1988Sjohnlev  * information: Portions Copyright [yyyy] [name of copyright owner]
18843e1988Sjohnlev  *
19843e1988Sjohnlev  * CDDL HEADER END
20843e1988Sjohnlev  */
21843e1988Sjohnlev 
22843e1988Sjohnlev /*
23349b53ddSStuart Maybee  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24843e1988Sjohnlev  * Use is subject to license terms.
25843e1988Sjohnlev  */
26843e1988Sjohnlev 
27843e1988Sjohnlev #include <sys/balloon_impl.h>
28843e1988Sjohnlev #include <sys/hypervisor.h>
29843e1988Sjohnlev #include <xen/sys/xenbus_impl.h>
30843e1988Sjohnlev #include <sys/atomic.h>
31843e1988Sjohnlev #include <sys/cmn_err.h>
32843e1988Sjohnlev #include <sys/disp.h>
33843e1988Sjohnlev #include <sys/callb.h>
34843e1988Sjohnlev #include <xen/public/memory.h>
35843e1988Sjohnlev #include <vm/hat.h>
36843e1988Sjohnlev #include <sys/promif.h>
37843e1988Sjohnlev #include <vm/seg_kmem.h>
38843e1988Sjohnlev #include <sys/memnode.h>
39843e1988Sjohnlev #include <sys/param.h>
40843e1988Sjohnlev #include <vm/vm_dep.h>
41843e1988Sjohnlev #include <sys/mman.h>
42843e1988Sjohnlev #include <sys/memlist.h>
43843e1988Sjohnlev #include <sys/sysmacros.h>
44843e1988Sjohnlev #include <sys/machsystm.h>
45843e1988Sjohnlev #include <sys/sdt.h>
46843e1988Sjohnlev 
47843e1988Sjohnlev /*
48843e1988Sjohnlev  * This file implements a balloon thread, which controls a domain's memory
49843e1988Sjohnlev  * reservation, or the amount of memory a domain is currently allocated.
50843e1988Sjohnlev  * The domain's target reservation is provided through xenbus, so we
51843e1988Sjohnlev  * register a watch on it.  We will then be signalled when the target
52843e1988Sjohnlev  * changes.  If it goes up, we map the new mfn's to our pfn's
53843e1988Sjohnlev  * (allocating page_t's if necessary), and release them into the system.
54843e1988Sjohnlev  * If the reservation goes down, we grab pages and release them back to
55843e1988Sjohnlev  * the hypervisor, saving the page_t's for later use.
56843e1988Sjohnlev  */
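
/*
 * (Illustrative note, not part of the original comment:) the target itself
 * is normally written by the toolstack in dom0 -- for example, a command
 * along the lines of "xm mem-set <domain> <MB>" updates the domain's
 * memory/target node in xenstore, which fires the watch registered below
 * and wakes the balloon worker thread.
 */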
57843e1988Sjohnlev 
58843e1988Sjohnlev /*
59843e1988Sjohnlev  * Various structures needed by the balloon thread
60843e1988Sjohnlev  */
61843e1988Sjohnlev static bln_stats_t bln_stats;
62843e1988Sjohnlev static kthread_t *bln_thread;
63843e1988Sjohnlev static kmutex_t bln_mutex;
64843e1988Sjohnlev static kcondvar_t bln_cv;
65843e1988Sjohnlev static struct xenbus_watch bln_watch;
66843e1988Sjohnlev static mfn_t new_high_mfn;
67843e1988Sjohnlev 
68843e1988Sjohnlev /*
69843e1988Sjohnlev  * For holding spare page_t structures - keep a singly-linked list.
70843e1988Sjohnlev  * The list may hold both valid (pagenum < mfn_count) and invalid
71843e1988Sjohnlev  * (pagenum >= mfn_count) page_t's.  Valid page_t's should be inserted
72843e1988Sjohnlev  * at the front, and invalid page_t's at the back.  Removal should
73843e1988Sjohnlev  * always be from the front.  This is a singly-linked list using
74843e1988Sjohnlev  * p_next, so p_prev is always NULL.
75843e1988Sjohnlev  */
76843e1988Sjohnlev static page_t *bln_spare_list_front, *bln_spare_list_back;
77843e1988Sjohnlev 
78843e1988Sjohnlev int balloon_zero_memory = 1;
79843e1988Sjohnlev size_t balloon_minkmem = (8 * 1024 * 1024);
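
/*
 * (Descriptive note added for clarity:) both of the above are tunables.
 * balloon_zero_memory controls whether pages are zeroed before being handed
 * back to the hypervisor (see balloon_dec_reservation() and
 * balloon_free_pages() below); balloon_minkmem is the amount of available
 * kmem (8 MB by default) below which balloon_dec_reservation() refuses to
 * shrink the reservation any further.
 */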
80843e1988Sjohnlev 
81843e1988Sjohnlev /*
82843e1988Sjohnlev  * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
83843e1988Sjohnlev  * slowdown when called multiple times.  If we're reassigning fewer than the
84843e1988Sjohnlev  * quota defined here, we just accept the slowdown.  If the count is greater
85843e1988Sjohnlev  * than the quota, we tell the contig alloc code to stop its accounting until
86843e1988Sjohnlev  * we're done.  Setting the quota to less than 2 is not supported.
87843e1988Sjohnlev  *
88843e1988Sjohnlev  * Note that we define our own wrapper around the external
89843e1988Sjohnlev  * clear_and_lock_contig_pfnlist(), but we just use the version of
90843e1988Sjohnlev  * unlock_contig_pfnlist() in vm_machdep.c.
91843e1988Sjohnlev  */
92843e1988Sjohnlev uint_t bln_contig_list_quota = 50;
93843e1988Sjohnlev 
94843e1988Sjohnlev extern void clear_and_lock_contig_pfnlist(void);
95843e1988Sjohnlev extern void unlock_contig_pfnlist(void);
96843e1988Sjohnlev 
97843e1988Sjohnlev /*
98843e1988Sjohnlev  * Lock the pfnlist if necessary (see above), and return whether we locked it.
99843e1988Sjohnlev  */
100843e1988Sjohnlev static int
101843e1988Sjohnlev balloon_lock_contig_pfnlist(int count) {
102843e1988Sjohnlev 	if (count > bln_contig_list_quota) {
103843e1988Sjohnlev 		clear_and_lock_contig_pfnlist();
104843e1988Sjohnlev 		return (1);
105843e1988Sjohnlev 	} else {
106843e1988Sjohnlev 		return (0);
107843e1988Sjohnlev 	}
108843e1988Sjohnlev }
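
/*
 * (Illustrative usage sketch, mirroring the callers later in this file;
 * the array names are placeholders:)
 *
 *	locked = balloon_lock_contig_pfnlist(cnt);
 *	for (i = 0; i < cnt; i++)
 *		reassign_pfn(pfns[i], mfns[i]);
 *	if (locked)
 *		unlock_contig_pfnlist();
 */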
109843e1988Sjohnlev 
110843e1988Sjohnlev /*
111843e1988Sjohnlev  * The page represented by pp is being given back to the hypervisor.
112843e1988Sjohnlev  * Add the page_t structure to our spare list.
113843e1988Sjohnlev  */
114843e1988Sjohnlev static void
115843e1988Sjohnlev balloon_page_add(page_t *pp)
116843e1988Sjohnlev {
117843e1988Sjohnlev 	/*
118843e1988Sjohnlev 	 * We need to keep the page exclusively locked
119843e1988Sjohnlev 	 * to prevent swrand from grabbing it.
120843e1988Sjohnlev 	 */
121843e1988Sjohnlev 	ASSERT(PAGE_EXCL(pp));
122843e1988Sjohnlev 	ASSERT(MUTEX_HELD(&bln_mutex));
123843e1988Sjohnlev 
124843e1988Sjohnlev 	pp->p_prev = NULL;
125843e1988Sjohnlev 	if (bln_spare_list_front == NULL) {
126843e1988Sjohnlev 		bln_spare_list_front = bln_spare_list_back = pp;
127843e1988Sjohnlev 		pp->p_next = NULL;
128843e1988Sjohnlev 	} else if (pp->p_pagenum >= mfn_count) {
129843e1988Sjohnlev 		/*
130843e1988Sjohnlev 		 * The pfn is invalid, so add it at the end of the list.  Since these
131843e1988Sjohnlev 		 * adds should *only* be done by balloon_init_new_pages(), and
132843e1988Sjohnlev 		 * that does adds in order, the following ASSERT should
133843e1988Sjohnlev 		 * never trigger.
134843e1988Sjohnlev 		 */
135843e1988Sjohnlev 		ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
136843e1988Sjohnlev 		bln_spare_list_back->p_next = pp;
137843e1988Sjohnlev 		pp->p_next = NULL;
138843e1988Sjohnlev 		bln_spare_list_back = pp;
139843e1988Sjohnlev 	} else {
140843e1988Sjohnlev 		/* Add at beginning of list */
141843e1988Sjohnlev 		pp->p_next = bln_spare_list_front;
142843e1988Sjohnlev 		bln_spare_list_front = pp;
143843e1988Sjohnlev 	}
144843e1988Sjohnlev }
145843e1988Sjohnlev 
146843e1988Sjohnlev /*
147843e1988Sjohnlev  * Return a page_t structure from our spare list, or NULL if none are available.
148843e1988Sjohnlev  */
149843e1988Sjohnlev static page_t *
150843e1988Sjohnlev balloon_page_sub(void)
151843e1988Sjohnlev {
152843e1988Sjohnlev 	page_t *pp;
153843e1988Sjohnlev 
154843e1988Sjohnlev 	ASSERT(MUTEX_HELD(&bln_mutex));
155843e1988Sjohnlev 	if (bln_spare_list_front == NULL) {
156843e1988Sjohnlev 		return (NULL);
157843e1988Sjohnlev 	}
158843e1988Sjohnlev 
159843e1988Sjohnlev 	pp = bln_spare_list_front;
160843e1988Sjohnlev 	ASSERT(PAGE_EXCL(pp));
161843e1988Sjohnlev 	ASSERT(pp->p_pagenum <= mfn_count);
162843e1988Sjohnlev 	if (pp->p_pagenum == mfn_count) {
163843e1988Sjohnlev 		return (NULL);
164843e1988Sjohnlev 	}
165843e1988Sjohnlev 
166843e1988Sjohnlev 	bln_spare_list_front = pp->p_next;
167843e1988Sjohnlev 	if (bln_spare_list_front == NULL)
168843e1988Sjohnlev 		bln_spare_list_back = NULL;
169843e1988Sjohnlev 	pp->p_next = NULL;
170843e1988Sjohnlev 	return (pp);
171843e1988Sjohnlev }
172843e1988Sjohnlev 
173843e1988Sjohnlev /*
174843e1988Sjohnlev  * NOTE: We currently do not support growing beyond the boot memory size,
175843e1988Sjohnlev  * so the following function will not be called.  It is left in here with
176843e1988Sjohnlev  * the hope that someday this restriction can be lifted, and this code can
177843e1988Sjohnlev  * be used.
178843e1988Sjohnlev  */
179843e1988Sjohnlev 
180843e1988Sjohnlev /*
181843e1988Sjohnlev  * This structure is placed at the start of every block of new pages
182843e1988Sjohnlev  */
183843e1988Sjohnlev typedef struct {
184843e1988Sjohnlev 	struct memseg	memseg;
185843e1988Sjohnlev 	struct memlist	memlist;
186843e1988Sjohnlev 	page_t		pages[1];
187843e1988Sjohnlev } mem_structs_t;
188843e1988Sjohnlev 
189843e1988Sjohnlev /*
190843e1988Sjohnlev  * To make the math below slightly less confusing, we calculate the first
191843e1988Sjohnlev  * two parts here.  page_t's are handled separately, so they are not included.
192843e1988Sjohnlev  */
193843e1988Sjohnlev #define	MEM_STRUCT_SIZE	(sizeof (struct memseg) + sizeof (struct memlist))
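
/*
 * (Illustrative layout sketch added for clarity; not in the original.)
 * A block of metapgs meta pages is carved up as:
 *
 *	[ struct memseg | struct memlist | page_t[0 .. num_pages - 1] ]
 *	 \_______ MEM_STRUCT_SIZE ______/
 *
 * where the num_pages page_t structures describe the meta pages themselves
 * followed by the remaining data pages (see the loops below).
 */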
194843e1988Sjohnlev 
195843e1988Sjohnlev /*
196843e1988Sjohnlev  * We want to add memory, but have no spare page_t structures.  Use some of
197843e1988Sjohnlev  * our new memory for the page_t structures.
198843e1988Sjohnlev  *
199843e1988Sjohnlev  * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
200843e1988Sjohnlev  */
201843e1988Sjohnlev static int
202843e1988Sjohnlev balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
203843e1988Sjohnlev {
204843e1988Sjohnlev 	pgcnt_t	metapgs, totalpgs, num_pages;
205843e1988Sjohnlev 	paddr_t	metasz;
206843e1988Sjohnlev 	pfn_t	meta_start;
207843e1988Sjohnlev 	page_t	*page_array;
208843e1988Sjohnlev 	caddr_t	va;
209843e1988Sjohnlev 	int	i, rv, locked;
210843e1988Sjohnlev 	mem_structs_t *mem;
211843e1988Sjohnlev 	struct memseg *segp;
212843e1988Sjohnlev 
213843e1988Sjohnlev 	/* Calculate the number of pages we're going to add */
214843e1988Sjohnlev 	totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;
215843e1988Sjohnlev 
216843e1988Sjohnlev 	/*
217843e1988Sjohnlev 	 * The following calculates the number of "meta" pages -- the pages
218843e1988Sjohnlev 	 * that will be required to hold page_t structures for all new pages.
219843e1988Sjohnlev 	 * Proof of this calculation is left up to the reader; a sketch follows below.
220843e1988Sjohnlev 	 */
221843e1988Sjohnlev 	metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
222843e1988Sjohnlev 	    (PAGESIZE + sizeof (page_t)));
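
	/*
	 * (A sketch of the proof, added for clarity:) if m of the totalpgs
	 * pages become meta pages, the remaining (totalpgs - m) pages each
	 * need a page_t within those m pages, so
	 *
	 *	m * PAGESIZE >= (totalpgs - m) * sizeof (page_t)
	 *
	 * which rearranges to
	 *
	 *	m >= totalpgs * sizeof (page_t) / (PAGESIZE + sizeof (page_t))
	 *	   = totalpgs - totalpgs * PAGESIZE / (PAGESIZE + sizeof (page_t))
	 *
	 * The expression above computes exactly this, and the integer (floor)
	 * division of the subtracted term rounds m up.
	 */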
223843e1988Sjohnlev 
224843e1988Sjohnlev 	/*
225843e1988Sjohnlev 	 * Given the number of page_t structures we need, is there also
226843e1988Sjohnlev 	 * room in our meta pages for a memseg and memlist struct?
227843e1988Sjohnlev 	 * If not, we'll need one more meta page.
228843e1988Sjohnlev 	 */
229843e1988Sjohnlev 	if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
230843e1988Sjohnlev 	    MEM_STRUCT_SIZE))
231843e1988Sjohnlev 		metapgs++;
232843e1988Sjohnlev 
233843e1988Sjohnlev 	/*
234843e1988Sjohnlev 	 * metapgs is calculated from totalpgs, which may be much larger than
235843e1988Sjohnlev 	 * count.  If we don't have enough pages, all of the pages in this
236843e1988Sjohnlev 	 * batch will be made meta pages, and a future trip through
237843e1988Sjohnlev 	 * balloon_inc_reservation() will add the rest of the meta pages.
238843e1988Sjohnlev 	 */
239843e1988Sjohnlev 	if (metapgs > count)
240843e1988Sjohnlev 		metapgs = count;
241843e1988Sjohnlev 
242843e1988Sjohnlev 	/*
243843e1988Sjohnlev 	 * Figure out the number of page_t structures that can fit in metapgs
244843e1988Sjohnlev 	 *
245843e1988Sjohnlev 	 * This will cause us to initialize more page_t structures than we
246843e1988Sjohnlev 	 * need - these may be used in future memory increases.
247843e1988Sjohnlev 	 */
248843e1988Sjohnlev 	metasz = pfn_to_pa(metapgs);
249843e1988Sjohnlev 	num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);
250843e1988Sjohnlev 
251843e1988Sjohnlev 	DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
252843e1988Sjohnlev 	    num_pages, pgcnt_t, metapgs);
253843e1988Sjohnlev 
254843e1988Sjohnlev 	/*
255843e1988Sjohnlev 	 * We only increment mfn_count by count, not num_pages, to keep the
256843e1988Sjohnlev 	 * space of all valid pfns contiguous.  This means we create page_t
257843e1988Sjohnlev 	 * structures with invalid pagenums -- we deal with this situation
258843e1988Sjohnlev 	 * in balloon_page_sub.
259843e1988Sjohnlev 	 */
260843e1988Sjohnlev 	mfn_count += count;
261843e1988Sjohnlev 
262843e1988Sjohnlev 	/*
263843e1988Sjohnlev 	 * Get a VA for the pages that will hold page_t and other structures.
264843e1988Sjohnlev 	 * The memseg and memlist structures will go at the beginning, with
265843e1988Sjohnlev 	 * the page_t structures following.
266843e1988Sjohnlev 	 */
267843e1988Sjohnlev 	va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
268843e1988Sjohnlev 	/* LINTED: improper alignment */
269843e1988Sjohnlev 	mem = (mem_structs_t *)va;
270843e1988Sjohnlev 	page_array = mem->pages;
271843e1988Sjohnlev 
272843e1988Sjohnlev 	meta_start = bln_stats.bln_max_pages;
273843e1988Sjohnlev 
274843e1988Sjohnlev 	/*
275843e1988Sjohnlev 	 * Set the mfn to pfn mapping for the meta pages.
276843e1988Sjohnlev 	 */
277843e1988Sjohnlev 	locked = balloon_lock_contig_pfnlist(metapgs);
278843e1988Sjohnlev 	for (i = 0; i < metapgs; i++) {
279843e1988Sjohnlev 		reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
280843e1988Sjohnlev 	}
281843e1988Sjohnlev 	if (locked)
282843e1988Sjohnlev 		unlock_contig_pfnlist();
283843e1988Sjohnlev 
284843e1988Sjohnlev 	/*
285843e1988Sjohnlev 	 * For our meta pages, map them in and zero the page.
286843e1988Sjohnlev 	 * For our meta pages, map them in and zero them.
287843e1988Sjohnlev 	 */
288843e1988Sjohnlev 	hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
289843e1988Sjohnlev 	    PROT_READ | PROT_WRITE,
290843e1988Sjohnlev 	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
291843e1988Sjohnlev 	bzero(va, metasz);
292843e1988Sjohnlev 
293843e1988Sjohnlev 	/*
294843e1988Sjohnlev 	 * Initialize the page array for the new pages.
295843e1988Sjohnlev 	 */
296843e1988Sjohnlev 	for (i = 0; i < metapgs; i++) {
297843e1988Sjohnlev 		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
298843e1988Sjohnlev 		page_array[i].p_offset = (u_offset_t)-1;
299843e1988Sjohnlev 		page_iolock_init(&page_array[i]);
300843e1988Sjohnlev 		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
301843e1988Sjohnlev 		ASSERT(rv == 1);
302843e1988Sjohnlev 	}
303843e1988Sjohnlev 
304843e1988Sjohnlev 	/*
305843e1988Sjohnlev 	 * For the rest of the pages, initialize the page_t structs and
306843e1988Sjohnlev 	 * add them to the balloon's spare list.
307843e1988Sjohnlev 	 */
308843e1988Sjohnlev 	for (i = metapgs; i < num_pages; i++) {
309843e1988Sjohnlev 		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
310843e1988Sjohnlev 		page_array[i].p_offset = (u_offset_t)-1;
311843e1988Sjohnlev 		page_iolock_init(&page_array[i]);
312843e1988Sjohnlev 		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
313843e1988Sjohnlev 		ASSERT(rv == 1);
314843e1988Sjohnlev 		balloon_page_add(&page_array[i]);
315843e1988Sjohnlev 	}
316843e1988Sjohnlev 
317843e1988Sjohnlev 	/*
318843e1988Sjohnlev 	 * Remember where I said that we don't call this function?  The missing
319843e1988Sjohnlev 	 * code right here is why.  We need to set up kpm mappings for any new
320843e1988Sjohnlev 	 * pages coming in.  However, if someone starts up a domain with small
321843e1988Sjohnlev 	 * memory, then greatly increases it, we could get in some horrible
322843e1988Sjohnlev 	 * deadlock situations as we steal page tables for kpm use, and
323843e1988Sjohnlev 	 * userland applications take them right back before we can use them
324843e1988Sjohnlev 	 * to set up our new memory.  Once a way around that is found, and a
325843e1988Sjohnlev 	 * few other changes are made, we'll be able to enable this code.
326843e1988Sjohnlev 	 */
327843e1988Sjohnlev 
328843e1988Sjohnlev 	/*
329843e1988Sjohnlev 	 * Update kernel structures, part 1: memsegs list
330843e1988Sjohnlev 	 */
331843e1988Sjohnlev 	mem->memseg.pages_base = meta_start;
332843e1988Sjohnlev 	mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
333843e1988Sjohnlev 	mem->memseg.pages = &page_array[0];
334843e1988Sjohnlev 	mem->memseg.epages = &page_array[num_pages - 1];
335843e1988Sjohnlev 	mem->memseg.next = NULL;
336843e1988Sjohnlev 	memsegs_lock(1);
337843e1988Sjohnlev 	for (segp = memsegs; segp->next != NULL; segp = segp->next)
338843e1988Sjohnlev 		;
339843e1988Sjohnlev 	segp->next = &mem->memseg;
340843e1988Sjohnlev 	memsegs_unlock(1);
341843e1988Sjohnlev 
342843e1988Sjohnlev 	/*
343843e1988Sjohnlev 	 * Update kernel structures, part 2: mem_node array
344843e1988Sjohnlev 	 */
345843e1988Sjohnlev 	mem_node_add_slice(meta_start, bln_stats.bln_max_pages);
346843e1988Sjohnlev 
347843e1988Sjohnlev 	/*
348843e1988Sjohnlev 	 * Update kernel structures, part 3: phys_install array
349843e1988Sjohnlev 	 * (*sigh* how many of these things do we need?)
350843e1988Sjohnlev 	 */
351843e1988Sjohnlev 	memlist_write_lock();
352843e1988Sjohnlev 	memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
353843e1988Sjohnlev 	    &phys_install);
354843e1988Sjohnlev 	memlist_write_unlock();
355843e1988Sjohnlev 
356843e1988Sjohnlev 	build_pfn_hash();
357843e1988Sjohnlev 
358843e1988Sjohnlev 	return (metapgs);
359843e1988Sjohnlev }
360843e1988Sjohnlev 
361843e1988Sjohnlev /* How many ulong_t's can we fit on a page? */
362843e1988Sjohnlev #define	FRAME_ARRAY_SIZE	(PAGESIZE / sizeof (ulong_t))
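
/*
 * (Illustrative, assuming the common 4K page size and an 8-byte ulong_t on
 * a 64-bit kernel:) FRAME_ARRAY_SIZE works out to 4096 / 8 = 512, so each
 * pass through the routines below moves at most 512 pages (2 MB).
 */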
363843e1988Sjohnlev 
364843e1988Sjohnlev /*
365843e1988Sjohnlev  * These are too large to declare on the stack, so we make them static instead
366843e1988Sjohnlev  */
367843e1988Sjohnlev static ulong_t	mfn_frames[FRAME_ARRAY_SIZE];
368843e1988Sjohnlev static pfn_t	pfn_frames[FRAME_ARRAY_SIZE];
369843e1988Sjohnlev 
370843e1988Sjohnlev /*
371843e1988Sjohnlev  * This function is called when our reservation is increasing.  Make a
372843e1988Sjohnlev  * hypervisor call to get our new pages, then integrate them into the system.
373843e1988Sjohnlev  */
374843e1988Sjohnlev static spgcnt_t
375843e1988Sjohnlev balloon_inc_reservation(ulong_t credit)
376843e1988Sjohnlev {
377843e1988Sjohnlev 	int	i, cnt, locked;
378843e1988Sjohnlev 	int	meta_pg_start, meta_pg_end;
379843e1988Sjohnlev 	long	rv;
380843e1988Sjohnlev 	page_t	*pp;
381843e1988Sjohnlev 	page_t	*new_list_front, *new_list_back;
382843e1988Sjohnlev 
383d2b85481Srscott 	/* Make sure we're single-threaded. */
384d2b85481Srscott 	ASSERT(MUTEX_HELD(&bln_mutex));
385d2b85481Srscott 
386843e1988Sjohnlev 	rv = 0;
387843e1988Sjohnlev 	new_list_front = new_list_back = NULL;
388843e1988Sjohnlev 	meta_pg_start = meta_pg_end = 0;
389843e1988Sjohnlev 	bzero(mfn_frames, PAGESIZE);
390843e1988Sjohnlev 
391843e1988Sjohnlev 	if (credit > FRAME_ARRAY_SIZE)
392843e1988Sjohnlev 		credit = FRAME_ARRAY_SIZE;
393843e1988Sjohnlev 
394843e1988Sjohnlev 	xen_block_migrate();
395843e1988Sjohnlev 	rv = balloon_alloc_pages(credit, mfn_frames);
396843e1988Sjohnlev 
397843e1988Sjohnlev 	if (rv < 0) {
398843e1988Sjohnlev 		xen_allow_migrate();
399843e1988Sjohnlev 		return (0);
400843e1988Sjohnlev 	}
401843e1988Sjohnlev 	for (i = 0; i < rv; i++) {
402843e1988Sjohnlev 		if (mfn_frames[i] > new_high_mfn)
403843e1988Sjohnlev 			new_high_mfn = mfn_frames[i];
404843e1988Sjohnlev 
405843e1988Sjohnlev 		pp = balloon_page_sub();
406843e1988Sjohnlev 		if (pp == NULL) {
407843e1988Sjohnlev 			/*
408843e1988Sjohnlev 			 * We pass the index into the current mfn array,
409843e1988Sjohnlev 			 * then move the counter past the mfns we used
410843e1988Sjohnlev 			 */
411843e1988Sjohnlev 			meta_pg_start = i;
412843e1988Sjohnlev 			cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
413843e1988Sjohnlev 			i += cnt;
414843e1988Sjohnlev 			meta_pg_end = i;
415843e1988Sjohnlev 			if (i < rv) {
416843e1988Sjohnlev 				pp = balloon_page_sub();
417843e1988Sjohnlev 			} else {
418843e1988Sjohnlev 				ASSERT(i == rv);
419843e1988Sjohnlev 			}
420843e1988Sjohnlev 		}
421843e1988Sjohnlev 		if (pp == NULL) {
422843e1988Sjohnlev 			break;
423843e1988Sjohnlev 		}
424843e1988Sjohnlev 
425843e1988Sjohnlev 		if (new_list_back == NULL) {
426843e1988Sjohnlev 			new_list_front = new_list_back = pp;
427843e1988Sjohnlev 		} else {
428843e1988Sjohnlev 			new_list_back->p_next = pp;
429843e1988Sjohnlev 			new_list_back = pp;
430843e1988Sjohnlev 		}
431843e1988Sjohnlev 		pp->p_next = NULL;
432843e1988Sjohnlev 	}
433843e1988Sjohnlev 	cnt = i;
434843e1988Sjohnlev 	locked = balloon_lock_contig_pfnlist(cnt);
435d2b85481Srscott 	for (i = 0, pp = new_list_front; i < meta_pg_start;
436843e1988Sjohnlev 	    i++, pp = pp->p_next) {
437843e1988Sjohnlev 		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
438843e1988Sjohnlev 	}
439d2b85481Srscott 	for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) {
440843e1988Sjohnlev 		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
441843e1988Sjohnlev 	}
442843e1988Sjohnlev 	if (locked)
443843e1988Sjohnlev 		unlock_contig_pfnlist();
444d2b85481Srscott 
445d2b85481Srscott 	/*
446d2b85481Srscott 	 * Make sure we don't allow pages without pfn->mfn mappings
447d2b85481Srscott 	 * into the system.
448d2b85481Srscott 	 */
449d2b85481Srscott 	ASSERT(pp == NULL);
450d2b85481Srscott 
451843e1988Sjohnlev 	while (new_list_front != NULL) {
452843e1988Sjohnlev 		pp = new_list_front;
453843e1988Sjohnlev 		new_list_front = pp->p_next;
454843e1988Sjohnlev 		page_free(pp, 1);
455843e1988Sjohnlev 	}
456843e1988Sjohnlev 
457d2b85481Srscott 	/*
458d2b85481Srscott 	 * Variable review: at this point, rv contains the number of pages
459d2b85481Srscott 	 * the hypervisor gave us.  cnt contains the number of pages for which
460d2b85481Srscott 	 * we had page_t structures.  i contains the number of pages
461d2b85481Srscott 	 * where we set up pfn <-> mfn mappings.  If this ASSERT trips, that
462d2b85481Srscott 	 * means we somehow lost page_t's from our local list.
463d2b85481Srscott 	 */
464d2b85481Srscott 	ASSERT(cnt == i);
465843e1988Sjohnlev 	if (cnt < rv) {
466843e1988Sjohnlev 		/*
467843e1988Sjohnlev 		 * We couldn't get page structures.
468843e1988Sjohnlev 		 *
469843e1988Sjohnlev 		 * This shouldn't happen, but causes no real harm if it does.
470843e1988Sjohnlev 		 * On debug kernels, we'll flag it.  On all kernels, we'll
471843e1988Sjohnlev 		 * give back the pages we couldn't assign.
472d2b85481Srscott 		 *
473d2b85481Srscott 		 * Since these pages are new to the system and haven't been
474d2b85481Srscott 		 * used, we don't bother zeroing them.
475843e1988Sjohnlev 		 */
476843e1988Sjohnlev #ifdef DEBUG
477d2b85481Srscott 		cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv);
478843e1988Sjohnlev #endif	/* DEBUG */
479843e1988Sjohnlev 
480d2b85481Srscott 		(void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL, NULL);
481843e1988Sjohnlev 
482d2b85481Srscott 		rv = cnt;
483843e1988Sjohnlev 	}
484843e1988Sjohnlev 
485843e1988Sjohnlev 	xen_allow_migrate();
486d2b85481Srscott 	page_unresv(rv - (meta_pg_end - meta_pg_start));
487843e1988Sjohnlev 	return (rv);
488843e1988Sjohnlev }
489843e1988Sjohnlev 
490843e1988Sjohnlev /*
491843e1988Sjohnlev  * This function is called when we want to decrease the memory reservation
492843e1988Sjohnlev  * of our domain.  Allocate the memory and make a hypervisor call to give
493843e1988Sjohnlev  * it back.
494843e1988Sjohnlev  */
495843e1988Sjohnlev static spgcnt_t
496843e1988Sjohnlev balloon_dec_reservation(ulong_t debit)
497843e1988Sjohnlev {
498843e1988Sjohnlev 	int	i, locked;
499843e1988Sjohnlev 	long	rv;
500d2b85481Srscott 	ulong_t	request;
501843e1988Sjohnlev 	page_t	*pp;
502843e1988Sjohnlev 
503843e1988Sjohnlev 	bzero(mfn_frames, sizeof (mfn_frames));
504843e1988Sjohnlev 	bzero(pfn_frames, sizeof (pfn_frames));
505843e1988Sjohnlev 
506843e1988Sjohnlev 	if (debit > FRAME_ARRAY_SIZE) {
507843e1988Sjohnlev 		debit = FRAME_ARRAY_SIZE;
508843e1988Sjohnlev 	}
509d2b85481Srscott 	request = debit;
510843e1988Sjohnlev 
511843e1988Sjohnlev 	/*
512843e1988Sjohnlev 	 * Don't bother if there isn't a safe amount of kmem left.
513843e1988Sjohnlev 	 */
514843e1988Sjohnlev 	if (kmem_avail() < balloon_minkmem) {
515843e1988Sjohnlev 		kmem_reap();
516843e1988Sjohnlev 		if (kmem_avail() < balloon_minkmem)
517843e1988Sjohnlev 			return (0);
518843e1988Sjohnlev 	}
519843e1988Sjohnlev 
520d2b85481Srscott 	if (page_resv(request, KM_NOSLEEP) == 0) {
521843e1988Sjohnlev 		return (0);
522843e1988Sjohnlev 	}
523843e1988Sjohnlev 	xen_block_migrate();
524843e1988Sjohnlev 	for (i = 0; i < debit; i++) {
525843e1988Sjohnlev 		pp = page_get_high_mfn(new_high_mfn);
526843e1988Sjohnlev 		new_high_mfn = 0;
527843e1988Sjohnlev 		if (pp == NULL) {
528843e1988Sjohnlev 			/*
529843e1988Sjohnlev 			 * Call kmem_reap(), then try once more,
530843e1988Sjohnlev 			 * but only if there is a safe amount of
531843e1988Sjohnlev 			 * kmem left.
532843e1988Sjohnlev 			 */
533843e1988Sjohnlev 			kmem_reap();
534843e1988Sjohnlev 			if (kmem_avail() < balloon_minkmem ||
535843e1988Sjohnlev 			    (pp = page_get_high_mfn(0)) == NULL) {
536843e1988Sjohnlev 				debit = i;
537843e1988Sjohnlev 				break;
538843e1988Sjohnlev 			}
539843e1988Sjohnlev 		}
540843e1988Sjohnlev 		ASSERT(PAGE_EXCL(pp));
541843e1988Sjohnlev 		ASSERT(!hat_page_is_mapped(pp));
542843e1988Sjohnlev 
543843e1988Sjohnlev 		balloon_page_add(pp);
544843e1988Sjohnlev 		pfn_frames[i] = pp->p_pagenum;
545843e1988Sjohnlev 		mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
546843e1988Sjohnlev 	}
547843e1988Sjohnlev 	if (debit == 0) {
548843e1988Sjohnlev 		xen_allow_migrate();
549d2b85481Srscott 		page_unresv(request);
550843e1988Sjohnlev 		return (0);
551843e1988Sjohnlev 	}
552843e1988Sjohnlev 
553843e1988Sjohnlev 	/*
554d2b85481Srscott 	 * We zero all the pages before we start reassigning them in order to
555d2b85481Srscott 	 * minimize the time spent holding the lock on the contig pfn list.
556d2b85481Srscott 	 */
557d2b85481Srscott 	if (balloon_zero_memory) {
558d2b85481Srscott 		for (i = 0; i < debit; i++) {
559d2b85481Srscott 			pfnzero(pfn_frames[i], 0, PAGESIZE);
560d2b85481Srscott 		}
561d2b85481Srscott 	}
562d2b85481Srscott 
563d2b85481Srscott 	/*
564843e1988Sjohnlev 	 * Remove all mappings for the pfns from the system
565843e1988Sjohnlev 	 */
566843e1988Sjohnlev 	locked = balloon_lock_contig_pfnlist(debit);
567843e1988Sjohnlev 	for (i = 0; i < debit; i++) {
568843e1988Sjohnlev 		reassign_pfn(pfn_frames[i], MFN_INVALID);
569843e1988Sjohnlev 	}
570843e1988Sjohnlev 	if (locked)
571843e1988Sjohnlev 		unlock_contig_pfnlist();
572843e1988Sjohnlev 
573843e1988Sjohnlev 	rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);
574843e1988Sjohnlev 
575843e1988Sjohnlev 	if (rv < 0) {
576843e1988Sjohnlev 		cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
577843e1988Sjohnlev 		    "failed - up to %lu pages lost (error = %ld)", debit, rv);
578843e1988Sjohnlev 		rv = 0;
579843e1988Sjohnlev 	} else if (rv != debit) {
580843e1988Sjohnlev 		panic("Unexpected return value (%ld) from decrease reservation "
581843e1988Sjohnlev 		    "hypervisor call", rv);
582843e1988Sjohnlev 	}
583843e1988Sjohnlev 
584843e1988Sjohnlev 	xen_allow_migrate();
585d2b85481Srscott 	if (debit != request)
586d2b85481Srscott 		page_unresv(request - debit);
587843e1988Sjohnlev 	return (rv);
588843e1988Sjohnlev }
589843e1988Sjohnlev 
590843e1988Sjohnlev /*
591843e1988Sjohnlev  * This function is the callback which is called when the memory/target
592843e1988Sjohnlev  * node is changed.  When it is fired, we will read a new reservation
593843e1988Sjohnlev  * target for our domain and signal the worker thread to make the change.
594843e1988Sjohnlev  *
595843e1988Sjohnlev  * If the reservation is larger than we can handle, we issue a warning.  dom0
596843e1988Sjohnlev  * does this automatically every boot, so we skip the first warning on dom0.
597843e1988Sjohnlev  */
598843e1988Sjohnlev /*ARGSUSED*/
599843e1988Sjohnlev static void
600843e1988Sjohnlev balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
601843e1988Sjohnlev {
602843e1988Sjohnlev 	ulong_t new_target_kb;
603843e1988Sjohnlev 	pgcnt_t	new_target_pages;
604843e1988Sjohnlev 	int rv;
605843e1988Sjohnlev 	static uchar_t warning_cnt = 0;
606843e1988Sjohnlev 
607843e1988Sjohnlev 	rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb);
608843e1988Sjohnlev 	if (rv != 0) {
609843e1988Sjohnlev 		return;
610843e1988Sjohnlev 	}
611843e1988Sjohnlev 
612843e1988Sjohnlev 	/* new_target is in kB - change this to pages */
613843e1988Sjohnlev 	new_target_pages = kbtop(new_target_kb);
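
	/*
	 * (Illustrative, assuming 4K pages:) a target of 1048576 kB (1 GB)
	 * would become 262144 pages here.
	 */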
614843e1988Sjohnlev 
615843e1988Sjohnlev 	DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);
616843e1988Sjohnlev 
617843e1988Sjohnlev 	/*
618843e1988Sjohnlev 	 * Unfortunately, dom0 may give us a target that is larger than
619843e1988Sjohnlev 	 * our max limit.  Re-check the limit, and, if the new target is
620843e1988Sjohnlev 	 * too large, adjust it downwards.
621843e1988Sjohnlev 	 */
622843e1988Sjohnlev 	mutex_enter(&bln_mutex);
623843e1988Sjohnlev 	if (new_target_pages > bln_stats.bln_max_pages) {
624843e1988Sjohnlev 		DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
625843e1988Sjohnlev 		    new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
626843e1988Sjohnlev 		if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
627843e1988Sjohnlev 			cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
628843e1988Sjohnlev 			    "larger than original memory size (0x%lx pages). "
629843e1988Sjohnlev 			    "Ballooning beyond original memory size is not "
630843e1988Sjohnlev 			    "allowed.",
631843e1988Sjohnlev 			    new_target_pages, bln_stats.bln_max_pages);
632843e1988Sjohnlev 		}
633843e1988Sjohnlev 		warning_cnt = 1;
634843e1988Sjohnlev 		bln_stats.bln_new_target = bln_stats.bln_max_pages;
635843e1988Sjohnlev 	} else {
636843e1988Sjohnlev 		bln_stats.bln_new_target = new_target_pages;
637843e1988Sjohnlev 	}
638843e1988Sjohnlev 
639843e1988Sjohnlev 	mutex_exit(&bln_mutex);
640843e1988Sjohnlev 	cv_signal(&bln_cv);
641843e1988Sjohnlev }
642843e1988Sjohnlev 
643843e1988Sjohnlev /*
644843e1988Sjohnlev  * bln_wait_sec can be used to throttle the hv calls, but by default it's
645843e1988Sjohnlev  * turned off.  If a balloon attempt fails, the wait time is forced on, and
646843e1988Sjohnlev  * then is exponentially increased as further attempts fail.
647843e1988Sjohnlev  */
648843e1988Sjohnlev uint_t bln_wait_sec = 0;
649843e1988Sjohnlev uint_t bln_wait_shift = 1;
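
/*
 * (Worked example, assuming the defaults above:) while an adjustment is still
 * pending, the worker waits bln_wait seconds between attempts.  A failed
 * attempt forces bln_wait from 0 to 1 second, and each further failure
 * doubles it (1, 2, 4, 8, ...); a successful adjustment resets it to
 * bln_wait_sec.
 */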
650843e1988Sjohnlev 
651843e1988Sjohnlev /*
652843e1988Sjohnlev  * This is the main balloon thread.  Wait on the cv.  When woken, if our
653843e1988Sjohnlev  * reservation has changed, call the appropriate function to adjust the
654843e1988Sjohnlev  * reservation.
655843e1988Sjohnlev  */
656843e1988Sjohnlev static void
657843e1988Sjohnlev balloon_worker_thread(void)
658843e1988Sjohnlev {
659843e1988Sjohnlev 	uint_t		bln_wait;
660843e1988Sjohnlev 	callb_cpr_t	cprinfo;
661843e1988Sjohnlev 	spgcnt_t	rv;
662843e1988Sjohnlev 
663843e1988Sjohnlev 	bln_wait = bln_wait_sec;
664843e1988Sjohnlev 
665843e1988Sjohnlev 	CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
666843e1988Sjohnlev 	for (;;) {
667843e1988Sjohnlev 		rv = 0;
668843e1988Sjohnlev 
669843e1988Sjohnlev 		mutex_enter(&bln_mutex);
670843e1988Sjohnlev 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
671843e1988Sjohnlev 		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
672843e1988Sjohnlev 			/*
673843e1988Sjohnlev 			 * We weren't able to fully complete the request
674843e1988Sjohnlev 			 * last time through, so try again.
675843e1988Sjohnlev 			 */
676*d3d50737SRafael Vanoni 			(void) cv_reltimedwait(&bln_cv, &bln_mutex,
677*d3d50737SRafael Vanoni 			    (bln_wait * hz), TR_CLOCK_TICK);
678843e1988Sjohnlev 		} else {
679843e1988Sjohnlev 			cv_wait(&bln_cv, &bln_mutex);
680843e1988Sjohnlev 		}
681843e1988Sjohnlev 		CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);
682843e1988Sjohnlev 
683843e1988Sjohnlev 		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
684843e1988Sjohnlev 			if (bln_stats.bln_new_target <
685843e1988Sjohnlev 			    bln_stats.bln_current_pages) {
686843e1988Sjohnlev 				/* reservation shrunk */
687843e1988Sjohnlev 				rv = -balloon_dec_reservation(
688843e1988Sjohnlev 				    bln_stats.bln_current_pages -
689843e1988Sjohnlev 				    bln_stats.bln_new_target);
690843e1988Sjohnlev 			} else if (bln_stats.bln_new_target >
691843e1988Sjohnlev 			    bln_stats.bln_current_pages) {
692843e1988Sjohnlev 				/* reservation grew */
693843e1988Sjohnlev 				rv = balloon_inc_reservation(
694843e1988Sjohnlev 				    bln_stats.bln_new_target -
695843e1988Sjohnlev 				    bln_stats.bln_current_pages);
696843e1988Sjohnlev 			}
697843e1988Sjohnlev 		}
698843e1988Sjohnlev 		if (rv == 0) {
699843e1988Sjohnlev 			if (bln_wait == 0) {
700843e1988Sjohnlev 				bln_wait = 1;
701843e1988Sjohnlev 			} else {
702843e1988Sjohnlev 				bln_wait <<= bln_wait_shift;
703843e1988Sjohnlev 			}
704843e1988Sjohnlev 		} else {
705843e1988Sjohnlev 			bln_stats.bln_current_pages += rv;
706843e1988Sjohnlev 			bln_wait = bln_wait_sec;
707843e1988Sjohnlev 		}
708843e1988Sjohnlev 		if (bln_stats.bln_current_pages < bln_stats.bln_low)
709843e1988Sjohnlev 			bln_stats.bln_low = bln_stats.bln_current_pages;
710843e1988Sjohnlev 		else if (bln_stats.bln_current_pages > bln_stats.bln_high)
711843e1988Sjohnlev 			bln_stats.bln_high = bln_stats.bln_current_pages;
712843e1988Sjohnlev 		mutex_exit(&bln_mutex);
713843e1988Sjohnlev 	}
714843e1988Sjohnlev }
715843e1988Sjohnlev 
716843e1988Sjohnlev /*
717843e1988Sjohnlev  * Called after balloon_init(), which is below.  The xenbus thread is up
718843e1988Sjohnlev  * and running, so we can register our watch and create the balloon thread.
719843e1988Sjohnlev  */
720843e1988Sjohnlev static void
721843e1988Sjohnlev balloon_config_watch(int state)
722843e1988Sjohnlev {
723843e1988Sjohnlev 	if (state != XENSTORE_UP)
724843e1988Sjohnlev 		return;
725843e1988Sjohnlev 
726843e1988Sjohnlev 	bln_watch.node = "memory/target";
727843e1988Sjohnlev 	bln_watch.callback = balloon_handler;
728843e1988Sjohnlev 	if (register_xenbus_watch(&bln_watch)) {
729843e1988Sjohnlev 		cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
730843e1988Sjohnlev 		    "thread will be disabled");
731843e1988Sjohnlev 		return;
732843e1988Sjohnlev 	}
733843e1988Sjohnlev 
734843e1988Sjohnlev 	if (bln_thread == NULL)
735843e1988Sjohnlev 		bln_thread = thread_create(NULL, 0, balloon_worker_thread,
736843e1988Sjohnlev 		    NULL, 0, &p0, TS_RUN, minclsyspri);
737843e1988Sjohnlev }
738843e1988Sjohnlev 
739843e1988Sjohnlev /*
740843e1988Sjohnlev  * Basic initialization of the balloon thread.  Set all of our variables,
741843e1988Sjohnlev  * and register a callback for later when we can register a xenbus watch.
742843e1988Sjohnlev  */
743843e1988Sjohnlev void
744843e1988Sjohnlev balloon_init(pgcnt_t nr_pages)
745843e1988Sjohnlev {
746843e1988Sjohnlev 	domid_t domid = DOMID_SELF;
747843e1988Sjohnlev 
748843e1988Sjohnlev 	bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
749843e1988Sjohnlev 	bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
750843e1988Sjohnlev 	bln_stats.bln_max_pages = nr_pages;
751843e1988Sjohnlev 	cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);
752843e1988Sjohnlev 
753843e1988Sjohnlev 	bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
754843e1988Sjohnlev 	    XENMEM_maximum_reservation, &domid);
755843e1988Sjohnlev 
756843e1988Sjohnlev 	(void) xs_register_xenbus_callback(balloon_config_watch);
757843e1988Sjohnlev }
758843e1988Sjohnlev 
759843e1988Sjohnlev /*
760843e1988Sjohnlev  * These functions are called from the network drivers when they gain a page
761843e1988Sjohnlev  * or give one away.  We simply update our count.  Note that the counter
762843e1988Sjohnlev  * tracks the number of pages we give away, so we need to subtract any
763843e1988Sjohnlev  * amount passed to balloon_drv_added.
764843e1988Sjohnlev  */
765843e1988Sjohnlev void
766843e1988Sjohnlev balloon_drv_added(int64_t delta)
767843e1988Sjohnlev {
768843e1988Sjohnlev 	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
769843e1988Sjohnlev }
770843e1988Sjohnlev 
771843e1988Sjohnlev void
772843e1988Sjohnlev balloon_drv_subtracted(int64_t delta)
773843e1988Sjohnlev {
774843e1988Sjohnlev 	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
775843e1988Sjohnlev }
776843e1988Sjohnlev 
777843e1988Sjohnlev /*
778843e1988Sjohnlev  * balloon_alloc_pages()
779843e1988Sjohnlev  *	Allocate page_cnt mfns.  mfns storage provided by the caller.  Returns
780843e1988Sjohnlev  *	the number of pages allocated, which could be less than page_cnt, or
781843e1988Sjohnlev  *	a negative number if an error occurred.
782843e1988Sjohnlev  */
783843e1988Sjohnlev long
784843e1988Sjohnlev balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
785843e1988Sjohnlev {
786843e1988Sjohnlev 	xen_memory_reservation_t memres;
787843e1988Sjohnlev 	long rv;
788843e1988Sjohnlev 
789843e1988Sjohnlev 	bzero(&memres, sizeof (memres));
790843e1988Sjohnlev 	/*LINTED: constant in conditional context*/
791843e1988Sjohnlev 	set_xen_guest_handle(memres.extent_start, mfns);
792843e1988Sjohnlev 	memres.domid = DOMID_SELF;
793843e1988Sjohnlev 	memres.nr_extents = page_cnt;
794843e1988Sjohnlev 
795843e1988Sjohnlev 	rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
796843e1988Sjohnlev 	if (rv > 0)
797843e1988Sjohnlev 		atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
798843e1988Sjohnlev 	return (rv);
799843e1988Sjohnlev }
800843e1988Sjohnlev 
801843e1988Sjohnlev /*
802843e1988Sjohnlev  * balloon_free_pages()
803843e1988Sjohnlev  *    free page_cnt pages, using any combination of mfns, pfns, and kva as long
804d2b85481Srscott  *    as they refer to the same mapping.  If an array of mfns is passed in, we
805d2b85481Srscott  *    assume they were already cleared.  Otherwise, we need to zero the pages
806d2b85481Srscott  *    before giving them back to the hypervisor. kva space is not free'd up in
807d2b85481Srscott  *    case the caller wants to re-use it.
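 *
 *    (Illustrative calls, matching the callers elsewhere in this file:)
 *	balloon_free_pages(cnt, mfn_frames, NULL, NULL)		batch, by mfn array
 *	balloon_free_pages(1, NULL, NULL, &pp->p_pagenum)	one page, by pfn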
808843e1988Sjohnlev  */
809843e1988Sjohnlev long
810843e1988Sjohnlev balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
811843e1988Sjohnlev {
812843e1988Sjohnlev 	xen_memory_reservation_t memdec;
813843e1988Sjohnlev 	mfn_t mfn;
814843e1988Sjohnlev 	pfn_t pfn;
815843e1988Sjohnlev 	uint_t i;
816843e1988Sjohnlev 	long e;
817843e1988Sjohnlev 
818843e1988Sjohnlev 
819843e1988Sjohnlev #if DEBUG
820843e1988Sjohnlev 	/* make sure kva is page aligned and maps to first pfn */
821843e1988Sjohnlev 	if (kva != NULL) {
822843e1988Sjohnlev 		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
823843e1988Sjohnlev 		if (pfns != NULL) {
824843e1988Sjohnlev 			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
825843e1988Sjohnlev 		}
826843e1988Sjohnlev 	}
827843e1988Sjohnlev #endif
828843e1988Sjohnlev 
829843e1988Sjohnlev 	/* if we have a kva, we can clean all pages with just one bzero */
830843e1988Sjohnlev 	if ((kva != NULL) && balloon_zero_memory) {
831843e1988Sjohnlev 		bzero(kva, (page_cnt * PAGESIZE));
832843e1988Sjohnlev 	}
833843e1988Sjohnlev 
834843e1988Sjohnlev 	/* if we were given a kva and/or a pfn */
835843e1988Sjohnlev 	if ((kva != NULL) || (pfns != NULL)) {
836843e1988Sjohnlev 
837843e1988Sjohnlev 		/*
838843e1988Sjohnlev 		 * All the current callers only pass 1 page when using kva or
839843e1988Sjohnlev 		 * pfns, and use mfns when passing multiple pages.  If that
840843e1988Sjohnlev 		 * assumption is changed, the following code will need some
841843e1988Sjohnlev 		 * work.  The following ASSERT() guarantees we're respecting
842843e1988Sjohnlev 		 * work.  The following ASSERT() guarantees we're respecting
843843e1988Sjohnlev 		 * the contig pfnlist locking quota.
844843e1988Sjohnlev 		ASSERT(page_cnt < bln_contig_list_quota);
845843e1988Sjohnlev 
846843e1988Sjohnlev 		/* go through all the pages */
847843e1988Sjohnlev 		for (i = 0; i < page_cnt; i++) {
848843e1988Sjohnlev 
849843e1988Sjohnlev 			/* get the next pfn */
850843e1988Sjohnlev 			if (pfns == NULL) {
851843e1988Sjohnlev 				pfn = hat_getpfnum(kas.a_hat,
852843e1988Sjohnlev 				    (kva + (PAGESIZE * i)));
853843e1988Sjohnlev 			} else {
854843e1988Sjohnlev 				pfn = pfns[i];
855843e1988Sjohnlev 			}
856843e1988Sjohnlev 
857843e1988Sjohnlev 			/*
858843e1988Sjohnlev 			 * if we didn't already zero this page, do it now. we
859843e1988Sjohnlev 			 * need to do this *before* we give back the MFN
860843e1988Sjohnlev 			 */
861843e1988Sjohnlev 			if ((kva == NULL) && (balloon_zero_memory)) {
862d2b85481Srscott 				pfnzero(pfn, 0, PAGESIZE);
863843e1988Sjohnlev 			}
864843e1988Sjohnlev 
865843e1988Sjohnlev 			/*
866843e1988Sjohnlev 			 * unmap the pfn. We don't free up the kva vmem space
867843e1988Sjohnlev 			 * so the caller can re-use it. The page must be
868843e1988Sjohnlev 			 * unmapped before it is given back to the hypervisor.
869843e1988Sjohnlev 			 */
870843e1988Sjohnlev 			if (kva != NULL) {
871843e1988Sjohnlev 				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
872843e1988Sjohnlev 				    PAGESIZE, HAT_UNLOAD_UNMAP);
873843e1988Sjohnlev 			}
874843e1988Sjohnlev 
875843e1988Sjohnlev 			/* grab the mfn before the pfn is marked as invalid */
876843e1988Sjohnlev 			mfn = pfn_to_mfn(pfn);
877843e1988Sjohnlev 
878843e1988Sjohnlev 			/* mark the pfn as invalid */
879843e1988Sjohnlev 			reassign_pfn(pfn, MFN_INVALID);
880843e1988Sjohnlev 
881843e1988Sjohnlev 			/*
882843e1988Sjohnlev 			 * if we weren't given an array of MFNs, we need to
883843e1988Sjohnlev 			 * free them up one at a time. Otherwise, we'll wait
884843e1988Sjohnlev 			 * until later and do it in one hypercall
885843e1988Sjohnlev 			 */
886843e1988Sjohnlev 			if (mfns == NULL) {
887843e1988Sjohnlev 				bzero(&memdec, sizeof (memdec));
888843e1988Sjohnlev 				/*LINTED: constant in conditional context*/
889843e1988Sjohnlev 				set_xen_guest_handle(memdec.extent_start, &mfn);
890843e1988Sjohnlev 				memdec.domid = DOMID_SELF;
891843e1988Sjohnlev 				memdec.nr_extents = 1;
892843e1988Sjohnlev 				e = HYPERVISOR_memory_op(
893843e1988Sjohnlev 				    XENMEM_decrease_reservation, &memdec);
894843e1988Sjohnlev 				if (e != 1) {
895843e1988Sjohnlev 					cmn_err(CE_PANIC, "balloon: unable to "
896843e1988Sjohnlev 					    "give a page back to the "
897843e1988Sjohnlev 					    "hypervisor.\n");
898843e1988Sjohnlev 				}
899843e1988Sjohnlev 			}
900843e1988Sjohnlev 		}
901843e1988Sjohnlev 	}
902843e1988Sjohnlev 
903843e1988Sjohnlev 	/*
904843e1988Sjohnlev 	 * if we were passed in MFNs, we haven't free'd them up yet. We can
905843e1988Sjohnlev 	 * do it with one call.
906843e1988Sjohnlev 	 */
907843e1988Sjohnlev 	if (mfns != NULL) {
908843e1988Sjohnlev 		bzero(&memdec, sizeof (memdec));
909843e1988Sjohnlev 		/*LINTED: constant in conditional context*/
910843e1988Sjohnlev 		set_xen_guest_handle(memdec.extent_start, mfns);
911843e1988Sjohnlev 		memdec.domid = DOMID_SELF;
912843e1988Sjohnlev 		memdec.nr_extents = page_cnt;
913843e1988Sjohnlev 		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
914843e1988Sjohnlev 		if (e != page_cnt) {
915843e1988Sjohnlev 			cmn_err(CE_PANIC, "balloon: unable to give pages back "
916843e1988Sjohnlev 			    "to the hypervisor.\n");
917843e1988Sjohnlev 		}
918843e1988Sjohnlev 	}
919843e1988Sjohnlev 
920843e1988Sjohnlev 	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
921843e1988Sjohnlev 	return (page_cnt);
922843e1988Sjohnlev }
923843e1988Sjohnlev 
924843e1988Sjohnlev 
925843e1988Sjohnlev /*
926843e1988Sjohnlev  * balloon_replace_pages()
927843e1988Sjohnlev  *	Try to replace nextents blocks of 2^order pages.  addr_bits specifies
928843e1988Sjohnlev  *	how many bits of address the pages must be within (i.e. 16 would mean
929843e1988Sjohnlev  *	that the pages cannot have an address > 64k).  The constraints are on
930843e1988Sjohnlev  *	what the hypervisor gives us -- we are free to give any pages in
931843e1988Sjohnlev  *	exchange.  The array pp is the pages we are giving away.  The caller
932843e1988Sjohnlev  *	provides storage space for mfns, which hold the new physical pages.
933843e1988Sjohnlev  */
934843e1988Sjohnlev long
935843e1988Sjohnlev balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
936843e1988Sjohnlev     uint_t order, mfn_t *mfns)
937843e1988Sjohnlev {
938843e1988Sjohnlev 	xen_memory_reservation_t memres;
939843e1988Sjohnlev 	long fallback_cnt;
940843e1988Sjohnlev 	long cnt;
941843e1988Sjohnlev 	uint_t i, j, page_cnt, extlen;
942843e1988Sjohnlev 	long e;
943843e1988Sjohnlev 	int locked;
944843e1988Sjohnlev 
945843e1988Sjohnlev 
946843e1988Sjohnlev 	/*
947843e1988Sjohnlev 	 * we shouldn't be allocating constrained pages on a guest. It doesn't
948843e1988Sjohnlev 	 * make any sense. They won't be constrained after a migration.
949843e1988Sjohnlev 	 */
950843e1988Sjohnlev 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
951843e1988Sjohnlev 
952843e1988Sjohnlev 	extlen = 1 << order;
953843e1988Sjohnlev 	page_cnt = nextents * extlen;
954843e1988Sjohnlev 	/* Give back the current pages to the hypervisor */
955843e1988Sjohnlev 	for (i = 0; i < page_cnt; i++) {
956843e1988Sjohnlev 		cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
957843e1988Sjohnlev 		if (cnt != 1) {
958843e1988Sjohnlev 			cmn_err(CE_PANIC, "balloon: unable to give a page back "
959843e1988Sjohnlev 			    "to the hypervisor.\n");
960843e1988Sjohnlev 		}
961843e1988Sjohnlev 	}
962843e1988Sjohnlev 
963843e1988Sjohnlev 	/*
964843e1988Sjohnlev 	 * try to allocate the new pages using addr_bits and order. If we can't
965843e1988Sjohnlev 	 * get all of the pages, try to get the remaining pages with no
966843e1988Sjohnlev 	 * constraints and, if that was successful, return the number of
967843e1988Sjohnlev 	 * constrained pages we did allocate.
968843e1988Sjohnlev 	 */
969843e1988Sjohnlev 	bzero(&memres, sizeof (memres));
970843e1988Sjohnlev 	/*LINTED: constant in conditional context*/
971843e1988Sjohnlev 	set_xen_guest_handle(memres.extent_start, mfns);
972843e1988Sjohnlev 	memres.domid = DOMID_SELF;
973843e1988Sjohnlev 	memres.nr_extents = nextents;
974349b53ddSStuart Maybee 	memres.mem_flags = XENMEMF_address_bits(addr_bits);
975843e1988Sjohnlev 	memres.extent_order = order;
976843e1988Sjohnlev 	cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
977843e1988Sjohnlev 	/* assign the new MFNs to the current PFNs */
978843e1988Sjohnlev 	locked = balloon_lock_contig_pfnlist(cnt * extlen);
979843e1988Sjohnlev 	for (i = 0; i < cnt; i++) {
980843e1988Sjohnlev 		for (j = 0; j < extlen; j++) {
981843e1988Sjohnlev 			reassign_pfn(pp[i * extlen + j]->p_pagenum,
982843e1988Sjohnlev 			    mfns[i] + j);
983843e1988Sjohnlev 		}
984843e1988Sjohnlev 	}
985843e1988Sjohnlev 	if (locked)
986843e1988Sjohnlev 		unlock_contig_pfnlist();
987843e1988Sjohnlev 	if (cnt != nextents) {
988843e1988Sjohnlev 		if (cnt < 0) {
989843e1988Sjohnlev 			cnt = 0;
990843e1988Sjohnlev 		}
991843e1988Sjohnlev 
992843e1988Sjohnlev 		/*
993843e1988Sjohnlev 		 * We couldn't get enough memory to satisfy our requirements.
994843e1988Sjohnlev 		 * The above loop will assign the parts of the request that
995843e1988Sjohnlev 		 * were successful (this part may be 0).  We need to fill
996843e1988Sjohnlev 		 * in the rest.  The bzero below clears out extent_order and
997843e1988Sjohnlev 		 * mem_flags, so we'll take anything from the hypervisor
998843e1988Sjohnlev 		 * to replace the pages we gave away.
999843e1988Sjohnlev 		 */
1000843e1988Sjohnlev 		fallback_cnt = page_cnt - cnt * extlen;
1001843e1988Sjohnlev 		bzero(&memres, sizeof (memres));
1002843e1988Sjohnlev 		/*LINTED: constant in conditional context*/
1003843e1988Sjohnlev 		set_xen_guest_handle(memres.extent_start, mfns);
1004843e1988Sjohnlev 		memres.domid = DOMID_SELF;
1005843e1988Sjohnlev 		memres.nr_extents = fallback_cnt;
1006843e1988Sjohnlev 		e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
1007843e1988Sjohnlev 		if (e != fallback_cnt) {
1008843e1988Sjohnlev 			cmn_err(CE_PANIC, "balloon: unable to recover from "
1009843e1988Sjohnlev 			    "failed increase_reservation.\n");
1010843e1988Sjohnlev 		}
1011843e1988Sjohnlev 		locked = balloon_lock_contig_pfnlist(fallback_cnt);
1012843e1988Sjohnlev 		for (i = 0; i < fallback_cnt; i++) {
1013843e1988Sjohnlev 			uint_t offset = page_cnt - fallback_cnt;
1014843e1988Sjohnlev 
1015843e1988Sjohnlev 			/*
1016843e1988Sjohnlev 			 * We already used pp[0...(cnt * extlen)] before,
1017843e1988Sjohnlev 			 * so start at the next entry in the pp array.
1018843e1988Sjohnlev 			 */
1019843e1988Sjohnlev 			reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
1020843e1988Sjohnlev 		}
1021843e1988Sjohnlev 		if (locked)
1022843e1988Sjohnlev 			unlock_contig_pfnlist();
1023843e1988Sjohnlev 	}
1024843e1988Sjohnlev 
1025843e1988Sjohnlev 	/*
1026843e1988Sjohnlev 	 * balloon_free_pages increments our counter.  Decrement it here.
1027843e1988Sjohnlev 	 */
1028843e1988Sjohnlev 	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);
1029843e1988Sjohnlev 
1030843e1988Sjohnlev 	/*
1031843e1988Sjohnlev 	 * return the number of extents we were able to replace. If we got
1032843e1988Sjohnlev 	 * this far, we know all the pp's are valid.
1033843e1988Sjohnlev 	 */
1034843e1988Sjohnlev 	return (cnt);
1035843e1988Sjohnlev }
1036843e1988Sjohnlev 
1037843e1988Sjohnlev 
1038843e1988Sjohnlev /*
1039843e1988Sjohnlev  * Called from the driver - return the requested stat.
1040843e1988Sjohnlev  */
1041843e1988Sjohnlev size_t
1042843e1988Sjohnlev balloon_values(int cmd)
1043843e1988Sjohnlev {
1044843e1988Sjohnlev 	switch (cmd) {
1045843e1988Sjohnlev 	case BLN_IOCTL_CURRENT:
1046843e1988Sjohnlev 		return (ptokb(bln_stats.bln_current_pages));
1047843e1988Sjohnlev 	case BLN_IOCTL_TARGET:
1048843e1988Sjohnlev 		return (ptokb(bln_stats.bln_new_target));
1049843e1988Sjohnlev 	case BLN_IOCTL_LOW:
1050843e1988Sjohnlev 		return (ptokb(bln_stats.bln_low));
1051843e1988Sjohnlev 	case BLN_IOCTL_HIGH:
1052843e1988Sjohnlev 		return (ptokb(bln_stats.bln_high));
1053843e1988Sjohnlev 	case BLN_IOCTL_LIMIT:
1054843e1988Sjohnlev 		return (ptokb(bln_stats.bln_hard_limit));
1055843e1988Sjohnlev 	default:
1056843e1988Sjohnlev 		panic("Unexpected cmd %d in balloon_values()\n", cmd);
1057843e1988Sjohnlev 	}
1058843e1988Sjohnlev 	/*NOTREACHED*/
1059843e1988Sjohnlev }