/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include <sys/balloon_impl.h> #include <sys/hypervisor.h> #include <xen/sys/xenbus_impl.h> #include <sys/atomic.h> #include <sys/cmn_err.h> #include <sys/disp.h> #include <sys/callb.h> #include <xen/public/memory.h> #include <vm/hat.h> #include <sys/promif.h> #include <vm/seg_kmem.h> #include <sys/memnode.h> #include <sys/param.h> #include <vm/vm_dep.h> #include <sys/mman.h> #include <sys/memlist.h> #include <sys/sysmacros.h> #include <sys/machsystm.h> #include <sys/sdt.h> /* * This file implements a balloon thread, which controls a domain's memory * reservation, or the amount of memory a domain is currently allocated. * The hypervisor provides the current memory reservation through xenbus, * so we register a watch on this. We will then be signalled when the * reservation changes. If it goes up, we map the new mfn's to our pfn's * (allocating page_t's if necessary), and release them into the system. * If the reservation goes down, we grab pages and release them back to * the hypervisor, saving the page_t's for later use. */ /* * Various structures needed by the balloon thread */ static bln_stats_t bln_stats; static kthread_t *bln_thread; static kmutex_t bln_mutex; static kcondvar_t bln_cv; static struct xenbus_watch bln_watch; static mfn_t new_high_mfn; /* * For holding spare page_t structures - keep a singly-linked list. * The list may hold both valid (pagenum < mfn_count) and invalid * (pagenum >= mfn_count) page_t's. Valid page_t's should be inserted * at the front, and invalid page_t's at the back. Removal should * always be from the front. This is a singly-linked list using * p_next, so p_prev is always NULL. */ static page_t *bln_spare_list_front, *bln_spare_list_back; int balloon_zero_memory = 1; size_t balloon_minkmem = (8 * 1024 * 1024); /* * reassign_pfn() calls update_contig_pfnlist(), which can cause a large * slowdown when calling multiple times. If we're reassigning less than the * quota defined here, we just accept the slowdown. If the count is greater * than the quota, we tell the contig alloc code to stop its accounting until * we're done. Setting the quota to less than 2 is not supported. * * Note that we define our own wrapper around the external * clear_and_lock_contig_pfnlist(), but we just use the version of * unlock_contig_pfnlist() in vm_machdep.c. */ uint_t bln_contig_list_quota = 50; extern void clear_and_lock_contig_pfnlist(void); extern void unlock_contig_pfnlist(void); /* * Lock the pfnlist if necessary (see above), and return whether we locked it. */ static int balloon_lock_contig_pfnlist(int count) { if (count > bln_contig_list_quota) { clear_and_lock_contig_pfnlist(); return (1); } else { return (0); } } /* * The page represented by pp is being given back to the hypervisor. * Add the page_t structure to our spare list. */ static void balloon_page_add(page_t *pp) { /* * We need to keep the page exclusively locked * to prevent swrand from grabbing it. */ ASSERT(PAGE_EXCL(pp)); ASSERT(MUTEX_HELD(&bln_mutex)); pp->p_prev = NULL; if (bln_spare_list_front == NULL) { bln_spare_list_front = bln_spare_list_back = pp; pp->p_next = NULL; } else if (pp->p_pagenum >= mfn_count) { /* * The pfn is invalid, so add at the end of list. Since these * adds should *only* be done by balloon_init_new_pages(), and * that does adds in order, the following ASSERT should * never trigger. */ ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum); bln_spare_list_back->p_next = pp; pp->p_next = NULL; bln_spare_list_back = pp; } else { /* Add at beginning of list */ pp->p_next = bln_spare_list_front; bln_spare_list_front = pp; } } /* * Return a page_t structure from our spare list, or NULL if none are available. */ static page_t * balloon_page_sub(void) { page_t *pp; ASSERT(MUTEX_HELD(&bln_mutex)); if (bln_spare_list_front == NULL) { return (NULL); } pp = bln_spare_list_front; ASSERT(PAGE_EXCL(pp)); ASSERT(pp->p_pagenum <= mfn_count); if (pp->p_pagenum == mfn_count) { return (NULL); } bln_spare_list_front = pp->p_next; if (bln_spare_list_front == NULL) bln_spare_list_back = NULL; pp->p_next = NULL; return (pp); } /* * NOTE: We currently do not support growing beyond the boot memory size, * so the following function will not be called. It is left in here with * the hope that someday this restriction can be lifted, and this code can * be used. */ /* * This structure is placed at the start of every block of new pages */ typedef struct { struct memseg memseg; struct memlist memlist; page_t pages[1]; } mem_structs_t; /* * To make the math below slightly less confusing, we calculate the first * two parts here. page_t's are handled separately, so they are not included. */ #define MEM_STRUCT_SIZE (sizeof (struct memseg) + sizeof (struct memlist)) /* * We want to add memory, but have no spare page_t structures. Use some of * our new memory for the page_t structures. * * Somewhat similar to kphysm_add_memory_dynamic(), but simpler. */ static int balloon_init_new_pages(mfn_t framelist[], pgcnt_t count) { pgcnt_t metapgs, totalpgs, num_pages; paddr_t metasz; pfn_t meta_start; page_t *page_array; caddr_t va; int i, rv, locked; mem_structs_t *mem; struct memseg *segp; /* Calculate the number of pages we're going to add */ totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages; /* * The following calculates the number of "meta" pages -- the pages * that will be required to hold page_t structures for all new pages. * Proof of this calculation is left up to the reader. */ metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) / (PAGESIZE + sizeof (page_t))); /* * Given the number of page_t structures we need, is there also * room in our meta pages for a memseg and memlist struct? * If not, we'll need one more meta page. */ if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) + MEM_STRUCT_SIZE)) metapgs++; /* * metapgs is calculated from totalpgs, which may be much larger than * count. If we don't have enough pages, all of the pages in this * batch will be made meta pages, and a future trip through * balloon_inc_reservation() will add the rest of the meta pages. */ if (metapgs > count) metapgs = count; /* * Figure out the number of page_t structures that can fit in metapgs * * This will cause us to initialize more page_t structures than we * need - these may be used in future memory increases. */ metasz = pfn_to_pa(metapgs); num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t); DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t, num_pages, pgcnt_t, metapgs); /* * We only increment mfn_count by count, not num_pages, to keep the * space of all valid pfns contiguous. This means we create page_t * structures with invalid pagenums -- we deal with this situation * in balloon_page_sub. */ mfn_count += count; /* * Get a VA for the pages that will hold page_t and other structures. * The memseg and memlist structures will go at the beginning, with * the page_t structures following. */ va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP); /* LINTED: improper alignment */ mem = (mem_structs_t *)va; page_array = mem->pages; meta_start = bln_stats.bln_max_pages; /* * Set the mfn to pfn mapping for the meta pages. */ locked = balloon_lock_contig_pfnlist(metapgs); for (i = 0; i < metapgs; i++) { reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]); } if (locked) unlock_contig_pfnlist(); /* * For our meta pages, map them in and zero the page. * This will be the first time touching the new pages. */ hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages, PROT_READ | PROT_WRITE, HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); bzero(va, metasz); /* * Initialize the page array for the new pages. */ for (i = 0; i < metapgs; i++) { page_array[i].p_pagenum = bln_stats.bln_max_pages++; page_array[i].p_offset = (u_offset_t)-1; page_iolock_init(&page_array[i]); rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM); ASSERT(rv == 1); } /* * For the rest of the pages, initialize the page_t struct and * add them to the free list */ for (i = metapgs; i < num_pages; i++) { page_array[i].p_pagenum = bln_stats.bln_max_pages++; page_array[i].p_offset = (u_offset_t)-1; page_iolock_init(&page_array[i]); rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM); ASSERT(rv == 1); balloon_page_add(&page_array[i]); } /* * Remember where I said that we don't call this function? The missing * code right here is why. We need to set up kpm mappings for any new * pages coming in. However, if someone starts up a domain with small * memory, then greatly increases it, we could get in some horrible * deadlock situations as we steal page tables for kpm use, and * userland applications take them right back before we can use them * to set up our new memory. Once a way around that is found, and a * few other changes are made, we'll be able to enable this code. */ /* * Update kernel structures, part 1: memsegs list */ mem->memseg.pages_base = meta_start; mem->memseg.pages_end = bln_stats.bln_max_pages - 1; mem->memseg.pages = &page_array[0]; mem->memseg.epages = &page_array[num_pages - 1]; mem->memseg.next = NULL; memsegs_lock(1); for (segp = memsegs; segp->next != NULL; segp = segp->next) ; segp->next = &mem->memseg; memsegs_unlock(1); /* * Update kernel structures, part 2: mem_node array */ mem_node_add_slice(meta_start, bln_stats.bln_max_pages); /* * Update kernel structures, part 3: phys_install array * (*sigh* how many of these things do we need?) */ memlist_write_lock(); memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist, &phys_install); memlist_write_unlock(); build_pfn_hash(); return (metapgs); } /* How many ulong_t's can we fit on a page? */ #define FRAME_ARRAY_SIZE (PAGESIZE / sizeof (ulong_t)) /* * These are too large to declare on the stack, so we make them static instead */ static ulong_t mfn_frames[FRAME_ARRAY_SIZE]; static pfn_t pfn_frames[FRAME_ARRAY_SIZE]; /* * This function is called when our reservation is increasing. Make a * hypervisor call to get our new pages, then integrate them into the system. */ static spgcnt_t balloon_inc_reservation(ulong_t credit) { int i, cnt, locked; int meta_pg_start, meta_pg_end; long rv; page_t *pp; page_t *new_list_front, *new_list_back; /* Make sure we're single-threaded. */ ASSERT(MUTEX_HELD(&bln_mutex)); rv = 0; new_list_front = new_list_back = NULL; meta_pg_start = meta_pg_end = 0; bzero(mfn_frames, PAGESIZE); if (credit > FRAME_ARRAY_SIZE) credit = FRAME_ARRAY_SIZE; xen_block_migrate(); rv = balloon_alloc_pages(credit, mfn_frames); if (rv < 0) { xen_allow_migrate(); return (0); } for (i = 0; i < rv; i++) { if (mfn_frames[i] > new_high_mfn) new_high_mfn = mfn_frames[i]; pp = balloon_page_sub(); if (pp == NULL) { /* * We pass the index into the current mfn array, * then move the counter past the mfns we used */ meta_pg_start = i; cnt = balloon_init_new_pages(&mfn_frames[i], rv - i); i += cnt; meta_pg_end = i; if (i < rv) { pp = balloon_page_sub(); } else { ASSERT(i == rv); } } if (pp == NULL) { break; } if (new_list_back == NULL) { new_list_front = new_list_back = pp; } else { new_list_back->p_next = pp; new_list_back = pp; } pp->p_next = NULL; } cnt = i; locked = balloon_lock_contig_pfnlist(cnt); for (i = 0, pp = new_list_front; i < meta_pg_start; i++, pp = pp->p_next) { reassign_pfn(pp->p_pagenum, mfn_frames[i]); } for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) { reassign_pfn(pp->p_pagenum, mfn_frames[i]); } if (locked) unlock_contig_pfnlist(); /* * Make sure we don't allow pages without pfn->mfn mappings * into the system. */ ASSERT(pp == NULL); while (new_list_front != NULL) { pp = new_list_front; new_list_front = pp->p_next; page_free(pp, 1); } /* * Variable review: at this point, rv contains the number of pages * the hypervisor gave us. cnt contains the number of pages for which * we had page_t structures. i contains the number of pages * where we set up pfn <-> mfn mappings. If this ASSERT trips, that * means we somehow lost page_t's from our local list. */ ASSERT(cnt == i); if (cnt < rv) { /* * We couldn't get page structures. * * This shouldn't happen, but causes no real harm if it does. * On debug kernels, we'll flag it. On all kernels, we'll * give back the pages we couldn't assign. * * Since these pages are new to the system and haven't been * used, we don't bother zeroing them. */ #ifdef DEBUG cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv); #endif /* DEBUG */ (void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL, NULL); rv = cnt; } xen_allow_migrate(); page_unresv(rv - (meta_pg_end - meta_pg_start)); return (rv); } /* * This function is called when we want to decrease the memory reservation * of our domain. Allocate the memory and make a hypervisor call to give * it back. */ static spgcnt_t balloon_dec_reservation(ulong_t debit) { int i, locked; long rv; ulong_t request; page_t *pp; bzero(mfn_frames, sizeof (mfn_frames)); bzero(pfn_frames, sizeof (pfn_frames)); if (debit > FRAME_ARRAY_SIZE) { debit = FRAME_ARRAY_SIZE; } request = debit; /* * Don't bother if there isn't a safe amount of kmem left. */ if (kmem_avail() < balloon_minkmem) { kmem_reap(); if (kmem_avail() < balloon_minkmem) return (0); } if (page_resv(request, KM_NOSLEEP) == 0) { return (0); } xen_block_migrate(); for (i = 0; i < debit; i++) { pp = page_get_high_mfn(new_high_mfn); new_high_mfn = 0; if (pp == NULL) { /* * Call kmem_reap(), then try once more, * but only if there is a safe amount of * kmem left. */ kmem_reap(); if (kmem_avail() < balloon_minkmem || (pp = page_get_high_mfn(0)) == NULL) { debit = i; break; } } ASSERT(PAGE_EXCL(pp)); ASSERT(!hat_page_is_mapped(pp)); balloon_page_add(pp); pfn_frames[i] = pp->p_pagenum; mfn_frames[i] = pfn_to_mfn(pp->p_pagenum); } if (debit == 0) { xen_allow_migrate(); page_unresv(request); return (0); } /* * We zero all the pages before we start reassigning them in order to * minimize the time spent holding the lock on the contig pfn list. */ if (balloon_zero_memory) { for (i = 0; i < debit; i++) { pfnzero(pfn_frames[i], 0, PAGESIZE); } } /* * Remove all mappings for the pfns from the system */ locked = balloon_lock_contig_pfnlist(debit); for (i = 0; i < debit; i++) { reassign_pfn(pfn_frames[i], MFN_INVALID); } if (locked) unlock_contig_pfnlist(); rv = balloon_free_pages(debit, mfn_frames, NULL, NULL); if (rv < 0) { cmn_err(CE_WARN, "Attempt to return pages to the hypervisor " "failed - up to %lu pages lost (error = %ld)", debit, rv); rv = 0; } else if (rv != debit) { panic("Unexpected return value (%ld) from decrease reservation " "hypervisor call", rv); } xen_allow_migrate(); if (debit != request) page_unresv(request - debit); return (rv); } /* * This function is the callback which is called when the memory/target * node is changed. When it is fired, we will read a new reservation * target for our domain and signal the worker thread to make the change. * * If the reservation is larger than we can handle, we issue a warning. dom0 * does this automatically every boot, so we skip the first warning on dom0. */ /*ARGSUSED*/ static void balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len) { ulong_t new_target_kb; pgcnt_t new_target_pages; int rv; static uchar_t warning_cnt = 0; rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb); if (rv != 0) { return; } /* new_target is in kB - change this to pages */ new_target_pages = kbtop(new_target_kb); DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages); /* * Unfortunately, dom0 may give us a target that is larger than * our max limit. Re-check the limit, and, if the new target is * too large, adjust it downwards. */ mutex_enter(&bln_mutex); if (new_target_pages > bln_stats.bln_max_pages) { DTRACE_PROBE2(balloon__target__too__large, pgcnt_t, new_target_pages, pgcnt_t, bln_stats.bln_max_pages); if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) { cmn_err(CE_WARN, "New balloon target (0x%lx pages) is " "larger than original memory size (0x%lx pages). " "Ballooning beyond original memory size is not " "allowed.", new_target_pages, bln_stats.bln_max_pages); } warning_cnt = 1; bln_stats.bln_new_target = bln_stats.bln_max_pages; } else { bln_stats.bln_new_target = new_target_pages; } mutex_exit(&bln_mutex); cv_signal(&bln_cv); } /* * bln_wait_sec can be used to throttle the hv calls, but by default it's * turned off. If a balloon attempt fails, the wait time is forced on, and * then is exponentially increased as further attempts fail. */ uint_t bln_wait_sec = 0; uint_t bln_wait_shift = 1; /* * This is the main balloon thread. Wait on the cv. When woken, if our * reservation has changed, call the appropriate function to adjust the * reservation. */ static void balloon_worker_thread(void) { uint_t bln_wait; callb_cpr_t cprinfo; spgcnt_t rv; bln_wait = bln_wait_sec; CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon"); for (;;) { rv = 0; mutex_enter(&bln_mutex); CALLB_CPR_SAFE_BEGIN(&cprinfo); if (bln_stats.bln_new_target != bln_stats.bln_current_pages) { /* * We weren't able to fully complete the request * last time through, so try again. */ (void) cv_reltimedwait(&bln_cv, &bln_mutex, (bln_wait * hz), TR_CLOCK_TICK); } else { cv_wait(&bln_cv, &bln_mutex); } CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex); if (bln_stats.bln_new_target != bln_stats.bln_current_pages) { if (bln_stats.bln_new_target < bln_stats.bln_current_pages) { /* reservation shrunk */ rv = -balloon_dec_reservation( bln_stats.bln_current_pages - bln_stats.bln_new_target); } else if (bln_stats.bln_new_target > bln_stats.bln_current_pages) { /* reservation grew */ rv = balloon_inc_reservation( bln_stats.bln_new_target - bln_stats.bln_current_pages); } } if (rv == 0) { if (bln_wait == 0) { bln_wait = 1; } else { bln_wait <<= bln_wait_shift; } } else { bln_stats.bln_current_pages += rv; bln_wait = bln_wait_sec; } if (bln_stats.bln_current_pages < bln_stats.bln_low) bln_stats.bln_low = bln_stats.bln_current_pages; else if (bln_stats.bln_current_pages > bln_stats.bln_high) bln_stats.bln_high = bln_stats.bln_current_pages; mutex_exit(&bln_mutex); } } /* * Called after balloon_init(), which is below. The xenbus thread is up * and running, so we can register our watch and create the balloon thread. */ static void balloon_config_watch(int state) { if (state != XENSTORE_UP) return; bln_watch.node = "memory/target"; bln_watch.callback = balloon_handler; if (register_xenbus_watch(&bln_watch)) { cmn_err(CE_WARN, "Failed to register balloon watcher; balloon " "thread will be disabled"); return; } if (bln_thread == NULL) bln_thread = thread_create(NULL, 0, balloon_worker_thread, NULL, 0, &p0, TS_RUN, minclsyspri); } /* * Basic initialization of the balloon thread. Set all of our variables, * and register a callback for later when we can register a xenbus watch. */ void balloon_init(pgcnt_t nr_pages) { domid_t domid = DOMID_SELF; bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages; bln_stats.bln_new_target = bln_stats.bln_high = nr_pages; bln_stats.bln_max_pages = nr_pages; cv_init(&bln_cv, NULL, CV_DEFAULT, NULL); bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op( XENMEM_maximum_reservation, &domid); (void) xs_register_xenbus_callback(balloon_config_watch); } /* * These functions are called from the network drivers when they gain a page * or give one away. We simply update our count. Note that the counter * tracks the number of pages we give away, so we need to subtract any * amount passed to balloon_drv_added. */ void balloon_drv_added(int64_t delta) { atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta); } void balloon_drv_subtracted(int64_t delta) { atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta); } /* * balloon_alloc_pages() * Allocate page_cnt mfns. mfns storage provided by the caller. Returns * the number of pages allocated, which could be less than page_cnt, or * a negative number if an error occurred. */ long balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns) { xen_memory_reservation_t memres; long rv; bzero(&memres, sizeof (memres)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memres.extent_start, mfns); memres.domid = DOMID_SELF; memres.nr_extents = page_cnt; rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); if (rv > 0) atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv); return (rv); } /* * balloon_free_pages() * free page_cnt pages, using any combination of mfns, pfns, and kva as long * as they refer to the same mapping. If an array of mfns is passed in, we * assume they were already cleared. Otherwise, we need to zero the pages * before giving them back to the hypervisor. kva space is not free'd up in * case the caller wants to re-use it. */ long balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns) { xen_memory_reservation_t memdec; mfn_t mfn; pfn_t pfn; uint_t i; long e; #if DEBUG /* make sure kva is page aligned and maps to first pfn */ if (kva != NULL) { ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0); if (pfns != NULL) { ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]); } } #endif /* if we have a kva, we can clean all pages with just one bzero */ if ((kva != NULL) && balloon_zero_memory) { bzero(kva, (page_cnt * PAGESIZE)); } /* if we were given a kva and/or a pfn */ if ((kva != NULL) || (pfns != NULL)) { /* * All the current callers only pass 1 page when using kva or * pfns, and use mfns when passing multiple pages. If that * assumption is changed, the following code will need some * work. The following ASSERT() guarantees we're respecting * the io locking quota. */ ASSERT(page_cnt < bln_contig_list_quota); /* go through all the pages */ for (i = 0; i < page_cnt; i++) { /* get the next pfn */ if (pfns == NULL) { pfn = hat_getpfnum(kas.a_hat, (kva + (PAGESIZE * i))); } else { pfn = pfns[i]; } /* * if we didn't already zero this page, do it now. we * need to do this *before* we give back the MFN */ if ((kva == NULL) && (balloon_zero_memory)) { pfnzero(pfn, 0, PAGESIZE); } /* * unmap the pfn. We don't free up the kva vmem space * so the caller can re-use it. The page must be * unmapped before it is given back to the hypervisor. */ if (kva != NULL) { hat_unload(kas.a_hat, (kva + (PAGESIZE * i)), PAGESIZE, HAT_UNLOAD_UNMAP); } /* grab the mfn before the pfn is marked as invalid */ mfn = pfn_to_mfn(pfn); /* mark the pfn as invalid */ reassign_pfn(pfn, MFN_INVALID); /* * if we weren't given an array of MFNs, we need to * free them up one at a time. Otherwise, we'll wait * until later and do it in one hypercall */ if (mfns == NULL) { bzero(&memdec, sizeof (memdec)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memdec.extent_start, &mfn); memdec.domid = DOMID_SELF; memdec.nr_extents = 1; e = HYPERVISOR_memory_op( XENMEM_decrease_reservation, &memdec); if (e != 1) { cmn_err(CE_PANIC, "balloon: unable to " "give a page back to the " "hypervisor.\n"); } } } } /* * if we were passed in MFNs, we haven't free'd them up yet. We can * do it with one call. */ if (mfns != NULL) { bzero(&memdec, sizeof (memdec)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memdec.extent_start, mfns); memdec.domid = DOMID_SELF; memdec.nr_extents = page_cnt; e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec); if (e != page_cnt) { cmn_err(CE_PANIC, "balloon: unable to give pages back " "to the hypervisor.\n"); } } atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt); return (page_cnt); } /* * balloon_replace_pages() * Try to replace nextexts blocks of 2^order pages. addr_bits specifies * how many bits of address the pages must be within (i.e. 16 would mean * that the pages cannot have an address > 64k). The constrints are on * what the hypervisor gives us -- we are free to give any pages in * exchange. The array pp is the pages we are giving away. The caller * provides storage space for mfns, which hold the new physical pages. */ long balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits, uint_t order, mfn_t *mfns) { xen_memory_reservation_t memres; long fallback_cnt; long cnt; uint_t i, j, page_cnt, extlen; long e; int locked; /* * we shouldn't be allocating constrained pages on a guest. It doesn't * make any sense. They won't be constrained after a migration. */ ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); extlen = 1 << order; page_cnt = nextents * extlen; /* Give back the current pages to the hypervisor */ for (i = 0; i < page_cnt; i++) { cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum); if (cnt != 1) { cmn_err(CE_PANIC, "balloon: unable to give a page back " "to the hypervisor.\n"); } } /* * try to allocate the new pages using addr_bits and order. If we can't * get all of the pages, try to get the remaining pages with no * constraints and, if that was successful, return the number of * constrained pages we did allocate. */ bzero(&memres, sizeof (memres)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memres.extent_start, mfns); memres.domid = DOMID_SELF; memres.nr_extents = nextents; memres.mem_flags = XENMEMF_address_bits(addr_bits); memres.extent_order = order; cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); /* assign the new MFNs to the current PFNs */ locked = balloon_lock_contig_pfnlist(cnt * extlen); for (i = 0; i < cnt; i++) { for (j = 0; j < extlen; j++) { reassign_pfn(pp[i * extlen + j]->p_pagenum, mfns[i] + j); } } if (locked) unlock_contig_pfnlist(); if (cnt != nextents) { if (cnt < 0) { cnt = 0; } /* * We couldn't get enough memory to satisfy our requirements. * The above loop will assign the parts of the request that * were successful (this part may be 0). We need to fill * in the rest. The bzero below clears out extent_order and * address_bits, so we'll take anything from the hypervisor * to replace the pages we gave away. */ fallback_cnt = page_cnt - cnt * extlen; bzero(&memres, sizeof (memres)); /*LINTED: constant in conditional context*/ set_xen_guest_handle(memres.extent_start, mfns); memres.domid = DOMID_SELF; memres.nr_extents = fallback_cnt; e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); if (e != fallback_cnt) { cmn_err(CE_PANIC, "balloon: unable to recover from " "failed increase_reservation.\n"); } locked = balloon_lock_contig_pfnlist(fallback_cnt); for (i = 0; i < fallback_cnt; i++) { uint_t offset = page_cnt - fallback_cnt; /* * We already used pp[0...(cnt * extlen)] before, * so start at the next entry in the pp array. */ reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]); } if (locked) unlock_contig_pfnlist(); } /* * balloon_free_pages increments our counter. Decrement it here. */ atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt); /* * return the number of extents we were able to replace. If we got * this far, we know all the pp's are valid. */ return (cnt); } /* * Called from the driver - return the requested stat. */ size_t balloon_values(int cmd) { switch (cmd) { case BLN_IOCTL_CURRENT: return (ptokb(bln_stats.bln_current_pages)); case BLN_IOCTL_TARGET: return (ptokb(bln_stats.bln_new_target)); case BLN_IOCTL_LOW: return (ptokb(bln_stats.bln_low)); case BLN_IOCTL_HIGH: return (ptokb(bln_stats.bln_high)); case BLN_IOCTL_LIMIT: return (ptokb(bln_stats.bln_hard_limit)); default: panic("Unexpected cmd %d in balloon_values()\n", cmd); } /*NOTREACHED*/ }