/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/balloon_impl.h>
#include <sys/hypervisor.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/callb.h>
#include <xen/public/memory.h>
#include <vm/hat.h>
#include <sys/promif.h>
#include <vm/seg_kmem.h>
#include <sys/memnode.h>
#include <sys/param.h>
#include <vm/vm_dep.h>
#include <sys/mman.h>
#include <sys/memlist.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/sdt.h>

/*
 * This file implements a balloon thread, which controls a domain's memory
 * reservation, or the amount of memory a domain is currently allocated.
 * The hypervisor provides the current memory reservation through xenbus,
 * so we register a watch on this.  We will then be signalled when the
 * reservation changes.  If it goes up, we map the new mfn's to our pfn's
 * (allocating page_t's if necessary), and release them into the system.
 * If the reservation goes down, we grab pages and release them back to
 * the hypervisor, saving the page_t's for later use.
 */
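
/*
 * In outline: the xenstore watch on "memory/target" fires, balloon_handler()
 * records the new target and signals bln_cv, and balloon_worker_thread()
 * wakes up and calls balloon_inc_reservation() or balloon_dec_reservation()
 * until bln_current_pages reaches the target.
 */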

/*
 * Various structures needed by the balloon thread
 */
static bln_stats_t bln_stats;
static kthread_t *bln_thread;
static kmutex_t bln_mutex;
static kcondvar_t bln_cv;
static struct xenbus_watch bln_watch;
static mfn_t new_high_mfn;

/*
 * For holding spare page_t structures - keep a singly-linked list.
 * The list may hold both valid (pagenum < mfn_count) and invalid
 * (pagenum >= mfn_count) page_t's.  Valid page_t's should be inserted
 * at the front, and invalid page_t's at the back.  Removal should
 * always be from the front.  This is a singly-linked list using
 * p_next, so p_prev is always NULL.
 */
static page_t *bln_spare_list_front, *bln_spare_list_back;

int balloon_zero_memory = 1;
size_t balloon_minkmem = (8 * 1024 * 1024);

/*
 * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
 * slowdown when calling multiple times.  If we're reassigning less than the
 * quota defined here, we just accept the slowdown.  If the count is greater
 * than the quota, we tell the contig alloc code to stop its accounting until
 * we're done.  Setting the quota to less than 2 is not supported.
 *
 * Note that we define our own wrapper around the external
 * clear_and_lock_contig_pfnlist(), but we just use the version of
 * unlock_contig_pfnlist() in vm_machdep.c.
 */
uint_t bln_contig_list_quota = 50;

extern void clear_and_lock_contig_pfnlist(void);
extern void unlock_contig_pfnlist(void);

/*
 * Lock the pfnlist if necessary (see above), and return whether we locked it.
 */
static int
balloon_lock_contig_pfnlist(int count)
{
	if (count > bln_contig_list_quota) {
		clear_and_lock_contig_pfnlist();
		return (1);
	} else {
		return (0);
	}
}

/*
 * The page represented by pp is being given back to the hypervisor.
 * Add the page_t structure to our spare list.
 */
static void
balloon_page_add(page_t *pp)
{
	/*
	 * We need to keep the page exclusively locked
	 * to prevent swrand from grabbing it.
	 */
	ASSERT(PAGE_EXCL(pp));
	ASSERT(MUTEX_HELD(&bln_mutex));

	pp->p_prev = NULL;
	if (bln_spare_list_front == NULL) {
		bln_spare_list_front = bln_spare_list_back = pp;
		pp->p_next = NULL;
	} else if (pp->p_pagenum >= mfn_count) {
		/*
		 * The pfn is invalid, so add at the end of list.  Since these
		 * adds should *only* be done by balloon_init_new_pages(), and
		 * that does adds in order, the following ASSERT should
		 * never trigger.
		 */
		ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
		bln_spare_list_back->p_next = pp;
		pp->p_next = NULL;
		bln_spare_list_back = pp;
	} else {
		/* Add at beginning of list */
		pp->p_next = bln_spare_list_front;
		bln_spare_list_front = pp;
	}
}

/*
 * Return a page_t structure from our spare list, or NULL if none are
 * available.
 */
static page_t *
balloon_page_sub(void)
{
	page_t *pp;

	ASSERT(MUTEX_HELD(&bln_mutex));
	if (bln_spare_list_front == NULL) {
		return (NULL);
	}

	pp = bln_spare_list_front;
	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_pagenum <= mfn_count);
	if (pp->p_pagenum == mfn_count) {
		return (NULL);
	}

	bln_spare_list_front = pp->p_next;
	if (bln_spare_list_front == NULL)
		bln_spare_list_back = NULL;
	pp->p_next = NULL;
	return (pp);
}

/*
 * NOTE: We currently do not support growing beyond the boot memory size,
 * so the following function will not be called.  It is left in here with
 * the hope that someday this restriction can be lifted, and this code can
 * be used.
 */

/*
 * This structure is placed at the start of every block of new pages
 */
typedef struct {
	struct memseg memseg;
	struct memlist memlist;
	page_t pages[1];
} mem_structs_t;

/*
 * To make the math below slightly less confusing, we calculate the first
 * two parts here.  page_t's are handled separately, so they are not included.
 */
#define	MEM_STRUCT_SIZE	(sizeof (struct memseg) + sizeof (struct memlist))

/*
 * We want to add memory, but have no spare page_t structures.  Use some of
 * our new memory for the page_t structures.
 *
 * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
 */
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
	pgcnt_t metapgs, totalpgs, num_pages;
	paddr_t metasz;
	pfn_t meta_start;
	page_t *page_array;
	caddr_t va;
	int i, rv, locked;
	mem_structs_t *mem;
	struct memseg *segp;

	/* Calculate the number of pages we're going to add */
	totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;

	/*
	 * The following calculates the number of "meta" pages -- the pages
	 * that will be required to hold page_t structures for all new pages.
	 * The derivation is sketched below.
	 */
	metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));
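
	/*
	 * To see why: if d of the totalpgs pages remain usable, their page_t
	 * structures must fit in the other (totalpgs - d) meta pages:
	 *
	 *	(totalpgs - d) * PAGESIZE >= d * sizeof (page_t)
	 *
	 * Solving with equality gives
	 * d = (totalpgs * PAGESIZE) / (PAGESIZE + sizeof (page_t)), and
	 * metapgs = totalpgs - d, which is the expression above.
	 */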

	/*
	 * Given the number of page_t structures we need, is there also
	 * room in our meta pages for a memseg and memlist struct?
	 * If not, we'll need one more meta page.
	 */
	if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
	    MEM_STRUCT_SIZE))
		metapgs++;

	/*
	 * metapgs is calculated from totalpgs, which may be much larger than
	 * count.  If we don't have enough pages, all of the pages in this
	 * batch will be made meta pages, and a future trip through
	 * balloon_inc_reservation() will add the rest of the meta pages.
	 */
	if (metapgs > count)
		metapgs = count;

	/*
	 * Figure out the number of page_t structures that can fit in metapgs
	 *
	 * This will cause us to initialize more page_t structures than we
	 * need - these may be used in future memory increases.
	 */
	metasz = pfn_to_pa(metapgs);
	num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);

	DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
	    num_pages, pgcnt_t, metapgs);

	/*
	 * We only increment mfn_count by count, not num_pages, to keep the
	 * space of all valid pfns contiguous.  This means we create page_t
	 * structures with invalid pagenums -- we deal with this situation
	 * in balloon_page_sub.
	 */
	mfn_count += count;

	/*
	 * Get a VA for the pages that will hold page_t and other structures.
	 * The memseg and memlist structures will go at the beginning, with
	 * the page_t structures following.
	 */
	va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
	/* LINTED: improper alignment */
	mem = (mem_structs_t *)va;
	page_array = mem->pages;

	meta_start = bln_stats.bln_max_pages;

	/*
	 * Set the mfn to pfn mapping for the meta pages.
	 */
	locked = balloon_lock_contig_pfnlist(metapgs);
	for (i = 0; i < metapgs; i++) {
		reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * For our meta pages, map them in and zero the page.
	 * This will be the first time touching the new pages.
	 */
	hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
	    PROT_READ | PROT_WRITE,
	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
	bzero(va, metasz);

	/*
	 * Initialize the page array for the new pages.
	 */
	for (i = 0; i < metapgs; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
	}

	/*
	 * For the rest of the pages, initialize the page_t struct and
	 * add them to the spare list.
	 */
	for (i = metapgs; i < num_pages; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
		balloon_page_add(&page_array[i]);
	}

	/*
	 * Remember where I said that we don't call this function?  The missing
	 * code right here is why.  We need to set up kpm mappings for any new
	 * pages coming in.  However, if someone starts up a domain with small
	 * memory, then greatly increases it, we could get in some horrible
	 * deadlock situations as we steal page tables for kpm use, and
	 * userland applications take them right back before we can use them
	 * to set up our new memory.  Once a way around that is found, and a
	 * few other changes are made, we'll be able to enable this code.
	 */

	/*
	 * Update kernel structures, part 1: memsegs list
	 */
	mem->memseg.pages_base = meta_start;
	mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
	mem->memseg.pages = &page_array[0];
	mem->memseg.epages = &page_array[num_pages - 1];
	mem->memseg.next = NULL;
	memsegs_lock(1);
	for (segp = memsegs; segp->next != NULL; segp = segp->next)
		;
	segp->next = &mem->memseg;
	memsegs_unlock(1);

	/*
	 * Update kernel structures, part 2: mem_node array
	 */
	mem_node_add_slice(meta_start, bln_stats.bln_max_pages);

	/*
	 * Update kernel structures, part 3: phys_install array
	 * (*sigh* how many of these things do we need?)
	 */
	memlist_write_lock();
	memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
	    &phys_install);
	memlist_write_unlock();

	build_pfn_hash();

	return (metapgs);
}

/* How many ulong_t's can we fit on a page? */
#define	FRAME_ARRAY_SIZE	(PAGESIZE / sizeof (ulong_t))
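
/*
 * With 4K pages and an 8-byte ulong_t, for example, this is 512 entries,
 * so a single balloon_inc_reservation() or balloon_dec_reservation() pass
 * moves at most 512 pages (2MB) per hypervisor call.
 */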

/*
 * These are too large to declare on the stack, so we make them static instead
 */
static ulong_t mfn_frames[FRAME_ARRAY_SIZE];
static pfn_t pfn_frames[FRAME_ARRAY_SIZE];

/*
 * This function is called when our reservation is increasing.  Make a
 * hypervisor call to get our new pages, then integrate them into the system.
 */
static spgcnt_t
balloon_inc_reservation(ulong_t credit)
{
	int i, cnt, locked;
	int meta_pg_start, meta_pg_end;
	long rv;
	page_t *pp;
	page_t *new_list_front, *new_list_back;

	/* Make sure we're single-threaded. */
	ASSERT(MUTEX_HELD(&bln_mutex));

	rv = 0;
	new_list_front = new_list_back = NULL;
	meta_pg_start = meta_pg_end = 0;
	bzero(mfn_frames, PAGESIZE);

	if (credit > FRAME_ARRAY_SIZE)
		credit = FRAME_ARRAY_SIZE;

	xen_block_migrate();
	rv = balloon_alloc_pages(credit, mfn_frames);

	if (rv < 0) {
		xen_allow_migrate();
		return (0);
	}
	for (i = 0; i < rv; i++) {
		if (mfn_frames[i] > new_high_mfn)
			new_high_mfn = mfn_frames[i];

		pp = balloon_page_sub();
		if (pp == NULL) {
			/*
			 * We pass the index into the current mfn array,
			 * then move the counter past the mfns we used.
			 */
			meta_pg_start = i;
			cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
			i += cnt;
			meta_pg_end = i;
			if (i < rv) {
				pp = balloon_page_sub();
			} else {
				ASSERT(i == rv);
			}
		}
		if (pp == NULL) {
			break;
		}

		if (new_list_back == NULL) {
			new_list_front = new_list_back = pp;
		} else {
			new_list_back->p_next = pp;
			new_list_back = pp;
		}
		pp->p_next = NULL;
	}
	cnt = i;
	locked = balloon_lock_contig_pfnlist(cnt);
	for (i = 0, pp = new_list_front; i < meta_pg_start;
	    i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * Make sure we don't allow pages without pfn->mfn mappings
	 * into the system.
	 */
	ASSERT(pp == NULL);

	while (new_list_front != NULL) {
		pp = new_list_front;
		new_list_front = pp->p_next;
		page_free(pp, 1);
	}

	/*
	 * Variable review: at this point, rv contains the number of pages
	 * the hypervisor gave us.  cnt contains the number of pages for which
	 * we had page_t structures.  i contains the number of pages
	 * where we set up pfn <-> mfn mappings.  If this ASSERT trips, that
	 * means we somehow lost page_t's from our local list.
	 */
	ASSERT(cnt == i);
	if (cnt < rv) {
		/*
		 * We couldn't get page structures.
		 *
		 * This shouldn't happen, but causes no real harm if it does.
		 * On debug kernels, we'll flag it.  On all kernels, we'll
		 * give back the pages we couldn't assign.
		 *
		 * Since these pages are new to the system and haven't been
		 * used, we don't bother zeroing them.
		 */
#ifdef DEBUG
		cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv);
#endif	/* DEBUG */

		(void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL, NULL);

		rv = cnt;
	}

	xen_allow_migrate();
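
	/*
	 * Credit the pages we put on the free list back against the
	 * page_resv() reservation taken in balloon_dec_reservation();
	 * the meta pages never reached the free list, so exclude them.
	 */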
	page_unresv(rv - (meta_pg_end - meta_pg_start));
	return (rv);
}

/*
 * This function is called when we want to decrease the memory reservation
 * of our domain.  Allocate the memory and make a hypervisor call to give
 * it back.
 */
static spgcnt_t
balloon_dec_reservation(ulong_t debit)
{
	int i, locked;
	long rv;
	ulong_t request;
	page_t *pp;

	bzero(mfn_frames, sizeof (mfn_frames));
	bzero(pfn_frames, sizeof (pfn_frames));

	if (debit > FRAME_ARRAY_SIZE) {
		debit = FRAME_ARRAY_SIZE;
	}
	request = debit;

	/*
	 * Don't bother if there isn't a safe amount of kmem left.
	 */
	if (kmem_avail() < balloon_minkmem) {
		kmem_reap();
		if (kmem_avail() < balloon_minkmem)
			return (0);
	}

	if (page_resv(request, KM_NOSLEEP) == 0) {
		return (0);
	}
	xen_block_migrate();
	for (i = 0; i < debit; i++) {
		pp = page_get_high_mfn(new_high_mfn);
		new_high_mfn = 0;
		if (pp == NULL) {
			/*
			 * Call kmem_reap(), then try once more,
			 * but only if there is a safe amount of
			 * kmem left.
			 */
			kmem_reap();
			if (kmem_avail() < balloon_minkmem ||
			    (pp = page_get_high_mfn(0)) == NULL) {
				debit = i;
				break;
			}
		}
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!hat_page_is_mapped(pp));

		balloon_page_add(pp);
		pfn_frames[i] = pp->p_pagenum;
		mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
	}
	if (debit == 0) {
		xen_allow_migrate();
		page_unresv(request);
		return (0);
	}

	/*
	 * We zero all the pages before we start reassigning them in order to
	 * minimize the time spent holding the lock on the contig pfn list.
	 */
	if (balloon_zero_memory) {
		for (i = 0; i < debit; i++) {
			pfnzero(pfn_frames[i], 0, PAGESIZE);
		}
	}
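
	/*
	 * Zeroing also ensures we never hand pages that still hold this
	 * domain's data back to the hypervisor.
	 */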

	/*
	 * Remove all mappings for the pfns from the system
	 */
	locked = balloon_lock_contig_pfnlist(debit);
	for (i = 0; i < debit; i++) {
		reassign_pfn(pfn_frames[i], MFN_INVALID);
	}
	if (locked)
		unlock_contig_pfnlist();

	rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);

	if (rv < 0) {
		cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
		    "failed - up to %lu pages lost (error = %ld)", debit, rv);
		rv = 0;
	} else if (rv != debit) {
		panic("Unexpected return value (%ld) from decrease "
		    "reservation hypervisor call", rv);
	}

	xen_allow_migrate();
	if (debit != request)
		page_unresv(request - debit);
	return (rv);
}

/*
 * This function is the callback which is called when the memory/target
 * node is changed.  When it is fired, we will read a new reservation
 * target for our domain and signal the worker thread to make the change.
 *
 * If the reservation is larger than we can handle, we issue a warning.  dom0
 * does this automatically every boot, so we skip the first warning on dom0.
 */
/*ARGSUSED*/
static void
balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	ulong_t new_target_kb;
	pgcnt_t new_target_pages;
	int rv;
	static uchar_t warning_cnt = 0;

	rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb);
	if (rv != 0) {
		return;
	}

	/* new_target is in kB - change this to pages */
	new_target_pages = kbtop(new_target_kb);

	DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);

	/*
	 * Unfortunately, dom0 may give us a target that is larger than
	 * our max limit.  Re-check the limit, and, if the new target is
	 * too large, adjust it downwards.
	 */
	mutex_enter(&bln_mutex);
	if (new_target_pages > bln_stats.bln_max_pages) {
		DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
		    new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
		if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
			cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
			    "larger than original memory size (0x%lx pages). "
			    "Ballooning beyond original memory size is not "
			    "allowed.",
			    new_target_pages, bln_stats.bln_max_pages);
		}
		warning_cnt = 1;
		bln_stats.bln_new_target = bln_stats.bln_max_pages;
	} else {
		bln_stats.bln_new_target = new_target_pages;
	}

	mutex_exit(&bln_mutex);
	cv_signal(&bln_cv);
}

/*
 * bln_wait_sec can be used to throttle the hv calls, but by default it's
 * turned off.  If a balloon attempt fails, the wait time is forced on, and
 * then is exponentially increased as further attempts fail.
 */
uint_t bln_wait_sec = 0;
uint_t bln_wait_shift = 1;

/*
 * This is the main balloon thread.  Wait on the cv.  When woken, if our
 * reservation has changed, call the appropriate function to adjust the
 * reservation.
 */
static void
balloon_worker_thread(void)
{
	uint_t bln_wait;
	callb_cpr_t cprinfo;
	spgcnt_t rv;

	bln_wait = bln_wait_sec;

	CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
	for (;;) {
		rv = 0;

		mutex_enter(&bln_mutex);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
			/*
			 * We weren't able to fully complete the request
			 * last time through, so try again.
			 */
			(void) cv_timedwait(&bln_cv, &bln_mutex,
			    lbolt + (bln_wait * hz));
		} else {
			cv_wait(&bln_cv, &bln_mutex);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);

		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
			if (bln_stats.bln_new_target <
			    bln_stats.bln_current_pages) {
				/* reservation shrunk */
				rv = -balloon_dec_reservation(
				    bln_stats.bln_current_pages -
				    bln_stats.bln_new_target);
			} else if (bln_stats.bln_new_target >
			    bln_stats.bln_current_pages) {
				/* reservation grew */
				rv = balloon_inc_reservation(
				    bln_stats.bln_new_target -
				    bln_stats.bln_current_pages);
			}
		}
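		/*
		 * If no pages moved, force the retry delay on and grow it
		 * exponentially; with the default bln_wait_shift of 1, the
		 * wait doubles after each failed attempt (1, 2, 4, 8, ...
		 * seconds).
		 */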
		if (rv == 0) {
			if (bln_wait == 0) {
				bln_wait = 1;
			} else {
				bln_wait <<= bln_wait_shift;
			}
		} else {
			bln_stats.bln_current_pages += rv;
			bln_wait = bln_wait_sec;
		}
		if (bln_stats.bln_current_pages < bln_stats.bln_low)
			bln_stats.bln_low = bln_stats.bln_current_pages;
		else if (bln_stats.bln_current_pages > bln_stats.bln_high)
			bln_stats.bln_high = bln_stats.bln_current_pages;
		mutex_exit(&bln_mutex);
	}
}

/*
 * Called after balloon_init(), which is below.  The xenbus thread is up
 * and running, so we can register our watch and create the balloon thread.
 */
static void
balloon_config_watch(int state)
{
	if (state != XENSTORE_UP)
		return;

	bln_watch.node = "memory/target";
	bln_watch.callback = balloon_handler;
	if (register_xenbus_watch(&bln_watch)) {
		cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
		    "thread will be disabled");
		return;
	}

	if (bln_thread == NULL)
		bln_thread = thread_create(NULL, 0, balloon_worker_thread,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * Basic initialization of the balloon thread.  Set all of our variables,
 * and register a callback for later when we can register a xenbus watch.
 */
void
balloon_init(pgcnt_t nr_pages)
{
	domid_t domid = DOMID_SELF;

	bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
	bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
	bln_stats.bln_max_pages = nr_pages;
	cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);

	bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
	    XENMEM_maximum_reservation, &domid);

	(void) xs_register_xenbus_callback(balloon_config_watch);
}

/*
 * These functions are called from the network drivers when they gain a page
 * or give one away.  We simply update our count.  Note that the counter
 * tracks the number of pages we give away, so we need to subtract any
 * amount passed to balloon_drv_added.
 */
void
balloon_drv_added(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
}

void
balloon_drv_subtracted(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
}

/*
 * balloon_alloc_pages()
 *	Allocate page_cnt mfns.  mfns storage provided by the caller.  Returns
 *	the number of pages allocated, which could be less than page_cnt, or
 *	a negative number if an error occurred.
 */
long
balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long rv;

	bzero(&memres, sizeof (memres));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = page_cnt;

	rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	if (rv > 0)
		atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
	return (rv);
}
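
/*
 * A caller wanting extra machine frames might, for example, do the
 * following (sketch only):
 *
 *	mfn_t mfns[4];
 *	long got = balloon_alloc_pages(4, mfns);
 *
 * and, when got > 0, use mfns[0 .. got-1] before eventually handing them
 * back via balloon_free_pages().
 */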

/*
 * balloon_free_pages()
 *	free page_cnt pages, using any combination of mfns, pfns, and kva as
 *	long as they refer to the same mapping.  If an array of mfns is passed
 *	in, we assume they were already cleared.  Otherwise, we need to zero
 *	the pages before giving them back to the hypervisor.  kva space is not
 *	freed up in case the caller wants to re-use it.
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	xen_memory_reservation_t memdec;
	mfn_t mfn;
	pfn_t pfn;
	uint_t i;
	long e;


#if DEBUG
	/* make sure kva is page aligned and maps to first pfn */
	if (kva != NULL) {
		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
		if (pfns != NULL) {
			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
		}
	}
#endif

	/* if we have a kva, we can clean all pages with just one bzero */
	if ((kva != NULL) && balloon_zero_memory) {
		bzero(kva, (page_cnt * PAGESIZE));
	}

	/* if we were given a kva and/or a pfn */
	if ((kva != NULL) || (pfns != NULL)) {

		/*
		 * All the current callers only pass 1 page when using kva or
		 * pfns, and use mfns when passing multiple pages.  If that
		 * assumption is changed, the following code will need some
		 * work.  The following ASSERT() guarantees we're respecting
		 * the io locking quota.
		 */
		ASSERT(page_cnt < bln_contig_list_quota);

		/* go through all the pages */
		for (i = 0; i < page_cnt; i++) {

			/* get the next pfn */
			if (pfns == NULL) {
				pfn = hat_getpfnum(kas.a_hat,
				    (kva + (PAGESIZE * i)));
			} else {
				pfn = pfns[i];
			}

			/*
			 * If we didn't already zero this page, do it now.  We
			 * need to do this *before* we give back the MFN.
			 */
			if ((kva == NULL) && (balloon_zero_memory)) {
				pfnzero(pfn, 0, PAGESIZE);
			}

			/*
			 * unmap the pfn.  We don't free up the kva vmem space
			 * so the caller can re-use it.  The page must be
			 * unmapped before it is given back to the hypervisor.
			 */
			if (kva != NULL) {
				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
				    PAGESIZE, HAT_UNLOAD_UNMAP);
			}

			/* grab the mfn before the pfn is marked as invalid */
			mfn = pfn_to_mfn(pfn);

			/* mark the pfn as invalid */
			reassign_pfn(pfn, MFN_INVALID);

			/*
			 * if we weren't given an array of MFNs, we need to
			 * free them up one at a time.  Otherwise, we'll wait
			 * until later and do it in one hypercall.
			 */
			if (mfns == NULL) {
				bzero(&memdec, sizeof (memdec));
				/*LINTED: constant in conditional context*/
				set_xen_guest_handle(memdec.extent_start, &mfn);
				memdec.domid = DOMID_SELF;
				memdec.nr_extents = 1;
				e = HYPERVISOR_memory_op(
				    XENMEM_decrease_reservation, &memdec);
				if (e != 1) {
					cmn_err(CE_PANIC, "balloon: unable to "
					    "give a page back to the "
					    "hypervisor.\n");
				}
			}
		}
	}

	/*
	 * if we were passed in MFNs, we haven't freed them up yet.  We can
	 * do it with one call.
	 */
	if (mfns != NULL) {
		bzero(&memdec, sizeof (memdec));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memdec.extent_start, mfns);
		memdec.domid = DOMID_SELF;
		memdec.nr_extents = page_cnt;
		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
		if (e != page_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to give pages back "
			    "to the hypervisor.\n");
		}
	}

	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
	return (page_cnt);
}


/*
 * balloon_replace_pages()
 *	Try to replace nextents blocks of 2^order pages.  addr_bits specifies
 *	how many bits of address the pages must be within (i.e. 16 would mean
 *	that the pages cannot have an address > 64k).  The constraints are on
 *	what the hypervisor gives us -- we are free to give any pages in
 *	exchange.  The array pp is the pages we are giving away.  The caller
 *	provides storage space for mfns, which hold the new physical pages.
 */
long
balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
    uint_t order, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long fallback_cnt;
	long cnt;
	uint_t i, j, page_cnt, extlen;
	long e;
	int locked;


	/*
	 * we shouldn't be allocating constrained pages on a guest.  It doesn't
	 * make any sense.  They won't be constrained after a migration.
	 */
	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));

	extlen = 1 << order;
	page_cnt = nextents * extlen;
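
	/*
	 * For example, nextents = 4 with order = 2 asks for four runs of
	 * 2^2 = 4 contiguous pages, so page_cnt is 16.
	 */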
	/* Give back the current pages to the hypervisor */
	for (i = 0; i < page_cnt; i++) {
		cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
		if (cnt != 1) {
			cmn_err(CE_PANIC, "balloon: unable to give a page back "
			    "to the hypervisor.\n");
		}
	}

	/*
	 * try to allocate the new pages using addr_bits and order.  If we
	 * can't get all of the pages, try to get the remaining pages with no
	 * constraints and, if that was successful, return the number of
	 * constrained pages we did allocate.
	 */
	bzero(&memres, sizeof (memres));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = nextents;
	memres.address_bits = addr_bits;
	memres.extent_order = order;
	cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	/* assign the new MFNs to the current PFNs */
	locked = balloon_lock_contig_pfnlist(cnt * extlen);
	for (i = 0; i < cnt; i++) {
		for (j = 0; j < extlen; j++) {
			reassign_pfn(pp[i * extlen + j]->p_pagenum,
			    mfns[i] + j);
		}
	}
	if (locked)
		unlock_contig_pfnlist();
	if (cnt != nextents) {
		if (cnt < 0) {
			cnt = 0;
		}

		/*
		 * We couldn't get enough memory to satisfy our requirements.
		 * The above loop will assign the parts of the request that
		 * were successful (this part may be 0).  We need to fill
		 * in the rest.  The bzero below clears out extent_order and
		 * address_bits, so we'll take anything from the hypervisor
		 * to replace the pages we gave away.
		 */
		fallback_cnt = page_cnt - cnt * extlen;
		bzero(&memres, sizeof (memres));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memres.extent_start, mfns);
		memres.domid = DOMID_SELF;
		memres.nr_extents = fallback_cnt;
		e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
		if (e != fallback_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to recover from "
			    "failed increase_reservation.\n");
		}
		locked = balloon_lock_contig_pfnlist(fallback_cnt);
		for (i = 0; i < fallback_cnt; i++) {
			uint_t offset = page_cnt - fallback_cnt;

			/*
			 * We already used pp[0...(cnt * extlen)] before,
			 * so start at the next entry in the pp array.
			 */
			reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
		}
		if (locked)
			unlock_contig_pfnlist();
	}

	/*
	 * balloon_free_pages() increments our counter.  Decrement it here.
	 */
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);

	/*
	 * return the number of extents we were able to replace.  If we got
	 * this far, we know all the pp's are valid.
	 */
	return (cnt);
}


/*
 * Called from the driver - return the requested stat.
 */
size_t
balloon_values(int cmd)
{
	switch (cmd) {
	case BLN_IOCTL_CURRENT:
		return (ptokb(bln_stats.bln_current_pages));
	case BLN_IOCTL_TARGET:
		return (ptokb(bln_stats.bln_new_target));
	case BLN_IOCTL_LOW:
		return (ptokb(bln_stats.bln_low));
	case BLN_IOCTL_HIGH:
		return (ptokb(bln_stats.bln_high));
	case BLN_IOCTL_LIMIT:
		return (ptokb(bln_stats.bln_hard_limit));
	default:
		panic("Unexpected cmd %d in balloon_values()\n", cmd);
	}
	/*NOTREACHED*/
}