/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/balloon_impl.h>
#include <sys/hypervisor.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/callb.h>
#include <xen/public/memory.h>
#include <vm/hat.h>
#include <sys/promif.h>
#include <vm/seg_kmem.h>
#include <sys/memnode.h>
#include <sys/param.h>
#include <vm/vm_dep.h>
#include <sys/mman.h>
#include <sys/memlist.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/sdt.h>

/*
 * This file implements a balloon thread, which controls a domain's memory
 * reservation, or the amount of memory a domain is currently allocated.
 * The hypervisor provides the current memory reservation through xenbus,
 * so we register a watch on this.  We will then be signalled when the
 * reservation changes.  If it goes up, we map the new mfn's to our pfn's
 * (allocating page_t's if necessary), and release them into the system.
 * If the reservation goes down, we grab pages and release them back to
 * the hypervisor, saving the page_t's for later use.
 */

/*
 * Various structures needed by the balloon thread
 */
static bln_stats_t bln_stats;
static kthread_t *bln_thread;
static kmutex_t bln_mutex;
static kcondvar_t bln_cv;
static struct xenbus_watch bln_watch;
static mfn_t new_high_mfn;

/*
 * For holding spare page_t structures - keep a singly-linked list.
 * The list may hold both valid (pagenum < mfn_count) and invalid
 * (pagenum >= mfn_count) page_t's.  Valid page_t's should be inserted
 * at the front, and invalid page_t's at the back.  Removal should
 * always be from the front.  This is a singly-linked list using
 * p_next, so p_prev is always NULL.
 */
static page_t *bln_spare_list_front, *bln_spare_list_back;

int balloon_zero_memory = 1;
size_t balloon_minkmem = (8 * 1024 * 1024);
static caddr_t balloon_kva;
static kmutex_t balloon_kva_mutex;
static void balloon_zero_page(pfn_t pfn);

/*
 * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
 * slowdown when calling multiple times.  If we're reassigning less than the
 * quota defined here, we just accept the slowdown.  If the count is greater
 * than the quota, we tell the contig alloc code to stop its accounting until
 * we're done.  Setting the quota to less than 2 is not supported.
 *
 * Note that we define our own wrapper around the external
 * clear_and_lock_contig_pfnlist(), but we just use the version of
 * unlock_contig_pfnlist() in vm_machdep.c.
 */
uint_t bln_contig_list_quota = 50;

extern void clear_and_lock_contig_pfnlist(void);
extern void unlock_contig_pfnlist(void);

/*
 * Lock the pfnlist if necessary (see above), and return whether we locked it.
 */
static int
balloon_lock_contig_pfnlist(int count)
{
	if (count > bln_contig_list_quota) {
		clear_and_lock_contig_pfnlist();
		return (1);
	} else {
		return (0);
	}
}

/*
 * The page represented by pp is being given back to the hypervisor.
 * Add the page_t structure to our spare list.
 */
static void
balloon_page_add(page_t *pp)
{
	/*
	 * We need to keep the page exclusively locked
	 * to prevent swrand from grabbing it.
	 */
	ASSERT(PAGE_EXCL(pp));
	ASSERT(MUTEX_HELD(&bln_mutex));

	pp->p_prev = NULL;
	if (bln_spare_list_front == NULL) {
		bln_spare_list_front = bln_spare_list_back = pp;
		pp->p_next = NULL;
	} else if (pp->p_pagenum >= mfn_count) {
		/*
		 * The pfn is invalid, so add at the end of list.  Since these
		 * adds should *only* be done by balloon_init_new_pages(), and
		 * that does adds in order, the following ASSERT should
		 * never trigger.
		 */
		ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
		bln_spare_list_back->p_next = pp;
		pp->p_next = NULL;
		bln_spare_list_back = pp;
	} else {
		/* Add at beginning of list */
		pp->p_next = bln_spare_list_front;
		bln_spare_list_front = pp;
	}
}

/*
 * Return a page_t structure from our spare list, or NULL if none are
 * available.
 */
static page_t *
balloon_page_sub(void)
{
	page_t *pp;

	ASSERT(MUTEX_HELD(&bln_mutex));
	if (bln_spare_list_front == NULL) {
		return (NULL);
	}

	pp = bln_spare_list_front;
	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_pagenum <= mfn_count);
	if (pp->p_pagenum == mfn_count) {
		return (NULL);
	}

	bln_spare_list_front = pp->p_next;
	if (bln_spare_list_front == NULL)
		bln_spare_list_back = NULL;
	pp->p_next = NULL;
	return (pp);
}

/*
 * NOTE: We currently do not support growing beyond the boot memory size,
 * so the following function will not be called.  It is left in here with
 * the hope that someday this restriction can be lifted, and this code can
 * be used.
 */

/*
 * This structure is placed at the start of every block of new pages
 */
typedef struct {
	struct memseg	memseg;
	struct memlist	memlist;
	page_t		pages[1];
} mem_structs_t;

/*
 * To make the math below slightly less confusing, we calculate the first
 * two parts here.  page_t's are handled separately, so they are not included.
 */
#define	MEM_STRUCT_SIZE	(sizeof (struct memseg) + sizeof (struct memlist))

/*
 * We want to add memory, but have no spare page_t structures.  Use some of
 * our new memory for the page_t structures.
 *
 * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
 */
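/*
 * For reference, a worked example of the meta-page sizing done in
 * balloon_init_new_pages() below, using purely illustrative sizes
 * (PAGESIZE = 4096 and sizeof (page_t) = 120; the real values depend on
 * the platform and kernel build):
 *
 *	totalpgs   = 1000
 *	data pages ~= (1000 * 4096) / (4096 + 120) = 971
 *	metapgs    = 1000 - 971 = 29
 *
 * Each new page effectively "costs" PAGESIZE + sizeof (page_t) bytes: the
 * page itself plus the page_t that describes it.  In this example, 29 meta
 * pages hold 29 * 4096 = 118784 bytes, while 1000 page_t's (plus the leading
 * memseg/memlist structures) need a little over 120000 bytes, so the
 * follow-up check in that function bumps metapgs to 30.
 */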
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
	pgcnt_t	metapgs, totalpgs, num_pages;
	paddr_t	metasz;
	pfn_t	meta_start;
	page_t	*page_array;
	caddr_t	va;
	int	i, rv, locked;
	mem_structs_t	*mem;
	struct memseg	*segp;

	/* Calculate the number of pages we're going to add */
	totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;

	/*
	 * The following calculates the number of "meta" pages -- the pages
	 * that will be required to hold page_t structures for all new pages.
	 * Proof of this calculation is left up to the reader.
	 */
	metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	/*
	 * Given the number of page_t structures we need, is there also
	 * room in our meta pages for a memseg and memlist struct?
	 * If not, we'll need one more meta page.
	 */
	if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
	    MEM_STRUCT_SIZE))
		metapgs++;

	/*
	 * metapgs is calculated from totalpgs, which may be much larger than
	 * count.  If we don't have enough pages, all of the pages in this
	 * batch will be made meta pages, and a future trip through
	 * balloon_inc_reservation() will add the rest of the meta pages.
	 */
	if (metapgs > count)
		metapgs = count;

	/*
	 * Figure out the number of page_t structures that can fit in metapgs
	 *
	 * This will cause us to initialize more page_t structures than we
	 * need - these may be used in future memory increases.
	 */
	metasz = pfn_to_pa(metapgs);
	num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);

	DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
	    num_pages, pgcnt_t, metapgs);

	/*
	 * We only increment mfn_count by count, not num_pages, to keep the
	 * space of all valid pfns contiguous.  This means we create page_t
	 * structures with invalid pagenums -- we deal with this situation
	 * in balloon_page_sub.
	 */
	mfn_count += count;

	/*
	 * Get a VA for the pages that will hold page_t and other structures.
	 * The memseg and memlist structures will go at the beginning, with
	 * the page_t structures following.
	 */
	va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
	/* LINTED: improper alignment */
	mem = (mem_structs_t *)va;
	page_array = mem->pages;

	meta_start = bln_stats.bln_max_pages;

	/*
	 * Set the mfn to pfn mapping for the meta pages.
	 */
	locked = balloon_lock_contig_pfnlist(metapgs);
	for (i = 0; i < metapgs; i++) {
		reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * For our meta pages, map them in and zero the page.
	 * This will be the first time touching the new pages.
	 */
	hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
	    PROT_READ | PROT_WRITE,
	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
	bzero(va, metasz);

	/*
	 * Initialize the page array for the new pages.
	 */
	for (i = 0; i < metapgs; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
	}

	/*
	 * For the rest of the pages, initialize the page_t struct and
	 * add them to the free list
	 */
	for (i = metapgs; i < num_pages; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
		balloon_page_add(&page_array[i]);
	}

	/*
	 * Remember where I said that we don't call this function?  The
	 * missing code right here is why.  We need to set up kpm mappings
	 * for any new pages coming in.  However, if someone starts up a
	 * domain with small memory, then greatly increases it, we could get
	 * in some horrible deadlock situations as we steal page tables for
	 * kpm use, and userland applications take them right back before we
	 * can use them to set up our new memory.  Once a way around that is
	 * found, and a few other changes are made, we'll be able to enable
	 * this code.
	 */

	/*
	 * Update kernel structures, part 1: memsegs list
	 */
	mem->memseg.pages_base = meta_start;
	mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
	mem->memseg.pages = &page_array[0];
	mem->memseg.epages = &page_array[num_pages - 1];
	mem->memseg.next = NULL;
	memsegs_lock(1);
	for (segp = memsegs; segp->next != NULL; segp = segp->next)
		;
	segp->next = &mem->memseg;
	memsegs_unlock(1);

	/*
	 * Update kernel structures, part 2: mem_node array
	 */
	mem_node_add_slice(meta_start, bln_stats.bln_max_pages);

	/*
	 * Update kernel structures, part 3: phys_install array
	 * (*sigh* how many of these things do we need?)
	 */
	memlist_write_lock();
	memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
	    &phys_install);
	memlist_write_unlock();

	build_pfn_hash();

	return (metapgs);
}

/* How many ulong_t's can we fit on a page? */
#define	FRAME_ARRAY_SIZE	(PAGESIZE / sizeof (ulong_t))

/*
 * These are too large to declare on the stack, so we make them static instead
 */
static ulong_t	mfn_frames[FRAME_ARRAY_SIZE];
static pfn_t	pfn_frames[FRAME_ARRAY_SIZE];
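/*
 * Note that FRAME_ARRAY_SIZE also bounds how much work is done per pass:
 * each call to balloon_inc_reservation() or balloon_dec_reservation()
 * handles at most FRAME_ARRAY_SIZE pages (512 entries with 4 KB pages and a
 * 64-bit ulong_t, i.e. 2 MB of memory per pass).  Larger reservation changes
 * are completed incrementally by the worker thread, which keeps retrying
 * until the target is reached.
 */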
/*
 * This function is called when our reservation is increasing.  Make a
 * hypervisor call to get our new pages, then integrate them into the system.
 */
static spgcnt_t
balloon_inc_reservation(ulong_t credit)
{
	int	i, cnt, locked;
	int	meta_pg_start, meta_pg_end;
	long	rv;
	page_t	*pp;
	page_t	*new_list_front, *new_list_back;

	rv = 0;
	new_list_front = new_list_back = NULL;
	meta_pg_start = meta_pg_end = 0;
	bzero(mfn_frames, PAGESIZE);

	if (credit > FRAME_ARRAY_SIZE)
		credit = FRAME_ARRAY_SIZE;

	xen_block_migrate();
	rv = balloon_alloc_pages(credit, mfn_frames);

	if (rv < 0) {
		xen_allow_migrate();
		return (0);
	}
	for (i = 0; i < rv; i++) {
		if (mfn_frames[i] > new_high_mfn)
			new_high_mfn = mfn_frames[i];

		pp = balloon_page_sub();
		if (pp == NULL) {
			/*
			 * We pass the index into the current mfn array,
			 * then move the counter past the mfns we used
			 */
			meta_pg_start = i;
			cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
			i += cnt;
			meta_pg_end = i;
			if (i < rv) {
				pp = balloon_page_sub();
			} else {
				ASSERT(i == rv);
			}
		}
		if (pp == NULL) {
			break;
		}

		if (new_list_back == NULL) {
			new_list_front = new_list_back = pp;
		} else {
			new_list_back->p_next = pp;
			new_list_back = pp;
		}
		pp->p_next = NULL;
	}
	cnt = i;
	locked = balloon_lock_contig_pfnlist(cnt);
	for (i = 0, pp = new_list_front; (i < meta_pg_start) && (pp != NULL);
	    i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	for (i = meta_pg_end; (i < cnt) && (pp != NULL); i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	if (locked)
		unlock_contig_pfnlist();
	while (new_list_front != NULL) {
		pp = new_list_front;
		new_list_front = pp->p_next;
		page_free(pp, 1);
	}
	page_unresv(cnt - (meta_pg_end - meta_pg_start));

	if (cnt < rv) {
		/*
		 * We couldn't get page structures.
		 *
		 * This shouldn't happen, but causes no real harm if it does.
		 * On debug kernels, we'll flag it.  On all kernels, we'll
		 * give back the pages we couldn't assign.
		 */
#ifdef DEBUG
		cmn_err(CE_WARN, "Could only assign %d of %ld pages", i, rv);
#endif	/* DEBUG */

		(void) balloon_free_pages(rv - i, &mfn_frames[i], NULL, NULL);

		rv = i;
	}

	xen_allow_migrate();
	return (rv);
}

/*
 * This function is called when we want to decrease the memory reservation
 * of our domain.  Allocate the memory and make a hypervisor call to give
 * it back.
 */
static spgcnt_t
balloon_dec_reservation(ulong_t debit)
{
	int	i, locked;
	long	rv;
	page_t	*pp;

	bzero(mfn_frames, sizeof (mfn_frames));
	bzero(pfn_frames, sizeof (pfn_frames));

	if (debit > FRAME_ARRAY_SIZE) {
		debit = FRAME_ARRAY_SIZE;
	}

	/*
	 * Don't bother if there isn't a safe amount of kmem left.
	 */
	if (kmem_avail() < balloon_minkmem) {
		kmem_reap();
		if (kmem_avail() < balloon_minkmem)
			return (0);
	}

	if (page_resv(debit, KM_NOSLEEP) == 0) {
		return (0);
	}
	xen_block_migrate();
	for (i = 0; i < debit; i++) {
		pp = page_get_high_mfn(new_high_mfn);
		new_high_mfn = 0;
		if (pp == NULL) {
			/*
			 * Call kmem_reap(), then try once more,
			 * but only if there is a safe amount of
			 * kmem left.
			 */
			kmem_reap();
			if (kmem_avail() < balloon_minkmem ||
			    (pp = page_get_high_mfn(0)) == NULL) {
				debit = i;
				break;
			}
		}
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!hat_page_is_mapped(pp));

		balloon_page_add(pp);
		pfn_frames[i] = pp->p_pagenum;
		mfn_frames[i] = pfn_to_mfn(pp->p_pagenum);
	}
	if (debit == 0) {
		xen_allow_migrate();
		return (0);
	}

	/*
	 * Remove all mappings for the pfns from the system
	 */
	locked = balloon_lock_contig_pfnlist(debit);
	for (i = 0; i < debit; i++) {
		reassign_pfn(pfn_frames[i], MFN_INVALID);
	}
	if (locked)
		unlock_contig_pfnlist();

	rv = balloon_free_pages(debit, mfn_frames, NULL, NULL);

	if (rv < 0) {
		cmn_err(CE_WARN, "Attempt to return pages to the hypervisor "
		    "failed - up to %lu pages lost (error = %ld)", debit, rv);
		rv = 0;
	} else if (rv != debit) {
		panic("Unexpected return value (%ld) from decrease "
		    "reservation hypervisor call", rv);
	}

	xen_allow_migrate();
	return (rv);
}

/*
 * This function is the callback which is called when the memory/target
 * node is changed.  When it is fired, we will read a new reservation
 * target for our domain and signal the worker thread to make the change.
 *
 * If the reservation is larger than we can handle, we issue a warning.  dom0
 * does this automatically every boot, so we skip the first warning on dom0.
 */
/*ARGSUSED*/
static void
balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len)
{
	ulong_t new_target_kb;
	pgcnt_t new_target_pages;
	int rv;
	static uchar_t warning_cnt = 0;

	rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb);
	if (rv != 0) {
		return;
	}

	/* new_target is in kB - change this to pages */
	new_target_pages = kbtop(new_target_kb);

	DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages);

	/*
	 * Unfortunately, dom0 may give us a target that is larger than
	 * our max limit.  Re-check the limit, and, if the new target is
	 * too large, adjust it downwards.
	 */
	mutex_enter(&bln_mutex);
	if (new_target_pages > bln_stats.bln_max_pages) {
		DTRACE_PROBE2(balloon__target__too__large, pgcnt_t,
		    new_target_pages, pgcnt_t, bln_stats.bln_max_pages);
		if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) {
			cmn_err(CE_WARN, "New balloon target (0x%lx pages) is "
			    "larger than original memory size (0x%lx pages). "
			    "Ballooning beyond original memory size is not "
			    "allowed.",
			    new_target_pages, bln_stats.bln_max_pages);
		}
		warning_cnt = 1;
		bln_stats.bln_new_target = bln_stats.bln_max_pages;
	} else {
		bln_stats.bln_new_target = new_target_pages;
	}

	mutex_exit(&bln_mutex);
	cv_signal(&bln_cv);
}

/*
 * bln_wait_sec can be used to throttle the hv calls, but by default it's
 * turned off.  If a balloon attempt fails, the wait time is forced on, and
 * then is exponentially increased as further attempts fail.
 */
uint_t bln_wait_sec = 0;
uint_t bln_wait_shift = 1;
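/*
 * With the defaults above (bln_wait_sec = 0, bln_wait_shift = 1), a pass
 * that fails to adjust the reservation causes the worker thread below to
 * wait 1 second before retrying, then 2, 4, 8, ... seconds as failures
 * continue.  A successful pass resets the wait to bln_wait_sec.  Setting
 * bln_wait_sec to a non-zero value therefore both throttles successful
 * adjustments and raises the starting point of the backoff.
 */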
/*
 * This is the main balloon thread.  Wait on the cv.  When woken, if our
 * reservation has changed, call the appropriate function to adjust the
 * reservation.
 */
static void
balloon_worker_thread(void)
{
	uint_t		bln_wait;
	callb_cpr_t	cprinfo;
	spgcnt_t	rv;

	bln_wait = bln_wait_sec;

	CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon");
	for (;;) {
		rv = 0;

		mutex_enter(&bln_mutex);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
			/*
			 * We weren't able to fully complete the request
			 * last time through, so try again.
			 */
			(void) cv_timedwait(&bln_cv, &bln_mutex,
			    lbolt + (bln_wait * hz));
		} else {
			cv_wait(&bln_cv, &bln_mutex);
		}
		CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex);

		if (bln_stats.bln_new_target != bln_stats.bln_current_pages) {
			if (bln_stats.bln_new_target <
			    bln_stats.bln_current_pages) {
				/* reservation shrunk */
				rv = -balloon_dec_reservation(
				    bln_stats.bln_current_pages -
				    bln_stats.bln_new_target);
			} else if (bln_stats.bln_new_target >
			    bln_stats.bln_current_pages) {
				/* reservation grew */
				rv = balloon_inc_reservation(
				    bln_stats.bln_new_target -
				    bln_stats.bln_current_pages);
			}
		}
		if (rv == 0) {
			if (bln_wait == 0) {
				bln_wait = 1;
			} else {
				bln_wait <<= bln_wait_shift;
			}
		} else {
			bln_stats.bln_current_pages += rv;
			bln_wait = bln_wait_sec;
		}
		if (bln_stats.bln_current_pages < bln_stats.bln_low)
			bln_stats.bln_low = bln_stats.bln_current_pages;
		else if (bln_stats.bln_current_pages > bln_stats.bln_high)
			bln_stats.bln_high = bln_stats.bln_current_pages;
		mutex_exit(&bln_mutex);
	}
}

/*
 * Called after balloon_init(), which is below.  The xenbus thread is up
 * and running, so we can register our watch and create the balloon thread.
 */
static void
balloon_config_watch(int state)
{
	if (state != XENSTORE_UP)
		return;

	bln_watch.node = "memory/target";
	bln_watch.callback = balloon_handler;
	if (register_xenbus_watch(&bln_watch)) {
		cmn_err(CE_WARN, "Failed to register balloon watcher; balloon "
		    "thread will be disabled");
		return;
	}

	if (bln_thread == NULL)
		bln_thread = thread_create(NULL, 0, balloon_worker_thread,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * Basic initialization of the balloon thread.  Set all of our variables,
 * and register a callback for later when we can register a xenbus watch.
 */
void
balloon_init(pgcnt_t nr_pages)
{
	domid_t domid = DOMID_SELF;

	bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages;
	bln_stats.bln_new_target = bln_stats.bln_high = nr_pages;
	bln_stats.bln_max_pages = nr_pages;
	cv_init(&bln_cv, NULL, CV_DEFAULT, NULL);

	/* init balloon zero logic */
	balloon_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
	mutex_init(&balloon_kva_mutex, NULL, MUTEX_DRIVER, NULL);

	bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op(
	    XENMEM_maximum_reservation, &domid);

	(void) xs_register_xenbus_callback(balloon_config_watch);
}

/*
 * These functions are called from the network drivers when they gain a page
 * or give one away.  We simply update our count.  Note that the counter
 * tracks the number of pages we give away, so we need to subtract any
 * amount passed to balloon_drv_added.
 */
void
balloon_drv_added(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta);
}

void
balloon_drv_subtracted(int64_t delta)
{
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta);
}

/*
 * balloon_alloc_pages()
 *	Allocate page_cnt mfns.  mfns storage provided by the caller.  Returns
 *	the number of pages allocated, which could be less than page_cnt, or
 *	a negative number if an error occurred.
 */
long
balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long rv;

	bzero(&memres, sizeof (memres));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = page_cnt;

	rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	if (rv > 0)
		atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv);
	return (rv);
}

/*
 * balloon_free_pages()
 *	free page_cnt pages, using any combination of mfns, pfns, and kva as
 *	long as they refer to the same mapping.  We need to zero the pages
 *	before giving them back to the hypervisor.  kva space is not free'd
 *	up in case the caller wants to re-use it.
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	xen_memory_reservation_t memdec;
	mfn_t mfn;
	pfn_t pfn;
	uint_t i;
	long e;

#if DEBUG
	/* make sure kva is page aligned and maps to first pfn */
	if (kva != NULL) {
		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
		if (pfns != NULL) {
			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
		}
	}
#endif

	/* if we have a kva, we can clean all pages with just one bzero */
	if ((kva != NULL) && balloon_zero_memory) {
		bzero(kva, (page_cnt * PAGESIZE));
	}

	/* if we were given a kva and/or a pfn */
	if ((kva != NULL) || (pfns != NULL)) {

		/*
		 * All the current callers only pass 1 page when using kva or
		 * pfns, and use mfns when passing multiple pages.  If that
		 * assumption is changed, the following code will need some
		 * work.  The following ASSERT() guarantees we're respecting
		 * the io locking quota.
		 */
		ASSERT(page_cnt < bln_contig_list_quota);

		/* go through all the pages */
		for (i = 0; i < page_cnt; i++) {

			/* get the next pfn */
			if (pfns == NULL) {
				pfn = hat_getpfnum(kas.a_hat,
				    (kva + (PAGESIZE * i)));
			} else {
				pfn = pfns[i];
			}

			/*
			 * if we didn't already zero this page, do it now.  we
			 * need to do this *before* we give back the MFN
			 */
			if ((kva == NULL) && (balloon_zero_memory)) {
				balloon_zero_page(pfn);
			}

			/*
			 * unmap the pfn.  We don't free up the kva vmem space
			 * so the caller can re-use it.  The page must be
			 * unmapped before it is given back to the hypervisor.
			 */
			if (kva != NULL) {
				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
				    PAGESIZE, HAT_UNLOAD_UNMAP);
			}

			/* grab the mfn before the pfn is marked as invalid */
			mfn = pfn_to_mfn(pfn);

			/* mark the pfn as invalid */
			reassign_pfn(pfn, MFN_INVALID);

			/*
			 * if we weren't given an array of MFNs, we need to
			 * free them up one at a time.  Otherwise, we'll wait
			 * until later and do it in one hypercall
			 */
			if (mfns == NULL) {
				bzero(&memdec, sizeof (memdec));
				/*LINTED: constant in conditional context*/
				set_xen_guest_handle(memdec.extent_start, &mfn);
				memdec.domid = DOMID_SELF;
				memdec.nr_extents = 1;
				e = HYPERVISOR_memory_op(
				    XENMEM_decrease_reservation, &memdec);
				if (e != 1) {
					cmn_err(CE_PANIC, "balloon: unable to "
					    "give a page back to the "
					    "hypervisor.\n");
				}
			}
		}

	/*
	 * if all we were given was an array of MFN's, we only need to zero
	 * out each page.  The MFNs will be free'd up below.
	 */
	} else if (balloon_zero_memory) {
		ASSERT(mfns != NULL);
		for (i = 0; i < page_cnt; i++) {
			pfn = xen_assign_pfn(mfns[i]);
			balloon_zero_page(pfn);
			xen_release_pfn(pfn);
		}
	}

	/*
	 * if we were passed in MFNs, we haven't free'd them up yet.  We can
	 * do it with one call.
	 */
	if (mfns != NULL) {
		bzero(&memdec, sizeof (memdec));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memdec.extent_start, mfns);
		memdec.domid = DOMID_SELF;
		memdec.nr_extents = page_cnt;
		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
		if (e != page_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to give pages back "
			    "to the hypervisor.\n");
		}
	}

	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
	return (page_cnt);
}

/*
 * balloon_replace_pages()
 *	Try to replace nextents blocks of 2^order pages.  addr_bits specifies
 *	how many bits of address the pages must be within (i.e. 16 would mean
 *	that the pages cannot have an address > 64k).  The constraints are on
 *	what the hypervisor gives us -- we are free to give any pages in
 *	exchange.  The array pp is the pages we are giving away.  The caller
 *	provides storage space for mfns, which hold the new physical pages.
 */
long
balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits,
    uint_t order, mfn_t *mfns)
{
	xen_memory_reservation_t memres;
	long fallback_cnt;
	long cnt;
	uint_t i, j, page_cnt, extlen;
	long e;
	int locked;

	/*
	 * we shouldn't be allocating constrained pages on a guest.  It
	 * doesn't make any sense.  They won't be constrained after a
	 * migration.
	 */
	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));

	extlen = 1 << order;
	page_cnt = nextents * extlen;
	/* Give back the current pages to the hypervisor */
	for (i = 0; i < page_cnt; i++) {
		cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum);
		if (cnt != 1) {
			cmn_err(CE_PANIC, "balloon: unable to give a page back "
			    "to the hypervisor.\n");
		}
	}

	/*
	 * try to allocate the new pages using addr_bits and order.  If we
	 * can't get all of the pages, try to get the remaining pages with no
	 * constraints and, if that was successful, return the number of
	 * constrained pages we did allocate.
	 */
	bzero(&memres, sizeof (memres));
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(memres.extent_start, mfns);
	memres.domid = DOMID_SELF;
	memres.nr_extents = nextents;
	memres.address_bits = addr_bits;
	memres.extent_order = order;
	cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
	/* assign the new MFNs to the current PFNs */
	locked = balloon_lock_contig_pfnlist(cnt * extlen);
	for (i = 0; i < cnt; i++) {
		for (j = 0; j < extlen; j++) {
			reassign_pfn(pp[i * extlen + j]->p_pagenum,
			    mfns[i] + j);
		}
	}
	if (locked)
		unlock_contig_pfnlist();
	if (cnt != nextents) {
		if (cnt < 0) {
			cnt = 0;
		}

		/*
		 * We couldn't get enough memory to satisfy our requirements.
		 * The above loop will assign the parts of the request that
		 * were successful (this part may be 0).  We need to fill
		 * in the rest.  The bzero below clears out extent_order and
		 * address_bits, so we'll take anything from the hypervisor
		 * to replace the pages we gave away.
		 */
		fallback_cnt = page_cnt - cnt * extlen;
		bzero(&memres, sizeof (memres));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memres.extent_start, mfns);
		memres.domid = DOMID_SELF;
		memres.nr_extents = fallback_cnt;
		e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres);
		if (e != fallback_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to recover from "
			    "failed increase_reservation.\n");
		}
		locked = balloon_lock_contig_pfnlist(fallback_cnt);
		for (i = 0; i < fallback_cnt; i++) {
			uint_t offset = page_cnt - fallback_cnt;

			/*
			 * We already used pp[0...(cnt * extlen)] before,
			 * so start at the next entry in the pp array.
			 */
			reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]);
		}
		if (locked)
			unlock_contig_pfnlist();
	}

	/*
	 * balloon_free_pages increments our counter.  Decrement it here.
	 */
	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt);

	/*
	 * return the number of extents we were able to replace.  If we got
	 * this far, we know all the pp's are valid.
	 */
	return (cnt);
}

/*
 * balloon_zero_page()
 *	zero out the page.
 */
static void
balloon_zero_page(pfn_t pfn)
{
	/* balloon_init() should have been called first */
	ASSERT(balloon_kva != NULL);

	mutex_enter(&balloon_kva_mutex);

	/* map the pfn into kva, zero the page, then unmap the pfn */
	hat_devload(kas.a_hat, balloon_kva, PAGESIZE, pfn,
	    HAT_STORECACHING_OK | PROT_READ | PROT_WRITE | HAT_NOSYNC,
	    HAT_LOAD_LOCK);
	bzero(balloon_kva, PAGESIZE);
	hat_unload(kas.a_hat, balloon_kva, PAGESIZE, HAT_UNLOAD);

	mutex_exit(&balloon_kva_mutex);
}

/*
 * Called from the driver - return the requested stat.
 */
size_t
balloon_values(int cmd)
{
	switch (cmd) {
	case BLN_IOCTL_CURRENT:
		return (ptokb(bln_stats.bln_current_pages));
	case BLN_IOCTL_TARGET:
		return (ptokb(bln_stats.bln_new_target));
	case BLN_IOCTL_LOW:
		return (ptokb(bln_stats.bln_low));
	case BLN_IOCTL_HIGH:
		return (ptokb(bln_stats.bln_high));
	case BLN_IOCTL_LIMIT:
		return (ptokb(bln_stats.bln_hard_limit));
	default:
		panic("Unexpected cmd %d in balloon_values()\n", cmd);
	}
	/*NOTREACHED*/
}
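/*
 * For reference, a minimal sketch of how an in-kernel consumer (such as a
 * backend driver that temporarily trades pages with the hypervisor) might
 * use the exported balloon_alloc_pages()/balloon_free_pages() interfaces
 * above.  This is illustrative only; real callers also manage page_t's,
 * kva mappings, and error handling:
 *
 *	mfn_t mfn;
 *
 *	if (balloon_alloc_pages(1, &mfn) == 1) {
 *		... use the machine frame referenced by mfn ...
 *		(void) balloon_free_pages(1, &mfn, NULL, NULL);
 *	}
 */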