1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/balloon_impl.h> 28 #include <sys/hypervisor.h> 29 #include <xen/sys/xenbus_impl.h> 30 #include <sys/atomic.h> 31 #include <sys/cmn_err.h> 32 #include <sys/disp.h> 33 #include <sys/callb.h> 34 #include <xen/public/memory.h> 35 #include <vm/hat.h> 36 #include <sys/promif.h> 37 #include <vm/seg_kmem.h> 38 #include <sys/memnode.h> 39 #include <sys/param.h> 40 #include <vm/vm_dep.h> 41 #include <sys/mman.h> 42 #include <sys/memlist.h> 43 #include <sys/sysmacros.h> 44 #include <sys/machsystm.h> 45 #include <sys/sdt.h> 46 47 /* 48 * This file implements a balloon thread, which controls a domain's memory 49 * reservation, or the amount of memory a domain is currently allocated. 50 * The hypervisor provides the current memory reservation through xenbus, 51 * so we register a watch on this. We will then be signalled when the 52 * reservation changes. If it goes up, we map the new mfn's to our pfn's 53 * (allocating page_t's if necessary), and release them into the system. 54 * If the reservation goes down, we grab pages and release them back to 55 * the hypervisor, saving the page_t's for later use. 56 */ 57 58 /* 59 * Various structures needed by the balloon thread 60 */ 61 static bln_stats_t bln_stats; 62 static kthread_t *bln_thread; 63 static kmutex_t bln_mutex; 64 static kcondvar_t bln_cv; 65 static struct xenbus_watch bln_watch; 66 static mfn_t new_high_mfn; 67 68 /* 69 * For holding spare page_t structures - keep a singly-linked list. 70 * The list may hold both valid (pagenum < mfn_count) and invalid 71 * (pagenum >= mfn_count) page_t's. Valid page_t's should be inserted 72 * at the front, and invalid page_t's at the back. Removal should 73 * always be from the front. This is a singly-linked list using 74 * p_next, so p_prev is always NULL. 75 */ 76 static page_t *bln_spare_list_front, *bln_spare_list_back; 77 78 int balloon_zero_memory = 1; 79 size_t balloon_minkmem = (8 * 1024 * 1024); 80 81 /* 82 * reassign_pfn() calls update_contig_pfnlist(), which can cause a large 83 * slowdown when calling multiple times. If we're reassigning less than the 84 * quota defined here, we just accept the slowdown. If the count is greater 85 * than the quota, we tell the contig alloc code to stop its accounting until 86 * we're done. Setting the quota to less than 2 is not supported. 87 * 88 * Note that we define our own wrapper around the external 89 * clear_and_lock_contig_pfnlist(), but we just use the version of 90 * unlock_contig_pfnlist() in vm_machdep.c. 91 */ 92 uint_t bln_contig_list_quota = 50; 93 94 extern void clear_and_lock_contig_pfnlist(void); 95 extern void unlock_contig_pfnlist(void); 96 97 /* 98 * Lock the pfnlist if necessary (see above), and return whether we locked it. 99 */ 100 static int 101 balloon_lock_contig_pfnlist(int count) { 102 if (count > bln_contig_list_quota) { 103 clear_and_lock_contig_pfnlist(); 104 return (1); 105 } else { 106 return (0); 107 } 108 } 109 110 /* 111 * The page represented by pp is being given back to the hypervisor. 112 * Add the page_t structure to our spare list. 113 */ 114 static void 115 balloon_page_add(page_t *pp) 116 { 117 /* 118 * We need to keep the page exclusively locked 119 * to prevent swrand from grabbing it. 120 */ 121 ASSERT(PAGE_EXCL(pp)); 122 ASSERT(MUTEX_HELD(&bln_mutex)); 123 124 pp->p_prev = NULL; 125 if (bln_spare_list_front == NULL) { 126 bln_spare_list_front = bln_spare_list_back = pp; 127 pp->p_next = NULL; 128 } else if (pp->p_pagenum >= mfn_count) { 129 /* 130 * The pfn is invalid, so add at the end of list. Since these 131 * adds should *only* be done by balloon_init_new_pages(), and 132 * that does adds in order, the following ASSERT should 133 * never trigger. 134 */ 135 ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum); 136 bln_spare_list_back->p_next = pp; 137 pp->p_next = NULL; 138 bln_spare_list_back = pp; 139 } else { 140 /* Add at beginning of list */ 141 pp->p_next = bln_spare_list_front; 142 bln_spare_list_front = pp; 143 } 144 } 145 146 /* 147 * Return a page_t structure from our spare list, or NULL if none are available. 148 */ 149 static page_t * 150 balloon_page_sub(void) 151 { 152 page_t *pp; 153 154 ASSERT(MUTEX_HELD(&bln_mutex)); 155 if (bln_spare_list_front == NULL) { 156 return (NULL); 157 } 158 159 pp = bln_spare_list_front; 160 ASSERT(PAGE_EXCL(pp)); 161 ASSERT(pp->p_pagenum <= mfn_count); 162 if (pp->p_pagenum == mfn_count) { 163 return (NULL); 164 } 165 166 bln_spare_list_front = pp->p_next; 167 if (bln_spare_list_front == NULL) 168 bln_spare_list_back = NULL; 169 pp->p_next = NULL; 170 return (pp); 171 } 172 173 /* 174 * NOTE: We currently do not support growing beyond the boot memory size, 175 * so the following function will not be called. It is left in here with 176 * the hope that someday this restriction can be lifted, and this code can 177 * be used. 178 */ 179 180 /* 181 * This structure is placed at the start of every block of new pages 182 */ 183 typedef struct { 184 struct memseg memseg; 185 struct memlist memlist; 186 page_t pages[1]; 187 } mem_structs_t; 188 189 /* 190 * To make the math below slightly less confusing, we calculate the first 191 * two parts here. page_t's are handled separately, so they are not included. 192 */ 193 #define MEM_STRUCT_SIZE (sizeof (struct memseg) + sizeof (struct memlist)) 194 195 /* 196 * We want to add memory, but have no spare page_t structures. Use some of 197 * our new memory for the page_t structures. 198 * 199 * Somewhat similar to kphysm_add_memory_dynamic(), but simpler. 200 */ 201 static int 202 balloon_init_new_pages(mfn_t framelist[], pgcnt_t count) 203 { 204 pgcnt_t metapgs, totalpgs, num_pages; 205 paddr_t metasz; 206 pfn_t meta_start; 207 page_t *page_array; 208 caddr_t va; 209 int i, rv, locked; 210 mem_structs_t *mem; 211 struct memseg *segp; 212 213 /* Calculate the number of pages we're going to add */ 214 totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages; 215 216 /* 217 * The following calculates the number of "meta" pages -- the pages 218 * that will be required to hold page_t structures for all new pages. 219 * Proof of this calculation is left up to the reader. 220 */ 221 metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) / 222 (PAGESIZE + sizeof (page_t))); 223 224 /* 225 * Given the number of page_t structures we need, is there also 226 * room in our meta pages for a memseg and memlist struct? 227 * If not, we'll need one more meta page. 228 */ 229 if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) + 230 MEM_STRUCT_SIZE)) 231 metapgs++; 232 233 /* 234 * metapgs is calculated from totalpgs, which may be much larger than 235 * count. If we don't have enough pages, all of the pages in this 236 * batch will be made meta pages, and a future trip through 237 * balloon_inc_reservation() will add the rest of the meta pages. 238 */ 239 if (metapgs > count) 240 metapgs = count; 241 242 /* 243 * Figure out the number of page_t structures that can fit in metapgs 244 * 245 * This will cause us to initialize more page_t structures than we 246 * need - these may be used in future memory increases. 247 */ 248 metasz = pfn_to_pa(metapgs); 249 num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t); 250 251 DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t, 252 num_pages, pgcnt_t, metapgs); 253 254 /* 255 * We only increment mfn_count by count, not num_pages, to keep the 256 * space of all valid pfns contiguous. This means we create page_t 257 * structures with invalid pagenums -- we deal with this situation 258 * in balloon_page_sub. 259 */ 260 mfn_count += count; 261 262 /* 263 * Get a VA for the pages that will hold page_t and other structures. 264 * The memseg and memlist structures will go at the beginning, with 265 * the page_t structures following. 266 */ 267 va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP); 268 /* LINTED: improper alignment */ 269 mem = (mem_structs_t *)va; 270 page_array = mem->pages; 271 272 meta_start = bln_stats.bln_max_pages; 273 274 /* 275 * Set the mfn to pfn mapping for the meta pages. 276 */ 277 locked = balloon_lock_contig_pfnlist(metapgs); 278 for (i = 0; i < metapgs; i++) { 279 reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]); 280 } 281 if (locked) 282 unlock_contig_pfnlist(); 283 284 /* 285 * For our meta pages, map them in and zero the page. 286 * This will be the first time touching the new pages. 287 */ 288 hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages, 289 PROT_READ | PROT_WRITE, 290 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); 291 bzero(va, metasz); 292 293 /* 294 * Initialize the page array for the new pages. 295 */ 296 for (i = 0; i < metapgs; i++) { 297 page_array[i].p_pagenum = bln_stats.bln_max_pages++; 298 page_array[i].p_offset = (u_offset_t)-1; 299 page_iolock_init(&page_array[i]); 300 rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM); 301 ASSERT(rv == 1); 302 } 303 304 /* 305 * For the rest of the pages, initialize the page_t struct and 306 * add them to the free list 307 */ 308 for (i = metapgs; i < num_pages; i++) { 309 page_array[i].p_pagenum = bln_stats.bln_max_pages++; 310 page_array[i].p_offset = (u_offset_t)-1; 311 page_iolock_init(&page_array[i]); 312 rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM); 313 ASSERT(rv == 1); 314 balloon_page_add(&page_array[i]); 315 } 316 317 /* 318 * Remember where I said that we don't call this function? The missing 319 * code right here is why. We need to set up kpm mappings for any new 320 * pages coming in. However, if someone starts up a domain with small 321 * memory, then greatly increases it, we could get in some horrible 322 * deadlock situations as we steal page tables for kpm use, and 323 * userland applications take them right back before we can use them 324 * to set up our new memory. Once a way around that is found, and a 325 * few other changes are made, we'll be able to enable this code. 326 */ 327 328 /* 329 * Update kernel structures, part 1: memsegs list 330 */ 331 mem->memseg.pages_base = meta_start; 332 mem->memseg.pages_end = bln_stats.bln_max_pages - 1; 333 mem->memseg.pages = &page_array[0]; 334 mem->memseg.epages = &page_array[num_pages - 1]; 335 mem->memseg.next = NULL; 336 memsegs_lock(1); 337 for (segp = memsegs; segp->next != NULL; segp = segp->next) 338 ; 339 segp->next = &mem->memseg; 340 memsegs_unlock(1); 341 342 /* 343 * Update kernel structures, part 2: mem_node array 344 */ 345 mem_node_add_slice(meta_start, bln_stats.bln_max_pages); 346 347 /* 348 * Update kernel structures, part 3: phys_install array 349 * (*sigh* how many of these things do we need?) 350 */ 351 memlist_write_lock(); 352 memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist, 353 &phys_install); 354 memlist_write_unlock(); 355 356 build_pfn_hash(); 357 358 return (metapgs); 359 } 360 361 /* How many ulong_t's can we fit on a page? */ 362 #define FRAME_ARRAY_SIZE (PAGESIZE / sizeof (ulong_t)) 363 364 /* 365 * These are too large to declare on the stack, so we make them static instead 366 */ 367 static ulong_t mfn_frames[FRAME_ARRAY_SIZE]; 368 static pfn_t pfn_frames[FRAME_ARRAY_SIZE]; 369 370 /* 371 * This function is called when our reservation is increasing. Make a 372 * hypervisor call to get our new pages, then integrate them into the system. 373 */ 374 static spgcnt_t 375 balloon_inc_reservation(ulong_t credit) 376 { 377 int i, cnt, locked; 378 int meta_pg_start, meta_pg_end; 379 long rv; 380 page_t *pp; 381 page_t *new_list_front, *new_list_back; 382 383 /* Make sure we're single-threaded. */ 384 ASSERT(MUTEX_HELD(&bln_mutex)); 385 386 rv = 0; 387 new_list_front = new_list_back = NULL; 388 meta_pg_start = meta_pg_end = 0; 389 bzero(mfn_frames, PAGESIZE); 390 391 if (credit > FRAME_ARRAY_SIZE) 392 credit = FRAME_ARRAY_SIZE; 393 394 xen_block_migrate(); 395 rv = balloon_alloc_pages(credit, mfn_frames); 396 397 if (rv < 0) { 398 xen_allow_migrate(); 399 return (0); 400 } 401 for (i = 0; i < rv; i++) { 402 if (mfn_frames[i] > new_high_mfn) 403 new_high_mfn = mfn_frames[i]; 404 405 pp = balloon_page_sub(); 406 if (pp == NULL) { 407 /* 408 * We pass the index into the current mfn array, 409 * then move the counter past the mfns we used 410 */ 411 meta_pg_start = i; 412 cnt = balloon_init_new_pages(&mfn_frames[i], rv - i); 413 i += cnt; 414 meta_pg_end = i; 415 if (i < rv) { 416 pp = balloon_page_sub(); 417 } else { 418 ASSERT(i == rv); 419 } 420 } 421 if (pp == NULL) { 422 break; 423 } 424 425 if (new_list_back == NULL) { 426 new_list_front = new_list_back = pp; 427 } else { 428 new_list_back->p_next = pp; 429 new_list_back = pp; 430 } 431 pp->p_next = NULL; 432 } 433 cnt = i; 434 locked = balloon_lock_contig_pfnlist(cnt); 435 for (i = 0, pp = new_list_front; i < meta_pg_start; 436 i++, pp = pp->p_next) { 437 reassign_pfn(pp->p_pagenum, mfn_frames[i]); 438 } 439 for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) { 440 reassign_pfn(pp->p_pagenum, mfn_frames[i]); 441 } 442 if (locked) 443 unlock_contig_pfnlist(); 444 445 /* 446 * Make sure we don't allow pages without pfn->mfn mappings 447 * into the system. 448 */ 449 ASSERT(pp == NULL); 450 451 while (new_list_front != NULL) { 452 pp = new_list_front; 453 new_list_front = pp->p_next; 454 page_free(pp, 1); 455 } 456 457 /* 458 * Variable review: at this point, rv contains the number of pages 459 * the hypervisor gave us. cnt contains the number of pages for which 460 * we had page_t structures. i contains the number of pages 461 * where we set up pfn <-> mfn mappings. If this ASSERT trips, that 462 * means we somehow lost page_t's from our local list. 463 */ 464 ASSERT(cnt == i); 465 if (cnt < rv) { 466 /* 467 * We couldn't get page structures. 468 * 469 * This shouldn't happen, but causes no real harm if it does. 470 * On debug kernels, we'll flag it. On all kernels, we'll 471 * give back the pages we couldn't assign. 472 * 473 * Since these pages are new to the system and haven't been 474 * used, we don't bother zeroing them. 475 */ 476 #ifdef DEBUG 477 cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv); 478 #endif /* DEBUG */ 479 480 (void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL, NULL); 481 482 rv = cnt; 483 } 484 485 xen_allow_migrate(); 486 page_unresv(rv - (meta_pg_end - meta_pg_start)); 487 return (rv); 488 } 489 490 /* 491 * This function is called when we want to decrease the memory reservation 492 * of our domain. Allocate the memory and make a hypervisor call to give 493 * it back. 494 */ 495 static spgcnt_t 496 balloon_dec_reservation(ulong_t debit) 497 { 498 int i, locked; 499 long rv; 500 ulong_t request; 501 page_t *pp; 502 503 bzero(mfn_frames, sizeof (mfn_frames)); 504 bzero(pfn_frames, sizeof (pfn_frames)); 505 506 if (debit > FRAME_ARRAY_SIZE) { 507 debit = FRAME_ARRAY_SIZE; 508 } 509 request = debit; 510 511 /* 512 * Don't bother if there isn't a safe amount of kmem left. 513 */ 514 if (kmem_avail() < balloon_minkmem) { 515 kmem_reap(); 516 if (kmem_avail() < balloon_minkmem) 517 return (0); 518 } 519 520 if (page_resv(request, KM_NOSLEEP) == 0) { 521 return (0); 522 } 523 xen_block_migrate(); 524 for (i = 0; i < debit; i++) { 525 pp = page_get_high_mfn(new_high_mfn); 526 new_high_mfn = 0; 527 if (pp == NULL) { 528 /* 529 * Call kmem_reap(), then try once more, 530 * but only if there is a safe amount of 531 * kmem left. 532 */ 533 kmem_reap(); 534 if (kmem_avail() < balloon_minkmem || 535 (pp = page_get_high_mfn(0)) == NULL) { 536 debit = i; 537 break; 538 } 539 } 540 ASSERT(PAGE_EXCL(pp)); 541 ASSERT(!hat_page_is_mapped(pp)); 542 543 balloon_page_add(pp); 544 pfn_frames[i] = pp->p_pagenum; 545 mfn_frames[i] = pfn_to_mfn(pp->p_pagenum); 546 } 547 if (debit == 0) { 548 xen_allow_migrate(); 549 page_unresv(request); 550 return (0); 551 } 552 553 /* 554 * We zero all the pages before we start reassigning them in order to 555 * minimize the time spent holding the lock on the contig pfn list. 556 */ 557 if (balloon_zero_memory) { 558 for (i = 0; i < debit; i++) { 559 pfnzero(pfn_frames[i], 0, PAGESIZE); 560 } 561 } 562 563 /* 564 * Remove all mappings for the pfns from the system 565 */ 566 locked = balloon_lock_contig_pfnlist(debit); 567 for (i = 0; i < debit; i++) { 568 reassign_pfn(pfn_frames[i], MFN_INVALID); 569 } 570 if (locked) 571 unlock_contig_pfnlist(); 572 573 rv = balloon_free_pages(debit, mfn_frames, NULL, NULL); 574 575 if (rv < 0) { 576 cmn_err(CE_WARN, "Attempt to return pages to the hypervisor " 577 "failed - up to %lu pages lost (error = %ld)", debit, rv); 578 rv = 0; 579 } else if (rv != debit) { 580 panic("Unexpected return value (%ld) from decrease reservation " 581 "hypervisor call", rv); 582 } 583 584 xen_allow_migrate(); 585 if (debit != request) 586 page_unresv(request - debit); 587 return (rv); 588 } 589 590 /* 591 * This function is the callback which is called when the memory/target 592 * node is changed. When it is fired, we will read a new reservation 593 * target for our domain and signal the worker thread to make the change. 594 * 595 * If the reservation is larger than we can handle, we issue a warning. dom0 596 * does this automatically every boot, so we skip the first warning on dom0. 597 */ 598 /*ARGSUSED*/ 599 static void 600 balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len) 601 { 602 ulong_t new_target_kb; 603 pgcnt_t new_target_pages; 604 int rv; 605 static uchar_t warning_cnt = 0; 606 607 rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb); 608 if (rv != 0) { 609 return; 610 } 611 612 /* new_target is in kB - change this to pages */ 613 new_target_pages = kbtop(new_target_kb); 614 615 DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages); 616 617 /* 618 * Unfortunately, dom0 may give us a target that is larger than 619 * our max limit. Re-check the limit, and, if the new target is 620 * too large, adjust it downwards. 621 */ 622 mutex_enter(&bln_mutex); 623 if (new_target_pages > bln_stats.bln_max_pages) { 624 DTRACE_PROBE2(balloon__target__too__large, pgcnt_t, 625 new_target_pages, pgcnt_t, bln_stats.bln_max_pages); 626 if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) { 627 cmn_err(CE_WARN, "New balloon target (0x%lx pages) is " 628 "larger than original memory size (0x%lx pages). " 629 "Ballooning beyond original memory size is not " 630 "allowed.", 631 new_target_pages, bln_stats.bln_max_pages); 632 } 633 warning_cnt = 1; 634 bln_stats.bln_new_target = bln_stats.bln_max_pages; 635 } else { 636 bln_stats.bln_new_target = new_target_pages; 637 } 638 639 mutex_exit(&bln_mutex); 640 cv_signal(&bln_cv); 641 } 642 643 /* 644 * bln_wait_sec can be used to throttle the hv calls, but by default it's 645 * turned off. If a balloon attempt fails, the wait time is forced on, and 646 * then is exponentially increased as further attempts fail. 647 */ 648 uint_t bln_wait_sec = 0; 649 uint_t bln_wait_shift = 1; 650 651 /* 652 * This is the main balloon thread. Wait on the cv. When woken, if our 653 * reservation has changed, call the appropriate function to adjust the 654 * reservation. 655 */ 656 static void 657 balloon_worker_thread(void) 658 { 659 uint_t bln_wait; 660 callb_cpr_t cprinfo; 661 spgcnt_t rv; 662 663 bln_wait = bln_wait_sec; 664 665 CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon"); 666 for (;;) { 667 rv = 0; 668 669 mutex_enter(&bln_mutex); 670 CALLB_CPR_SAFE_BEGIN(&cprinfo); 671 if (bln_stats.bln_new_target != bln_stats.bln_current_pages) { 672 /* 673 * We weren't able to fully complete the request 674 * last time through, so try again. 675 */ 676 (void) cv_reltimedwait(&bln_cv, &bln_mutex, 677 (bln_wait * hz), TR_CLOCK_TICK); 678 } else { 679 cv_wait(&bln_cv, &bln_mutex); 680 } 681 CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex); 682 683 if (bln_stats.bln_new_target != bln_stats.bln_current_pages) { 684 if (bln_stats.bln_new_target < 685 bln_stats.bln_current_pages) { 686 /* reservation shrunk */ 687 rv = -balloon_dec_reservation( 688 bln_stats.bln_current_pages - 689 bln_stats.bln_new_target); 690 } else if (bln_stats.bln_new_target > 691 bln_stats.bln_current_pages) { 692 /* reservation grew */ 693 rv = balloon_inc_reservation( 694 bln_stats.bln_new_target - 695 bln_stats.bln_current_pages); 696 } 697 } 698 if (rv == 0) { 699 if (bln_wait == 0) { 700 bln_wait = 1; 701 } else { 702 bln_wait <<= bln_wait_shift; 703 } 704 } else { 705 bln_stats.bln_current_pages += rv; 706 bln_wait = bln_wait_sec; 707 } 708 if (bln_stats.bln_current_pages < bln_stats.bln_low) 709 bln_stats.bln_low = bln_stats.bln_current_pages; 710 else if (bln_stats.bln_current_pages > bln_stats.bln_high) 711 bln_stats.bln_high = bln_stats.bln_current_pages; 712 mutex_exit(&bln_mutex); 713 } 714 } 715 716 /* 717 * Called after balloon_init(), which is below. The xenbus thread is up 718 * and running, so we can register our watch and create the balloon thread. 719 */ 720 static void 721 balloon_config_watch(int state) 722 { 723 if (state != XENSTORE_UP) 724 return; 725 726 bln_watch.node = "memory/target"; 727 bln_watch.callback = balloon_handler; 728 if (register_xenbus_watch(&bln_watch)) { 729 cmn_err(CE_WARN, "Failed to register balloon watcher; balloon " 730 "thread will be disabled"); 731 return; 732 } 733 734 if (bln_thread == NULL) 735 bln_thread = thread_create(NULL, 0, balloon_worker_thread, 736 NULL, 0, &p0, TS_RUN, minclsyspri); 737 } 738 739 /* 740 * Basic initialization of the balloon thread. Set all of our variables, 741 * and register a callback for later when we can register a xenbus watch. 742 */ 743 void 744 balloon_init(pgcnt_t nr_pages) 745 { 746 domid_t domid = DOMID_SELF; 747 748 bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages; 749 bln_stats.bln_new_target = bln_stats.bln_high = nr_pages; 750 bln_stats.bln_max_pages = nr_pages; 751 cv_init(&bln_cv, NULL, CV_DEFAULT, NULL); 752 753 bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op( 754 XENMEM_maximum_reservation, &domid); 755 756 (void) xs_register_xenbus_callback(balloon_config_watch); 757 } 758 759 /* 760 * These functions are called from the network drivers when they gain a page 761 * or give one away. We simply update our count. Note that the counter 762 * tracks the number of pages we give away, so we need to subtract any 763 * amount passed to balloon_drv_added. 764 */ 765 void 766 balloon_drv_added(int64_t delta) 767 { 768 atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta); 769 } 770 771 void 772 balloon_drv_subtracted(int64_t delta) 773 { 774 atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta); 775 } 776 777 /* 778 * balloon_alloc_pages() 779 * Allocate page_cnt mfns. mfns storage provided by the caller. Returns 780 * the number of pages allocated, which could be less than page_cnt, or 781 * a negative number if an error occurred. 782 */ 783 long 784 balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns) 785 { 786 xen_memory_reservation_t memres; 787 long rv; 788 789 bzero(&memres, sizeof (memres)); 790 /*LINTED: constant in conditional context*/ 791 set_xen_guest_handle(memres.extent_start, mfns); 792 memres.domid = DOMID_SELF; 793 memres.nr_extents = page_cnt; 794 795 rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); 796 if (rv > 0) 797 atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv); 798 return (rv); 799 } 800 801 /* 802 * balloon_free_pages() 803 * free page_cnt pages, using any combination of mfns, pfns, and kva as long 804 * as they refer to the same mapping. If an array of mfns is passed in, we 805 * assume they were already cleared. Otherwise, we need to zero the pages 806 * before giving them back to the hypervisor. kva space is not free'd up in 807 * case the caller wants to re-use it. 808 */ 809 long 810 balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns) 811 { 812 xen_memory_reservation_t memdec; 813 mfn_t mfn; 814 pfn_t pfn; 815 uint_t i; 816 long e; 817 818 819 #if DEBUG 820 /* make sure kva is page aligned and maps to first pfn */ 821 if (kva != NULL) { 822 ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0); 823 if (pfns != NULL) { 824 ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]); 825 } 826 } 827 #endif 828 829 /* if we have a kva, we can clean all pages with just one bzero */ 830 if ((kva != NULL) && balloon_zero_memory) { 831 bzero(kva, (page_cnt * PAGESIZE)); 832 } 833 834 /* if we were given a kva and/or a pfn */ 835 if ((kva != NULL) || (pfns != NULL)) { 836 837 /* 838 * All the current callers only pass 1 page when using kva or 839 * pfns, and use mfns when passing multiple pages. If that 840 * assumption is changed, the following code will need some 841 * work. The following ASSERT() guarantees we're respecting 842 * the io locking quota. 843 */ 844 ASSERT(page_cnt < bln_contig_list_quota); 845 846 /* go through all the pages */ 847 for (i = 0; i < page_cnt; i++) { 848 849 /* get the next pfn */ 850 if (pfns == NULL) { 851 pfn = hat_getpfnum(kas.a_hat, 852 (kva + (PAGESIZE * i))); 853 } else { 854 pfn = pfns[i]; 855 } 856 857 /* 858 * if we didn't already zero this page, do it now. we 859 * need to do this *before* we give back the MFN 860 */ 861 if ((kva == NULL) && (balloon_zero_memory)) { 862 pfnzero(pfn, 0, PAGESIZE); 863 } 864 865 /* 866 * unmap the pfn. We don't free up the kva vmem space 867 * so the caller can re-use it. The page must be 868 * unmapped before it is given back to the hypervisor. 869 */ 870 if (kva != NULL) { 871 hat_unload(kas.a_hat, (kva + (PAGESIZE * i)), 872 PAGESIZE, HAT_UNLOAD_UNMAP); 873 } 874 875 /* grab the mfn before the pfn is marked as invalid */ 876 mfn = pfn_to_mfn(pfn); 877 878 /* mark the pfn as invalid */ 879 reassign_pfn(pfn, MFN_INVALID); 880 881 /* 882 * if we weren't given an array of MFNs, we need to 883 * free them up one at a time. Otherwise, we'll wait 884 * until later and do it in one hypercall 885 */ 886 if (mfns == NULL) { 887 bzero(&memdec, sizeof (memdec)); 888 /*LINTED: constant in conditional context*/ 889 set_xen_guest_handle(memdec.extent_start, &mfn); 890 memdec.domid = DOMID_SELF; 891 memdec.nr_extents = 1; 892 e = HYPERVISOR_memory_op( 893 XENMEM_decrease_reservation, &memdec); 894 if (e != 1) { 895 cmn_err(CE_PANIC, "balloon: unable to " 896 "give a page back to the " 897 "hypervisor.\n"); 898 } 899 } 900 } 901 } 902 903 /* 904 * if we were passed in MFNs, we haven't free'd them up yet. We can 905 * do it with one call. 906 */ 907 if (mfns != NULL) { 908 bzero(&memdec, sizeof (memdec)); 909 /*LINTED: constant in conditional context*/ 910 set_xen_guest_handle(memdec.extent_start, mfns); 911 memdec.domid = DOMID_SELF; 912 memdec.nr_extents = page_cnt; 913 e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec); 914 if (e != page_cnt) { 915 cmn_err(CE_PANIC, "balloon: unable to give pages back " 916 "to the hypervisor.\n"); 917 } 918 } 919 920 atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt); 921 return (page_cnt); 922 } 923 924 925 /* 926 * balloon_replace_pages() 927 * Try to replace nextexts blocks of 2^order pages. addr_bits specifies 928 * how many bits of address the pages must be within (i.e. 16 would mean 929 * that the pages cannot have an address > 64k). The constrints are on 930 * what the hypervisor gives us -- we are free to give any pages in 931 * exchange. The array pp is the pages we are giving away. The caller 932 * provides storage space for mfns, which hold the new physical pages. 933 */ 934 long 935 balloon_replace_pages(uint_t nextents, page_t **pp, uint_t addr_bits, 936 uint_t order, mfn_t *mfns) 937 { 938 xen_memory_reservation_t memres; 939 long fallback_cnt; 940 long cnt; 941 uint_t i, j, page_cnt, extlen; 942 long e; 943 int locked; 944 945 946 /* 947 * we shouldn't be allocating constrained pages on a guest. It doesn't 948 * make any sense. They won't be constrained after a migration. 949 */ 950 ASSERT(DOMAIN_IS_INITDOMAIN(xen_info)); 951 952 extlen = 1 << order; 953 page_cnt = nextents * extlen; 954 /* Give back the current pages to the hypervisor */ 955 for (i = 0; i < page_cnt; i++) { 956 cnt = balloon_free_pages(1, NULL, NULL, &pp[i]->p_pagenum); 957 if (cnt != 1) { 958 cmn_err(CE_PANIC, "balloon: unable to give a page back " 959 "to the hypervisor.\n"); 960 } 961 } 962 963 /* 964 * try to allocate the new pages using addr_bits and order. If we can't 965 * get all of the pages, try to get the remaining pages with no 966 * constraints and, if that was successful, return the number of 967 * constrained pages we did allocate. 968 */ 969 bzero(&memres, sizeof (memres)); 970 /*LINTED: constant in conditional context*/ 971 set_xen_guest_handle(memres.extent_start, mfns); 972 memres.domid = DOMID_SELF; 973 memres.nr_extents = nextents; 974 memres.mem_flags = XENMEMF_address_bits(addr_bits); 975 memres.extent_order = order; 976 cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); 977 /* assign the new MFNs to the current PFNs */ 978 locked = balloon_lock_contig_pfnlist(cnt * extlen); 979 for (i = 0; i < cnt; i++) { 980 for (j = 0; j < extlen; j++) { 981 reassign_pfn(pp[i * extlen + j]->p_pagenum, 982 mfns[i] + j); 983 } 984 } 985 if (locked) 986 unlock_contig_pfnlist(); 987 if (cnt != nextents) { 988 if (cnt < 0) { 989 cnt = 0; 990 } 991 992 /* 993 * We couldn't get enough memory to satisfy our requirements. 994 * The above loop will assign the parts of the request that 995 * were successful (this part may be 0). We need to fill 996 * in the rest. The bzero below clears out extent_order and 997 * address_bits, so we'll take anything from the hypervisor 998 * to replace the pages we gave away. 999 */ 1000 fallback_cnt = page_cnt - cnt * extlen; 1001 bzero(&memres, sizeof (memres)); 1002 /*LINTED: constant in conditional context*/ 1003 set_xen_guest_handle(memres.extent_start, mfns); 1004 memres.domid = DOMID_SELF; 1005 memres.nr_extents = fallback_cnt; 1006 e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); 1007 if (e != fallback_cnt) { 1008 cmn_err(CE_PANIC, "balloon: unable to recover from " 1009 "failed increase_reservation.\n"); 1010 } 1011 locked = balloon_lock_contig_pfnlist(fallback_cnt); 1012 for (i = 0; i < fallback_cnt; i++) { 1013 uint_t offset = page_cnt - fallback_cnt; 1014 1015 /* 1016 * We already used pp[0...(cnt * extlen)] before, 1017 * so start at the next entry in the pp array. 1018 */ 1019 reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]); 1020 } 1021 if (locked) 1022 unlock_contig_pfnlist(); 1023 } 1024 1025 /* 1026 * balloon_free_pages increments our counter. Decrement it here. 1027 */ 1028 atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt); 1029 1030 /* 1031 * return the number of extents we were able to replace. If we got 1032 * this far, we know all the pp's are valid. 1033 */ 1034 return (cnt); 1035 } 1036 1037 1038 /* 1039 * Called from the driver - return the requested stat. 1040 */ 1041 size_t 1042 balloon_values(int cmd) 1043 { 1044 switch (cmd) { 1045 case BLN_IOCTL_CURRENT: 1046 return (ptokb(bln_stats.bln_current_pages)); 1047 case BLN_IOCTL_TARGET: 1048 return (ptokb(bln_stats.bln_new_target)); 1049 case BLN_IOCTL_LOW: 1050 return (ptokb(bln_stats.bln_low)); 1051 case BLN_IOCTL_HIGH: 1052 return (ptokb(bln_stats.bln_high)); 1053 case BLN_IOCTL_LIMIT: 1054 return (ptokb(bln_stats.bln_hard_limit)); 1055 default: 1056 panic("Unexpected cmd %d in balloon_values()\n", cmd); 1057 } 1058 /*NOTREACHED*/ 1059 } 1060