/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/balloon_impl.h>
#include <sys/hypervisor.h>
#include <xen/sys/xenbus_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/callb.h>
#include <xen/public/memory.h>
#include <vm/hat.h>
#include <sys/promif.h>
#include <vm/seg_kmem.h>
#include <sys/memnode.h>
#include <sys/param.h>
#include <vm/vm_dep.h>
#include <sys/mman.h>
#include <sys/memlist.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/sdt.h>

/*
 * This file implements a balloon thread, which controls a domain's memory
 * reservation, or the amount of memory a domain is currently allocated.
 * The hypervisor provides the current memory reservation through xenbus,
 * so we register a watch on this.  We will then be signalled when the
 * reservation changes.  If it goes up, we map the new mfn's to our pfn's
 * (allocating page_t's if necessary), and release them into the system.
 * If the reservation goes down, we grab pages and release them back to
 * the hypervisor, saving the page_t's for later use.
 */

/*
 * Various structures needed by the balloon thread
 */
static bln_stats_t bln_stats;
static kthread_t *bln_thread;
static kmutex_t bln_mutex;
static kcondvar_t bln_cv;
static struct xenbus_watch bln_watch;
static mfn_t new_high_mfn;

/*
 * For holding spare page_t structures - keep a singly-linked list.
 * The list may hold both valid (pagenum < mfn_count) and invalid
 * (pagenum >= mfn_count) page_t's.  Valid page_t's should be inserted
 * at the front, and invalid page_t's at the back.  Removal should
 * always be from the front.  This is a singly-linked list using
 * p_next, so p_prev is always NULL.
 */
static page_t *bln_spare_list_front, *bln_spare_list_back;

int balloon_zero_memory = 1;
size_t balloon_minkmem = (8 * 1024 * 1024);

/*
 * reassign_pfn() calls update_contig_pfnlist(), which can cause a large
 * slowdown when calling multiple times.  If we're reassigning less than the
 * quota defined here, we just accept the slowdown.  If the count is greater
 * than the quota, we tell the contig alloc code to stop its accounting until
 * we're done.  Setting the quota to less than 2 is not supported.
 *
 * Note that we define our own wrapper around the external
 * clear_and_lock_contig_pfnlist(), but we just use the version of
 * unlock_contig_pfnlist() in vm_machdep.c.
 */
uint_t bln_contig_list_quota = 50;

extern void clear_and_lock_contig_pfnlist(void);
extern void unlock_contig_pfnlist(void);

/*
 * Lock the pfnlist if necessary (see above), and return whether we locked it.
 */
static int
balloon_lock_contig_pfnlist(int count)
{
	if (count > bln_contig_list_quota) {
		clear_and_lock_contig_pfnlist();
		return (1);
	} else {
		return (0);
	}
}
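
/*
 * Illustrative usage sketch (an editorial example, not additional driver
 * logic): callers bracket a batch of reassign_pfn() calls with the wrapper
 * above and the external unlock.  With the default quota of 50, a batch of,
 * say, 512 reassignments clears and locks the contig pfnlist once instead
 * of updating it 512 times:
 *
 *	locked = balloon_lock_contig_pfnlist(count);
 *	for (i = 0; i < count; i++)
 *		reassign_pfn(pfns[i], mfns[i]);
 *	if (locked)
 *		unlock_contig_pfnlist();
 */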

/*
 * The page represented by pp is being given back to the hypervisor.
 * Add the page_t structure to our spare list.
 */
static void
balloon_page_add(page_t *pp)
{
	/*
	 * We need to keep the page exclusively locked
	 * to prevent swrand from grabbing it.
	 */
	ASSERT(PAGE_EXCL(pp));
	ASSERT(MUTEX_HELD(&bln_mutex));

	pp->p_prev = NULL;
	if (bln_spare_list_front == NULL) {
		bln_spare_list_front = bln_spare_list_back = pp;
		pp->p_next = NULL;
	} else if (pp->p_pagenum >= mfn_count) {
		/*
		 * The pfn is invalid, so add at the end of list.  Since these
		 * adds should *only* be done by balloon_init_new_pages(), and
		 * that does adds in order, the following ASSERT should
		 * never trigger.
		 */
		ASSERT(pp->p_pagenum > bln_spare_list_back->p_pagenum);
		bln_spare_list_back->p_next = pp;
		pp->p_next = NULL;
		bln_spare_list_back = pp;
	} else {
		/* Add at beginning of list */
		pp->p_next = bln_spare_list_front;
		bln_spare_list_front = pp;
	}
}

/*
 * Return a page_t structure from our spare list, or NULL if none are
 * available.
 */
static page_t *
balloon_page_sub(void)
{
	page_t *pp;

	ASSERT(MUTEX_HELD(&bln_mutex));
	if (bln_spare_list_front == NULL) {
		return (NULL);
	}

	pp = bln_spare_list_front;
	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_pagenum <= mfn_count);
	if (pp->p_pagenum == mfn_count) {
		return (NULL);
	}

	bln_spare_list_front = pp->p_next;
	if (bln_spare_list_front == NULL)
		bln_spare_list_back = NULL;
	pp->p_next = NULL;
	return (pp);
}

/*
 * NOTE: We currently do not support growing beyond the boot memory size,
 * so the following function will not be called.  It is left in here with
 * the hope that someday this restriction can be lifted, and this code can
 * be used.
 */

/*
 * This structure is placed at the start of every block of new pages
 */
typedef struct {
	struct memseg	memseg;
	struct memlist	memlist;
	page_t		pages[1];
} mem_structs_t;

/*
 * To make the math below slightly less confusing, we calculate the first
 * two parts here.  page_t's are handled separately, so they are not included.
 */
#define	MEM_STRUCT_SIZE	(sizeof (struct memseg) + sizeof (struct memlist))

/*
 * We want to add memory, but have no spare page_t structures.  Use some of
 * our new memory for the page_t structures.
 *
 * Somewhat similar to kphysm_add_memory_dynamic(), but simpler.
 */
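
/*
 * Worked example of the meta-page math below (an editorial note; the page_t
 * size is assumed purely for illustration).  Growing by totalpgs pages means
 * the (totalpgs - metapgs) usable pages each need a page_t, and those
 * page_t's must fit in the metapgs pages we hold back:
 *
 *	(totalpgs - metapgs) * sizeof (page_t) <= metapgs * PAGESIZE
 *
 * Solving for the smallest such metapgs gives the expression used below:
 *
 *	metapgs = totalpgs - (totalpgs * PAGESIZE) /
 *	    (PAGESIZE + sizeof (page_t))
 *
 * For instance, with 4K pages and a hypothetical 128-byte page_t, growing by
 * totalpgs = 1024 gives metapgs = 1024 - (1024 * 4096) / 4224 = 32, leaving
 * 992 usable pages.  num_pages, computed from metasz further down, comes out
 * a bit larger than 992, which is why the code notes that more page_t's may
 * be initialized than are strictly needed.
 */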
static int
balloon_init_new_pages(mfn_t framelist[], pgcnt_t count)
{
	pgcnt_t metapgs, totalpgs, num_pages;
	paddr_t metasz;
	pfn_t meta_start;
	page_t *page_array;
	caddr_t va;
	int i, rv, locked;
	mem_structs_t *mem;
	struct memseg *segp;

	/* Calculate the number of pages we're going to add */
	totalpgs = bln_stats.bln_new_target - bln_stats.bln_current_pages;

	/*
	 * The following calculates the number of "meta" pages -- the pages
	 * that will be required to hold page_t structures for all new pages.
	 * Proof of this calculation is left up to the reader.
	 */
	metapgs = totalpgs - (((uint64_t)(totalpgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	/*
	 * Given the number of page_t structures we need, is there also
	 * room in our meta pages for a memseg and memlist struct?
	 * If not, we'll need one more meta page.
	 */
	if ((metapgs << PAGESHIFT) < (totalpgs * sizeof (page_t) +
	    MEM_STRUCT_SIZE))
		metapgs++;

	/*
	 * metapgs is calculated from totalpgs, which may be much larger than
	 * count.  If we don't have enough pages, all of the pages in this
	 * batch will be made meta pages, and a future trip through
	 * balloon_inc_reservation() will add the rest of the meta pages.
	 */
	if (metapgs > count)
		metapgs = count;

	/*
	 * Figure out the number of page_t structures that can fit in metapgs
	 *
	 * This will cause us to initialize more page_t structures than we
	 * need - these may be used in future memory increases.
	 */
	metasz = pfn_to_pa(metapgs);
	num_pages = (metasz - MEM_STRUCT_SIZE) / sizeof (page_t);

	DTRACE_PROBE3(balloon__alloc__stats, pgcnt_t, totalpgs, pgcnt_t,
	    num_pages, pgcnt_t, metapgs);

	/*
	 * We only increment mfn_count by count, not num_pages, to keep the
	 * space of all valid pfns contiguous.  This means we create page_t
	 * structures with invalid pagenums -- we deal with this situation
	 * in balloon_page_sub.
	 */
	mfn_count += count;

	/*
	 * Get a VA for the pages that will hold page_t and other structures.
	 * The memseg and memlist structures will go at the beginning, with
	 * the page_t structures following.
	 */
	va = (caddr_t)vmem_alloc(heap_arena, metasz, VM_SLEEP);
	/* LINTED: improper alignment */
	mem = (mem_structs_t *)va;
	page_array = mem->pages;

	meta_start = bln_stats.bln_max_pages;

	/*
	 * Set the mfn to pfn mapping for the meta pages.
	 */
	locked = balloon_lock_contig_pfnlist(metapgs);
	for (i = 0; i < metapgs; i++) {
		reassign_pfn(bln_stats.bln_max_pages + i, framelist[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * For our meta pages, map them in and zero the page.
	 * This will be the first time touching the new pages.
	 */
	hat_devload(kas.a_hat, va, metasz, bln_stats.bln_max_pages,
	    PROT_READ | PROT_WRITE,
	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
	bzero(va, metasz);

	/*
	 * Initialize the page array for the new pages.
	 */
	for (i = 0; i < metapgs; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
	}

	/*
	 * For the rest of the pages, initialize the page_t struct and
	 * add them to the free list
	 */
	for (i = metapgs; i < num_pages; i++) {
		page_array[i].p_pagenum = bln_stats.bln_max_pages++;
		page_array[i].p_offset = (u_offset_t)-1;
		page_iolock_init(&page_array[i]);
		rv = page_lock(&page_array[i], SE_EXCL, NULL, P_NO_RECLAIM);
		ASSERT(rv == 1);
		balloon_page_add(&page_array[i]);
	}

	/*
	 * Remember where I said that we don't call this function? The missing
	 * code right here is why.  We need to set up kpm mappings for any new
	 * pages coming in.  However, if someone starts up a domain with small
	 * memory, then greatly increases it, we could get in some horrible
	 * deadlock situations as we steal page tables for kpm use, and
	 * userland applications take them right back before we can use them
	 * to set up our new memory.  Once a way around that is found, and a
	 * few other changes are made, we'll be able to enable this code.
	 */

	/*
	 * Update kernel structures, part 1: memsegs list
	 */
	mem->memseg.pages_base = meta_start;
	mem->memseg.pages_end = bln_stats.bln_max_pages - 1;
	mem->memseg.pages = &page_array[0];
	mem->memseg.epages = &page_array[num_pages - 1];
	mem->memseg.next = NULL;
	memsegs_lock(1);
	for (segp = memsegs; segp->next != NULL; segp = segp->next)
		;
	segp->next = &mem->memseg;
	memsegs_unlock(1);

	/*
	 * Update kernel structures, part 2: mem_node array
	 */
	mem_node_add_slice(meta_start, bln_stats.bln_max_pages);

	/*
	 * Update kernel structures, part 3: phys_install array
	 * (*sigh* how many of these things do we need?)
	 */
	memlist_write_lock();
	memlist_add(pfn_to_pa(meta_start), num_pages, &mem->memlist,
	    &phys_install);
	memlist_write_unlock();

	build_pfn_hash();

	return (metapgs);
}

/* How many ulong_t's can we fit on a page? */
#define	FRAME_ARRAY_SIZE	(PAGESIZE / sizeof (ulong_t))

/*
 * These are too large to declare on the stack, so we make them static instead
 */
static ulong_t mfn_frames[FRAME_ARRAY_SIZE];
static pfn_t pfn_frames[FRAME_ARRAY_SIZE];
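
/*
 * Editorial sizing note (assuming 4K pages and 8-byte ulong_t's, for
 * illustration only): FRAME_ARRAY_SIZE works out to 4096 / 8 = 512, so a
 * single call to balloon_inc_reservation() or balloon_dec_reservation()
 * moves at most 512 pages (2MB).  Larger adjustments are completed over
 * multiple passes through the balloon worker thread's loop below, which
 * keeps retrying until the current reservation matches the target.
 */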

/*
 * This function is called when our reservation is increasing.  Make a
 * hypervisor call to get our new pages, then integrate them into the system.
 */
static spgcnt_t
balloon_inc_reservation(ulong_t credit)
{
	int i, cnt, locked;
	int meta_pg_start, meta_pg_end;
	long rv;
	page_t *pp;
	page_t *new_list_front, *new_list_back;

	/* Make sure we're single-threaded. */
	ASSERT(MUTEX_HELD(&bln_mutex));

	rv = 0;
	new_list_front = new_list_back = NULL;
	meta_pg_start = meta_pg_end = 0;
	bzero(mfn_frames, PAGESIZE);

	if (credit > FRAME_ARRAY_SIZE)
		credit = FRAME_ARRAY_SIZE;

	xen_block_migrate();
	rv = balloon_alloc_pages(credit, mfn_frames);

	if (rv < 0) {
		xen_allow_migrate();
		return (0);
	}
	for (i = 0; i < rv; i++) {
		if (mfn_frames[i] > new_high_mfn)
			new_high_mfn = mfn_frames[i];

		pp = balloon_page_sub();
		if (pp == NULL) {
			/*
			 * We pass the index into the current mfn array,
			 * then move the counter past the mfns we used
			 */
			meta_pg_start = i;
			cnt = balloon_init_new_pages(&mfn_frames[i], rv - i);
			i += cnt;
			meta_pg_end = i;
			if (i < rv) {
				pp = balloon_page_sub();
			} else {
				ASSERT(i == rv);
			}
		}
		if (pp == NULL) {
			break;
		}

		if (new_list_back == NULL) {
			new_list_front = new_list_back = pp;
		} else {
			new_list_back->p_next = pp;
			new_list_back = pp;
		}
		pp->p_next = NULL;
	}
	cnt = i;
	locked = balloon_lock_contig_pfnlist(cnt);
	for (i = 0, pp = new_list_front; i < meta_pg_start;
	    i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	for (i = meta_pg_end; i < cnt; i++, pp = pp->p_next) {
		reassign_pfn(pp->p_pagenum, mfn_frames[i]);
	}
	if (locked)
		unlock_contig_pfnlist();

	/*
	 * Make sure we don't allow pages without pfn->mfn mappings
	 * into the system.
	 */
	ASSERT(pp == NULL);

	while (new_list_front != NULL) {
		pp = new_list_front;
		new_list_front = pp->p_next;
		page_free(pp, 1);
	}

	/*
	 * Variable review: at this point, rv contains the number of pages
	 * the hypervisor gave us.  cnt contains the number of pages for which
	 * we had page_t structures.  i contains the number of pages
	 * where we set up pfn <-> mfn mappings.  If this ASSERT trips, that
	 * means we somehow lost page_t's from our local list.
	 */
	ASSERT(cnt == i);
	if (cnt < rv) {
		/*
		 * We couldn't get page structures.
		 *
		 * This shouldn't happen, but causes no real harm if it does.
		 * On debug kernels, we'll flag it.  On all kernels, we'll
		 * give back the pages we couldn't assign.
		 *
		 * Since these pages are new to the system and haven't been
		 * used, we don't bother zeroing them.
		 */
#ifdef DEBUG
		cmn_err(CE_WARN, "Could only assign %d of %ld pages", cnt, rv);
#endif	/* DEBUG */

		(void) balloon_free_pages(rv - cnt, &mfn_frames[i], NULL, NULL);

		rv = cnt;
	}

	xen_allow_migrate();
	page_unresv(rv - (meta_pg_end - meta_pg_start));
	return (rv);
}

/*
 * This function is called when we want to decrease the memory reservation
 * of our domain.  Allocate the memory and make a hypervisor call to give
 * it back.
 */
static spgcnt_t
balloon_dec_reservation(ulong_t debit)
{
	int i, locked;
	long rv;
	ulong_t request;
	page_t *pp;

	bzero(mfn_frames, sizeof (mfn_frames));
	bzero(pfn_frames, sizeof (pfn_frames));

	if (debit > FRAME_ARRAY_SIZE) {
		debit = FRAME_ARRAY_SIZE;
	}
	request = debit;

	/*
	 * Don't bother if there isn't a safe amount of kmem left.
515 */ 516 if (kmem_avail() < balloon_minkmem) { 517 kmem_reap(); 518 if (kmem_avail() < balloon_minkmem) 519 return (0); 520 } 521 522 if (page_resv(request, KM_NOSLEEP) == 0) { 523 return (0); 524 } 525 xen_block_migrate(); 526 for (i = 0; i < debit; i++) { 527 pp = page_get_high_mfn(new_high_mfn); 528 new_high_mfn = 0; 529 if (pp == NULL) { 530 /* 531 * Call kmem_reap(), then try once more, 532 * but only if there is a safe amount of 533 * kmem left. 534 */ 535 kmem_reap(); 536 if (kmem_avail() < balloon_minkmem || 537 (pp = page_get_high_mfn(0)) == NULL) { 538 debit = i; 539 break; 540 } 541 } 542 ASSERT(PAGE_EXCL(pp)); 543 ASSERT(!hat_page_is_mapped(pp)); 544 545 balloon_page_add(pp); 546 pfn_frames[i] = pp->p_pagenum; 547 mfn_frames[i] = pfn_to_mfn(pp->p_pagenum); 548 } 549 if (debit == 0) { 550 xen_allow_migrate(); 551 page_unresv(request); 552 return (0); 553 } 554 555 /* 556 * We zero all the pages before we start reassigning them in order to 557 * minimize the time spent holding the lock on the contig pfn list. 558 */ 559 if (balloon_zero_memory) { 560 for (i = 0; i < debit; i++) { 561 pfnzero(pfn_frames[i], 0, PAGESIZE); 562 } 563 } 564 565 /* 566 * Remove all mappings for the pfns from the system 567 */ 568 locked = balloon_lock_contig_pfnlist(debit); 569 for (i = 0; i < debit; i++) { 570 reassign_pfn(pfn_frames[i], MFN_INVALID); 571 } 572 if (locked) 573 unlock_contig_pfnlist(); 574 575 rv = balloon_free_pages(debit, mfn_frames, NULL, NULL); 576 577 if (rv < 0) { 578 cmn_err(CE_WARN, "Attempt to return pages to the hypervisor " 579 "failed - up to %lu pages lost (error = %ld)", debit, rv); 580 rv = 0; 581 } else if (rv != debit) { 582 panic("Unexpected return value (%ld) from decrease reservation " 583 "hypervisor call", rv); 584 } 585 586 xen_allow_migrate(); 587 if (debit != request) 588 page_unresv(request - debit); 589 return (rv); 590 } 591 592 /* 593 * This function is the callback which is called when the memory/target 594 * node is changed. When it is fired, we will read a new reservation 595 * target for our domain and signal the worker thread to make the change. 596 * 597 * If the reservation is larger than we can handle, we issue a warning. dom0 598 * does this automatically every boot, so we skip the first warning on dom0. 599 */ 600 /*ARGSUSED*/ 601 static void 602 balloon_handler(struct xenbus_watch *watch, const char **vec, uint_t len) 603 { 604 ulong_t new_target_kb; 605 pgcnt_t new_target_pages; 606 int rv; 607 static uchar_t warning_cnt = 0; 608 609 rv = xenbus_scanf(NULL, "memory", "target", "%lu", &new_target_kb); 610 if (rv != 0) { 611 return; 612 } 613 614 /* new_target is in kB - change this to pages */ 615 new_target_pages = kbtop(new_target_kb); 616 617 DTRACE_PROBE1(balloon__new__target, pgcnt_t, new_target_pages); 618 619 /* 620 * Unfortunately, dom0 may give us a target that is larger than 621 * our max limit. Re-check the limit, and, if the new target is 622 * too large, adjust it downwards. 623 */ 624 mutex_enter(&bln_mutex); 625 if (new_target_pages > bln_stats.bln_max_pages) { 626 DTRACE_PROBE2(balloon__target__too__large, pgcnt_t, 627 new_target_pages, pgcnt_t, bln_stats.bln_max_pages); 628 if (!DOMAIN_IS_INITDOMAIN(xen_info) || warning_cnt != 0) { 629 cmn_err(CE_WARN, "New balloon target (0x%lx pages) is " 630 "larger than original memory size (0x%lx pages). 
" 631 "Ballooning beyond original memory size is not " 632 "allowed.", 633 new_target_pages, bln_stats.bln_max_pages); 634 } 635 warning_cnt = 1; 636 bln_stats.bln_new_target = bln_stats.bln_max_pages; 637 } else { 638 bln_stats.bln_new_target = new_target_pages; 639 } 640 641 mutex_exit(&bln_mutex); 642 cv_signal(&bln_cv); 643 } 644 645 /* 646 * bln_wait_sec can be used to throttle the hv calls, but by default it's 647 * turned off. If a balloon attempt fails, the wait time is forced on, and 648 * then is exponentially increased as further attempts fail. 649 */ 650 uint_t bln_wait_sec = 0; 651 uint_t bln_wait_shift = 1; 652 653 /* 654 * This is the main balloon thread. Wait on the cv. When woken, if our 655 * reservation has changed, call the appropriate function to adjust the 656 * reservation. 657 */ 658 static void 659 balloon_worker_thread(void) 660 { 661 uint_t bln_wait; 662 callb_cpr_t cprinfo; 663 spgcnt_t rv; 664 665 bln_wait = bln_wait_sec; 666 667 CALLB_CPR_INIT(&cprinfo, &bln_mutex, callb_generic_cpr, "balloon"); 668 for (;;) { 669 rv = 0; 670 671 mutex_enter(&bln_mutex); 672 CALLB_CPR_SAFE_BEGIN(&cprinfo); 673 if (bln_stats.bln_new_target != bln_stats.bln_current_pages) { 674 /* 675 * We weren't able to fully complete the request 676 * last time through, so try again. 677 */ 678 (void) cv_timedwait(&bln_cv, &bln_mutex, 679 lbolt + (bln_wait * hz)); 680 } else { 681 cv_wait(&bln_cv, &bln_mutex); 682 } 683 CALLB_CPR_SAFE_END(&cprinfo, &bln_mutex); 684 685 if (bln_stats.bln_new_target != bln_stats.bln_current_pages) { 686 if (bln_stats.bln_new_target < 687 bln_stats.bln_current_pages) { 688 /* reservation shrunk */ 689 rv = -balloon_dec_reservation( 690 bln_stats.bln_current_pages - 691 bln_stats.bln_new_target); 692 } else if (bln_stats.bln_new_target > 693 bln_stats.bln_current_pages) { 694 /* reservation grew */ 695 rv = balloon_inc_reservation( 696 bln_stats.bln_new_target - 697 bln_stats.bln_current_pages); 698 } 699 } 700 if (rv == 0) { 701 if (bln_wait == 0) { 702 bln_wait = 1; 703 } else { 704 bln_wait <<= bln_wait_shift; 705 } 706 } else { 707 bln_stats.bln_current_pages += rv; 708 bln_wait = bln_wait_sec; 709 } 710 if (bln_stats.bln_current_pages < bln_stats.bln_low) 711 bln_stats.bln_low = bln_stats.bln_current_pages; 712 else if (bln_stats.bln_current_pages > bln_stats.bln_high) 713 bln_stats.bln_high = bln_stats.bln_current_pages; 714 mutex_exit(&bln_mutex); 715 } 716 } 717 718 /* 719 * Called after balloon_init(), which is below. The xenbus thread is up 720 * and running, so we can register our watch and create the balloon thread. 721 */ 722 static void 723 balloon_config_watch(int state) 724 { 725 if (state != XENSTORE_UP) 726 return; 727 728 bln_watch.node = "memory/target"; 729 bln_watch.callback = balloon_handler; 730 if (register_xenbus_watch(&bln_watch)) { 731 cmn_err(CE_WARN, "Failed to register balloon watcher; balloon " 732 "thread will be disabled"); 733 return; 734 } 735 736 if (bln_thread == NULL) 737 bln_thread = thread_create(NULL, 0, balloon_worker_thread, 738 NULL, 0, &p0, TS_RUN, minclsyspri); 739 } 740 741 /* 742 * Basic initialization of the balloon thread. Set all of our variables, 743 * and register a callback for later when we can register a xenbus watch. 
744 */ 745 void 746 balloon_init(pgcnt_t nr_pages) 747 { 748 domid_t domid = DOMID_SELF; 749 750 bln_stats.bln_current_pages = bln_stats.bln_low = nr_pages; 751 bln_stats.bln_new_target = bln_stats.bln_high = nr_pages; 752 bln_stats.bln_max_pages = nr_pages; 753 cv_init(&bln_cv, NULL, CV_DEFAULT, NULL); 754 755 bln_stats.bln_hard_limit = (spgcnt_t)HYPERVISOR_memory_op( 756 XENMEM_maximum_reservation, &domid); 757 758 (void) xs_register_xenbus_callback(balloon_config_watch); 759 } 760 761 /* 762 * These functions are called from the network drivers when they gain a page 763 * or give one away. We simply update our count. Note that the counter 764 * tracks the number of pages we give away, so we need to subtract any 765 * amount passed to balloon_drv_added. 766 */ 767 void 768 balloon_drv_added(int64_t delta) 769 { 770 atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -delta); 771 } 772 773 void 774 balloon_drv_subtracted(int64_t delta) 775 { 776 atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, delta); 777 } 778 779 /* 780 * balloon_alloc_pages() 781 * Allocate page_cnt mfns. mfns storage provided by the caller. Returns 782 * the number of pages allocated, which could be less than page_cnt, or 783 * a negative number if an error occurred. 784 */ 785 long 786 balloon_alloc_pages(uint_t page_cnt, mfn_t *mfns) 787 { 788 xen_memory_reservation_t memres; 789 long rv; 790 791 bzero(&memres, sizeof (memres)); 792 /*LINTED: constant in conditional context*/ 793 set_xen_guest_handle(memres.extent_start, mfns); 794 memres.domid = DOMID_SELF; 795 memres.nr_extents = page_cnt; 796 797 rv = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); 798 if (rv > 0) 799 atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -rv); 800 return (rv); 801 } 802 803 /* 804 * balloon_free_pages() 805 * free page_cnt pages, using any combination of mfns, pfns, and kva as long 806 * as they refer to the same mapping. If an array of mfns is passed in, we 807 * assume they were already cleared. Otherwise, we need to zero the pages 808 * before giving them back to the hypervisor. kva space is not free'd up in 809 * case the caller wants to re-use it. 810 */ 811 long 812 balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns) 813 { 814 xen_memory_reservation_t memdec; 815 mfn_t mfn; 816 pfn_t pfn; 817 uint_t i; 818 long e; 819 820 821 #if DEBUG 822 /* make sure kva is page aligned and maps to first pfn */ 823 if (kva != NULL) { 824 ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0); 825 if (pfns != NULL) { 826 ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]); 827 } 828 } 829 #endif 830 831 /* if we have a kva, we can clean all pages with just one bzero */ 832 if ((kva != NULL) && balloon_zero_memory) { 833 bzero(kva, (page_cnt * PAGESIZE)); 834 } 835 836 /* if we were given a kva and/or a pfn */ 837 if ((kva != NULL) || (pfns != NULL)) { 838 839 /* 840 * All the current callers only pass 1 page when using kva or 841 * pfns, and use mfns when passing multiple pages. If that 842 * assumption is changed, the following code will need some 843 * work. The following ASSERT() guarantees we're respecting 844 * the io locking quota. 845 */ 846 ASSERT(page_cnt < bln_contig_list_quota); 847 848 /* go through all the pages */ 849 for (i = 0; i < page_cnt; i++) { 850 851 /* get the next pfn */ 852 if (pfns == NULL) { 853 pfn = hat_getpfnum(kas.a_hat, 854 (kva + (PAGESIZE * i))); 855 } else { 856 pfn = pfns[i]; 857 } 858 859 /* 860 * if we didn't already zero this page, do it now. 

/*
 * balloon_free_pages()
 *	free page_cnt pages, using any combination of mfns, pfns, and kva as
 *	long as they refer to the same mapping.  If an array of mfns is passed
 *	in, we assume they were already cleared.  Otherwise, we need to zero
 *	the pages before giving them back to the hypervisor.  kva space is not
 *	free'd up in case the caller wants to re-use it.
 */
long
balloon_free_pages(uint_t page_cnt, mfn_t *mfns, caddr_t kva, pfn_t *pfns)
{
	xen_memory_reservation_t memdec;
	mfn_t mfn;
	pfn_t pfn;
	uint_t i;
	long e;

#ifdef DEBUG
	/* make sure kva is page aligned and maps to first pfn */
	if (kva != NULL) {
		ASSERT(((uintptr_t)kva & PAGEOFFSET) == 0);
		if (pfns != NULL) {
			ASSERT(hat_getpfnum(kas.a_hat, kva) == pfns[0]);
		}
	}
#endif

	/* if we have a kva, we can clean all pages with just one bzero */
	if ((kva != NULL) && balloon_zero_memory) {
		bzero(kva, (page_cnt * PAGESIZE));
	}

	/* if we were given a kva and/or a pfn */
	if ((kva != NULL) || (pfns != NULL)) {

		/*
		 * All the current callers only pass 1 page when using kva or
		 * pfns, and use mfns when passing multiple pages.  If that
		 * assumption is changed, the following code will need some
		 * work.  The following ASSERT() guarantees we're respecting
		 * the io locking quota.
		 */
		ASSERT(page_cnt < bln_contig_list_quota);

		/* go through all the pages */
		for (i = 0; i < page_cnt; i++) {

			/* get the next pfn */
			if (pfns == NULL) {
				pfn = hat_getpfnum(kas.a_hat,
				    (kva + (PAGESIZE * i)));
			} else {
				pfn = pfns[i];
			}

			/*
			 * if we didn't already zero this page, do it now.  we
			 * need to do this *before* we give back the MFN
			 */
			if ((kva == NULL) && (balloon_zero_memory)) {
				pfnzero(pfn, 0, PAGESIZE);
			}

			/*
			 * unmap the pfn.  We don't free up the kva vmem space
			 * so the caller can re-use it.  The page must be
			 * unmapped before it is given back to the hypervisor.
			 */
			if (kva != NULL) {
				hat_unload(kas.a_hat, (kva + (PAGESIZE * i)),
				    PAGESIZE, HAT_UNLOAD_UNMAP);
			}

			/* grab the mfn before the pfn is marked as invalid */
			mfn = pfn_to_mfn(pfn);

			/* mark the pfn as invalid */
			reassign_pfn(pfn, MFN_INVALID);

			/*
			 * if we weren't given an array of MFNs, we need to
			 * free them up one at a time.  Otherwise, we'll wait
			 * until later and do it in one hypercall
			 */
			if (mfns == NULL) {
				bzero(&memdec, sizeof (memdec));
				/*LINTED: constant in conditional context*/
				set_xen_guest_handle(memdec.extent_start, &mfn);
				memdec.domid = DOMID_SELF;
				memdec.nr_extents = 1;
				e = HYPERVISOR_memory_op(
				    XENMEM_decrease_reservation, &memdec);
				if (e != 1) {
					cmn_err(CE_PANIC, "balloon: unable to "
					    "give a page back to the "
					    "hypervisor.\n");
				}
			}
		}
	}

	/*
	 * if we were passed in MFNs, we haven't free'd them up yet.  We can
	 * do it with one call.
	 */
	if (mfns != NULL) {
		bzero(&memdec, sizeof (memdec));
		/*LINTED: constant in conditional context*/
		set_xen_guest_handle(memdec.extent_start, mfns);
		memdec.domid = DOMID_SELF;
		memdec.nr_extents = page_cnt;
		e = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &memdec);
		if (e != page_cnt) {
			cmn_err(CE_PANIC, "balloon: unable to give pages back "
			    "to the hypervisor.\n");
		}
	}

	atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, page_cnt);
	return (page_cnt);
}
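
/*
 * Editorial example of the parameters to balloon_replace_pages() below:
 * nextents = 4, order = 0, addr_bits = 32 asks the hypervisor for four
 * single-page extents located below 4GB, while order = 1 would ask for
 * 2^1 = 2 physically contiguous pages per extent, making
 * page_cnt = nextents * 2 the number of pages given away in exchange.
 */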
970 */ 971 bzero(&memres, sizeof (memres)); 972 /*LINTED: constant in conditional context*/ 973 set_xen_guest_handle(memres.extent_start, mfns); 974 memres.domid = DOMID_SELF; 975 memres.nr_extents = nextents; 976 memres.address_bits = addr_bits; 977 memres.extent_order = order; 978 cnt = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); 979 /* assign the new MFNs to the current PFNs */ 980 locked = balloon_lock_contig_pfnlist(cnt * extlen); 981 for (i = 0; i < cnt; i++) { 982 for (j = 0; j < extlen; j++) { 983 reassign_pfn(pp[i * extlen + j]->p_pagenum, 984 mfns[i] + j); 985 } 986 } 987 if (locked) 988 unlock_contig_pfnlist(); 989 if (cnt != nextents) { 990 if (cnt < 0) { 991 cnt = 0; 992 } 993 994 /* 995 * We couldn't get enough memory to satisfy our requirements. 996 * The above loop will assign the parts of the request that 997 * were successful (this part may be 0). We need to fill 998 * in the rest. The bzero below clears out extent_order and 999 * address_bits, so we'll take anything from the hypervisor 1000 * to replace the pages we gave away. 1001 */ 1002 fallback_cnt = page_cnt - cnt * extlen; 1003 bzero(&memres, sizeof (memres)); 1004 /*LINTED: constant in conditional context*/ 1005 set_xen_guest_handle(memres.extent_start, mfns); 1006 memres.domid = DOMID_SELF; 1007 memres.nr_extents = fallback_cnt; 1008 e = HYPERVISOR_memory_op(XENMEM_increase_reservation, &memres); 1009 if (e != fallback_cnt) { 1010 cmn_err(CE_PANIC, "balloon: unable to recover from " 1011 "failed increase_reservation.\n"); 1012 } 1013 locked = balloon_lock_contig_pfnlist(fallback_cnt); 1014 for (i = 0; i < fallback_cnt; i++) { 1015 uint_t offset = page_cnt - fallback_cnt; 1016 1017 /* 1018 * We already used pp[0...(cnt * extlen)] before, 1019 * so start at the next entry in the pp array. 1020 */ 1021 reassign_pfn(pp[i + offset]->p_pagenum, mfns[i]); 1022 } 1023 if (locked) 1024 unlock_contig_pfnlist(); 1025 } 1026 1027 /* 1028 * balloon_free_pages increments our counter. Decrement it here. 1029 */ 1030 atomic_add_long((ulong_t *)&bln_stats.bln_hv_pages, -(long)page_cnt); 1031 1032 /* 1033 * return the number of extents we were able to replace. If we got 1034 * this far, we know all the pp's are valid. 1035 */ 1036 return (cnt); 1037 } 1038 1039 1040 /* 1041 * Called from the driver - return the requested stat. 1042 */ 1043 size_t 1044 balloon_values(int cmd) 1045 { 1046 switch (cmd) { 1047 case BLN_IOCTL_CURRENT: 1048 return (ptokb(bln_stats.bln_current_pages)); 1049 case BLN_IOCTL_TARGET: 1050 return (ptokb(bln_stats.bln_new_target)); 1051 case BLN_IOCTL_LOW: 1052 return (ptokb(bln_stats.bln_low)); 1053 case BLN_IOCTL_HIGH: 1054 return (ptokb(bln_stats.bln_high)); 1055 case BLN_IOCTL_LIMIT: 1056 return (ptokb(bln_stats.bln_hard_limit)); 1057 default: 1058 panic("Unexpected cmd %d in balloon_values()\n", cmd); 1059 } 1060 /*NOTREACHED*/ 1061 } 1062