1 /****************************************************************************** 2 * Xen balloon driver - enables returning/claiming memory to/from Xen. 3 * 4 * Copyright (c) 2003, B Dragovic 5 * Copyright (c) 2003-2004, M Williamson, K Fraser 6 * Copyright (c) 2005 Dan M. Smith, IBM Corporation 7 * Copyright (c) 2010 Daniel Kiper 8 * 9 * Memory hotplug support was written by Daniel Kiper. Work on 10 * it was sponsored by Google under Google Summer of Code 2010 11 * program. Jeremy Fitzhardinge from Citrix was the mentor for 12 * this project. 13 * 14 * This program is free software; you can redistribute it and/or 15 * modify it under the terms of the GNU General Public License version 2 16 * as published by the Free Software Foundation; or, when distributed 17 * separately from the Linux kernel or incorporated into other 18 * software packages, subject to the following license: 19 * 20 * Permission is hereby granted, free of charge, to any person obtaining a copy 21 * of this source file (the "Software"), to deal in the Software without 22 * restriction, including without limitation the rights to use, copy, modify, 23 * merge, publish, distribute, sublicense, and/or sell copies of the Software, 24 * and to permit persons to whom the Software is furnished to do so, subject to 25 * the following conditions: 26 * 27 * The above copyright notice and this permission notice shall be included in 28 * all copies or substantial portions of the Software. 29 * 30 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 35 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 36 * IN THE SOFTWARE. 
*/

/* Prefix used by this file's pr_*() log messages. */
#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/percpu-defs.h>
#include <linux/slab.h>
#include <linux/sysctl.h>

#include <asm/page.h>
#include <asm/tlb.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/balloon.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/mem-reservation.h>

/*
 * When non-zero, add_ballooned_pages() first tries to obtain pages by
 * hotplugging additional memory (reserve_additional_memory()) before it
 * falls back to ballooning out populated pages. Defined unconditionally
 * because add_ballooned_pages() reads it regardless of the hotplug config.
 */
static int xen_hotplug_unpopulated;

#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG

/*
 * sysctl leaf "hotplug_unpopulated": an int backed by
 * xen_hotplug_unpopulated, handled by proc_dointvec_minmax with
 * SYSCTL_ZERO / SYSCTL_ONE supplied as extra1 / extra2 (its bounds).
 */
static struct ctl_table balloon_table[] = {
	{
		.procname	= "hotplug_unpopulated",
		.data		= &xen_hotplug_unpopulated,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1         = SYSCTL_ZERO,
		.extra2         = SYSCTL_ONE,
	},
	{ }
};

/* Directory "balloon", parent of balloon_table. */
static struct ctl_table balloon_root[] = {
	{
		.procname	= "balloon",
		.mode		= 0555,
		.child		= balloon_table,
	},
	{ }
};

/*
 * Directory "xen", parent of balloon_root. This is the table that
 * balloon_init() passes to register_sysctl_table().
 */
static struct ctl_table xen_root[] = {
	{
		.procname	= "xen",
		.mode		= 0555,
		.child		= balloon_root,
	},
	{ }
};

#endif

/*
 * Use one extent per PAGE_SIZE so that a page is not broken down into
 * multiple frames. Evaluates to the index of the highest set bit of
 * XEN_PFN_PER_PAGE.
 */
#define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1)

/*
 * balloon_process() state:
 *
 * BP_DONE: done or nothing to do,
 * BP_WAIT: wait to be rescheduled,
 * BP_EAGAIN: error, go to sleep,
 * BP_ECANCELED: error, balloon operation canceled.
124 */ 125 126 enum bp_state { 127 BP_DONE, 128 BP_WAIT, 129 BP_EAGAIN, 130 BP_ECANCELED 131 }; 132 133 134 static DEFINE_MUTEX(balloon_mutex); 135 136 struct balloon_stats balloon_stats; 137 EXPORT_SYMBOL_GPL(balloon_stats); 138 139 /* We increase/decrease in batches which fit in a page */ 140 static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)]; 141 142 143 /* List of ballooned pages, threaded through the mem_map array. */ 144 static LIST_HEAD(ballooned_pages); 145 static DECLARE_WAIT_QUEUE_HEAD(balloon_wq); 146 147 /* Main work function, always executed in process context. */ 148 static void balloon_process(struct work_struct *work); 149 static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); 150 151 /* When ballooning out (allocating memory to return to Xen) we don't really 152 want the kernel to try too hard since that can trigger the oom killer. */ 153 #define GFP_BALLOON \ 154 (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC) 155 156 /* balloon_append: add the given page to the balloon. */ 157 static void balloon_append(struct page *page) 158 { 159 __SetPageOffline(page); 160 161 /* Lowmem is re-populated first, so highmem pages go at list tail. */ 162 if (PageHighMem(page)) { 163 list_add_tail(&page->lru, &ballooned_pages); 164 balloon_stats.balloon_high++; 165 } else { 166 list_add(&page->lru, &ballooned_pages); 167 balloon_stats.balloon_low++; 168 } 169 wake_up(&balloon_wq); 170 } 171 172 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ 173 static struct page *balloon_retrieve(bool require_lowmem) 174 { 175 struct page *page; 176 177 if (list_empty(&ballooned_pages)) 178 return NULL; 179 180 page = list_entry(ballooned_pages.next, struct page, lru); 181 if (require_lowmem && PageHighMem(page)) 182 return NULL; 183 list_del(&page->lru); 184 185 if (PageHighMem(page)) 186 balloon_stats.balloon_high--; 187 else 188 balloon_stats.balloon_low--; 189 190 __ClearPageOffline(page); 191 return page; 192 } 193 194 static struct page *balloon_next_page(struct page *page) 195 { 196 struct list_head *next = page->lru.next; 197 if (next == &ballooned_pages) 198 return NULL; 199 return list_entry(next, struct page, lru); 200 } 201 202 static enum bp_state update_schedule(enum bp_state state) 203 { 204 if (state == BP_WAIT) 205 return BP_WAIT; 206 207 if (state == BP_ECANCELED) 208 return BP_ECANCELED; 209 210 if (state == BP_DONE) { 211 balloon_stats.schedule_delay = 1; 212 balloon_stats.retry_count = 1; 213 return BP_DONE; 214 } 215 216 ++balloon_stats.retry_count; 217 218 if (balloon_stats.max_retry_count != RETRY_UNLIMITED && 219 balloon_stats.retry_count > balloon_stats.max_retry_count) { 220 balloon_stats.schedule_delay = 1; 221 balloon_stats.retry_count = 1; 222 return BP_ECANCELED; 223 } 224 225 balloon_stats.schedule_delay <<= 1; 226 227 if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) 228 balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; 229 230 return BP_EAGAIN; 231 } 232 233 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG 234 static void release_memory_resource(struct resource *resource) 235 { 236 if (!resource) 237 return; 238 239 /* 240 * No need to reset region to identity mapped since we now 241 * know that no I/O can be in this region 242 */ 243 release_resource(resource); 244 kfree(resource); 245 } 246 247 static struct resource *additional_memory_resource(phys_addr_t size) 248 { 249 struct resource *res; 250 int ret; 251 252 res = kzalloc(sizeof(*res), 
GFP_KERNEL); 253 if (!res) 254 return NULL; 255 256 res->name = "System RAM"; 257 res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 258 259 ret = allocate_resource(&iomem_resource, res, 260 size, 0, -1, 261 PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL); 262 if (ret < 0) { 263 pr_err("Cannot allocate new System RAM resource\n"); 264 kfree(res); 265 return NULL; 266 } 267 268 return res; 269 } 270 271 static enum bp_state reserve_additional_memory(void) 272 { 273 long credit; 274 struct resource *resource; 275 int nid, rc; 276 unsigned long balloon_hotplug; 277 278 credit = balloon_stats.target_pages + balloon_stats.target_unpopulated 279 - balloon_stats.total_pages; 280 281 /* 282 * Already hotplugged enough pages? Wait for them to be 283 * onlined. 284 */ 285 if (credit <= 0) 286 return BP_WAIT; 287 288 balloon_hotplug = round_up(credit, PAGES_PER_SECTION); 289 290 resource = additional_memory_resource(balloon_hotplug * PAGE_SIZE); 291 if (!resource) 292 goto err; 293 294 nid = memory_add_physaddr_to_nid(resource->start); 295 296 #ifdef CONFIG_XEN_HAVE_PVMMU 297 /* 298 * We don't support PV MMU when Linux and Xen is using 299 * different page granularity. 300 */ 301 BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); 302 303 /* 304 * add_memory() will build page tables for the new memory so 305 * the p2m must contain invalid entries so the correct 306 * non-present PTEs will be written. 307 * 308 * If a failure occurs, the original (identity) p2m entries 309 * are not restored since this region is now known not to 310 * conflict with any devices. 
311 */ 312 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 313 unsigned long pfn, i; 314 315 pfn = PFN_DOWN(resource->start); 316 for (i = 0; i < balloon_hotplug; i++) { 317 if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) { 318 pr_warn("set_phys_to_machine() failed, no memory added\n"); 319 goto err; 320 } 321 } 322 } 323 #endif 324 325 /* 326 * add_memory_resource() will call online_pages() which in its turn 327 * will call xen_online_page() callback causing deadlock if we don't 328 * release balloon_mutex here. Unlocking here is safe because the 329 * callers drop the mutex before trying again. 330 */ 331 mutex_unlock(&balloon_mutex); 332 /* add_memory_resource() requires the device_hotplug lock */ 333 lock_device_hotplug(); 334 rc = add_memory_resource(nid, resource); 335 unlock_device_hotplug(); 336 mutex_lock(&balloon_mutex); 337 338 if (rc) { 339 pr_warn("Cannot add additional memory (%i)\n", rc); 340 goto err; 341 } 342 343 balloon_stats.total_pages += balloon_hotplug; 344 345 return BP_WAIT; 346 err: 347 release_memory_resource(resource); 348 return BP_ECANCELED; 349 } 350 351 static void xen_online_page(struct page *page, unsigned int order) 352 { 353 unsigned long i, size = (1 << order); 354 unsigned long start_pfn = page_to_pfn(page); 355 struct page *p; 356 357 pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); 358 mutex_lock(&balloon_mutex); 359 for (i = 0; i < size; i++) { 360 p = pfn_to_page(start_pfn + i); 361 balloon_append(p); 362 } 363 mutex_unlock(&balloon_mutex); 364 } 365 366 static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) 367 { 368 if (val == MEM_ONLINE) 369 schedule_delayed_work(&balloon_worker, 0); 370 371 return NOTIFY_OK; 372 } 373 374 static struct notifier_block xen_memory_nb = { 375 .notifier_call = xen_memory_notifier, 376 .priority = 0 377 }; 378 #else 379 static enum bp_state reserve_additional_memory(void) 380 { 381 balloon_stats.target_pages = 
balloon_stats.current_pages + 382 balloon_stats.target_unpopulated; 383 return BP_ECANCELED; 384 } 385 #endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ 386 387 static long current_credit(void) 388 { 389 return balloon_stats.target_pages - balloon_stats.current_pages; 390 } 391 392 static bool balloon_is_inflated(void) 393 { 394 return balloon_stats.balloon_low || balloon_stats.balloon_high; 395 } 396 397 static enum bp_state increase_reservation(unsigned long nr_pages) 398 { 399 int rc; 400 unsigned long i; 401 struct page *page; 402 403 if (nr_pages > ARRAY_SIZE(frame_list)) 404 nr_pages = ARRAY_SIZE(frame_list); 405 406 page = list_first_entry_or_null(&ballooned_pages, struct page, lru); 407 for (i = 0; i < nr_pages; i++) { 408 if (!page) { 409 nr_pages = i; 410 break; 411 } 412 413 frame_list[i] = page_to_xen_pfn(page); 414 page = balloon_next_page(page); 415 } 416 417 rc = xenmem_reservation_increase(nr_pages, frame_list); 418 if (rc <= 0) 419 return BP_EAGAIN; 420 421 for (i = 0; i < rc; i++) { 422 page = balloon_retrieve(false); 423 BUG_ON(page == NULL); 424 425 xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]); 426 427 /* Relinquish the page back to the allocator. */ 428 free_reserved_page(page); 429 } 430 431 balloon_stats.current_pages += rc; 432 433 return BP_DONE; 434 } 435 436 static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) 437 { 438 enum bp_state state = BP_DONE; 439 unsigned long i; 440 struct page *page, *tmp; 441 int ret; 442 LIST_HEAD(pages); 443 444 if (nr_pages > ARRAY_SIZE(frame_list)) 445 nr_pages = ARRAY_SIZE(frame_list); 446 447 for (i = 0; i < nr_pages; i++) { 448 page = alloc_page(gfp); 449 if (page == NULL) { 450 nr_pages = i; 451 state = BP_EAGAIN; 452 break; 453 } 454 adjust_managed_page_count(page, -1); 455 xenmem_reservation_scrub_page(page); 456 list_add(&page->lru, &pages); 457 } 458 459 /* 460 * Ensure that ballooned highmem pages don't have kmaps. 
461 * 462 * Do this before changing the p2m as kmap_flush_unused() 463 * reads PTEs to obtain pages (and hence needs the original 464 * p2m entry). 465 */ 466 kmap_flush_unused(); 467 468 /* 469 * Setup the frame, update direct mapping, invalidate P2M, 470 * and add to balloon. 471 */ 472 i = 0; 473 list_for_each_entry_safe(page, tmp, &pages, lru) { 474 frame_list[i++] = xen_page_to_gfn(page); 475 476 xenmem_reservation_va_mapping_reset(1, &page); 477 478 list_del(&page->lru); 479 480 balloon_append(page); 481 } 482 483 flush_tlb_all(); 484 485 ret = xenmem_reservation_decrease(nr_pages, frame_list); 486 BUG_ON(ret != nr_pages); 487 488 balloon_stats.current_pages -= nr_pages; 489 490 return state; 491 } 492 493 /* 494 * As this is a work item it is guaranteed to run as a single instance only. 495 * We may of course race updates of the target counts (which are protected 496 * by the balloon lock), or with changes to the Xen hard limit, but we will 497 * recover from these in time. 498 */ 499 static void balloon_process(struct work_struct *work) 500 { 501 enum bp_state state = BP_DONE; 502 long credit; 503 504 505 do { 506 mutex_lock(&balloon_mutex); 507 508 credit = current_credit(); 509 510 if (credit > 0) { 511 if (balloon_is_inflated()) 512 state = increase_reservation(credit); 513 else 514 state = reserve_additional_memory(); 515 } 516 517 if (credit < 0) { 518 long n_pages; 519 520 n_pages = min(-credit, si_mem_available()); 521 state = decrease_reservation(n_pages, GFP_BALLOON); 522 if (state == BP_DONE && n_pages != -credit && 523 n_pages < totalreserve_pages) 524 state = BP_EAGAIN; 525 } 526 527 state = update_schedule(state); 528 529 mutex_unlock(&balloon_mutex); 530 531 cond_resched(); 532 533 } while (credit && state == BP_DONE); 534 535 /* Schedule more work if there is some still to be done. 
*/ 536 if (state == BP_EAGAIN) 537 schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); 538 } 539 540 /* Resets the Xen limit, sets new target, and kicks off processing. */ 541 void balloon_set_new_target(unsigned long target) 542 { 543 /* No need for lock. Not read-modify-write updates. */ 544 balloon_stats.target_pages = target; 545 schedule_delayed_work(&balloon_worker, 0); 546 } 547 EXPORT_SYMBOL_GPL(balloon_set_new_target); 548 549 static int add_ballooned_pages(int nr_pages) 550 { 551 enum bp_state st; 552 553 if (xen_hotplug_unpopulated) { 554 st = reserve_additional_memory(); 555 if (st != BP_ECANCELED) { 556 int rc; 557 558 mutex_unlock(&balloon_mutex); 559 rc = wait_event_interruptible(balloon_wq, 560 !list_empty(&ballooned_pages)); 561 mutex_lock(&balloon_mutex); 562 return rc ? -ENOMEM : 0; 563 } 564 } 565 566 if (si_mem_available() < nr_pages) 567 return -ENOMEM; 568 569 st = decrease_reservation(nr_pages, GFP_USER); 570 if (st != BP_DONE) 571 return -ENOMEM; 572 573 return 0; 574 } 575 576 /** 577 * alloc_xenballooned_pages - get pages that have been ballooned out 578 * @nr_pages: Number of pages to get 579 * @pages: pages returned 580 * @return 0 on success, error otherwise 581 */ 582 int alloc_xenballooned_pages(int nr_pages, struct page **pages) 583 { 584 int pgno = 0; 585 struct page *page; 586 int ret; 587 588 mutex_lock(&balloon_mutex); 589 590 balloon_stats.target_unpopulated += nr_pages; 591 592 while (pgno < nr_pages) { 593 page = balloon_retrieve(true); 594 if (page) { 595 pages[pgno++] = page; 596 #ifdef CONFIG_XEN_HAVE_PVMMU 597 /* 598 * We don't support PV MMU when Linux and Xen is using 599 * different page granularity. 
600 */ 601 BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE); 602 603 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 604 ret = xen_alloc_p2m_entry(page_to_pfn(page)); 605 if (ret < 0) 606 goto out_undo; 607 } 608 #endif 609 } else { 610 ret = add_ballooned_pages(nr_pages - pgno); 611 if (ret < 0) 612 goto out_undo; 613 } 614 } 615 mutex_unlock(&balloon_mutex); 616 return 0; 617 out_undo: 618 mutex_unlock(&balloon_mutex); 619 free_xenballooned_pages(pgno, pages); 620 /* 621 * NB: free_xenballooned_pages will only subtract pgno pages, but since 622 * target_unpopulated is incremented with nr_pages at the start we need 623 * to remove the remaining ones also, or accounting will be screwed. 624 */ 625 balloon_stats.target_unpopulated -= nr_pages - pgno; 626 return ret; 627 } 628 EXPORT_SYMBOL(alloc_xenballooned_pages); 629 630 /** 631 * free_xenballooned_pages - return pages retrieved with get_ballooned_pages 632 * @nr_pages: Number of pages 633 * @pages: pages to return 634 */ 635 void free_xenballooned_pages(int nr_pages, struct page **pages) 636 { 637 int i; 638 639 mutex_lock(&balloon_mutex); 640 641 for (i = 0; i < nr_pages; i++) { 642 if (pages[i]) 643 balloon_append(pages[i]); 644 } 645 646 balloon_stats.target_unpopulated -= nr_pages; 647 648 /* The balloon may be too large now. Shrink it if needed. */ 649 if (current_credit()) 650 schedule_delayed_work(&balloon_worker, 0); 651 652 mutex_unlock(&balloon_mutex); 653 } 654 EXPORT_SYMBOL(free_xenballooned_pages); 655 656 #if defined(CONFIG_XEN_PV) && !defined(CONFIG_XEN_UNPOPULATED_ALLOC) 657 static void __init balloon_add_region(unsigned long start_pfn, 658 unsigned long pages) 659 { 660 unsigned long pfn, extra_pfn_end; 661 662 /* 663 * If the amount of usable memory has been limited (e.g., with 664 * the 'mem' command line parameter), don't add pages beyond 665 * this limit. 
666 */ 667 extra_pfn_end = min(max_pfn, start_pfn + pages); 668 669 for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) { 670 /* totalram_pages and totalhigh_pages do not 671 include the boot-time balloon extension, so 672 don't subtract from it. */ 673 balloon_append(pfn_to_page(pfn)); 674 } 675 676 balloon_stats.total_pages += extra_pfn_end - start_pfn; 677 } 678 #endif 679 680 static int __init balloon_init(void) 681 { 682 if (!xen_domain()) 683 return -ENODEV; 684 685 pr_info("Initialising balloon driver\n"); 686 687 #ifdef CONFIG_XEN_PV 688 balloon_stats.current_pages = xen_pv_domain() 689 ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) 690 : get_num_physpages(); 691 #else 692 balloon_stats.current_pages = get_num_physpages(); 693 #endif 694 balloon_stats.target_pages = balloon_stats.current_pages; 695 balloon_stats.balloon_low = 0; 696 balloon_stats.balloon_high = 0; 697 balloon_stats.total_pages = balloon_stats.current_pages; 698 699 balloon_stats.schedule_delay = 1; 700 balloon_stats.max_schedule_delay = 32; 701 balloon_stats.retry_count = 1; 702 balloon_stats.max_retry_count = 4; 703 704 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG 705 set_online_page_callback(&xen_online_page); 706 register_memory_notifier(&xen_memory_nb); 707 register_sysctl_table(xen_root); 708 #endif 709 710 #if defined(CONFIG_XEN_PV) && !defined(CONFIG_XEN_UNPOPULATED_ALLOC) 711 { 712 int i; 713 714 /* 715 * Initialize the balloon with pages from the extra memory 716 * regions (see arch/x86/xen/setup.c). 717 */ 718 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) 719 if (xen_extra_mem[i].n_pfns) 720 balloon_add_region(xen_extra_mem[i].start_pfn, 721 xen_extra_mem[i].n_pfns); 722 } 723 #endif 724 725 /* Init the xen-balloon driver. */ 726 xen_balloon_init(); 727 728 return 0; 729 } 730 subsys_initcall(balloon_init); 731