1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Virtio-mem device driver. 4 * 5 * Copyright Red Hat, Inc. 2020 6 * 7 * Author(s): David Hildenbrand <david@redhat.com> 8 */ 9 10 #include <linux/virtio.h> 11 #include <linux/virtio_mem.h> 12 #include <linux/workqueue.h> 13 #include <linux/slab.h> 14 #include <linux/module.h> 15 #include <linux/mm.h> 16 #include <linux/memory_hotplug.h> 17 #include <linux/memory.h> 18 #include <linux/hrtimer.h> 19 #include <linux/crash_dump.h> 20 #include <linux/mutex.h> 21 #include <linux/bitmap.h> 22 #include <linux/lockdep.h> 23 #include <linux/log2.h> 24 #include <linux/vmalloc.h> 25 #include <linux/suspend.h> 26 27 #include <acpi/acpi_numa.h> 28 29 static bool unplug_online = true; 30 module_param(unplug_online, bool, 0644); 31 MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); 32 33 static bool force_bbm; 34 module_param(force_bbm, bool, 0444); 35 MODULE_PARM_DESC(force_bbm, 36 "Force Big Block Mode. Default is 0 (auto-selection)"); 37 38 static unsigned long bbm_block_size; 39 module_param(bbm_block_size, ulong, 0444); 40 MODULE_PARM_DESC(bbm_block_size, 41 "Big Block size in bytes. Default is 0 (auto-detection)."); 42 43 /* 44 * virtio-mem currently supports the following modes of operation: 45 * 46 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The 47 * size of a Sub Block (SB) is determined based on the device block size, the 48 * pageblock size, and the maximum allocation granularity of the buddy. 49 * Subblocks within a Linux memory block might either be plugged or unplugged. 50 * Memory is added/removed to Linux MM in Linux memory block granularity. 51 * 52 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. 53 * Memory is added/removed to Linux MM in Big Block granularity. 54 * 55 * The mode is determined automatically based on the Linux memory block size 56 * and the device block size. 
57 * 58 * User space / core MM (auto onlining) is responsible for onlining added 59 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are 60 * always onlined separately, and all memory within a Linux memory block is 61 * onlined to the same zone - virtio-mem relies on this behavior. 62 */ 63 64 /* 65 * State of a Linux memory block in SBM. 66 */ 67 enum virtio_mem_sbm_mb_state { 68 /* Unplugged, not added to Linux. Can be reused later. */ 69 VIRTIO_MEM_SBM_MB_UNUSED = 0, 70 /* (Partially) plugged, not added to Linux. Error on add_memory(). */ 71 VIRTIO_MEM_SBM_MB_PLUGGED, 72 /* Fully plugged, fully added to Linux, offline. */ 73 VIRTIO_MEM_SBM_MB_OFFLINE, 74 /* Partially plugged, fully added to Linux, offline. */ 75 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 76 /* Fully plugged, fully added to Linux, onlined to a kernel zone. */ 77 VIRTIO_MEM_SBM_MB_KERNEL, 78 /* Partially plugged, fully added to Linux, online to a kernel zone */ 79 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 80 /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 81 VIRTIO_MEM_SBM_MB_MOVABLE, 82 /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 83 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 84 VIRTIO_MEM_SBM_MB_COUNT 85 }; 86 87 /* 88 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks. 89 */ 90 enum virtio_mem_bbm_bb_state { 91 /* Unplugged, not added to Linux. Can be reused later. */ 92 VIRTIO_MEM_BBM_BB_UNUSED = 0, 93 /* Plugged, not added to Linux. Error on add_memory(). */ 94 VIRTIO_MEM_BBM_BB_PLUGGED, 95 /* Plugged and added to Linux. */ 96 VIRTIO_MEM_BBM_BB_ADDED, 97 /* All online parts are fake-offline, ready to remove. */ 98 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE, 99 VIRTIO_MEM_BBM_BB_COUNT 100 }; 101 102 struct virtio_mem { 103 struct virtio_device *vdev; 104 105 /* We might first have to unplug all memory when starting up. */ 106 bool unplug_all_required; 107 108 /* Workqueue that processes the plug/unplug requests. 
*/ 109 struct work_struct wq; 110 atomic_t wq_active; 111 atomic_t config_changed; 112 113 /* Virtqueue for guest->host requests. */ 114 struct virtqueue *vq; 115 116 /* Wait for a host response to a guest request. */ 117 wait_queue_head_t host_resp; 118 119 /* Space for one guest request and the host response. */ 120 struct virtio_mem_req req; 121 struct virtio_mem_resp resp; 122 123 /* The current size of the device. */ 124 uint64_t plugged_size; 125 /* The requested size of the device. */ 126 uint64_t requested_size; 127 128 /* The device block size (for communicating with the device). */ 129 uint64_t device_block_size; 130 /* The determined node id for all memory of the device. */ 131 int nid; 132 /* Physical start address of the memory region. */ 133 uint64_t addr; 134 /* Maximum region size in bytes. */ 135 uint64_t region_size; 136 137 /* The parent resource for all memory added via this device. */ 138 struct resource *parent_resource; 139 /* 140 * Copy of "System RAM (virtio_mem)" to be used for 141 * add_memory_driver_managed(). 142 */ 143 const char *resource_name; 144 /* Memory group identification. */ 145 int mgid; 146 147 /* 148 * We don't want to add too much memory if it's not getting onlined, 149 * to avoid running OOM. Besides this threshold, we allow to have at 150 * least two offline blocks at a time (whatever is bigger). 151 */ 152 #define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024) 153 atomic64_t offline_size; 154 uint64_t offline_threshold; 155 156 /* If set, the driver is in SBM, otherwise in BBM. */ 157 bool in_sbm; 158 159 union { 160 struct { 161 /* Id of the first memory block of this device. */ 162 unsigned long first_mb_id; 163 /* Id of the last usable memory block of this device. */ 164 unsigned long last_usable_mb_id; 165 /* Id of the next memory bock to prepare when needed. */ 166 unsigned long next_mb_id; 167 168 /* The subblock size. */ 169 uint64_t sb_size; 170 /* The number of subblocks per Linux memory block. 
*/ 171 uint32_t sbs_per_mb; 172 173 /* 174 * Some of the Linux memory blocks tracked as "partially 175 * plugged" are completely unplugged and can be offlined 176 * and removed -- which previously failed. 177 */ 178 bool have_unplugged_mb; 179 180 /* Summary of all memory block states. */ 181 unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; 182 183 /* 184 * One byte state per memory block. Allocated via 185 * vmalloc(). Resized (alloc+copy+free) on demand. 186 * 187 * With 128 MiB memory blocks, we have states for 512 188 * GiB of memory in one 4 KiB page. 189 */ 190 uint8_t *mb_states; 191 192 /* 193 * Bitmap: one bit per subblock. Allocated similar to 194 * sbm.mb_states. 195 * 196 * A set bit means the corresponding subblock is 197 * plugged, otherwise it's unblocked. 198 * 199 * With 4 MiB subblocks, we manage 128 GiB of memory 200 * in one 4 KiB page. 201 */ 202 unsigned long *sb_states; 203 } sbm; 204 205 struct { 206 /* Id of the first big block of this device. */ 207 unsigned long first_bb_id; 208 /* Id of the last usable big block of this device. */ 209 unsigned long last_usable_bb_id; 210 /* Id of the next device bock to prepare when needed. */ 211 unsigned long next_bb_id; 212 213 /* Summary of all big block states. */ 214 unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT]; 215 216 /* One byte state per big block. See sbm.mb_states. */ 217 uint8_t *bb_states; 218 219 /* The block size used for plugging/adding/removing. */ 220 uint64_t bb_size; 221 } bbm; 222 }; 223 224 /* 225 * Mutex that protects the sbm.mb_count, sbm.mb_states, 226 * sbm.sb_states, bbm.bb_count, and bbm.bb_states 227 * 228 * When this lock is held the pointers can't change, ONLINE and 229 * OFFLINE blocks can't change the state and no subblocks will get 230 * plugged/unplugged. 231 * 232 * In kdump mode, used to serialize requests, last_block_addr and 233 * last_block_plugged. 
234 */ 235 struct mutex hotplug_mutex; 236 bool hotplug_active; 237 238 /* An error occurred we cannot handle - stop processing requests. */ 239 bool broken; 240 241 /* Cached valued of is_kdump_kernel() when the device was probed. */ 242 bool in_kdump; 243 244 /* The driver is being removed. */ 245 spinlock_t removal_lock; 246 bool removing; 247 248 /* Timer for retrying to plug/unplug memory. */ 249 struct hrtimer retry_timer; 250 unsigned int retry_timer_ms; 251 #define VIRTIO_MEM_RETRY_TIMER_MIN_MS 50000 252 #define VIRTIO_MEM_RETRY_TIMER_MAX_MS 300000 253 254 /* Memory notifier (online/offline events). */ 255 struct notifier_block memory_notifier; 256 257 /* Notifier to block hibernation image storing/reloading. */ 258 struct notifier_block pm_notifier; 259 260 #ifdef CONFIG_PROC_VMCORE 261 /* vmcore callback for /proc/vmcore handling in kdump mode */ 262 struct vmcore_cb vmcore_cb; 263 uint64_t last_block_addr; 264 bool last_block_plugged; 265 #endif /* CONFIG_PROC_VMCORE */ 266 267 /* Next device in the list of virtio-mem devices. */ 268 struct list_head next; 269 }; 270 271 /* 272 * We have to share a single online_page callback among all virtio-mem 273 * devices. We use RCU to iterate the list in the callback. 274 */ 275 static DEFINE_MUTEX(virtio_mem_mutex); 276 static LIST_HEAD(virtio_mem_devices); 277 278 static void virtio_mem_online_page_cb(struct page *page, unsigned int order); 279 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 280 unsigned long nr_pages); 281 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 282 unsigned long nr_pages); 283 static void virtio_mem_retry(struct virtio_mem *vm); 284 static int virtio_mem_create_resource(struct virtio_mem *vm); 285 static void virtio_mem_delete_resource(struct virtio_mem *vm); 286 287 /* 288 * Register a virtio-mem device so it will be considered for the online_page 289 * callback. 
290 */ 291 static int register_virtio_mem_device(struct virtio_mem *vm) 292 { 293 int rc = 0; 294 295 /* First device registers the callback. */ 296 mutex_lock(&virtio_mem_mutex); 297 if (list_empty(&virtio_mem_devices)) 298 rc = set_online_page_callback(&virtio_mem_online_page_cb); 299 if (!rc) 300 list_add_rcu(&vm->next, &virtio_mem_devices); 301 mutex_unlock(&virtio_mem_mutex); 302 303 return rc; 304 } 305 306 /* 307 * Unregister a virtio-mem device so it will no longer be considered for the 308 * online_page callback. 309 */ 310 static void unregister_virtio_mem_device(struct virtio_mem *vm) 311 { 312 /* Last device unregisters the callback. */ 313 mutex_lock(&virtio_mem_mutex); 314 list_del_rcu(&vm->next); 315 if (list_empty(&virtio_mem_devices)) 316 restore_online_page_callback(&virtio_mem_online_page_cb); 317 mutex_unlock(&virtio_mem_mutex); 318 319 synchronize_rcu(); 320 } 321 322 /* 323 * Calculate the memory block id of a given address. 324 */ 325 static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr) 326 { 327 return addr / memory_block_size_bytes(); 328 } 329 330 /* 331 * Calculate the physical start address of a given memory block id. 332 */ 333 static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) 334 { 335 return mb_id * memory_block_size_bytes(); 336 } 337 338 /* 339 * Calculate the big block id of a given address. 340 */ 341 static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, 342 uint64_t addr) 343 { 344 return addr / vm->bbm.bb_size; 345 } 346 347 /* 348 * Calculate the physical start address of a given big block id. 349 */ 350 static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, 351 unsigned long bb_id) 352 { 353 return bb_id * vm->bbm.bb_size; 354 } 355 356 /* 357 * Calculate the subblock id of a given address. 
358 */ 359 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, 360 unsigned long addr) 361 { 362 const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); 363 const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); 364 365 return (addr - mb_addr) / vm->sbm.sb_size; 366 } 367 368 /* 369 * Set the state of a big block, taking care of the state counter. 370 */ 371 static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, 372 unsigned long bb_id, 373 enum virtio_mem_bbm_bb_state state) 374 { 375 const unsigned long idx = bb_id - vm->bbm.first_bb_id; 376 enum virtio_mem_bbm_bb_state old_state; 377 378 old_state = vm->bbm.bb_states[idx]; 379 vm->bbm.bb_states[idx] = state; 380 381 BUG_ON(vm->bbm.bb_count[old_state] == 0); 382 vm->bbm.bb_count[old_state]--; 383 vm->bbm.bb_count[state]++; 384 } 385 386 /* 387 * Get the state of a big block. 388 */ 389 static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, 390 unsigned long bb_id) 391 { 392 return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; 393 } 394 395 /* 396 * Prepare the big block state array for the next big block. 
397 */ 398 static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) 399 { 400 unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; 401 unsigned long new_bytes = old_bytes + 1; 402 int old_pages = PFN_UP(old_bytes); 403 int new_pages = PFN_UP(new_bytes); 404 uint8_t *new_array; 405 406 if (vm->bbm.bb_states && old_pages == new_pages) 407 return 0; 408 409 new_array = vzalloc(new_pages * PAGE_SIZE); 410 if (!new_array) 411 return -ENOMEM; 412 413 mutex_lock(&vm->hotplug_mutex); 414 if (vm->bbm.bb_states) 415 memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); 416 vfree(vm->bbm.bb_states); 417 vm->bbm.bb_states = new_array; 418 mutex_unlock(&vm->hotplug_mutex); 419 420 return 0; 421 } 422 423 #define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ 424 for (_bb_id = vm->bbm.first_bb_id; \ 425 _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ 426 _bb_id++) \ 427 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 428 429 #define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ 430 for (_bb_id = vm->bbm.next_bb_id - 1; \ 431 _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ 432 _bb_id--) \ 433 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 434 435 /* 436 * Set the state of a memory block, taking care of the state counter. 437 */ 438 static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, 439 unsigned long mb_id, uint8_t state) 440 { 441 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 442 uint8_t old_state; 443 444 old_state = vm->sbm.mb_states[idx]; 445 vm->sbm.mb_states[idx] = state; 446 447 BUG_ON(vm->sbm.mb_count[old_state] == 0); 448 vm->sbm.mb_count[old_state]--; 449 vm->sbm.mb_count[state]++; 450 } 451 452 /* 453 * Get the state of a memory block. 
454 */ 455 static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, 456 unsigned long mb_id) 457 { 458 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 459 460 return vm->sbm.mb_states[idx]; 461 } 462 463 /* 464 * Prepare the state array for the next memory block. 465 */ 466 static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) 467 { 468 int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); 469 int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); 470 uint8_t *new_array; 471 472 if (vm->sbm.mb_states && old_pages == new_pages) 473 return 0; 474 475 new_array = vzalloc(new_pages * PAGE_SIZE); 476 if (!new_array) 477 return -ENOMEM; 478 479 mutex_lock(&vm->hotplug_mutex); 480 if (vm->sbm.mb_states) 481 memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); 482 vfree(vm->sbm.mb_states); 483 vm->sbm.mb_states = new_array; 484 mutex_unlock(&vm->hotplug_mutex); 485 486 return 0; 487 } 488 489 #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ 490 for (_mb_id = _vm->sbm.first_mb_id; \ 491 _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ 492 _mb_id++) \ 493 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 494 495 #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ 496 for (_mb_id = _vm->sbm.next_mb_id - 1; \ 497 _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ 498 _mb_id--) \ 499 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 500 501 /* 502 * Calculate the bit number in the subblock bitmap for the given subblock 503 * inside the given memory block. 504 */ 505 static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, 506 unsigned long mb_id, int sb_id) 507 { 508 return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; 509 } 510 511 /* 512 * Mark all selected subblocks plugged. 513 * 514 * Will not modify the state of the memory block. 
515 */ 516 static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, 517 unsigned long mb_id, int sb_id, 518 int count) 519 { 520 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 521 522 __bitmap_set(vm->sbm.sb_states, bit, count); 523 } 524 525 /* 526 * Mark all selected subblocks unplugged. 527 * 528 * Will not modify the state of the memory block. 529 */ 530 static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, 531 unsigned long mb_id, int sb_id, 532 int count) 533 { 534 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 535 536 __bitmap_clear(vm->sbm.sb_states, bit, count); 537 } 538 539 /* 540 * Test if all selected subblocks are plugged. 541 */ 542 static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, 543 unsigned long mb_id, int sb_id, 544 int count) 545 { 546 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 547 548 if (count == 1) 549 return test_bit(bit, vm->sbm.sb_states); 550 551 /* TODO: Helper similar to bitmap_set() */ 552 return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= 553 bit + count; 554 } 555 556 /* 557 * Test if all selected subblocks are unplugged. 558 */ 559 static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, 560 unsigned long mb_id, int sb_id, 561 int count) 562 { 563 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 564 565 /* TODO: Helper similar to bitmap_set() */ 566 return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= 567 bit + count; 568 } 569 570 /* 571 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is 572 * none. 573 */ 574 static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, 575 unsigned long mb_id) 576 { 577 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); 578 579 return find_next_zero_bit(vm->sbm.sb_states, 580 bit + vm->sbm.sbs_per_mb, bit) - bit; 581 } 582 583 /* 584 * Prepare the subblock bitmap for the next memory block. 
585 */ 586 static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) 587 { 588 const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; 589 const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; 590 const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; 591 int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); 592 int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); 593 unsigned long *new_bitmap, *old_bitmap; 594 595 if (vm->sbm.sb_states && old_pages == new_pages) 596 return 0; 597 598 new_bitmap = vzalloc(new_pages * PAGE_SIZE); 599 if (!new_bitmap) 600 return -ENOMEM; 601 602 mutex_lock(&vm->hotplug_mutex); 603 if (vm->sbm.sb_states) 604 memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); 605 606 old_bitmap = vm->sbm.sb_states; 607 vm->sbm.sb_states = new_bitmap; 608 mutex_unlock(&vm->hotplug_mutex); 609 610 vfree(old_bitmap); 611 return 0; 612 } 613 614 /* 615 * Test if we could add memory without creating too much offline memory - 616 * to avoid running OOM if memory is getting onlined deferred. 617 */ 618 static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) 619 { 620 if (WARN_ON_ONCE(size > vm->offline_threshold)) 621 return false; 622 623 return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; 624 } 625 626 /* 627 * Try adding memory to Linux. Will usually only fail if out of memory. 628 * 629 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 630 * onlining code). 631 * 632 * Will not modify the state of memory blocks in virtio-mem. 633 */ 634 static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, 635 uint64_t size) 636 { 637 int rc; 638 639 /* 640 * When force-unloading the driver and we still have memory added to 641 * Linux, the resource name has to stay. 
642 */ 643 if (!vm->resource_name) { 644 vm->resource_name = kstrdup_const("System RAM (virtio_mem)", 645 GFP_KERNEL); 646 if (!vm->resource_name) 647 return -ENOMEM; 648 } 649 650 dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, 651 addr + size - 1); 652 /* Memory might get onlined immediately. */ 653 atomic64_add(size, &vm->offline_size); 654 rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name, 655 MHP_MERGE_RESOURCE | MHP_NID_IS_MGID); 656 if (rc) { 657 atomic64_sub(size, &vm->offline_size); 658 dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); 659 /* 660 * TODO: Linux MM does not properly clean up yet in all cases 661 * where adding of memory failed - especially on -ENOMEM. 662 */ 663 } 664 return rc; 665 } 666 667 /* 668 * See virtio_mem_add_memory(): Try adding a single Linux memory block. 669 */ 670 static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) 671 { 672 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 673 const uint64_t size = memory_block_size_bytes(); 674 675 return virtio_mem_add_memory(vm, addr, size); 676 } 677 678 /* 679 * See virtio_mem_add_memory(): Try adding a big block. 680 */ 681 static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) 682 { 683 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 684 const uint64_t size = vm->bbm.bb_size; 685 686 return virtio_mem_add_memory(vm, addr, size); 687 } 688 689 /* 690 * Try removing memory from Linux. Will only fail if memory blocks aren't 691 * offline. 692 * 693 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 694 * onlining code). 695 * 696 * Will not modify the state of memory blocks in virtio-mem. 
697 */ 698 static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, 699 uint64_t size) 700 { 701 int rc; 702 703 dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, 704 addr + size - 1); 705 rc = remove_memory(addr, size); 706 if (!rc) { 707 atomic64_sub(size, &vm->offline_size); 708 /* 709 * We might have freed up memory we can now unplug, retry 710 * immediately instead of waiting. 711 */ 712 virtio_mem_retry(vm); 713 } else { 714 dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); 715 } 716 return rc; 717 } 718 719 /* 720 * See virtio_mem_remove_memory(): Try removing a single Linux memory block. 721 */ 722 static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) 723 { 724 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 725 const uint64_t size = memory_block_size_bytes(); 726 727 return virtio_mem_remove_memory(vm, addr, size); 728 } 729 730 /* 731 * Try offlining and removing memory from Linux. 732 * 733 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 734 * onlining code). 735 * 736 * Will not modify the state of memory blocks in virtio-mem. 737 */ 738 static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, 739 uint64_t addr, 740 uint64_t size) 741 { 742 int rc; 743 744 dev_dbg(&vm->vdev->dev, 745 "offlining and removing memory: 0x%llx - 0x%llx\n", addr, 746 addr + size - 1); 747 748 rc = offline_and_remove_memory(addr, size); 749 if (!rc) { 750 atomic64_sub(size, &vm->offline_size); 751 /* 752 * We might have freed up memory we can now unplug, retry 753 * immediately instead of waiting. 754 */ 755 virtio_mem_retry(vm); 756 return 0; 757 } 758 dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc); 759 /* 760 * We don't really expect this to fail, because we fake-offlined all 761 * memory already. But it could fail in corner cases. 762 */ 763 WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY); 764 return rc == -ENOMEM ? 
-ENOMEM : -EBUSY; 765 } 766 767 /* 768 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing 769 * a single Linux memory block. 770 */ 771 static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, 772 unsigned long mb_id) 773 { 774 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 775 const uint64_t size = memory_block_size_bytes(); 776 777 return virtio_mem_offline_and_remove_memory(vm, addr, size); 778 } 779 780 /* 781 * Try (offlining and) removing memory from Linux in case all subblocks are 782 * unplugged. Can be called on online and offline memory blocks. 783 * 784 * May modify the state of memory blocks in virtio-mem. 785 */ 786 static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm, 787 unsigned long mb_id) 788 { 789 int rc; 790 791 /* 792 * Once all subblocks of a memory block were unplugged, offline and 793 * remove it. 794 */ 795 if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 796 return 0; 797 798 /* offline_and_remove_memory() works for online and offline memory. */ 799 mutex_unlock(&vm->hotplug_mutex); 800 rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); 801 mutex_lock(&vm->hotplug_mutex); 802 if (!rc) 803 virtio_mem_sbm_set_mb_state(vm, mb_id, 804 VIRTIO_MEM_SBM_MB_UNUSED); 805 return rc; 806 } 807 808 /* 809 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a 810 * all Linux memory blocks covered by the big block. 811 */ 812 static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, 813 unsigned long bb_id) 814 { 815 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 816 const uint64_t size = vm->bbm.bb_size; 817 818 return virtio_mem_offline_and_remove_memory(vm, addr, size); 819 } 820 821 /* 822 * Trigger the workqueue so the device can perform its magic. 
823 */ 824 static void virtio_mem_retry(struct virtio_mem *vm) 825 { 826 unsigned long flags; 827 828 spin_lock_irqsave(&vm->removal_lock, flags); 829 if (!vm->removing) 830 queue_work(system_freezable_wq, &vm->wq); 831 spin_unlock_irqrestore(&vm->removal_lock, flags); 832 } 833 834 static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) 835 { 836 int node = NUMA_NO_NODE; 837 838 #if defined(CONFIG_ACPI_NUMA) 839 if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM)) 840 node = pxm_to_node(node_id); 841 #endif 842 return node; 843 } 844 845 /* 846 * Test if a virtio-mem device overlaps with the given range. Can be called 847 * from (notifier) callbacks lockless. 848 */ 849 static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, 850 uint64_t size) 851 { 852 return start < vm->addr + vm->region_size && vm->addr < start + size; 853 } 854 855 /* 856 * Test if a virtio-mem device contains a given range. Can be called from 857 * (notifier) callbacks lockless. 
858 */ 859 static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, 860 uint64_t size) 861 { 862 return start >= vm->addr && start + size <= vm->addr + vm->region_size; 863 } 864 865 static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, 866 unsigned long mb_id) 867 { 868 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 869 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 870 case VIRTIO_MEM_SBM_MB_OFFLINE: 871 return NOTIFY_OK; 872 default: 873 break; 874 } 875 dev_warn_ratelimited(&vm->vdev->dev, 876 "memory block onlining denied\n"); 877 return NOTIFY_BAD; 878 } 879 880 static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, 881 unsigned long mb_id) 882 { 883 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 884 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 885 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 886 virtio_mem_sbm_set_mb_state(vm, mb_id, 887 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 888 break; 889 case VIRTIO_MEM_SBM_MB_KERNEL: 890 case VIRTIO_MEM_SBM_MB_MOVABLE: 891 virtio_mem_sbm_set_mb_state(vm, mb_id, 892 VIRTIO_MEM_SBM_MB_OFFLINE); 893 break; 894 default: 895 BUG(); 896 break; 897 } 898 } 899 900 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, 901 unsigned long mb_id, 902 unsigned long start_pfn) 903 { 904 const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn)); 905 int new_state; 906 907 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 908 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 909 new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; 910 if (is_movable) 911 new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; 912 break; 913 case VIRTIO_MEM_SBM_MB_OFFLINE: 914 new_state = VIRTIO_MEM_SBM_MB_KERNEL; 915 if (is_movable) 916 new_state = VIRTIO_MEM_SBM_MB_MOVABLE; 917 break; 918 default: 919 BUG(); 920 break; 921 } 922 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 923 } 924 925 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, 926 unsigned long mb_id) 927 { 928 const unsigned long nr_pages = 
PFN_DOWN(vm->sbm.sb_size); 929 unsigned long pfn; 930 int sb_id; 931 932 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 933 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 934 continue; 935 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 936 sb_id * vm->sbm.sb_size); 937 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 938 } 939 } 940 941 static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, 942 unsigned long mb_id) 943 { 944 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 945 unsigned long pfn; 946 int sb_id; 947 948 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 949 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 950 continue; 951 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 952 sb_id * vm->sbm.sb_size); 953 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 954 } 955 } 956 957 static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, 958 unsigned long bb_id, 959 unsigned long pfn, 960 unsigned long nr_pages) 961 { 962 /* 963 * When marked as "fake-offline", all online memory of this device block 964 * is allocated by us. Otherwise, we don't have any memory allocated. 965 */ 966 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 967 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 968 return; 969 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 970 } 971 972 static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, 973 unsigned long bb_id, 974 unsigned long pfn, 975 unsigned long nr_pages) 976 { 977 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 978 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 979 return; 980 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 981 } 982 983 /* 984 * This callback will either be called synchronously from add_memory() or 985 * asynchronously (e.g., triggered via user space). We have to be careful 986 * with locking when calling add_memory(). 
987 */ 988 static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, 989 unsigned long action, void *arg) 990 { 991 struct virtio_mem *vm = container_of(nb, struct virtio_mem, 992 memory_notifier); 993 struct memory_notify *mhp = arg; 994 const unsigned long start = PFN_PHYS(mhp->start_pfn); 995 const unsigned long size = PFN_PHYS(mhp->nr_pages); 996 int rc = NOTIFY_OK; 997 unsigned long id; 998 999 if (!virtio_mem_overlaps_range(vm, start, size)) 1000 return NOTIFY_DONE; 1001 1002 if (vm->in_sbm) { 1003 id = virtio_mem_phys_to_mb_id(start); 1004 /* 1005 * In SBM, we add memory in separate memory blocks - we expect 1006 * it to be onlined/offlined in the same granularity. Bail out 1007 * if this ever changes. 1008 */ 1009 if (WARN_ON_ONCE(size != memory_block_size_bytes() || 1010 !IS_ALIGNED(start, memory_block_size_bytes()))) 1011 return NOTIFY_BAD; 1012 } else { 1013 id = virtio_mem_phys_to_bb_id(vm, start); 1014 /* 1015 * In BBM, we only care about onlining/offlining happening 1016 * within a single big block, we don't care about the 1017 * actual granularity as we don't track individual Linux 1018 * memory blocks. 1019 */ 1020 if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) 1021 return NOTIFY_BAD; 1022 } 1023 1024 /* 1025 * Avoid circular locking lockdep warnings. We lock the mutex 1026 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The 1027 * blocking_notifier_call_chain() has it's own lock, which gets unlocked 1028 * between both notifier calls and will bail out. False positive. 
1029 */ 1030 lockdep_off(); 1031 1032 switch (action) { 1033 case MEM_GOING_OFFLINE: 1034 mutex_lock(&vm->hotplug_mutex); 1035 if (vm->removing) { 1036 rc = notifier_from_errno(-EBUSY); 1037 mutex_unlock(&vm->hotplug_mutex); 1038 break; 1039 } 1040 vm->hotplug_active = true; 1041 if (vm->in_sbm) 1042 virtio_mem_sbm_notify_going_offline(vm, id); 1043 else 1044 virtio_mem_bbm_notify_going_offline(vm, id, 1045 mhp->start_pfn, 1046 mhp->nr_pages); 1047 break; 1048 case MEM_GOING_ONLINE: 1049 mutex_lock(&vm->hotplug_mutex); 1050 if (vm->removing) { 1051 rc = notifier_from_errno(-EBUSY); 1052 mutex_unlock(&vm->hotplug_mutex); 1053 break; 1054 } 1055 vm->hotplug_active = true; 1056 if (vm->in_sbm) 1057 rc = virtio_mem_sbm_notify_going_online(vm, id); 1058 break; 1059 case MEM_OFFLINE: 1060 if (vm->in_sbm) 1061 virtio_mem_sbm_notify_offline(vm, id); 1062 1063 atomic64_add(size, &vm->offline_size); 1064 /* 1065 * Trigger the workqueue. Now that we have some offline memory, 1066 * maybe we can handle pending unplug requests. 1067 */ 1068 if (!unplug_online) 1069 virtio_mem_retry(vm); 1070 1071 vm->hotplug_active = false; 1072 mutex_unlock(&vm->hotplug_mutex); 1073 break; 1074 case MEM_ONLINE: 1075 if (vm->in_sbm) 1076 virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); 1077 1078 atomic64_sub(size, &vm->offline_size); 1079 /* 1080 * Start adding more memory once we onlined half of our 1081 * threshold. Don't trigger if it's possibly due to our actipn 1082 * (e.g., us adding memory which gets onlined immediately from 1083 * the core). 
1084 */ 1085 if (!atomic_read(&vm->wq_active) && 1086 virtio_mem_could_add_memory(vm, vm->offline_threshold / 2)) 1087 virtio_mem_retry(vm); 1088 1089 vm->hotplug_active = false; 1090 mutex_unlock(&vm->hotplug_mutex); 1091 break; 1092 case MEM_CANCEL_OFFLINE: 1093 if (!vm->hotplug_active) 1094 break; 1095 if (vm->in_sbm) 1096 virtio_mem_sbm_notify_cancel_offline(vm, id); 1097 else 1098 virtio_mem_bbm_notify_cancel_offline(vm, id, 1099 mhp->start_pfn, 1100 mhp->nr_pages); 1101 vm->hotplug_active = false; 1102 mutex_unlock(&vm->hotplug_mutex); 1103 break; 1104 case MEM_CANCEL_ONLINE: 1105 if (!vm->hotplug_active) 1106 break; 1107 vm->hotplug_active = false; 1108 mutex_unlock(&vm->hotplug_mutex); 1109 break; 1110 default: 1111 break; 1112 } 1113 1114 lockdep_on(); 1115 1116 return rc; 1117 } 1118 1119 static int virtio_mem_pm_notifier_cb(struct notifier_block *nb, 1120 unsigned long action, void *arg) 1121 { 1122 struct virtio_mem *vm = container_of(nb, struct virtio_mem, 1123 pm_notifier); 1124 switch (action) { 1125 case PM_HIBERNATION_PREPARE: 1126 case PM_RESTORE_PREPARE: 1127 /* 1128 * When restarting the VM, all memory is unplugged. Don't 1129 * allow to hibernate and restore from an image. 1130 */ 1131 dev_err(&vm->vdev->dev, "hibernation is not supported.\n"); 1132 return NOTIFY_BAD; 1133 default: 1134 return NOTIFY_OK; 1135 } 1136 } 1137 1138 /* 1139 * Set a range of pages PG_offline. Remember pages that were never onlined 1140 * (via generic_online_page()) using PageDirty(). 1141 */ 1142 static void virtio_mem_set_fake_offline(unsigned long pfn, 1143 unsigned long nr_pages, bool onlined) 1144 { 1145 page_offline_begin(); 1146 for (; nr_pages--; pfn++) { 1147 struct page *page = pfn_to_page(pfn); 1148 1149 __SetPageOffline(page); 1150 if (!onlined) { 1151 SetPageDirty(page); 1152 /* FIXME: remove after cleanups */ 1153 ClearPageReserved(page); 1154 } 1155 } 1156 page_offline_end(); 1157 } 1158 1159 /* 1160 * Clear PG_offline from a range of pages. 
If the pages were never onlined, 1161 * (via generic_online_page()), clear PageDirty(). 1162 */ 1163 static void virtio_mem_clear_fake_offline(unsigned long pfn, 1164 unsigned long nr_pages, bool onlined) 1165 { 1166 for (; nr_pages--; pfn++) { 1167 struct page *page = pfn_to_page(pfn); 1168 1169 __ClearPageOffline(page); 1170 if (!onlined) 1171 ClearPageDirty(page); 1172 } 1173 } 1174 1175 /* 1176 * Release a range of fake-offline pages to the buddy, effectively 1177 * fake-onlining them. 1178 */ 1179 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) 1180 { 1181 unsigned long order = MAX_PAGE_ORDER; 1182 unsigned long i; 1183 1184 /* 1185 * We might get called for ranges that don't cover properly aligned 1186 * MAX_PAGE_ORDER pages; however, we can only online properly aligned 1187 * pages with an order of MAX_PAGE_ORDER at maximum. 1188 */ 1189 while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) 1190 order--; 1191 1192 for (i = 0; i < nr_pages; i += 1 << order) { 1193 struct page *page = pfn_to_page(pfn + i); 1194 1195 /* 1196 * If the page is PageDirty(), it was kept fake-offline when 1197 * onlining the memory block. Otherwise, it was allocated 1198 * using alloc_contig_range(). All pages in a subblock are 1199 * alike. 1200 */ 1201 if (PageDirty(page)) { 1202 virtio_mem_clear_fake_offline(pfn + i, 1 << order, false); 1203 generic_online_page(page, order); 1204 } else { 1205 virtio_mem_clear_fake_offline(pfn + i, 1 << order, true); 1206 free_contig_range(pfn + i, 1 << order); 1207 adjust_managed_page_count(page, 1 << order); 1208 } 1209 } 1210 } 1211 1212 /* 1213 * Try to allocate a range, marking pages fake-offline, effectively 1214 * fake-offlining them. 
1215 */ 1216 static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn, 1217 unsigned long nr_pages) 1218 { 1219 const bool is_movable = is_zone_movable_page(pfn_to_page(pfn)); 1220 int rc, retry_count; 1221 1222 /* 1223 * TODO: We want an alloc_contig_range() mode that tries to allocate 1224 * harder (e.g., dealing with temporarily pinned pages, PCP), especially 1225 * with ZONE_MOVABLE. So for now, retry a couple of times with 1226 * ZONE_MOVABLE before giving up - because that zone is supposed to give 1227 * some guarantees. 1228 */ 1229 for (retry_count = 0; retry_count < 5; retry_count++) { 1230 /* 1231 * If the config changed, stop immediately and go back to the 1232 * main loop: avoid trying to keep unplugging if the device 1233 * might have decided to not remove any more memory. 1234 */ 1235 if (atomic_read(&vm->config_changed)) 1236 return -EAGAIN; 1237 1238 rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, 1239 GFP_KERNEL); 1240 if (rc == -ENOMEM) 1241 /* whoops, out of memory */ 1242 return rc; 1243 else if (rc && !is_movable) 1244 break; 1245 else if (rc) 1246 continue; 1247 1248 virtio_mem_set_fake_offline(pfn, nr_pages, true); 1249 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1250 return 0; 1251 } 1252 1253 return -EBUSY; 1254 } 1255 1256 /* 1257 * Handle fake-offline pages when memory is going offline - such that the 1258 * pages can be skipped by mm-core when offlining. 1259 */ 1260 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 1261 unsigned long nr_pages) 1262 { 1263 struct page *page; 1264 unsigned long i; 1265 1266 /* 1267 * Drop our reference to the pages so the memory can get offlined 1268 * and add the unplugged pages to the managed page counters (so 1269 * offlining code can correctly subtract them again). 1270 */ 1271 adjust_managed_page_count(pfn_to_page(pfn), nr_pages); 1272 /* Drop our reference to the pages so the memory can get offlined. 
*/ 1273 for (i = 0; i < nr_pages; i++) { 1274 page = pfn_to_page(pfn + i); 1275 if (WARN_ON(!page_ref_dec_and_test(page))) 1276 dump_page(page, "fake-offline page referenced"); 1277 } 1278 } 1279 1280 /* 1281 * Handle fake-offline pages when memory offlining is canceled - to undo 1282 * what we did in virtio_mem_fake_offline_going_offline(). 1283 */ 1284 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 1285 unsigned long nr_pages) 1286 { 1287 unsigned long i; 1288 1289 /* 1290 * Get the reference we dropped when going offline and subtract the 1291 * unplugged pages from the managed page counters. 1292 */ 1293 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1294 for (i = 0; i < nr_pages; i++) 1295 page_ref_inc(pfn_to_page(pfn + i)); 1296 } 1297 1298 static void virtio_mem_online_page(struct virtio_mem *vm, 1299 struct page *page, unsigned int order) 1300 { 1301 const unsigned long start = page_to_phys(page); 1302 const unsigned long end = start + PFN_PHYS(1 << order); 1303 unsigned long addr, next, id, sb_id, count; 1304 bool do_online; 1305 1306 /* 1307 * We can get called with any order up to MAX_PAGE_ORDER. If our subblock 1308 * size is smaller than that and we have a mixture of plugged and 1309 * unplugged subblocks within such a page, we have to process in 1310 * smaller granularity. In that case we'll adjust the order exactly once 1311 * within the loop. 1312 */ 1313 for (addr = start; addr < end; ) { 1314 next = addr + PFN_PHYS(1 << order); 1315 1316 if (vm->in_sbm) { 1317 id = virtio_mem_phys_to_mb_id(addr); 1318 sb_id = virtio_mem_phys_to_sb_id(vm, addr); 1319 count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1; 1320 1321 if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) { 1322 /* Fully plugged. */ 1323 do_online = true; 1324 } else if (count == 1 || 1325 virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) { 1326 /* Fully unplugged. 
*/ 1327 do_online = false; 1328 } else { 1329 /* 1330 * Mixture, process sub-blocks instead. This 1331 * will be at least the size of a pageblock. 1332 * We'll run into this case exactly once. 1333 */ 1334 order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT; 1335 do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1); 1336 continue; 1337 } 1338 } else { 1339 /* 1340 * If the whole block is marked fake offline, keep 1341 * everything that way. 1342 */ 1343 id = virtio_mem_phys_to_bb_id(vm, addr); 1344 do_online = virtio_mem_bbm_get_bb_state(vm, id) != 1345 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; 1346 } 1347 1348 if (do_online) 1349 generic_online_page(pfn_to_page(PFN_DOWN(addr)), order); 1350 else 1351 virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, 1352 false); 1353 addr = next; 1354 } 1355 } 1356 1357 static void virtio_mem_online_page_cb(struct page *page, unsigned int order) 1358 { 1359 const unsigned long addr = page_to_phys(page); 1360 struct virtio_mem *vm; 1361 1362 rcu_read_lock(); 1363 list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { 1364 /* 1365 * Pages we're onlining will never cross memory blocks and, 1366 * therefore, not virtio-mem devices. 1367 */ 1368 if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) 1369 continue; 1370 1371 /* 1372 * virtio_mem_set_fake_offline() might sleep. We can safely 1373 * drop the RCU lock at this point because the device 1374 * cannot go away. See virtio_mem_remove() how races 1375 * between memory onlining and device removal are handled. 1376 */ 1377 rcu_read_unlock(); 1378 1379 virtio_mem_online_page(vm, page, order); 1380 return; 1381 } 1382 rcu_read_unlock(); 1383 1384 /* not virtio-mem memory, but e.g., a DIMM. 
online it */ 1385 generic_online_page(page, order); 1386 } 1387 1388 static uint64_t virtio_mem_send_request(struct virtio_mem *vm, 1389 const struct virtio_mem_req *req) 1390 { 1391 struct scatterlist *sgs[2], sg_req, sg_resp; 1392 unsigned int len; 1393 int rc; 1394 1395 /* don't use the request residing on the stack (vaddr) */ 1396 vm->req = *req; 1397 1398 /* out: buffer for request */ 1399 sg_init_one(&sg_req, &vm->req, sizeof(vm->req)); 1400 sgs[0] = &sg_req; 1401 1402 /* in: buffer for response */ 1403 sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp)); 1404 sgs[1] = &sg_resp; 1405 1406 rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL); 1407 if (rc < 0) 1408 return rc; 1409 1410 virtqueue_kick(vm->vq); 1411 1412 /* wait for a response */ 1413 wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len)); 1414 1415 return virtio16_to_cpu(vm->vdev, vm->resp.type); 1416 } 1417 1418 static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, 1419 uint64_t size) 1420 { 1421 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1422 const struct virtio_mem_req req = { 1423 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG), 1424 .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), 1425 .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1426 }; 1427 int rc = -ENOMEM; 1428 1429 if (atomic_read(&vm->config_changed)) 1430 return -EAGAIN; 1431 1432 dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, 1433 addr + size - 1); 1434 1435 switch (virtio_mem_send_request(vm, &req)) { 1436 case VIRTIO_MEM_RESP_ACK: 1437 vm->plugged_size += size; 1438 return 0; 1439 case VIRTIO_MEM_RESP_NACK: 1440 rc = -EAGAIN; 1441 break; 1442 case VIRTIO_MEM_RESP_BUSY: 1443 rc = -ETXTBSY; 1444 break; 1445 case VIRTIO_MEM_RESP_ERROR: 1446 rc = -EINVAL; 1447 break; 1448 default: 1449 break; 1450 } 1451 1452 dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); 1453 return rc; 1454 } 1455 1456 static int 
virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, 1457 uint64_t size) 1458 { 1459 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1460 const struct virtio_mem_req req = { 1461 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG), 1462 .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), 1463 .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1464 }; 1465 int rc = -ENOMEM; 1466 1467 if (atomic_read(&vm->config_changed)) 1468 return -EAGAIN; 1469 1470 dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, 1471 addr + size - 1); 1472 1473 switch (virtio_mem_send_request(vm, &req)) { 1474 case VIRTIO_MEM_RESP_ACK: 1475 vm->plugged_size -= size; 1476 return 0; 1477 case VIRTIO_MEM_RESP_BUSY: 1478 rc = -ETXTBSY; 1479 break; 1480 case VIRTIO_MEM_RESP_ERROR: 1481 rc = -EINVAL; 1482 break; 1483 default: 1484 break; 1485 } 1486 1487 dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); 1488 return rc; 1489 } 1490 1491 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) 1492 { 1493 const struct virtio_mem_req req = { 1494 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), 1495 }; 1496 int rc = -ENOMEM; 1497 1498 dev_dbg(&vm->vdev->dev, "unplugging all memory"); 1499 1500 switch (virtio_mem_send_request(vm, &req)) { 1501 case VIRTIO_MEM_RESP_ACK: 1502 vm->unplug_all_required = false; 1503 vm->plugged_size = 0; 1504 /* usable region might have shrunk */ 1505 atomic_set(&vm->config_changed, 1); 1506 return 0; 1507 case VIRTIO_MEM_RESP_BUSY: 1508 rc = -ETXTBSY; 1509 break; 1510 default: 1511 break; 1512 } 1513 1514 dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); 1515 return rc; 1516 } 1517 1518 /* 1519 * Plug selected subblocks. Updates the plugged state, but not the state 1520 * of the memory block. 
1521 */ 1522 static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, 1523 int sb_id, int count) 1524 { 1525 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1526 sb_id * vm->sbm.sb_size; 1527 const uint64_t size = count * vm->sbm.sb_size; 1528 int rc; 1529 1530 rc = virtio_mem_send_plug_request(vm, addr, size); 1531 if (!rc) 1532 virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); 1533 return rc; 1534 } 1535 1536 /* 1537 * Unplug selected subblocks. Updates the plugged state, but not the state 1538 * of the memory block. 1539 */ 1540 static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, 1541 int sb_id, int count) 1542 { 1543 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1544 sb_id * vm->sbm.sb_size; 1545 const uint64_t size = count * vm->sbm.sb_size; 1546 int rc; 1547 1548 rc = virtio_mem_send_unplug_request(vm, addr, size); 1549 if (!rc) 1550 virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); 1551 return rc; 1552 } 1553 1554 /* 1555 * Request to unplug a big block. 1556 * 1557 * Will not modify the state of the big block. 1558 */ 1559 static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) 1560 { 1561 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1562 const uint64_t size = vm->bbm.bb_size; 1563 1564 return virtio_mem_send_unplug_request(vm, addr, size); 1565 } 1566 1567 /* 1568 * Request to plug a big block. 1569 * 1570 * Will not modify the state of the big block. 1571 */ 1572 static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) 1573 { 1574 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1575 const uint64_t size = vm->bbm.bb_size; 1576 1577 return virtio_mem_send_plug_request(vm, addr, size); 1578 } 1579 1580 /* 1581 * Unplug the desired number of plugged subblocks of a offline or not-added 1582 * memory block. Will fail if any subblock cannot get unplugged (instead of 1583 * skipping it). 
1584 * 1585 * Will not modify the state of the memory block. 1586 * 1587 * Note: can fail after some subblocks were unplugged. 1588 */ 1589 static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, 1590 unsigned long mb_id, uint64_t *nb_sb) 1591 { 1592 int sb_id, count; 1593 int rc; 1594 1595 sb_id = vm->sbm.sbs_per_mb - 1; 1596 while (*nb_sb) { 1597 /* Find the next candidate subblock */ 1598 while (sb_id >= 0 && 1599 virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) 1600 sb_id--; 1601 if (sb_id < 0) 1602 break; 1603 /* Try to unplug multiple subblocks at a time */ 1604 count = 1; 1605 while (count < *nb_sb && sb_id > 0 && 1606 virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { 1607 count++; 1608 sb_id--; 1609 } 1610 1611 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1612 if (rc) 1613 return rc; 1614 *nb_sb -= count; 1615 sb_id--; 1616 } 1617 1618 return 0; 1619 } 1620 1621 /* 1622 * Unplug all plugged subblocks of an offline or not-added memory block. 1623 * 1624 * Will not modify the state of the memory block. 1625 * 1626 * Note: can fail after some subblocks were unplugged. 1627 */ 1628 static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) 1629 { 1630 uint64_t nb_sb = vm->sbm.sbs_per_mb; 1631 1632 return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); 1633 } 1634 1635 /* 1636 * Prepare tracking data for the next memory block. 1637 */ 1638 static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, 1639 unsigned long *mb_id) 1640 { 1641 int rc; 1642 1643 if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) 1644 return -ENOSPC; 1645 1646 /* Resize the state array if required. */ 1647 rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); 1648 if (rc) 1649 return rc; 1650 1651 /* Resize the subblock bitmap if required. 
*/ 1652 rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); 1653 if (rc) 1654 return rc; 1655 1656 vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; 1657 *mb_id = vm->sbm.next_mb_id++; 1658 return 0; 1659 } 1660 1661 /* 1662 * Try to plug the desired number of subblocks and add the memory block 1663 * to Linux. 1664 * 1665 * Will modify the state of the memory block. 1666 */ 1667 static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, 1668 unsigned long mb_id, uint64_t *nb_sb) 1669 { 1670 const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); 1671 int rc; 1672 1673 if (WARN_ON_ONCE(!count)) 1674 return -EINVAL; 1675 1676 /* 1677 * Plug the requested number of subblocks before adding it to linux, 1678 * so that onlining will directly online all plugged subblocks. 1679 */ 1680 rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); 1681 if (rc) 1682 return rc; 1683 1684 /* 1685 * Mark the block properly offline before adding it to Linux, 1686 * so the memory notifiers will find the block in the right state. 1687 */ 1688 if (count == vm->sbm.sbs_per_mb) 1689 virtio_mem_sbm_set_mb_state(vm, mb_id, 1690 VIRTIO_MEM_SBM_MB_OFFLINE); 1691 else 1692 virtio_mem_sbm_set_mb_state(vm, mb_id, 1693 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1694 1695 /* Add the memory block to linux - if that fails, try to unplug. */ 1696 rc = virtio_mem_sbm_add_mb(vm, mb_id); 1697 if (rc) { 1698 int new_state = VIRTIO_MEM_SBM_MB_UNUSED; 1699 1700 if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) 1701 new_state = VIRTIO_MEM_SBM_MB_PLUGGED; 1702 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 1703 return rc; 1704 } 1705 1706 *nb_sb -= count; 1707 return 0; 1708 } 1709 1710 /* 1711 * Try to plug the desired number of subblocks of a memory block that 1712 * is already added to Linux. 1713 * 1714 * Will modify the state of the memory block. 1715 * 1716 * Note: Can fail after some subblocks were successfully plugged. 
1717 */ 1718 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, 1719 unsigned long mb_id, uint64_t *nb_sb) 1720 { 1721 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1722 unsigned long pfn, nr_pages; 1723 int sb_id, count; 1724 int rc; 1725 1726 if (WARN_ON_ONCE(!*nb_sb)) 1727 return -EINVAL; 1728 1729 while (*nb_sb) { 1730 sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); 1731 if (sb_id >= vm->sbm.sbs_per_mb) 1732 break; 1733 count = 1; 1734 while (count < *nb_sb && 1735 sb_id + count < vm->sbm.sbs_per_mb && 1736 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) 1737 count++; 1738 1739 rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); 1740 if (rc) 1741 return rc; 1742 *nb_sb -= count; 1743 if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 1744 continue; 1745 1746 /* fake-online the pages if the memory block is online */ 1747 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1748 sb_id * vm->sbm.sb_size); 1749 nr_pages = PFN_DOWN(count * vm->sbm.sb_size); 1750 virtio_mem_fake_online(pfn, nr_pages); 1751 } 1752 1753 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1754 virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); 1755 1756 return 0; 1757 } 1758 1759 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1760 { 1761 const int mb_states[] = { 1762 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 1763 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 1764 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 1765 }; 1766 uint64_t nb_sb = diff / vm->sbm.sb_size; 1767 unsigned long mb_id; 1768 int rc, i; 1769 1770 if (!nb_sb) 1771 return 0; 1772 1773 /* Don't race with onlining/offlining */ 1774 mutex_lock(&vm->hotplug_mutex); 1775 1776 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 1777 virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { 1778 rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); 1779 if (rc || !nb_sb) 1780 goto out_unlock; 1781 cond_resched(); 1782 } 1783 } 1784 1785 /* 1786 * We won't be working on 
online/offline memory blocks from this point, 1787 * so we can't race with memory onlining/offlining. Drop the mutex. 1788 */ 1789 mutex_unlock(&vm->hotplug_mutex); 1790 1791 /* Try to plug and add unused blocks */ 1792 virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { 1793 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1794 return -ENOSPC; 1795 1796 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1797 if (rc || !nb_sb) 1798 return rc; 1799 cond_resched(); 1800 } 1801 1802 /* Try to prepare, plug and add new blocks */ 1803 while (nb_sb) { 1804 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1805 return -ENOSPC; 1806 1807 rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); 1808 if (rc) 1809 return rc; 1810 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1811 if (rc) 1812 return rc; 1813 cond_resched(); 1814 } 1815 1816 return 0; 1817 out_unlock: 1818 mutex_unlock(&vm->hotplug_mutex); 1819 return rc; 1820 } 1821 1822 /* 1823 * Plug a big block and add it to Linux. 1824 * 1825 * Will modify the state of the big block. 1826 */ 1827 static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, 1828 unsigned long bb_id) 1829 { 1830 int rc; 1831 1832 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 1833 VIRTIO_MEM_BBM_BB_UNUSED)) 1834 return -EINVAL; 1835 1836 rc = virtio_mem_bbm_plug_bb(vm, bb_id); 1837 if (rc) 1838 return rc; 1839 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 1840 1841 rc = virtio_mem_bbm_add_bb(vm, bb_id); 1842 if (rc) { 1843 if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) 1844 virtio_mem_bbm_set_bb_state(vm, bb_id, 1845 VIRTIO_MEM_BBM_BB_UNUSED); 1846 else 1847 /* Retry from the main loop. */ 1848 virtio_mem_bbm_set_bb_state(vm, bb_id, 1849 VIRTIO_MEM_BBM_BB_PLUGGED); 1850 return rc; 1851 } 1852 return 0; 1853 } 1854 1855 /* 1856 * Prepare tracking data for the next big block. 
1857 */ 1858 static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, 1859 unsigned long *bb_id) 1860 { 1861 int rc; 1862 1863 if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) 1864 return -ENOSPC; 1865 1866 /* Resize the big block state array if required. */ 1867 rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); 1868 if (rc) 1869 return rc; 1870 1871 vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; 1872 *bb_id = vm->bbm.next_bb_id; 1873 vm->bbm.next_bb_id++; 1874 return 0; 1875 } 1876 1877 static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1878 { 1879 uint64_t nb_bb = diff / vm->bbm.bb_size; 1880 unsigned long bb_id; 1881 int rc; 1882 1883 if (!nb_bb) 1884 return 0; 1885 1886 /* Try to plug and add unused big blocks */ 1887 virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { 1888 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1889 return -ENOSPC; 1890 1891 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1892 if (!rc) 1893 nb_bb--; 1894 if (rc || !nb_bb) 1895 return rc; 1896 cond_resched(); 1897 } 1898 1899 /* Try to prepare, plug and add new big blocks */ 1900 while (nb_bb) { 1901 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1902 return -ENOSPC; 1903 1904 rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); 1905 if (rc) 1906 return rc; 1907 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1908 if (!rc) 1909 nb_bb--; 1910 if (rc) 1911 return rc; 1912 cond_resched(); 1913 } 1914 1915 return 0; 1916 } 1917 1918 /* 1919 * Try to plug the requested amount of memory. 1920 */ 1921 static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) 1922 { 1923 if (vm->in_sbm) 1924 return virtio_mem_sbm_plug_request(vm, diff); 1925 return virtio_mem_bbm_plug_request(vm, diff); 1926 } 1927 1928 /* 1929 * Unplug the desired number of plugged subblocks of an offline memory block. 1930 * Will fail if any subblock cannot get unplugged (instead of skipping it). 
1931 * 1932 * Will modify the state of the memory block. Might temporarily drop the 1933 * hotplug_mutex. 1934 * 1935 * Note: Can fail after some subblocks were successfully unplugged. 1936 */ 1937 static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, 1938 unsigned long mb_id, 1939 uint64_t *nb_sb) 1940 { 1941 int rc; 1942 1943 rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); 1944 1945 /* some subblocks might have been unplugged even on failure */ 1946 if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1947 virtio_mem_sbm_set_mb_state(vm, mb_id, 1948 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1949 if (rc) 1950 return rc; 1951 1952 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1953 /* 1954 * Remove the block from Linux - this should never fail. 1955 * Hinder the block from getting onlined by marking it 1956 * unplugged. Temporarily drop the mutex, so 1957 * any pending GOING_ONLINE requests can be serviced/rejected. 1958 */ 1959 virtio_mem_sbm_set_mb_state(vm, mb_id, 1960 VIRTIO_MEM_SBM_MB_UNUSED); 1961 1962 mutex_unlock(&vm->hotplug_mutex); 1963 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 1964 BUG_ON(rc); 1965 mutex_lock(&vm->hotplug_mutex); 1966 } 1967 return 0; 1968 } 1969 1970 /* 1971 * Unplug the given plugged subblocks of an online memory block. 1972 * 1973 * Will modify the state of the memory block. 
1974 */ 1975 static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, 1976 unsigned long mb_id, int sb_id, 1977 int count) 1978 { 1979 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; 1980 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1981 unsigned long start_pfn; 1982 int rc; 1983 1984 start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1985 sb_id * vm->sbm.sb_size); 1986 1987 rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages); 1988 if (rc) 1989 return rc; 1990 1991 /* Try to unplug the allocated memory */ 1992 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1993 if (rc) { 1994 /* Return the memory to the buddy. */ 1995 virtio_mem_fake_online(start_pfn, nr_pages); 1996 return rc; 1997 } 1998 1999 switch (old_state) { 2000 case VIRTIO_MEM_SBM_MB_KERNEL: 2001 virtio_mem_sbm_set_mb_state(vm, mb_id, 2002 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); 2003 break; 2004 case VIRTIO_MEM_SBM_MB_MOVABLE: 2005 virtio_mem_sbm_set_mb_state(vm, mb_id, 2006 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); 2007 break; 2008 } 2009 2010 return 0; 2011 } 2012 2013 /* 2014 * Unplug the desired number of plugged subblocks of an online memory block. 2015 * Will skip subblock that are busy. 2016 * 2017 * Will modify the state of the memory block. Might temporarily drop the 2018 * hotplug_mutex. 2019 * 2020 * Note: Can fail after some subblocks were successfully unplugged. Can 2021 * return 0 even if subblocks were busy and could not get unplugged. 2022 */ 2023 static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, 2024 unsigned long mb_id, 2025 uint64_t *nb_sb) 2026 { 2027 int rc, sb_id; 2028 2029 /* If possible, try to unplug the complete block in one shot. 
*/ 2030 if (*nb_sb >= vm->sbm.sbs_per_mb && 2031 virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 2032 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, 2033 vm->sbm.sbs_per_mb); 2034 if (!rc) { 2035 *nb_sb -= vm->sbm.sbs_per_mb; 2036 goto unplugged; 2037 } else if (rc != -EBUSY) 2038 return rc; 2039 } 2040 2041 /* Fallback to single subblocks. */ 2042 for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { 2043 /* Find the next candidate subblock */ 2044 while (sb_id >= 0 && 2045 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 2046 sb_id--; 2047 if (sb_id < 0) 2048 break; 2049 2050 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); 2051 if (rc == -EBUSY) 2052 continue; 2053 else if (rc) 2054 return rc; 2055 *nb_sb -= 1; 2056 } 2057 2058 unplugged: 2059 rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id); 2060 if (rc) 2061 vm->sbm.have_unplugged_mb = 1; 2062 /* Ignore errors, this is not critical. We'll retry later. */ 2063 return 0; 2064 } 2065 2066 /* 2067 * Unplug the desired number of plugged subblocks of a memory block that is 2068 * already added to Linux. Will skip subblock of online memory blocks that are 2069 * busy (by the OS). Will fail if any subblock that's not busy cannot get 2070 * unplugged. 2071 * 2072 * Will modify the state of the memory block. Might temporarily drop the 2073 * hotplug_mutex. 2074 * 2075 * Note: Can fail after some subblocks were successfully unplugged. Can 2076 * return 0 even if subblocks were busy and could not get unplugged. 
*/
static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
					unsigned long mb_id,
					uint64_t *nb_sb)
{
	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);

	/* Online states and offline states are handled by separate helpers. */
	switch (old_state) {
	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
	case VIRTIO_MEM_SBM_MB_KERNEL:
	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_MOVABLE:
		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_OFFLINE:
		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
	}
	/* Memory blocks in any other state are not handled here. */
	return -EINVAL;
}

/*
 * SBM: Try to unplug "diff" bytes, rounded down to whole subblocks.
 *
 * Memory blocks are visited state by state, in the order given by
 * mb_states[]. The hotplug_mutex is held while a memory block is processed
 * and is dropped briefly between memory blocks to allow rescheduling.
 *
 * Returns 0 if there was nothing to do, if all requested subblocks were
 * unplugged, or if unplug_online is disabled and the two offline states were
 * processed. Returns -EBUSY if subblocks remain to be unplugged after all
 * states were processed. Otherwise returns the error reported by
 * virtio_mem_sbm_unplug_any_sb().
 */
static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	/* Processing order; the two offline states come first. */
	const int mb_states[] = {
		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
		VIRTIO_MEM_SBM_MB_OFFLINE,
		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
		VIRTIO_MEM_SBM_MB_MOVABLE,
		VIRTIO_MEM_SBM_MB_KERNEL,
	};
	/* Number of subblocks that still have to be unplugged. */
	uint64_t nb_sb = diff / vm->sbm.sb_size;
	unsigned long mb_id;
	int rc, i;

	if (!nb_sb)
		return 0;

	/*
	 * We'll drop the mutex a couple of times when it is safe to do so.
	 * This might result in some blocks switching the state (online/offline)
	 * and we could miss them in this run - we will retry again later.
	 */
	mutex_lock(&vm->hotplug_mutex);

	/*
	 * We try to unplug from partially plugged blocks first, to try
	 * removing whole memory blocks along with metadata. We prioritize
	 * ZONE_MOVABLE as it's more reliable to unplug memory and remove whole
	 * memory blocks, and we don't want to trigger zone imbalances by
	 * accidentally removing too much kernel memory.
	 */
	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
			/* Stop on the first error or once we are done. */
			if (rc || !nb_sb)
				goto out_unlock;
			/* Give others a chance to run between memory blocks. */
			mutex_unlock(&vm->hotplug_mutex);
			cond_resched();
			mutex_lock(&vm->hotplug_mutex);
		}
		/*
		 * Index 1 is the last offline state in mb_states[]. If we must
		 * not touch online memory, we are done after processing it.
		 */
		if (!unplug_online && i == 1) {
			mutex_unlock(&vm->hotplug_mutex);
			return 0;
		}
	}

	mutex_unlock(&vm->hotplug_mutex);
	/* Anything left over could not be unplugged in this run. */
	return nb_sb ? -EBUSY : 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Try to offline and remove a big block from Linux and unplug it. Will fail
 * with -EBUSY if some memory is busy and cannot get unplugged.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 */
static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
							unsigned long bb_id)
{
	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
	/* Not const: shrunk on failure to limit the rollback range. */
	unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;
	struct page *page;
	int rc;

	/* Only big blocks in state ADDED are expected here. */
	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
			 VIRTIO_MEM_BBM_BB_ADDED))
		return -EINVAL;

	/*
	 * Start by fake-offlining all memory. Once we marked the device
	 * block as fake-offline, all newly onlined memory will
	 * automatically be kept fake-offline. Protect from concurrent
	 * onlining/offlining until we have a consistent state.
	 */
	mutex_lock(&vm->hotplug_mutex);
	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);

	/* Walk the big block section by section; skip sections not online. */
	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		page = pfn_to_online_page(pfn);
		if (!page)
			continue;

		rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION);
		if (rc) {
			/* Roll back only the sections before the failing one. */
			end_pfn = pfn;
			goto rollback;
		}
	}
	mutex_unlock(&vm->hotplug_mutex);

	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
	if (rc) {
		/* The rollback path expects the mutex to be held. */
		mutex_lock(&vm->hotplug_mutex);
		goto rollback;
	}

	/*
	 * The big block is removed from Linux. If unplugging fails, track it
	 * as PLUGGED; virtio_mem_cleanup_pending_mb() retries such blocks.
	 */
	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
	if (rc)
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_PLUGGED);
	else
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_UNUSED);
	return rc;

rollback:
	/* Undo the fake-offlining for [start_pfn, end_pfn). */
	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		page = pfn_to_online_page(pfn);
		if (!page)
			continue;
		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
	}
	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Test if a big block is completely offline.
 */
static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
	unsigned long pfn;

	/* One online section is enough to make the big block not offline. */
	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
	     pfn += PAGES_PER_SECTION) {
		if (pfn_to_online_page(pfn))
			return false;
	}

	return true;
}

/*
 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
2241 */ 2242 static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, 2243 unsigned long bb_id) 2244 { 2245 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2246 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2247 struct page *page; 2248 unsigned long pfn; 2249 2250 for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2251 pfn += PAGES_PER_SECTION) { 2252 page = pfn_to_online_page(pfn); 2253 if (!page) 2254 continue; 2255 if (page_zonenum(page) != ZONE_MOVABLE) 2256 return false; 2257 } 2258 2259 return true; 2260 } 2261 2262 static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 2263 { 2264 uint64_t nb_bb = diff / vm->bbm.bb_size; 2265 uint64_t bb_id; 2266 int rc, i; 2267 2268 if (!nb_bb) 2269 return 0; 2270 2271 /* 2272 * Try to unplug big blocks. Similar to SBM, start with offline 2273 * big blocks. 2274 */ 2275 for (i = 0; i < 3; i++) { 2276 virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 2277 cond_resched(); 2278 2279 /* 2280 * As we're holding no locks, these checks are racy, 2281 * but we don't care. 2282 */ 2283 if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) 2284 continue; 2285 if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) 2286 continue; 2287 rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); 2288 if (rc == -EBUSY) 2289 continue; 2290 if (!rc) 2291 nb_bb--; 2292 if (rc || !nb_bb) 2293 return rc; 2294 } 2295 if (i == 0 && !unplug_online) 2296 return 0; 2297 } 2298 2299 return nb_bb ? -EBUSY : 0; 2300 } 2301 2302 /* 2303 * Try to unplug the requested amount of memory. 2304 */ 2305 static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) 2306 { 2307 if (vm->in_sbm) 2308 return virtio_mem_sbm_unplug_request(vm, diff); 2309 return virtio_mem_bbm_unplug_request(vm, diff); 2310 } 2311 2312 /* 2313 * Try to unplug all blocks that couldn't be unplugged before, for example, 2314 * because the hypervisor was busy. 
Further, offline and remove any memory 2315 * blocks where we previously failed. 2316 */ 2317 static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm) 2318 { 2319 unsigned long id; 2320 int rc = 0; 2321 2322 if (!vm->in_sbm) { 2323 virtio_mem_bbm_for_each_bb(vm, id, 2324 VIRTIO_MEM_BBM_BB_PLUGGED) { 2325 rc = virtio_mem_bbm_unplug_bb(vm, id); 2326 if (rc) 2327 return rc; 2328 virtio_mem_bbm_set_bb_state(vm, id, 2329 VIRTIO_MEM_BBM_BB_UNUSED); 2330 } 2331 return 0; 2332 } 2333 2334 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) { 2335 rc = virtio_mem_sbm_unplug_mb(vm, id); 2336 if (rc) 2337 return rc; 2338 virtio_mem_sbm_set_mb_state(vm, id, 2339 VIRTIO_MEM_SBM_MB_UNUSED); 2340 } 2341 2342 if (!vm->sbm.have_unplugged_mb) 2343 return 0; 2344 2345 /* 2346 * Let's retry (offlining and) removing completely unplugged Linux 2347 * memory blocks. 2348 */ 2349 vm->sbm.have_unplugged_mb = false; 2350 2351 mutex_lock(&vm->hotplug_mutex); 2352 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL) 2353 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); 2354 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL) 2355 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); 2356 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 2357 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); 2358 mutex_unlock(&vm->hotplug_mutex); 2359 2360 if (rc) 2361 vm->sbm.have_unplugged_mb = true; 2362 /* Ignore errors, this is not critical. We'll retry later. */ 2363 return 0; 2364 } 2365 2366 /* 2367 * Update all parts of the config that could have changed. 
2368 */ 2369 static void virtio_mem_refresh_config(struct virtio_mem *vm) 2370 { 2371 const struct range pluggable_range = mhp_get_pluggable_range(true); 2372 uint64_t new_plugged_size, usable_region_size, end_addr; 2373 2374 /* the plugged_size is just a reflection of what _we_ did previously */ 2375 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, 2376 &new_plugged_size); 2377 if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size)) 2378 vm->plugged_size = new_plugged_size; 2379 2380 /* calculate the last usable memory block id */ 2381 virtio_cread_le(vm->vdev, struct virtio_mem_config, 2382 usable_region_size, &usable_region_size); 2383 end_addr = min(vm->addr + usable_region_size - 1, 2384 pluggable_range.end); 2385 2386 if (vm->in_sbm) { 2387 vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr); 2388 if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes())) 2389 vm->sbm.last_usable_mb_id--; 2390 } else { 2391 vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm, 2392 end_addr); 2393 if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size)) 2394 vm->bbm.last_usable_bb_id--; 2395 } 2396 /* 2397 * If we cannot plug any of our device memory (e.g., nothing in the 2398 * usable region is addressable), the last usable memory block id will 2399 * be smaller than the first usable memory block id. We'll stop 2400 * attempting to add memory with -ENOSPC from our main loop. 2401 */ 2402 2403 /* see if there is a request to change the size */ 2404 virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, 2405 &vm->requested_size); 2406 2407 dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size); 2408 dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size); 2409 } 2410 2411 /* 2412 * Workqueue function for handling plug/unplug requests and config updates. 
*/
static void virtio_mem_run_wq(struct work_struct *work)
{
	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
	uint64_t diff;
	int rc;

	if (unlikely(vm->in_kdump)) {
		dev_warn_once(&vm->vdev->dev,
			      "unexpected workqueue run in kdump kernel\n");
		return;
	}

	/* We are running now; a pending retry timer is no longer needed. */
	hrtimer_cancel(&vm->retry_timer);

	if (vm->broken)
		return;

	atomic_set(&vm->wq_active, 1);
retry:
	rc = 0;

	/* Make sure we start with a clean state if there are leftovers. */
	if (unlikely(vm->unplug_all_required))
		rc = virtio_mem_send_unplug_all_request(vm);

	/* Clear the flag before re-reading the config. */
	if (atomic_read(&vm->config_changed)) {
		atomic_set(&vm->config_changed, 0);
		virtio_mem_refresh_config(vm);
	}

	/* Cleanup any leftovers from previous runs */
	if (!rc)
		rc = virtio_mem_cleanup_pending_mb(vm);

	/* Plug or unplug, depending on the direction of the difference. */
	if (!rc && vm->requested_size != vm->plugged_size) {
		if (vm->requested_size > vm->plugged_size) {
			diff = vm->requested_size - vm->plugged_size;
			rc = virtio_mem_plug_request(vm, diff);
		} else {
			diff = vm->plugged_size - vm->requested_size;
			rc = virtio_mem_unplug_request(vm, diff);
		}
	}

	/*
	 * Keep retrying to offline and remove completely unplugged Linux
	 * memory blocks.
	 */
	if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb)
		rc = -EBUSY;

	switch (rc) {
	case 0:
		/* Success: reset the retry interval to its minimum. */
		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
		break;
	case -ENOSPC:
		/*
		 * We cannot add any more memory (alignment, physical limit)
		 * or we have too many offline memory blocks.
		 */
		break;
	case -ETXTBSY:
		/*
		 * The hypervisor cannot process our request right now
		 * (e.g., out of memory, migrating);
		 */
	case -EBUSY:
		/*
		 * We cannot free up any memory to unplug it (all plugged memory
		 * is busy).
		 */
	case -ENOMEM:
		/* Out of memory, try again later. */
		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
			      HRTIMER_MODE_REL);
		break;
	case -EAGAIN:
		/* Retry immediately (e.g., the config changed). */
		goto retry;
	default:
		/* Unknown error, mark as broken */
		dev_err(&vm->vdev->dev,
			"unknown error, marking device broken: %d\n", rc);
		vm->broken = true;
	}

	atomic_set(&vm->wq_active, 0);
}

/*
 * Retry timer callback: trigger a retry and double the retry interval for the
 * next time, capped at VIRTIO_MEM_RETRY_TIMER_MAX_MS. The timer is not
 * restarted from here; virtio_mem_run_wq() re-arms it when needed.
 */
static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
{
	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
					     retry_timer);

	virtio_mem_retry(vm);
	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
	return HRTIMER_NORESTART;
}

/*
 * Virtqueue callback: wake up whoever is waiting on vm->host_resp.
 */
static void virtio_mem_handle_response(struct virtqueue *vq)
{
	struct virtio_mem *vm = vq->vdev->priv;

	wake_up(&vm->host_resp);
}

/*
 * Look up the single "guest-request" virtqueue and remember it in vm->vq.
 *
 * Returns 0 on success, or the error encoded in the returned pointer.
 */
static int virtio_mem_init_vq(struct virtio_mem *vm)
{
	struct virtqueue *vq;

	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
				   "guest-request");
	if (IS_ERR(vq))
		return PTR_ERR(vq);
	vm->vq = vq;

	return 0;
}

/*
 * Initialize memory hot(un)plug handling: select SBM or BBM and the
 * corresponding block sizes, create the parent resource, register a dynamic
 * memory group, and register the memory notifier, the PM notifier and the
 * virtio-mem device.
 *
 * Returns 0 on success. On failure, everything set up so far is undone in
 * reverse order and the error is returned.
 */
static int virtio_mem_init_hotplug(struct virtio_mem *vm)
{
	const struct range pluggable_range = mhp_get_pluggable_range(true);
	uint64_t unit_pages, sb_size, addr;
	int rc;

	/* bad device setup - warn only */
	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
		dev_warn(&vm->vdev->dev,
			 "The alignment of the physical start address can make some memory unusable.\n");
	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
		dev_warn(&vm->vdev->dev,
			 "The alignment of the physical end address can make some memory unusable.\n");
	if (vm->addr < pluggable_range.start ||
	    vm->addr + vm->region_size - 1 > pluggable_range.end)
		dev_warn(&vm->vdev->dev,
			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");

	/* Prepare the offline threshold - make sure we can add two blocks. */
	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);

	/*
	 * alloc_contig_range() works reliably with pageblock
	 * granularity on ZONE_NORMAL, use pageblock_nr_pages.
	 */
	sb_size = PAGE_SIZE * pageblock_nr_pages;
	/* A subblock can never be smaller than a device block. */
	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);

	if (sb_size < memory_block_size_bytes() && !force_bbm) {
		/* SBM: At least two subblocks per Linux memory block. */
		vm->in_sbm = true;
		vm->sbm.sb_size = sb_size;
		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
				     vm->sbm.sb_size;

		/* Round up to the next full memory block */
		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
		       memory_block_size_bytes() - 1;
		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
	} else {
		/* BBM: At least one Linux memory block. */
		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
					memory_block_size_bytes());

		/*
		 * Honor the bbm_block_size module parameter only if it is a
		 * power of 2 and not smaller than the computed size.
		 */
		if (bbm_block_size) {
			if (!is_power_of_2(bbm_block_size)) {
				dev_warn(&vm->vdev->dev,
					 "bbm_block_size is not a power of 2");
			} else if (bbm_block_size < vm->bbm.bb_size) {
				dev_warn(&vm->vdev->dev,
					 "bbm_block_size is too small");
			} else {
				vm->bbm.bb_size = bbm_block_size;
			}
		}

		/* Round up to the next aligned big block */
		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
		       vm->bbm.bb_size - 1;
		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
		vm->bbm.next_bb_id = vm->bbm.first_bb_id;

		/* Make sure we can add two big blocks. */
		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
					      vm->offline_threshold);
	}

	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
		 memory_block_size_bytes());
	if (vm->in_sbm)
		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
			 (unsigned long long)vm->sbm.sb_size);
	else
		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
			 (unsigned long long)vm->bbm.bb_size);

	/* create the parent resource for all memory */
	rc = virtio_mem_create_resource(vm);
	if (rc)
		return rc;

	/* use a single dynamic memory group to cover the whole memory device */
	if (vm->in_sbm)
		unit_pages = PHYS_PFN(memory_block_size_bytes());
	else
		unit_pages = PHYS_PFN(vm->bbm.bb_size);
	rc = memory_group_register_dynamic(vm->nid, unit_pages);
	if (rc < 0)
		goto out_del_resource;
	/* A non-negative return value is the id of the memory group. */
	vm->mgid = rc;

	/*
	 * If we still have memory plugged, we have to unplug all memory first.
	 * Registering our parent resource makes sure that this memory isn't
	 * actually in use (e.g., trying to reload the driver).
	 */
	if (vm->plugged_size) {
		vm->unplug_all_required = true;
		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
	}

	/* register callbacks */
	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
	rc = register_memory_notifier(&vm->memory_notifier);
	if (rc)
		goto out_unreg_group;
	/* Block hibernation as early as possible. */
	vm->pm_notifier.priority = INT_MAX;
	vm->pm_notifier.notifier_call = virtio_mem_pm_notifier_cb;
	rc = register_pm_notifier(&vm->pm_notifier);
	if (rc)
		goto out_unreg_mem;
	rc = register_virtio_mem_device(vm);
	if (rc)
		goto out_unreg_pm;

	return 0;
	/* Error unwinding: undo the steps above in reverse order. */
out_unreg_pm:
	unregister_pm_notifier(&vm->pm_notifier);
out_unreg_mem:
	unregister_memory_notifier(&vm->memory_notifier);
out_unreg_group:
	memory_group_unregister(vm->mgid);
out_del_resource:
	virtio_mem_delete_resource(vm);
	return rc;
}

#ifdef CONFIG_PROC_VMCORE
/*
 * Ask the device for the state of the given range.
 *
 * Returns the state value reported by the device if the request was
 * acknowledged, -EINVAL if the device responded with an error, and -ENOMEM
 * for any other response.
 */
static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
					 uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
		.u.state.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};
	/* Default error for any response that is neither ACK nor ERROR. */
	int rc = -ENOMEM;

	dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state);
	case VIRTIO_MEM_RESP_ERROR:
		rc = -EINVAL;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc);
	return rc;
}

/*
 * vmcore callback: tell whether the page at "pfn" is to be treated as RAM.
 *
 * Pages outside of the device-managed range are always reported as RAM. For
 * pages inside, the device is asked for the state of the containing device
 * block; the answer for the block queried last is cached in the device
 * structure.
 */
static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
					 unsigned long pfn)
{
	struct virtio_mem *vm = container_of(cb, struct virtio_mem,
					     vmcore_cb);
	uint64_t addr = PFN_PHYS(pfn);
	bool is_ram;
	int rc;

	/* Not our memory: nothing to filter. */
	if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))
		return true;
	/* Nothing is plugged: no need to ask the device. */
	if (!vm->plugged_size)
		return false;

	/*
	 * We have to serialize device requests and access to the information
	 * about the block queried last.
	 */
	mutex_lock(&vm->hotplug_mutex);

	addr = ALIGN_DOWN(addr, vm->device_block_size);
	/* Only query the device if this is not the block queried last. */
	if (addr != vm->last_block_addr) {
		rc = virtio_mem_send_state_request(vm, addr,
						   vm->device_block_size);
		/* On any kind of error, we're going to signal !ram. */
		if (rc == VIRTIO_MEM_STATE_PLUGGED)
			vm->last_block_plugged = true;
		else
			vm->last_block_plugged = false;
		vm->last_block_addr = addr;
	}

	is_ram = vm->last_block_plugged;
	mutex_unlock(&vm->hotplug_mutex);
	return is_ram;
}
#endif /* CONFIG_PROC_VMCORE */

/*
 * Initialize the device for use in a kdump kernel: with CONFIG_PROC_VMCORE,
 * only register the vmcore callback and return 0; without it, return -EBUSY.
 */
static int virtio_mem_init_kdump(struct virtio_mem *vm)
{
#ifdef CONFIG_PROC_VMCORE
	dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n");
	vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
	register_vmcore_cb(&vm->vmcore_cb);
	return 0;
#else /* CONFIG_PROC_VMCORE */
	dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n");
	return -EBUSY;
#endif /* CONFIG_PROC_VMCORE */
}

/*
 * Read the device properties from the config space and continue with either
 * the kdump or the hotplug initialization.
 *
 * Returns -EINVAL if config space access is not available, otherwise the
 * result of the selected initialization function.
 */
static int virtio_mem_init(struct virtio_mem *vm)
{
	uint16_t node_id;

	if (!vm->vdev->config->get) {
		dev_err(&vm->vdev->dev, "config access disabled\n");
		return -EINVAL;
	}

	/* Fetch all properties that can't change. */
	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
			&vm->plugged_size);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
			&vm->device_block_size);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
			&node_id);
	vm->nid = virtio_mem_translate_node_id(vm, node_id);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
			&vm->region_size);

	/* Determine the nid for the device based on the lowest address. */
	if (vm->nid == NUMA_NO_NODE)
		vm->nid = memory_add_physaddr_to_nid(vm->addr);

	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
		 (unsigned long long)vm->device_block_size);
	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);

	/*
	 * We don't want to (un)plug or reuse any memory when in kdump. The
	 * memory is still accessible (but not exposed to Linux).
	 */
	if (vm->in_kdump)
		return virtio_mem_init_kdump(vm);
	return virtio_mem_init_hotplug(vm);
}

/*
 * Reserve the whole device memory region as the parent resource.
 *
 * Returns 0 on success, -ENOMEM if the resource name could not be duplicated,
 * and -EBUSY if the region could not be reserved.
 */
static int virtio_mem_create_resource(struct virtio_mem *vm)
{
	/*
	 * When force-unloading the driver and removing the device, we
	 * could have a garbage pointer. Duplicate the string.
	 */
	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);

	if (!name)
		return -ENOMEM;

	/* Disallow mapping device memory via /dev/mem completely. */
	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
						   name, IORESOURCE_SYSTEM_RAM |
						   IORESOURCE_EXCLUSIVE);
	if (!vm->parent_resource) {
		/* The duplicated name is unused on failure; don't leak it. */
		kfree(name);
		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
		dev_info(&vm->vdev->dev,
			 "reloading the driver is not supported\n");
		return -EBUSY;
	}

	/* The memory is not actually busy - make add_memory() work. */
	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
	return 0;
}

/*
 * Release and free the parent resource along with its duplicated name.
 * Does nothing if there is no parent resource.
 */
static void virtio_mem_delete_resource(struct virtio_mem *vm)
{
	const char *name;

	if (!vm->parent_resource)
		return;

	/* Grab the name first; the resource is freed before the name. */
	name = vm->parent_resource->name;
	release_resource(vm->parent_resource);
	kfree(vm->parent_resource);
	kfree(name);
	vm->parent_resource = NULL;
}

/*
 * Resource walk callback: always returns 1, which
 * virtio_mem_has_memory_added() checks for.
 */
static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
{
	return 1;
}

/*
 * Test if the device memory region contains any resource that is flagged as
 * busy system RAM.
 */
static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
{
	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
				   vm->addr + vm->region_size, NULL,
				   virtio_mem_range_has_system_ram) == 1;
}

/*
 * Driver probe: allocate and initialize the device structure, set up the
 * virtqueue, initialize the device from its config, and kick off processing
 * of the requested size (unless running in a kdump kernel).
 *
 * Returns 0 on success or a negative error code; on failure the virtqueue is
 * deleted (if it was set up) and the device structure is freed.
 */
static int virtio_mem_probe(struct virtio_device *vdev)
{
	struct virtio_mem *vm;
	int rc;

	/* Catch accidental changes to the request/response layout. */
	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);

	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
	if (!vm)
		return -ENOMEM;

	init_waitqueue_head(&vm->host_resp);
	vm->vdev = vdev;
	INIT_WORK(&vm->wq, virtio_mem_run_wq);
	mutex_init(&vm->hotplug_mutex);
	INIT_LIST_HEAD(&vm->next);
	spin_lock_init(&vm->removal_lock);
	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	vm->retry_timer.function = virtio_mem_timer_expired;
	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
	vm->in_kdump = is_kdump_kernel();

	/* register the virtqueue */
	rc = virtio_mem_init_vq(vm);
	if (rc)
		goto out_free_vm;

	/* initialize the device by querying the config */
	rc = virtio_mem_init(vm);
	if (rc)
		goto out_del_vq;

	virtio_device_ready(vdev);

	/* trigger a config update to start processing the requested_size */
	if (!vm->in_kdump) {
		atomic_set(&vm->config_changed, 1);
		queue_work(system_freezable_wq, &vm->wq);
	}

	return 0;
out_del_vq:
	vdev->config->del_vqs(vdev);
out_free_vm:
	kfree(vm);
	vdev->priv = NULL;

	return rc;
}

/*
 * Tear down memory hot(un)plug handling: stop the workqueue and the retry
 * timer, remove partially plugged offline memory blocks (SBM only),
 * unregister all callbacks, and free the tracking data. The parent resource
 * and the memory group are only released if no device memory is still added
 * to the system.
 */
static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
{
	unsigned long mb_id;
	int rc;

	/*
	 * Make sure the workqueue won't be triggered anymore and no memory
	 * blocks can be onlined/offlined until we're finished here.
	 */
	mutex_lock(&vm->hotplug_mutex);
	spin_lock_irq(&vm->removal_lock);
	vm->removing = true;
	spin_unlock_irq(&vm->removal_lock);
	mutex_unlock(&vm->hotplug_mutex);

	/* wait until the workqueue stopped */
	cancel_work_sync(&vm->wq);
	hrtimer_cancel(&vm->retry_timer);

	if (vm->in_sbm) {
		/*
		 * After we unregistered our callbacks, user space can online
		 * partially plugged offline blocks. Make sure to remove them.
		 */
		virtio_mem_sbm_for_each_mb(vm, mb_id,
					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
			BUG_ON(rc);
			virtio_mem_sbm_set_mb_state(vm, mb_id,
						    VIRTIO_MEM_SBM_MB_UNUSED);
		}
		/*
		 * After we unregistered our callbacks, user space can no longer
		 * offline partially plugged online memory blocks. No need to
		 * worry about them.
		 */
	}

	/* unregister callbacks */
	unregister_virtio_mem_device(vm);
	unregister_pm_notifier(&vm->pm_notifier);
	unregister_memory_notifier(&vm->memory_notifier);

	/*
	 * There is no way we could reliably remove all memory we have added to
	 * the system. And there is no way to stop the driver/device from going
	 * away. Warn at least.
	 */
	if (virtio_mem_has_memory_added(vm)) {
		dev_warn(&vm->vdev->dev,
			 "device still has system memory added\n");
	} else {
		virtio_mem_delete_resource(vm);
		kfree_const(vm->resource_name);
		memory_group_unregister(vm->mgid);
	}

	/* remove all tracking data - no locking needed */
	if (vm->in_sbm) {
		vfree(vm->sbm.mb_states);
		vfree(vm->sbm.sb_states);
	} else {
		vfree(vm->bbm.bb_states);
	}
}

/*
 * Tear down kdump handling: unregister the vmcore callback, if one was
 * registered (CONFIG_PROC_VMCORE).
 */
static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
{
#ifdef CONFIG_PROC_VMCORE
	unregister_vmcore_cb(&vm->vmcore_cb);
#endif /* CONFIG_PROC_VMCORE */
}

/*
 * Driver remove: undo the kdump or hotplug initialization, reset the device,
 * delete the virtqueues and free the device structure.
 */
static void virtio_mem_remove(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;

	if (vm->in_kdump)
		virtio_mem_deinit_kdump(vm);
	else
		virtio_mem_deinit_hotplug(vm);

	/* reset the device and cleanup the queues */
	virtio_reset_device(vdev);
	vdev->config->del_vqs(vdev);

	kfree(vm);
	vdev->priv = NULL;
}

/*
 * Config change callback: flag the change and trigger a retry so that the
 * workqueue re-reads the config. Ignored in a kdump kernel.
 */
static void virtio_mem_config_changed(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;

	if (unlikely(vm->in_kdump))
		return;

	atomic_set(&vm->config_changed, 1);
	virtio_mem_retry(vm);
}

#ifdef CONFIG_PM_SLEEP
/*
 * Freeze callback: refuse with -EPERM if memory is plugged, the suspend
 * target is not suspend-to-idle and the device lacks
 * VIRTIO_MEM_F_PERSISTENT_SUSPEND. Otherwise reset the device, delete the
 * virtqueues and return 0.
 */
static int virtio_mem_freeze(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;

	/*
	 * We block hibernation using the PM notifier completely. The workqueue
	 * is already frozen by the PM core at this point, so we simply
	 * reset the device and cleanup the queues.
	 */
	if (pm_suspend_target_state != PM_SUSPEND_TO_IDLE &&
	    vm->plugged_size &&
	    !virtio_has_feature(vm->vdev, VIRTIO_MEM_F_PERSISTENT_SUSPEND)) {
		dev_err(&vm->vdev->dev,
			"suspending with plugged memory is not supported\n");
		return -EPERM;
	}

	virtio_reset_device(vdev);
	vdev->config->del_vqs(vdev);
	/* The virtqueue is gone; virtio_mem_restore() sets it up again. */
	vm->vq = NULL;
	return 0;
}

/*
 * Restore callback: set up the virtqueue again, mark the device ready and
 * trigger processing of any config change.
 *
 * Returns 0 on success or the error from setting up the virtqueue.
 */
static int virtio_mem_restore(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;
	int ret;

	ret = virtio_mem_init_vq(vm);
	if (ret)
		return ret;
	virtio_device_ready(vdev);

	/* Let's check if anything changed. */
	virtio_mem_config_changed(vdev);
	return 0;
}
#endif

/* Feature bits listed in the driver's feature table. */
static unsigned int virtio_mem_features[] = {
#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
	VIRTIO_MEM_F_ACPI_PXM,
#endif
	VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
	VIRTIO_MEM_F_PERSISTENT_SUSPEND,
};

/* Device ids this driver binds to. */
static const struct virtio_device_id virtio_mem_id_table[] = {
	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

/* Driver description: feature/id tables and the callbacks defined above. */
static struct virtio_driver virtio_mem_driver = {
	.feature_table = virtio_mem_features,
	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
	.driver.name = KBUILD_MODNAME,
	.id_table = virtio_mem_id_table,
	.probe = virtio_mem_probe,
	.remove = virtio_mem_remove,
	.config_changed = virtio_mem_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze = virtio_mem_freeze,
	.restore = virtio_mem_restore,
#endif
};

module_virtio_driver(virtio_mem_driver);
MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
MODULE_DESCRIPTION("Virtio-mem driver");
MODULE_LICENSE("GPL");