1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Virtio-mem device driver. 4 * 5 * Copyright Red Hat, Inc. 2020 6 * 7 * Author(s): David Hildenbrand <david@redhat.com> 8 */ 9 10 #include <linux/virtio.h> 11 #include <linux/virtio_mem.h> 12 #include <linux/workqueue.h> 13 #include <linux/slab.h> 14 #include <linux/module.h> 15 #include <linux/mm.h> 16 #include <linux/memory_hotplug.h> 17 #include <linux/memory.h> 18 #include <linux/hrtimer.h> 19 #include <linux/crash_dump.h> 20 #include <linux/mutex.h> 21 #include <linux/bitmap.h> 22 #include <linux/lockdep.h> 23 #include <linux/log2.h> 24 #include <linux/vmalloc.h> 25 #include <linux/suspend.h> 26 27 #include <acpi/acpi_numa.h> 28 29 static bool unplug_online = true; 30 module_param(unplug_online, bool, 0644); 31 MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); 32 33 static bool force_bbm; 34 module_param(force_bbm, bool, 0444); 35 MODULE_PARM_DESC(force_bbm, 36 "Force Big Block Mode. Default is 0 (auto-selection)"); 37 38 static unsigned long bbm_block_size; 39 module_param(bbm_block_size, ulong, 0444); 40 MODULE_PARM_DESC(bbm_block_size, 41 "Big Block size in bytes. Default is 0 (auto-detection)."); 42 43 /* 44 * virtio-mem currently supports the following modes of operation: 45 * 46 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The 47 * size of a Sub Block (SB) is determined based on the device block size, the 48 * pageblock size, and the maximum allocation granularity of the buddy. 49 * Subblocks within a Linux memory block might either be plugged or unplugged. 50 * Memory is added/removed to Linux MM in Linux memory block granularity. 51 * 52 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. 53 * Memory is added/removed to Linux MM in Big Block granularity. 54 * 55 * The mode is determined automatically based on the Linux memory block size 56 * and the device block size. 
 *
 * User space / core MM (auto onlining) is responsible for onlining added
 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
 * always onlined separately, and all memory within a Linux memory block is
 * onlined to the same zone - virtio-mem relies on this behavior.
 */

/*
 * State of a Linux memory block in SBM.
 *
 * One state is stored per memory block in sbm.mb_states (one byte each), and
 * sbm.mb_count[] keeps the number of blocks in each state.
 */
enum virtio_mem_sbm_mb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_SBM_MB_UNUSED = 0,
	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_SBM_MB_PLUGGED,
	/* Fully plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE,
	/* Partially plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
	/* Fully plugged, fully added to Linux, onlined to a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL,
	/* Partially plugged, fully added to Linux, onlined to a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
	/* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE,
	/* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
	/* Number of states; used to size sbm.mb_count[]. */
	VIRTIO_MEM_SBM_MB_COUNT
};

/*
 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
 *
 * One state is stored per big block in bbm.bb_states (one byte each), and
 * bbm.bb_count[] keeps the number of big blocks in each state.
 */
enum virtio_mem_bbm_bb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_BBM_BB_UNUSED = 0,
	/* Plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_BBM_BB_PLUGGED,
	/* Plugged and added to Linux. */
	VIRTIO_MEM_BBM_BB_ADDED,
	/* All online parts are fake-offline, ready to remove. */
	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
	/* Number of states; used to size bbm.bb_count[]. */
	VIRTIO_MEM_BBM_BB_COUNT
};

/*
 * Per-device driver state of a virtio-mem device.
 */
struct virtio_mem {
	struct virtio_device *vdev;

	/* We might first have to unplug all memory when starting up. */
	bool unplug_all_required;

	/*
	 * Work item that processes the plug/unplug requests. It is queued on
	 * system_freezable_wq by virtio_mem_retry().
	 */
	struct work_struct wq;
	atomic_t wq_active;
	atomic_t config_changed;

	/* Virtqueue for guest->host requests. */
	struct virtqueue *vq;

	/* Wait for a host response to a guest request. */
	wait_queue_head_t host_resp;

	/* Space for one guest request and the host response. */
	struct virtio_mem_req req;
	struct virtio_mem_resp resp;

	/* The current size of the device. */
	uint64_t plugged_size;
	/* The requested size of the device. */
	uint64_t requested_size;

	/* The device block size (for communicating with the device). */
	uint64_t device_block_size;
	/* The determined node id for all memory of the device. */
	int nid;
	/* Physical start address of the memory region. */
	uint64_t addr;
	/* Maximum region size in bytes. */
	uint64_t region_size;

	/* The parent resource for all memory added via this device. */
	struct resource *parent_resource;
	/*
	 * Copy of "System RAM (virtio_mem)" to be used for
	 * add_memory_driver_managed().
	 */
	const char *resource_name;
	/* Memory group identification. */
	int mgid;

	/*
	 * We don't want to add too much memory if it's not getting onlined,
	 * to avoid running OOM. Besides this threshold, we allow to have at
	 * least two offline blocks at a time (whatever is bigger).
	 */
#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
	atomic64_t offline_size;
	uint64_t offline_threshold;

	/* If set, the driver is in SBM, otherwise in BBM. */
	bool in_sbm;

	union {
		struct {
			/* Id of the first memory block of this device. */
			unsigned long first_mb_id;
			/* Id of the last usable memory block of this device. */
			unsigned long last_usable_mb_id;
			/* Id of the next memory block to prepare when needed. */
			unsigned long next_mb_id;

			/* The subblock size. */
			uint64_t sb_size;
			/* The number of subblocks per Linux memory block. */
			uint32_t sbs_per_mb;

			/*
			 * Some of the Linux memory blocks tracked as "partially
			 * plugged" are completely unplugged and can be offlined
			 * and removed -- which previously failed.
			 */
			bool have_unplugged_mb;

			/* Summary of all memory block states. */
			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];

			/*
			 * One byte state per memory block. Allocated via
			 * vmalloc(). Resized (alloc+copy+free) on demand.
			 *
			 * With 128 MiB memory blocks, we have states for 512
			 * GiB of memory in one 4 KiB page.
			 */
			uint8_t *mb_states;

			/*
			 * Bitmap: one bit per subblock. Allocated similar to
			 * sbm.mb_states.
			 *
			 * A set bit means the corresponding subblock is
			 * plugged, otherwise it's unplugged.
			 *
			 * With 4 MiB subblocks, we manage 128 GiB of memory
			 * in one 4 KiB page.
			 */
			unsigned long *sb_states;
		} sbm;

		struct {
			/* Id of the first big block of this device. */
			unsigned long first_bb_id;
			/* Id of the last usable big block of this device. */
			unsigned long last_usable_bb_id;
			/* Id of the next big block to prepare when needed. */
			unsigned long next_bb_id;

			/* Summary of all big block states. */
			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];

			/* One byte state per big block. See sbm.mb_states. */
			uint8_t *bb_states;

			/* The block size used for plugging/adding/removing. */
			uint64_t bb_size;
		} bbm;
	};

	/*
	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states
	 *
	 * When this lock is held the pointers can't change, ONLINE and
	 * OFFLINE blocks can't change the state and no subblocks will get
	 * plugged/unplugged.
	 *
	 * In kdump mode, used to serialize requests, last_block_addr and
	 * last_block_plugged.
	 */
	struct mutex hotplug_mutex;
	/*
	 * Set by the memory notifier while it holds hotplug_mutex between a
	 * MEM_GOING_{ONLINE,OFFLINE} event and the matching completion/cancel
	 * event, so the cancel handlers know whether there is a lock to drop.
	 */
	bool hotplug_active;

	/* An error occurred we cannot handle - stop processing requests. */
	bool broken;

	/* Cached value of is_kdump_kernel() when the device was probed. */
	bool in_kdump;

	/*
	 * The driver is being removed. virtio_mem_retry() checks "removing"
	 * under removal_lock before queueing the work item.
	 */
	spinlock_t removal_lock;
	bool removing;

	/* Timer for retrying to plug/unplug memory. */
	struct hrtimer retry_timer;
	unsigned int retry_timer_ms;
#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000

	/* Memory notifier (online/offline events). */
	struct notifier_block memory_notifier;

	/* Notifier to block hibernation image storing/reloading. */
	struct notifier_block pm_notifier;

#ifdef CONFIG_PROC_VMCORE
	/* vmcore callback for /proc/vmcore handling in kdump mode */
	struct vmcore_cb vmcore_cb;
	uint64_t last_block_addr;
	bool last_block_plugged;
#endif /* CONFIG_PROC_VMCORE */

	/* Next device in the list of virtio-mem devices. */
	struct list_head next;
};

/*
 * We have to share a single online_page callback among all virtio-mem
 * devices. We use RCU to iterate the list in the callback.
 */
static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);

static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages);
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages);
static void virtio_mem_retry(struct virtio_mem *vm);
static int virtio_mem_create_resource(struct virtio_mem *vm);
static void virtio_mem_delete_resource(struct virtio_mem *vm);

/*
 * Register a virtio-mem device so it will be considered for the online_page
 * callback.
290 */ 291 static int register_virtio_mem_device(struct virtio_mem *vm) 292 { 293 int rc = 0; 294 295 /* First device registers the callback. */ 296 mutex_lock(&virtio_mem_mutex); 297 if (list_empty(&virtio_mem_devices)) 298 rc = set_online_page_callback(&virtio_mem_online_page_cb); 299 if (!rc) 300 list_add_rcu(&vm->next, &virtio_mem_devices); 301 mutex_unlock(&virtio_mem_mutex); 302 303 return rc; 304 } 305 306 /* 307 * Unregister a virtio-mem device so it will no longer be considered for the 308 * online_page callback. 309 */ 310 static void unregister_virtio_mem_device(struct virtio_mem *vm) 311 { 312 /* Last device unregisters the callback. */ 313 mutex_lock(&virtio_mem_mutex); 314 list_del_rcu(&vm->next); 315 if (list_empty(&virtio_mem_devices)) 316 restore_online_page_callback(&virtio_mem_online_page_cb); 317 mutex_unlock(&virtio_mem_mutex); 318 319 synchronize_rcu(); 320 } 321 322 /* 323 * Calculate the memory block id of a given address. 324 */ 325 static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr) 326 { 327 return addr / memory_block_size_bytes(); 328 } 329 330 /* 331 * Calculate the physical start address of a given memory block id. 332 */ 333 static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) 334 { 335 return mb_id * memory_block_size_bytes(); 336 } 337 338 /* 339 * Calculate the big block id of a given address. 340 */ 341 static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, 342 uint64_t addr) 343 { 344 return addr / vm->bbm.bb_size; 345 } 346 347 /* 348 * Calculate the physical start address of a given big block id. 349 */ 350 static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, 351 unsigned long bb_id) 352 { 353 return bb_id * vm->bbm.bb_size; 354 } 355 356 /* 357 * Calculate the subblock id of a given address. 
358 */ 359 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, 360 unsigned long addr) 361 { 362 const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); 363 const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); 364 365 return (addr - mb_addr) / vm->sbm.sb_size; 366 } 367 368 /* 369 * Set the state of a big block, taking care of the state counter. 370 */ 371 static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, 372 unsigned long bb_id, 373 enum virtio_mem_bbm_bb_state state) 374 { 375 const unsigned long idx = bb_id - vm->bbm.first_bb_id; 376 enum virtio_mem_bbm_bb_state old_state; 377 378 old_state = vm->bbm.bb_states[idx]; 379 vm->bbm.bb_states[idx] = state; 380 381 BUG_ON(vm->bbm.bb_count[old_state] == 0); 382 vm->bbm.bb_count[old_state]--; 383 vm->bbm.bb_count[state]++; 384 } 385 386 /* 387 * Get the state of a big block. 388 */ 389 static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, 390 unsigned long bb_id) 391 { 392 return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; 393 } 394 395 /* 396 * Prepare the big block state array for the next big block. 
397 */ 398 static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) 399 { 400 unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; 401 unsigned long new_bytes = old_bytes + 1; 402 int old_pages = PFN_UP(old_bytes); 403 int new_pages = PFN_UP(new_bytes); 404 uint8_t *new_array; 405 406 if (vm->bbm.bb_states && old_pages == new_pages) 407 return 0; 408 409 new_array = vzalloc(new_pages * PAGE_SIZE); 410 if (!new_array) 411 return -ENOMEM; 412 413 mutex_lock(&vm->hotplug_mutex); 414 if (vm->bbm.bb_states) 415 memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); 416 vfree(vm->bbm.bb_states); 417 vm->bbm.bb_states = new_array; 418 mutex_unlock(&vm->hotplug_mutex); 419 420 return 0; 421 } 422 423 #define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ 424 for (_bb_id = vm->bbm.first_bb_id; \ 425 _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ 426 _bb_id++) \ 427 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 428 429 #define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ 430 for (_bb_id = vm->bbm.next_bb_id - 1; \ 431 _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ 432 _bb_id--) \ 433 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 434 435 /* 436 * Set the state of a memory block, taking care of the state counter. 437 */ 438 static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, 439 unsigned long mb_id, uint8_t state) 440 { 441 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 442 uint8_t old_state; 443 444 old_state = vm->sbm.mb_states[idx]; 445 vm->sbm.mb_states[idx] = state; 446 447 BUG_ON(vm->sbm.mb_count[old_state] == 0); 448 vm->sbm.mb_count[old_state]--; 449 vm->sbm.mb_count[state]++; 450 } 451 452 /* 453 * Get the state of a memory block. 
454 */ 455 static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, 456 unsigned long mb_id) 457 { 458 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 459 460 return vm->sbm.mb_states[idx]; 461 } 462 463 /* 464 * Prepare the state array for the next memory block. 465 */ 466 static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) 467 { 468 int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); 469 int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); 470 uint8_t *new_array; 471 472 if (vm->sbm.mb_states && old_pages == new_pages) 473 return 0; 474 475 new_array = vzalloc(new_pages * PAGE_SIZE); 476 if (!new_array) 477 return -ENOMEM; 478 479 mutex_lock(&vm->hotplug_mutex); 480 if (vm->sbm.mb_states) 481 memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); 482 vfree(vm->sbm.mb_states); 483 vm->sbm.mb_states = new_array; 484 mutex_unlock(&vm->hotplug_mutex); 485 486 return 0; 487 } 488 489 #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ 490 for (_mb_id = _vm->sbm.first_mb_id; \ 491 _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ 492 _mb_id++) \ 493 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 494 495 #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ 496 for (_mb_id = _vm->sbm.next_mb_id - 1; \ 497 _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ 498 _mb_id--) \ 499 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 500 501 /* 502 * Calculate the bit number in the subblock bitmap for the given subblock 503 * inside the given memory block. 504 */ 505 static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, 506 unsigned long mb_id, int sb_id) 507 { 508 return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; 509 } 510 511 /* 512 * Mark all selected subblocks plugged. 513 * 514 * Will not modify the state of the memory block. 
515 */ 516 static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, 517 unsigned long mb_id, int sb_id, 518 int count) 519 { 520 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 521 522 __bitmap_set(vm->sbm.sb_states, bit, count); 523 } 524 525 /* 526 * Mark all selected subblocks unplugged. 527 * 528 * Will not modify the state of the memory block. 529 */ 530 static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, 531 unsigned long mb_id, int sb_id, 532 int count) 533 { 534 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 535 536 __bitmap_clear(vm->sbm.sb_states, bit, count); 537 } 538 539 /* 540 * Test if all selected subblocks are plugged. 541 */ 542 static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, 543 unsigned long mb_id, int sb_id, 544 int count) 545 { 546 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 547 548 if (count == 1) 549 return test_bit(bit, vm->sbm.sb_states); 550 551 /* TODO: Helper similar to bitmap_set() */ 552 return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= 553 bit + count; 554 } 555 556 /* 557 * Test if all selected subblocks are unplugged. 558 */ 559 static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, 560 unsigned long mb_id, int sb_id, 561 int count) 562 { 563 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 564 565 /* TODO: Helper similar to bitmap_set() */ 566 return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= 567 bit + count; 568 } 569 570 /* 571 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is 572 * none. 573 */ 574 static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, 575 unsigned long mb_id) 576 { 577 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); 578 579 return find_next_zero_bit(vm->sbm.sb_states, 580 bit + vm->sbm.sbs_per_mb, bit) - bit; 581 } 582 583 /* 584 * Prepare the subblock bitmap for the next memory block. 
585 */ 586 static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) 587 { 588 const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; 589 const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; 590 const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; 591 int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); 592 int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); 593 unsigned long *new_bitmap, *old_bitmap; 594 595 if (vm->sbm.sb_states && old_pages == new_pages) 596 return 0; 597 598 new_bitmap = vzalloc(new_pages * PAGE_SIZE); 599 if (!new_bitmap) 600 return -ENOMEM; 601 602 mutex_lock(&vm->hotplug_mutex); 603 if (vm->sbm.sb_states) 604 memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); 605 606 old_bitmap = vm->sbm.sb_states; 607 vm->sbm.sb_states = new_bitmap; 608 mutex_unlock(&vm->hotplug_mutex); 609 610 vfree(old_bitmap); 611 return 0; 612 } 613 614 /* 615 * Test if we could add memory without creating too much offline memory - 616 * to avoid running OOM if memory is getting onlined deferred. 617 */ 618 static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) 619 { 620 if (WARN_ON_ONCE(size > vm->offline_threshold)) 621 return false; 622 623 return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; 624 } 625 626 /* 627 * Try adding memory to Linux. Will usually only fail if out of memory. 628 * 629 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 630 * onlining code). 631 * 632 * Will not modify the state of memory blocks in virtio-mem. 633 */ 634 static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, 635 uint64_t size) 636 { 637 int rc; 638 639 /* 640 * When force-unloading the driver and we still have memory added to 641 * Linux, the resource name has to stay. 
642 */ 643 if (!vm->resource_name) { 644 vm->resource_name = kstrdup_const("System RAM (virtio_mem)", 645 GFP_KERNEL); 646 if (!vm->resource_name) 647 return -ENOMEM; 648 } 649 650 dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, 651 addr + size - 1); 652 /* Memory might get onlined immediately. */ 653 atomic64_add(size, &vm->offline_size); 654 rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name, 655 MHP_MERGE_RESOURCE | MHP_NID_IS_MGID); 656 if (rc) { 657 atomic64_sub(size, &vm->offline_size); 658 dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); 659 /* 660 * TODO: Linux MM does not properly clean up yet in all cases 661 * where adding of memory failed - especially on -ENOMEM. 662 */ 663 } 664 return rc; 665 } 666 667 /* 668 * See virtio_mem_add_memory(): Try adding a single Linux memory block. 669 */ 670 static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) 671 { 672 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 673 const uint64_t size = memory_block_size_bytes(); 674 675 return virtio_mem_add_memory(vm, addr, size); 676 } 677 678 /* 679 * See virtio_mem_add_memory(): Try adding a big block. 680 */ 681 static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) 682 { 683 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 684 const uint64_t size = vm->bbm.bb_size; 685 686 return virtio_mem_add_memory(vm, addr, size); 687 } 688 689 /* 690 * Try removing memory from Linux. Will only fail if memory blocks aren't 691 * offline. 692 * 693 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 694 * onlining code). 695 * 696 * Will not modify the state of memory blocks in virtio-mem. 
697 */ 698 static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, 699 uint64_t size) 700 { 701 int rc; 702 703 dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, 704 addr + size - 1); 705 rc = remove_memory(addr, size); 706 if (!rc) { 707 atomic64_sub(size, &vm->offline_size); 708 /* 709 * We might have freed up memory we can now unplug, retry 710 * immediately instead of waiting. 711 */ 712 virtio_mem_retry(vm); 713 } else { 714 dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); 715 } 716 return rc; 717 } 718 719 /* 720 * See virtio_mem_remove_memory(): Try removing a single Linux memory block. 721 */ 722 static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) 723 { 724 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 725 const uint64_t size = memory_block_size_bytes(); 726 727 return virtio_mem_remove_memory(vm, addr, size); 728 } 729 730 /* 731 * Try offlining and removing memory from Linux. 732 * 733 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 734 * onlining code). 735 * 736 * Will not modify the state of memory blocks in virtio-mem. 737 */ 738 static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, 739 uint64_t addr, 740 uint64_t size) 741 { 742 int rc; 743 744 dev_dbg(&vm->vdev->dev, 745 "offlining and removing memory: 0x%llx - 0x%llx\n", addr, 746 addr + size - 1); 747 748 rc = offline_and_remove_memory(addr, size); 749 if (!rc) { 750 atomic64_sub(size, &vm->offline_size); 751 /* 752 * We might have freed up memory we can now unplug, retry 753 * immediately instead of waiting. 754 */ 755 virtio_mem_retry(vm); 756 return 0; 757 } 758 dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc); 759 /* 760 * We don't really expect this to fail, because we fake-offlined all 761 * memory already. But it could fail in corner cases. 762 */ 763 WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY); 764 return rc == -ENOMEM ? 
-ENOMEM : -EBUSY; 765 } 766 767 /* 768 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing 769 * a single Linux memory block. 770 */ 771 static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, 772 unsigned long mb_id) 773 { 774 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 775 const uint64_t size = memory_block_size_bytes(); 776 777 return virtio_mem_offline_and_remove_memory(vm, addr, size); 778 } 779 780 /* 781 * Try (offlining and) removing memory from Linux in case all subblocks are 782 * unplugged. Can be called on online and offline memory blocks. 783 * 784 * May modify the state of memory blocks in virtio-mem. 785 */ 786 static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm, 787 unsigned long mb_id) 788 { 789 int rc; 790 791 /* 792 * Once all subblocks of a memory block were unplugged, offline and 793 * remove it. 794 */ 795 if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 796 return 0; 797 798 /* offline_and_remove_memory() works for online and offline memory. */ 799 mutex_unlock(&vm->hotplug_mutex); 800 rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); 801 mutex_lock(&vm->hotplug_mutex); 802 if (!rc) 803 virtio_mem_sbm_set_mb_state(vm, mb_id, 804 VIRTIO_MEM_SBM_MB_UNUSED); 805 return rc; 806 } 807 808 /* 809 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a 810 * all Linux memory blocks covered by the big block. 811 */ 812 static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, 813 unsigned long bb_id) 814 { 815 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 816 const uint64_t size = vm->bbm.bb_size; 817 818 return virtio_mem_offline_and_remove_memory(vm, addr, size); 819 } 820 821 /* 822 * Trigger the workqueue so the device can perform its magic. 
823 */ 824 static void virtio_mem_retry(struct virtio_mem *vm) 825 { 826 unsigned long flags; 827 828 spin_lock_irqsave(&vm->removal_lock, flags); 829 if (!vm->removing) 830 queue_work(system_freezable_wq, &vm->wq); 831 spin_unlock_irqrestore(&vm->removal_lock, flags); 832 } 833 834 static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) 835 { 836 int node = NUMA_NO_NODE; 837 838 #if defined(CONFIG_ACPI_NUMA) 839 if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM)) 840 node = pxm_to_node(node_id); 841 #endif 842 return node; 843 } 844 845 /* 846 * Test if a virtio-mem device overlaps with the given range. Can be called 847 * from (notifier) callbacks lockless. 848 */ 849 static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, 850 uint64_t size) 851 { 852 return start < vm->addr + vm->region_size && vm->addr < start + size; 853 } 854 855 /* 856 * Test if a virtio-mem device contains a given range. Can be called from 857 * (notifier) callbacks lockless. 
858 */ 859 static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, 860 uint64_t size) 861 { 862 return start >= vm->addr && start + size <= vm->addr + vm->region_size; 863 } 864 865 static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, 866 unsigned long mb_id) 867 { 868 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 869 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 870 case VIRTIO_MEM_SBM_MB_OFFLINE: 871 return NOTIFY_OK; 872 default: 873 break; 874 } 875 dev_warn_ratelimited(&vm->vdev->dev, 876 "memory block onlining denied\n"); 877 return NOTIFY_BAD; 878 } 879 880 static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, 881 unsigned long mb_id) 882 { 883 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 884 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 885 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 886 virtio_mem_sbm_set_mb_state(vm, mb_id, 887 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 888 break; 889 case VIRTIO_MEM_SBM_MB_KERNEL: 890 case VIRTIO_MEM_SBM_MB_MOVABLE: 891 virtio_mem_sbm_set_mb_state(vm, mb_id, 892 VIRTIO_MEM_SBM_MB_OFFLINE); 893 break; 894 default: 895 BUG(); 896 break; 897 } 898 } 899 900 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, 901 unsigned long mb_id, 902 unsigned long start_pfn) 903 { 904 const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn)); 905 int new_state; 906 907 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 908 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 909 new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; 910 if (is_movable) 911 new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; 912 break; 913 case VIRTIO_MEM_SBM_MB_OFFLINE: 914 new_state = VIRTIO_MEM_SBM_MB_KERNEL; 915 if (is_movable) 916 new_state = VIRTIO_MEM_SBM_MB_MOVABLE; 917 break; 918 default: 919 BUG(); 920 break; 921 } 922 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 923 } 924 925 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, 926 unsigned long mb_id) 927 { 928 const unsigned long nr_pages = 
PFN_DOWN(vm->sbm.sb_size); 929 unsigned long pfn; 930 int sb_id; 931 932 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 933 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 934 continue; 935 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 936 sb_id * vm->sbm.sb_size); 937 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 938 } 939 } 940 941 static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, 942 unsigned long mb_id) 943 { 944 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 945 unsigned long pfn; 946 int sb_id; 947 948 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 949 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 950 continue; 951 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 952 sb_id * vm->sbm.sb_size); 953 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 954 } 955 } 956 957 static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, 958 unsigned long bb_id, 959 unsigned long pfn, 960 unsigned long nr_pages) 961 { 962 /* 963 * When marked as "fake-offline", all online memory of this device block 964 * is allocated by us. Otherwise, we don't have any memory allocated. 965 */ 966 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 967 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 968 return; 969 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 970 } 971 972 static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, 973 unsigned long bb_id, 974 unsigned long pfn, 975 unsigned long nr_pages) 976 { 977 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 978 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 979 return; 980 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 981 } 982 983 /* 984 * This callback will either be called synchronously from add_memory() or 985 * asynchronously (e.g., triggered via user space). We have to be careful 986 * with locking when calling add_memory(). 
 *
 * Locking: the MEM_GOING_{ONLINE,OFFLINE} cases take vm->hotplug_mutex and
 * set vm->hotplug_active; the mutex is only released by a later invocation
 * handling the matching MEM_{ONLINE,OFFLINE} or MEM_CANCEL_* action.
 *
 * Returns NOTIFY_DONE if the range does not overlap this device, NOTIFY_BAD
 * if the granularity checks fail or onlining is denied, an errno-based
 * notifier value while the driver is being removed, and NOTIFY_OK otherwise.
 */
static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
					 unsigned long action, void *arg)
{
	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
					     memory_notifier);
	struct memory_notify *mhp = arg;
	const unsigned long start = PFN_PHYS(mhp->start_pfn);
	const unsigned long size = PFN_PHYS(mhp->nr_pages);
	int rc = NOTIFY_OK;
	unsigned long id;

	/* Not our memory - ignore the event. */
	if (!virtio_mem_overlaps_range(vm, start, size))
		return NOTIFY_DONE;

	if (vm->in_sbm) {
		id = virtio_mem_phys_to_mb_id(start);
		/*
		 * In SBM, we add memory in separate memory blocks - we expect
		 * it to be onlined/offlined in the same granularity. Bail out
		 * if this ever changes.
		 */
		if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
				 !IS_ALIGNED(start, memory_block_size_bytes())))
			return NOTIFY_BAD;
	} else {
		id = virtio_mem_phys_to_bb_id(vm, start);
		/*
		 * In BBM, we only care about onlining/offlining happening
		 * within a single big block, we don't care about the
		 * actual granularity as we don't track individual Linux
		 * memory blocks.
		 */
		if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
			return NOTIFY_BAD;
	}

	/*
	 * Avoid circular locking lockdep warnings. We lock the mutex
	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
	 * between both notifier calls and will bail out. False positive.
	 */
	lockdep_off();

	switch (action) {
	case MEM_GOING_OFFLINE:
		mutex_lock(&vm->hotplug_mutex);
		/* Refuse while the driver is being removed; drop the lock again. */
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		/* The mutex stays locked until MEM_OFFLINE/MEM_CANCEL_OFFLINE. */
		vm->hotplug_active = true;
		if (vm->in_sbm)
			virtio_mem_sbm_notify_going_offline(vm, id);
		else
			virtio_mem_bbm_notify_going_offline(vm, id,
							    mhp->start_pfn,
							    mhp->nr_pages);
		break;
	case MEM_GOING_ONLINE:
		mutex_lock(&vm->hotplug_mutex);
		/* Refuse while the driver is being removed; drop the lock again. */
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		/* The mutex stays locked until MEM_ONLINE/MEM_CANCEL_ONLINE. */
		vm->hotplug_active = true;
		if (vm->in_sbm)
			rc = virtio_mem_sbm_notify_going_online(vm, id);
		break;
	case MEM_OFFLINE:
		if (vm->in_sbm)
			virtio_mem_sbm_notify_offline(vm, id);

		/* The range is offline now; account it as offline memory. */
		atomic64_add(size, &vm->offline_size);
		/*
		 * Trigger the workqueue. Now that we have some offline memory,
		 * maybe we can handle pending unplug requests.
		 */
		if (!unplug_online)
			virtio_mem_retry(vm);

		/* Release the mutex taken in MEM_GOING_OFFLINE. */
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_ONLINE:
		if (vm->in_sbm)
			virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);

		/* The range is online now; it no longer counts as offline. */
		atomic64_sub(size, &vm->offline_size);
		/*
		 * Start adding more memory once we onlined half of our
		 * threshold. Don't trigger if it's possibly due to our action
		 * (e.g., us adding memory which gets onlined immediately from
		 * the core).
		 */
		if (!atomic_read(&vm->wq_active) &&
		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
			virtio_mem_retry(vm);

		/* Release the mutex taken in MEM_GOING_ONLINE. */
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_OFFLINE:
		/* Nothing to undo if MEM_GOING_OFFLINE did not keep the lock. */
		if (!vm->hotplug_active)
			break;
		if (vm->in_sbm)
			virtio_mem_sbm_notify_cancel_offline(vm, id);
		else
			virtio_mem_bbm_notify_cancel_offline(vm, id,
							     mhp->start_pfn,
							     mhp->nr_pages);
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_ONLINE:
		/* Nothing to undo if MEM_GOING_ONLINE did not keep the lock. */
		if (!vm->hotplug_active)
			break;
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	default:
		break;
	}

	lockdep_on();

	return rc;
}

/*
 * PM notifier: veto hibernation and restore-from-image requests with
 * NOTIFY_BAD and let every other PM event pass.
 */
static int virtio_mem_pm_notifier_cb(struct notifier_block *nb,
				     unsigned long action, void *arg)
{
	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
					     pm_notifier);
	switch (action) {
	case PM_HIBERNATION_PREPARE:
	case PM_RESTORE_PREPARE:
		/*
		 * When restarting the VM, all memory is unplugged. Don't
		 * allow to hibernate and restore from an image.
		 */
		dev_err(&vm->vdev->dev, "hibernation is not supported.\n");
		return NOTIFY_BAD;
	default:
		return NOTIFY_OK;
	}
}

/*
 * Set a range of pages PG_offline. Remember pages that were never onlined
 * (via generic_online_page()) using PageDirty().
 */
static void virtio_mem_set_fake_offline(unsigned long pfn,
					unsigned long nr_pages, bool onlined)
{
	page_offline_begin();
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		if (!onlined)
			/*
			 * Pages that have not been onlined yet were initialized
			 * to PageOffline(). Remember that we have to route them
			 * through generic_online_page().
1154 */ 1155 SetPageDirty(page); 1156 else 1157 __SetPageOffline(page); 1158 VM_WARN_ON_ONCE(!PageOffline(page)); 1159 } 1160 page_offline_end(); 1161 } 1162 1163 /* 1164 * Clear PG_offline from a range of pages. If the pages were never onlined, 1165 * (via generic_online_page()), clear PageDirty(). 1166 */ 1167 static void virtio_mem_clear_fake_offline(unsigned long pfn, 1168 unsigned long nr_pages, bool onlined) 1169 { 1170 for (; nr_pages--; pfn++) { 1171 struct page *page = pfn_to_page(pfn); 1172 1173 if (!onlined) 1174 /* generic_online_page() will clear PageOffline(). */ 1175 ClearPageDirty(page); 1176 else 1177 __ClearPageOffline(page); 1178 } 1179 } 1180 1181 /* 1182 * Release a range of fake-offline pages to the buddy, effectively 1183 * fake-onlining them. 1184 */ 1185 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) 1186 { 1187 unsigned long order = MAX_PAGE_ORDER; 1188 unsigned long i; 1189 1190 /* 1191 * We might get called for ranges that don't cover properly aligned 1192 * MAX_PAGE_ORDER pages; however, we can only online properly aligned 1193 * pages with an order of MAX_PAGE_ORDER at maximum. 1194 */ 1195 while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) 1196 order--; 1197 1198 for (i = 0; i < nr_pages; i += 1 << order) { 1199 struct page *page = pfn_to_page(pfn + i); 1200 1201 /* 1202 * If the page is PageDirty(), it was kept fake-offline when 1203 * onlining the memory block. Otherwise, it was allocated 1204 * using alloc_contig_range(). All pages in a subblock are 1205 * alike. 1206 */ 1207 if (PageDirty(page)) { 1208 virtio_mem_clear_fake_offline(pfn + i, 1 << order, false); 1209 generic_online_page(page, order); 1210 } else { 1211 virtio_mem_clear_fake_offline(pfn + i, 1 << order, true); 1212 free_contig_range(pfn + i, 1 << order); 1213 adjust_managed_page_count(page, 1 << order); 1214 } 1215 } 1216 } 1217 1218 /* 1219 * Try to allocate a range, marking pages fake-offline, effectively 1220 * fake-offlining them. 
1221 */ 1222 static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn, 1223 unsigned long nr_pages) 1224 { 1225 const bool is_movable = is_zone_movable_page(pfn_to_page(pfn)); 1226 int rc, retry_count; 1227 1228 /* 1229 * TODO: We want an alloc_contig_range() mode that tries to allocate 1230 * harder (e.g., dealing with temporarily pinned pages, PCP), especially 1231 * with ZONE_MOVABLE. So for now, retry a couple of times with 1232 * ZONE_MOVABLE before giving up - because that zone is supposed to give 1233 * some guarantees. 1234 */ 1235 for (retry_count = 0; retry_count < 5; retry_count++) { 1236 /* 1237 * If the config changed, stop immediately and go back to the 1238 * main loop: avoid trying to keep unplugging if the device 1239 * might have decided to not remove any more memory. 1240 */ 1241 if (atomic_read(&vm->config_changed)) 1242 return -EAGAIN; 1243 1244 rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, 1245 GFP_KERNEL); 1246 if (rc == -ENOMEM) 1247 /* whoops, out of memory */ 1248 return rc; 1249 else if (rc && !is_movable) 1250 break; 1251 else if (rc) 1252 continue; 1253 1254 virtio_mem_set_fake_offline(pfn, nr_pages, true); 1255 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1256 return 0; 1257 } 1258 1259 return -EBUSY; 1260 } 1261 1262 /* 1263 * Handle fake-offline pages when memory is going offline - such that the 1264 * pages can be skipped by mm-core when offlining. 1265 */ 1266 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 1267 unsigned long nr_pages) 1268 { 1269 struct page *page; 1270 unsigned long i; 1271 1272 /* Drop our reference to the pages so the memory can get offlined. 
*/ 1273 for (i = 0; i < nr_pages; i++) { 1274 page = pfn_to_page(pfn + i); 1275 if (WARN_ON(!page_ref_dec_and_test(page))) 1276 dump_page(page, "fake-offline page referenced"); 1277 } 1278 } 1279 1280 /* 1281 * Handle fake-offline pages when memory offlining is canceled - to undo 1282 * what we did in virtio_mem_fake_offline_going_offline(). 1283 */ 1284 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 1285 unsigned long nr_pages) 1286 { 1287 unsigned long i; 1288 1289 /* 1290 * Get the reference again that we dropped via page_ref_dec_and_test() 1291 * when going offline. 1292 */ 1293 for (i = 0; i < nr_pages; i++) 1294 page_ref_inc(pfn_to_page(pfn + i)); 1295 } 1296 1297 static void virtio_mem_online_page(struct virtio_mem *vm, 1298 struct page *page, unsigned int order) 1299 { 1300 const unsigned long start = page_to_phys(page); 1301 const unsigned long end = start + PFN_PHYS(1 << order); 1302 unsigned long addr, next, id, sb_id, count; 1303 bool do_online; 1304 1305 /* 1306 * We can get called with any order up to MAX_PAGE_ORDER. If our subblock 1307 * size is smaller than that and we have a mixture of plugged and 1308 * unplugged subblocks within such a page, we have to process in 1309 * smaller granularity. In that case we'll adjust the order exactly once 1310 * within the loop. 1311 */ 1312 for (addr = start; addr < end; ) { 1313 next = addr + PFN_PHYS(1 << order); 1314 1315 if (vm->in_sbm) { 1316 id = virtio_mem_phys_to_mb_id(addr); 1317 sb_id = virtio_mem_phys_to_sb_id(vm, addr); 1318 count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1; 1319 1320 if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) { 1321 /* Fully plugged. */ 1322 do_online = true; 1323 } else if (count == 1 || 1324 virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) { 1325 /* Fully unplugged. */ 1326 do_online = false; 1327 } else { 1328 /* 1329 * Mixture, process sub-blocks instead. This 1330 * will be at least the size of a pageblock. 
1331 * We'll run into this case exactly once. 1332 */ 1333 order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT; 1334 do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1); 1335 continue; 1336 } 1337 } else { 1338 /* 1339 * If the whole block is marked fake offline, keep 1340 * everything that way. 1341 */ 1342 id = virtio_mem_phys_to_bb_id(vm, addr); 1343 do_online = virtio_mem_bbm_get_bb_state(vm, id) != 1344 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; 1345 } 1346 1347 if (do_online) 1348 generic_online_page(pfn_to_page(PFN_DOWN(addr)), order); 1349 else 1350 virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, 1351 false); 1352 addr = next; 1353 } 1354 } 1355 1356 static void virtio_mem_online_page_cb(struct page *page, unsigned int order) 1357 { 1358 const unsigned long addr = page_to_phys(page); 1359 struct virtio_mem *vm; 1360 1361 rcu_read_lock(); 1362 list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { 1363 /* 1364 * Pages we're onlining will never cross memory blocks and, 1365 * therefore, not virtio-mem devices. 1366 */ 1367 if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) 1368 continue; 1369 1370 /* 1371 * virtio_mem_set_fake_offline() might sleep. We can safely 1372 * drop the RCU lock at this point because the device 1373 * cannot go away. See virtio_mem_remove() how races 1374 * between memory onlining and device removal are handled. 1375 */ 1376 rcu_read_unlock(); 1377 1378 virtio_mem_online_page(vm, page, order); 1379 return; 1380 } 1381 rcu_read_unlock(); 1382 1383 /* not virtio-mem memory, but e.g., a DIMM. 
online it */ 1384 generic_online_page(page, order); 1385 } 1386 1387 static uint64_t virtio_mem_send_request(struct virtio_mem *vm, 1388 const struct virtio_mem_req *req) 1389 { 1390 struct scatterlist *sgs[2], sg_req, sg_resp; 1391 unsigned int len; 1392 int rc; 1393 1394 /* don't use the request residing on the stack (vaddr) */ 1395 vm->req = *req; 1396 1397 /* out: buffer for request */ 1398 sg_init_one(&sg_req, &vm->req, sizeof(vm->req)); 1399 sgs[0] = &sg_req; 1400 1401 /* in: buffer for response */ 1402 sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp)); 1403 sgs[1] = &sg_resp; 1404 1405 rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL); 1406 if (rc < 0) 1407 return rc; 1408 1409 virtqueue_kick(vm->vq); 1410 1411 /* wait for a response */ 1412 wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len)); 1413 1414 return virtio16_to_cpu(vm->vdev, vm->resp.type); 1415 } 1416 1417 static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, 1418 uint64_t size) 1419 { 1420 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1421 const struct virtio_mem_req req = { 1422 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG), 1423 .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), 1424 .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1425 }; 1426 int rc = -ENOMEM; 1427 1428 if (atomic_read(&vm->config_changed)) 1429 return -EAGAIN; 1430 1431 dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, 1432 addr + size - 1); 1433 1434 switch (virtio_mem_send_request(vm, &req)) { 1435 case VIRTIO_MEM_RESP_ACK: 1436 vm->plugged_size += size; 1437 return 0; 1438 case VIRTIO_MEM_RESP_NACK: 1439 rc = -EAGAIN; 1440 break; 1441 case VIRTIO_MEM_RESP_BUSY: 1442 rc = -ETXTBSY; 1443 break; 1444 case VIRTIO_MEM_RESP_ERROR: 1445 rc = -EINVAL; 1446 break; 1447 default: 1448 break; 1449 } 1450 1451 dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); 1452 return rc; 1453 } 1454 1455 static int 
virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, 1456 uint64_t size) 1457 { 1458 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1459 const struct virtio_mem_req req = { 1460 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG), 1461 .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), 1462 .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1463 }; 1464 int rc = -ENOMEM; 1465 1466 if (atomic_read(&vm->config_changed)) 1467 return -EAGAIN; 1468 1469 dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, 1470 addr + size - 1); 1471 1472 switch (virtio_mem_send_request(vm, &req)) { 1473 case VIRTIO_MEM_RESP_ACK: 1474 vm->plugged_size -= size; 1475 return 0; 1476 case VIRTIO_MEM_RESP_BUSY: 1477 rc = -ETXTBSY; 1478 break; 1479 case VIRTIO_MEM_RESP_ERROR: 1480 rc = -EINVAL; 1481 break; 1482 default: 1483 break; 1484 } 1485 1486 dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); 1487 return rc; 1488 } 1489 1490 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) 1491 { 1492 const struct virtio_mem_req req = { 1493 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), 1494 }; 1495 int rc = -ENOMEM; 1496 1497 dev_dbg(&vm->vdev->dev, "unplugging all memory"); 1498 1499 switch (virtio_mem_send_request(vm, &req)) { 1500 case VIRTIO_MEM_RESP_ACK: 1501 vm->unplug_all_required = false; 1502 vm->plugged_size = 0; 1503 /* usable region might have shrunk */ 1504 atomic_set(&vm->config_changed, 1); 1505 return 0; 1506 case VIRTIO_MEM_RESP_BUSY: 1507 rc = -ETXTBSY; 1508 break; 1509 default: 1510 break; 1511 } 1512 1513 dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); 1514 return rc; 1515 } 1516 1517 /* 1518 * Plug selected subblocks. Updates the plugged state, but not the state 1519 * of the memory block. 
1520 */ 1521 static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, 1522 int sb_id, int count) 1523 { 1524 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1525 sb_id * vm->sbm.sb_size; 1526 const uint64_t size = count * vm->sbm.sb_size; 1527 int rc; 1528 1529 rc = virtio_mem_send_plug_request(vm, addr, size); 1530 if (!rc) 1531 virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); 1532 return rc; 1533 } 1534 1535 /* 1536 * Unplug selected subblocks. Updates the plugged state, but not the state 1537 * of the memory block. 1538 */ 1539 static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, 1540 int sb_id, int count) 1541 { 1542 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1543 sb_id * vm->sbm.sb_size; 1544 const uint64_t size = count * vm->sbm.sb_size; 1545 int rc; 1546 1547 rc = virtio_mem_send_unplug_request(vm, addr, size); 1548 if (!rc) 1549 virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); 1550 return rc; 1551 } 1552 1553 /* 1554 * Request to unplug a big block. 1555 * 1556 * Will not modify the state of the big block. 1557 */ 1558 static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) 1559 { 1560 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1561 const uint64_t size = vm->bbm.bb_size; 1562 1563 return virtio_mem_send_unplug_request(vm, addr, size); 1564 } 1565 1566 /* 1567 * Request to plug a big block. 1568 * 1569 * Will not modify the state of the big block. 1570 */ 1571 static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) 1572 { 1573 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1574 const uint64_t size = vm->bbm.bb_size; 1575 1576 return virtio_mem_send_plug_request(vm, addr, size); 1577 } 1578 1579 /* 1580 * Unplug the desired number of plugged subblocks of a offline or not-added 1581 * memory block. Will fail if any subblock cannot get unplugged (instead of 1582 * skipping it). 
1583 * 1584 * Will not modify the state of the memory block. 1585 * 1586 * Note: can fail after some subblocks were unplugged. 1587 */ 1588 static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, 1589 unsigned long mb_id, uint64_t *nb_sb) 1590 { 1591 int sb_id, count; 1592 int rc; 1593 1594 sb_id = vm->sbm.sbs_per_mb - 1; 1595 while (*nb_sb) { 1596 /* Find the next candidate subblock */ 1597 while (sb_id >= 0 && 1598 virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) 1599 sb_id--; 1600 if (sb_id < 0) 1601 break; 1602 /* Try to unplug multiple subblocks at a time */ 1603 count = 1; 1604 while (count < *nb_sb && sb_id > 0 && 1605 virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { 1606 count++; 1607 sb_id--; 1608 } 1609 1610 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1611 if (rc) 1612 return rc; 1613 *nb_sb -= count; 1614 sb_id--; 1615 } 1616 1617 return 0; 1618 } 1619 1620 /* 1621 * Unplug all plugged subblocks of an offline or not-added memory block. 1622 * 1623 * Will not modify the state of the memory block. 1624 * 1625 * Note: can fail after some subblocks were unplugged. 1626 */ 1627 static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) 1628 { 1629 uint64_t nb_sb = vm->sbm.sbs_per_mb; 1630 1631 return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); 1632 } 1633 1634 /* 1635 * Prepare tracking data for the next memory block. 1636 */ 1637 static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, 1638 unsigned long *mb_id) 1639 { 1640 int rc; 1641 1642 if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) 1643 return -ENOSPC; 1644 1645 /* Resize the state array if required. */ 1646 rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); 1647 if (rc) 1648 return rc; 1649 1650 /* Resize the subblock bitmap if required. 
*/ 1651 rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); 1652 if (rc) 1653 return rc; 1654 1655 vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; 1656 *mb_id = vm->sbm.next_mb_id++; 1657 return 0; 1658 } 1659 1660 /* 1661 * Try to plug the desired number of subblocks and add the memory block 1662 * to Linux. 1663 * 1664 * Will modify the state of the memory block. 1665 */ 1666 static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, 1667 unsigned long mb_id, uint64_t *nb_sb) 1668 { 1669 const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); 1670 int rc; 1671 1672 if (WARN_ON_ONCE(!count)) 1673 return -EINVAL; 1674 1675 /* 1676 * Plug the requested number of subblocks before adding it to linux, 1677 * so that onlining will directly online all plugged subblocks. 1678 */ 1679 rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); 1680 if (rc) 1681 return rc; 1682 1683 /* 1684 * Mark the block properly offline before adding it to Linux, 1685 * so the memory notifiers will find the block in the right state. 1686 */ 1687 if (count == vm->sbm.sbs_per_mb) 1688 virtio_mem_sbm_set_mb_state(vm, mb_id, 1689 VIRTIO_MEM_SBM_MB_OFFLINE); 1690 else 1691 virtio_mem_sbm_set_mb_state(vm, mb_id, 1692 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1693 1694 /* Add the memory block to linux - if that fails, try to unplug. */ 1695 rc = virtio_mem_sbm_add_mb(vm, mb_id); 1696 if (rc) { 1697 int new_state = VIRTIO_MEM_SBM_MB_UNUSED; 1698 1699 if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) 1700 new_state = VIRTIO_MEM_SBM_MB_PLUGGED; 1701 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 1702 return rc; 1703 } 1704 1705 *nb_sb -= count; 1706 return 0; 1707 } 1708 1709 /* 1710 * Try to plug the desired number of subblocks of a memory block that 1711 * is already added to Linux. 1712 * 1713 * Will modify the state of the memory block. 1714 * 1715 * Note: Can fail after some subblocks were successfully plugged. 
1716 */ 1717 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, 1718 unsigned long mb_id, uint64_t *nb_sb) 1719 { 1720 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1721 unsigned long pfn, nr_pages; 1722 int sb_id, count; 1723 int rc; 1724 1725 if (WARN_ON_ONCE(!*nb_sb)) 1726 return -EINVAL; 1727 1728 while (*nb_sb) { 1729 sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); 1730 if (sb_id >= vm->sbm.sbs_per_mb) 1731 break; 1732 count = 1; 1733 while (count < *nb_sb && 1734 sb_id + count < vm->sbm.sbs_per_mb && 1735 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) 1736 count++; 1737 1738 rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); 1739 if (rc) 1740 return rc; 1741 *nb_sb -= count; 1742 if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 1743 continue; 1744 1745 /* fake-online the pages if the memory block is online */ 1746 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1747 sb_id * vm->sbm.sb_size); 1748 nr_pages = PFN_DOWN(count * vm->sbm.sb_size); 1749 virtio_mem_fake_online(pfn, nr_pages); 1750 } 1751 1752 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1753 virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); 1754 1755 return 0; 1756 } 1757 1758 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1759 { 1760 const int mb_states[] = { 1761 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 1762 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 1763 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 1764 }; 1765 uint64_t nb_sb = diff / vm->sbm.sb_size; 1766 unsigned long mb_id; 1767 int rc, i; 1768 1769 if (!nb_sb) 1770 return 0; 1771 1772 /* Don't race with onlining/offlining */ 1773 mutex_lock(&vm->hotplug_mutex); 1774 1775 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 1776 virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { 1777 rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); 1778 if (rc || !nb_sb) 1779 goto out_unlock; 1780 cond_resched(); 1781 } 1782 } 1783 1784 /* 1785 * We won't be working on 
online/offline memory blocks from this point, 1786 * so we can't race with memory onlining/offlining. Drop the mutex. 1787 */ 1788 mutex_unlock(&vm->hotplug_mutex); 1789 1790 /* Try to plug and add unused blocks */ 1791 virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { 1792 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1793 return -ENOSPC; 1794 1795 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1796 if (rc || !nb_sb) 1797 return rc; 1798 cond_resched(); 1799 } 1800 1801 /* Try to prepare, plug and add new blocks */ 1802 while (nb_sb) { 1803 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1804 return -ENOSPC; 1805 1806 rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); 1807 if (rc) 1808 return rc; 1809 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1810 if (rc) 1811 return rc; 1812 cond_resched(); 1813 } 1814 1815 return 0; 1816 out_unlock: 1817 mutex_unlock(&vm->hotplug_mutex); 1818 return rc; 1819 } 1820 1821 /* 1822 * Plug a big block and add it to Linux. 1823 * 1824 * Will modify the state of the big block. 1825 */ 1826 static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, 1827 unsigned long bb_id) 1828 { 1829 int rc; 1830 1831 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 1832 VIRTIO_MEM_BBM_BB_UNUSED)) 1833 return -EINVAL; 1834 1835 rc = virtio_mem_bbm_plug_bb(vm, bb_id); 1836 if (rc) 1837 return rc; 1838 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 1839 1840 rc = virtio_mem_bbm_add_bb(vm, bb_id); 1841 if (rc) { 1842 if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) 1843 virtio_mem_bbm_set_bb_state(vm, bb_id, 1844 VIRTIO_MEM_BBM_BB_UNUSED); 1845 else 1846 /* Retry from the main loop. */ 1847 virtio_mem_bbm_set_bb_state(vm, bb_id, 1848 VIRTIO_MEM_BBM_BB_PLUGGED); 1849 return rc; 1850 } 1851 return 0; 1852 } 1853 1854 /* 1855 * Prepare tracking data for the next big block. 
1856 */ 1857 static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, 1858 unsigned long *bb_id) 1859 { 1860 int rc; 1861 1862 if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) 1863 return -ENOSPC; 1864 1865 /* Resize the big block state array if required. */ 1866 rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); 1867 if (rc) 1868 return rc; 1869 1870 vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; 1871 *bb_id = vm->bbm.next_bb_id; 1872 vm->bbm.next_bb_id++; 1873 return 0; 1874 } 1875 1876 static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1877 { 1878 uint64_t nb_bb = diff / vm->bbm.bb_size; 1879 unsigned long bb_id; 1880 int rc; 1881 1882 if (!nb_bb) 1883 return 0; 1884 1885 /* Try to plug and add unused big blocks */ 1886 virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { 1887 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1888 return -ENOSPC; 1889 1890 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1891 if (!rc) 1892 nb_bb--; 1893 if (rc || !nb_bb) 1894 return rc; 1895 cond_resched(); 1896 } 1897 1898 /* Try to prepare, plug and add new big blocks */ 1899 while (nb_bb) { 1900 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1901 return -ENOSPC; 1902 1903 rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); 1904 if (rc) 1905 return rc; 1906 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1907 if (!rc) 1908 nb_bb--; 1909 if (rc) 1910 return rc; 1911 cond_resched(); 1912 } 1913 1914 return 0; 1915 } 1916 1917 /* 1918 * Try to plug the requested amount of memory. 1919 */ 1920 static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) 1921 { 1922 if (vm->in_sbm) 1923 return virtio_mem_sbm_plug_request(vm, diff); 1924 return virtio_mem_bbm_plug_request(vm, diff); 1925 } 1926 1927 /* 1928 * Unplug the desired number of plugged subblocks of an offline memory block. 1929 * Will fail if any subblock cannot get unplugged (instead of skipping it). 
1930 * 1931 * Will modify the state of the memory block. Might temporarily drop the 1932 * hotplug_mutex. 1933 * 1934 * Note: Can fail after some subblocks were successfully unplugged. 1935 */ 1936 static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, 1937 unsigned long mb_id, 1938 uint64_t *nb_sb) 1939 { 1940 int rc; 1941 1942 rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); 1943 1944 /* some subblocks might have been unplugged even on failure */ 1945 if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1946 virtio_mem_sbm_set_mb_state(vm, mb_id, 1947 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1948 if (rc) 1949 return rc; 1950 1951 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1952 /* 1953 * Remove the block from Linux - this should never fail. 1954 * Hinder the block from getting onlined by marking it 1955 * unplugged. Temporarily drop the mutex, so 1956 * any pending GOING_ONLINE requests can be serviced/rejected. 1957 */ 1958 virtio_mem_sbm_set_mb_state(vm, mb_id, 1959 VIRTIO_MEM_SBM_MB_UNUSED); 1960 1961 mutex_unlock(&vm->hotplug_mutex); 1962 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 1963 BUG_ON(rc); 1964 mutex_lock(&vm->hotplug_mutex); 1965 } 1966 return 0; 1967 } 1968 1969 /* 1970 * Unplug the given plugged subblocks of an online memory block. 1971 * 1972 * Will modify the state of the memory block. 
1973 */ 1974 static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, 1975 unsigned long mb_id, int sb_id, 1976 int count) 1977 { 1978 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; 1979 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1980 unsigned long start_pfn; 1981 int rc; 1982 1983 start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1984 sb_id * vm->sbm.sb_size); 1985 1986 rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages); 1987 if (rc) 1988 return rc; 1989 1990 /* Try to unplug the allocated memory */ 1991 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1992 if (rc) { 1993 /* Return the memory to the buddy. */ 1994 virtio_mem_fake_online(start_pfn, nr_pages); 1995 return rc; 1996 } 1997 1998 switch (old_state) { 1999 case VIRTIO_MEM_SBM_MB_KERNEL: 2000 virtio_mem_sbm_set_mb_state(vm, mb_id, 2001 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); 2002 break; 2003 case VIRTIO_MEM_SBM_MB_MOVABLE: 2004 virtio_mem_sbm_set_mb_state(vm, mb_id, 2005 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); 2006 break; 2007 } 2008 2009 return 0; 2010 } 2011 2012 /* 2013 * Unplug the desired number of plugged subblocks of an online memory block. 2014 * Will skip subblock that are busy. 2015 * 2016 * Will modify the state of the memory block. Might temporarily drop the 2017 * hotplug_mutex. 2018 * 2019 * Note: Can fail after some subblocks were successfully unplugged. Can 2020 * return 0 even if subblocks were busy and could not get unplugged. 2021 */ 2022 static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, 2023 unsigned long mb_id, 2024 uint64_t *nb_sb) 2025 { 2026 int rc, sb_id; 2027 2028 /* If possible, try to unplug the complete block in one shot. 
*/ 2029 if (*nb_sb >= vm->sbm.sbs_per_mb && 2030 virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 2031 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, 2032 vm->sbm.sbs_per_mb); 2033 if (!rc) { 2034 *nb_sb -= vm->sbm.sbs_per_mb; 2035 goto unplugged; 2036 } else if (rc != -EBUSY) 2037 return rc; 2038 } 2039 2040 /* Fallback to single subblocks. */ 2041 for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { 2042 /* Find the next candidate subblock */ 2043 while (sb_id >= 0 && 2044 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 2045 sb_id--; 2046 if (sb_id < 0) 2047 break; 2048 2049 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); 2050 if (rc == -EBUSY) 2051 continue; 2052 else if (rc) 2053 return rc; 2054 *nb_sb -= 1; 2055 } 2056 2057 unplugged: 2058 rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id); 2059 if (rc) 2060 vm->sbm.have_unplugged_mb = 1; 2061 /* Ignore errors, this is not critical. We'll retry later. */ 2062 return 0; 2063 } 2064 2065 /* 2066 * Unplug the desired number of plugged subblocks of a memory block that is 2067 * already added to Linux. Will skip subblock of online memory blocks that are 2068 * busy (by the OS). Will fail if any subblock that's not busy cannot get 2069 * unplugged. 2070 * 2071 * Will modify the state of the memory block. Might temporarily drop the 2072 * hotplug_mutex. 2073 * 2074 * Note: Can fail after some subblocks were successfully unplugged. Can 2075 * return 0 even if subblocks were busy and could not get unplugged. 
2076 */ 2077 static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, 2078 unsigned long mb_id, 2079 uint64_t *nb_sb) 2080 { 2081 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 2082 2083 switch (old_state) { 2084 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 2085 case VIRTIO_MEM_SBM_MB_KERNEL: 2086 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 2087 case VIRTIO_MEM_SBM_MB_MOVABLE: 2088 return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb); 2089 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 2090 case VIRTIO_MEM_SBM_MB_OFFLINE: 2091 return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb); 2092 } 2093 return -EINVAL; 2094 } 2095 2096 static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 2097 { 2098 const int mb_states[] = { 2099 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 2100 VIRTIO_MEM_SBM_MB_OFFLINE, 2101 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 2102 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 2103 VIRTIO_MEM_SBM_MB_MOVABLE, 2104 VIRTIO_MEM_SBM_MB_KERNEL, 2105 }; 2106 uint64_t nb_sb = diff / vm->sbm.sb_size; 2107 unsigned long mb_id; 2108 int rc, i; 2109 2110 if (!nb_sb) 2111 return 0; 2112 2113 /* 2114 * We'll drop the mutex a couple of times when it is safe to do so. 2115 * This might result in some blocks switching the state (online/offline) 2116 * and we could miss them in this run - we will retry again later. 2117 */ 2118 mutex_lock(&vm->hotplug_mutex); 2119 2120 /* 2121 * We try unplug from partially plugged blocks first, to try removing 2122 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE 2123 * as it's more reliable to unplug memory and remove whole memory 2124 * blocks, and we don't want to trigger a zone imbalances by 2125 * accidentially removing too much kernel memory. 
2126 */ 2127 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 2128 virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) { 2129 rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); 2130 if (rc || !nb_sb) 2131 goto out_unlock; 2132 mutex_unlock(&vm->hotplug_mutex); 2133 cond_resched(); 2134 mutex_lock(&vm->hotplug_mutex); 2135 } 2136 if (!unplug_online && i == 1) { 2137 mutex_unlock(&vm->hotplug_mutex); 2138 return 0; 2139 } 2140 } 2141 2142 mutex_unlock(&vm->hotplug_mutex); 2143 return nb_sb ? -EBUSY : 0; 2144 out_unlock: 2145 mutex_unlock(&vm->hotplug_mutex); 2146 return rc; 2147 } 2148 2149 /* 2150 * Try to offline and remove a big block from Linux and unplug it. Will fail 2151 * with -EBUSY if some memory is busy and cannot get unplugged. 2152 * 2153 * Will modify the state of the memory block. Might temporarily drop the 2154 * hotplug_mutex. 2155 */ 2156 static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, 2157 unsigned long bb_id) 2158 { 2159 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2160 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2161 unsigned long end_pfn = start_pfn + nr_pages; 2162 unsigned long pfn; 2163 struct page *page; 2164 int rc; 2165 2166 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 2167 VIRTIO_MEM_BBM_BB_ADDED)) 2168 return -EINVAL; 2169 2170 /* 2171 * Start by fake-offlining all memory. Once we marked the device 2172 * block as fake-offline, all newly onlined memory will 2173 * automatically be kept fake-offline. Protect from concurrent 2174 * onlining/offlining until we have a consistent state. 
2175 */ 2176 mutex_lock(&vm->hotplug_mutex); 2177 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); 2178 2179 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2180 page = pfn_to_online_page(pfn); 2181 if (!page) 2182 continue; 2183 2184 rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION); 2185 if (rc) { 2186 end_pfn = pfn; 2187 goto rollback; 2188 } 2189 } 2190 mutex_unlock(&vm->hotplug_mutex); 2191 2192 rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); 2193 if (rc) { 2194 mutex_lock(&vm->hotplug_mutex); 2195 goto rollback; 2196 } 2197 2198 rc = virtio_mem_bbm_unplug_bb(vm, bb_id); 2199 if (rc) 2200 virtio_mem_bbm_set_bb_state(vm, bb_id, 2201 VIRTIO_MEM_BBM_BB_PLUGGED); 2202 else 2203 virtio_mem_bbm_set_bb_state(vm, bb_id, 2204 VIRTIO_MEM_BBM_BB_UNUSED); 2205 return rc; 2206 2207 rollback: 2208 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2209 page = pfn_to_online_page(pfn); 2210 if (!page) 2211 continue; 2212 virtio_mem_fake_online(pfn, PAGES_PER_SECTION); 2213 } 2214 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 2215 mutex_unlock(&vm->hotplug_mutex); 2216 return rc; 2217 } 2218 2219 /* 2220 * Test if a big block is completely offline. 2221 */ 2222 static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, 2223 unsigned long bb_id) 2224 { 2225 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2226 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2227 unsigned long pfn; 2228 2229 for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2230 pfn += PAGES_PER_SECTION) { 2231 if (pfn_to_online_page(pfn)) 2232 return false; 2233 } 2234 2235 return true; 2236 } 2237 2238 /* 2239 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline). 
2240 */ 2241 static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, 2242 unsigned long bb_id) 2243 { 2244 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2245 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2246 struct page *page; 2247 unsigned long pfn; 2248 2249 for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2250 pfn += PAGES_PER_SECTION) { 2251 page = pfn_to_online_page(pfn); 2252 if (!page) 2253 continue; 2254 if (page_zonenum(page) != ZONE_MOVABLE) 2255 return false; 2256 } 2257 2258 return true; 2259 } 2260 2261 static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 2262 { 2263 uint64_t nb_bb = diff / vm->bbm.bb_size; 2264 uint64_t bb_id; 2265 int rc, i; 2266 2267 if (!nb_bb) 2268 return 0; 2269 2270 /* 2271 * Try to unplug big blocks. Similar to SBM, start with offline 2272 * big blocks. 2273 */ 2274 for (i = 0; i < 3; i++) { 2275 virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 2276 cond_resched(); 2277 2278 /* 2279 * As we're holding no locks, these checks are racy, 2280 * but we don't care. 2281 */ 2282 if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) 2283 continue; 2284 if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) 2285 continue; 2286 rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); 2287 if (rc == -EBUSY) 2288 continue; 2289 if (!rc) 2290 nb_bb--; 2291 if (rc || !nb_bb) 2292 return rc; 2293 } 2294 if (i == 0 && !unplug_online) 2295 return 0; 2296 } 2297 2298 return nb_bb ? -EBUSY : 0; 2299 } 2300 2301 /* 2302 * Try to unplug the requested amount of memory. 2303 */ 2304 static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) 2305 { 2306 if (vm->in_sbm) 2307 return virtio_mem_sbm_unplug_request(vm, diff); 2308 return virtio_mem_bbm_unplug_request(vm, diff); 2309 } 2310 2311 /* 2312 * Try to unplug all blocks that couldn't be unplugged before, for example, 2313 * because the hypervisor was busy. 
Further, offline and remove any memory 2314 * blocks where we previously failed. 2315 */ 2316 static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm) 2317 { 2318 unsigned long id; 2319 int rc = 0; 2320 2321 if (!vm->in_sbm) { 2322 virtio_mem_bbm_for_each_bb(vm, id, 2323 VIRTIO_MEM_BBM_BB_PLUGGED) { 2324 rc = virtio_mem_bbm_unplug_bb(vm, id); 2325 if (rc) 2326 return rc; 2327 virtio_mem_bbm_set_bb_state(vm, id, 2328 VIRTIO_MEM_BBM_BB_UNUSED); 2329 } 2330 return 0; 2331 } 2332 2333 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) { 2334 rc = virtio_mem_sbm_unplug_mb(vm, id); 2335 if (rc) 2336 return rc; 2337 virtio_mem_sbm_set_mb_state(vm, id, 2338 VIRTIO_MEM_SBM_MB_UNUSED); 2339 } 2340 2341 if (!vm->sbm.have_unplugged_mb) 2342 return 0; 2343 2344 /* 2345 * Let's retry (offlining and) removing completely unplugged Linux 2346 * memory blocks. 2347 */ 2348 vm->sbm.have_unplugged_mb = false; 2349 2350 mutex_lock(&vm->hotplug_mutex); 2351 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL) 2352 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); 2353 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL) 2354 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); 2355 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 2356 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); 2357 mutex_unlock(&vm->hotplug_mutex); 2358 2359 if (rc) 2360 vm->sbm.have_unplugged_mb = true; 2361 /* Ignore errors, this is not critical. We'll retry later. */ 2362 return 0; 2363 } 2364 2365 /* 2366 * Update all parts of the config that could have changed. 
2367 */ 2368 static void virtio_mem_refresh_config(struct virtio_mem *vm) 2369 { 2370 const struct range pluggable_range = mhp_get_pluggable_range(true); 2371 uint64_t new_plugged_size, usable_region_size, end_addr; 2372 2373 /* the plugged_size is just a reflection of what _we_ did previously */ 2374 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, 2375 &new_plugged_size); 2376 if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size)) 2377 vm->plugged_size = new_plugged_size; 2378 2379 /* calculate the last usable memory block id */ 2380 virtio_cread_le(vm->vdev, struct virtio_mem_config, 2381 usable_region_size, &usable_region_size); 2382 end_addr = min(vm->addr + usable_region_size - 1, 2383 pluggable_range.end); 2384 2385 if (vm->in_sbm) { 2386 vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr); 2387 if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes())) 2388 vm->sbm.last_usable_mb_id--; 2389 } else { 2390 vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm, 2391 end_addr); 2392 if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size)) 2393 vm->bbm.last_usable_bb_id--; 2394 } 2395 /* 2396 * If we cannot plug any of our device memory (e.g., nothing in the 2397 * usable region is addressable), the last usable memory block id will 2398 * be smaller than the first usable memory block id. We'll stop 2399 * attempting to add memory with -ENOSPC from our main loop. 2400 */ 2401 2402 /* see if there is a request to change the size */ 2403 virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, 2404 &vm->requested_size); 2405 2406 dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size); 2407 dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size); 2408 } 2409 2410 /* 2411 * Workqueue function for handling plug/unplug requests and config updates. 
2412 */ 2413 static void virtio_mem_run_wq(struct work_struct *work) 2414 { 2415 struct virtio_mem *vm = container_of(work, struct virtio_mem, wq); 2416 uint64_t diff; 2417 int rc; 2418 2419 if (unlikely(vm->in_kdump)) { 2420 dev_warn_once(&vm->vdev->dev, 2421 "unexpected workqueue run in kdump kernel\n"); 2422 return; 2423 } 2424 2425 hrtimer_cancel(&vm->retry_timer); 2426 2427 if (vm->broken) 2428 return; 2429 2430 atomic_set(&vm->wq_active, 1); 2431 retry: 2432 rc = 0; 2433 2434 /* Make sure we start with a clean state if there are leftovers. */ 2435 if (unlikely(vm->unplug_all_required)) 2436 rc = virtio_mem_send_unplug_all_request(vm); 2437 2438 if (atomic_read(&vm->config_changed)) { 2439 atomic_set(&vm->config_changed, 0); 2440 virtio_mem_refresh_config(vm); 2441 } 2442 2443 /* Cleanup any leftovers from previous runs */ 2444 if (!rc) 2445 rc = virtio_mem_cleanup_pending_mb(vm); 2446 2447 if (!rc && vm->requested_size != vm->plugged_size) { 2448 if (vm->requested_size > vm->plugged_size) { 2449 diff = vm->requested_size - vm->plugged_size; 2450 rc = virtio_mem_plug_request(vm, diff); 2451 } else { 2452 diff = vm->plugged_size - vm->requested_size; 2453 rc = virtio_mem_unplug_request(vm, diff); 2454 } 2455 } 2456 2457 /* 2458 * Keep retrying to offline and remove completely unplugged Linux 2459 * memory blocks. 2460 */ 2461 if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb) 2462 rc = -EBUSY; 2463 2464 switch (rc) { 2465 case 0: 2466 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; 2467 break; 2468 case -ENOSPC: 2469 /* 2470 * We cannot add any more memory (alignment, physical limit) 2471 * or we have too many offline memory blocks. 2472 */ 2473 break; 2474 case -ETXTBSY: 2475 /* 2476 * The hypervisor cannot process our request right now 2477 * (e.g., out of memory, migrating); 2478 */ 2479 case -EBUSY: 2480 /* 2481 * We cannot free up any memory to unplug it (all plugged memory 2482 * is busy). 
2483 */ 2484 case -ENOMEM: 2485 /* Out of memory, try again later. */ 2486 hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms), 2487 HRTIMER_MODE_REL); 2488 break; 2489 case -EAGAIN: 2490 /* Retry immediately (e.g., the config changed). */ 2491 goto retry; 2492 default: 2493 /* Unknown error, mark as broken */ 2494 dev_err(&vm->vdev->dev, 2495 "unknown error, marking device broken: %d\n", rc); 2496 vm->broken = true; 2497 } 2498 2499 atomic_set(&vm->wq_active, 0); 2500 } 2501 2502 static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer) 2503 { 2504 struct virtio_mem *vm = container_of(timer, struct virtio_mem, 2505 retry_timer); 2506 2507 virtio_mem_retry(vm); 2508 vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2, 2509 VIRTIO_MEM_RETRY_TIMER_MAX_MS); 2510 return HRTIMER_NORESTART; 2511 } 2512 2513 static void virtio_mem_handle_response(struct virtqueue *vq) 2514 { 2515 struct virtio_mem *vm = vq->vdev->priv; 2516 2517 wake_up(&vm->host_resp); 2518 } 2519 2520 static int virtio_mem_init_vq(struct virtio_mem *vm) 2521 { 2522 struct virtqueue *vq; 2523 2524 vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response, 2525 "guest-request"); 2526 if (IS_ERR(vq)) 2527 return PTR_ERR(vq); 2528 vm->vq = vq; 2529 2530 return 0; 2531 } 2532 2533 static int virtio_mem_init_hotplug(struct virtio_mem *vm) 2534 { 2535 const struct range pluggable_range = mhp_get_pluggable_range(true); 2536 uint64_t unit_pages, sb_size, addr; 2537 int rc; 2538 2539 /* bad device setup - warn only */ 2540 if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) 2541 dev_warn(&vm->vdev->dev, 2542 "The alignment of the physical start address can make some memory unusable.\n"); 2543 if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes())) 2544 dev_warn(&vm->vdev->dev, 2545 "The alignment of the physical end address can make some memory unusable.\n"); 2546 if (vm->addr < pluggable_range.start || 2547 vm->addr + vm->region_size - 1 > 
pluggable_range.end) 2548 dev_warn(&vm->vdev->dev, 2549 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n"); 2550 2551 /* Prepare the offline threshold - make sure we can add two blocks. */ 2552 vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), 2553 VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); 2554 2555 /* 2556 * alloc_contig_range() works reliably with pageblock 2557 * granularity on ZONE_NORMAL, use pageblock_nr_pages. 2558 */ 2559 sb_size = PAGE_SIZE * pageblock_nr_pages; 2560 sb_size = max_t(uint64_t, vm->device_block_size, sb_size); 2561 2562 if (sb_size < memory_block_size_bytes() && !force_bbm) { 2563 /* SBM: At least two subblocks per Linux memory block. */ 2564 vm->in_sbm = true; 2565 vm->sbm.sb_size = sb_size; 2566 vm->sbm.sbs_per_mb = memory_block_size_bytes() / 2567 vm->sbm.sb_size; 2568 2569 /* Round up to the next full memory block */ 2570 addr = max_t(uint64_t, vm->addr, pluggable_range.start) + 2571 memory_block_size_bytes() - 1; 2572 vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); 2573 vm->sbm.next_mb_id = vm->sbm.first_mb_id; 2574 } else { 2575 /* BBM: At least one Linux memory block. */ 2576 vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size, 2577 memory_block_size_bytes()); 2578 2579 if (bbm_block_size) { 2580 if (!is_power_of_2(bbm_block_size)) { 2581 dev_warn(&vm->vdev->dev, 2582 "bbm_block_size is not a power of 2"); 2583 } else if (bbm_block_size < vm->bbm.bb_size) { 2584 dev_warn(&vm->vdev->dev, 2585 "bbm_block_size is too small"); 2586 } else { 2587 vm->bbm.bb_size = bbm_block_size; 2588 } 2589 } 2590 2591 /* Round up to the next aligned big block */ 2592 addr = max_t(uint64_t, vm->addr, pluggable_range.start) + 2593 vm->bbm.bb_size - 1; 2594 vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); 2595 vm->bbm.next_bb_id = vm->bbm.first_bb_id; 2596 2597 /* Make sure we can add two big blocks. 
*/ 2598 vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, 2599 vm->offline_threshold); 2600 } 2601 2602 dev_info(&vm->vdev->dev, "memory block size: 0x%lx", 2603 memory_block_size_bytes()); 2604 if (vm->in_sbm) 2605 dev_info(&vm->vdev->dev, "subblock size: 0x%llx", 2606 (unsigned long long)vm->sbm.sb_size); 2607 else 2608 dev_info(&vm->vdev->dev, "big block size: 0x%llx", 2609 (unsigned long long)vm->bbm.bb_size); 2610 2611 /* create the parent resource for all memory */ 2612 rc = virtio_mem_create_resource(vm); 2613 if (rc) 2614 return rc; 2615 2616 /* use a single dynamic memory group to cover the whole memory device */ 2617 if (vm->in_sbm) 2618 unit_pages = PHYS_PFN(memory_block_size_bytes()); 2619 else 2620 unit_pages = PHYS_PFN(vm->bbm.bb_size); 2621 rc = memory_group_register_dynamic(vm->nid, unit_pages); 2622 if (rc < 0) 2623 goto out_del_resource; 2624 vm->mgid = rc; 2625 2626 /* 2627 * If we still have memory plugged, we have to unplug all memory first. 2628 * Registering our parent resource makes sure that this memory isn't 2629 * actually in use (e.g., trying to reload the driver). 2630 */ 2631 if (vm->plugged_size) { 2632 vm->unplug_all_required = true; 2633 dev_info(&vm->vdev->dev, "unplugging all memory is required\n"); 2634 } 2635 2636 /* register callbacks */ 2637 vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb; 2638 rc = register_memory_notifier(&vm->memory_notifier); 2639 if (rc) 2640 goto out_unreg_group; 2641 /* Block hibernation as early as possible. 
*/ 2642 vm->pm_notifier.priority = INT_MAX; 2643 vm->pm_notifier.notifier_call = virtio_mem_pm_notifier_cb; 2644 rc = register_pm_notifier(&vm->pm_notifier); 2645 if (rc) 2646 goto out_unreg_mem; 2647 rc = register_virtio_mem_device(vm); 2648 if (rc) 2649 goto out_unreg_pm; 2650 2651 return 0; 2652 out_unreg_pm: 2653 unregister_pm_notifier(&vm->pm_notifier); 2654 out_unreg_mem: 2655 unregister_memory_notifier(&vm->memory_notifier); 2656 out_unreg_group: 2657 memory_group_unregister(vm->mgid); 2658 out_del_resource: 2659 virtio_mem_delete_resource(vm); 2660 return rc; 2661 } 2662 2663 #ifdef CONFIG_PROC_VMCORE 2664 static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr, 2665 uint64_t size) 2666 { 2667 const uint64_t nb_vm_blocks = size / vm->device_block_size; 2668 const struct virtio_mem_req req = { 2669 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE), 2670 .u.state.addr = cpu_to_virtio64(vm->vdev, addr), 2671 .u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 2672 }; 2673 int rc = -ENOMEM; 2674 2675 dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr, 2676 addr + size - 1); 2677 2678 switch (virtio_mem_send_request(vm, &req)) { 2679 case VIRTIO_MEM_RESP_ACK: 2680 return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state); 2681 case VIRTIO_MEM_RESP_ERROR: 2682 rc = -EINVAL; 2683 break; 2684 default: 2685 break; 2686 } 2687 2688 dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc); 2689 return rc; 2690 } 2691 2692 static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb, 2693 unsigned long pfn) 2694 { 2695 struct virtio_mem *vm = container_of(cb, struct virtio_mem, 2696 vmcore_cb); 2697 uint64_t addr = PFN_PHYS(pfn); 2698 bool is_ram; 2699 int rc; 2700 2701 if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE)) 2702 return true; 2703 if (!vm->plugged_size) 2704 return false; 2705 2706 /* 2707 * We have to serialize device requests and access to the information 2708 * about the block queried last. 
2709 */ 2710 mutex_lock(&vm->hotplug_mutex); 2711 2712 addr = ALIGN_DOWN(addr, vm->device_block_size); 2713 if (addr != vm->last_block_addr) { 2714 rc = virtio_mem_send_state_request(vm, addr, 2715 vm->device_block_size); 2716 /* On any kind of error, we're going to signal !ram. */ 2717 if (rc == VIRTIO_MEM_STATE_PLUGGED) 2718 vm->last_block_plugged = true; 2719 else 2720 vm->last_block_plugged = false; 2721 vm->last_block_addr = addr; 2722 } 2723 2724 is_ram = vm->last_block_plugged; 2725 mutex_unlock(&vm->hotplug_mutex); 2726 return is_ram; 2727 } 2728 #endif /* CONFIG_PROC_VMCORE */ 2729 2730 static int virtio_mem_init_kdump(struct virtio_mem *vm) 2731 { 2732 #ifdef CONFIG_PROC_VMCORE 2733 dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n"); 2734 vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram; 2735 register_vmcore_cb(&vm->vmcore_cb); 2736 return 0; 2737 #else /* CONFIG_PROC_VMCORE */ 2738 dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n"); 2739 return -EBUSY; 2740 #endif /* CONFIG_PROC_VMCORE */ 2741 } 2742 2743 static int virtio_mem_init(struct virtio_mem *vm) 2744 { 2745 uint16_t node_id; 2746 2747 if (!vm->vdev->config->get) { 2748 dev_err(&vm->vdev->dev, "config access disabled\n"); 2749 return -EINVAL; 2750 } 2751 2752 /* Fetch all properties that can't change. */ 2753 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, 2754 &vm->plugged_size); 2755 virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size, 2756 &vm->device_block_size); 2757 virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id, 2758 &node_id); 2759 vm->nid = virtio_mem_translate_node_id(vm, node_id); 2760 virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr); 2761 virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size, 2762 &vm->region_size); 2763 2764 /* Determine the nid for the device based on the lowest address. 
*/ 2765 if (vm->nid == NUMA_NO_NODE) 2766 vm->nid = memory_add_physaddr_to_nid(vm->addr); 2767 2768 dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); 2769 dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); 2770 dev_info(&vm->vdev->dev, "device block size: 0x%llx", 2771 (unsigned long long)vm->device_block_size); 2772 if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) 2773 dev_info(&vm->vdev->dev, "nid: %d", vm->nid); 2774 2775 /* 2776 * We don't want to (un)plug or reuse any memory when in kdump. The 2777 * memory is still accessible (but not exposed to Linux). 2778 */ 2779 if (vm->in_kdump) 2780 return virtio_mem_init_kdump(vm); 2781 return virtio_mem_init_hotplug(vm); 2782 } 2783 2784 static int virtio_mem_create_resource(struct virtio_mem *vm) 2785 { 2786 /* 2787 * When force-unloading the driver and removing the device, we 2788 * could have a garbage pointer. Duplicate the string. 2789 */ 2790 const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL); 2791 2792 if (!name) 2793 return -ENOMEM; 2794 2795 /* Disallow mapping device memory via /dev/mem completely. */ 2796 vm->parent_resource = __request_mem_region(vm->addr, vm->region_size, 2797 name, IORESOURCE_SYSTEM_RAM | 2798 IORESOURCE_EXCLUSIVE); 2799 if (!vm->parent_resource) { 2800 kfree(name); 2801 dev_warn(&vm->vdev->dev, "could not reserve device region\n"); 2802 dev_info(&vm->vdev->dev, 2803 "reloading the driver is not supported\n"); 2804 return -EBUSY; 2805 } 2806 2807 /* The memory is not actually busy - make add_memory() work. 
*/ 2808 vm->parent_resource->flags &= ~IORESOURCE_BUSY; 2809 return 0; 2810 } 2811 2812 static void virtio_mem_delete_resource(struct virtio_mem *vm) 2813 { 2814 const char *name; 2815 2816 if (!vm->parent_resource) 2817 return; 2818 2819 name = vm->parent_resource->name; 2820 release_resource(vm->parent_resource); 2821 kfree(vm->parent_resource); 2822 kfree(name); 2823 vm->parent_resource = NULL; 2824 } 2825 2826 static int virtio_mem_range_has_system_ram(struct resource *res, void *arg) 2827 { 2828 return 1; 2829 } 2830 2831 static bool virtio_mem_has_memory_added(struct virtio_mem *vm) 2832 { 2833 const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 2834 2835 return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr, 2836 vm->addr + vm->region_size, NULL, 2837 virtio_mem_range_has_system_ram) == 1; 2838 } 2839 2840 static int virtio_mem_probe(struct virtio_device *vdev) 2841 { 2842 struct virtio_mem *vm; 2843 int rc; 2844 2845 BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24); 2846 BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10); 2847 2848 vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL); 2849 if (!vm) 2850 return -ENOMEM; 2851 2852 init_waitqueue_head(&vm->host_resp); 2853 vm->vdev = vdev; 2854 INIT_WORK(&vm->wq, virtio_mem_run_wq); 2855 mutex_init(&vm->hotplug_mutex); 2856 INIT_LIST_HEAD(&vm->next); 2857 spin_lock_init(&vm->removal_lock); 2858 hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2859 vm->retry_timer.function = virtio_mem_timer_expired; 2860 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; 2861 vm->in_kdump = is_kdump_kernel(); 2862 2863 /* register the virtqueue */ 2864 rc = virtio_mem_init_vq(vm); 2865 if (rc) 2866 goto out_free_vm; 2867 2868 /* initialize the device by querying the config */ 2869 rc = virtio_mem_init(vm); 2870 if (rc) 2871 goto out_del_vq; 2872 2873 virtio_device_ready(vdev); 2874 2875 /* trigger a config update to start processing the requested_size */ 2876 if (!vm->in_kdump) { 2877 
atomic_set(&vm->config_changed, 1); 2878 queue_work(system_freezable_wq, &vm->wq); 2879 } 2880 2881 return 0; 2882 out_del_vq: 2883 vdev->config->del_vqs(vdev); 2884 out_free_vm: 2885 kfree(vm); 2886 vdev->priv = NULL; 2887 2888 return rc; 2889 } 2890 2891 static void virtio_mem_deinit_hotplug(struct virtio_mem *vm) 2892 { 2893 unsigned long mb_id; 2894 int rc; 2895 2896 /* 2897 * Make sure the workqueue won't be triggered anymore and no memory 2898 * blocks can be onlined/offlined until we're finished here. 2899 */ 2900 mutex_lock(&vm->hotplug_mutex); 2901 spin_lock_irq(&vm->removal_lock); 2902 vm->removing = true; 2903 spin_unlock_irq(&vm->removal_lock); 2904 mutex_unlock(&vm->hotplug_mutex); 2905 2906 /* wait until the workqueue stopped */ 2907 cancel_work_sync(&vm->wq); 2908 hrtimer_cancel(&vm->retry_timer); 2909 2910 if (vm->in_sbm) { 2911 /* 2912 * After we unregistered our callbacks, user space can online 2913 * partially plugged offline blocks. Make sure to remove them. 2914 */ 2915 virtio_mem_sbm_for_each_mb(vm, mb_id, 2916 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { 2917 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 2918 BUG_ON(rc); 2919 virtio_mem_sbm_set_mb_state(vm, mb_id, 2920 VIRTIO_MEM_SBM_MB_UNUSED); 2921 } 2922 /* 2923 * After we unregistered our callbacks, user space can no longer 2924 * offline partially plugged online memory blocks. No need to 2925 * worry about them. 2926 */ 2927 } 2928 2929 /* unregister callbacks */ 2930 unregister_virtio_mem_device(vm); 2931 unregister_pm_notifier(&vm->pm_notifier); 2932 unregister_memory_notifier(&vm->memory_notifier); 2933 2934 /* 2935 * There is no way we could reliably remove all memory we have added to 2936 * the system. And there is no way to stop the driver/device from going 2937 * away. Warn at least. 
2938 */ 2939 if (virtio_mem_has_memory_added(vm)) { 2940 dev_warn(&vm->vdev->dev, 2941 "device still has system memory added\n"); 2942 } else { 2943 virtio_mem_delete_resource(vm); 2944 kfree_const(vm->resource_name); 2945 memory_group_unregister(vm->mgid); 2946 } 2947 2948 /* remove all tracking data - no locking needed */ 2949 if (vm->in_sbm) { 2950 vfree(vm->sbm.mb_states); 2951 vfree(vm->sbm.sb_states); 2952 } else { 2953 vfree(vm->bbm.bb_states); 2954 } 2955 } 2956 2957 static void virtio_mem_deinit_kdump(struct virtio_mem *vm) 2958 { 2959 #ifdef CONFIG_PROC_VMCORE 2960 unregister_vmcore_cb(&vm->vmcore_cb); 2961 #endif /* CONFIG_PROC_VMCORE */ 2962 } 2963 2964 static void virtio_mem_remove(struct virtio_device *vdev) 2965 { 2966 struct virtio_mem *vm = vdev->priv; 2967 2968 if (vm->in_kdump) 2969 virtio_mem_deinit_kdump(vm); 2970 else 2971 virtio_mem_deinit_hotplug(vm); 2972 2973 /* reset the device and cleanup the queues */ 2974 virtio_reset_device(vdev); 2975 vdev->config->del_vqs(vdev); 2976 2977 kfree(vm); 2978 vdev->priv = NULL; 2979 } 2980 2981 static void virtio_mem_config_changed(struct virtio_device *vdev) 2982 { 2983 struct virtio_mem *vm = vdev->priv; 2984 2985 if (unlikely(vm->in_kdump)) 2986 return; 2987 2988 atomic_set(&vm->config_changed, 1); 2989 virtio_mem_retry(vm); 2990 } 2991 2992 #ifdef CONFIG_PM_SLEEP 2993 static int virtio_mem_freeze(struct virtio_device *vdev) 2994 { 2995 struct virtio_mem *vm = vdev->priv; 2996 2997 /* 2998 * We block hibernation using the PM notifier completely. The workqueue 2999 * is already frozen by the PM core at this point, so we simply 3000 * reset the device and cleanup the queues. 
3001 */ 3002 if (pm_suspend_target_state != PM_SUSPEND_TO_IDLE && 3003 vm->plugged_size && 3004 !virtio_has_feature(vm->vdev, VIRTIO_MEM_F_PERSISTENT_SUSPEND)) { 3005 dev_err(&vm->vdev->dev, 3006 "suspending with plugged memory is not supported\n"); 3007 return -EPERM; 3008 } 3009 3010 virtio_reset_device(vdev); 3011 vdev->config->del_vqs(vdev); 3012 vm->vq = NULL; 3013 return 0; 3014 } 3015 3016 static int virtio_mem_restore(struct virtio_device *vdev) 3017 { 3018 struct virtio_mem *vm = vdev->priv; 3019 int ret; 3020 3021 ret = virtio_mem_init_vq(vm); 3022 if (ret) 3023 return ret; 3024 virtio_device_ready(vdev); 3025 3026 /* Let's check if anything changed. */ 3027 virtio_mem_config_changed(vdev); 3028 return 0; 3029 } 3030 #endif 3031 3032 static unsigned int virtio_mem_features[] = { 3033 #if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA) 3034 VIRTIO_MEM_F_ACPI_PXM, 3035 #endif 3036 VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE, 3037 VIRTIO_MEM_F_PERSISTENT_SUSPEND, 3038 }; 3039 3040 static const struct virtio_device_id virtio_mem_id_table[] = { 3041 { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID }, 3042 { 0 }, 3043 }; 3044 3045 static struct virtio_driver virtio_mem_driver = { 3046 .feature_table = virtio_mem_features, 3047 .feature_table_size = ARRAY_SIZE(virtio_mem_features), 3048 .driver.name = KBUILD_MODNAME, 3049 .id_table = virtio_mem_id_table, 3050 .probe = virtio_mem_probe, 3051 .remove = virtio_mem_remove, 3052 .config_changed = virtio_mem_config_changed, 3053 #ifdef CONFIG_PM_SLEEP 3054 .freeze = virtio_mem_freeze, 3055 .restore = virtio_mem_restore, 3056 #endif 3057 }; 3058 3059 module_virtio_driver(virtio_mem_driver); 3060 MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table); 3061 MODULE_AUTHOR("David Hildenbrand <david@redhat.com>"); 3062 MODULE_DESCRIPTION("Virtio-mem driver"); 3063 MODULE_LICENSE("GPL"); 3064