// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio-mem device driver.
 *
 * Copyright Red Hat, Inc. 2020
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */

#include <linux/virtio.h>
#include <linux/virtio_mem.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/memory.h>
#include <linux/hrtimer.h>
#include <linux/crash_dump.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/lockdep.h>
#include <linux/log2.h>

#include <acpi/acpi_numa.h>

static bool unplug_online = true;
module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");

static bool force_bbm;
module_param(force_bbm, bool, 0444);
MODULE_PARM_DESC(force_bbm,
		 "Force Big Block Mode. Default is 0 (auto-selection)");

static unsigned long bbm_block_size;
module_param(bbm_block_size, ulong, 0444);
MODULE_PARM_DESC(bbm_block_size,
		 "Big Block size in bytes. Default is 0 (auto-detection).");

static bool bbm_safe_unplug = true;
module_param(bbm_safe_unplug, bool, 0444);
MODULE_PARM_DESC(bbm_safe_unplug,
		 "Use a safe unplug mechanism in BBM, avoiding long/endless loops");

/*
 * virtio-mem currently supports the following modes of operation:
 *
 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
 *   size of a Sub Block (SB) is determined based on the device block size, the
 *   pageblock size, and the maximum allocation granularity of the buddy.
 *   Subblocks within a Linux memory block might either be plugged or unplugged.
 *   Memory is added to/removed from Linux MM in Linux memory block granularity.
 *
 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
 *   Memory is added to/removed from Linux MM in Big Block granularity.
 *
 * The mode is determined automatically based on the Linux memory block size
 * and the device block size.
 *
 * User space / core MM (auto onlining) is responsible for onlining added
 * Linux memory blocks - and for selecting a zone. Linux memory blocks are
 * always onlined separately, and all memory within a Linux memory block is
 * onlined to the same zone - virtio-mem relies on this behavior.
 */

/*
 * State of a Linux memory block in SBM.
 */
enum virtio_mem_sbm_mb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_SBM_MB_UNUSED = 0,
	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_SBM_MB_PLUGGED,
	/* Fully plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE,
	/* Partially plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
	/* Fully plugged, fully added to Linux, onlined to a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL,
	/* Partially plugged, fully added to Linux, onlined to a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
	/* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE,
	/* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
	VIRTIO_MEM_SBM_MB_COUNT
};

/*
 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
 */
enum virtio_mem_bbm_bb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_BBM_BB_UNUSED = 0,
	/* Plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_BBM_BB_PLUGGED,
	/* Plugged and added to Linux. */
	VIRTIO_MEM_BBM_BB_ADDED,
	/* All online parts are fake-offline, ready to remove. */
	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
	VIRTIO_MEM_BBM_BB_COUNT
};

struct virtio_mem {
	struct virtio_device *vdev;

	/* We might first have to unplug all memory when starting up. */
	bool unplug_all_required;

	/* Workqueue that processes the plug/unplug requests. */
	struct work_struct wq;
	atomic_t wq_active;
	atomic_t config_changed;

	/* Virtqueue for guest->host requests. */
	struct virtqueue *vq;

	/* Wait for a host response to a guest request. */
	wait_queue_head_t host_resp;

	/* Space for one guest request and the host response. */
	struct virtio_mem_req req;
	struct virtio_mem_resp resp;

	/* The current size of the device. */
	uint64_t plugged_size;
	/* The requested size of the device. */
	uint64_t requested_size;

	/* The device block size (for communicating with the device). */
	uint64_t device_block_size;
	/* The determined node id for all memory of the device. */
	int nid;
	/* Physical start address of the memory region. */
	uint64_t addr;
	/* Maximum region size in bytes. */
	uint64_t region_size;

	/* The parent resource for all memory added via this device. */
	struct resource *parent_resource;
	/*
	 * Copy of "System RAM (virtio_mem)" to be used for
	 * add_memory_driver_managed().
	 */
	const char *resource_name;
	/* Memory group identification. */
	int mgid;

	/*
	 * We don't want to add too much memory if it's not getting onlined,
	 * to avoid running OOM. Besides this threshold, we allow to have at
	 * least two offline blocks at a time (whatever is bigger).
	 */
#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
	atomic64_t offline_size;
	uint64_t offline_threshold;

	/* If set, the driver is in SBM, otherwise in BBM. */
	bool in_sbm;

	union {
		struct {
			/* Id of the first memory block of this device. */
			unsigned long first_mb_id;
			/* Id of the last usable memory block of this device. */
			unsigned long last_usable_mb_id;
			/* Id of the next memory block to prepare when needed. */
			unsigned long next_mb_id;

			/* The subblock size. */
			uint64_t sb_size;
			/* The number of subblocks per Linux memory block. */
			uint32_t sbs_per_mb;

			/* Summary of all memory block states. */
			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];

			/*
			 * One byte state per memory block. Allocated via
			 * vmalloc(). Resized (alloc+copy+free) on demand.
			 *
			 * With 128 MiB memory blocks, we have states for 512
			 * GiB of memory in one 4 KiB page.
			 */
			uint8_t *mb_states;

			/*
			 * Bitmap: one bit per subblock. Allocated similar to
			 * sbm.mb_states.
			 *
			 * A set bit means the corresponding subblock is
			 * plugged, otherwise it's unplugged.
			 *
			 * With 4 MiB subblocks, we manage 128 GiB of memory
			 * in one 4 KiB page.
			 */
			unsigned long *sb_states;
		} sbm;

		struct {
			/* Id of the first big block of this device. */
			unsigned long first_bb_id;
			/* Id of the last usable big block of this device. */
			unsigned long last_usable_bb_id;
			/* Id of the next device block to prepare when needed. */
			unsigned long next_bb_id;

			/* Summary of all big block states. */
			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];

			/* One byte state per big block. See sbm.mb_states. */
			uint8_t *bb_states;

			/* The block size used for plugging/adding/removing. */
			uint64_t bb_size;
		} bbm;
	};

	/*
	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states.
	 *
	 * When this lock is held the pointers can't change, ONLINE and
	 * OFFLINE blocks can't change the state and no subblocks will get
	 * plugged/unplugged.
	 *
	 * In kdump mode, used to serialize requests, last_block_addr and
	 * last_block_plugged.
	 */
	struct mutex hotplug_mutex;
	bool hotplug_active;

	/* An error occurred we cannot handle - stop processing requests. */
	bool broken;

	/* Cached value of is_kdump_kernel() when the device was probed. */
	bool in_kdump;

	/* The driver is being removed. */
	spinlock_t removal_lock;
	bool removing;

	/* Timer for retrying to plug/unplug memory. */
	struct hrtimer retry_timer;
	unsigned int retry_timer_ms;
#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000

	/* Memory notifier (online/offline events). */
	struct notifier_block memory_notifier;

#ifdef CONFIG_PROC_VMCORE
	/* vmcore callback for /proc/vmcore handling in kdump mode */
	struct vmcore_cb vmcore_cb;
	uint64_t last_block_addr;
	bool last_block_plugged;
#endif /* CONFIG_PROC_VMCORE */

	/* Next device in the list of virtio-mem devices. */
	struct list_head next;
};

/*
 * We have to share a single online_page callback among all virtio-mem
 * devices. We use RCU to iterate the list in the callback.
 */
static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);

static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages);
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages);
static void virtio_mem_retry(struct virtio_mem *vm);
static int virtio_mem_create_resource(struct virtio_mem *vm);
static void virtio_mem_delete_resource(struct virtio_mem *vm);

/*
 * Register a virtio-mem device so it will be considered for the online_page
 * callback.
 */
static int register_virtio_mem_device(struct virtio_mem *vm)
{
	int rc = 0;

	/* First device registers the callback. */
	mutex_lock(&virtio_mem_mutex);
	if (list_empty(&virtio_mem_devices))
		rc = set_online_page_callback(&virtio_mem_online_page_cb);
	if (!rc)
		list_add_rcu(&vm->next, &virtio_mem_devices);
	mutex_unlock(&virtio_mem_mutex);

	return rc;
}

/*
 * Unregister a virtio-mem device so it will no longer be considered for the
 * online_page callback.
 */
static void unregister_virtio_mem_device(struct virtio_mem *vm)
{
	/* Last device unregisters the callback. */
	mutex_lock(&virtio_mem_mutex);
	list_del_rcu(&vm->next);
	if (list_empty(&virtio_mem_devices))
		restore_online_page_callback(&virtio_mem_online_page_cb);
	mutex_unlock(&virtio_mem_mutex);

	synchronize_rcu();
}

/*
 * Calculate the memory block id of a given address.
317 */ 318 static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr) 319 { 320 return addr / memory_block_size_bytes(); 321 } 322 323 /* 324 * Calculate the physical start address of a given memory block id. 325 */ 326 static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) 327 { 328 return mb_id * memory_block_size_bytes(); 329 } 330 331 /* 332 * Calculate the big block id of a given address. 333 */ 334 static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, 335 uint64_t addr) 336 { 337 return addr / vm->bbm.bb_size; 338 } 339 340 /* 341 * Calculate the physical start address of a given big block id. 342 */ 343 static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, 344 unsigned long bb_id) 345 { 346 return bb_id * vm->bbm.bb_size; 347 } 348 349 /* 350 * Calculate the subblock id of a given address. 351 */ 352 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, 353 unsigned long addr) 354 { 355 const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); 356 const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); 357 358 return (addr - mb_addr) / vm->sbm.sb_size; 359 } 360 361 /* 362 * Set the state of a big block, taking care of the state counter. 363 */ 364 static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, 365 unsigned long bb_id, 366 enum virtio_mem_bbm_bb_state state) 367 { 368 const unsigned long idx = bb_id - vm->bbm.first_bb_id; 369 enum virtio_mem_bbm_bb_state old_state; 370 371 old_state = vm->bbm.bb_states[idx]; 372 vm->bbm.bb_states[idx] = state; 373 374 BUG_ON(vm->bbm.bb_count[old_state] == 0); 375 vm->bbm.bb_count[old_state]--; 376 vm->bbm.bb_count[state]++; 377 } 378 379 /* 380 * Get the state of a big block. 381 */ 382 static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, 383 unsigned long bb_id) 384 { 385 return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; 386 } 387 388 /* 389 * Prepare the big block state array for the next big block. 390 */ 391 static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) 392 { 393 unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; 394 unsigned long new_bytes = old_bytes + 1; 395 int old_pages = PFN_UP(old_bytes); 396 int new_pages = PFN_UP(new_bytes); 397 uint8_t *new_array; 398 399 if (vm->bbm.bb_states && old_pages == new_pages) 400 return 0; 401 402 new_array = vzalloc(new_pages * PAGE_SIZE); 403 if (!new_array) 404 return -ENOMEM; 405 406 mutex_lock(&vm->hotplug_mutex); 407 if (vm->bbm.bb_states) 408 memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); 409 vfree(vm->bbm.bb_states); 410 vm->bbm.bb_states = new_array; 411 mutex_unlock(&vm->hotplug_mutex); 412 413 return 0; 414 } 415 416 #define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ 417 for (_bb_id = vm->bbm.first_bb_id; \ 418 _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ 419 _bb_id++) \ 420 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 421 422 #define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ 423 for (_bb_id = vm->bbm.next_bb_id - 1; \ 424 _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ 425 _bb_id--) \ 426 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 427 428 /* 429 * Set the state of a memory block, taking care of the state counter. 
430 */ 431 static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, 432 unsigned long mb_id, uint8_t state) 433 { 434 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 435 uint8_t old_state; 436 437 old_state = vm->sbm.mb_states[idx]; 438 vm->sbm.mb_states[idx] = state; 439 440 BUG_ON(vm->sbm.mb_count[old_state] == 0); 441 vm->sbm.mb_count[old_state]--; 442 vm->sbm.mb_count[state]++; 443 } 444 445 /* 446 * Get the state of a memory block. 447 */ 448 static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, 449 unsigned long mb_id) 450 { 451 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 452 453 return vm->sbm.mb_states[idx]; 454 } 455 456 /* 457 * Prepare the state array for the next memory block. 458 */ 459 static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) 460 { 461 int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); 462 int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); 463 uint8_t *new_array; 464 465 if (vm->sbm.mb_states && old_pages == new_pages) 466 return 0; 467 468 new_array = vzalloc(new_pages * PAGE_SIZE); 469 if (!new_array) 470 return -ENOMEM; 471 472 mutex_lock(&vm->hotplug_mutex); 473 if (vm->sbm.mb_states) 474 memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); 475 vfree(vm->sbm.mb_states); 476 vm->sbm.mb_states = new_array; 477 mutex_unlock(&vm->hotplug_mutex); 478 479 return 0; 480 } 481 482 #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ 483 for (_mb_id = _vm->sbm.first_mb_id; \ 484 _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ 485 _mb_id++) \ 486 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 487 488 #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ 489 for (_mb_id = _vm->sbm.next_mb_id - 1; \ 490 _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ 491 _mb_id--) \ 492 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 493 494 /* 495 * Calculate the bit number in the subblock bitmap for the given subblock 496 * inside the given memory block. 497 */ 498 static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, 499 unsigned long mb_id, int sb_id) 500 { 501 return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; 502 } 503 504 /* 505 * Mark all selected subblocks plugged. 506 * 507 * Will not modify the state of the memory block. 508 */ 509 static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, 510 unsigned long mb_id, int sb_id, 511 int count) 512 { 513 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 514 515 __bitmap_set(vm->sbm.sb_states, bit, count); 516 } 517 518 /* 519 * Mark all selected subblocks unplugged. 520 * 521 * Will not modify the state of the memory block. 522 */ 523 static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, 524 unsigned long mb_id, int sb_id, 525 int count) 526 { 527 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 528 529 __bitmap_clear(vm->sbm.sb_states, bit, count); 530 } 531 532 /* 533 * Test if all selected subblocks are plugged. 534 */ 535 static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, 536 unsigned long mb_id, int sb_id, 537 int count) 538 { 539 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 540 541 if (count == 1) 542 return test_bit(bit, vm->sbm.sb_states); 543 544 /* TODO: Helper similar to bitmap_set() */ 545 return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= 546 bit + count; 547 } 548 549 /* 550 * Test if all selected subblocks are unplugged. 
551 */ 552 static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, 553 unsigned long mb_id, int sb_id, 554 int count) 555 { 556 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 557 558 /* TODO: Helper similar to bitmap_set() */ 559 return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= 560 bit + count; 561 } 562 563 /* 564 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is 565 * none. 566 */ 567 static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, 568 unsigned long mb_id) 569 { 570 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); 571 572 return find_next_zero_bit(vm->sbm.sb_states, 573 bit + vm->sbm.sbs_per_mb, bit) - bit; 574 } 575 576 /* 577 * Prepare the subblock bitmap for the next memory block. 578 */ 579 static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) 580 { 581 const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; 582 const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; 583 const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; 584 int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); 585 int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); 586 unsigned long *new_bitmap, *old_bitmap; 587 588 if (vm->sbm.sb_states && old_pages == new_pages) 589 return 0; 590 591 new_bitmap = vzalloc(new_pages * PAGE_SIZE); 592 if (!new_bitmap) 593 return -ENOMEM; 594 595 mutex_lock(&vm->hotplug_mutex); 596 if (vm->sbm.sb_states) 597 memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); 598 599 old_bitmap = vm->sbm.sb_states; 600 vm->sbm.sb_states = new_bitmap; 601 mutex_unlock(&vm->hotplug_mutex); 602 603 vfree(old_bitmap); 604 return 0; 605 } 606 607 /* 608 * Test if we could add memory without creating too much offline memory - 609 * to avoid running OOM if memory is getting onlined deferred. 610 */ 611 static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) 612 { 613 if (WARN_ON_ONCE(size > vm->offline_threshold)) 614 return false; 615 616 return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; 617 } 618 619 /* 620 * Try adding memory to Linux. Will usually only fail if out of memory. 621 * 622 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 623 * onlining code). 624 * 625 * Will not modify the state of memory blocks in virtio-mem. 626 */ 627 static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, 628 uint64_t size) 629 { 630 int rc; 631 632 /* 633 * When force-unloading the driver and we still have memory added to 634 * Linux, the resource name has to stay. 635 */ 636 if (!vm->resource_name) { 637 vm->resource_name = kstrdup_const("System RAM (virtio_mem)", 638 GFP_KERNEL); 639 if (!vm->resource_name) 640 return -ENOMEM; 641 } 642 643 dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, 644 addr + size - 1); 645 /* Memory might get onlined immediately. */ 646 atomic64_add(size, &vm->offline_size); 647 rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name, 648 MHP_MERGE_RESOURCE | MHP_NID_IS_MGID); 649 if (rc) { 650 atomic64_sub(size, &vm->offline_size); 651 dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); 652 /* 653 * TODO: Linux MM does not properly clean up yet in all cases 654 * where adding of memory failed - especially on -ENOMEM. 655 */ 656 } 657 return rc; 658 } 659 660 /* 661 * See virtio_mem_add_memory(): Try adding a single Linux memory block. 
 */
static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_add_memory(vm, addr, size);
}

/*
 * See virtio_mem_add_memory(): Try adding a big block.
 */
static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_add_memory(vm, addr, size);
}

/*
 * Try removing memory from Linux. Will only fail if memory blocks aren't
 * offline.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
				    uint64_t size)
{
	int rc;

	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);
	rc = remove_memory(addr, size);
	if (!rc) {
		atomic64_sub(size, &vm->offline_size);
		/*
		 * We might have freed up memory we can now unplug, retry
		 * immediately instead of waiting.
		 */
		virtio_mem_retry(vm);
	} else {
		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
	}
	return rc;
}

/*
 * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
 */
static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_remove_memory(vm, addr, size);
}

/*
 * Try offlining and removing memory from Linux.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
						uint64_t addr,
						uint64_t size)
{
	int rc;

	dev_dbg(&vm->vdev->dev,
		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	rc = offline_and_remove_memory(addr, size);
	if (!rc) {
		atomic64_sub(size, &vm->offline_size);
		/*
		 * We might have freed up memory we can now unplug, retry
		 * immediately instead of waiting.
		 */
		virtio_mem_retry(vm);
	} else {
		dev_dbg(&vm->vdev->dev,
			"offlining and removing memory failed: %d\n", rc);
	}
	return rc;
}

/*
 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
 * a single Linux memory block.
 */
static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
						unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_offline_and_remove_memory(vm, addr, size);
}

/*
 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
 * all Linux memory blocks covered by the big block.
772 */ 773 static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, 774 unsigned long bb_id) 775 { 776 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 777 const uint64_t size = vm->bbm.bb_size; 778 779 return virtio_mem_offline_and_remove_memory(vm, addr, size); 780 } 781 782 /* 783 * Trigger the workqueue so the device can perform its magic. 784 */ 785 static void virtio_mem_retry(struct virtio_mem *vm) 786 { 787 unsigned long flags; 788 789 spin_lock_irqsave(&vm->removal_lock, flags); 790 if (!vm->removing) 791 queue_work(system_freezable_wq, &vm->wq); 792 spin_unlock_irqrestore(&vm->removal_lock, flags); 793 } 794 795 static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) 796 { 797 int node = NUMA_NO_NODE; 798 799 #if defined(CONFIG_ACPI_NUMA) 800 if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM)) 801 node = pxm_to_node(node_id); 802 #endif 803 return node; 804 } 805 806 /* 807 * Test if a virtio-mem device overlaps with the given range. Can be called 808 * from (notifier) callbacks lockless. 809 */ 810 static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, 811 uint64_t size) 812 { 813 return start < vm->addr + vm->region_size && vm->addr < start + size; 814 } 815 816 /* 817 * Test if a virtio-mem device contains a given range. Can be called from 818 * (notifier) callbacks lockless. 819 */ 820 static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, 821 uint64_t size) 822 { 823 return start >= vm->addr && start + size <= vm->addr + vm->region_size; 824 } 825 826 static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, 827 unsigned long mb_id) 828 { 829 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 830 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 831 case VIRTIO_MEM_SBM_MB_OFFLINE: 832 return NOTIFY_OK; 833 default: 834 break; 835 } 836 dev_warn_ratelimited(&vm->vdev->dev, 837 "memory block onlining denied\n"); 838 return NOTIFY_BAD; 839 } 840 841 static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, 842 unsigned long mb_id) 843 { 844 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 845 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 846 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 847 virtio_mem_sbm_set_mb_state(vm, mb_id, 848 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 849 break; 850 case VIRTIO_MEM_SBM_MB_KERNEL: 851 case VIRTIO_MEM_SBM_MB_MOVABLE: 852 virtio_mem_sbm_set_mb_state(vm, mb_id, 853 VIRTIO_MEM_SBM_MB_OFFLINE); 854 break; 855 default: 856 BUG(); 857 break; 858 } 859 } 860 861 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, 862 unsigned long mb_id, 863 unsigned long start_pfn) 864 { 865 const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) == 866 ZONE_MOVABLE; 867 int new_state; 868 869 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 870 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 871 new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; 872 if (is_movable) 873 new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; 874 break; 875 case VIRTIO_MEM_SBM_MB_OFFLINE: 876 new_state = VIRTIO_MEM_SBM_MB_KERNEL; 877 if (is_movable) 878 new_state = VIRTIO_MEM_SBM_MB_MOVABLE; 879 break; 880 default: 881 BUG(); 882 break; 883 } 884 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 885 } 886 887 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, 888 unsigned long mb_id) 889 { 890 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 891 unsigned long pfn; 892 int sb_id; 893 894 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 895 if 
(virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 896 continue; 897 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 898 sb_id * vm->sbm.sb_size); 899 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 900 } 901 } 902 903 static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, 904 unsigned long mb_id) 905 { 906 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 907 unsigned long pfn; 908 int sb_id; 909 910 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 911 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 912 continue; 913 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 914 sb_id * vm->sbm.sb_size); 915 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 916 } 917 } 918 919 static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, 920 unsigned long bb_id, 921 unsigned long pfn, 922 unsigned long nr_pages) 923 { 924 /* 925 * When marked as "fake-offline", all online memory of this device block 926 * is allocated by us. Otherwise, we don't have any memory allocated. 927 */ 928 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 929 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 930 return; 931 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 932 } 933 934 static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, 935 unsigned long bb_id, 936 unsigned long pfn, 937 unsigned long nr_pages) 938 { 939 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 940 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 941 return; 942 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 943 } 944 945 /* 946 * This callback will either be called synchronously from add_memory() or 947 * asynchronously (e.g., triggered via user space). We have to be careful 948 * with locking when calling add_memory(). 949 */ 950 static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, 951 unsigned long action, void *arg) 952 { 953 struct virtio_mem *vm = container_of(nb, struct virtio_mem, 954 memory_notifier); 955 struct memory_notify *mhp = arg; 956 const unsigned long start = PFN_PHYS(mhp->start_pfn); 957 const unsigned long size = PFN_PHYS(mhp->nr_pages); 958 int rc = NOTIFY_OK; 959 unsigned long id; 960 961 if (!virtio_mem_overlaps_range(vm, start, size)) 962 return NOTIFY_DONE; 963 964 if (vm->in_sbm) { 965 id = virtio_mem_phys_to_mb_id(start); 966 /* 967 * In SBM, we add memory in separate memory blocks - we expect 968 * it to be onlined/offlined in the same granularity. Bail out 969 * if this ever changes. 970 */ 971 if (WARN_ON_ONCE(size != memory_block_size_bytes() || 972 !IS_ALIGNED(start, memory_block_size_bytes()))) 973 return NOTIFY_BAD; 974 } else { 975 id = virtio_mem_phys_to_bb_id(vm, start); 976 /* 977 * In BBM, we only care about onlining/offlining happening 978 * within a single big block, we don't care about the 979 * actual granularity as we don't track individual Linux 980 * memory blocks. 981 */ 982 if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) 983 return NOTIFY_BAD; 984 } 985 986 /* 987 * Avoid circular locking lockdep warnings. We lock the mutex 988 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The 989 * blocking_notifier_call_chain() has it's own lock, which gets unlocked 990 * between both notifier calls and will bail out. False positive. 
	 */
	lockdep_off();

	switch (action) {
	case MEM_GOING_OFFLINE:
		mutex_lock(&vm->hotplug_mutex);
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		vm->hotplug_active = true;
		if (vm->in_sbm)
			virtio_mem_sbm_notify_going_offline(vm, id);
		else
			virtio_mem_bbm_notify_going_offline(vm, id,
							    mhp->start_pfn,
							    mhp->nr_pages);
		break;
	case MEM_GOING_ONLINE:
		mutex_lock(&vm->hotplug_mutex);
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		vm->hotplug_active = true;
		if (vm->in_sbm)
			rc = virtio_mem_sbm_notify_going_online(vm, id);
		break;
	case MEM_OFFLINE:
		if (vm->in_sbm)
			virtio_mem_sbm_notify_offline(vm, id);

		atomic64_add(size, &vm->offline_size);
		/*
		 * Trigger the workqueue. Now that we have some offline memory,
		 * maybe we can handle pending unplug requests.
		 */
		if (!unplug_online)
			virtio_mem_retry(vm);

		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_ONLINE:
		if (vm->in_sbm)
			virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);

		atomic64_sub(size, &vm->offline_size);
		/*
		 * Start adding more memory once we onlined half of our
		 * threshold. Don't trigger if it's possibly due to our action
		 * (e.g., us adding memory which gets onlined immediately from
		 * the core).
		 */
		if (!atomic_read(&vm->wq_active) &&
		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
			virtio_mem_retry(vm);

		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_OFFLINE:
		if (!vm->hotplug_active)
			break;
		if (vm->in_sbm)
			virtio_mem_sbm_notify_cancel_offline(vm, id);
		else
			virtio_mem_bbm_notify_cancel_offline(vm, id,
							     mhp->start_pfn,
							     mhp->nr_pages);
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_ONLINE:
		if (!vm->hotplug_active)
			break;
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	default:
		break;
	}

	lockdep_on();

	return rc;
}

/*
 * Set a range of pages PG_offline. Remember pages that were never onlined
 * (via generic_online_page()) using PageDirty().
 */
static void virtio_mem_set_fake_offline(unsigned long pfn,
					unsigned long nr_pages, bool onlined)
{
	page_offline_begin();
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__SetPageOffline(page);
		if (!onlined) {
			SetPageDirty(page);
			/* FIXME: remove after cleanups */
			ClearPageReserved(page);
		}
	}
	page_offline_end();
}

/*
 * Clear PG_offline from a range of pages. If the pages were never onlined
 * (via generic_online_page()), clear PageDirty().
 */
static void virtio_mem_clear_fake_offline(unsigned long pfn,
					  unsigned long nr_pages, bool onlined)
{
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__ClearPageOffline(page);
		if (!onlined)
			ClearPageDirty(page);
	}
}

/*
 * Release a range of fake-offline pages to the buddy, effectively
 * fake-onlining them.
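 *
 * The caller either passes pages that were kept fake-offline when the
 * memory block got onlined (marked PageDirty()) or pages that were
 * fake-offlined via alloc_contig_range(); both cases are handled per
 * properly aligned chunk below.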
1121 */ 1122 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) 1123 { 1124 unsigned long order = MAX_ORDER - 1; 1125 unsigned long i; 1126 1127 /* 1128 * We might get called for ranges that don't cover properly aligned 1129 * MAX_ORDER - 1 pages; however, we can only online properly aligned 1130 * pages with an order of MAX_ORDER - 1 at maximum. 1131 */ 1132 while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) 1133 order--; 1134 1135 for (i = 0; i < nr_pages; i += 1 << order) { 1136 struct page *page = pfn_to_page(pfn + i); 1137 1138 /* 1139 * If the page is PageDirty(), it was kept fake-offline when 1140 * onlining the memory block. Otherwise, it was allocated 1141 * using alloc_contig_range(). All pages in a subblock are 1142 * alike. 1143 */ 1144 if (PageDirty(page)) { 1145 virtio_mem_clear_fake_offline(pfn + i, 1 << order, false); 1146 generic_online_page(page, order); 1147 } else { 1148 virtio_mem_clear_fake_offline(pfn + i, 1 << order, true); 1149 free_contig_range(pfn + i, 1 << order); 1150 adjust_managed_page_count(page, 1 << order); 1151 } 1152 } 1153 } 1154 1155 /* 1156 * Try to allocate a range, marking pages fake-offline, effectively 1157 * fake-offlining them. 1158 */ 1159 static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) 1160 { 1161 const bool is_movable = page_zonenum(pfn_to_page(pfn)) == 1162 ZONE_MOVABLE; 1163 int rc, retry_count; 1164 1165 /* 1166 * TODO: We want an alloc_contig_range() mode that tries to allocate 1167 * harder (e.g., dealing with temporarily pinned pages, PCP), especially 1168 * with ZONE_MOVABLE. So for now, retry a couple of times with 1169 * ZONE_MOVABLE before giving up - because that zone is supposed to give 1170 * some guarantees. 1171 */ 1172 for (retry_count = 0; retry_count < 5; retry_count++) { 1173 rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, 1174 GFP_KERNEL); 1175 if (rc == -ENOMEM) 1176 /* whoops, out of memory */ 1177 return rc; 1178 else if (rc && !is_movable) 1179 break; 1180 else if (rc) 1181 continue; 1182 1183 virtio_mem_set_fake_offline(pfn, nr_pages, true); 1184 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1185 return 0; 1186 } 1187 1188 return -EBUSY; 1189 } 1190 1191 /* 1192 * Handle fake-offline pages when memory is going offline - such that the 1193 * pages can be skipped by mm-core when offlining. 1194 */ 1195 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 1196 unsigned long nr_pages) 1197 { 1198 struct page *page; 1199 unsigned long i; 1200 1201 /* 1202 * Drop our reference to the pages so the memory can get offlined 1203 * and add the unplugged pages to the managed page counters (so 1204 * offlining code can correctly subtract them again). 1205 */ 1206 adjust_managed_page_count(pfn_to_page(pfn), nr_pages); 1207 /* Drop our reference to the pages so the memory can get offlined. */ 1208 for (i = 0; i < nr_pages; i++) { 1209 page = pfn_to_page(pfn + i); 1210 if (WARN_ON(!page_ref_dec_and_test(page))) 1211 dump_page(page, "fake-offline page referenced"); 1212 } 1213 } 1214 1215 /* 1216 * Handle fake-offline pages when memory offlining is canceled - to undo 1217 * what we did in virtio_mem_fake_offline_going_offline(). 1218 */ 1219 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 1220 unsigned long nr_pages) 1221 { 1222 unsigned long i; 1223 1224 /* 1225 * Get the reference we dropped when going offline and subtract the 1226 * unplugged pages from the managed page counters. 
1227 */ 1228 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1229 for (i = 0; i < nr_pages; i++) 1230 page_ref_inc(pfn_to_page(pfn + i)); 1231 } 1232 1233 static void virtio_mem_online_page(struct virtio_mem *vm, 1234 struct page *page, unsigned int order) 1235 { 1236 const unsigned long start = page_to_phys(page); 1237 const unsigned long end = start + PFN_PHYS(1 << order); 1238 unsigned long addr, next, id, sb_id, count; 1239 bool do_online; 1240 1241 /* 1242 * We can get called with any order up to MAX_ORDER - 1. If our 1243 * subblock size is smaller than that and we have a mixture of plugged 1244 * and unplugged subblocks within such a page, we have to process in 1245 * smaller granularity. In that case we'll adjust the order exactly once 1246 * within the loop. 1247 */ 1248 for (addr = start; addr < end; ) { 1249 next = addr + PFN_PHYS(1 << order); 1250 1251 if (vm->in_sbm) { 1252 id = virtio_mem_phys_to_mb_id(addr); 1253 sb_id = virtio_mem_phys_to_sb_id(vm, addr); 1254 count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1; 1255 1256 if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) { 1257 /* Fully plugged. */ 1258 do_online = true; 1259 } else if (count == 1 || 1260 virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) { 1261 /* Fully unplugged. */ 1262 do_online = false; 1263 } else { 1264 /* 1265 * Mixture, process sub-blocks instead. This 1266 * will be at least the size of a pageblock. 1267 * We'll run into this case exactly once. 1268 */ 1269 order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT; 1270 do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1); 1271 continue; 1272 } 1273 } else { 1274 /* 1275 * If the whole block is marked fake offline, keep 1276 * everything that way. 1277 */ 1278 id = virtio_mem_phys_to_bb_id(vm, addr); 1279 do_online = virtio_mem_bbm_get_bb_state(vm, id) != 1280 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; 1281 } 1282 1283 if (do_online) 1284 generic_online_page(pfn_to_page(PFN_DOWN(addr)), order); 1285 else 1286 virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, 1287 false); 1288 addr = next; 1289 } 1290 } 1291 1292 static void virtio_mem_online_page_cb(struct page *page, unsigned int order) 1293 { 1294 const unsigned long addr = page_to_phys(page); 1295 struct virtio_mem *vm; 1296 1297 rcu_read_lock(); 1298 list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { 1299 /* 1300 * Pages we're onlining will never cross memory blocks and, 1301 * therefore, not virtio-mem devices. 1302 */ 1303 if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) 1304 continue; 1305 1306 /* 1307 * virtio_mem_set_fake_offline() might sleep. We can safely 1308 * drop the RCU lock at this point because the device 1309 * cannot go away. See virtio_mem_remove() how races 1310 * between memory onlining and device removal are handled. 1311 */ 1312 rcu_read_unlock(); 1313 1314 virtio_mem_online_page(vm, page, order); 1315 return; 1316 } 1317 rcu_read_unlock(); 1318 1319 /* not virtio-mem memory, but e.g., a DIMM. 
online it */ 1320 generic_online_page(page, order); 1321 } 1322 1323 static uint64_t virtio_mem_send_request(struct virtio_mem *vm, 1324 const struct virtio_mem_req *req) 1325 { 1326 struct scatterlist *sgs[2], sg_req, sg_resp; 1327 unsigned int len; 1328 int rc; 1329 1330 /* don't use the request residing on the stack (vaddr) */ 1331 vm->req = *req; 1332 1333 /* out: buffer for request */ 1334 sg_init_one(&sg_req, &vm->req, sizeof(vm->req)); 1335 sgs[0] = &sg_req; 1336 1337 /* in: buffer for response */ 1338 sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp)); 1339 sgs[1] = &sg_resp; 1340 1341 rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL); 1342 if (rc < 0) 1343 return rc; 1344 1345 virtqueue_kick(vm->vq); 1346 1347 /* wait for a response */ 1348 wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len)); 1349 1350 return virtio16_to_cpu(vm->vdev, vm->resp.type); 1351 } 1352 1353 static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, 1354 uint64_t size) 1355 { 1356 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1357 const struct virtio_mem_req req = { 1358 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG), 1359 .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), 1360 .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1361 }; 1362 int rc = -ENOMEM; 1363 1364 if (atomic_read(&vm->config_changed)) 1365 return -EAGAIN; 1366 1367 dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, 1368 addr + size - 1); 1369 1370 switch (virtio_mem_send_request(vm, &req)) { 1371 case VIRTIO_MEM_RESP_ACK: 1372 vm->plugged_size += size; 1373 return 0; 1374 case VIRTIO_MEM_RESP_NACK: 1375 rc = -EAGAIN; 1376 break; 1377 case VIRTIO_MEM_RESP_BUSY: 1378 rc = -ETXTBSY; 1379 break; 1380 case VIRTIO_MEM_RESP_ERROR: 1381 rc = -EINVAL; 1382 break; 1383 default: 1384 break; 1385 } 1386 1387 dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); 1388 return rc; 1389 } 1390 1391 static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, 1392 uint64_t size) 1393 { 1394 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1395 const struct virtio_mem_req req = { 1396 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG), 1397 .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), 1398 .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1399 }; 1400 int rc = -ENOMEM; 1401 1402 if (atomic_read(&vm->config_changed)) 1403 return -EAGAIN; 1404 1405 dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, 1406 addr + size - 1); 1407 1408 switch (virtio_mem_send_request(vm, &req)) { 1409 case VIRTIO_MEM_RESP_ACK: 1410 vm->plugged_size -= size; 1411 return 0; 1412 case VIRTIO_MEM_RESP_BUSY: 1413 rc = -ETXTBSY; 1414 break; 1415 case VIRTIO_MEM_RESP_ERROR: 1416 rc = -EINVAL; 1417 break; 1418 default: 1419 break; 1420 } 1421 1422 dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); 1423 return rc; 1424 } 1425 1426 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) 1427 { 1428 const struct virtio_mem_req req = { 1429 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), 1430 }; 1431 int rc = -ENOMEM; 1432 1433 dev_dbg(&vm->vdev->dev, "unplugging all memory"); 1434 1435 switch (virtio_mem_send_request(vm, &req)) { 1436 case VIRTIO_MEM_RESP_ACK: 1437 vm->unplug_all_required = false; 1438 vm->plugged_size = 0; 1439 /* usable region might have shrunk */ 1440 atomic_set(&vm->config_changed, 1); 1441 return 0; 1442 case VIRTIO_MEM_RESP_BUSY: 1443 rc = -ETXTBSY; 1444 break; 1445 
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
	return rc;
}

/*
 * Plug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
				  int sb_id, int count)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
			      sb_id * vm->sbm.sb_size;
	const uint64_t size = count * vm->sbm.sb_size;
	int rc;

	rc = virtio_mem_send_plug_request(vm, addr, size);
	if (!rc)
		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
	return rc;
}

/*
 * Unplug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
				    int sb_id, int count)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
			      sb_id * vm->sbm.sb_size;
	const uint64_t size = count * vm->sbm.sb_size;
	int rc;

	rc = virtio_mem_send_unplug_request(vm, addr, size);
	if (!rc)
		virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
	return rc;
}

/*
 * Request to unplug a big block.
 *
 * Will not modify the state of the big block.
 */
static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_send_unplug_request(vm, addr, size);
}

/*
 * Request to plug a big block.
 *
 * Will not modify the state of the big block.
 */
static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_send_plug_request(vm, addr, size);
}

/*
 * Unplug the desired number of plugged subblocks of an offline or not-added
 * memory block. Will fail if any subblock cannot get unplugged (instead of
 * skipping it).
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
					    unsigned long mb_id, uint64_t *nb_sb)
{
	int sb_id, count;
	int rc;

	sb_id = vm->sbm.sbs_per_mb - 1;
	while (*nb_sb) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;
		/* Try to unplug multiple subblocks at a time */
		count = 1;
		while (count < *nb_sb && sb_id > 0 &&
		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
			count++;
			sb_id--;
		}

		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
		if (rc)
			return rc;
		*nb_sb -= count;
		sb_id--;
	}

	return 0;
}

/*
 * Unplug all plugged subblocks of an offline or not-added memory block.
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
1562 */ 1563 static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) 1564 { 1565 uint64_t nb_sb = vm->sbm.sbs_per_mb; 1566 1567 return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); 1568 } 1569 1570 /* 1571 * Prepare tracking data for the next memory block. 1572 */ 1573 static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, 1574 unsigned long *mb_id) 1575 { 1576 int rc; 1577 1578 if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) 1579 return -ENOSPC; 1580 1581 /* Resize the state array if required. */ 1582 rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); 1583 if (rc) 1584 return rc; 1585 1586 /* Resize the subblock bitmap if required. */ 1587 rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); 1588 if (rc) 1589 return rc; 1590 1591 vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; 1592 *mb_id = vm->sbm.next_mb_id++; 1593 return 0; 1594 } 1595 1596 /* 1597 * Try to plug the desired number of subblocks and add the memory block 1598 * to Linux. 1599 * 1600 * Will modify the state of the memory block. 1601 */ 1602 static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, 1603 unsigned long mb_id, uint64_t *nb_sb) 1604 { 1605 const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); 1606 int rc; 1607 1608 if (WARN_ON_ONCE(!count)) 1609 return -EINVAL; 1610 1611 /* 1612 * Plug the requested number of subblocks before adding it to linux, 1613 * so that onlining will directly online all plugged subblocks. 1614 */ 1615 rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); 1616 if (rc) 1617 return rc; 1618 1619 /* 1620 * Mark the block properly offline before adding it to Linux, 1621 * so the memory notifiers will find the block in the right state. 1622 */ 1623 if (count == vm->sbm.sbs_per_mb) 1624 virtio_mem_sbm_set_mb_state(vm, mb_id, 1625 VIRTIO_MEM_SBM_MB_OFFLINE); 1626 else 1627 virtio_mem_sbm_set_mb_state(vm, mb_id, 1628 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1629 1630 /* Add the memory block to linux - if that fails, try to unplug. */ 1631 rc = virtio_mem_sbm_add_mb(vm, mb_id); 1632 if (rc) { 1633 int new_state = VIRTIO_MEM_SBM_MB_UNUSED; 1634 1635 if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) 1636 new_state = VIRTIO_MEM_SBM_MB_PLUGGED; 1637 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 1638 return rc; 1639 } 1640 1641 *nb_sb -= count; 1642 return 0; 1643 } 1644 1645 /* 1646 * Try to plug the desired number of subblocks of a memory block that 1647 * is already added to Linux. 1648 * 1649 * Will modify the state of the memory block. 1650 * 1651 * Note: Can fail after some subblocks were successfully plugged. 
1652 */ 1653 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, 1654 unsigned long mb_id, uint64_t *nb_sb) 1655 { 1656 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1657 unsigned long pfn, nr_pages; 1658 int sb_id, count; 1659 int rc; 1660 1661 if (WARN_ON_ONCE(!*nb_sb)) 1662 return -EINVAL; 1663 1664 while (*nb_sb) { 1665 sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); 1666 if (sb_id >= vm->sbm.sbs_per_mb) 1667 break; 1668 count = 1; 1669 while (count < *nb_sb && 1670 sb_id + count < vm->sbm.sbs_per_mb && 1671 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) 1672 count++; 1673 1674 rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); 1675 if (rc) 1676 return rc; 1677 *nb_sb -= count; 1678 if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 1679 continue; 1680 1681 /* fake-online the pages if the memory block is online */ 1682 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1683 sb_id * vm->sbm.sb_size); 1684 nr_pages = PFN_DOWN(count * vm->sbm.sb_size); 1685 virtio_mem_fake_online(pfn, nr_pages); 1686 } 1687 1688 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1689 virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); 1690 1691 return 0; 1692 } 1693 1694 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1695 { 1696 const int mb_states[] = { 1697 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 1698 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 1699 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 1700 }; 1701 uint64_t nb_sb = diff / vm->sbm.sb_size; 1702 unsigned long mb_id; 1703 int rc, i; 1704 1705 if (!nb_sb) 1706 return 0; 1707 1708 /* Don't race with onlining/offlining */ 1709 mutex_lock(&vm->hotplug_mutex); 1710 1711 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 1712 virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { 1713 rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); 1714 if (rc || !nb_sb) 1715 goto out_unlock; 1716 cond_resched(); 1717 } 1718 } 1719 1720 /* 1721 * We won't be working on online/offline memory blocks from this point, 1722 * so we can't race with memory onlining/offlining. Drop the mutex. 1723 */ 1724 mutex_unlock(&vm->hotplug_mutex); 1725 1726 /* Try to plug and add unused blocks */ 1727 virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { 1728 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1729 return -ENOSPC; 1730 1731 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1732 if (rc || !nb_sb) 1733 return rc; 1734 cond_resched(); 1735 } 1736 1737 /* Try to prepare, plug and add new blocks */ 1738 while (nb_sb) { 1739 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1740 return -ENOSPC; 1741 1742 rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); 1743 if (rc) 1744 return rc; 1745 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1746 if (rc) 1747 return rc; 1748 cond_resched(); 1749 } 1750 1751 return 0; 1752 out_unlock: 1753 mutex_unlock(&vm->hotplug_mutex); 1754 return rc; 1755 } 1756 1757 /* 1758 * Plug a big block and add it to Linux. 1759 * 1760 * Will modify the state of the big block. 
1761 */ 1762 static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, 1763 unsigned long bb_id) 1764 { 1765 int rc; 1766 1767 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 1768 VIRTIO_MEM_BBM_BB_UNUSED)) 1769 return -EINVAL; 1770 1771 rc = virtio_mem_bbm_plug_bb(vm, bb_id); 1772 if (rc) 1773 return rc; 1774 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 1775 1776 rc = virtio_mem_bbm_add_bb(vm, bb_id); 1777 if (rc) { 1778 if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) 1779 virtio_mem_bbm_set_bb_state(vm, bb_id, 1780 VIRTIO_MEM_BBM_BB_UNUSED); 1781 else 1782 /* Retry from the main loop. */ 1783 virtio_mem_bbm_set_bb_state(vm, bb_id, 1784 VIRTIO_MEM_BBM_BB_PLUGGED); 1785 return rc; 1786 } 1787 return 0; 1788 } 1789 1790 /* 1791 * Prepare tracking data for the next big block. 1792 */ 1793 static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, 1794 unsigned long *bb_id) 1795 { 1796 int rc; 1797 1798 if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) 1799 return -ENOSPC; 1800 1801 /* Resize the big block state array if required. */ 1802 rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); 1803 if (rc) 1804 return rc; 1805 1806 vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; 1807 *bb_id = vm->bbm.next_bb_id; 1808 vm->bbm.next_bb_id++; 1809 return 0; 1810 } 1811 1812 static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1813 { 1814 uint64_t nb_bb = diff / vm->bbm.bb_size; 1815 unsigned long bb_id; 1816 int rc; 1817 1818 if (!nb_bb) 1819 return 0; 1820 1821 /* Try to plug and add unused big blocks */ 1822 virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { 1823 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1824 return -ENOSPC; 1825 1826 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1827 if (!rc) 1828 nb_bb--; 1829 if (rc || !nb_bb) 1830 return rc; 1831 cond_resched(); 1832 } 1833 1834 /* Try to prepare, plug and add new big blocks */ 1835 while (nb_bb) { 1836 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1837 return -ENOSPC; 1838 1839 rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); 1840 if (rc) 1841 return rc; 1842 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1843 if (!rc) 1844 nb_bb--; 1845 if (rc) 1846 return rc; 1847 cond_resched(); 1848 } 1849 1850 return 0; 1851 } 1852 1853 /* 1854 * Try to plug the requested amount of memory. 1855 */ 1856 static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) 1857 { 1858 if (vm->in_sbm) 1859 return virtio_mem_sbm_plug_request(vm, diff); 1860 return virtio_mem_bbm_plug_request(vm, diff); 1861 } 1862 1863 /* 1864 * Unplug the desired number of plugged subblocks of an offline memory block. 1865 * Will fail if any subblock cannot get unplugged (instead of skipping it). 1866 * 1867 * Will modify the state of the memory block. Might temporarily drop the 1868 * hotplug_mutex. 1869 * 1870 * Note: Can fail after some subblocks were successfully unplugged. 
1871 */ 1872 static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, 1873 unsigned long mb_id, 1874 uint64_t *nb_sb) 1875 { 1876 int rc; 1877 1878 rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); 1879 1880 /* some subblocks might have been unplugged even on failure */ 1881 if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1882 virtio_mem_sbm_set_mb_state(vm, mb_id, 1883 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1884 if (rc) 1885 return rc; 1886 1887 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1888 /* 1889 * Remove the block from Linux - this should never fail. 1890 * Hinder the block from getting onlined by marking it 1891 * unplugged. Temporarily drop the mutex, so 1892 * any pending GOING_ONLINE requests can be serviced/rejected. 1893 */ 1894 virtio_mem_sbm_set_mb_state(vm, mb_id, 1895 VIRTIO_MEM_SBM_MB_UNUSED); 1896 1897 mutex_unlock(&vm->hotplug_mutex); 1898 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 1899 BUG_ON(rc); 1900 mutex_lock(&vm->hotplug_mutex); 1901 } 1902 return 0; 1903 } 1904 1905 /* 1906 * Unplug the given plugged subblocks of an online memory block. 1907 * 1908 * Will modify the state of the memory block. 1909 */ 1910 static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, 1911 unsigned long mb_id, int sb_id, 1912 int count) 1913 { 1914 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; 1915 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1916 unsigned long start_pfn; 1917 int rc; 1918 1919 start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1920 sb_id * vm->sbm.sb_size); 1921 1922 rc = virtio_mem_fake_offline(start_pfn, nr_pages); 1923 if (rc) 1924 return rc; 1925 1926 /* Try to unplug the allocated memory */ 1927 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1928 if (rc) { 1929 /* Return the memory to the buddy. */ 1930 virtio_mem_fake_online(start_pfn, nr_pages); 1931 return rc; 1932 } 1933 1934 switch (old_state) { 1935 case VIRTIO_MEM_SBM_MB_KERNEL: 1936 virtio_mem_sbm_set_mb_state(vm, mb_id, 1937 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); 1938 break; 1939 case VIRTIO_MEM_SBM_MB_MOVABLE: 1940 virtio_mem_sbm_set_mb_state(vm, mb_id, 1941 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); 1942 break; 1943 } 1944 1945 return 0; 1946 } 1947 1948 /* 1949 * Unplug the desired number of plugged subblocks of an online memory block. 1950 * Will skip subblock that are busy. 1951 * 1952 * Will modify the state of the memory block. Might temporarily drop the 1953 * hotplug_mutex. 1954 * 1955 * Note: Can fail after some subblocks were successfully unplugged. Can 1956 * return 0 even if subblocks were busy and could not get unplugged. 1957 */ 1958 static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, 1959 unsigned long mb_id, 1960 uint64_t *nb_sb) 1961 { 1962 int rc, sb_id; 1963 1964 /* If possible, try to unplug the complete block in one shot. */ 1965 if (*nb_sb >= vm->sbm.sbs_per_mb && 1966 virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1967 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, 1968 vm->sbm.sbs_per_mb); 1969 if (!rc) { 1970 *nb_sb -= vm->sbm.sbs_per_mb; 1971 goto unplugged; 1972 } else if (rc != -EBUSY) 1973 return rc; 1974 } 1975 1976 /* Fallback to single subblocks. 
	 */
	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;

		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
		if (rc == -EBUSY)
			continue;
		else if (rc)
			return rc;
		*nb_sb -= 1;
	}

unplugged:
	/*
	 * Once all subblocks of a memory block were unplugged, offline and
	 * remove it. This will usually not fail, as no memory is in use
	 * anymore - however some other notifiers might NACK the request.
	 */
	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
		mutex_unlock(&vm->hotplug_mutex);
		rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
		mutex_lock(&vm->hotplug_mutex);
		if (!rc)
			virtio_mem_sbm_set_mb_state(vm, mb_id,
						    VIRTIO_MEM_SBM_MB_UNUSED);
	}

	return 0;
}

/*
 * Unplug the desired number of plugged subblocks of a memory block that is
 * already added to Linux. Will skip subblocks of online memory blocks that
 * are busy (by the OS). Will fail if any subblock that's not busy cannot get
 * unplugged.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Note: Can fail after some subblocks were successfully unplugged. Can
 * return 0 even if subblocks were busy and could not get unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
					unsigned long mb_id,
					uint64_t *nb_sb)
{
	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);

	switch (old_state) {
	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
	case VIRTIO_MEM_SBM_MB_KERNEL:
	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_MOVABLE:
		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_OFFLINE:
		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
	}
	return -EINVAL;
}

static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	const int mb_states[] = {
		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
		VIRTIO_MEM_SBM_MB_OFFLINE,
		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
		VIRTIO_MEM_SBM_MB_MOVABLE,
		VIRTIO_MEM_SBM_MB_KERNEL,
	};
	uint64_t nb_sb = diff / vm->sbm.sb_size;
	unsigned long mb_id;
	int rc, i;

	if (!nb_sb)
		return 0;

	/*
	 * We'll drop the mutex a couple of times when it is safe to do so.
	 * This might result in some blocks switching the state (online/offline)
	 * and we could miss them in this run - we will retry again later.
	 */
	mutex_lock(&vm->hotplug_mutex);

	/*
	 * We try unplug from partially plugged blocks first, to try removing
	 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE
	 * as it's more reliable to unplug memory and remove whole memory
	 * blocks, and we don't want to trigger zone imbalances by
	 * accidentally removing too much kernel memory.
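	 * The mb_states[] array above encodes that priority: offline blocks
	 * first (partially plugged before fully plugged), then partially
	 * plugged online blocks (ZONE_MOVABLE before kernel zones), then
	 * fully plugged online blocks (again ZONE_MOVABLE first). With
	 * unplug_online=0, we stop after the offline states.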
	 */
	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
			if (rc || !nb_sb)
				goto out_unlock;
			mutex_unlock(&vm->hotplug_mutex);
			cond_resched();
			mutex_lock(&vm->hotplug_mutex);
		}
		if (!unplug_online && i == 1) {
			mutex_unlock(&vm->hotplug_mutex);
			return 0;
		}
	}

	mutex_unlock(&vm->hotplug_mutex);
	return nb_sb ? -EBUSY : 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Try to offline and remove a big block from Linux and unplug it. Will fail
 * with -EBUSY if some memory is busy and cannot get unplugged.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 */
static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
							unsigned long bb_id)
{
	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
	unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;
	struct page *page;
	int rc;

	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
			 VIRTIO_MEM_BBM_BB_ADDED))
		return -EINVAL;

	if (bbm_safe_unplug) {
		/*
		 * Start by fake-offlining all memory. Once we've marked the
		 * device block as fake-offline, all newly onlined memory will
		 * automatically be kept fake-offline. Protect from concurrent
		 * onlining/offlining until we have a consistent state.
		 */
		mutex_lock(&vm->hotplug_mutex);
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);

		for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
			page = pfn_to_online_page(pfn);
			if (!page)
				continue;

			rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
			if (rc) {
				end_pfn = pfn;
				goto rollback_safe_unplug;
			}
		}
		mutex_unlock(&vm->hotplug_mutex);
	}

	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
	if (rc) {
		if (bbm_safe_unplug) {
			mutex_lock(&vm->hotplug_mutex);
			goto rollback_safe_unplug;
		}
		return rc;
	}

	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
	if (rc)
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_PLUGGED);
	else
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_UNUSED);
	return rc;

rollback_safe_unplug:
	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		page = pfn_to_online_page(pfn);
		if (!page)
			continue;
		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
	}
	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Test if a big block is completely offline.
 */
static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
	unsigned long pfn;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
	     pfn += PAGES_PER_SECTION) {
		if (pfn_to_online_page(pfn))
			return false;
	}

	return true;
}

/*
 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
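 * Sections that are currently offline are simply skipped; only online pages
 * are checked for ZONE_MOVABLE.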
 */
static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
	struct page *page;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
	     pfn += PAGES_PER_SECTION) {
		page = pfn_to_online_page(pfn);
		if (!page)
			continue;
		if (page_zonenum(page) != ZONE_MOVABLE)
			return false;
	}

	return true;
}

static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	uint64_t nb_bb = diff / vm->bbm.bb_size;
	uint64_t bb_id;
	int rc, i;

	if (!nb_bb)
		return 0;

	/*
	 * Try to unplug big blocks. Similar to SBM, start with offline
	 * big blocks.
	 */
	for (i = 0; i < 3; i++) {
		virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
			cond_resched();

			/*
			 * As we're holding no locks, these checks are racy,
			 * but we don't care.
			 */
			if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
				continue;
			if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
				continue;
			rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
			if (rc == -EBUSY)
				continue;
			if (!rc)
				nb_bb--;
			if (rc || !nb_bb)
				return rc;
		}
		if (i == 0 && !unplug_online)
			return 0;
	}

	return nb_bb ? -EBUSY : 0;
}

/*
 * Try to unplug the requested amount of memory.
 */
static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	if (vm->in_sbm)
		return virtio_mem_sbm_unplug_request(vm, diff);
	return virtio_mem_bbm_unplug_request(vm, diff);
}

/*
 * Try to unplug all blocks that couldn't be unplugged before, for example,
 * because the hypervisor was busy.
 */
static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
{
	unsigned long id;
	int rc;

	if (!vm->in_sbm) {
		virtio_mem_bbm_for_each_bb(vm, id,
					   VIRTIO_MEM_BBM_BB_PLUGGED) {
			rc = virtio_mem_bbm_unplug_bb(vm, id);
			if (rc)
				return rc;
			virtio_mem_bbm_set_bb_state(vm, id,
						    VIRTIO_MEM_BBM_BB_UNUSED);
		}
		return 0;
	}

	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
		rc = virtio_mem_sbm_unplug_mb(vm, id);
		if (rc)
			return rc;
		virtio_mem_sbm_set_mb_state(vm, id,
					    VIRTIO_MEM_SBM_MB_UNUSED);
	}

	return 0;
}

/*
 * Update all parts of the config that could have changed.
 */
static void virtio_mem_refresh_config(struct virtio_mem *vm)
{
	const struct range pluggable_range = mhp_get_pluggable_range(true);
	uint64_t new_plugged_size, usable_region_size, end_addr;

	/* the plugged_size is just a reflection of what _we_ did previously */
	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
			&new_plugged_size);
	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
		vm->plugged_size = new_plugged_size;

	/* calculate the last usable memory block id */
	virtio_cread_le(vm->vdev, struct virtio_mem_config,
			usable_region_size, &usable_region_size);
	end_addr = min(vm->addr + usable_region_size - 1,
		       pluggable_range.end);

	if (vm->in_sbm) {
		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
			vm->sbm.last_usable_mb_id--;
	} else {
		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
								     end_addr);
		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
			vm->bbm.last_usable_bb_id--;
	}
	/*
	 * If we cannot plug any of our device memory (e.g., nothing in the
	 * usable region is addressable), the last usable memory block id will
	 * be smaller than the first usable memory block id. We'll stop
	 * attempting to add memory with -ENOSPC from our main loop.
	 */

	/* see if there is a request to change the size */
	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
			&vm->requested_size);

	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
}

/*
 * Workqueue function for handling plug/unplug requests and config updates.
 */
static void virtio_mem_run_wq(struct work_struct *work)
{
	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
	uint64_t diff;
	int rc;

	if (unlikely(vm->in_kdump)) {
		dev_warn_once(&vm->vdev->dev,
			      "unexpected workqueue run in kdump kernel\n");
		return;
	}

	hrtimer_cancel(&vm->retry_timer);

	if (vm->broken)
		return;

	atomic_set(&vm->wq_active, 1);
retry:
	rc = 0;

	/* Make sure we start with a clean state if there are leftovers. */
	if (unlikely(vm->unplug_all_required))
		rc = virtio_mem_send_unplug_all_request(vm);

	if (atomic_read(&vm->config_changed)) {
		atomic_set(&vm->config_changed, 0);
		virtio_mem_refresh_config(vm);
	}

	/* Unplug any leftovers from previous runs */
	if (!rc)
		rc = virtio_mem_unplug_pending_mb(vm);

	if (!rc && vm->requested_size != vm->plugged_size) {
		if (vm->requested_size > vm->plugged_size) {
			diff = vm->requested_size - vm->plugged_size;
			rc = virtio_mem_plug_request(vm, diff);
		} else {
			diff = vm->plugged_size - vm->requested_size;
			rc = virtio_mem_unplug_request(vm, diff);
		}
	}

	switch (rc) {
	case 0:
		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
		break;
	case -ENOSPC:
		/*
		 * We cannot add any more memory (alignment, physical limit)
		 * or we have too many offline memory blocks.
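		 * No retry timer is armed in this case; a later config change
		 * (see virtio_mem_config_changed()) will trigger the
		 * workqueue again.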
		 */
		break;
	case -ETXTBSY:
		/*
		 * The hypervisor cannot process our request right now
		 * (e.g., out of memory, migrating).
		 */
	case -EBUSY:
		/*
		 * We cannot free up any memory to unplug it (all plugged memory
		 * is busy).
		 */
	case -ENOMEM:
		/* Out of memory, try again later. */
		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
			      HRTIMER_MODE_REL);
		break;
	case -EAGAIN:
		/* Retry immediately (e.g., the config changed). */
		goto retry;
	default:
		/* Unknown error, mark as broken */
		dev_err(&vm->vdev->dev,
			"unknown error, marking device broken: %d\n", rc);
		vm->broken = true;
	}

	atomic_set(&vm->wq_active, 0);
}

static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
{
	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
					     retry_timer);

	virtio_mem_retry(vm);
	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
	return HRTIMER_NORESTART;
}

static void virtio_mem_handle_response(struct virtqueue *vq)
{
	struct virtio_mem *vm = vq->vdev->priv;

	wake_up(&vm->host_resp);
}

static int virtio_mem_init_vq(struct virtio_mem *vm)
{
	struct virtqueue *vq;

	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
				   "guest-request");
	if (IS_ERR(vq))
		return PTR_ERR(vq);
	vm->vq = vq;

	return 0;
}

static int virtio_mem_init_hotplug(struct virtio_mem *vm)
{
	const struct range pluggable_range = mhp_get_pluggable_range(true);
	uint64_t unit_pages, sb_size, addr;
	int rc;

	/* bad device setup - warn only */
	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
		dev_warn(&vm->vdev->dev,
			 "The alignment of the physical start address can make some memory unusable.\n");
	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
		dev_warn(&vm->vdev->dev,
			 "The alignment of the physical end address can make some memory unusable.\n");
	if (vm->addr < pluggable_range.start ||
	    vm->addr + vm->region_size - 1 > pluggable_range.end)
		dev_warn(&vm->vdev->dev,
			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");

	/* Prepare the offline threshold - make sure we can add two blocks. */
	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);

	/*
	 * TODO: once alloc_contig_range() works reliably with pageblock
	 * granularity on ZONE_NORMAL, use pageblock_nr_pages instead.
	 */
	sb_size = PAGE_SIZE * MAX_ORDER_NR_PAGES;
	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);

	if (sb_size < memory_block_size_bytes() && !force_bbm) {
		/* SBM: At least two subblocks per Linux memory block. */
		vm->in_sbm = true;
		vm->sbm.sb_size = sb_size;
		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
				     vm->sbm.sb_size;

		/* Round up to the next full memory block */
		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
		       memory_block_size_bytes() - 1;
		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
	} else {
		/* BBM: At least one Linux memory block.
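		 * For example (values are only illustrative): with 128 MiB
		 * Linux memory blocks and a 2 GiB device block size, the big
		 * block size below starts at 2 GiB; with a 4 MiB device block
		 * size and force_bbm=1, it starts at one memory block
		 * (128 MiB).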
		 */
		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
					memory_block_size_bytes());

		if (bbm_block_size) {
			if (!is_power_of_2(bbm_block_size)) {
				dev_warn(&vm->vdev->dev,
					 "bbm_block_size is not a power of 2");
			} else if (bbm_block_size < vm->bbm.bb_size) {
				dev_warn(&vm->vdev->dev,
					 "bbm_block_size is too small");
			} else {
				vm->bbm.bb_size = bbm_block_size;
			}
		}

		/* Round up to the next aligned big block */
		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
		       vm->bbm.bb_size - 1;
		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
		vm->bbm.next_bb_id = vm->bbm.first_bb_id;

		/* Make sure we can add two big blocks. */
		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
					      vm->offline_threshold);
	}

	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
		 memory_block_size_bytes());
	if (vm->in_sbm)
		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
			 (unsigned long long)vm->sbm.sb_size);
	else
		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
			 (unsigned long long)vm->bbm.bb_size);

	/* create the parent resource for all memory */
	rc = virtio_mem_create_resource(vm);
	if (rc)
		return rc;

	/* use a single dynamic memory group to cover the whole memory device */
	if (vm->in_sbm)
		unit_pages = PHYS_PFN(memory_block_size_bytes());
	else
		unit_pages = PHYS_PFN(vm->bbm.bb_size);
	rc = memory_group_register_dynamic(vm->nid, unit_pages);
	if (rc < 0)
		goto out_del_resource;
	vm->mgid = rc;

	/*
	 * If we still have memory plugged, we have to unplug all memory first.
	 * Registering our parent resource makes sure that this memory isn't
	 * actually in use (e.g., trying to reload the driver).
	 */
	if (vm->plugged_size) {
		vm->unplug_all_required = true;
		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
	}

	/* register callbacks */
	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
	rc = register_memory_notifier(&vm->memory_notifier);
	if (rc)
		goto out_unreg_group;
	rc = register_virtio_mem_device(vm);
	if (rc)
		goto out_unreg_mem;

	return 0;
out_unreg_mem:
	unregister_memory_notifier(&vm->memory_notifier);
out_unreg_group:
	memory_group_unregister(vm->mgid);
out_del_resource:
	virtio_mem_delete_resource(vm);
	return rc;
}

#ifdef CONFIG_PROC_VMCORE
static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
					 uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
		.u.state.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};
	int rc = -ENOMEM;

	dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state);
	case VIRTIO_MEM_RESP_ERROR:
		rc = -EINVAL;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc);
	return rc;
}

static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
					 unsigned long pfn)
{
	struct virtio_mem *vm = container_of(cb, struct virtio_mem,
					     vmcore_cb);
	uint64_t addr = PFN_PHYS(pfn);
	bool is_ram;
	int rc;

	if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))
		return true;
	if (!vm->plugged_size)
		return false;

	/*
	 * We have to serialize device requests and access to the information
	 * about the block queried last.
	 */
	mutex_lock(&vm->hotplug_mutex);

	addr = ALIGN_DOWN(addr, vm->device_block_size);
	if (addr != vm->last_block_addr) {
		rc = virtio_mem_send_state_request(vm, addr,
						   vm->device_block_size);
		/* On any kind of error, we're going to signal !ram. */
		if (rc == VIRTIO_MEM_STATE_PLUGGED)
			vm->last_block_plugged = true;
		else
			vm->last_block_plugged = false;
		vm->last_block_addr = addr;
	}

	is_ram = vm->last_block_plugged;
	mutex_unlock(&vm->hotplug_mutex);
	return is_ram;
}
#endif /* CONFIG_PROC_VMCORE */

static int virtio_mem_init_kdump(struct virtio_mem *vm)
{
#ifdef CONFIG_PROC_VMCORE
	dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n");
	vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
	register_vmcore_cb(&vm->vmcore_cb);
	return 0;
#else /* CONFIG_PROC_VMCORE */
	dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n");
	return -EBUSY;
#endif /* CONFIG_PROC_VMCORE */
}

static int virtio_mem_init(struct virtio_mem *vm)
{
	uint16_t node_id;

	if (!vm->vdev->config->get) {
		dev_err(&vm->vdev->dev, "config access disabled\n");
		return -EINVAL;
	}

	/* Fetch all properties that can't change.
	 */
	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
			&vm->plugged_size);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
			&vm->device_block_size);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
			&node_id);
	vm->nid = virtio_mem_translate_node_id(vm, node_id);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
			&vm->region_size);

	/* Determine the nid for the device based on the lowest address. */
	if (vm->nid == NUMA_NO_NODE)
		vm->nid = memory_add_physaddr_to_nid(vm->addr);

	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
		 (unsigned long long)vm->device_block_size);
	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);

	/*
	 * We don't want to (un)plug or reuse any memory when in kdump. The
	 * memory is still accessible (but not exposed to Linux).
	 */
	if (vm->in_kdump)
		return virtio_mem_init_kdump(vm);
	return virtio_mem_init_hotplug(vm);
}

static int virtio_mem_create_resource(struct virtio_mem *vm)
{
	/*
	 * When force-unloading the driver and removing the device, we
	 * could have a garbage pointer. Duplicate the string.
	 */
	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);

	if (!name)
		return -ENOMEM;

	/* Disallow mapping device memory via /dev/mem completely. */
	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
						   name, IORESOURCE_SYSTEM_RAM |
						   IORESOURCE_EXCLUSIVE);
	if (!vm->parent_resource) {
		kfree(name);
		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
		dev_info(&vm->vdev->dev,
			 "reloading the driver is not supported\n");
		return -EBUSY;
	}

	/* The memory is not actually busy - make add_memory() work.
	 */
	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
	return 0;
}

static void virtio_mem_delete_resource(struct virtio_mem *vm)
{
	const char *name;

	if (!vm->parent_resource)
		return;

	name = vm->parent_resource->name;
	release_resource(vm->parent_resource);
	kfree(vm->parent_resource);
	kfree(name);
	vm->parent_resource = NULL;
}

static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
{
	return 1;
}

static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
{
	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
				   vm->addr + vm->region_size, NULL,
				   virtio_mem_range_has_system_ram) == 1;
}

static int virtio_mem_probe(struct virtio_device *vdev)
{
	struct virtio_mem *vm;
	int rc;

	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);

	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
	if (!vm)
		return -ENOMEM;

	init_waitqueue_head(&vm->host_resp);
	vm->vdev = vdev;
	INIT_WORK(&vm->wq, virtio_mem_run_wq);
	mutex_init(&vm->hotplug_mutex);
	INIT_LIST_HEAD(&vm->next);
	spin_lock_init(&vm->removal_lock);
	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	vm->retry_timer.function = virtio_mem_timer_expired;
	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
	vm->in_kdump = is_kdump_kernel();

	/* register the virtqueue */
	rc = virtio_mem_init_vq(vm);
	if (rc)
		goto out_free_vm;

	/* initialize the device by querying the config */
	rc = virtio_mem_init(vm);
	if (rc)
		goto out_del_vq;

	virtio_device_ready(vdev);

	/* trigger a config update to start processing the requested_size */
	if (!vm->in_kdump) {
		atomic_set(&vm->config_changed, 1);
		queue_work(system_freezable_wq, &vm->wq);
	}

	return 0;
out_del_vq:
	vdev->config->del_vqs(vdev);
out_free_vm:
	kfree(vm);
	vdev->priv = NULL;

	return rc;
}

static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
{
	unsigned long mb_id;
	int rc;

	/*
	 * Make sure the workqueue won't be triggered anymore and no memory
	 * blocks can be onlined/offlined until we're finished here.
	 */
	mutex_lock(&vm->hotplug_mutex);
	spin_lock_irq(&vm->removal_lock);
	vm->removing = true;
	spin_unlock_irq(&vm->removal_lock);
	mutex_unlock(&vm->hotplug_mutex);

	/* wait until the workqueue stopped */
	cancel_work_sync(&vm->wq);
	hrtimer_cancel(&vm->retry_timer);

	if (vm->in_sbm) {
		/*
		 * After we unregistered our callbacks, user space can online
		 * partially plugged offline blocks. Make sure to remove them.
		 */
		virtio_mem_sbm_for_each_mb(vm, mb_id,
					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
			BUG_ON(rc);
			virtio_mem_sbm_set_mb_state(vm, mb_id,
						    VIRTIO_MEM_SBM_MB_UNUSED);
		}
		/*
		 * After we unregistered our callbacks, user space can no
		 * longer offline partially plugged online memory blocks. No
		 * need to worry about them.
		 */
	}

	/* unregister callbacks */
	unregister_virtio_mem_device(vm);
	unregister_memory_notifier(&vm->memory_notifier);

	/*
	 * There is no way we could reliably remove all memory we have added to
	 * the system. And there is no way to stop the driver/device from going
	 * away. Warn at least.
	 */
	if (virtio_mem_has_memory_added(vm)) {
		dev_warn(&vm->vdev->dev,
			 "device still has system memory added\n");
	} else {
		virtio_mem_delete_resource(vm);
		kfree_const(vm->resource_name);
		memory_group_unregister(vm->mgid);
	}

	/* remove all tracking data - no locking needed */
	if (vm->in_sbm) {
		vfree(vm->sbm.mb_states);
		vfree(vm->sbm.sb_states);
	} else {
		vfree(vm->bbm.bb_states);
	}
}

static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
{
#ifdef CONFIG_PROC_VMCORE
	unregister_vmcore_cb(&vm->vmcore_cb);
#endif /* CONFIG_PROC_VMCORE */
}

static void virtio_mem_remove(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;

	if (vm->in_kdump)
		virtio_mem_deinit_kdump(vm);
	else
		virtio_mem_deinit_hotplug(vm);

	/* reset the device and cleanup the queues */
	virtio_reset_device(vdev);
	vdev->config->del_vqs(vdev);

	kfree(vm);
	vdev->priv = NULL;
}

static void virtio_mem_config_changed(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;

	if (unlikely(vm->in_kdump))
		return;

	atomic_set(&vm->config_changed, 1);
	virtio_mem_retry(vm);
}

#ifdef CONFIG_PM_SLEEP
static int virtio_mem_freeze(struct virtio_device *vdev)
{
	/*
	 * When restarting the VM, all memory is usually unplugged. Don't
	 * allow suspend/hibernate.
	 */
	dev_err(&vdev->dev, "save/restore not supported.\n");
	return -EPERM;
}

static int virtio_mem_restore(struct virtio_device *vdev)
{
	return -EPERM;
}
#endif

static unsigned int virtio_mem_features[] = {
#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
	VIRTIO_MEM_F_ACPI_PXM,
#endif
	VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
};

static const struct virtio_device_id virtio_mem_id_table[] = {
	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static struct virtio_driver virtio_mem_driver = {
	.feature_table = virtio_mem_features,
	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
	.driver.name = KBUILD_MODNAME,
	.driver.owner = THIS_MODULE,
	.id_table = virtio_mem_id_table,
	.probe = virtio_mem_probe,
	.remove = virtio_mem_remove,
	.config_changed = virtio_mem_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze = virtio_mem_freeze,
	.restore = virtio_mem_restore,
#endif
};

module_virtio_driver(virtio_mem_driver);
MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
MODULE_DESCRIPTION("Virtio-mem driver");
MODULE_LICENSE("GPL");