// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio-mem device driver.
 *
 * Copyright Red Hat, Inc. 2020
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */

#include <linux/virtio.h>
#include <linux/virtio_mem.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/memory.h>
#include <linux/hrtimer.h>
#include <linux/crash_dump.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/lockdep.h>
#include <linux/log2.h>

#include <acpi/acpi_numa.h>

/*
 * Module parameters. The MODULE_PARM_DESC() strings are the user-visible
 * descriptions of each knob.
 */

/*
 * Read by the memory notifier: when this is false, a MEM_OFFLINE event
 * triggers the workqueue so pending unplug requests can be retried.
 */
static bool unplug_online = true;
module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");

static bool force_bbm;
module_param(force_bbm, bool, 0444);
MODULE_PARM_DESC(force_bbm,
		"Force Big Block Mode. Default is 0 (auto-selection)");

static unsigned long bbm_block_size;
module_param(bbm_block_size, ulong, 0444);
MODULE_PARM_DESC(bbm_block_size,
		 "Big Block size in bytes. Default is 0 (auto-detection).");

static bool bbm_safe_unplug = true;
module_param(bbm_safe_unplug, bool, 0444);
MODULE_PARM_DESC(bbm_safe_unplug,
	     "Use a safe unplug mechanism in BBM, avoiding long/endless loops");

/*
 * virtio-mem currently supports the following modes of operation:
 *
 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
 *   size of a Sub Block (SB) is determined based on the device block size, the
 *   pageblock size, and the maximum allocation granularity of the buddy.
 *   Subblocks within a Linux memory block might either be plugged or unplugged.
 *   Memory is added/removed to Linux MM in Linux memory block granularity.
 *
 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
 *   Memory is added/removed to Linux MM in Big Block granularity.
57 * 58 * The mode is determined automatically based on the Linux memory block size 59 * and the device block size. 60 * 61 * User space / core MM (auto onlining) is responsible for onlining added 62 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are 63 * always onlined separately, and all memory within a Linux memory block is 64 * onlined to the same zone - virtio-mem relies on this behavior. 65 */ 66 67 /* 68 * State of a Linux memory block in SBM. 69 */ 70 enum virtio_mem_sbm_mb_state { 71 /* Unplugged, not added to Linux. Can be reused later. */ 72 VIRTIO_MEM_SBM_MB_UNUSED = 0, 73 /* (Partially) plugged, not added to Linux. Error on add_memory(). */ 74 VIRTIO_MEM_SBM_MB_PLUGGED, 75 /* Fully plugged, fully added to Linux, offline. */ 76 VIRTIO_MEM_SBM_MB_OFFLINE, 77 /* Partially plugged, fully added to Linux, offline. */ 78 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 79 /* Fully plugged, fully added to Linux, onlined to a kernel zone. */ 80 VIRTIO_MEM_SBM_MB_KERNEL, 81 /* Partially plugged, fully added to Linux, online to a kernel zone */ 82 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 83 /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 84 VIRTIO_MEM_SBM_MB_MOVABLE, 85 /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 86 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 87 VIRTIO_MEM_SBM_MB_COUNT 88 }; 89 90 /* 91 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks. 92 */ 93 enum virtio_mem_bbm_bb_state { 94 /* Unplugged, not added to Linux. Can be reused later. */ 95 VIRTIO_MEM_BBM_BB_UNUSED = 0, 96 /* Plugged, not added to Linux. Error on add_memory(). */ 97 VIRTIO_MEM_BBM_BB_PLUGGED, 98 /* Plugged and added to Linux. */ 99 VIRTIO_MEM_BBM_BB_ADDED, 100 /* All online parts are fake-offline, ready to remove. */ 101 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE, 102 VIRTIO_MEM_BBM_BB_COUNT 103 }; 104 105 struct virtio_mem { 106 struct virtio_device *vdev; 107 108 /* We might first have to unplug all memory when starting up. 
*/ 109 bool unplug_all_required; 110 111 /* Workqueue that processes the plug/unplug requests. */ 112 struct work_struct wq; 113 atomic_t wq_active; 114 atomic_t config_changed; 115 116 /* Virtqueue for guest->host requests. */ 117 struct virtqueue *vq; 118 119 /* Wait for a host response to a guest request. */ 120 wait_queue_head_t host_resp; 121 122 /* Space for one guest request and the host response. */ 123 struct virtio_mem_req req; 124 struct virtio_mem_resp resp; 125 126 /* The current size of the device. */ 127 uint64_t plugged_size; 128 /* The requested size of the device. */ 129 uint64_t requested_size; 130 131 /* The device block size (for communicating with the device). */ 132 uint64_t device_block_size; 133 /* The determined node id for all memory of the device. */ 134 int nid; 135 /* Physical start address of the memory region. */ 136 uint64_t addr; 137 /* Maximum region size in bytes. */ 138 uint64_t region_size; 139 140 /* The parent resource for all memory added via this device. */ 141 struct resource *parent_resource; 142 /* 143 * Copy of "System RAM (virtio_mem)" to be used for 144 * add_memory_driver_managed(). 145 */ 146 const char *resource_name; 147 /* Memory group identification. */ 148 int mgid; 149 150 /* 151 * We don't want to add too much memory if it's not getting onlined, 152 * to avoid running OOM. Besides this threshold, we allow to have at 153 * least two offline blocks at a time (whatever is bigger). 154 */ 155 #define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024) 156 atomic64_t offline_size; 157 uint64_t offline_threshold; 158 159 /* If set, the driver is in SBM, otherwise in BBM. */ 160 bool in_sbm; 161 162 union { 163 struct { 164 /* Id of the first memory block of this device. */ 165 unsigned long first_mb_id; 166 /* Id of the last usable memory block of this device. */ 167 unsigned long last_usable_mb_id; 168 /* Id of the next memory bock to prepare when needed. 
*/ 169 unsigned long next_mb_id; 170 171 /* The subblock size. */ 172 uint64_t sb_size; 173 /* The number of subblocks per Linux memory block. */ 174 uint32_t sbs_per_mb; 175 176 /* Summary of all memory block states. */ 177 unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; 178 179 /* 180 * One byte state per memory block. Allocated via 181 * vmalloc(). Resized (alloc+copy+free) on demand. 182 * 183 * With 128 MiB memory blocks, we have states for 512 184 * GiB of memory in one 4 KiB page. 185 */ 186 uint8_t *mb_states; 187 188 /* 189 * Bitmap: one bit per subblock. Allocated similar to 190 * sbm.mb_states. 191 * 192 * A set bit means the corresponding subblock is 193 * plugged, otherwise it's unblocked. 194 * 195 * With 4 MiB subblocks, we manage 128 GiB of memory 196 * in one 4 KiB page. 197 */ 198 unsigned long *sb_states; 199 } sbm; 200 201 struct { 202 /* Id of the first big block of this device. */ 203 unsigned long first_bb_id; 204 /* Id of the last usable big block of this device. */ 205 unsigned long last_usable_bb_id; 206 /* Id of the next device bock to prepare when needed. */ 207 unsigned long next_bb_id; 208 209 /* Summary of all big block states. */ 210 unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT]; 211 212 /* One byte state per big block. See sbm.mb_states. */ 213 uint8_t *bb_states; 214 215 /* The block size used for plugging/adding/removing. */ 216 uint64_t bb_size; 217 } bbm; 218 }; 219 220 /* 221 * Mutex that protects the sbm.mb_count, sbm.mb_states, 222 * sbm.sb_states, bbm.bb_count, and bbm.bb_states 223 * 224 * When this lock is held the pointers can't change, ONLINE and 225 * OFFLINE blocks can't change the state and no subblocks will get 226 * plugged/unplugged. 227 * 228 * In kdump mode, used to serialize requests, last_block_addr and 229 * last_block_plugged. 230 */ 231 struct mutex hotplug_mutex; 232 bool hotplug_active; 233 234 /* An error occurred we cannot handle - stop processing requests. 
*/ 235 bool broken; 236 237 /* Cached valued of is_kdump_kernel() when the device was probed. */ 238 bool in_kdump; 239 240 /* The driver is being removed. */ 241 spinlock_t removal_lock; 242 bool removing; 243 244 /* Timer for retrying to plug/unplug memory. */ 245 struct hrtimer retry_timer; 246 unsigned int retry_timer_ms; 247 #define VIRTIO_MEM_RETRY_TIMER_MIN_MS 50000 248 #define VIRTIO_MEM_RETRY_TIMER_MAX_MS 300000 249 250 /* Memory notifier (online/offline events). */ 251 struct notifier_block memory_notifier; 252 253 #ifdef CONFIG_PROC_VMCORE 254 /* vmcore callback for /proc/vmcore handling in kdump mode */ 255 struct vmcore_cb vmcore_cb; 256 uint64_t last_block_addr; 257 bool last_block_plugged; 258 #endif /* CONFIG_PROC_VMCORE */ 259 260 /* Next device in the list of virtio-mem devices. */ 261 struct list_head next; 262 }; 263 264 /* 265 * We have to share a single online_page callback among all virtio-mem 266 * devices. We use RCU to iterate the list in the callback. 267 */ 268 static DEFINE_MUTEX(virtio_mem_mutex); 269 static LIST_HEAD(virtio_mem_devices); 270 271 static void virtio_mem_online_page_cb(struct page *page, unsigned int order); 272 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 273 unsigned long nr_pages); 274 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 275 unsigned long nr_pages); 276 static void virtio_mem_retry(struct virtio_mem *vm); 277 static int virtio_mem_create_resource(struct virtio_mem *vm); 278 static void virtio_mem_delete_resource(struct virtio_mem *vm); 279 280 /* 281 * Register a virtio-mem device so it will be considered for the online_page 282 * callback. 283 */ 284 static int register_virtio_mem_device(struct virtio_mem *vm) 285 { 286 int rc = 0; 287 288 /* First device registers the callback. 
*/ 289 mutex_lock(&virtio_mem_mutex); 290 if (list_empty(&virtio_mem_devices)) 291 rc = set_online_page_callback(&virtio_mem_online_page_cb); 292 if (!rc) 293 list_add_rcu(&vm->next, &virtio_mem_devices); 294 mutex_unlock(&virtio_mem_mutex); 295 296 return rc; 297 } 298 299 /* 300 * Unregister a virtio-mem device so it will no longer be considered for the 301 * online_page callback. 302 */ 303 static void unregister_virtio_mem_device(struct virtio_mem *vm) 304 { 305 /* Last device unregisters the callback. */ 306 mutex_lock(&virtio_mem_mutex); 307 list_del_rcu(&vm->next); 308 if (list_empty(&virtio_mem_devices)) 309 restore_online_page_callback(&virtio_mem_online_page_cb); 310 mutex_unlock(&virtio_mem_mutex); 311 312 synchronize_rcu(); 313 } 314 315 /* 316 * Calculate the memory block id of a given address. 317 */ 318 static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr) 319 { 320 return addr / memory_block_size_bytes(); 321 } 322 323 /* 324 * Calculate the physical start address of a given memory block id. 325 */ 326 static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) 327 { 328 return mb_id * memory_block_size_bytes(); 329 } 330 331 /* 332 * Calculate the big block id of a given address. 333 */ 334 static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, 335 uint64_t addr) 336 { 337 return addr / vm->bbm.bb_size; 338 } 339 340 /* 341 * Calculate the physical start address of a given big block id. 342 */ 343 static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, 344 unsigned long bb_id) 345 { 346 return bb_id * vm->bbm.bb_size; 347 } 348 349 /* 350 * Calculate the subblock id of a given address. 
351 */ 352 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, 353 unsigned long addr) 354 { 355 const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); 356 const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); 357 358 return (addr - mb_addr) / vm->sbm.sb_size; 359 } 360 361 /* 362 * Set the state of a big block, taking care of the state counter. 363 */ 364 static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, 365 unsigned long bb_id, 366 enum virtio_mem_bbm_bb_state state) 367 { 368 const unsigned long idx = bb_id - vm->bbm.first_bb_id; 369 enum virtio_mem_bbm_bb_state old_state; 370 371 old_state = vm->bbm.bb_states[idx]; 372 vm->bbm.bb_states[idx] = state; 373 374 BUG_ON(vm->bbm.bb_count[old_state] == 0); 375 vm->bbm.bb_count[old_state]--; 376 vm->bbm.bb_count[state]++; 377 } 378 379 /* 380 * Get the state of a big block. 381 */ 382 static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, 383 unsigned long bb_id) 384 { 385 return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; 386 } 387 388 /* 389 * Prepare the big block state array for the next big block. 
390 */ 391 static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) 392 { 393 unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; 394 unsigned long new_bytes = old_bytes + 1; 395 int old_pages = PFN_UP(old_bytes); 396 int new_pages = PFN_UP(new_bytes); 397 uint8_t *new_array; 398 399 if (vm->bbm.bb_states && old_pages == new_pages) 400 return 0; 401 402 new_array = vzalloc(new_pages * PAGE_SIZE); 403 if (!new_array) 404 return -ENOMEM; 405 406 mutex_lock(&vm->hotplug_mutex); 407 if (vm->bbm.bb_states) 408 memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); 409 vfree(vm->bbm.bb_states); 410 vm->bbm.bb_states = new_array; 411 mutex_unlock(&vm->hotplug_mutex); 412 413 return 0; 414 } 415 416 #define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ 417 for (_bb_id = vm->bbm.first_bb_id; \ 418 _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ 419 _bb_id++) \ 420 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 421 422 #define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ 423 for (_bb_id = vm->bbm.next_bb_id - 1; \ 424 _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ 425 _bb_id--) \ 426 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 427 428 /* 429 * Set the state of a memory block, taking care of the state counter. 430 */ 431 static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, 432 unsigned long mb_id, uint8_t state) 433 { 434 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 435 uint8_t old_state; 436 437 old_state = vm->sbm.mb_states[idx]; 438 vm->sbm.mb_states[idx] = state; 439 440 BUG_ON(vm->sbm.mb_count[old_state] == 0); 441 vm->sbm.mb_count[old_state]--; 442 vm->sbm.mb_count[state]++; 443 } 444 445 /* 446 * Get the state of a memory block. 
447 */ 448 static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, 449 unsigned long mb_id) 450 { 451 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 452 453 return vm->sbm.mb_states[idx]; 454 } 455 456 /* 457 * Prepare the state array for the next memory block. 458 */ 459 static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) 460 { 461 int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); 462 int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); 463 uint8_t *new_array; 464 465 if (vm->sbm.mb_states && old_pages == new_pages) 466 return 0; 467 468 new_array = vzalloc(new_pages * PAGE_SIZE); 469 if (!new_array) 470 return -ENOMEM; 471 472 mutex_lock(&vm->hotplug_mutex); 473 if (vm->sbm.mb_states) 474 memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); 475 vfree(vm->sbm.mb_states); 476 vm->sbm.mb_states = new_array; 477 mutex_unlock(&vm->hotplug_mutex); 478 479 return 0; 480 } 481 482 #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ 483 for (_mb_id = _vm->sbm.first_mb_id; \ 484 _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ 485 _mb_id++) \ 486 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 487 488 #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ 489 for (_mb_id = _vm->sbm.next_mb_id - 1; \ 490 _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ 491 _mb_id--) \ 492 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 493 494 /* 495 * Calculate the bit number in the subblock bitmap for the given subblock 496 * inside the given memory block. 497 */ 498 static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, 499 unsigned long mb_id, int sb_id) 500 { 501 return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; 502 } 503 504 /* 505 * Mark all selected subblocks plugged. 506 * 507 * Will not modify the state of the memory block. 
508 */ 509 static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, 510 unsigned long mb_id, int sb_id, 511 int count) 512 { 513 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 514 515 __bitmap_set(vm->sbm.sb_states, bit, count); 516 } 517 518 /* 519 * Mark all selected subblocks unplugged. 520 * 521 * Will not modify the state of the memory block. 522 */ 523 static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, 524 unsigned long mb_id, int sb_id, 525 int count) 526 { 527 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 528 529 __bitmap_clear(vm->sbm.sb_states, bit, count); 530 } 531 532 /* 533 * Test if all selected subblocks are plugged. 534 */ 535 static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, 536 unsigned long mb_id, int sb_id, 537 int count) 538 { 539 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 540 541 if (count == 1) 542 return test_bit(bit, vm->sbm.sb_states); 543 544 /* TODO: Helper similar to bitmap_set() */ 545 return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= 546 bit + count; 547 } 548 549 /* 550 * Test if all selected subblocks are unplugged. 551 */ 552 static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, 553 unsigned long mb_id, int sb_id, 554 int count) 555 { 556 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 557 558 /* TODO: Helper similar to bitmap_set() */ 559 return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= 560 bit + count; 561 } 562 563 /* 564 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is 565 * none. 566 */ 567 static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, 568 unsigned long mb_id) 569 { 570 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); 571 572 return find_next_zero_bit(vm->sbm.sb_states, 573 bit + vm->sbm.sbs_per_mb, bit) - bit; 574 } 575 576 /* 577 * Prepare the subblock bitmap for the next memory block. 
578 */ 579 static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) 580 { 581 const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; 582 const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; 583 const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; 584 int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); 585 int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); 586 unsigned long *new_bitmap, *old_bitmap; 587 588 if (vm->sbm.sb_states && old_pages == new_pages) 589 return 0; 590 591 new_bitmap = vzalloc(new_pages * PAGE_SIZE); 592 if (!new_bitmap) 593 return -ENOMEM; 594 595 mutex_lock(&vm->hotplug_mutex); 596 if (vm->sbm.sb_states) 597 memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); 598 599 old_bitmap = vm->sbm.sb_states; 600 vm->sbm.sb_states = new_bitmap; 601 mutex_unlock(&vm->hotplug_mutex); 602 603 vfree(old_bitmap); 604 return 0; 605 } 606 607 /* 608 * Test if we could add memory without creating too much offline memory - 609 * to avoid running OOM if memory is getting onlined deferred. 610 */ 611 static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) 612 { 613 if (WARN_ON_ONCE(size > vm->offline_threshold)) 614 return false; 615 616 return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; 617 } 618 619 /* 620 * Try adding memory to Linux. Will usually only fail if out of memory. 621 * 622 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 623 * onlining code). 624 * 625 * Will not modify the state of memory blocks in virtio-mem. 626 */ 627 static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, 628 uint64_t size) 629 { 630 int rc; 631 632 /* 633 * When force-unloading the driver and we still have memory added to 634 * Linux, the resource name has to stay. 
635 */ 636 if (!vm->resource_name) { 637 vm->resource_name = kstrdup_const("System RAM (virtio_mem)", 638 GFP_KERNEL); 639 if (!vm->resource_name) 640 return -ENOMEM; 641 } 642 643 dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, 644 addr + size - 1); 645 /* Memory might get onlined immediately. */ 646 atomic64_add(size, &vm->offline_size); 647 rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name, 648 MHP_MERGE_RESOURCE | MHP_NID_IS_MGID); 649 if (rc) { 650 atomic64_sub(size, &vm->offline_size); 651 dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); 652 /* 653 * TODO: Linux MM does not properly clean up yet in all cases 654 * where adding of memory failed - especially on -ENOMEM. 655 */ 656 } 657 return rc; 658 } 659 660 /* 661 * See virtio_mem_add_memory(): Try adding a single Linux memory block. 662 */ 663 static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) 664 { 665 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 666 const uint64_t size = memory_block_size_bytes(); 667 668 return virtio_mem_add_memory(vm, addr, size); 669 } 670 671 /* 672 * See virtio_mem_add_memory(): Try adding a big block. 673 */ 674 static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) 675 { 676 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 677 const uint64_t size = vm->bbm.bb_size; 678 679 return virtio_mem_add_memory(vm, addr, size); 680 } 681 682 /* 683 * Try removing memory from Linux. Will only fail if memory blocks aren't 684 * offline. 685 * 686 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 687 * onlining code). 688 * 689 * Will not modify the state of memory blocks in virtio-mem. 
690 */ 691 static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, 692 uint64_t size) 693 { 694 int rc; 695 696 dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, 697 addr + size - 1); 698 rc = remove_memory(addr, size); 699 if (!rc) { 700 atomic64_sub(size, &vm->offline_size); 701 /* 702 * We might have freed up memory we can now unplug, retry 703 * immediately instead of waiting. 704 */ 705 virtio_mem_retry(vm); 706 } else { 707 dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); 708 } 709 return rc; 710 } 711 712 /* 713 * See virtio_mem_remove_memory(): Try removing a single Linux memory block. 714 */ 715 static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) 716 { 717 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 718 const uint64_t size = memory_block_size_bytes(); 719 720 return virtio_mem_remove_memory(vm, addr, size); 721 } 722 723 /* 724 * Try offlining and removing memory from Linux. 725 * 726 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 727 * onlining code). 728 * 729 * Will not modify the state of memory blocks in virtio-mem. 730 */ 731 static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, 732 uint64_t addr, 733 uint64_t size) 734 { 735 int rc; 736 737 dev_dbg(&vm->vdev->dev, 738 "offlining and removing memory: 0x%llx - 0x%llx\n", addr, 739 addr + size - 1); 740 741 rc = offline_and_remove_memory(addr, size); 742 if (!rc) { 743 atomic64_sub(size, &vm->offline_size); 744 /* 745 * We might have freed up memory we can now unplug, retry 746 * immediately instead of waiting. 747 */ 748 virtio_mem_retry(vm); 749 } else { 750 dev_dbg(&vm->vdev->dev, 751 "offlining and removing memory failed: %d\n", rc); 752 } 753 return rc; 754 } 755 756 /* 757 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing 758 * a single Linux memory block. 
759 */ 760 static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, 761 unsigned long mb_id) 762 { 763 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 764 const uint64_t size = memory_block_size_bytes(); 765 766 return virtio_mem_offline_and_remove_memory(vm, addr, size); 767 } 768 769 /* 770 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a 771 * all Linux memory blocks covered by the big block. 772 */ 773 static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, 774 unsigned long bb_id) 775 { 776 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 777 const uint64_t size = vm->bbm.bb_size; 778 779 return virtio_mem_offline_and_remove_memory(vm, addr, size); 780 } 781 782 /* 783 * Trigger the workqueue so the device can perform its magic. 784 */ 785 static void virtio_mem_retry(struct virtio_mem *vm) 786 { 787 unsigned long flags; 788 789 spin_lock_irqsave(&vm->removal_lock, flags); 790 if (!vm->removing) 791 queue_work(system_freezable_wq, &vm->wq); 792 spin_unlock_irqrestore(&vm->removal_lock, flags); 793 } 794 795 static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) 796 { 797 int node = NUMA_NO_NODE; 798 799 #if defined(CONFIG_ACPI_NUMA) 800 if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM)) 801 node = pxm_to_node(node_id); 802 #endif 803 return node; 804 } 805 806 /* 807 * Test if a virtio-mem device overlaps with the given range. Can be called 808 * from (notifier) callbacks lockless. 809 */ 810 static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, 811 uint64_t size) 812 { 813 return start < vm->addr + vm->region_size && vm->addr < start + size; 814 } 815 816 /* 817 * Test if a virtio-mem device contains a given range. Can be called from 818 * (notifier) callbacks lockless. 
819 */ 820 static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, 821 uint64_t size) 822 { 823 return start >= vm->addr && start + size <= vm->addr + vm->region_size; 824 } 825 826 static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, 827 unsigned long mb_id) 828 { 829 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 830 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 831 case VIRTIO_MEM_SBM_MB_OFFLINE: 832 return NOTIFY_OK; 833 default: 834 break; 835 } 836 dev_warn_ratelimited(&vm->vdev->dev, 837 "memory block onlining denied\n"); 838 return NOTIFY_BAD; 839 } 840 841 static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, 842 unsigned long mb_id) 843 { 844 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 845 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 846 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 847 virtio_mem_sbm_set_mb_state(vm, mb_id, 848 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 849 break; 850 case VIRTIO_MEM_SBM_MB_KERNEL: 851 case VIRTIO_MEM_SBM_MB_MOVABLE: 852 virtio_mem_sbm_set_mb_state(vm, mb_id, 853 VIRTIO_MEM_SBM_MB_OFFLINE); 854 break; 855 default: 856 BUG(); 857 break; 858 } 859 } 860 861 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, 862 unsigned long mb_id, 863 unsigned long start_pfn) 864 { 865 const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn)); 866 int new_state; 867 868 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 869 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 870 new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; 871 if (is_movable) 872 new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; 873 break; 874 case VIRTIO_MEM_SBM_MB_OFFLINE: 875 new_state = VIRTIO_MEM_SBM_MB_KERNEL; 876 if (is_movable) 877 new_state = VIRTIO_MEM_SBM_MB_MOVABLE; 878 break; 879 default: 880 BUG(); 881 break; 882 } 883 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 884 } 885 886 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, 887 unsigned long mb_id) 888 { 889 const unsigned long nr_pages = 
PFN_DOWN(vm->sbm.sb_size); 890 unsigned long pfn; 891 int sb_id; 892 893 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 894 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 895 continue; 896 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 897 sb_id * vm->sbm.sb_size); 898 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 899 } 900 } 901 902 static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, 903 unsigned long mb_id) 904 { 905 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 906 unsigned long pfn; 907 int sb_id; 908 909 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 910 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 911 continue; 912 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 913 sb_id * vm->sbm.sb_size); 914 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 915 } 916 } 917 918 static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, 919 unsigned long bb_id, 920 unsigned long pfn, 921 unsigned long nr_pages) 922 { 923 /* 924 * When marked as "fake-offline", all online memory of this device block 925 * is allocated by us. Otherwise, we don't have any memory allocated. 926 */ 927 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 928 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 929 return; 930 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 931 } 932 933 static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, 934 unsigned long bb_id, 935 unsigned long pfn, 936 unsigned long nr_pages) 937 { 938 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 939 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 940 return; 941 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 942 } 943 944 /* 945 * This callback will either be called synchronously from add_memory() or 946 * asynchronously (e.g., triggered via user space). We have to be careful 947 * with locking when calling add_memory(). 
948 */ 949 static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, 950 unsigned long action, void *arg) 951 { 952 struct virtio_mem *vm = container_of(nb, struct virtio_mem, 953 memory_notifier); 954 struct memory_notify *mhp = arg; 955 const unsigned long start = PFN_PHYS(mhp->start_pfn); 956 const unsigned long size = PFN_PHYS(mhp->nr_pages); 957 int rc = NOTIFY_OK; 958 unsigned long id; 959 960 if (!virtio_mem_overlaps_range(vm, start, size)) 961 return NOTIFY_DONE; 962 963 if (vm->in_sbm) { 964 id = virtio_mem_phys_to_mb_id(start); 965 /* 966 * In SBM, we add memory in separate memory blocks - we expect 967 * it to be onlined/offlined in the same granularity. Bail out 968 * if this ever changes. 969 */ 970 if (WARN_ON_ONCE(size != memory_block_size_bytes() || 971 !IS_ALIGNED(start, memory_block_size_bytes()))) 972 return NOTIFY_BAD; 973 } else { 974 id = virtio_mem_phys_to_bb_id(vm, start); 975 /* 976 * In BBM, we only care about onlining/offlining happening 977 * within a single big block, we don't care about the 978 * actual granularity as we don't track individual Linux 979 * memory blocks. 980 */ 981 if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) 982 return NOTIFY_BAD; 983 } 984 985 /* 986 * Avoid circular locking lockdep warnings. We lock the mutex 987 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The 988 * blocking_notifier_call_chain() has it's own lock, which gets unlocked 989 * between both notifier calls and will bail out. False positive. 
990 */ 991 lockdep_off(); 992 993 switch (action) { 994 case MEM_GOING_OFFLINE: 995 mutex_lock(&vm->hotplug_mutex); 996 if (vm->removing) { 997 rc = notifier_from_errno(-EBUSY); 998 mutex_unlock(&vm->hotplug_mutex); 999 break; 1000 } 1001 vm->hotplug_active = true; 1002 if (vm->in_sbm) 1003 virtio_mem_sbm_notify_going_offline(vm, id); 1004 else 1005 virtio_mem_bbm_notify_going_offline(vm, id, 1006 mhp->start_pfn, 1007 mhp->nr_pages); 1008 break; 1009 case MEM_GOING_ONLINE: 1010 mutex_lock(&vm->hotplug_mutex); 1011 if (vm->removing) { 1012 rc = notifier_from_errno(-EBUSY); 1013 mutex_unlock(&vm->hotplug_mutex); 1014 break; 1015 } 1016 vm->hotplug_active = true; 1017 if (vm->in_sbm) 1018 rc = virtio_mem_sbm_notify_going_online(vm, id); 1019 break; 1020 case MEM_OFFLINE: 1021 if (vm->in_sbm) 1022 virtio_mem_sbm_notify_offline(vm, id); 1023 1024 atomic64_add(size, &vm->offline_size); 1025 /* 1026 * Trigger the workqueue. Now that we have some offline memory, 1027 * maybe we can handle pending unplug requests. 1028 */ 1029 if (!unplug_online) 1030 virtio_mem_retry(vm); 1031 1032 vm->hotplug_active = false; 1033 mutex_unlock(&vm->hotplug_mutex); 1034 break; 1035 case MEM_ONLINE: 1036 if (vm->in_sbm) 1037 virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); 1038 1039 atomic64_sub(size, &vm->offline_size); 1040 /* 1041 * Start adding more memory once we onlined half of our 1042 * threshold. Don't trigger if it's possibly due to our actipn 1043 * (e.g., us adding memory which gets onlined immediately from 1044 * the core). 
1045 */ 1046 if (!atomic_read(&vm->wq_active) && 1047 virtio_mem_could_add_memory(vm, vm->offline_threshold / 2)) 1048 virtio_mem_retry(vm); 1049 1050 vm->hotplug_active = false; 1051 mutex_unlock(&vm->hotplug_mutex); 1052 break; 1053 case MEM_CANCEL_OFFLINE: 1054 if (!vm->hotplug_active) 1055 break; 1056 if (vm->in_sbm) 1057 virtio_mem_sbm_notify_cancel_offline(vm, id); 1058 else 1059 virtio_mem_bbm_notify_cancel_offline(vm, id, 1060 mhp->start_pfn, 1061 mhp->nr_pages); 1062 vm->hotplug_active = false; 1063 mutex_unlock(&vm->hotplug_mutex); 1064 break; 1065 case MEM_CANCEL_ONLINE: 1066 if (!vm->hotplug_active) 1067 break; 1068 vm->hotplug_active = false; 1069 mutex_unlock(&vm->hotplug_mutex); 1070 break; 1071 default: 1072 break; 1073 } 1074 1075 lockdep_on(); 1076 1077 return rc; 1078 } 1079 1080 /* 1081 * Set a range of pages PG_offline. Remember pages that were never onlined 1082 * (via generic_online_page()) using PageDirty(). 1083 */ 1084 static void virtio_mem_set_fake_offline(unsigned long pfn, 1085 unsigned long nr_pages, bool onlined) 1086 { 1087 page_offline_begin(); 1088 for (; nr_pages--; pfn++) { 1089 struct page *page = pfn_to_page(pfn); 1090 1091 __SetPageOffline(page); 1092 if (!onlined) { 1093 SetPageDirty(page); 1094 /* FIXME: remove after cleanups */ 1095 ClearPageReserved(page); 1096 } 1097 } 1098 page_offline_end(); 1099 } 1100 1101 /* 1102 * Clear PG_offline from a range of pages. If the pages were never onlined, 1103 * (via generic_online_page()), clear PageDirty(). 1104 */ 1105 static void virtio_mem_clear_fake_offline(unsigned long pfn, 1106 unsigned long nr_pages, bool onlined) 1107 { 1108 for (; nr_pages--; pfn++) { 1109 struct page *page = pfn_to_page(pfn); 1110 1111 __ClearPageOffline(page); 1112 if (!onlined) 1113 ClearPageDirty(page); 1114 } 1115 } 1116 1117 /* 1118 * Release a range of fake-offline pages to the buddy, effectively 1119 * fake-onlining them. 
1120 */ 1121 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) 1122 { 1123 unsigned long order = MAX_ORDER - 1; 1124 unsigned long i; 1125 1126 /* 1127 * We might get called for ranges that don't cover properly aligned 1128 * MAX_ORDER - 1 pages; however, we can only online properly aligned 1129 * pages with an order of MAX_ORDER - 1 at maximum. 1130 */ 1131 while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) 1132 order--; 1133 1134 for (i = 0; i < nr_pages; i += 1 << order) { 1135 struct page *page = pfn_to_page(pfn + i); 1136 1137 /* 1138 * If the page is PageDirty(), it was kept fake-offline when 1139 * onlining the memory block. Otherwise, it was allocated 1140 * using alloc_contig_range(). All pages in a subblock are 1141 * alike. 1142 */ 1143 if (PageDirty(page)) { 1144 virtio_mem_clear_fake_offline(pfn + i, 1 << order, false); 1145 generic_online_page(page, order); 1146 } else { 1147 virtio_mem_clear_fake_offline(pfn + i, 1 << order, true); 1148 free_contig_range(pfn + i, 1 << order); 1149 adjust_managed_page_count(page, 1 << order); 1150 } 1151 } 1152 } 1153 1154 /* 1155 * Try to allocate a range, marking pages fake-offline, effectively 1156 * fake-offlining them. 1157 */ 1158 static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) 1159 { 1160 const bool is_movable = is_zone_movable_page(pfn_to_page(pfn)); 1161 int rc, retry_count; 1162 1163 /* 1164 * TODO: We want an alloc_contig_range() mode that tries to allocate 1165 * harder (e.g., dealing with temporarily pinned pages, PCP), especially 1166 * with ZONE_MOVABLE. So for now, retry a couple of times with 1167 * ZONE_MOVABLE before giving up - because that zone is supposed to give 1168 * some guarantees. 
1169 */ 1170 for (retry_count = 0; retry_count < 5; retry_count++) { 1171 rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, 1172 GFP_KERNEL); 1173 if (rc == -ENOMEM) 1174 /* whoops, out of memory */ 1175 return rc; 1176 else if (rc && !is_movable) 1177 break; 1178 else if (rc) 1179 continue; 1180 1181 virtio_mem_set_fake_offline(pfn, nr_pages, true); 1182 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1183 return 0; 1184 } 1185 1186 return -EBUSY; 1187 } 1188 1189 /* 1190 * Handle fake-offline pages when memory is going offline - such that the 1191 * pages can be skipped by mm-core when offlining. 1192 */ 1193 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 1194 unsigned long nr_pages) 1195 { 1196 struct page *page; 1197 unsigned long i; 1198 1199 /* 1200 * Drop our reference to the pages so the memory can get offlined 1201 * and add the unplugged pages to the managed page counters (so 1202 * offlining code can correctly subtract them again). 1203 */ 1204 adjust_managed_page_count(pfn_to_page(pfn), nr_pages); 1205 /* Drop our reference to the pages so the memory can get offlined. */ 1206 for (i = 0; i < nr_pages; i++) { 1207 page = pfn_to_page(pfn + i); 1208 if (WARN_ON(!page_ref_dec_and_test(page))) 1209 dump_page(page, "fake-offline page referenced"); 1210 } 1211 } 1212 1213 /* 1214 * Handle fake-offline pages when memory offlining is canceled - to undo 1215 * what we did in virtio_mem_fake_offline_going_offline(). 1216 */ 1217 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 1218 unsigned long nr_pages) 1219 { 1220 unsigned long i; 1221 1222 /* 1223 * Get the reference we dropped when going offline and subtract the 1224 * unplugged pages from the managed page counters. 
1225 */ 1226 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1227 for (i = 0; i < nr_pages; i++) 1228 page_ref_inc(pfn_to_page(pfn + i)); 1229 } 1230 1231 static void virtio_mem_online_page(struct virtio_mem *vm, 1232 struct page *page, unsigned int order) 1233 { 1234 const unsigned long start = page_to_phys(page); 1235 const unsigned long end = start + PFN_PHYS(1 << order); 1236 unsigned long addr, next, id, sb_id, count; 1237 bool do_online; 1238 1239 /* 1240 * We can get called with any order up to MAX_ORDER - 1. If our 1241 * subblock size is smaller than that and we have a mixture of plugged 1242 * and unplugged subblocks within such a page, we have to process in 1243 * smaller granularity. In that case we'll adjust the order exactly once 1244 * within the loop. 1245 */ 1246 for (addr = start; addr < end; ) { 1247 next = addr + PFN_PHYS(1 << order); 1248 1249 if (vm->in_sbm) { 1250 id = virtio_mem_phys_to_mb_id(addr); 1251 sb_id = virtio_mem_phys_to_sb_id(vm, addr); 1252 count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1; 1253 1254 if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) { 1255 /* Fully plugged. */ 1256 do_online = true; 1257 } else if (count == 1 || 1258 virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) { 1259 /* Fully unplugged. */ 1260 do_online = false; 1261 } else { 1262 /* 1263 * Mixture, process sub-blocks instead. This 1264 * will be at least the size of a pageblock. 1265 * We'll run into this case exactly once. 1266 */ 1267 order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT; 1268 do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1); 1269 continue; 1270 } 1271 } else { 1272 /* 1273 * If the whole block is marked fake offline, keep 1274 * everything that way. 
1275 */ 1276 id = virtio_mem_phys_to_bb_id(vm, addr); 1277 do_online = virtio_mem_bbm_get_bb_state(vm, id) != 1278 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; 1279 } 1280 1281 if (do_online) 1282 generic_online_page(pfn_to_page(PFN_DOWN(addr)), order); 1283 else 1284 virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, 1285 false); 1286 addr = next; 1287 } 1288 } 1289 1290 static void virtio_mem_online_page_cb(struct page *page, unsigned int order) 1291 { 1292 const unsigned long addr = page_to_phys(page); 1293 struct virtio_mem *vm; 1294 1295 rcu_read_lock(); 1296 list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { 1297 /* 1298 * Pages we're onlining will never cross memory blocks and, 1299 * therefore, not virtio-mem devices. 1300 */ 1301 if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) 1302 continue; 1303 1304 /* 1305 * virtio_mem_set_fake_offline() might sleep. We can safely 1306 * drop the RCU lock at this point because the device 1307 * cannot go away. See virtio_mem_remove() how races 1308 * between memory onlining and device removal are handled. 1309 */ 1310 rcu_read_unlock(); 1311 1312 virtio_mem_online_page(vm, page, order); 1313 return; 1314 } 1315 rcu_read_unlock(); 1316 1317 /* not virtio-mem memory, but e.g., a DIMM. 
online it */ 1318 generic_online_page(page, order); 1319 } 1320 1321 static uint64_t virtio_mem_send_request(struct virtio_mem *vm, 1322 const struct virtio_mem_req *req) 1323 { 1324 struct scatterlist *sgs[2], sg_req, sg_resp; 1325 unsigned int len; 1326 int rc; 1327 1328 /* don't use the request residing on the stack (vaddr) */ 1329 vm->req = *req; 1330 1331 /* out: buffer for request */ 1332 sg_init_one(&sg_req, &vm->req, sizeof(vm->req)); 1333 sgs[0] = &sg_req; 1334 1335 /* in: buffer for response */ 1336 sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp)); 1337 sgs[1] = &sg_resp; 1338 1339 rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL); 1340 if (rc < 0) 1341 return rc; 1342 1343 virtqueue_kick(vm->vq); 1344 1345 /* wait for a response */ 1346 wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len)); 1347 1348 return virtio16_to_cpu(vm->vdev, vm->resp.type); 1349 } 1350 1351 static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, 1352 uint64_t size) 1353 { 1354 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1355 const struct virtio_mem_req req = { 1356 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG), 1357 .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), 1358 .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1359 }; 1360 int rc = -ENOMEM; 1361 1362 if (atomic_read(&vm->config_changed)) 1363 return -EAGAIN; 1364 1365 dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, 1366 addr + size - 1); 1367 1368 switch (virtio_mem_send_request(vm, &req)) { 1369 case VIRTIO_MEM_RESP_ACK: 1370 vm->plugged_size += size; 1371 return 0; 1372 case VIRTIO_MEM_RESP_NACK: 1373 rc = -EAGAIN; 1374 break; 1375 case VIRTIO_MEM_RESP_BUSY: 1376 rc = -ETXTBSY; 1377 break; 1378 case VIRTIO_MEM_RESP_ERROR: 1379 rc = -EINVAL; 1380 break; 1381 default: 1382 break; 1383 } 1384 1385 dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); 1386 return rc; 1387 } 1388 1389 static int 
virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, 1390 uint64_t size) 1391 { 1392 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1393 const struct virtio_mem_req req = { 1394 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG), 1395 .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), 1396 .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1397 }; 1398 int rc = -ENOMEM; 1399 1400 if (atomic_read(&vm->config_changed)) 1401 return -EAGAIN; 1402 1403 dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, 1404 addr + size - 1); 1405 1406 switch (virtio_mem_send_request(vm, &req)) { 1407 case VIRTIO_MEM_RESP_ACK: 1408 vm->plugged_size -= size; 1409 return 0; 1410 case VIRTIO_MEM_RESP_BUSY: 1411 rc = -ETXTBSY; 1412 break; 1413 case VIRTIO_MEM_RESP_ERROR: 1414 rc = -EINVAL; 1415 break; 1416 default: 1417 break; 1418 } 1419 1420 dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); 1421 return rc; 1422 } 1423 1424 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) 1425 { 1426 const struct virtio_mem_req req = { 1427 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), 1428 }; 1429 int rc = -ENOMEM; 1430 1431 dev_dbg(&vm->vdev->dev, "unplugging all memory"); 1432 1433 switch (virtio_mem_send_request(vm, &req)) { 1434 case VIRTIO_MEM_RESP_ACK: 1435 vm->unplug_all_required = false; 1436 vm->plugged_size = 0; 1437 /* usable region might have shrunk */ 1438 atomic_set(&vm->config_changed, 1); 1439 return 0; 1440 case VIRTIO_MEM_RESP_BUSY: 1441 rc = -ETXTBSY; 1442 break; 1443 default: 1444 break; 1445 } 1446 1447 dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); 1448 return rc; 1449 } 1450 1451 /* 1452 * Plug selected subblocks. Updates the plugged state, but not the state 1453 * of the memory block. 
1454 */ 1455 static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, 1456 int sb_id, int count) 1457 { 1458 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1459 sb_id * vm->sbm.sb_size; 1460 const uint64_t size = count * vm->sbm.sb_size; 1461 int rc; 1462 1463 rc = virtio_mem_send_plug_request(vm, addr, size); 1464 if (!rc) 1465 virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); 1466 return rc; 1467 } 1468 1469 /* 1470 * Unplug selected subblocks. Updates the plugged state, but not the state 1471 * of the memory block. 1472 */ 1473 static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, 1474 int sb_id, int count) 1475 { 1476 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1477 sb_id * vm->sbm.sb_size; 1478 const uint64_t size = count * vm->sbm.sb_size; 1479 int rc; 1480 1481 rc = virtio_mem_send_unplug_request(vm, addr, size); 1482 if (!rc) 1483 virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); 1484 return rc; 1485 } 1486 1487 /* 1488 * Request to unplug a big block. 1489 * 1490 * Will not modify the state of the big block. 1491 */ 1492 static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) 1493 { 1494 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1495 const uint64_t size = vm->bbm.bb_size; 1496 1497 return virtio_mem_send_unplug_request(vm, addr, size); 1498 } 1499 1500 /* 1501 * Request to plug a big block. 1502 * 1503 * Will not modify the state of the big block. 1504 */ 1505 static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) 1506 { 1507 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1508 const uint64_t size = vm->bbm.bb_size; 1509 1510 return virtio_mem_send_plug_request(vm, addr, size); 1511 } 1512 1513 /* 1514 * Unplug the desired number of plugged subblocks of a offline or not-added 1515 * memory block. Will fail if any subblock cannot get unplugged (instead of 1516 * skipping it). 
1517 * 1518 * Will not modify the state of the memory block. 1519 * 1520 * Note: can fail after some subblocks were unplugged. 1521 */ 1522 static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, 1523 unsigned long mb_id, uint64_t *nb_sb) 1524 { 1525 int sb_id, count; 1526 int rc; 1527 1528 sb_id = vm->sbm.sbs_per_mb - 1; 1529 while (*nb_sb) { 1530 /* Find the next candidate subblock */ 1531 while (sb_id >= 0 && 1532 virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) 1533 sb_id--; 1534 if (sb_id < 0) 1535 break; 1536 /* Try to unplug multiple subblocks at a time */ 1537 count = 1; 1538 while (count < *nb_sb && sb_id > 0 && 1539 virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { 1540 count++; 1541 sb_id--; 1542 } 1543 1544 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1545 if (rc) 1546 return rc; 1547 *nb_sb -= count; 1548 sb_id--; 1549 } 1550 1551 return 0; 1552 } 1553 1554 /* 1555 * Unplug all plugged subblocks of an offline or not-added memory block. 1556 * 1557 * Will not modify the state of the memory block. 1558 * 1559 * Note: can fail after some subblocks were unplugged. 1560 */ 1561 static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) 1562 { 1563 uint64_t nb_sb = vm->sbm.sbs_per_mb; 1564 1565 return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); 1566 } 1567 1568 /* 1569 * Prepare tracking data for the next memory block. 1570 */ 1571 static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, 1572 unsigned long *mb_id) 1573 { 1574 int rc; 1575 1576 if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) 1577 return -ENOSPC; 1578 1579 /* Resize the state array if required. */ 1580 rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); 1581 if (rc) 1582 return rc; 1583 1584 /* Resize the subblock bitmap if required. 
*/ 1585 rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); 1586 if (rc) 1587 return rc; 1588 1589 vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; 1590 *mb_id = vm->sbm.next_mb_id++; 1591 return 0; 1592 } 1593 1594 /* 1595 * Try to plug the desired number of subblocks and add the memory block 1596 * to Linux. 1597 * 1598 * Will modify the state of the memory block. 1599 */ 1600 static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, 1601 unsigned long mb_id, uint64_t *nb_sb) 1602 { 1603 const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); 1604 int rc; 1605 1606 if (WARN_ON_ONCE(!count)) 1607 return -EINVAL; 1608 1609 /* 1610 * Plug the requested number of subblocks before adding it to linux, 1611 * so that onlining will directly online all plugged subblocks. 1612 */ 1613 rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); 1614 if (rc) 1615 return rc; 1616 1617 /* 1618 * Mark the block properly offline before adding it to Linux, 1619 * so the memory notifiers will find the block in the right state. 1620 */ 1621 if (count == vm->sbm.sbs_per_mb) 1622 virtio_mem_sbm_set_mb_state(vm, mb_id, 1623 VIRTIO_MEM_SBM_MB_OFFLINE); 1624 else 1625 virtio_mem_sbm_set_mb_state(vm, mb_id, 1626 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1627 1628 /* Add the memory block to linux - if that fails, try to unplug. */ 1629 rc = virtio_mem_sbm_add_mb(vm, mb_id); 1630 if (rc) { 1631 int new_state = VIRTIO_MEM_SBM_MB_UNUSED; 1632 1633 if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) 1634 new_state = VIRTIO_MEM_SBM_MB_PLUGGED; 1635 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 1636 return rc; 1637 } 1638 1639 *nb_sb -= count; 1640 return 0; 1641 } 1642 1643 /* 1644 * Try to plug the desired number of subblocks of a memory block that 1645 * is already added to Linux. 1646 * 1647 * Will modify the state of the memory block. 1648 * 1649 * Note: Can fail after some subblocks were successfully plugged. 
1650 */ 1651 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, 1652 unsigned long mb_id, uint64_t *nb_sb) 1653 { 1654 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1655 unsigned long pfn, nr_pages; 1656 int sb_id, count; 1657 int rc; 1658 1659 if (WARN_ON_ONCE(!*nb_sb)) 1660 return -EINVAL; 1661 1662 while (*nb_sb) { 1663 sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); 1664 if (sb_id >= vm->sbm.sbs_per_mb) 1665 break; 1666 count = 1; 1667 while (count < *nb_sb && 1668 sb_id + count < vm->sbm.sbs_per_mb && 1669 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) 1670 count++; 1671 1672 rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); 1673 if (rc) 1674 return rc; 1675 *nb_sb -= count; 1676 if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 1677 continue; 1678 1679 /* fake-online the pages if the memory block is online */ 1680 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1681 sb_id * vm->sbm.sb_size); 1682 nr_pages = PFN_DOWN(count * vm->sbm.sb_size); 1683 virtio_mem_fake_online(pfn, nr_pages); 1684 } 1685 1686 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1687 virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); 1688 1689 return 0; 1690 } 1691 1692 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1693 { 1694 const int mb_states[] = { 1695 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 1696 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 1697 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 1698 }; 1699 uint64_t nb_sb = diff / vm->sbm.sb_size; 1700 unsigned long mb_id; 1701 int rc, i; 1702 1703 if (!nb_sb) 1704 return 0; 1705 1706 /* Don't race with onlining/offlining */ 1707 mutex_lock(&vm->hotplug_mutex); 1708 1709 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 1710 virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { 1711 rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); 1712 if (rc || !nb_sb) 1713 goto out_unlock; 1714 cond_resched(); 1715 } 1716 } 1717 1718 /* 1719 * We won't be working on 
online/offline memory blocks from this point, 1720 * so we can't race with memory onlining/offlining. Drop the mutex. 1721 */ 1722 mutex_unlock(&vm->hotplug_mutex); 1723 1724 /* Try to plug and add unused blocks */ 1725 virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { 1726 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1727 return -ENOSPC; 1728 1729 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1730 if (rc || !nb_sb) 1731 return rc; 1732 cond_resched(); 1733 } 1734 1735 /* Try to prepare, plug and add new blocks */ 1736 while (nb_sb) { 1737 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1738 return -ENOSPC; 1739 1740 rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); 1741 if (rc) 1742 return rc; 1743 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1744 if (rc) 1745 return rc; 1746 cond_resched(); 1747 } 1748 1749 return 0; 1750 out_unlock: 1751 mutex_unlock(&vm->hotplug_mutex); 1752 return rc; 1753 } 1754 1755 /* 1756 * Plug a big block and add it to Linux. 1757 * 1758 * Will modify the state of the big block. 1759 */ 1760 static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, 1761 unsigned long bb_id) 1762 { 1763 int rc; 1764 1765 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 1766 VIRTIO_MEM_BBM_BB_UNUSED)) 1767 return -EINVAL; 1768 1769 rc = virtio_mem_bbm_plug_bb(vm, bb_id); 1770 if (rc) 1771 return rc; 1772 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 1773 1774 rc = virtio_mem_bbm_add_bb(vm, bb_id); 1775 if (rc) { 1776 if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) 1777 virtio_mem_bbm_set_bb_state(vm, bb_id, 1778 VIRTIO_MEM_BBM_BB_UNUSED); 1779 else 1780 /* Retry from the main loop. */ 1781 virtio_mem_bbm_set_bb_state(vm, bb_id, 1782 VIRTIO_MEM_BBM_BB_PLUGGED); 1783 return rc; 1784 } 1785 return 0; 1786 } 1787 1788 /* 1789 * Prepare tracking data for the next big block. 
1790 */ 1791 static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, 1792 unsigned long *bb_id) 1793 { 1794 int rc; 1795 1796 if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) 1797 return -ENOSPC; 1798 1799 /* Resize the big block state array if required. */ 1800 rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); 1801 if (rc) 1802 return rc; 1803 1804 vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; 1805 *bb_id = vm->bbm.next_bb_id; 1806 vm->bbm.next_bb_id++; 1807 return 0; 1808 } 1809 1810 static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1811 { 1812 uint64_t nb_bb = diff / vm->bbm.bb_size; 1813 unsigned long bb_id; 1814 int rc; 1815 1816 if (!nb_bb) 1817 return 0; 1818 1819 /* Try to plug and add unused big blocks */ 1820 virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { 1821 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1822 return -ENOSPC; 1823 1824 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1825 if (!rc) 1826 nb_bb--; 1827 if (rc || !nb_bb) 1828 return rc; 1829 cond_resched(); 1830 } 1831 1832 /* Try to prepare, plug and add new big blocks */ 1833 while (nb_bb) { 1834 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1835 return -ENOSPC; 1836 1837 rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); 1838 if (rc) 1839 return rc; 1840 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1841 if (!rc) 1842 nb_bb--; 1843 if (rc) 1844 return rc; 1845 cond_resched(); 1846 } 1847 1848 return 0; 1849 } 1850 1851 /* 1852 * Try to plug the requested amount of memory. 1853 */ 1854 static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) 1855 { 1856 if (vm->in_sbm) 1857 return virtio_mem_sbm_plug_request(vm, diff); 1858 return virtio_mem_bbm_plug_request(vm, diff); 1859 } 1860 1861 /* 1862 * Unplug the desired number of plugged subblocks of an offline memory block. 1863 * Will fail if any subblock cannot get unplugged (instead of skipping it). 
1864 * 1865 * Will modify the state of the memory block. Might temporarily drop the 1866 * hotplug_mutex. 1867 * 1868 * Note: Can fail after some subblocks were successfully unplugged. 1869 */ 1870 static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, 1871 unsigned long mb_id, 1872 uint64_t *nb_sb) 1873 { 1874 int rc; 1875 1876 rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); 1877 1878 /* some subblocks might have been unplugged even on failure */ 1879 if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1880 virtio_mem_sbm_set_mb_state(vm, mb_id, 1881 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1882 if (rc) 1883 return rc; 1884 1885 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1886 /* 1887 * Remove the block from Linux - this should never fail. 1888 * Hinder the block from getting onlined by marking it 1889 * unplugged. Temporarily drop the mutex, so 1890 * any pending GOING_ONLINE requests can be serviced/rejected. 1891 */ 1892 virtio_mem_sbm_set_mb_state(vm, mb_id, 1893 VIRTIO_MEM_SBM_MB_UNUSED); 1894 1895 mutex_unlock(&vm->hotplug_mutex); 1896 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 1897 BUG_ON(rc); 1898 mutex_lock(&vm->hotplug_mutex); 1899 } 1900 return 0; 1901 } 1902 1903 /* 1904 * Unplug the given plugged subblocks of an online memory block. 1905 * 1906 * Will modify the state of the memory block. 
1907 */ 1908 static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, 1909 unsigned long mb_id, int sb_id, 1910 int count) 1911 { 1912 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; 1913 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1914 unsigned long start_pfn; 1915 int rc; 1916 1917 start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1918 sb_id * vm->sbm.sb_size); 1919 1920 rc = virtio_mem_fake_offline(start_pfn, nr_pages); 1921 if (rc) 1922 return rc; 1923 1924 /* Try to unplug the allocated memory */ 1925 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1926 if (rc) { 1927 /* Return the memory to the buddy. */ 1928 virtio_mem_fake_online(start_pfn, nr_pages); 1929 return rc; 1930 } 1931 1932 switch (old_state) { 1933 case VIRTIO_MEM_SBM_MB_KERNEL: 1934 virtio_mem_sbm_set_mb_state(vm, mb_id, 1935 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); 1936 break; 1937 case VIRTIO_MEM_SBM_MB_MOVABLE: 1938 virtio_mem_sbm_set_mb_state(vm, mb_id, 1939 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); 1940 break; 1941 } 1942 1943 return 0; 1944 } 1945 1946 /* 1947 * Unplug the desired number of plugged subblocks of an online memory block. 1948 * Will skip subblock that are busy. 1949 * 1950 * Will modify the state of the memory block. Might temporarily drop the 1951 * hotplug_mutex. 1952 * 1953 * Note: Can fail after some subblocks were successfully unplugged. Can 1954 * return 0 even if subblocks were busy and could not get unplugged. 1955 */ 1956 static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, 1957 unsigned long mb_id, 1958 uint64_t *nb_sb) 1959 { 1960 int rc, sb_id; 1961 1962 /* If possible, try to unplug the complete block in one shot. 
*/ 1963 if (*nb_sb >= vm->sbm.sbs_per_mb && 1964 virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1965 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, 1966 vm->sbm.sbs_per_mb); 1967 if (!rc) { 1968 *nb_sb -= vm->sbm.sbs_per_mb; 1969 goto unplugged; 1970 } else if (rc != -EBUSY) 1971 return rc; 1972 } 1973 1974 /* Fallback to single subblocks. */ 1975 for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { 1976 /* Find the next candidate subblock */ 1977 while (sb_id >= 0 && 1978 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 1979 sb_id--; 1980 if (sb_id < 0) 1981 break; 1982 1983 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); 1984 if (rc == -EBUSY) 1985 continue; 1986 else if (rc) 1987 return rc; 1988 *nb_sb -= 1; 1989 } 1990 1991 unplugged: 1992 /* 1993 * Once all subblocks of a memory block were unplugged, offline and 1994 * remove it. This will usually not fail, as no memory is in use 1995 * anymore - however some other notifiers might NACK the request. 1996 */ 1997 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1998 mutex_unlock(&vm->hotplug_mutex); 1999 rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); 2000 mutex_lock(&vm->hotplug_mutex); 2001 if (!rc) 2002 virtio_mem_sbm_set_mb_state(vm, mb_id, 2003 VIRTIO_MEM_SBM_MB_UNUSED); 2004 } 2005 2006 return 0; 2007 } 2008 2009 /* 2010 * Unplug the desired number of plugged subblocks of a memory block that is 2011 * already added to Linux. Will skip subblock of online memory blocks that are 2012 * busy (by the OS). Will fail if any subblock that's not busy cannot get 2013 * unplugged. 2014 * 2015 * Will modify the state of the memory block. Might temporarily drop the 2016 * hotplug_mutex. 2017 * 2018 * Note: Can fail after some subblocks were successfully unplugged. Can 2019 * return 0 even if subblocks were busy and could not get unplugged. 
*/
static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
					unsigned long mb_id,
					uint64_t *nb_sb)
{
	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);

	/* Online and offline memory blocks are handled by separate helpers. */
	switch (old_state) {
	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
	case VIRTIO_MEM_SBM_MB_KERNEL:
	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_MOVABLE:
		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_OFFLINE:
		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
	}
	/* Any other state (e.g., UNUSED or PLUGGED) is rejected. */
	return -EINVAL;
}

/*
 * Try to unplug the requested amount of memory (diff, in bytes) in SBM. The
 * request is processed in subblock granularity; a diff smaller than a single
 * subblock results in 0 without doing anything.
 *
 * Returns 0 if all requested subblocks were unplugged, -EBUSY if some
 * subblocks are left after all memory blocks were processed, or the error
 * returned by virtio_mem_sbm_unplug_any_sb(). If unplug_online is disabled,
 * only offline memory blocks are processed and 0 is returned afterwards.
 */
static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
	/* Memory block states, in the order in which they are processed. */
	const int mb_states[] = {
		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
		VIRTIO_MEM_SBM_MB_OFFLINE,
		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
		VIRTIO_MEM_SBM_MB_MOVABLE,
		VIRTIO_MEM_SBM_MB_KERNEL,
	};
	uint64_t nb_sb = diff / vm->sbm.sb_size;
	unsigned long mb_id;
	int rc, i;

	if (!nb_sb)
		return 0;

	/*
	 * We'll drop the mutex a couple of times when it is safe to do so.
	 * This might result in some blocks switching the state (online/offline)
	 * and we could miss them in this run - we will retry again later.
	 */
	mutex_lock(&vm->hotplug_mutex);

	/*
	 * We try to unplug from partially plugged blocks first, to try
	 * removing whole memory blocks along with metadata. We prioritize
	 * ZONE_MOVABLE as it's more reliable to unplug memory and remove whole
	 * memory blocks, and we don't want to trigger zone imbalances by
	 * accidentally removing too much kernel memory.
	 */
	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
			/* Stop on the first error or once nothing is left. */
			if (rc || !nb_sb)
				goto out_unlock;
			/* Give others a chance to run between memory blocks. */
			mutex_unlock(&vm->hotplug_mutex);
			cond_resched();
			mutex_lock(&vm->hotplug_mutex);
		}
		/*
		 * mb_states[0] and mb_states[1] are the offline states: stop
		 * here if we are not supposed to unplug online memory.
		 */
		if (!unplug_online && i == 1) {
			mutex_unlock(&vm->hotplug_mutex);
			return 0;
		}
	}

	mutex_unlock(&vm->hotplug_mutex);
	return nb_sb ? -EBUSY : 0;
out_unlock:
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Try to offline and remove a big block from Linux and unplug it. Will fail
 * with -EBUSY if some memory is busy and cannot get unplugged.
 *
 * Will modify the state of the memory block. Might temporarily drop the
 * hotplug_mutex.
 *
 * Returns -EINVAL (with a one-time warning) if the big block is not in state
 * VIRTIO_MEM_BBM_BB_ADDED.
 */
static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
						       unsigned long bb_id)
{
	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
	unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;
	struct page *page;
	int rc;

	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
			 VIRTIO_MEM_BBM_BB_ADDED))
		return -EINVAL;

	if (bbm_safe_unplug) {
		/*
		 * Start by fake-offlining all memory. Once we marked the device
		 * block as fake-offline, all newly onlined memory will
		 * automatically be kept fake-offline. Protect from concurrent
		 * onlining/offlining until we have a consistent state.
		 */
		mutex_lock(&vm->hotplug_mutex);
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);

		for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
			/* Sections that are not online are skipped. */
			page = pfn_to_online_page(pfn);
			if (!page)
				continue;

			rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
			if (rc) {
				/*
				 * Limit the rollback to the sections we
				 * processed before this one.
				 */
				end_pfn = pfn;
				goto rollback_safe_unplug;
			}
		}
		mutex_unlock(&vm->hotplug_mutex);
	}

	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
	if (rc) {
		if (bbm_safe_unplug) {
			/* The rollback path expects the mutex to be held. */
			mutex_lock(&vm->hotplug_mutex);
			goto rollback_safe_unplug;
		}
		return rc;
	}

	/*
	 * The big block was removed from Linux. If unplugging fails, it is
	 * marked as PLUGGED; otherwise it becomes UNUSED.
	 */
	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
	if (rc)
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_PLUGGED);
	else
		virtio_mem_bbm_set_bb_state(vm, bb_id,
					    VIRTIO_MEM_BBM_BB_UNUSED);
	return rc;

rollback_safe_unplug:
	/* Fake-online all online sections in [start_pfn, end_pfn) again. */
	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		page = pfn_to_online_page(pfn);
		if (!page)
			continue;
		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
	}
	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
	mutex_unlock(&vm->hotplug_mutex);
	return rc;
}

/*
 * Test if a big block is completely offline.
 */
static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
	unsigned long pfn;

	/* Check the first pfn of each section: a single online one suffices. */
	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
	     pfn += PAGES_PER_SECTION) {
		if (pfn_to_online_page(pfn))
			return false;
	}

	return true;
}

/*
 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
2190 */ 2191 static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, 2192 unsigned long bb_id) 2193 { 2194 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2195 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2196 struct page *page; 2197 unsigned long pfn; 2198 2199 for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2200 pfn += PAGES_PER_SECTION) { 2201 page = pfn_to_online_page(pfn); 2202 if (!page) 2203 continue; 2204 if (page_zonenum(page) != ZONE_MOVABLE) 2205 return false; 2206 } 2207 2208 return true; 2209 } 2210 2211 static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 2212 { 2213 uint64_t nb_bb = diff / vm->bbm.bb_size; 2214 uint64_t bb_id; 2215 int rc, i; 2216 2217 if (!nb_bb) 2218 return 0; 2219 2220 /* 2221 * Try to unplug big blocks. Similar to SBM, start with offline 2222 * big blocks. 2223 */ 2224 for (i = 0; i < 3; i++) { 2225 virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 2226 cond_resched(); 2227 2228 /* 2229 * As we're holding no locks, these checks are racy, 2230 * but we don't care. 2231 */ 2232 if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) 2233 continue; 2234 if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) 2235 continue; 2236 rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); 2237 if (rc == -EBUSY) 2238 continue; 2239 if (!rc) 2240 nb_bb--; 2241 if (rc || !nb_bb) 2242 return rc; 2243 } 2244 if (i == 0 && !unplug_online) 2245 return 0; 2246 } 2247 2248 return nb_bb ? -EBUSY : 0; 2249 } 2250 2251 /* 2252 * Try to unplug the requested amount of memory. 2253 */ 2254 static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) 2255 { 2256 if (vm->in_sbm) 2257 return virtio_mem_sbm_unplug_request(vm, diff); 2258 return virtio_mem_bbm_unplug_request(vm, diff); 2259 } 2260 2261 /* 2262 * Try to unplug all blocks that couldn't be unplugged before, for example, 2263 * because the hypervisor was busy. 
*/
static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
{
	unsigned long id;
	int rc;

	if (!vm->in_sbm) {
		/* BBM: retry all big blocks that are still in PLUGGED state. */
		virtio_mem_bbm_for_each_bb(vm, id,
					   VIRTIO_MEM_BBM_BB_PLUGGED) {
			rc = virtio_mem_bbm_unplug_bb(vm, id);
			if (rc)
				return rc;
			virtio_mem_bbm_set_bb_state(vm, id,
						    VIRTIO_MEM_BBM_BB_UNUSED);
		}
		return 0;
	}

	/* SBM: retry all memory blocks that are still in PLUGGED state. */
	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
		rc = virtio_mem_sbm_unplug_mb(vm, id);
		if (rc)
			return rc;
		virtio_mem_sbm_set_mb_state(vm, id,
					    VIRTIO_MEM_SBM_MB_UNUSED);
	}

	return 0;
}

/*
 * Update all parts of the config that could have changed.
 */
static void virtio_mem_refresh_config(struct virtio_mem *vm)
{
	const struct range pluggable_range = mhp_get_pluggable_range(true);
	uint64_t new_plugged_size, usable_region_size, end_addr;

	/* the plugged_size is just a reflection of what _we_ did previously */
	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
			&new_plugged_size);
	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
		vm->plugged_size = new_plugged_size;

	/* calculate the last usable memory block id */
	virtio_cread_le(vm->vdev, struct virtio_mem_config,
			usable_region_size, &usable_region_size);
	end_addr = min(vm->addr + usable_region_size - 1,
		       pluggable_range.end);

	/* A block that is only partially covered by the range is not usable. */
	if (vm->in_sbm) {
		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
			vm->sbm.last_usable_mb_id--;
	} else {
		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
								     end_addr);
		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
			vm->bbm.last_usable_bb_id--;
	}
	/*
	 * If we cannot plug any of our device memory (e.g., nothing in the
	 * usable region is addressable), the last usable memory block id will
	 * be smaller than the first usable memory block id. We'll stop
	 * attempting to add memory with -ENOSPC from our main loop.
	 */

	/* see if there is a request to change the size */
	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
			&vm->requested_size);

	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
}

/*
 * Workqueue function for handling plug/unplug requests and config updates.
 */
static void virtio_mem_run_wq(struct work_struct *work)
{
	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
	uint64_t diff;
	int rc;

	if (unlikely(vm->in_kdump)) {
		dev_warn_once(&vm->vdev->dev,
			      "unexpected workqueue run in kdump kernel\n");
		return;
	}

	/* We are running now, a pending retry is no longer needed. */
	hrtimer_cancel(&vm->retry_timer);

	if (vm->broken)
		return;

	atomic_set(&vm->wq_active, 1);
retry:
	rc = 0;

	/* Make sure we start with a clean state if there are leftovers. */
	if (unlikely(vm->unplug_all_required))
		rc = virtio_mem_send_unplug_all_request(vm);

	if (atomic_read(&vm->config_changed)) {
		atomic_set(&vm->config_changed, 0);
		virtio_mem_refresh_config(vm);
	}

	/* Unplug any leftovers from previous runs */
	if (!rc)
		rc = virtio_mem_unplug_pending_mb(vm);

	/* Move the plugged size towards the requested size. */
	if (!rc && vm->requested_size != vm->plugged_size) {
		if (vm->requested_size > vm->plugged_size) {
			diff = vm->requested_size - vm->plugged_size;
			rc = virtio_mem_plug_request(vm, diff);
		} else {
			diff = vm->plugged_size - vm->requested_size;
			rc = virtio_mem_unplug_request(vm, diff);
		}
	}

	switch (rc) {
	case 0:
		/* Success: reset the retry interval to its minimum. */
		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
		break;
	case -ENOSPC:
		/*
		 * We cannot add any more memory (alignment, physical limit)
		 * or we have too many offline memory blocks.
		 */
		break;
	case -ETXTBSY:
		/*
		 * The hypervisor cannot process our request right now
		 * (e.g., out of memory, migrating);
		 */
	case -EBUSY:
		/*
		 * We cannot free up any memory to unplug it (all plugged memory
		 * is busy).
		 */
	case -ENOMEM:
		/* Out of memory, try again later. */
		/* The two cases above share this handling: arm the retry timer. */
		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
			      HRTIMER_MODE_REL);
		break;
	case -EAGAIN:
		/* Retry immediately (e.g., the config changed). */
		goto retry;
	default:
		/* Unknown error, mark as broken */
		dev_err(&vm->vdev->dev,
			"unknown error, marking device broken: %d\n", rc);
		vm->broken = true;
	}

	atomic_set(&vm->wq_active, 0);
}

/*
 * Retry timer callback: trigger a retry and double the retry interval for the
 * next time, limited to VIRTIO_MEM_RETRY_TIMER_MAX_MS. The timer itself is
 * not restarted from here.
 */
static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
{
	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
					     retry_timer);

	virtio_mem_retry(vm);
	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
	return HRTIMER_NORESTART;
}

/*
 * Virtqueue callback: wake up whoever is waiting on host_resp.
 */
static void virtio_mem_handle_response(struct virtqueue *vq)
{
	struct virtio_mem *vm = vq->vdev->priv;

	wake_up(&vm->host_resp);
}

/*
 * Look up the single "guest-request" virtqueue and store it in vm->vq.
 *
 * Returns 0 on success, or the error encoded in the returned pointer.
 */
static int virtio_mem_init_vq(struct virtio_mem *vm)
{
	struct virtqueue *vq;

	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
				   "guest-request");
	if (IS_ERR(vq))
		return PTR_ERR(vq);
	vm->vq = vq;

	return 0;
}

/*
 * Initialize the device for memory hot(un)plug: select the mode (SBM or BBM)
 * and its block sizes, create the parent resource, register the dynamic
 * memory group, and register the memory notifier and the device.
 *
 * Returns 0 on success. On failure, everything set up here so far is undone
 * again and the error is returned.
 */
static int virtio_mem_init_hotplug(struct virtio_mem *vm)
{
	const struct range pluggable_range = mhp_get_pluggable_range(true);
	uint64_t unit_pages, sb_size, addr;
	int rc;

	/* bad device setup - warn only */
	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
		dev_warn(&vm->vdev->dev,
			 "The alignment of the physical start address can make some memory unusable.\n");
	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
		dev_warn(&vm->vdev->dev,
			 "The alignment of the physical end address can make some memory unusable.\n");
	if (vm->addr < pluggable_range.start ||
	    vm->addr + vm->region_size - 1 > pluggable_range.end)
		dev_warn(&vm->vdev->dev,
			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");

	/* Prepare the offline threshold - make sure we can add two blocks. */
	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);

	/*
	 * alloc_contig_range() works reliably with pageblock
	 * granularity on ZONE_NORMAL, use pageblock_nr_pages.
	 */
	sb_size = PAGE_SIZE * pageblock_nr_pages;
	/* A subblock cannot be smaller than a device block. */
	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);

	if (sb_size < memory_block_size_bytes() && !force_bbm) {
		/* SBM: At least two subblocks per Linux memory block. */
		vm->in_sbm = true;
		vm->sbm.sb_size = sb_size;
		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
				     vm->sbm.sb_size;

		/* Round up to the next full memory block */
		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
		       memory_block_size_bytes() - 1;
		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
	} else {
		/* BBM: At least one Linux memory block. */
		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
					memory_block_size_bytes());

		/*
		 * Only use the bbm_block_size module parameter if it is a
		 * power of 2 and not smaller than the size selected above;
		 * otherwise warn and keep the selected size.
		 */
		if (bbm_block_size) {
			if (!is_power_of_2(bbm_block_size)) {
				dev_warn(&vm->vdev->dev,
					 "bbm_block_size is not a power of 2");
			} else if (bbm_block_size < vm->bbm.bb_size) {
				dev_warn(&vm->vdev->dev,
					 "bbm_block_size is too small");
			} else {
				vm->bbm.bb_size = bbm_block_size;
			}
		}

		/* Round up to the next aligned big block */
		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
		       vm->bbm.bb_size - 1;
		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
		vm->bbm.next_bb_id = vm->bbm.first_bb_id;

		/* Make sure we can add two big blocks. */
		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
					      vm->offline_threshold);
	}

	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
		 memory_block_size_bytes());
	if (vm->in_sbm)
		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
			 (unsigned long long)vm->sbm.sb_size);
	else
		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
			 (unsigned long long)vm->bbm.bb_size);

	/* create the parent resource for all memory */
	rc = virtio_mem_create_resource(vm);
	if (rc)
		return rc;

	/* use a single dynamic memory group to cover the whole memory device */
	if (vm->in_sbm)
		unit_pages = PHYS_PFN(memory_block_size_bytes());
	else
		unit_pages = PHYS_PFN(vm->bbm.bb_size);
	/* A negative value is an error, anything else is the group id. */
	rc = memory_group_register_dynamic(vm->nid, unit_pages);
	if (rc < 0)
		goto out_del_resource;
	vm->mgid = rc;

	/*
	 * If we still have memory plugged, we have to unplug all memory first.
	 * Registering our parent resource makes sure that this memory isn't
	 * actually in use (e.g., trying to reload the driver).
	 */
	if (vm->plugged_size) {
		vm->unplug_all_required = true;
		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
	}

	/* register callbacks */
	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
	rc = register_memory_notifier(&vm->memory_notifier);
	if (rc)
		goto out_unreg_group;
	rc = register_virtio_mem_device(vm);
	if (rc)
		goto out_unreg_mem;

	return 0;
	/* Error unwinding, in reverse order of the setup above. */
out_unreg_mem:
	unregister_memory_notifier(&vm->memory_notifier);
out_unreg_group:
	memory_group_unregister(vm->mgid);
out_del_resource:
	virtio_mem_delete_resource(vm);
	return rc;
}

#ifdef CONFIG_PROC_VMCORE
/*
 * Send a STATE request for the given range to the device.
 *
 * Returns the state reported by the device if the request was acknowledged,
 * -EINVAL if the device responded with an error, and -ENOMEM for any other
 * response.
 */
static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
					 uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
		.u.state.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};
	int rc = -ENOMEM;

	dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state);
	case VIRTIO_MEM_RESP_ERROR:
		rc = -EINVAL;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc);
	return rc;
}

/*
 * vmcore callback: test if the given pfn is RAM.
 *
 * Pages that are not part of the device range are always reported as RAM.
 * For pages within the device range, the device is asked for the state of
 * the containing device block; the result for the block queried last is
 * remembered to avoid sending a request for each page of the same block.
 */
static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
					 unsigned long pfn)
{
	struct virtio_mem *vm = container_of(cb, struct virtio_mem,
					     vmcore_cb);
	uint64_t addr = PFN_PHYS(pfn);
	bool is_ram;
	int rc;

	if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))
		return true;
	/* Nothing is plugged: nothing within the device range is RAM. */
	if (!vm->plugged_size)
		return false;

	/*
	 * We have to serialize device requests and access to the information
	 * about the block queried last.
	 */
	mutex_lock(&vm->hotplug_mutex);

	addr = ALIGN_DOWN(addr, vm->device_block_size);
	if (addr != vm->last_block_addr) {
		rc = virtio_mem_send_state_request(vm, addr,
						   vm->device_block_size);
		/* On any kind of error, we're going to signal !ram. */
		if (rc == VIRTIO_MEM_STATE_PLUGGED)
			vm->last_block_plugged = true;
		else
			vm->last_block_plugged = false;
		vm->last_block_addr = addr;
	}

	is_ram = vm->last_block_plugged;
	mutex_unlock(&vm->hotplug_mutex);
	return is_ram;
}
#endif /* CONFIG_PROC_VMCORE */

/*
 * Initialize the device when running in a kdump kernel: with
 * CONFIG_PROC_VMCORE, only the vmcore callback is registered; without it,
 * initialization fails with -EBUSY.
 */
static int virtio_mem_init_kdump(struct virtio_mem *vm)
{
#ifdef CONFIG_PROC_VMCORE
	dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n");
	vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
	register_vmcore_cb(&vm->vmcore_cb);
	return 0;
#else /* CONFIG_PROC_VMCORE */
	dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n");
	return -EBUSY;
#endif /* CONFIG_PROC_VMCORE */
}

/*
 * Read the static device properties from the config space and continue with
 * the kdump or the hotplug specific initialization.
 *
 * Returns -EINVAL if the config space cannot be accessed, otherwise the
 * result of the kdump/hotplug specific initialization.
 */
static int virtio_mem_init(struct virtio_mem *vm)
{
	uint16_t node_id;

	if (!vm->vdev->config->get) {
		dev_err(&vm->vdev->dev, "config access disabled\n");
		return -EINVAL;
	}

	/* Fetch all properties that can't change. */
	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
			&vm->plugged_size);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
			&vm->device_block_size);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
			&node_id);
	vm->nid = virtio_mem_translate_node_id(vm, node_id);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
			&vm->region_size);

	/* Determine the nid for the device based on the lowest address. */
	if (vm->nid == NUMA_NO_NODE)
		vm->nid = memory_add_physaddr_to_nid(vm->addr);

	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
		 (unsigned long long)vm->device_block_size);
	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);

	/*
	 * We don't want to (un)plug or reuse any memory when in kdump. The
	 * memory is still accessible (but not exposed to Linux).
	 */
	if (vm->in_kdump)
		return virtio_mem_init_kdump(vm);
	return virtio_mem_init_hotplug(vm);
}

/*
 * Create the parent resource that spans the whole device memory region.
 *
 * Returns 0 on success, -ENOMEM if the resource name cannot be allocated and
 * -EBUSY if the region cannot be reserved.
 */
static int virtio_mem_create_resource(struct virtio_mem *vm)
{
	/*
	 * When force-unloading the driver and removing the device, we
	 * could have a garbage pointer. Duplicate the string.
	 */
	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);

	if (!name)
		return -ENOMEM;

	/* Disallow mapping device memory via /dev/mem completely. */
	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
						   name, IORESOURCE_SYSTEM_RAM |
						   IORESOURCE_EXCLUSIVE);
	if (!vm->parent_resource) {
		kfree(name);
		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
		dev_info(&vm->vdev->dev,
			 "reloading the driver is not supported\n");
		return -EBUSY;
	}

	/* The memory is not actually busy - make add_memory() work. */
	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
	return 0;
}

/*
 * Release and free the parent resource along with its duplicated name. Does
 * nothing if there is no parent resource.
 */
static void virtio_mem_delete_resource(struct virtio_mem *vm)
{
	const char *name;

	if (!vm->parent_resource)
		return;

	/* Remember the name, it has to be freed after the resource itself. */
	name = vm->parent_resource->name;
	release_resource(vm->parent_resource);
	kfree(vm->parent_resource);
	kfree(name);
	vm->parent_resource = NULL;
}

/*
 * Callback for walk_iomem_res_desc(): unconditionally returns 1 for any
 * resource it is called for.
 */
static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
{
	return 1;
}

/*
 * Test if there is any busy system RAM resource within the device memory
 * region, i.e., if the resource walk invoked our callback.
 */
static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
{
	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;

	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
				   vm->addr + vm->region_size, NULL,
				   virtio_mem_range_has_system_ram) == 1;
}

/*
 * Probe a virtio-mem device: allocate and initialize the driver state, set up
 * the virtqueue, initialize the device from its config, and - unless running
 * in kdump - queue the first workqueue run.
 *
 * Returns 0 on success. On failure, the virtqueue and the driver state are
 * cleaned up again and the error is returned.
 */
static int virtio_mem_probe(struct virtio_device *vdev)
{
	struct virtio_mem *vm;
	int rc;

	/* Compile-time checks of the request/response structure sizes. */
	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);

	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
	if (!vm)
		return -ENOMEM;

	init_waitqueue_head(&vm->host_resp);
	vm->vdev = vdev;
	INIT_WORK(&vm->wq, virtio_mem_run_wq);
	mutex_init(&vm->hotplug_mutex);
	INIT_LIST_HEAD(&vm->next);
	spin_lock_init(&vm->removal_lock);
	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	vm->retry_timer.function = virtio_mem_timer_expired;
	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
	vm->in_kdump = is_kdump_kernel();

	/* register the virtqueue */
	rc = virtio_mem_init_vq(vm);
	if (rc)
		goto out_free_vm;

	/* initialize the device by querying the config */
	rc = virtio_mem_init(vm);
	if (rc)
		goto out_del_vq;

	virtio_device_ready(vdev);

	/* trigger a config update to start processing the requested_size */
	if (!vm->in_kdump) {
		atomic_set(&vm->config_changed, 1);
		queue_work(system_freezable_wq, &vm->wq);
	}

	return 0;
out_del_vq:
	vdev->config->del_vqs(vdev);
out_free_vm:
	kfree(vm);
	vdev->priv = NULL;

	return rc;
}

/*
 * Tear down everything virtio_mem_init_hotplug() and the workqueue have set
 * up: stop the workqueue and the retry timer, unregister the callbacks,
 * release the parent resource and the memory group if no system memory is
 * left added, and free the tracking data.
 */
static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
{
	unsigned long mb_id;
	int rc;

	/*
	 * Make sure the workqueue won't be triggered anymore and no memory
	 * blocks can be onlined/offlined until we're finished here.
	 */
	mutex_lock(&vm->hotplug_mutex);
	spin_lock_irq(&vm->removal_lock);
	vm->removing = true;
	spin_unlock_irq(&vm->removal_lock);
	mutex_unlock(&vm->hotplug_mutex);

	/* wait until the workqueue stopped */
	cancel_work_sync(&vm->wq);
	hrtimer_cancel(&vm->retry_timer);

	/*
	 * NOTE(review): the callbacks are only unregistered further below, so
	 * the two comments in this branch describe the situation after that
	 * point - confirm the intended ordering.
	 */
	if (vm->in_sbm) {
		/*
		 * After we unregistered our callbacks, user space can online
		 * partially plugged offline blocks. Make sure to remove them.
		 */
		virtio_mem_sbm_for_each_mb(vm, mb_id,
					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
			/* Failing to remove such a memory block is a BUG. */
			BUG_ON(rc);
			virtio_mem_sbm_set_mb_state(vm, mb_id,
						    VIRTIO_MEM_SBM_MB_UNUSED);
		}
		/*
		 * After we unregistered our callbacks, user space can no longer
		 * offline partially plugged online memory blocks. No need to
		 * worry about them.
		 */
	}

	/* unregister callbacks */
	unregister_virtio_mem_device(vm);
	unregister_memory_notifier(&vm->memory_notifier);

	/*
	 * There is no way we could reliably remove all memory we have added to
	 * the system. And there is no way to stop the driver/device from going
	 * away. Warn at least.
	 */
	if (virtio_mem_has_memory_added(vm)) {
		dev_warn(&vm->vdev->dev,
			 "device still has system memory added\n");
	} else {
		virtio_mem_delete_resource(vm);
		kfree_const(vm->resource_name);
		memory_group_unregister(vm->mgid);
	}

	/* remove all tracking data - no locking needed */
	if (vm->in_sbm) {
		vfree(vm->sbm.mb_states);
		vfree(vm->sbm.sb_states);
	} else {
		vfree(vm->bbm.bb_states);
	}
}

/*
 * Undo virtio_mem_init_kdump(): unregister the vmcore callback, if any.
 */
static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
{
#ifdef CONFIG_PROC_VMCORE
	unregister_vmcore_cb(&vm->vmcore_cb);
#endif /* CONFIG_PROC_VMCORE */
}

/*
 * Remove a virtio-mem device: run the kdump or hotplug specific cleanup,
 * reset the device, delete the virtqueues and free the driver state.
 */
static void virtio_mem_remove(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;

	if (vm->in_kdump)
		virtio_mem_deinit_kdump(vm);
	else
		virtio_mem_deinit_hotplug(vm);

	/* reset the device and cleanup the queues */
	virtio_reset_device(vdev);
	vdev->config->del_vqs(vdev);

	kfree(vm);
	vdev->priv = NULL;
}

/*
 * Config change callback: flag the config as changed and trigger a retry.
 * Config changes are ignored when running in kdump.
 */
static void virtio_mem_config_changed(struct virtio_device *vdev)
{
	struct virtio_mem *vm = vdev->priv;

	if (unlikely(vm->in_kdump))
		return;

	atomic_set(&vm->config_changed, 1);
	virtio_mem_retry(vm);
}

#ifdef CONFIG_PM_SLEEP
/*
 * Freeze callback: always fails with -EPERM, see below.
 */
static int virtio_mem_freeze(struct virtio_device *vdev)
{
	/*
	 * When restarting the VM, all memory is usually unplugged. Don't
	 * allow to suspend/hibernate.
	 */
	dev_err(&vdev->dev, "save/restore not supported.\n");
	return -EPERM;
}

/*
 * Restore callback: always fails with -EPERM, just like freezing does.
 */
static int virtio_mem_restore(struct virtio_device *vdev)
{
	return -EPERM;
}
#endif

/* Feature bits listed in the driver's feature table. */
static unsigned int virtio_mem_features[] = {
#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
	VIRTIO_MEM_F_ACPI_PXM,
#endif
	VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
};

/* Match VIRTIO_ID_MEM devices of any vendor. */
static const struct virtio_device_id virtio_mem_id_table[] = {
	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

static struct virtio_driver virtio_mem_driver = {
	.feature_table = virtio_mem_features,
	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
	.driver.name = KBUILD_MODNAME,
	.driver.owner = THIS_MODULE,
	.id_table = virtio_mem_id_table,
	.probe = virtio_mem_probe,
	.remove = virtio_mem_remove,
	.config_changed = virtio_mem_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze = virtio_mem_freeze,
	.restore = virtio_mem_restore,
#endif
};

module_virtio_driver(virtio_mem_driver);
MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
MODULE_DESCRIPTION("Virtio-mem driver");
MODULE_LICENSE("GPL");