1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Virtio-mem device driver. 4 * 5 * Copyright Red Hat, Inc. 2020 6 * 7 * Author(s): David Hildenbrand <david@redhat.com> 8 */ 9 10 #include <linux/virtio.h> 11 #include <linux/virtio_mem.h> 12 #include <linux/workqueue.h> 13 #include <linux/slab.h> 14 #include <linux/module.h> 15 #include <linux/mm.h> 16 #include <linux/memory_hotplug.h> 17 #include <linux/memory.h> 18 #include <linux/hrtimer.h> 19 #include <linux/crash_dump.h> 20 #include <linux/mutex.h> 21 #include <linux/bitmap.h> 22 #include <linux/lockdep.h> 23 #include <linux/log2.h> 24 #include <linux/vmalloc.h> 25 26 #include <acpi/acpi_numa.h> 27 28 static bool unplug_online = true; 29 module_param(unplug_online, bool, 0644); 30 MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); 31 32 static bool force_bbm; 33 module_param(force_bbm, bool, 0444); 34 MODULE_PARM_DESC(force_bbm, 35 "Force Big Block Mode. Default is 0 (auto-selection)"); 36 37 static unsigned long bbm_block_size; 38 module_param(bbm_block_size, ulong, 0444); 39 MODULE_PARM_DESC(bbm_block_size, 40 "Big Block size in bytes. Default is 0 (auto-detection)."); 41 42 /* 43 * virtio-mem currently supports the following modes of operation: 44 * 45 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The 46 * size of a Sub Block (SB) is determined based on the device block size, the 47 * pageblock size, and the maximum allocation granularity of the buddy. 48 * Subblocks within a Linux memory block might either be plugged or unplugged. 49 * Memory is added to / removed from Linux MM in Linux memory block granularity. 50 * 51 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. 52 * Memory is added to / removed from Linux MM in Big Block granularity. 53 * 54 * The mode is determined automatically based on the Linux memory block size 55 * and the device block size. 56 * 57 * User space / core MM (auto onlining) is responsible for onlining added 58 * Linux memory blocks - and for selecting a zone. Linux memory blocks are 59 * always onlined separately, and all memory within a Linux memory block is 60 * onlined to the same zone - virtio-mem relies on this behavior. 61 */ 62 63 /* 64 * State of a Linux memory block in SBM. 65 */ 66 enum virtio_mem_sbm_mb_state { 67 /* Unplugged, not added to Linux. Can be reused later. */ 68 VIRTIO_MEM_SBM_MB_UNUSED = 0, 69 /* (Partially) plugged, not added to Linux. Error on add_memory(). */ 70 VIRTIO_MEM_SBM_MB_PLUGGED, 71 /* Fully plugged, fully added to Linux, offline. */ 72 VIRTIO_MEM_SBM_MB_OFFLINE, 73 /* Partially plugged, fully added to Linux, offline. */ 74 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 75 /* Fully plugged, fully added to Linux, onlined to a kernel zone. */ 76 VIRTIO_MEM_SBM_MB_KERNEL, 77 /* Partially plugged, fully added to Linux, onlined to a kernel zone. */ 78 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 79 /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 80 VIRTIO_MEM_SBM_MB_MOVABLE, 81 /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ 82 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 83 VIRTIO_MEM_SBM_MB_COUNT 84 }; 85 86 /* 87 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks. 88 */ 89 enum virtio_mem_bbm_bb_state { 90 /* Unplugged, not added to Linux. Can be reused later. */ 91 VIRTIO_MEM_BBM_BB_UNUSED = 0, 92 /* Plugged, not added to Linux. Error on add_memory(). */ 93 VIRTIO_MEM_BBM_BB_PLUGGED, 94 /* Plugged and added to Linux.
*/ 95 VIRTIO_MEM_BBM_BB_ADDED, 96 /* All online parts are fake-offline, ready to remove. */ 97 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE, 98 VIRTIO_MEM_BBM_BB_COUNT 99 }; 100 101 struct virtio_mem { 102 struct virtio_device *vdev; 103 104 /* We might first have to unplug all memory when starting up. */ 105 bool unplug_all_required; 106 107 /* Workqueue that processes the plug/unplug requests. */ 108 struct work_struct wq; 109 atomic_t wq_active; 110 atomic_t config_changed; 111 112 /* Virtqueue for guest->host requests. */ 113 struct virtqueue *vq; 114 115 /* Wait for a host response to a guest request. */ 116 wait_queue_head_t host_resp; 117 118 /* Space for one guest request and the host response. */ 119 struct virtio_mem_req req; 120 struct virtio_mem_resp resp; 121 122 /* The current size of the device. */ 123 uint64_t plugged_size; 124 /* The requested size of the device. */ 125 uint64_t requested_size; 126 127 /* The device block size (for communicating with the device). */ 128 uint64_t device_block_size; 129 /* The determined node id for all memory of the device. */ 130 int nid; 131 /* Physical start address of the memory region. */ 132 uint64_t addr; 133 /* Maximum region size in bytes. */ 134 uint64_t region_size; 135 136 /* The parent resource for all memory added via this device. */ 137 struct resource *parent_resource; 138 /* 139 * Copy of "System RAM (virtio_mem)" to be used for 140 * add_memory_driver_managed(). 141 */ 142 const char *resource_name; 143 /* Memory group identification. */ 144 int mgid; 145 146 /* 147 * We don't want to add too much memory if it's not getting onlined, 148 * to avoid running OOM. Besides this threshold, we always allow at 149 * least two offline blocks at a time (whichever is bigger). 150 */ 151 #define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024) 152 atomic64_t offline_size; 153 uint64_t offline_threshold; 154 155 /* If set, the driver is in SBM, otherwise in BBM. */ 156 bool in_sbm; 157 158 union { 159 struct { 160 /* Id of the first memory block of this device. */ 161 unsigned long first_mb_id; 162 /* Id of the last usable memory block of this device. */ 163 unsigned long last_usable_mb_id; 164 /* Id of the next memory block to prepare when needed. */ 165 unsigned long next_mb_id; 166 167 /* The subblock size. */ 168 uint64_t sb_size; 169 /* The number of subblocks per Linux memory block. */ 170 uint32_t sbs_per_mb; 171 172 /* 173 * Some of the Linux memory blocks tracked as "partially 174 * plugged" are completely unplugged and can be offlined 175 * and removed -- which previously failed. 176 */ 177 bool have_unplugged_mb; 178 179 /* Summary of all memory block states. */ 180 unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; 181 182 /* 183 * One byte state per memory block. Allocated via 184 * vmalloc(). Resized (alloc+copy+free) on demand. 185 * 186 * With 128 MiB memory blocks, we have states for 512 187 * GiB of memory in one 4 KiB page. 188 */ 189 uint8_t *mb_states; 190 191 /* 192 * Bitmap: one bit per subblock. Allocated similar to 193 * sbm.mb_states. 194 * 195 * A set bit means the corresponding subblock is 196 * plugged, otherwise it's unplugged. 197 * 198 * With 4 MiB subblocks, we manage 128 GiB of memory 199 * in one 4 KiB page. 200 */ 201 unsigned long *sb_states; 202 } sbm; 203 204 struct { 205 /* Id of the first big block of this device. */ 206 unsigned long first_bb_id; 207 /* Id of the last usable big block of this device. */ 208 unsigned long last_usable_bb_id; 209 /* Id of the next big block to prepare when needed.
*/ 210 unsigned long next_bb_id; 211 212 /* Summary of all big block states. */ 213 unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT]; 214 215 /* One byte state per big block. See sbm.mb_states. */ 216 uint8_t *bb_states; 217 218 /* The block size used for plugging/adding/removing. */ 219 uint64_t bb_size; 220 } bbm; 221 }; 222 223 /* 224 * Mutex that protects the sbm.mb_count, sbm.mb_states, 225 * sbm.sb_states, bbm.bb_count, and bbm.bb_states. 226 * 227 * When this lock is held, the pointers can't change, ONLINE and 228 * OFFLINE blocks can't change their state, and no subblocks will get 229 * plugged/unplugged. 230 * 231 * In kdump mode, used to serialize requests, last_block_addr and 232 * last_block_plugged. 233 */ 234 struct mutex hotplug_mutex; 235 bool hotplug_active; 236 237 /* An error occurred we cannot handle - stop processing requests. */ 238 bool broken; 239 240 /* Cached value of is_kdump_kernel() when the device was probed. */ 241 bool in_kdump; 242 243 /* The driver is being removed. */ 244 spinlock_t removal_lock; 245 bool removing; 246 247 /* Timer for retrying to plug/unplug memory. */ 248 struct hrtimer retry_timer; 249 unsigned int retry_timer_ms; 250 #define VIRTIO_MEM_RETRY_TIMER_MIN_MS 50000 251 #define VIRTIO_MEM_RETRY_TIMER_MAX_MS 300000 252 253 /* Memory notifier (online/offline events). */ 254 struct notifier_block memory_notifier; 255 256 #ifdef CONFIG_PROC_VMCORE 257 /* vmcore callback for /proc/vmcore handling in kdump mode */ 258 struct vmcore_cb vmcore_cb; 259 uint64_t last_block_addr; 260 bool last_block_plugged; 261 #endif /* CONFIG_PROC_VMCORE */ 262 263 /* Next device in the list of virtio-mem devices. */ 264 struct list_head next; 265 }; 266 267 /* 268 * We have to share a single online_page callback among all virtio-mem 269 * devices. We use RCU to iterate the list in the callback. 270 */ 271 static DEFINE_MUTEX(virtio_mem_mutex); 272 static LIST_HEAD(virtio_mem_devices); 273 274 static void virtio_mem_online_page_cb(struct page *page, unsigned int order); 275 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 276 unsigned long nr_pages); 277 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 278 unsigned long nr_pages); 279 static void virtio_mem_retry(struct virtio_mem *vm); 280 static int virtio_mem_create_resource(struct virtio_mem *vm); 281 static void virtio_mem_delete_resource(struct virtio_mem *vm); 282 283 /* 284 * Register a virtio-mem device so it will be considered for the online_page 285 * callback. 286 */ 287 static int register_virtio_mem_device(struct virtio_mem *vm) 288 { 289 int rc = 0; 290 291 /* First device registers the callback. */ 292 mutex_lock(&virtio_mem_mutex); 293 if (list_empty(&virtio_mem_devices)) 294 rc = set_online_page_callback(&virtio_mem_online_page_cb); 295 if (!rc) 296 list_add_rcu(&vm->next, &virtio_mem_devices); 297 mutex_unlock(&virtio_mem_mutex); 298 299 return rc; 300 } 301 302 /* 303 * Unregister a virtio-mem device so it will no longer be considered for the 304 * online_page callback. 305 */ 306 static void unregister_virtio_mem_device(struct virtio_mem *vm) 307 { 308 /* Last device unregisters the callback. */ 309 mutex_lock(&virtio_mem_mutex); 310 list_del_rcu(&vm->next); 311 if (list_empty(&virtio_mem_devices)) 312 restore_online_page_callback(&virtio_mem_online_page_cb); 313 mutex_unlock(&virtio_mem_mutex); 314 315 synchronize_rcu(); 316 } 317 318 /* 319 * Calculate the memory block id of a given address.
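 *
 * Worked example (assuming the common 128 MiB memory block size; the
 * actual value comes from memory_block_size_bytes() at runtime): the
 * address 0x30000000 (768 MiB) maps to mb_id 768 MiB / 128 MiB = 6.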
320 */ 321 static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr) 322 { 323 return addr / memory_block_size_bytes(); 324 } 325 326 /* 327 * Calculate the physical start address of a given memory block id. 328 */ 329 static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) 330 { 331 return mb_id * memory_block_size_bytes(); 332 } 333 334 /* 335 * Calculate the big block id of a given address. 336 */ 337 static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, 338 uint64_t addr) 339 { 340 return addr / vm->bbm.bb_size; 341 } 342 343 /* 344 * Calculate the physical start address of a given big block id. 345 */ 346 static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, 347 unsigned long bb_id) 348 { 349 return bb_id * vm->bbm.bb_size; 350 } 351 352 /* 353 * Calculate the subblock id of a given address. 354 */ 355 static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, 356 unsigned long addr) 357 { 358 const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); 359 const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); 360 361 return (addr - mb_addr) / vm->sbm.sb_size; 362 } 363 364 /* 365 * Set the state of a big block, taking care of the state counter. 366 */ 367 static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, 368 unsigned long bb_id, 369 enum virtio_mem_bbm_bb_state state) 370 { 371 const unsigned long idx = bb_id - vm->bbm.first_bb_id; 372 enum virtio_mem_bbm_bb_state old_state; 373 374 old_state = vm->bbm.bb_states[idx]; 375 vm->bbm.bb_states[idx] = state; 376 377 BUG_ON(vm->bbm.bb_count[old_state] == 0); 378 vm->bbm.bb_count[old_state]--; 379 vm->bbm.bb_count[state]++; 380 } 381 382 /* 383 * Get the state of a big block. 384 */ 385 static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, 386 unsigned long bb_id) 387 { 388 return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; 389 } 390 391 /* 392 * Prepare the big block state array for the next big block. 393 */ 394 static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) 395 { 396 unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; 397 unsigned long new_bytes = old_bytes + 1; 398 int old_pages = PFN_UP(old_bytes); 399 int new_pages = PFN_UP(new_bytes); 400 uint8_t *new_array; 401 402 if (vm->bbm.bb_states && old_pages == new_pages) 403 return 0; 404 405 new_array = vzalloc(new_pages * PAGE_SIZE); 406 if (!new_array) 407 return -ENOMEM; 408 409 mutex_lock(&vm->hotplug_mutex); 410 if (vm->bbm.bb_states) 411 memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); 412 vfree(vm->bbm.bb_states); 413 vm->bbm.bb_states = new_array; 414 mutex_unlock(&vm->hotplug_mutex); 415 416 return 0; 417 } 418 419 #define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ 420 for (_bb_id = vm->bbm.first_bb_id; \ 421 _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ 422 _bb_id++) \ 423 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 424 425 #define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ 426 for (_bb_id = vm->bbm.next_bb_id - 1; \ 427 _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ 428 _bb_id--) \ 429 if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) 430 431 /* 432 * Set the state of a memory block, taking care of the state counter. 
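 *
 * The old state's counter is decremented and the new state's counter is
 * incremented, so mb_count[] always sums up to the number of tracked
 * memory blocks.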
433 */ 434 static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, 435 unsigned long mb_id, uint8_t state) 436 { 437 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 438 uint8_t old_state; 439 440 old_state = vm->sbm.mb_states[idx]; 441 vm->sbm.mb_states[idx] = state; 442 443 BUG_ON(vm->sbm.mb_count[old_state] == 0); 444 vm->sbm.mb_count[old_state]--; 445 vm->sbm.mb_count[state]++; 446 } 447 448 /* 449 * Get the state of a memory block. 450 */ 451 static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, 452 unsigned long mb_id) 453 { 454 const unsigned long idx = mb_id - vm->sbm.first_mb_id; 455 456 return vm->sbm.mb_states[idx]; 457 } 458 459 /* 460 * Prepare the state array for the next memory block. 461 */ 462 static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) 463 { 464 int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); 465 int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); 466 uint8_t *new_array; 467 468 if (vm->sbm.mb_states && old_pages == new_pages) 469 return 0; 470 471 new_array = vzalloc(new_pages * PAGE_SIZE); 472 if (!new_array) 473 return -ENOMEM; 474 475 mutex_lock(&vm->hotplug_mutex); 476 if (vm->sbm.mb_states) 477 memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); 478 vfree(vm->sbm.mb_states); 479 vm->sbm.mb_states = new_array; 480 mutex_unlock(&vm->hotplug_mutex); 481 482 return 0; 483 } 484 485 #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ 486 for (_mb_id = _vm->sbm.first_mb_id; \ 487 _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ 488 _mb_id++) \ 489 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 490 491 #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ 492 for (_mb_id = _vm->sbm.next_mb_id - 1; \ 493 _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ 494 _mb_id--) \ 495 if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) 496 497 /* 498 * Calculate the bit number in the subblock bitmap for the given subblock 499 * inside the given memory block. 500 */ 501 static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, 502 unsigned long mb_id, int sb_id) 503 { 504 return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; 505 } 506 507 /* 508 * Mark all selected subblocks plugged. 509 * 510 * Will not modify the state of the memory block. 511 */ 512 static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, 513 unsigned long mb_id, int sb_id, 514 int count) 515 { 516 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 517 518 __bitmap_set(vm->sbm.sb_states, bit, count); 519 } 520 521 /* 522 * Mark all selected subblocks unplugged. 523 * 524 * Will not modify the state of the memory block. 525 */ 526 static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, 527 unsigned long mb_id, int sb_id, 528 int count) 529 { 530 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 531 532 __bitmap_clear(vm->sbm.sb_states, bit, count); 533 } 534 535 /* 536 * Test if all selected subblocks are plugged. 537 */ 538 static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, 539 unsigned long mb_id, int sb_id, 540 int count) 541 { 542 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 543 544 if (count == 1) 545 return test_bit(bit, vm->sbm.sb_states); 546 547 /* TODO: Helper similar to bitmap_set() */ 548 return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= 549 bit + count; 550 } 551 552 /* 553 * Test if all selected subblocks are unplugged. 
554 */ 555 static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, 556 unsigned long mb_id, int sb_id, 557 int count) 558 { 559 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); 560 561 /* TODO: Helper similar to bitmap_set() */ 562 return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= 563 bit + count; 564 } 565 566 /* 567 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is 568 * none. 569 */ 570 static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, 571 unsigned long mb_id) 572 { 573 const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); 574 575 return find_next_zero_bit(vm->sbm.sb_states, 576 bit + vm->sbm.sbs_per_mb, bit) - bit; 577 } 578 579 /* 580 * Prepare the subblock bitmap for the next memory block. 581 */ 582 static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) 583 { 584 const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; 585 const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; 586 const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; 587 int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); 588 int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); 589 unsigned long *new_bitmap, *old_bitmap; 590 591 if (vm->sbm.sb_states && old_pages == new_pages) 592 return 0; 593 594 new_bitmap = vzalloc(new_pages * PAGE_SIZE); 595 if (!new_bitmap) 596 return -ENOMEM; 597 598 mutex_lock(&vm->hotplug_mutex); 599 if (vm->sbm.sb_states) 600 memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); 601 602 old_bitmap = vm->sbm.sb_states; 603 vm->sbm.sb_states = new_bitmap; 604 mutex_unlock(&vm->hotplug_mutex); 605 606 vfree(old_bitmap); 607 return 0; 608 } 609 610 /* 611 * Test if we could add memory without creating too much offline memory - 612 * to avoid running OOM if memory is getting onlined deferred. 613 */ 614 static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) 615 { 616 if (WARN_ON_ONCE(size > vm->offline_threshold)) 617 return false; 618 619 return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; 620 } 621 622 /* 623 * Try adding memory to Linux. Will usually only fail if out of memory. 624 * 625 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 626 * onlining code). 627 * 628 * Will not modify the state of memory blocks in virtio-mem. 629 */ 630 static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, 631 uint64_t size) 632 { 633 int rc; 634 635 /* 636 * When force-unloading the driver and we still have memory added to 637 * Linux, the resource name has to stay. 638 */ 639 if (!vm->resource_name) { 640 vm->resource_name = kstrdup_const("System RAM (virtio_mem)", 641 GFP_KERNEL); 642 if (!vm->resource_name) 643 return -ENOMEM; 644 } 645 646 dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, 647 addr + size - 1); 648 /* Memory might get onlined immediately. */ 649 atomic64_add(size, &vm->offline_size); 650 rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name, 651 MHP_MERGE_RESOURCE | MHP_NID_IS_MGID); 652 if (rc) { 653 atomic64_sub(size, &vm->offline_size); 654 dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); 655 /* 656 * TODO: Linux MM does not properly clean up yet in all cases 657 * where adding of memory failed - especially on -ENOMEM. 658 */ 659 } 660 return rc; 661 } 662 663 /* 664 * See virtio_mem_add_memory(): Try adding a single Linux memory block. 
665 */ 666 static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) 667 { 668 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 669 const uint64_t size = memory_block_size_bytes(); 670 671 return virtio_mem_add_memory(vm, addr, size); 672 } 673 674 /* 675 * See virtio_mem_add_memory(): Try adding a big block. 676 */ 677 static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) 678 { 679 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 680 const uint64_t size = vm->bbm.bb_size; 681 682 return virtio_mem_add_memory(vm, addr, size); 683 } 684 685 /* 686 * Try removing memory from Linux. Will only fail if memory blocks aren't 687 * offline. 688 * 689 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 690 * onlining code). 691 * 692 * Will not modify the state of memory blocks in virtio-mem. 693 */ 694 static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, 695 uint64_t size) 696 { 697 int rc; 698 699 dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, 700 addr + size - 1); 701 rc = remove_memory(addr, size); 702 if (!rc) { 703 atomic64_sub(size, &vm->offline_size); 704 /* 705 * We might have freed up memory we can now unplug, retry 706 * immediately instead of waiting. 707 */ 708 virtio_mem_retry(vm); 709 } else { 710 dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); 711 } 712 return rc; 713 } 714 715 /* 716 * See virtio_mem_remove_memory(): Try removing a single Linux memory block. 717 */ 718 static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) 719 { 720 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 721 const uint64_t size = memory_block_size_bytes(); 722 723 return virtio_mem_remove_memory(vm, addr, size); 724 } 725 726 /* 727 * Try offlining and removing memory from Linux. 728 * 729 * Must not be called with the vm->hotplug_mutex held (possible deadlock with 730 * onlining code). 731 * 732 * Will not modify the state of memory blocks in virtio-mem. 733 */ 734 static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, 735 uint64_t addr, 736 uint64_t size) 737 { 738 int rc; 739 740 dev_dbg(&vm->vdev->dev, 741 "offlining and removing memory: 0x%llx - 0x%llx\n", addr, 742 addr + size - 1); 743 744 rc = offline_and_remove_memory(addr, size); 745 if (!rc) { 746 atomic64_sub(size, &vm->offline_size); 747 /* 748 * We might have freed up memory we can now unplug, retry 749 * immediately instead of waiting. 750 */ 751 virtio_mem_retry(vm); 752 return 0; 753 } 754 dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc); 755 /* 756 * We don't really expect this to fail, because we fake-offlined all 757 * memory already. But it could fail in corner cases. 758 */ 759 WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY); 760 return rc == -ENOMEM ? -ENOMEM : -EBUSY; 761 } 762 763 /* 764 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing 765 * a single Linux memory block. 766 */ 767 static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, 768 unsigned long mb_id) 769 { 770 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); 771 const uint64_t size = memory_block_size_bytes(); 772 773 return virtio_mem_offline_and_remove_memory(vm, addr, size); 774 } 775 776 /* 777 * Try (offlining and) removing memory from Linux in case all subblocks are 778 * unplugged. Can be called on online and offline memory blocks. 779 * 780 * May modify the state of memory blocks in virtio-mem. 
781 */ 782 static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm, 783 unsigned long mb_id) 784 { 785 int rc; 786 787 /* 788 * Once all subblocks of a memory block were unplugged, offline and 789 * remove it. 790 */ 791 if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 792 return 0; 793 794 /* offline_and_remove_memory() works for online and offline memory. */ 795 mutex_unlock(&vm->hotplug_mutex); 796 rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); 797 mutex_lock(&vm->hotplug_mutex); 798 if (!rc) 799 virtio_mem_sbm_set_mb_state(vm, mb_id, 800 VIRTIO_MEM_SBM_MB_UNUSED); 801 return rc; 802 } 803 804 /* 805 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a 806 * all Linux memory blocks covered by the big block. 807 */ 808 static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, 809 unsigned long bb_id) 810 { 811 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 812 const uint64_t size = vm->bbm.bb_size; 813 814 return virtio_mem_offline_and_remove_memory(vm, addr, size); 815 } 816 817 /* 818 * Trigger the workqueue so the device can perform its magic. 819 */ 820 static void virtio_mem_retry(struct virtio_mem *vm) 821 { 822 unsigned long flags; 823 824 spin_lock_irqsave(&vm->removal_lock, flags); 825 if (!vm->removing) 826 queue_work(system_freezable_wq, &vm->wq); 827 spin_unlock_irqrestore(&vm->removal_lock, flags); 828 } 829 830 static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) 831 { 832 int node = NUMA_NO_NODE; 833 834 #if defined(CONFIG_ACPI_NUMA) 835 if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM)) 836 node = pxm_to_node(node_id); 837 #endif 838 return node; 839 } 840 841 /* 842 * Test if a virtio-mem device overlaps with the given range. Can be called 843 * from (notifier) callbacks lockless. 844 */ 845 static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, 846 uint64_t size) 847 { 848 return start < vm->addr + vm->region_size && vm->addr < start + size; 849 } 850 851 /* 852 * Test if a virtio-mem device contains a given range. Can be called from 853 * (notifier) callbacks lockless. 
854 */ 855 static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, 856 uint64_t size) 857 { 858 return start >= vm->addr && start + size <= vm->addr + vm->region_size; 859 } 860 861 static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, 862 unsigned long mb_id) 863 { 864 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 865 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 866 case VIRTIO_MEM_SBM_MB_OFFLINE: 867 return NOTIFY_OK; 868 default: 869 break; 870 } 871 dev_warn_ratelimited(&vm->vdev->dev, 872 "memory block onlining denied\n"); 873 return NOTIFY_BAD; 874 } 875 876 static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, 877 unsigned long mb_id) 878 { 879 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 880 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 881 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 882 virtio_mem_sbm_set_mb_state(vm, mb_id, 883 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 884 break; 885 case VIRTIO_MEM_SBM_MB_KERNEL: 886 case VIRTIO_MEM_SBM_MB_MOVABLE: 887 virtio_mem_sbm_set_mb_state(vm, mb_id, 888 VIRTIO_MEM_SBM_MB_OFFLINE); 889 break; 890 default: 891 BUG(); 892 break; 893 } 894 } 895 896 static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, 897 unsigned long mb_id, 898 unsigned long start_pfn) 899 { 900 const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn)); 901 int new_state; 902 903 switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { 904 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 905 new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; 906 if (is_movable) 907 new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; 908 break; 909 case VIRTIO_MEM_SBM_MB_OFFLINE: 910 new_state = VIRTIO_MEM_SBM_MB_KERNEL; 911 if (is_movable) 912 new_state = VIRTIO_MEM_SBM_MB_MOVABLE; 913 break; 914 default: 915 BUG(); 916 break; 917 } 918 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 919 } 920 921 static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, 922 unsigned long mb_id) 923 { 924 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 925 unsigned long pfn; 926 int sb_id; 927 928 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 929 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 930 continue; 931 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 932 sb_id * vm->sbm.sb_size); 933 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 934 } 935 } 936 937 static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, 938 unsigned long mb_id) 939 { 940 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); 941 unsigned long pfn; 942 int sb_id; 943 944 for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { 945 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 946 continue; 947 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 948 sb_id * vm->sbm.sb_size); 949 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 950 } 951 } 952 953 static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, 954 unsigned long bb_id, 955 unsigned long pfn, 956 unsigned long nr_pages) 957 { 958 /* 959 * When marked as "fake-offline", all online memory of this device block 960 * is allocated by us. Otherwise, we don't have any memory allocated. 
961 */ 962 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 963 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 964 return; 965 virtio_mem_fake_offline_going_offline(pfn, nr_pages); 966 } 967 968 static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, 969 unsigned long bb_id, 970 unsigned long pfn, 971 unsigned long nr_pages) 972 { 973 if (virtio_mem_bbm_get_bb_state(vm, bb_id) != 974 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) 975 return; 976 virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); 977 } 978 979 /* 980 * This callback will either be called synchronously from add_memory() or 981 * asynchronously (e.g., triggered via user space). We have to be careful 982 * with locking when calling add_memory(). 983 */ 984 static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, 985 unsigned long action, void *arg) 986 { 987 struct virtio_mem *vm = container_of(nb, struct virtio_mem, 988 memory_notifier); 989 struct memory_notify *mhp = arg; 990 const unsigned long start = PFN_PHYS(mhp->start_pfn); 991 const unsigned long size = PFN_PHYS(mhp->nr_pages); 992 int rc = NOTIFY_OK; 993 unsigned long id; 994 995 if (!virtio_mem_overlaps_range(vm, start, size)) 996 return NOTIFY_DONE; 997 998 if (vm->in_sbm) { 999 id = virtio_mem_phys_to_mb_id(start); 1000 /* 1001 * In SBM, we add memory in separate memory blocks - we expect 1002 * it to be onlined/offlined in the same granularity. Bail out 1003 * if this ever changes. 1004 */ 1005 if (WARN_ON_ONCE(size != memory_block_size_bytes() || 1006 !IS_ALIGNED(start, memory_block_size_bytes()))) 1007 return NOTIFY_BAD; 1008 } else { 1009 id = virtio_mem_phys_to_bb_id(vm, start); 1010 /* 1011 * In BBM, we only care about onlining/offlining happening 1012 * within a single big block, we don't care about the 1013 * actual granularity as we don't track individual Linux 1014 * memory blocks. 1015 */ 1016 if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) 1017 return NOTIFY_BAD; 1018 } 1019 1020 /* 1021 * Avoid circular locking lockdep warnings. We lock the mutex 1022 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The 1023 * blocking_notifier_call_chain() has its own lock, which gets unlocked 1024 * between both notifier calls and will bail out. False positive. 1025 */ 1026 lockdep_off(); 1027 1028 switch (action) { 1029 case MEM_GOING_OFFLINE: 1030 mutex_lock(&vm->hotplug_mutex); 1031 if (vm->removing) { 1032 rc = notifier_from_errno(-EBUSY); 1033 mutex_unlock(&vm->hotplug_mutex); 1034 break; 1035 } 1036 vm->hotplug_active = true; 1037 if (vm->in_sbm) 1038 virtio_mem_sbm_notify_going_offline(vm, id); 1039 else 1040 virtio_mem_bbm_notify_going_offline(vm, id, 1041 mhp->start_pfn, 1042 mhp->nr_pages); 1043 break; 1044 case MEM_GOING_ONLINE: 1045 mutex_lock(&vm->hotplug_mutex); 1046 if (vm->removing) { 1047 rc = notifier_from_errno(-EBUSY); 1048 mutex_unlock(&vm->hotplug_mutex); 1049 break; 1050 } 1051 vm->hotplug_active = true; 1052 if (vm->in_sbm) 1053 rc = virtio_mem_sbm_notify_going_online(vm, id); 1054 break; 1055 case MEM_OFFLINE: 1056 if (vm->in_sbm) 1057 virtio_mem_sbm_notify_offline(vm, id); 1058 1059 atomic64_add(size, &vm->offline_size); 1060 /* 1061 * Trigger the workqueue. Now that we have some offline memory, 1062 * maybe we can handle pending unplug requests.
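 * This is especially relevant when unplugging online memory is not
 * allowed (unplug_online=0), as offline memory is then the only
 * memory we can unplug.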
1063 */ 1064 if (!unplug_online) 1065 virtio_mem_retry(vm); 1066 1067 vm->hotplug_active = false; 1068 mutex_unlock(&vm->hotplug_mutex); 1069 break; 1070 case MEM_ONLINE: 1071 if (vm->in_sbm) 1072 virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); 1073 1074 atomic64_sub(size, &vm->offline_size); 1075 /* 1076 * Start adding more memory once we onlined half of our 1077 * threshold. Don't trigger if it's possibly due to our action 1078 * (e.g., us adding memory which gets onlined immediately from 1079 * the core). 1080 */ 1081 if (!atomic_read(&vm->wq_active) && 1082 virtio_mem_could_add_memory(vm, vm->offline_threshold / 2)) 1083 virtio_mem_retry(vm); 1084 1085 vm->hotplug_active = false; 1086 mutex_unlock(&vm->hotplug_mutex); 1087 break; 1088 case MEM_CANCEL_OFFLINE: 1089 if (!vm->hotplug_active) 1090 break; 1091 if (vm->in_sbm) 1092 virtio_mem_sbm_notify_cancel_offline(vm, id); 1093 else 1094 virtio_mem_bbm_notify_cancel_offline(vm, id, 1095 mhp->start_pfn, 1096 mhp->nr_pages); 1097 vm->hotplug_active = false; 1098 mutex_unlock(&vm->hotplug_mutex); 1099 break; 1100 case MEM_CANCEL_ONLINE: 1101 if (!vm->hotplug_active) 1102 break; 1103 vm->hotplug_active = false; 1104 mutex_unlock(&vm->hotplug_mutex); 1105 break; 1106 default: 1107 break; 1108 } 1109 1110 lockdep_on(); 1111 1112 return rc; 1113 } 1114 1115 /* 1116 * Set a range of pages PG_offline. Remember pages that were never onlined 1117 * (via generic_online_page()) using PageDirty(). 1118 */ 1119 static void virtio_mem_set_fake_offline(unsigned long pfn, 1120 unsigned long nr_pages, bool onlined) 1121 { 1122 page_offline_begin(); 1123 for (; nr_pages--; pfn++) { 1124 struct page *page = pfn_to_page(pfn); 1125 1126 __SetPageOffline(page); 1127 if (!onlined) { 1128 SetPageDirty(page); 1129 /* FIXME: remove after cleanups */ 1130 ClearPageReserved(page); 1131 } 1132 } 1133 page_offline_end(); 1134 } 1135 1136 /* 1137 * Clear PG_offline from a range of pages. If the pages were never onlined 1138 * (via generic_online_page()), clear PageDirty(). 1139 */ 1140 static void virtio_mem_clear_fake_offline(unsigned long pfn, 1141 unsigned long nr_pages, bool onlined) 1142 { 1143 for (; nr_pages--; pfn++) { 1144 struct page *page = pfn_to_page(pfn); 1145 1146 __ClearPageOffline(page); 1147 if (!onlined) 1148 ClearPageDirty(page); 1149 } 1150 } 1151 1152 /* 1153 * Release a range of fake-offline pages to the buddy, effectively 1154 * fake-onlining them. 1155 */ 1156 static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) 1157 { 1158 unsigned long order = MAX_PAGE_ORDER; 1159 unsigned long i; 1160 1161 /* 1162 * We might get called for ranges that don't cover properly aligned 1163 * MAX_PAGE_ORDER pages; however, we can only online properly aligned 1164 * pages with an order of MAX_PAGE_ORDER at maximum. 1165 */ 1166 while (!IS_ALIGNED(pfn | nr_pages, 1 << order)) 1167 order--; 1168 1169 for (i = 0; i < nr_pages; i += 1 << order) { 1170 struct page *page = pfn_to_page(pfn + i); 1171 1172 /* 1173 * If the page is PageDirty(), it was kept fake-offline when 1174 * onlining the memory block. Otherwise, it was allocated 1175 * using alloc_contig_range(). All pages in a subblock are 1176 * alike.
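 * Checking the first page of each chunk is therefore sufficient.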
1177 */ 1178 if (PageDirty(page)) { 1179 virtio_mem_clear_fake_offline(pfn + i, 1 << order, false); 1180 generic_online_page(page, order); 1181 } else { 1182 virtio_mem_clear_fake_offline(pfn + i, 1 << order, true); 1183 free_contig_range(pfn + i, 1 << order); 1184 adjust_managed_page_count(page, 1 << order); 1185 } 1186 } 1187 } 1188 1189 /* 1190 * Try to allocate a range, marking pages fake-offline, effectively 1191 * fake-offlining them. 1192 */ 1193 static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn, 1194 unsigned long nr_pages) 1195 { 1196 const bool is_movable = is_zone_movable_page(pfn_to_page(pfn)); 1197 int rc, retry_count; 1198 1199 /* 1200 * TODO: We want an alloc_contig_range() mode that tries to allocate 1201 * harder (e.g., dealing with temporarily pinned pages, PCP), especially 1202 * with ZONE_MOVABLE. So for now, retry a couple of times with 1203 * ZONE_MOVABLE before giving up - because that zone is supposed to give 1204 * some guarantees. 1205 */ 1206 for (retry_count = 0; retry_count < 5; retry_count++) { 1207 /* 1208 * If the config changed, stop immediately and go back to the 1209 * main loop: avoid trying to keep unplugging if the device 1210 * might have decided to not remove any more memory. 1211 */ 1212 if (atomic_read(&vm->config_changed)) 1213 return -EAGAIN; 1214 1215 rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, 1216 GFP_KERNEL); 1217 if (rc == -ENOMEM) 1218 /* whoops, out of memory */ 1219 return rc; 1220 else if (rc && !is_movable) 1221 break; 1222 else if (rc) 1223 continue; 1224 1225 virtio_mem_set_fake_offline(pfn, nr_pages, true); 1226 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1227 return 0; 1228 } 1229 1230 return -EBUSY; 1231 } 1232 1233 /* 1234 * Handle fake-offline pages when memory is going offline - such that the 1235 * pages can be skipped by mm-core when offlining. 1236 */ 1237 static void virtio_mem_fake_offline_going_offline(unsigned long pfn, 1238 unsigned long nr_pages) 1239 { 1240 struct page *page; 1241 unsigned long i; 1242 1243 /* 1244 * Drop our reference to the pages so the memory can get offlined 1245 * and add the unplugged pages to the managed page counters (so 1246 * offlining code can correctly subtract them again). 1247 */ 1248 adjust_managed_page_count(pfn_to_page(pfn), nr_pages); 1249 /* Drop our reference to the pages so the memory can get offlined. */ 1250 for (i = 0; i < nr_pages; i++) { 1251 page = pfn_to_page(pfn + i); 1252 if (WARN_ON(!page_ref_dec_and_test(page))) 1253 dump_page(page, "fake-offline page referenced"); 1254 } 1255 } 1256 1257 /* 1258 * Handle fake-offline pages when memory offlining is canceled - to undo 1259 * what we did in virtio_mem_fake_offline_going_offline(). 1260 */ 1261 static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, 1262 unsigned long nr_pages) 1263 { 1264 unsigned long i; 1265 1266 /* 1267 * Get the reference we dropped when going offline and subtract the 1268 * unplugged pages from the managed page counters. 
1269 */ 1270 adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); 1271 for (i = 0; i < nr_pages; i++) 1272 page_ref_inc(pfn_to_page(pfn + i)); 1273 } 1274 1275 static void virtio_mem_online_page(struct virtio_mem *vm, 1276 struct page *page, unsigned int order) 1277 { 1278 const unsigned long start = page_to_phys(page); 1279 const unsigned long end = start + PFN_PHYS(1 << order); 1280 unsigned long addr, next, id, sb_id, count; 1281 bool do_online; 1282 1283 /* 1284 * We can get called with any order up to MAX_PAGE_ORDER. If our subblock 1285 * size is smaller than that and we have a mixture of plugged and 1286 * unplugged subblocks within such a page, we have to process in 1287 * smaller granularity. In that case we'll adjust the order exactly once 1288 * within the loop. 1289 */ 1290 for (addr = start; addr < end; ) { 1291 next = addr + PFN_PHYS(1 << order); 1292 1293 if (vm->in_sbm) { 1294 id = virtio_mem_phys_to_mb_id(addr); 1295 sb_id = virtio_mem_phys_to_sb_id(vm, addr); 1296 count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1; 1297 1298 if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) { 1299 /* Fully plugged. */ 1300 do_online = true; 1301 } else if (count == 1 || 1302 virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) { 1303 /* Fully unplugged. */ 1304 do_online = false; 1305 } else { 1306 /* 1307 * Mixture, process sub-blocks instead. This 1308 * will be at least the size of a pageblock. 1309 * We'll run into this case exactly once. 1310 */ 1311 order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT; 1312 do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1); 1313 continue; 1314 } 1315 } else { 1316 /* 1317 * If the whole block is marked fake offline, keep 1318 * everything that way. 1319 */ 1320 id = virtio_mem_phys_to_bb_id(vm, addr); 1321 do_online = virtio_mem_bbm_get_bb_state(vm, id) != 1322 VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; 1323 } 1324 1325 if (do_online) 1326 generic_online_page(pfn_to_page(PFN_DOWN(addr)), order); 1327 else 1328 virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, 1329 false); 1330 addr = next; 1331 } 1332 } 1333 1334 static void virtio_mem_online_page_cb(struct page *page, unsigned int order) 1335 { 1336 const unsigned long addr = page_to_phys(page); 1337 struct virtio_mem *vm; 1338 1339 rcu_read_lock(); 1340 list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { 1341 /* 1342 * Pages we're onlining will never cross memory blocks and, 1343 * therefore, not virtio-mem devices. 1344 */ 1345 if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) 1346 continue; 1347 1348 /* 1349 * virtio_mem_set_fake_offline() might sleep. We can safely 1350 * drop the RCU lock at this point because the device 1351 * cannot go away. See virtio_mem_remove() how races 1352 * between memory onlining and device removal are handled. 1353 */ 1354 rcu_read_unlock(); 1355 1356 virtio_mem_online_page(vm, page, order); 1357 return; 1358 } 1359 rcu_read_unlock(); 1360 1361 /* not virtio-mem memory, but e.g., a DIMM. 
online it */ 1362 generic_online_page(page, order); 1363 } 1364 1365 static uint64_t virtio_mem_send_request(struct virtio_mem *vm, 1366 const struct virtio_mem_req *req) 1367 { 1368 struct scatterlist *sgs[2], sg_req, sg_resp; 1369 unsigned int len; 1370 int rc; 1371 1372 /* don't use the request residing on the stack (vaddr) */ 1373 vm->req = *req; 1374 1375 /* out: buffer for request */ 1376 sg_init_one(&sg_req, &vm->req, sizeof(vm->req)); 1377 sgs[0] = &sg_req; 1378 1379 /* in: buffer for response */ 1380 sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp)); 1381 sgs[1] = &sg_resp; 1382 1383 rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL); 1384 if (rc < 0) 1385 return rc; 1386 1387 virtqueue_kick(vm->vq); 1388 1389 /* wait for a response */ 1390 wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len)); 1391 1392 return virtio16_to_cpu(vm->vdev, vm->resp.type); 1393 } 1394 1395 static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, 1396 uint64_t size) 1397 { 1398 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1399 const struct virtio_mem_req req = { 1400 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG), 1401 .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), 1402 .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1403 }; 1404 int rc = -ENOMEM; 1405 1406 if (atomic_read(&vm->config_changed)) 1407 return -EAGAIN; 1408 1409 dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, 1410 addr + size - 1); 1411 1412 switch (virtio_mem_send_request(vm, &req)) { 1413 case VIRTIO_MEM_RESP_ACK: 1414 vm->plugged_size += size; 1415 return 0; 1416 case VIRTIO_MEM_RESP_NACK: 1417 rc = -EAGAIN; 1418 break; 1419 case VIRTIO_MEM_RESP_BUSY: 1420 rc = -ETXTBSY; 1421 break; 1422 case VIRTIO_MEM_RESP_ERROR: 1423 rc = -EINVAL; 1424 break; 1425 default: 1426 break; 1427 } 1428 1429 dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); 1430 return rc; 1431 } 1432 1433 static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, 1434 uint64_t size) 1435 { 1436 const uint64_t nb_vm_blocks = size / vm->device_block_size; 1437 const struct virtio_mem_req req = { 1438 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG), 1439 .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), 1440 .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 1441 }; 1442 int rc = -ENOMEM; 1443 1444 if (atomic_read(&vm->config_changed)) 1445 return -EAGAIN; 1446 1447 dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, 1448 addr + size - 1); 1449 1450 switch (virtio_mem_send_request(vm, &req)) { 1451 case VIRTIO_MEM_RESP_ACK: 1452 vm->plugged_size -= size; 1453 return 0; 1454 case VIRTIO_MEM_RESP_BUSY: 1455 rc = -ETXTBSY; 1456 break; 1457 case VIRTIO_MEM_RESP_ERROR: 1458 rc = -EINVAL; 1459 break; 1460 default: 1461 break; 1462 } 1463 1464 dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); 1465 return rc; 1466 } 1467 1468 static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) 1469 { 1470 const struct virtio_mem_req req = { 1471 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), 1472 }; 1473 int rc = -ENOMEM; 1474 1475 dev_dbg(&vm->vdev->dev, "unplugging all memory"); 1476 1477 switch (virtio_mem_send_request(vm, &req)) { 1478 case VIRTIO_MEM_RESP_ACK: 1479 vm->unplug_all_required = false; 1480 vm->plugged_size = 0; 1481 /* usable region might have shrunk */ 1482 atomic_set(&vm->config_changed, 1); 1483 return 0; 1484 case VIRTIO_MEM_RESP_BUSY: 1485 rc = -ETXTBSY; 1486 break; 1487 
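/* Any other response (e.g., VIRTIO_MEM_RESP_ERROR) leaves rc at its initial -ENOMEM. */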
default: 1488 break; 1489 } 1490 1491 dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); 1492 return rc; 1493 } 1494 1495 /* 1496 * Plug selected subblocks. Updates the plugged state, but not the state 1497 * of the memory block. 1498 */ 1499 static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, 1500 int sb_id, int count) 1501 { 1502 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1503 sb_id * vm->sbm.sb_size; 1504 const uint64_t size = count * vm->sbm.sb_size; 1505 int rc; 1506 1507 rc = virtio_mem_send_plug_request(vm, addr, size); 1508 if (!rc) 1509 virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); 1510 return rc; 1511 } 1512 1513 /* 1514 * Unplug selected subblocks. Updates the plugged state, but not the state 1515 * of the memory block. 1516 */ 1517 static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, 1518 int sb_id, int count) 1519 { 1520 const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + 1521 sb_id * vm->sbm.sb_size; 1522 const uint64_t size = count * vm->sbm.sb_size; 1523 int rc; 1524 1525 rc = virtio_mem_send_unplug_request(vm, addr, size); 1526 if (!rc) 1527 virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); 1528 return rc; 1529 } 1530 1531 /* 1532 * Request to unplug a big block. 1533 * 1534 * Will not modify the state of the big block. 1535 */ 1536 static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) 1537 { 1538 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1539 const uint64_t size = vm->bbm.bb_size; 1540 1541 return virtio_mem_send_unplug_request(vm, addr, size); 1542 } 1543 1544 /* 1545 * Request to plug a big block. 1546 * 1547 * Will not modify the state of the big block. 1548 */ 1549 static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) 1550 { 1551 const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); 1552 const uint64_t size = vm->bbm.bb_size; 1553 1554 return virtio_mem_send_plug_request(vm, addr, size); 1555 } 1556 1557 /* 1558 * Unplug the desired number of plugged subblocks of an offline or not-added 1559 * memory block. Will fail if any subblock cannot get unplugged (instead of 1560 * skipping it). 1561 * 1562 * Will not modify the state of the memory block. 1563 * 1564 * Note: can fail after some subblocks were unplugged. 1565 */ 1566 static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, 1567 unsigned long mb_id, uint64_t *nb_sb) 1568 { 1569 int sb_id, count; 1570 int rc; 1571 1572 sb_id = vm->sbm.sbs_per_mb - 1; 1573 while (*nb_sb) { 1574 /* Find the next candidate subblock */ 1575 while (sb_id >= 0 && 1576 virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) 1577 sb_id--; 1578 if (sb_id < 0) 1579 break; 1580 /* Try to unplug multiple subblocks at a time */ 1581 count = 1; 1582 while (count < *nb_sb && sb_id > 0 && 1583 virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { 1584 count++; 1585 sb_id--; 1586 } 1587 1588 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1589 if (rc) 1590 return rc; 1591 *nb_sb -= count; 1592 sb_id--; 1593 } 1594 1595 return 0; 1596 } 1597 1598 /* 1599 * Unplug all plugged subblocks of an offline or not-added memory block. 1600 * 1601 * Will not modify the state of the memory block. 1602 * 1603 * Note: can fail after some subblocks were unplugged.
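 * The subblock plugged-state bitmap stays accurate either way, so a
 * later retry only unplugs what is still plugged.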
1604 */ 1605 static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) 1606 { 1607 uint64_t nb_sb = vm->sbm.sbs_per_mb; 1608 1609 return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); 1610 } 1611 1612 /* 1613 * Prepare tracking data for the next memory block. 1614 */ 1615 static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, 1616 unsigned long *mb_id) 1617 { 1618 int rc; 1619 1620 if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) 1621 return -ENOSPC; 1622 1623 /* Resize the state array if required. */ 1624 rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); 1625 if (rc) 1626 return rc; 1627 1628 /* Resize the subblock bitmap if required. */ 1629 rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); 1630 if (rc) 1631 return rc; 1632 1633 vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; 1634 *mb_id = vm->sbm.next_mb_id++; 1635 return 0; 1636 } 1637 1638 /* 1639 * Try to plug the desired number of subblocks and add the memory block 1640 * to Linux. 1641 * 1642 * Will modify the state of the memory block. 1643 */ 1644 static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, 1645 unsigned long mb_id, uint64_t *nb_sb) 1646 { 1647 const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); 1648 int rc; 1649 1650 if (WARN_ON_ONCE(!count)) 1651 return -EINVAL; 1652 1653 /* 1654 * Plug the requested number of subblocks before adding it to linux, 1655 * so that onlining will directly online all plugged subblocks. 1656 */ 1657 rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); 1658 if (rc) 1659 return rc; 1660 1661 /* 1662 * Mark the block properly offline before adding it to Linux, 1663 * so the memory notifiers will find the block in the right state. 1664 */ 1665 if (count == vm->sbm.sbs_per_mb) 1666 virtio_mem_sbm_set_mb_state(vm, mb_id, 1667 VIRTIO_MEM_SBM_MB_OFFLINE); 1668 else 1669 virtio_mem_sbm_set_mb_state(vm, mb_id, 1670 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1671 1672 /* Add the memory block to linux - if that fails, try to unplug. */ 1673 rc = virtio_mem_sbm_add_mb(vm, mb_id); 1674 if (rc) { 1675 int new_state = VIRTIO_MEM_SBM_MB_UNUSED; 1676 1677 if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) 1678 new_state = VIRTIO_MEM_SBM_MB_PLUGGED; 1679 virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); 1680 return rc; 1681 } 1682 1683 *nb_sb -= count; 1684 return 0; 1685 } 1686 1687 /* 1688 * Try to plug the desired number of subblocks of a memory block that 1689 * is already added to Linux. 1690 * 1691 * Will modify the state of the memory block. 1692 * 1693 * Note: Can fail after some subblocks were successfully plugged. 
1694 */ 1695 static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, 1696 unsigned long mb_id, uint64_t *nb_sb) 1697 { 1698 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1699 unsigned long pfn, nr_pages; 1700 int sb_id, count; 1701 int rc; 1702 1703 if (WARN_ON_ONCE(!*nb_sb)) 1704 return -EINVAL; 1705 1706 while (*nb_sb) { 1707 sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); 1708 if (sb_id >= vm->sbm.sbs_per_mb) 1709 break; 1710 count = 1; 1711 while (count < *nb_sb && 1712 sb_id + count < vm->sbm.sbs_per_mb && 1713 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) 1714 count++; 1715 1716 rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); 1717 if (rc) 1718 return rc; 1719 *nb_sb -= count; 1720 if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 1721 continue; 1722 1723 /* fake-online the pages if the memory block is online */ 1724 pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1725 sb_id * vm->sbm.sb_size); 1726 nr_pages = PFN_DOWN(count * vm->sbm.sb_size); 1727 virtio_mem_fake_online(pfn, nr_pages); 1728 } 1729 1730 if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1731 virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); 1732 1733 return 0; 1734 } 1735 1736 static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1737 { 1738 const int mb_states[] = { 1739 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 1740 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 1741 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 1742 }; 1743 uint64_t nb_sb = diff / vm->sbm.sb_size; 1744 unsigned long mb_id; 1745 int rc, i; 1746 1747 if (!nb_sb) 1748 return 0; 1749 1750 /* Don't race with onlining/offlining */ 1751 mutex_lock(&vm->hotplug_mutex); 1752 1753 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 1754 virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { 1755 rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); 1756 if (rc || !nb_sb) 1757 goto out_unlock; 1758 cond_resched(); 1759 } 1760 } 1761 1762 /* 1763 * We won't be working on online/offline memory blocks from this point, 1764 * so we can't race with memory onlining/offlining. Drop the mutex. 1765 */ 1766 mutex_unlock(&vm->hotplug_mutex); 1767 1768 /* Try to plug and add unused blocks */ 1769 virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { 1770 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1771 return -ENOSPC; 1772 1773 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1774 if (rc || !nb_sb) 1775 return rc; 1776 cond_resched(); 1777 } 1778 1779 /* Try to prepare, plug and add new blocks */ 1780 while (nb_sb) { 1781 if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) 1782 return -ENOSPC; 1783 1784 rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); 1785 if (rc) 1786 return rc; 1787 rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); 1788 if (rc) 1789 return rc; 1790 cond_resched(); 1791 } 1792 1793 return 0; 1794 out_unlock: 1795 mutex_unlock(&vm->hotplug_mutex); 1796 return rc; 1797 } 1798 1799 /* 1800 * Plug a big block and add it to Linux. 1801 * 1802 * Will modify the state of the big block. 
1803 */ 1804 static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, 1805 unsigned long bb_id) 1806 { 1807 int rc; 1808 1809 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 1810 VIRTIO_MEM_BBM_BB_UNUSED)) 1811 return -EINVAL; 1812 1813 rc = virtio_mem_bbm_plug_bb(vm, bb_id); 1814 if (rc) 1815 return rc; 1816 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 1817 1818 rc = virtio_mem_bbm_add_bb(vm, bb_id); 1819 if (rc) { 1820 if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) 1821 virtio_mem_bbm_set_bb_state(vm, bb_id, 1822 VIRTIO_MEM_BBM_BB_UNUSED); 1823 else 1824 /* Retry from the main loop. */ 1825 virtio_mem_bbm_set_bb_state(vm, bb_id, 1826 VIRTIO_MEM_BBM_BB_PLUGGED); 1827 return rc; 1828 } 1829 return 0; 1830 } 1831 1832 /* 1833 * Prepare tracking data for the next big block. 1834 */ 1835 static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, 1836 unsigned long *bb_id) 1837 { 1838 int rc; 1839 1840 if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) 1841 return -ENOSPC; 1842 1843 /* Resize the big block state array if required. */ 1844 rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); 1845 if (rc) 1846 return rc; 1847 1848 vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; 1849 *bb_id = vm->bbm.next_bb_id; 1850 vm->bbm.next_bb_id++; 1851 return 0; 1852 } 1853 1854 static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) 1855 { 1856 uint64_t nb_bb = diff / vm->bbm.bb_size; 1857 unsigned long bb_id; 1858 int rc; 1859 1860 if (!nb_bb) 1861 return 0; 1862 1863 /* Try to plug and add unused big blocks */ 1864 virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { 1865 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1866 return -ENOSPC; 1867 1868 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1869 if (!rc) 1870 nb_bb--; 1871 if (rc || !nb_bb) 1872 return rc; 1873 cond_resched(); 1874 } 1875 1876 /* Try to prepare, plug and add new big blocks */ 1877 while (nb_bb) { 1878 if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) 1879 return -ENOSPC; 1880 1881 rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); 1882 if (rc) 1883 return rc; 1884 rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); 1885 if (!rc) 1886 nb_bb--; 1887 if (rc) 1888 return rc; 1889 cond_resched(); 1890 } 1891 1892 return 0; 1893 } 1894 1895 /* 1896 * Try to plug the requested amount of memory. 1897 */ 1898 static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) 1899 { 1900 if (vm->in_sbm) 1901 return virtio_mem_sbm_plug_request(vm, diff); 1902 return virtio_mem_bbm_plug_request(vm, diff); 1903 } 1904 1905 /* 1906 * Unplug the desired number of plugged subblocks of an offline memory block. 1907 * Will fail if any subblock cannot get unplugged (instead of skipping it). 1908 * 1909 * Will modify the state of the memory block. Might temporarily drop the 1910 * hotplug_mutex. 1911 * 1912 * Note: Can fail after some subblocks were successfully unplugged. 
1913 */ 1914 static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, 1915 unsigned long mb_id, 1916 uint64_t *nb_sb) 1917 { 1918 int rc; 1919 1920 rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); 1921 1922 /* some subblocks might have been unplugged even on failure */ 1923 if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) 1924 virtio_mem_sbm_set_mb_state(vm, mb_id, 1925 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); 1926 if (rc) 1927 return rc; 1928 1929 if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 1930 /* 1931 * Remove the block from Linux - this should never fail. 1932 * Hinder the block from getting onlined by marking it 1933 * unplugged. Temporarily drop the mutex, so 1934 * any pending GOING_ONLINE requests can be serviced/rejected. 1935 */ 1936 virtio_mem_sbm_set_mb_state(vm, mb_id, 1937 VIRTIO_MEM_SBM_MB_UNUSED); 1938 1939 mutex_unlock(&vm->hotplug_mutex); 1940 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 1941 BUG_ON(rc); 1942 mutex_lock(&vm->hotplug_mutex); 1943 } 1944 return 0; 1945 } 1946 1947 /* 1948 * Unplug the given plugged subblocks of an online memory block. 1949 * 1950 * Will modify the state of the memory block. 1951 */ 1952 static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, 1953 unsigned long mb_id, int sb_id, 1954 int count) 1955 { 1956 const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; 1957 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 1958 unsigned long start_pfn; 1959 int rc; 1960 1961 start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + 1962 sb_id * vm->sbm.sb_size); 1963 1964 rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages); 1965 if (rc) 1966 return rc; 1967 1968 /* Try to unplug the allocated memory */ 1969 rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); 1970 if (rc) { 1971 /* Return the memory to the buddy. */ 1972 virtio_mem_fake_online(start_pfn, nr_pages); 1973 return rc; 1974 } 1975 1976 switch (old_state) { 1977 case VIRTIO_MEM_SBM_MB_KERNEL: 1978 virtio_mem_sbm_set_mb_state(vm, mb_id, 1979 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); 1980 break; 1981 case VIRTIO_MEM_SBM_MB_MOVABLE: 1982 virtio_mem_sbm_set_mb_state(vm, mb_id, 1983 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); 1984 break; 1985 } 1986 1987 return 0; 1988 } 1989 1990 /* 1991 * Unplug the desired number of plugged subblocks of an online memory block. 1992 * Will skip subblocks that are busy. 1993 * 1994 * Will modify the state of the memory block. Might temporarily drop the 1995 * hotplug_mutex. 1996 * 1997 * Note: Can fail after some subblocks were successfully unplugged. Can 1998 * return 0 even if subblocks were busy and could not get unplugged. 1999 */ 2000 static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, 2001 unsigned long mb_id, 2002 uint64_t *nb_sb) 2003 { 2004 int rc, sb_id; 2005 2006 /* If possible, try to unplug the complete block in one shot. */ 2007 if (*nb_sb >= vm->sbm.sbs_per_mb && 2008 virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { 2009 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, 2010 vm->sbm.sbs_per_mb); 2011 if (!rc) { 2012 *nb_sb -= vm->sbm.sbs_per_mb; 2013 goto unplugged; 2014 } else if (rc != -EBUSY) 2015 return rc; 2016 } 2017 2018 /* Fall back to single subblocks.
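* Walk the plugged subblocks from the highest id downwards, skipping busy subblocks (-EBUSY) instead of failing.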
*/ 2019 for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { 2020 /* Find the next candidate subblock */ 2021 while (sb_id >= 0 && 2022 !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) 2023 sb_id--; 2024 if (sb_id < 0) 2025 break; 2026 2027 rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); 2028 if (rc == -EBUSY) 2029 continue; 2030 else if (rc) 2031 return rc; 2032 *nb_sb -= 1; 2033 } 2034 2035 unplugged: 2036 rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id); 2037 if (rc) 2038 vm->sbm.have_unplugged_mb = 1; 2039 /* Ignore errors, this is not critical. We'll retry later. */ 2040 return 0; 2041 } 2042 2043 /* 2044 * Unplug the desired number of plugged subblocks of a memory block that is 2045 * already added to Linux. Will skip subblocks of online memory blocks that are 2046 * busy (by the OS). Will fail if any subblock that's not busy cannot get 2047 * unplugged. 2048 * 2049 * Will modify the state of the memory block. Might temporarily drop the 2050 * hotplug_mutex. 2051 * 2052 * Note: Can fail after some subblocks were successfully unplugged. Can 2053 * return 0 even if subblocks were busy and could not get unplugged. 2054 */ 2055 static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, 2056 unsigned long mb_id, 2057 uint64_t *nb_sb) 2058 { 2059 const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); 2060 2061 switch (old_state) { 2062 case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: 2063 case VIRTIO_MEM_SBM_MB_KERNEL: 2064 case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: 2065 case VIRTIO_MEM_SBM_MB_MOVABLE: 2066 return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb); 2067 case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: 2068 case VIRTIO_MEM_SBM_MB_OFFLINE: 2069 return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb); 2070 } 2071 return -EINVAL; 2072 } 2073 2074 static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 2075 { 2076 const int mb_states[] = { 2077 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, 2078 VIRTIO_MEM_SBM_MB_OFFLINE, 2079 VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, 2080 VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, 2081 VIRTIO_MEM_SBM_MB_MOVABLE, 2082 VIRTIO_MEM_SBM_MB_KERNEL, 2083 }; 2084 uint64_t nb_sb = diff / vm->sbm.sb_size; 2085 unsigned long mb_id; 2086 int rc, i; 2087 2088 if (!nb_sb) 2089 return 0; 2090 2091 /* 2092 * We'll drop the mutex a couple of times when it is safe to do so. 2093 * This might result in some blocks switching state (online/offline) 2094 * and we could miss them in this run - we will retry later. 2095 */ 2096 mutex_lock(&vm->hotplug_mutex); 2097 2098 /* 2099 * We try to unplug from partially plugged blocks first, to try removing 2100 * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE 2101 * as it's more reliable to unplug memory and remove whole memory 2102 * blocks, and we don't want to trigger zone imbalances by 2103 * accidentally removing too much kernel memory. 2104 */ 2105 for (i = 0; i < ARRAY_SIZE(mb_states); i++) { 2106 virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) { 2107 rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); 2108 if (rc || !nb_sb) 2109 goto out_unlock; 2110 mutex_unlock(&vm->hotplug_mutex); 2111 cond_resched(); 2112 mutex_lock(&vm->hotplug_mutex); 2113 } 2114 if (!unplug_online && i == 1) { 2115 mutex_unlock(&vm->hotplug_mutex); 2116 return 0; 2117 } 2118 } 2119 2120 mutex_unlock(&vm->hotplug_mutex); 2121 return nb_sb ?
-EBUSY : 0; 2122 out_unlock: 2123 mutex_unlock(&vm->hotplug_mutex); 2124 return rc; 2125 } 2126 2127 /* 2128 * Try to offline and remove a big block from Linux and unplug it. Will fail 2129 * with -EBUSY if some memory is busy and cannot get unplugged. 2130 * 2131 * Will modify the state of the memory block. Might temporarily drop the 2132 * hotplug_mutex. 2133 */ 2134 static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, 2135 unsigned long bb_id) 2136 { 2137 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2138 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2139 unsigned long end_pfn = start_pfn + nr_pages; 2140 unsigned long pfn; 2141 struct page *page; 2142 int rc; 2143 2144 if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != 2145 VIRTIO_MEM_BBM_BB_ADDED)) 2146 return -EINVAL; 2147 2148 /* 2149 * Start by fake-offlining all memory. Once we marked the device 2150 * block as fake-offline, all newly onlined memory will 2151 * automatically be kept fake-offline. Protect from concurrent 2152 * onlining/offlining until we have a consistent state. 2153 */ 2154 mutex_lock(&vm->hotplug_mutex); 2155 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); 2156 2157 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2158 page = pfn_to_online_page(pfn); 2159 if (!page) 2160 continue; 2161 2162 rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION); 2163 if (rc) { 2164 end_pfn = pfn; 2165 goto rollback; 2166 } 2167 } 2168 mutex_unlock(&vm->hotplug_mutex); 2169 2170 rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); 2171 if (rc) { 2172 mutex_lock(&vm->hotplug_mutex); 2173 goto rollback; 2174 } 2175 2176 rc = virtio_mem_bbm_unplug_bb(vm, bb_id); 2177 if (rc) 2178 virtio_mem_bbm_set_bb_state(vm, bb_id, 2179 VIRTIO_MEM_BBM_BB_PLUGGED); 2180 else 2181 virtio_mem_bbm_set_bb_state(vm, bb_id, 2182 VIRTIO_MEM_BBM_BB_UNUSED); 2183 return rc; 2184 2185 rollback: 2186 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2187 page = pfn_to_online_page(pfn); 2188 if (!page) 2189 continue; 2190 virtio_mem_fake_online(pfn, PAGES_PER_SECTION); 2191 } 2192 virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); 2193 mutex_unlock(&vm->hotplug_mutex); 2194 return rc; 2195 } 2196 2197 /* 2198 * Test if a big block is completely offline. 2199 */ 2200 static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, 2201 unsigned long bb_id) 2202 { 2203 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2204 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2205 unsigned long pfn; 2206 2207 for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2208 pfn += PAGES_PER_SECTION) { 2209 if (pfn_to_online_page(pfn)) 2210 return false; 2211 } 2212 2213 return true; 2214 } 2215 2216 /* 2217 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline). 
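* Offline sections are skipped; a single section onlined to a kernel zone makes the whole big block unmovable.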
2218 */ 2219 static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, 2220 unsigned long bb_id) 2221 { 2222 const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); 2223 const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); 2224 struct page *page; 2225 unsigned long pfn; 2226 2227 for (pfn = start_pfn; pfn < start_pfn + nr_pages; 2228 pfn += PAGES_PER_SECTION) { 2229 page = pfn_to_online_page(pfn); 2230 if (!page) 2231 continue; 2232 if (page_zonenum(page) != ZONE_MOVABLE) 2233 return false; 2234 } 2235 2236 return true; 2237 } 2238 2239 static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) 2240 { 2241 uint64_t nb_bb = diff / vm->bbm.bb_size; 2242 uint64_t bb_id; 2243 int rc, i; 2244 2245 if (!nb_bb) 2246 return 0; 2247 2248 /* 2249 * Try to unplug big blocks. Similar to SBM, start with offline 2250 * big blocks. 2251 */ 2252 for (i = 0; i < 3; i++) { 2253 virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { 2254 cond_resched(); 2255 2256 /* 2257 * As we're holding no locks, these checks are racy, 2258 * but we don't care. 2259 */ 2260 if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) 2261 continue; 2262 if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) 2263 continue; 2264 rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); 2265 if (rc == -EBUSY) 2266 continue; 2267 if (!rc) 2268 nb_bb--; 2269 if (rc || !nb_bb) 2270 return rc; 2271 } 2272 if (i == 0 && !unplug_online) 2273 return 0; 2274 } 2275 2276 return nb_bb ? -EBUSY : 0; 2277 } 2278 2279 /* 2280 * Try to unplug the requested amount of memory. 2281 */ 2282 static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) 2283 { 2284 if (vm->in_sbm) 2285 return virtio_mem_sbm_unplug_request(vm, diff); 2286 return virtio_mem_bbm_unplug_request(vm, diff); 2287 } 2288 2289 /* 2290 * Try to unplug all blocks that couldn't be unplugged before, for example, 2291 * because the hypervisor was busy. Further, offline and remove any memory 2292 * blocks where we previously failed. 2293 */ 2294 static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm) 2295 { 2296 unsigned long id; 2297 int rc = 0; 2298 2299 if (!vm->in_sbm) { 2300 virtio_mem_bbm_for_each_bb(vm, id, 2301 VIRTIO_MEM_BBM_BB_PLUGGED) { 2302 rc = virtio_mem_bbm_unplug_bb(vm, id); 2303 if (rc) 2304 return rc; 2305 virtio_mem_bbm_set_bb_state(vm, id, 2306 VIRTIO_MEM_BBM_BB_UNUSED); 2307 } 2308 return 0; 2309 } 2310 2311 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) { 2312 rc = virtio_mem_sbm_unplug_mb(vm, id); 2313 if (rc) 2314 return rc; 2315 virtio_mem_sbm_set_mb_state(vm, id, 2316 VIRTIO_MEM_SBM_MB_UNUSED); 2317 } 2318 2319 if (!vm->sbm.have_unplugged_mb) 2320 return 0; 2321 2322 /* 2323 * Let's retry (offlining and) removing completely unplugged Linux 2324 * memory blocks. 2325 */ 2326 vm->sbm.have_unplugged_mb = false; 2327 2328 mutex_lock(&vm->hotplug_mutex); 2329 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL) 2330 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); 2331 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL) 2332 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); 2333 virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) 2334 rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id); 2335 mutex_unlock(&vm->hotplug_mutex); 2336 2337 if (rc) 2338 vm->sbm.have_unplugged_mb = true; 2339 /* Ignore errors, this is not critical. We'll retry later. 
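* As long as sbm.have_unplugged_mb stays set, virtio_mem_run_wq() keeps reporting -EBUSY, so the retry timer stays armed.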
*/ 2340 return 0; 2341 } 2342 2343 /* 2344 * Update all parts of the config that could have changed. 2345 */ 2346 static void virtio_mem_refresh_config(struct virtio_mem *vm) 2347 { 2348 const struct range pluggable_range = mhp_get_pluggable_range(true); 2349 uint64_t new_plugged_size, usable_region_size, end_addr; 2350 2351 /* the plugged_size is just a reflection of what _we_ did previously */ 2352 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, 2353 &new_plugged_size); 2354 if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size)) 2355 vm->plugged_size = new_plugged_size; 2356 2357 /* calculate the last usable memory block id */ 2358 virtio_cread_le(vm->vdev, struct virtio_mem_config, 2359 usable_region_size, &usable_region_size); 2360 end_addr = min(vm->addr + usable_region_size - 1, 2361 pluggable_range.end); 2362 2363 if (vm->in_sbm) { 2364 vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr); 2365 if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes())) 2366 vm->sbm.last_usable_mb_id--; 2367 } else { 2368 vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm, 2369 end_addr); 2370 if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size)) 2371 vm->bbm.last_usable_bb_id--; 2372 } 2373 /* 2374 * If we cannot plug any of our device memory (e.g., nothing in the 2375 * usable region is addressable), the last usable memory block id will 2376 * be smaller than the first usable memory block id. We'll stop 2377 * attempting to add memory with -ENOSPC from our main loop. 2378 */ 2379 2380 /* see if there is a request to change the size */ 2381 virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, 2382 &vm->requested_size); 2383 2384 dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size); 2385 dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size); 2386 } 2387 2388 /* 2389 * Workqueue function for handling plug/unplug requests and config updates. 2390 */ 2391 static void virtio_mem_run_wq(struct work_struct *work) 2392 { 2393 struct virtio_mem *vm = container_of(work, struct virtio_mem, wq); 2394 uint64_t diff; 2395 int rc; 2396 2397 if (unlikely(vm->in_kdump)) { 2398 dev_warn_once(&vm->vdev->dev, 2399 "unexpected workqueue run in kdump kernel\n"); 2400 return; 2401 } 2402 2403 hrtimer_cancel(&vm->retry_timer); 2404 2405 if (vm->broken) 2406 return; 2407 2408 atomic_set(&vm->wq_active, 1); 2409 retry: 2410 rc = 0; 2411 2412 /* Make sure we start with a clean state if there are leftovers. */ 2413 if (unlikely(vm->unplug_all_required)) 2414 rc = virtio_mem_send_unplug_all_request(vm); 2415 2416 if (atomic_read(&vm->config_changed)) { 2417 atomic_set(&vm->config_changed, 0); 2418 virtio_mem_refresh_config(vm); 2419 } 2420 2421 /* Cleanup any leftovers from previous runs */ 2422 if (!rc) 2423 rc = virtio_mem_cleanup_pending_mb(vm); 2424 2425 if (!rc && vm->requested_size != vm->plugged_size) { 2426 if (vm->requested_size > vm->plugged_size) { 2427 diff = vm->requested_size - vm->plugged_size; 2428 rc = virtio_mem_plug_request(vm, diff); 2429 } else { 2430 diff = vm->plugged_size - vm->requested_size; 2431 rc = virtio_mem_unplug_request(vm, diff); 2432 } 2433 } 2434 2435 /* 2436 * Keep retrying to offline and remove completely unplugged Linux 2437 * memory blocks. 
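* Reporting -EBUSY here re-arms the retry timer with its usual exponential backoff.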
2438 */ 2439 if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb) 2440 rc = -EBUSY; 2441 2442 switch (rc) { 2443 case 0: 2444 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; 2445 break; 2446 case -ENOSPC: 2447 /* 2448 * We cannot add any more memory (alignment, physical limit) 2449 * or we have too many offline memory blocks. 2450 */ 2451 break; 2452 case -ETXTBSY: 2453 /* 2454 * The hypervisor cannot process our request right now 2455 * (e.g., out of memory, migrating); 2456 */ 2457 case -EBUSY: 2458 /* 2459 * We cannot free up any memory to unplug it (all plugged memory 2460 * is busy). 2461 */ 2462 case -ENOMEM: 2463 /* Out of memory, try again later. */ 2464 hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms), 2465 HRTIMER_MODE_REL); 2466 break; 2467 case -EAGAIN: 2468 /* Retry immediately (e.g., the config changed). */ 2469 goto retry; 2470 default: 2471 /* Unknown error, mark as broken */ 2472 dev_err(&vm->vdev->dev, 2473 "unknown error, marking device broken: %d\n", rc); 2474 vm->broken = true; 2475 } 2476 2477 atomic_set(&vm->wq_active, 0); 2478 } 2479 2480 static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer) 2481 { 2482 struct virtio_mem *vm = container_of(timer, struct virtio_mem, 2483 retry_timer); 2484 2485 virtio_mem_retry(vm); 2486 vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2, 2487 VIRTIO_MEM_RETRY_TIMER_MAX_MS); 2488 return HRTIMER_NORESTART; 2489 } 2490 2491 static void virtio_mem_handle_response(struct virtqueue *vq) 2492 { 2493 struct virtio_mem *vm = vq->vdev->priv; 2494 2495 wake_up(&vm->host_resp); 2496 } 2497 2498 static int virtio_mem_init_vq(struct virtio_mem *vm) 2499 { 2500 struct virtqueue *vq; 2501 2502 vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response, 2503 "guest-request"); 2504 if (IS_ERR(vq)) 2505 return PTR_ERR(vq); 2506 vm->vq = vq; 2507 2508 return 0; 2509 } 2510 2511 static int virtio_mem_init_hotplug(struct virtio_mem *vm) 2512 { 2513 const struct range pluggable_range = mhp_get_pluggable_range(true); 2514 uint64_t unit_pages, sb_size, addr; 2515 int rc; 2516 2517 /* bad device setup - warn only */ 2518 if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) 2519 dev_warn(&vm->vdev->dev, 2520 "The alignment of the physical start address can make some memory unusable.\n"); 2521 if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes())) 2522 dev_warn(&vm->vdev->dev, 2523 "The alignment of the physical end address can make some memory unusable.\n"); 2524 if (vm->addr < pluggable_range.start || 2525 vm->addr + vm->region_size - 1 > pluggable_range.end) 2526 dev_warn(&vm->vdev->dev, 2527 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n"); 2528 2529 /* Prepare the offline threshold - make sure we can add two blocks. */ 2530 vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), 2531 VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); 2532 2533 /* 2534 * alloc_contig_range() works reliably with pageblock 2535 * granularity on ZONE_NORMAL, use pageblock_nr_pages. 2536 */ 2537 sb_size = PAGE_SIZE * pageblock_nr_pages; 2538 sb_size = max_t(uint64_t, vm->device_block_size, sb_size); 2539 2540 if (sb_size < memory_block_size_bytes() && !force_bbm) { 2541 /* SBM: At least two subblocks per Linux memory block. 
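* E.g., with a 4 MiB subblock size and 128 MiB Linux memory blocks, sbs_per_mb below ends up as 32.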
*/ 2542 vm->in_sbm = true; 2543 vm->sbm.sb_size = sb_size; 2544 vm->sbm.sbs_per_mb = memory_block_size_bytes() / 2545 vm->sbm.sb_size; 2546 2547 /* Round up to the next full memory block */ 2548 addr = max_t(uint64_t, vm->addr, pluggable_range.start) + 2549 memory_block_size_bytes() - 1; 2550 vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); 2551 vm->sbm.next_mb_id = vm->sbm.first_mb_id; 2552 } else { 2553 /* BBM: At least one Linux memory block. */ 2554 vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size, 2555 memory_block_size_bytes()); 2556 2557 if (bbm_block_size) { 2558 if (!is_power_of_2(bbm_block_size)) { 2559 dev_warn(&vm->vdev->dev, 2560 "bbm_block_size is not a power of 2"); 2561 } else if (bbm_block_size < vm->bbm.bb_size) { 2562 dev_warn(&vm->vdev->dev, 2563 "bbm_block_size is too small"); 2564 } else { 2565 vm->bbm.bb_size = bbm_block_size; 2566 } 2567 } 2568 2569 /* Round up to the next aligned big block */ 2570 addr = max_t(uint64_t, vm->addr, pluggable_range.start) + 2571 vm->bbm.bb_size - 1; 2572 vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); 2573 vm->bbm.next_bb_id = vm->bbm.first_bb_id; 2574 2575 /* Make sure we can add two big blocks. */ 2576 vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, 2577 vm->offline_threshold); 2578 } 2579 2580 dev_info(&vm->vdev->dev, "memory block size: 0x%lx", 2581 memory_block_size_bytes()); 2582 if (vm->in_sbm) 2583 dev_info(&vm->vdev->dev, "subblock size: 0x%llx", 2584 (unsigned long long)vm->sbm.sb_size); 2585 else 2586 dev_info(&vm->vdev->dev, "big block size: 0x%llx", 2587 (unsigned long long)vm->bbm.bb_size); 2588 2589 /* create the parent resource for all memory */ 2590 rc = virtio_mem_create_resource(vm); 2591 if (rc) 2592 return rc; 2593 2594 /* use a single dynamic memory group to cover the whole memory device */ 2595 if (vm->in_sbm) 2596 unit_pages = PHYS_PFN(memory_block_size_bytes()); 2597 else 2598 unit_pages = PHYS_PFN(vm->bbm.bb_size); 2599 rc = memory_group_register_dynamic(vm->nid, unit_pages); 2600 if (rc < 0) 2601 goto out_del_resource; 2602 vm->mgid = rc; 2603 2604 /* 2605 * If we still have memory plugged, we have to unplug all memory first. 2606 * Registering our parent resource makes sure that this memory isn't 2607 * actually in use (e.g., trying to reload the driver). 
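* Had the region still been in use, virtio_mem_create_resource() above would already have failed with -EBUSY.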
2608 */ 2609 if (vm->plugged_size) { 2610 vm->unplug_all_required = true; 2611 dev_info(&vm->vdev->dev, "unplugging all memory is required\n"); 2612 } 2613 2614 /* register callbacks */ 2615 vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb; 2616 rc = register_memory_notifier(&vm->memory_notifier); 2617 if (rc) 2618 goto out_unreg_group; 2619 rc = register_virtio_mem_device(vm); 2620 if (rc) 2621 goto out_unreg_mem; 2622 2623 return 0; 2624 out_unreg_mem: 2625 unregister_memory_notifier(&vm->memory_notifier); 2626 out_unreg_group: 2627 memory_group_unregister(vm->mgid); 2628 out_del_resource: 2629 virtio_mem_delete_resource(vm); 2630 return rc; 2631 } 2632 2633 #ifdef CONFIG_PROC_VMCORE 2634 static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr, 2635 uint64_t size) 2636 { 2637 const uint64_t nb_vm_blocks = size / vm->device_block_size; 2638 const struct virtio_mem_req req = { 2639 .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE), 2640 .u.state.addr = cpu_to_virtio64(vm->vdev, addr), 2641 .u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), 2642 }; 2643 int rc = -ENOMEM; 2644 2645 dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr, 2646 addr + size - 1); 2647 2648 switch (virtio_mem_send_request(vm, &req)) { 2649 case VIRTIO_MEM_RESP_ACK: 2650 return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state); 2651 case VIRTIO_MEM_RESP_ERROR: 2652 rc = -EINVAL; 2653 break; 2654 default: 2655 break; 2656 } 2657 2658 dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc); 2659 return rc; 2660 } 2661 2662 static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb, 2663 unsigned long pfn) 2664 { 2665 struct virtio_mem *vm = container_of(cb, struct virtio_mem, 2666 vmcore_cb); 2667 uint64_t addr = PFN_PHYS(pfn); 2668 bool is_ram; 2669 int rc; 2670 2671 if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE)) 2672 return true; 2673 if (!vm->plugged_size) 2674 return false; 2675 2676 /* 2677 * We have to serialize device requests and access to the information 2678 * about the block queried last. 2679 */ 2680 mutex_lock(&vm->hotplug_mutex); 2681 2682 addr = ALIGN_DOWN(addr, vm->device_block_size); 2683 if (addr != vm->last_block_addr) { 2684 rc = virtio_mem_send_state_request(vm, addr, 2685 vm->device_block_size); 2686 /* On any kind of error, we're going to signal !ram. */ 2687 if (rc == VIRTIO_MEM_STATE_PLUGGED) 2688 vm->last_block_plugged = true; 2689 else 2690 vm->last_block_plugged = false; 2691 vm->last_block_addr = addr; 2692 } 2693 2694 is_ram = vm->last_block_plugged; 2695 mutex_unlock(&vm->hotplug_mutex); 2696 return is_ram; 2697 } 2698 #endif /* CONFIG_PROC_VMCORE */ 2699 2700 static int virtio_mem_init_kdump(struct virtio_mem *vm) 2701 { 2702 #ifdef CONFIG_PROC_VMCORE 2703 dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n"); 2704 vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram; 2705 register_vmcore_cb(&vm->vmcore_cb); 2706 return 0; 2707 #else /* CONFIG_PROC_VMCORE */ 2708 dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n"); 2709 return -EBUSY; 2710 #endif /* CONFIG_PROC_VMCORE */ 2711 } 2712 2713 static int virtio_mem_init(struct virtio_mem *vm) 2714 { 2715 uint16_t node_id; 2716 2717 if (!vm->vdev->config->get) { 2718 dev_err(&vm->vdev->dev, "config access disabled\n"); 2719 return -EINVAL; 2720 } 2721 2722 /* Fetch all properties that can't change. 
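* Mutable properties (requested_size, usable_region_size, changes to plugged_size) are (re)read in virtio_mem_refresh_config().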
*/ 2723 virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size, 2724 &vm->plugged_size); 2725 virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size, 2726 &vm->device_block_size); 2727 virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id, 2728 &node_id); 2729 vm->nid = virtio_mem_translate_node_id(vm, node_id); 2730 virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr); 2731 virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size, 2732 &vm->region_size); 2733 2734 /* Determine the nid for the device based on the lowest address. */ 2735 if (vm->nid == NUMA_NO_NODE) 2736 vm->nid = memory_add_physaddr_to_nid(vm->addr); 2737 2738 dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); 2739 dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); 2740 dev_info(&vm->vdev->dev, "device block size: 0x%llx", 2741 (unsigned long long)vm->device_block_size); 2742 if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) 2743 dev_info(&vm->vdev->dev, "nid: %d", vm->nid); 2744 2745 /* 2746 * We don't want to (un)plug or reuse any memory when in kdump. The 2747 * memory is still accessible (but not exposed to Linux). 2748 */ 2749 if (vm->in_kdump) 2750 return virtio_mem_init_kdump(vm); 2751 return virtio_mem_init_hotplug(vm); 2752 } 2753 2754 static int virtio_mem_create_resource(struct virtio_mem *vm) 2755 { 2756 /* 2757 * When force-unloading the driver and removing the device, we 2758 * could have a garbage pointer. Duplicate the string. 2759 */ 2760 const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL); 2761 2762 if (!name) 2763 return -ENOMEM; 2764 2765 /* Disallow mapping device memory via /dev/mem completely. */ 2766 vm->parent_resource = __request_mem_region(vm->addr, vm->region_size, 2767 name, IORESOURCE_SYSTEM_RAM | 2768 IORESOURCE_EXCLUSIVE); 2769 if (!vm->parent_resource) { 2770 kfree(name); 2771 dev_warn(&vm->vdev->dev, "could not reserve device region\n"); 2772 dev_info(&vm->vdev->dev, 2773 "reloading the driver is not supported\n"); 2774 return -EBUSY; 2775 } 2776 2777 /* The memory is not actually busy - make add_memory() work. 
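* Memory added below this parent will show up as busy child resources - see virtio_mem_has_memory_added().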
*/ 2778 vm->parent_resource->flags &= ~IORESOURCE_BUSY; 2779 return 0; 2780 } 2781 2782 static void virtio_mem_delete_resource(struct virtio_mem *vm) 2783 { 2784 const char *name; 2785 2786 if (!vm->parent_resource) 2787 return; 2788 2789 name = vm->parent_resource->name; 2790 release_resource(vm->parent_resource); 2791 kfree(vm->parent_resource); 2792 kfree(name); 2793 vm->parent_resource = NULL; 2794 } 2795 2796 static int virtio_mem_range_has_system_ram(struct resource *res, void *arg) 2797 { 2798 return 1; 2799 } 2800 2801 static bool virtio_mem_has_memory_added(struct virtio_mem *vm) 2802 { 2803 const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 2804 2805 return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr, 2806 vm->addr + vm->region_size, NULL, 2807 virtio_mem_range_has_system_ram) == 1; 2808 } 2809 2810 static int virtio_mem_probe(struct virtio_device *vdev) 2811 { 2812 struct virtio_mem *vm; 2813 int rc; 2814 2815 BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24); 2816 BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10); 2817 2818 vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL); 2819 if (!vm) 2820 return -ENOMEM; 2821 2822 init_waitqueue_head(&vm->host_resp); 2823 vm->vdev = vdev; 2824 INIT_WORK(&vm->wq, virtio_mem_run_wq); 2825 mutex_init(&vm->hotplug_mutex); 2826 INIT_LIST_HEAD(&vm->next); 2827 spin_lock_init(&vm->removal_lock); 2828 hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 2829 vm->retry_timer.function = virtio_mem_timer_expired; 2830 vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS; 2831 vm->in_kdump = is_kdump_kernel(); 2832 2833 /* register the virtqueue */ 2834 rc = virtio_mem_init_vq(vm); 2835 if (rc) 2836 goto out_free_vm; 2837 2838 /* initialize the device by querying the config */ 2839 rc = virtio_mem_init(vm); 2840 if (rc) 2841 goto out_del_vq; 2842 2843 virtio_device_ready(vdev); 2844 2845 /* trigger a config update to start processing the requested_size */ 2846 if (!vm->in_kdump) { 2847 atomic_set(&vm->config_changed, 1); 2848 queue_work(system_freezable_wq, &vm->wq); 2849 } 2850 2851 return 0; 2852 out_del_vq: 2853 vdev->config->del_vqs(vdev); 2854 out_free_vm: 2855 kfree(vm); 2856 vdev->priv = NULL; 2857 2858 return rc; 2859 } 2860 2861 static void virtio_mem_deinit_hotplug(struct virtio_mem *vm) 2862 { 2863 unsigned long mb_id; 2864 int rc; 2865 2866 /* 2867 * Make sure the workqueue won't be triggered anymore and no memory 2868 * blocks can be onlined/offlined until we're finished here. 2869 */ 2870 mutex_lock(&vm->hotplug_mutex); 2871 spin_lock_irq(&vm->removal_lock); 2872 vm->removing = true; 2873 spin_unlock_irq(&vm->removal_lock); 2874 mutex_unlock(&vm->hotplug_mutex); 2875 2876 /* wait until the workqueue stopped */ 2877 cancel_work_sync(&vm->wq); 2878 hrtimer_cancel(&vm->retry_timer); 2879 2880 if (vm->in_sbm) { 2881 /* 2882 * After we unregistered our callbacks, user space can online 2883 * partially plugged offline blocks. Make sure to remove them. 2884 */ 2885 virtio_mem_sbm_for_each_mb(vm, mb_id, 2886 VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { 2887 rc = virtio_mem_sbm_remove_mb(vm, mb_id); 2888 BUG_ON(rc); 2889 virtio_mem_sbm_set_mb_state(vm, mb_id, 2890 VIRTIO_MEM_SBM_MB_UNUSED); 2891 } 2892 /* 2893 * After we unregistered our callbacks, user space can no longer 2894 * offline partially plugged online memory blocks. No need to 2895 * worry about them. 
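* Their unplugged parts simply remain fake-offline; as the memory itself stays added to Linux, the warning below will trigger.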
2896 */ 2897 } 2898 2899 /* unregister callbacks */ 2900 unregister_virtio_mem_device(vm); 2901 unregister_memory_notifier(&vm->memory_notifier); 2902 2903 /* 2904 * There is no way we could reliably remove all memory we have added to 2905 * the system. And there is no way to stop the driver/device from going 2906 * away. Warn at least. 2907 */ 2908 if (virtio_mem_has_memory_added(vm)) { 2909 dev_warn(&vm->vdev->dev, 2910 "device still has system memory added\n"); 2911 } else { 2912 virtio_mem_delete_resource(vm); 2913 kfree_const(vm->resource_name); 2914 memory_group_unregister(vm->mgid); 2915 } 2916 2917 /* remove all tracking data - no locking needed */ 2918 if (vm->in_sbm) { 2919 vfree(vm->sbm.mb_states); 2920 vfree(vm->sbm.sb_states); 2921 } else { 2922 vfree(vm->bbm.bb_states); 2923 } 2924 } 2925 2926 static void virtio_mem_deinit_kdump(struct virtio_mem *vm) 2927 { 2928 #ifdef CONFIG_PROC_VMCORE 2929 unregister_vmcore_cb(&vm->vmcore_cb); 2930 #endif /* CONFIG_PROC_VMCORE */ 2931 } 2932 2933 static void virtio_mem_remove(struct virtio_device *vdev) 2934 { 2935 struct virtio_mem *vm = vdev->priv; 2936 2937 if (vm->in_kdump) 2938 virtio_mem_deinit_kdump(vm); 2939 else 2940 virtio_mem_deinit_hotplug(vm); 2941 2942 /* reset the device and cleanup the queues */ 2943 virtio_reset_device(vdev); 2944 vdev->config->del_vqs(vdev); 2945 2946 kfree(vm); 2947 vdev->priv = NULL; 2948 } 2949 2950 static void virtio_mem_config_changed(struct virtio_device *vdev) 2951 { 2952 struct virtio_mem *vm = vdev->priv; 2953 2954 if (unlikely(vm->in_kdump)) 2955 return; 2956 2957 atomic_set(&vm->config_changed, 1); 2958 virtio_mem_retry(vm); 2959 } 2960 2961 #ifdef CONFIG_PM_SLEEP 2962 static int virtio_mem_freeze(struct virtio_device *vdev) 2963 { 2964 /* 2965 * When restarting the VM, all memory is usually unplugged. Don't 2966 * allow to suspend/hibernate. 2967 */ 2968 dev_err(&vdev->dev, "save/restore not supported.\n"); 2969 return -EPERM; 2970 } 2971 2972 static int virtio_mem_restore(struct virtio_device *vdev) 2973 { 2974 return -EPERM; 2975 } 2976 #endif 2977 2978 static unsigned int virtio_mem_features[] = { 2979 #if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA) 2980 VIRTIO_MEM_F_ACPI_PXM, 2981 #endif 2982 VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE, 2983 }; 2984 2985 static const struct virtio_device_id virtio_mem_id_table[] = { 2986 { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID }, 2987 { 0 }, 2988 }; 2989 2990 static struct virtio_driver virtio_mem_driver = { 2991 .feature_table = virtio_mem_features, 2992 .feature_table_size = ARRAY_SIZE(virtio_mem_features), 2993 .driver.name = KBUILD_MODNAME, 2994 .driver.owner = THIS_MODULE, 2995 .id_table = virtio_mem_id_table, 2996 .probe = virtio_mem_probe, 2997 .remove = virtio_mem_remove, 2998 .config_changed = virtio_mem_config_changed, 2999 #ifdef CONFIG_PM_SLEEP 3000 .freeze = virtio_mem_freeze, 3001 .restore = virtio_mem_restore, 3002 #endif 3003 }; 3004 3005 module_virtio_driver(virtio_mem_driver); 3006 MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table); 3007 MODULE_AUTHOR("David Hildenbrand <david@redhat.com>"); 3008 MODULE_DESCRIPTION("Virtio-mem driver"); 3009 MODULE_LICENSE("GPL"); 3010