// SPDX-License-Identifier: GPL-2.0
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>

#include "kvm_mm.h"

/*
 * Per-file state of a guest_memfd file, stored in file->private_data.
 *
 * @kvm:      the VM the file was created for; a reference is taken with
 *            kvm_get_kvm() in __kvm_gmem_create() and dropped with
 *            kvm_put_kvm() in kvm_gmem_release().
 * @bindings: xarray indexed by page offset into the file; each bound offset
 *            holds a pointer to the memslot bound there (see kvm_gmem_bind()
 *            and kvm_gmem_unbind()).
 * @entry:    link on the inode mapping's i_private_list, which the
 *            invalidation paths walk to reach every kvm_gmem of the inode.
 */
struct kvm_gmem {
	struct kvm *kvm;
	struct xarray bindings;
	struct list_head entry;
};

/*
 * Look up, or create, the folio backing @index in @inode's page cache and
 * make sure its contents have been zeroed once.
 *
 * Returns the folio, or NULL if filemap_grab_folio() returned an error
 * pointer or NULL (the specific error code is not propagated).  Callers in
 * this file call folio_unlock() on the returned folio, and folio_put() when
 * they do not hand the page out.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
	struct folio *folio;

	/* TODO: Support huge pages. */
	folio = filemap_grab_folio(inode->i_mapping, index);
	if (IS_ERR_OR_NULL(folio))
		return NULL;

	/*
	 * Use the up-to-date flag to track whether or not the memory has been
	 * zeroed before being handed off to the guest. There is no backing
	 * storage for the memory, so the folio will remain up-to-date until
	 * it's removed.
	 *
	 * TODO: Skip clearing pages when trusted firmware will do it when
	 * assigning memory to the guest.
	 */
	if (!folio_test_uptodate(folio)) {
		unsigned long nr_pages = folio_nr_pages(folio);
		unsigned long i;

		for (i = 0; i < nr_pages; i++)
			clear_highpage(folio_page(folio, i));

		folio_mark_uptodate(folio);
	}

	/*
	 * Ignore accessed, referenced, and dirty flags. The memory is
	 * unevictable and there is no storage to write back to.
	 */
	return folio;
}

/*
 * Start invalidating the guest mappings of every memslot bound to @gmem
 * within the file page-offset range [@start, @end).
 *
 * The MMU lock is taken, and kvm_mmu_invalidate_begin() called, only once
 * and only if at least one binding is found in the range.  Remote TLBs are
 * flushed if any kvm_mmu_unmap_gfn_range() call reported that a flush is
 * needed.  Callers in this file pair this with kvm_gmem_invalidate_end()
 * over the same range.
 */
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
				      pgoff_t end)
{
	bool flush = false, found_memslot = false;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
		pgoff_t pgoff = slot->gmem.pgoff;

		/*
		 * Clamp the file range to the part covered by this binding and
		 * translate it from file page offsets into the slot's gfns.
		 */
		struct kvm_gfn_range gfn_range = {
			.start = slot->base_gfn + max(pgoff, start) - pgoff,
			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
			.slot = slot,
			.may_block = true,
		};

		/* Lazily take the lock on the first binding found. */
		if (!found_memslot) {
			found_memslot = true;

			KVM_MMU_LOCK(kvm);
			kvm_mmu_invalidate_begin(kvm);
		}

		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
	}

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	if (found_memslot)
		KVM_MMU_UNLOCK(kvm);
}

/*
 * Finish an invalidation started by kvm_gmem_invalidate_begin() for the file
 * page-offset range [@start, @end).
 *
 * kvm_mmu_invalidate_end() is called, under the MMU lock, only if @gmem has
 * at least one binding in the range, mirroring the condition used on the
 * begin side.  xa_find() takes the index by pointer; @start is a by-value
 * parameter, so only this function's local copy is passed to it.
 */
static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
				    pgoff_t end)
{
	struct kvm *kvm = gmem->kvm;

	if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		KVM_MMU_LOCK(kvm);
		kvm_mmu_invalidate_end(kvm);
		KVM_MMU_UNLOCK(kvm);
	}
}

/*
 * FALLOC_FL_PUNCH_HOLE backend: invalidate guest mappings of the byte range
 * [@offset, @offset + @len) for every kvm_gmem on the inode's list and
 * truncate the corresponding page cache range.
 *
 * Runs with the mapping's invalidate lock held exclusively.  Always
 * returns 0.
 */
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
	pgoff_t start = offset >> PAGE_SHIFT;
	pgoff_t end = (offset + len) >> PAGE_SHIFT;
	struct kvm_gmem *gmem;

	/*
	 * Bindings must be stable across invalidation to ensure the start+end
	 * are balanced.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock(inode->i_mapping);

	return 0;
}

/*
 * Allocation backend of fallocate(): populate the page cache for the byte
 * range [@offset, @offset + @len) one folio at a time.
 *
 * Runs with the mapping's invalidate lock held shared.
 *
 * Returns 0 on success, -EINVAL if the range extends past the inode size,
 * -EINTR if a signal is pending for the current task, or -ENOMEM if a folio
 * could not be obtained.  Folios obtained before a failure are not removed
 * here.
 */
static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t start, index, end;
	int r;

	/* Dedicated guest is immutable by default. */
	if (offset + len > i_size_read(inode))
		return -EINVAL;

	filemap_invalidate_lock_shared(mapping);

	start = offset >> PAGE_SHIFT;
	end = (offset + len) >> PAGE_SHIFT;

	r = 0;
	for (index = start; index < end; ) {
		struct folio *folio;

		if (signal_pending(current)) {
			r = -EINTR;
			break;
		}

		folio = kvm_gmem_get_folio(inode, index);
		if (!folio) {
			r = -ENOMEM;
			break;
		}

		/* Advance past the whole folio, however many pages it spans. */
		index = folio_next_index(folio);

		folio_unlock(folio);
		folio_put(folio);

		/* 64-bit only, wrapping the index should be impossible. */
		if (WARN_ON_ONCE(!index))
			break;

		cond_resched();
	}

	filemap_invalidate_unlock_shared(mapping);

	return r;
}

/*
 * ->fallocate() handler for guest_memfd files.
 *
 * FALLOC_FL_KEEP_SIZE is mandatory and FALLOC_FL_PUNCH_HOLE is the only
 * other accepted mode bit; anything else yields -EOPNOTSUPP.  @offset and
 * @len must both be page aligned, otherwise -EINVAL is returned.  On success
 * file_modified() is called and 0 is returned; otherwise the error from the
 * punch-hole or allocate helper is returned.
 */
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
			       loff_t len)
{
	int ret;

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
		return -EINVAL;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
	else
		ret = kvm_gmem_allocate(file_inode(file), offset, len);

	if (!ret)
		file_modified(file);
	return ret;
}

/*
 * ->release() handler: tear down the kvm_gmem attached to @file.
 *
 * Clears the file pointer of every bound memslot, waits for RCU readers,
 * invalidates guest mappings for the whole file, unlinks the kvm_gmem from
 * the inode's list, frees it, and drops the VM reference held by the file.
 * Always returns 0.
 */
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
	struct kvm_gmem *gmem = file->private_data;
	struct kvm_memory_slot *slot;
	struct kvm *kvm = gmem->kvm;
	unsigned long index;

	/*
	 * Prevent concurrent attempts to *unbind* a memslot. This is the last
	 * reference to the file and thus no new bindings can be created, but
	 * dereferencing the slot for existing bindings needs to be protected
	 * against memslot updates, specifically so that unbind doesn't race
	 * and free the memslot (kvm_gmem_get_file() will return NULL).
	 */
	mutex_lock(&kvm->slots_lock);

	filemap_invalidate_lock(inode->i_mapping);

	xa_for_each(&gmem->bindings, index, slot)
		rcu_assign_pointer(slot->gmem.file, NULL);

	synchronize_rcu();

	/*
	 * All in-flight operations are gone and new bindings can be created.
	 * Zap all SPTEs pointed at by this file. Do not free the backing
	 * memory, as its lifetime is associated with the inode, not the file.
	 */
	kvm_gmem_invalidate_begin(gmem, 0, -1ul);
	kvm_gmem_invalidate_end(gmem, 0, -1ul);

	list_del(&gmem->entry);

	filemap_invalidate_unlock(inode->i_mapping);

	mutex_unlock(&kvm->slots_lock);

	xa_destroy(&gmem->bindings);
	kfree(gmem);

	/* Drop the reference taken by kvm_get_kvm() in __kvm_gmem_create(). */
	kvm_put_kvm(kvm);

	return 0;
}

/*
 * Get a reference to the guest_memfd file bound to @slot via
 * get_file_active().  Callers in this file treat a NULL return as "no usable
 * file" and drop the reference with fput() when done.
 */
static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
	/*
	 * Do not return slot->gmem.file if it has already been closed;
	 * there might be some time between the last fput() and when
	 * kvm_gmem_release() clears slot->gmem.file, and you do not
	 * want to spin in the meanwhile.
	 */
	return get_file_active(&slot->gmem.file);
}

/*
 * File operations of guest_memfd files.  Not const because ->owner is
 * filled in at runtime by kvm_gmem_init().  kvm_gmem_bind() also compares
 * file->f_op against this table to recognize guest_memfd files.
 */
static struct file_operations kvm_gmem_fops = {
	.open		= generic_file_open,
	.release	= kvm_gmem_release,
	.fallocate	= kvm_gmem_fallocate,
};

/* Record @module as the owner of the guest_memfd file operations. */
void kvm_gmem_init(struct module *module)
{
	kvm_gmem_fops.owner = module;
}

/*
 * ->migrate_folio() handler.  Migration is not expected to be attempted
 * (__kvm_gmem_create() marks the mapping unmovable), so warn once and fail
 * with -EINVAL.
 */
static int kvm_gmem_migrate_folio(struct address_space *mapping,
				  struct folio *dst, struct folio *src,
				  enum migrate_mode mode)
{
	WARN_ON_ONCE(1);
	return -EINVAL;
}

/*
 * ->error_remove_folio() handler: invalidate guest mappings of the pages
 * covered by @folio for every kvm_gmem on the mapping's list, without
 * removing the folio from the page cache.
 *
 * Runs with the mapping's invalidate lock held shared.  Always returns
 * MF_DELAYED.
 */
static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
	struct list_head *gmem_list = &mapping->i_private_list;
	struct kvm_gmem *gmem;
	pgoff_t start, end;

	filemap_invalidate_lock_shared(mapping);

	start = folio->index;
	end = start + folio_nr_pages(folio);

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_begin(gmem, start, end);

	/*
	 * Do not truncate the range, what action is taken in response to the
	 * error is userspace's decision (assuming the architecture supports
	 * gracefully handling memory errors). If/when the guest attempts to
	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
	 * at which point KVM can either terminate the VM or propagate the
	 * error to userspace.
	 */

	list_for_each_entry(gmem, gmem_list, entry)
		kvm_gmem_invalidate_end(gmem, start, end);

	filemap_invalidate_unlock_shared(mapping);

	return MF_DELAYED;
}

/* Address space operations installed on the inode by __kvm_gmem_create(). */
static const struct address_space_operations kvm_gmem_aops = {
	.dirty_folio = noop_dirty_folio,
	.migrate_folio	= kvm_gmem_migrate_folio,
	.error_remove_folio = kvm_gmem_error_folio,
};

/*
 * ->getattr() handler: fill @stat from the inode via generic_fillattr().
 * @query_flags is not used.  Always returns 0.
 */
static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
			    struct kstat *stat, u32 request_mask,
			    unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;

	generic_fillattr(idmap, request_mask, inode, stat);
	return 0;
}

/* ->setattr() handler: every attribute change is rejected with -EINVAL. */
static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			    struct iattr *attr)
{
	return -EINVAL;
}

/* Inode operations installed on the inode by __kvm_gmem_create(). */
static const struct inode_operations kvm_gmem_iops = {
	.getattr	= kvm_gmem_getattr,
	.setattr	= kvm_gmem_setattr,
};

/*
 * Create a guest_memfd file of @size bytes for @kvm and install it in the
 * calling task's file descriptor table.
 *
 * @flags is stored verbatim in inode->i_private.  Validation of @size and
 * @flags is done by the caller, kvm_gmem_create().
 *
 * Returns the new file descriptor on success, or a negative errno: the
 * error from get_unused_fd_flags(), -ENOMEM if the kvm_gmem allocation
 * fails, or the error from anon_inode_create_getfile().
 */
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
	const char *anon_name = "[kvm-gmem]";
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int fd, err;

	fd = get_unused_fd_flags(0);
	if (fd < 0)
		return fd;

	gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
	if (!gmem) {
		err = -ENOMEM;
		goto err_fd;
	}

	file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
					 O_RDWR, NULL);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		goto err_gmem;
	}

	file->f_flags |= O_LARGEFILE;

	inode = file->f_inode;
	WARN_ON(file->f_mapping != inode->i_mapping);

	inode->i_private = (void *)(unsigned long)flags;
	inode->i_op = &kvm_gmem_iops;
	inode->i_mapping->a_ops = &kvm_gmem_aops;
	inode->i_mode |= S_IFREG;
	inode->i_size = size;
	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
	mapping_set_unmovable(inode->i_mapping);
	/* Unmovable mappings are supposed to be marked unevictable as well. */
	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

	/* The file holds a VM reference until kvm_gmem_release(). */
	kvm_get_kvm(kvm);
	gmem->kvm = kvm;
	xa_init(&gmem->bindings);
	list_add(&gmem->entry, &inode->i_mapping->i_private_list);

	/* Nothing below can fail, so the fd is published last. */
	fd_install(fd, file);
	return fd;

err_gmem:
	kfree(gmem);
err_fd:
	put_unused_fd(fd);
	return err;
}

/*
 * Validate the creation arguments in @args and create a guest_memfd file
 * for @kvm.
 *
 * No flag bits are currently accepted (valid_flags is 0), and the size must
 * be positive and page aligned; otherwise -EINVAL is returned.  On success
 * the result of __kvm_gmem_create() is returned.
 */
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
	loff_t size = args->size;
	u64 flags = args->flags;
	u64 valid_flags = 0;

	if (flags & ~valid_flags)
		return -EINVAL;

	if (size <= 0 || !PAGE_ALIGNED(size))
		return -EINVAL;

	return __kvm_gmem_create(kvm, size, flags);
}

/*
 * Bind @slot to the guest_memfd file referred to by @fd, starting at byte
 * @offset in the file.
 *
 * Returns 0 on success, -EBADF if @fd does not resolve to a file, or
 * -EINVAL if the file is not a guest_memfd file, belongs to a different VM,
 * @offset is negative or not page aligned, the slot would extend past the
 * end of the file, or the target range overlaps an existing binding.
 *
 * The file reference obtained via fget() is always dropped before
 * returning.
 */
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
		  unsigned int fd, loff_t offset)
{
	loff_t size = slot->npages << PAGE_SHIFT;
	unsigned long start, end;
	struct kvm_gmem *gmem;
	struct inode *inode;
	struct file *file;
	int r = -EINVAL;

	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

	file = fget(fd);
	if (!file)
		return -EBADF;

	if (file->f_op != &kvm_gmem_fops)
		goto err;

	gmem = file->private_data;
	if (gmem->kvm != kvm)
		goto err;

	inode = file_inode(file);

	if (offset < 0 || !PAGE_ALIGNED(offset) ||
	    offset + size > i_size_read(inode))
		goto err;

	filemap_invalidate_lock(inode->i_mapping);

	start = offset >> PAGE_SHIFT;
	end = start + slot->npages;

	/* Refuse to bind over any range that already has a binding. */
	if (!xa_empty(&gmem->bindings) &&
	    xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
		filemap_invalidate_unlock(inode->i_mapping);
		goto err;
	}

	/*
	 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
	 * see either a NULL file or this new file, no need for them to go
	 * away.
	 */
	rcu_assign_pointer(slot->gmem.file, file);
	slot->gmem.pgoff = start;

	/*
	 * NOTE(review): the result of xa_store_range() is not checked here;
	 * confirm whether an allocation failure needs to be handled.
	 */
	xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);

	/*
	 * Drop the reference to the file, even on success. The file pins KVM,
	 * not the other way 'round. Active bindings are invalidated if the
	 * file is closed before memslots are destroyed.
	 */
	r = 0;
err:
	fput(file);
	return r;
}

/*
 * Remove the binding between @slot and its guest_memfd file: clear the
 * slot's range in the file's bindings xarray, clear the slot's file
 * pointer, and wait for RCU readers before releasing the invalidate lock.
 */
void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
	unsigned long start = slot->gmem.pgoff;
	unsigned long end = start + slot->npages;
	struct kvm_gmem *gmem;
	struct file *file;

	/*
	 * Nothing to do if the underlying file was already closed (or is being
	 * closed right now), kvm_gmem_release() invalidates all bindings.
	 */
	file = kvm_gmem_get_file(slot);
	if (!file)
		return;

	gmem = file->private_data;

	filemap_invalidate_lock(file->f_mapping);
	xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
	rcu_assign_pointer(slot->gmem.file, NULL);
	synchronize_rcu();
	filemap_invalidate_unlock(file->f_mapping);

	/* Drop the reference taken by kvm_gmem_get_file() above. */
	fput(file);
}

/*
 * Resolve @gfn in @slot to the pfn of its guest_memfd backing page, creating
 * and zeroing the backing folio if it does not exist yet.
 *
 * @pfn receives the page frame number on success.  @max_order may be NULL;
 * when non-NULL it is set to 0 on success.  @kvm is not used by this
 * function.
 *
 * Returns 0 on success, -EFAULT if the slot has no usable file, -EIO (with
 * a one-time warning) if the file's binding at the computed index is not
 * @slot, -ENOMEM if the folio could not be obtained, or -EHWPOISON if the
 * folio is marked hardware-poisoned.
 *
 * On success the folio is unlocked but folio_put() is not called, so the
 * reference obtained from kvm_gmem_get_folio() is kept; presumably the
 * caller is responsible for releasing it - confirm against callers.
 */
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
{
	pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
	struct kvm_gmem *gmem;
	struct folio *folio;
	struct page *page;
	struct file *file;
	int r;

	file = kvm_gmem_get_file(slot);
	if (!file)
		return -EFAULT;

	gmem = file->private_data;

	if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
		r = -EIO;
		goto out_fput;
	}

	folio = kvm_gmem_get_folio(file_inode(file), index);
	if (!folio) {
		r = -ENOMEM;
		goto out_fput;
	}

	if (folio_test_hwpoison(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		r = -EHWPOISON;
		goto out_fput;
	}

	page = folio_file_page(folio, index);

	*pfn = page_to_pfn(page);
	if (max_order)
		*max_order = 0;

	r = 0;

	folio_unlock(folio);
out_fput:
	fput(file);

	return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);