1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * AMD SVM-SEV support 6 * 7 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 8 */ 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/kvm_types.h> 12 #include <linux/kvm_host.h> 13 #include <linux/kernel.h> 14 #include <linux/highmem.h> 15 #include <linux/psp.h> 16 #include <linux/psp-sev.h> 17 #include <linux/pagemap.h> 18 #include <linux/swap.h> 19 #include <linux/misc_cgroup.h> 20 #include <linux/processor.h> 21 #include <linux/trace_events.h> 22 #include <uapi/linux/sev-guest.h> 23 24 #include <asm/pkru.h> 25 #include <asm/trapnr.h> 26 #include <asm/fpu/xcr.h> 27 #include <asm/fpu/xstate.h> 28 #include <asm/debugreg.h> 29 #include <asm/msr.h> 30 #include <asm/sev.h> 31 32 #include "mmu.h" 33 #include "x86.h" 34 #include "svm.h" 35 #include "svm_ops.h" 36 #include "cpuid.h" 37 #include "trace.h" 38 39 #define GHCB_VERSION_MAX 2ULL 40 #define GHCB_VERSION_MIN 1ULL 41 42 #define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) 43 44 /* 45 * The GHCB spec essentially states that all non-zero error codes other than 46 * those explicitly defined above should be treated as an error by the guest. 47 * Define a generic error to cover that case, and choose a value that is not 48 * likely to overlap with new explicit error codes should more be added to 49 * the GHCB spec later. KVM will use this to report generic errors when 50 * handling SNP guest requests. 
51 */ 52 #define SNP_GUEST_VMM_ERR_GENERIC (~0U) 53 54 /* enable/disable SEV support */ 55 static bool __ro_after_init sev_enabled = true; 56 module_param_named(sev, sev_enabled, bool, 0444); 57 58 /* enable/disable SEV-ES support */ 59 static bool __ro_after_init sev_es_enabled = true; 60 module_param_named(sev_es, sev_es_enabled, bool, 0444); 61 62 /* enable/disable SEV-SNP support */ 63 static bool __ro_after_init sev_snp_enabled = true; 64 module_param_named(sev_snp, sev_snp_enabled, bool, 0444); 65 66 static unsigned int __ro_after_init nr_ciphertext_hiding_asids; 67 module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444); 68 69 #define AP_RESET_HOLD_NONE 0 70 #define AP_RESET_HOLD_NAE_EVENT 1 71 #define AP_RESET_HOLD_MSR_PROTO 2 72 73 /* 74 * SEV-SNP policy bits that can be supported by KVM. These include policy bits 75 * that have implementation support within KVM or policy bits that do not 76 * require implementation support within KVM to enforce the policy. 
77 */ 78 #define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ 79 SNP_POLICY_MASK_API_MAJOR | \ 80 SNP_POLICY_MASK_SMT | \ 81 SNP_POLICY_MASK_RSVD_MBO | \ 82 SNP_POLICY_MASK_DEBUG | \ 83 SNP_POLICY_MASK_SINGLE_SOCKET | \ 84 SNP_POLICY_MASK_CXL_ALLOW | \ 85 SNP_POLICY_MASK_MEM_AES_256_XTS | \ 86 SNP_POLICY_MASK_RAPL_DIS | \ 87 SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \ 88 SNP_POLICY_MASK_PAGE_SWAP_DISABLE) 89 90 static u64 snp_supported_policy_bits __ro_after_init; 91 92 static u64 sev_supported_vmsa_features __ro_after_init; 93 94 #define INITIAL_VMSA_GPA 0xFFFFFFFFF000 95 96 static u8 sev_enc_bit; 97 static DECLARE_RWSEM(sev_deactivate_lock); 98 static DEFINE_MUTEX(sev_bitmap_lock); 99 unsigned int max_sev_asid; 100 static unsigned int min_sev_asid; 101 static unsigned int max_sev_es_asid; 102 static unsigned int min_sev_es_asid; 103 static unsigned int max_snp_asid; 104 static unsigned int min_snp_asid; 105 static unsigned long sev_me_mask; 106 static unsigned int nr_asids; 107 static unsigned long *sev_asid_bitmap; 108 static unsigned long *sev_reclaim_asid_bitmap; 109 110 static __always_inline void kvm_lockdep_assert_sev_lock_held(struct kvm *kvm) 111 { 112 #ifdef CONFIG_PROVE_LOCKING 113 /* 114 * Querying SEV+ support is safe if there are no other references, i.e. 115 * if concurrent initialization of SEV+ is impossible. 116 */ 117 if (!refcount_read(&kvm->users_count)) 118 return; 119 120 /* 121 * Querying SEV+ support from vCPU context is always safe, as vCPUs can 122 * only be created after SEV+ is initialized (and KVM disallows all SEV 123 * sub-ioctls while vCPU creation is in-progress). 
124 */ 125 if (kvm_get_running_vcpu()) 126 return; 127 128 lockdep_assert_held(&kvm->lock); 129 #endif 130 } 131 132 static bool sev_guest(struct kvm *kvm) 133 { 134 kvm_lockdep_assert_sev_lock_held(kvm); 135 return ____sev_guest(kvm); 136 } 137 static bool sev_es_guest(struct kvm *kvm) 138 { 139 kvm_lockdep_assert_sev_lock_held(kvm); 140 return ____sev_es_guest(kvm); 141 } 142 143 static bool sev_snp_guest(struct kvm *kvm) 144 { 145 kvm_lockdep_assert_sev_lock_held(kvm); 146 return ____sev_snp_guest(kvm); 147 } 148 149 static int snp_decommission_context(struct kvm *kvm); 150 151 struct enc_region { 152 struct list_head list; 153 unsigned long npages; 154 struct page **pages; 155 unsigned long uaddr; 156 unsigned long size; 157 }; 158 159 /* Called with the sev_bitmap_lock held, or on shutdown */ 160 static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) 161 { 162 int ret, error = 0; 163 unsigned int asid; 164 165 /* Check if there are any ASIDs to reclaim before performing a flush */ 166 asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); 167 if (asid > max_asid) 168 return -EBUSY; 169 170 /* 171 * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail, 172 * so it must be guarded. 173 */ 174 down_write(&sev_deactivate_lock); 175 176 /* SNP firmware requires use of WBINVD for ASID recycling. */ 177 wbinvd_on_all_cpus(); 178 179 if (sev_snp_enabled) 180 ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error); 181 else 182 ret = sev_guest_df_flush(&error); 183 184 up_write(&sev_deactivate_lock); 185 186 if (ret) 187 pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n", 188 sev_snp_enabled ? 
"-SNP" : "", ret, error); 189 190 return ret; 191 } 192 193 static inline bool is_mirroring_enc_context(struct kvm *kvm) 194 { 195 return !!to_kvm_sev_info(kvm)->enc_context_owner; 196 } 197 198 static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) 199 { 200 struct kvm_vcpu *vcpu = &svm->vcpu; 201 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 202 203 return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; 204 } 205 206 static bool snp_is_secure_tsc_enabled(struct kvm *kvm) 207 { 208 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 209 210 return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) && 211 !WARN_ON_ONCE(!sev_snp_guest(kvm)); 212 } 213 214 /* Must be called with the sev_bitmap_lock held */ 215 static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) 216 { 217 if (sev_flush_asids(min_asid, max_asid)) 218 return false; 219 220 /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */ 221 bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, 222 nr_asids); 223 bitmap_zero(sev_reclaim_asid_bitmap, nr_asids); 224 225 return true; 226 } 227 228 static int sev_misc_cg_try_charge(struct kvm_sev_info *sev) 229 { 230 enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; 231 return misc_cg_try_charge(type, sev->misc_cg, 1); 232 } 233 234 static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) 235 { 236 enum misc_res_type type = sev->es_active ? 
MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; 237 misc_cg_uncharge(type, sev->misc_cg, 1); 238 } 239 240 static unsigned int sev_alloc_asid(unsigned int min_asid, unsigned int max_asid) 241 { 242 unsigned int asid; 243 bool retry = true; 244 245 guard(mutex)(&sev_bitmap_lock); 246 247 again: 248 asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); 249 if (asid > max_asid) { 250 if (retry && __sev_recycle_asids(min_asid, max_asid)) { 251 retry = false; 252 goto again; 253 } 254 255 return asid; 256 } 257 258 __set_bit(asid, sev_asid_bitmap); 259 return asid; 260 } 261 262 static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type) 263 { 264 /* 265 * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. 266 * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. 267 */ 268 unsigned int min_asid, max_asid, asid; 269 int ret; 270 271 if (vm_type == KVM_X86_SNP_VM) { 272 min_asid = min_snp_asid; 273 max_asid = max_snp_asid; 274 } else if (sev->es_active) { 275 min_asid = min_sev_es_asid; 276 max_asid = max_sev_es_asid; 277 } else { 278 min_asid = min_sev_asid; 279 max_asid = max_sev_asid; 280 } 281 282 /* 283 * The min ASID can end up larger than the max if basic SEV support is 284 * effectively disabled by disallowing use of ASIDs for SEV guests. 285 * Similarly for SEV-ES guests the min ASID can end up larger than the 286 * max when ciphertext hiding is enabled, effectively disabling SEV-ES 287 * support. 
288 */ 289 if (min_asid > max_asid) 290 return -ENOTTY; 291 292 WARN_ON_ONCE(sev->misc_cg); 293 sev->misc_cg = get_current_misc_cg(); 294 ret = sev_misc_cg_try_charge(sev); 295 if (ret) 296 goto e_put_cg; 297 298 asid = sev_alloc_asid(min_asid, max_asid); 299 if (asid > max_asid) { 300 ret = -EBUSY; 301 goto e_uncharge; 302 } 303 304 sev->asid = asid; 305 return 0; 306 307 e_uncharge: 308 sev_misc_cg_uncharge(sev); 309 e_put_cg: 310 put_misc_cg(sev->misc_cg); 311 sev->misc_cg = NULL; 312 return ret; 313 } 314 315 static unsigned int sev_get_asid(struct kvm *kvm) 316 { 317 return to_kvm_sev_info(kvm)->asid; 318 } 319 320 static void sev_asid_free(struct kvm_sev_info *sev) 321 { 322 struct svm_cpu_data *sd; 323 int cpu; 324 325 mutex_lock(&sev_bitmap_lock); 326 327 __set_bit(sev->asid, sev_reclaim_asid_bitmap); 328 329 for_each_possible_cpu(cpu) { 330 sd = per_cpu_ptr(&svm_data, cpu); 331 sd->sev_vmcbs[sev->asid] = NULL; 332 } 333 334 mutex_unlock(&sev_bitmap_lock); 335 336 sev_misc_cg_uncharge(sev); 337 put_misc_cg(sev->misc_cg); 338 sev->misc_cg = NULL; 339 } 340 341 static void sev_decommission(unsigned int handle) 342 { 343 struct sev_data_decommission decommission; 344 345 if (!handle) 346 return; 347 348 decommission.handle = handle; 349 sev_guest_decommission(&decommission, NULL); 350 } 351 352 /* 353 * Transition a page to hypervisor-owned/shared state in the RMP table. This 354 * should not fail under normal conditions, but leak the page should that 355 * happen since it will no longer be usable by the host due to RMP protections. 
356 */ 357 static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level) 358 { 359 if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) { 360 snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); 361 return -EIO; 362 } 363 364 return 0; 365 } 366 367 /* 368 * Certain page-states, such as Pre-Guest and Firmware pages (as documented 369 * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be 370 * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE 371 * unless they are reclaimed first. 372 * 373 * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they 374 * might not be usable by the host due to being set as immutable or still 375 * being associated with a guest ASID. 376 * 377 * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be 378 * converted back to shared, as the page is no longer usable due to RMP 379 * protections, and it's infeasible for the guest to continue on. 380 */ 381 static int snp_page_reclaim(struct kvm *kvm, u64 pfn) 382 { 383 struct sev_data_snp_page_reclaim data = {0}; 384 int fw_err, rc; 385 386 data.paddr = __sme_set(pfn << PAGE_SHIFT); 387 rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err); 388 if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) { 389 snp_leak_pages(pfn, 1); 390 return -EIO; 391 } 392 393 if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K)) 394 return -EIO; 395 396 return rc; 397 } 398 399 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) 400 { 401 struct sev_data_deactivate deactivate; 402 403 if (!handle) 404 return; 405 406 deactivate.handle = handle; 407 408 /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */ 409 down_read(&sev_deactivate_lock); 410 sev_guest_deactivate(&deactivate, NULL); 411 up_read(&sev_deactivate_lock); 412 413 sev_decommission(handle); 414 } 415 416 /* 417 * This sets up bounce buffers/firmware pages to handle SNP Guest Request 418 
 * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
 * 2.0 specification for more details.
 *
 * Technically, when an SNP Guest Request is issued, the guest will provide its
 * own request/response pages, which could in theory be passed along directly
 * to firmware rather than using bounce pages. However, these pages would need
 * special care:
 *
 * - Both pages are from shared guest memory, so they need to be protected
 *   from migration/etc. occurring while firmware reads/writes to them. At a
 *   minimum, this requires elevating the ref counts and potentially needing
 *   an explicit pinning of the memory. This places additional restrictions
 *   on what type of memory backends userspace can use for shared guest
 *   memory since there is some reliance on using refcounted pages.
 *
 * - The response page needs to be switched to Firmware-owned[1] state
 *   before the firmware can write to it, which can lead to potential
 *   host RMP #PFs if the guest is misbehaved and hands the host a
 *   guest page that KVM might write to for other reasons (e.g. virtio
 *   buffers/etc.).
 *
 * Both of these issues can be avoided completely by using separately-allocated
 * bounce pages for both the request/response pages and passing those to
 * firmware instead. So that's what is being set up here.
 *
 * Guest requests rely on message sequence numbers to ensure requests are
 * issued to firmware in the order the guest issues them, so concurrent guest
 * requests generally shouldn't happen. But a misbehaved guest could issue
 * concurrent guest requests in theory, so a mutex is used to serialize
 * access to the bounce buffers.
 *
 * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
 *     details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
 *     in the APM for details on the related RMP restrictions.
452 */ 453 static int snp_guest_req_init(struct kvm *kvm) 454 { 455 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 456 struct page *req_page; 457 458 req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 459 if (!req_page) 460 return -ENOMEM; 461 462 sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 463 if (!sev->guest_resp_buf) { 464 __free_page(req_page); 465 return -EIO; 466 } 467 468 sev->guest_req_buf = page_address(req_page); 469 mutex_init(&sev->guest_req_mutex); 470 471 return 0; 472 } 473 474 static void snp_guest_req_cleanup(struct kvm *kvm) 475 { 476 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 477 478 if (sev->guest_resp_buf) 479 snp_free_firmware_page(sev->guest_resp_buf); 480 481 if (sev->guest_req_buf) 482 __free_page(virt_to_page(sev->guest_req_buf)); 483 484 sev->guest_req_buf = NULL; 485 sev->guest_resp_buf = NULL; 486 } 487 488 static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, 489 struct kvm_sev_init *data, 490 unsigned long vm_type) 491 { 492 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 493 struct sev_platform_init_args init_args = {0}; 494 bool es_active = vm_type != KVM_X86_SEV_VM; 495 bool snp_active = vm_type == KVM_X86_SNP_VM; 496 u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; 497 int ret; 498 499 if (kvm->created_vcpus) 500 return -EINVAL; 501 502 if (data->flags) 503 return -EINVAL; 504 505 if (!snp_active) 506 valid_vmsa_features &= ~SVM_SEV_FEAT_SECURE_TSC; 507 508 if (data->vmsa_features & ~valid_vmsa_features) 509 return -EINVAL; 510 511 if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) 512 return -EINVAL; 513 514 /* 515 * KVM supports the full range of mandatory features defined by version 516 * 2 of the GHCB protocol, so default to that for SEV-ES guests created 517 * via KVM_SEV_INIT2 (KVM_SEV_INIT forces version 1). 
518 */ 519 if (es_active && !data->ghcb_version) 520 data->ghcb_version = 2; 521 522 if (snp_active && data->ghcb_version < 2) 523 return -EINVAL; 524 525 if (unlikely(sev->active)) 526 return -EINVAL; 527 528 sev->active = true; 529 sev->es_active = es_active; 530 sev->vmsa_features = data->vmsa_features; 531 sev->ghcb_version = data->ghcb_version; 532 533 if (snp_active) 534 sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; 535 536 ret = sev_asid_new(sev, vm_type); 537 if (ret) 538 goto e_no_asid; 539 540 init_args.probe = false; 541 ret = sev_platform_init(&init_args); 542 if (ret) 543 goto e_free_asid; 544 545 if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 546 ret = -ENOMEM; 547 goto e_free_asid; 548 } 549 550 /* This needs to happen after SEV/SNP firmware initialization. */ 551 if (snp_active) { 552 ret = snp_guest_req_init(kvm); 553 if (ret) 554 goto e_free; 555 } 556 557 INIT_LIST_HEAD(&sev->regions_list); 558 INIT_LIST_HEAD(&sev->mirror_vms); 559 sev->need_init = false; 560 561 kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV); 562 563 return 0; 564 565 e_free: 566 free_cpumask_var(sev->have_run_cpus); 567 e_free_asid: 568 argp->error = init_args.error; 569 sev_asid_free(sev); 570 sev->asid = 0; 571 e_no_asid: 572 sev->vmsa_features = 0; 573 sev->es_active = false; 574 sev->active = false; 575 return ret; 576 } 577 578 static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) 579 { 580 struct kvm_sev_init data = { 581 .vmsa_features = 0, 582 .ghcb_version = 0, 583 }; 584 unsigned long vm_type; 585 586 if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM) 587 return -EINVAL; 588 589 vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM); 590 591 /* 592 * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will 593 * continue to only ever support the minimal GHCB protocol version. 
594 */ 595 if (vm_type == KVM_X86_SEV_ES_VM) 596 data.ghcb_version = GHCB_VERSION_MIN; 597 598 return __sev_guest_init(kvm, argp, &data, vm_type); 599 } 600 601 static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) 602 { 603 struct kvm_sev_init data; 604 605 if (!to_kvm_sev_info(kvm)->need_init) 606 return -EINVAL; 607 608 if (kvm->arch.vm_type != KVM_X86_SEV_VM && 609 kvm->arch.vm_type != KVM_X86_SEV_ES_VM && 610 kvm->arch.vm_type != KVM_X86_SNP_VM) 611 return -EINVAL; 612 613 if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) 614 return -EFAULT; 615 616 return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type); 617 } 618 619 static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) 620 { 621 unsigned int asid = sev_get_asid(kvm); 622 struct sev_data_activate activate; 623 int ret; 624 625 /* activate ASID on the given handle */ 626 activate.handle = handle; 627 activate.asid = asid; 628 ret = sev_guest_activate(&activate, error); 629 630 return ret; 631 } 632 633 static int __sev_issue_cmd(int fd, int id, void *data, int *error) 634 { 635 CLASS(fd, f)(fd); 636 637 if (fd_empty(f)) 638 return -EBADF; 639 640 return sev_issue_cmd_external_user(fd_file(f), id, data, error); 641 } 642 643 static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) 644 { 645 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 646 647 return __sev_issue_cmd(sev->fd, id, data, error); 648 } 649 650 static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 651 { 652 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 653 struct sev_data_launch_start start; 654 struct kvm_sev_launch_start params; 655 void *dh_blob, *session_blob; 656 int *error = &argp->error; 657 int ret; 658 659 if (!sev_guest(kvm)) 660 return -ENOTTY; 661 662 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 663 return -EFAULT; 664 665 memset(&start, 0, sizeof(start)); 666 667 dh_blob = NULL; 668 if (params.dh_uaddr) { 669 
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len); 670 if (IS_ERR(dh_blob)) 671 return PTR_ERR(dh_blob); 672 673 start.dh_cert_address = __sme_set(__pa(dh_blob)); 674 start.dh_cert_len = params.dh_len; 675 } 676 677 session_blob = NULL; 678 if (params.session_uaddr) { 679 session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len); 680 if (IS_ERR(session_blob)) { 681 ret = PTR_ERR(session_blob); 682 goto e_free_dh; 683 } 684 685 start.session_address = __sme_set(__pa(session_blob)); 686 start.session_len = params.session_len; 687 } 688 689 start.handle = params.handle; 690 start.policy = params.policy; 691 692 /* create memory encryption context */ 693 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error); 694 if (ret) 695 goto e_free_session; 696 697 /* Bind ASID to this guest */ 698 ret = sev_bind_asid(kvm, start.handle, error); 699 if (ret) { 700 sev_decommission(start.handle); 701 goto e_free_session; 702 } 703 704 /* return handle to userspace */ 705 params.handle = start.handle; 706 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) { 707 sev_unbind_asid(kvm, start.handle); 708 ret = -EFAULT; 709 goto e_free_session; 710 } 711 712 sev->policy = params.policy; 713 sev->handle = start.handle; 714 sev->fd = argp->sev_fd; 715 716 e_free_session: 717 kfree(session_blob); 718 e_free_dh: 719 kfree(dh_blob); 720 return ret; 721 } 722 723 static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, 724 unsigned long ulen, unsigned long *n, 725 unsigned int flags) 726 { 727 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 728 unsigned long npages, total_npages, lock_limit; 729 struct page **pages; 730 int npinned, ret; 731 732 lockdep_assert_held(&kvm->lock); 733 734 if (ulen == 0 || uaddr + ulen < uaddr) 735 return ERR_PTR(-EINVAL); 736 737 /* 738 * Calculate the number of pages that need to be pinned to cover the 739 * entire range. Note! 
This isn't simply PFN_DOWN(ulen), as KVM 740 * doesn't require the incoming address+size to be page aligned! 741 */ 742 npages = PFN_DOWN(uaddr + ulen - 1) - PFN_DOWN(uaddr) + 1; 743 if (npages > INT_MAX) 744 return ERR_PTR(-EINVAL); 745 746 total_npages = sev->pages_locked + npages; 747 if (total_npages > totalram_pages()) 748 return ERR_PTR(-EINVAL); 749 750 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 751 if (total_npages > lock_limit && !capable(CAP_IPC_LOCK)) { 752 pr_err("SEV: %lu total pages would exceed the lock limit of %lu.\n", 753 total_npages, lock_limit); 754 return ERR_PTR(-ENOMEM); 755 } 756 757 /* 758 * Don't WARN if the kernel (rightly) thinks the total size is absurd, 759 * i.e. rely on the kernel to reject outrageous range sizes. The above 760 * check on the number of pages is purely to avoid truncation as 761 * pin_user_pages_fast() takes the number of pages as a 32-bit int. 762 */ 763 pages = kvzalloc_objs(*pages, npages, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 764 if (!pages) 765 return ERR_PTR(-ENOMEM); 766 767 /* Pin the user virtual address. 
*/ 768 npinned = pin_user_pages_fast(uaddr, npages, flags, pages); 769 if (npinned != npages) { 770 pr_err("SEV: Failure locking %lu pages.\n", npages); 771 ret = -ENOMEM; 772 goto err; 773 } 774 775 *n = npages; 776 sev->pages_locked = total_npages; 777 778 return pages; 779 780 err: 781 if (npinned > 0) 782 unpin_user_pages(pages, npinned); 783 784 kvfree(pages); 785 return ERR_PTR(ret); 786 } 787 788 static void sev_unpin_memory(struct kvm *kvm, struct page **pages, 789 unsigned long npages) 790 { 791 unpin_user_pages(pages, npages); 792 kvfree(pages); 793 to_kvm_sev_info(kvm)->pages_locked -= npages; 794 } 795 796 static void sev_clflush_pages(struct page *pages[], unsigned long npages) 797 { 798 uint8_t *page_virtual; 799 unsigned long i; 800 801 if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 || 802 pages == NULL) 803 return; 804 805 for (i = 0; i < npages; i++) { 806 page_virtual = kmap_local_page(pages[i]); 807 clflush_cache_range(page_virtual, PAGE_SIZE); 808 kunmap_local(page_virtual); 809 cond_resched(); 810 } 811 } 812 813 static void sev_writeback_caches(struct kvm *kvm) 814 { 815 /* 816 * Ensure that all dirty guest tagged cache entries are written back 817 * before releasing the pages back to the system for use. CLFLUSH will 818 * not do this without SME_COHERENT, and flushing many cache lines 819 * individually is slower than blasting WBINVD for large VMs, so issue 820 * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported) 821 * on CPUs that have done VMRUN, i.e. may have dirtied data using the 822 * VM's ASID. 823 * 824 * For simplicity, never remove CPUs from the bitmap. Ideally, KVM 825 * would clear the mask when flushing caches, but doing so requires 826 * serializing multiple calls and having responding CPUs (to the IPI) 827 * mark themselves as still running if they are running (or about to 828 * run) a vCPU for the VM. 
829 * 830 * Note, the caller is responsible for ensuring correctness if the mask 831 * can be modified, e.g. if a CPU could be doing VMRUN. 832 */ 833 wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus); 834 } 835 836 static unsigned long get_num_contig_pages(unsigned long idx, 837 struct page **inpages, unsigned long npages) 838 { 839 unsigned long paddr, next_paddr; 840 unsigned long i = idx + 1, pages = 1; 841 842 /* find the number of contiguous pages starting from idx */ 843 paddr = __sme_page_pa(inpages[idx]); 844 while (i < npages) { 845 next_paddr = __sme_page_pa(inpages[i++]); 846 if ((paddr + PAGE_SIZE) == next_paddr) { 847 pages++; 848 paddr = next_paddr; 849 continue; 850 } 851 break; 852 } 853 854 return pages; 855 } 856 857 static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 858 { 859 unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; 860 struct kvm_sev_launch_update_data params; 861 struct sev_data_launch_update_data data; 862 struct page **inpages; 863 int ret; 864 865 if (!sev_guest(kvm)) 866 return -ENOTTY; 867 868 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 869 return -EFAULT; 870 871 vaddr = params.uaddr; 872 size = params.len; 873 vaddr_end = vaddr + size; 874 875 /* Lock the user memory. */ 876 inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); 877 if (IS_ERR(inpages)) 878 return PTR_ERR(inpages); 879 880 /* 881 * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in 882 * place; the cache may contain the data that was written unencrypted. 883 */ 884 sev_clflush_pages(inpages, npages); 885 886 data.reserved = 0; 887 data.handle = to_kvm_sev_info(kvm)->handle; 888 889 for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { 890 int offset, len; 891 892 /* 893 * If the user buffer is not page-aligned, calculate the offset 894 * within the page. 
895 */ 896 offset = vaddr & (PAGE_SIZE - 1); 897 898 /* Calculate the number of pages that can be encrypted in one go. */ 899 pages = get_num_contig_pages(i, inpages, npages); 900 901 len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size); 902 903 data.len = len; 904 data.address = __sme_page_pa(inpages[i]) + offset; 905 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error); 906 if (ret) 907 goto e_unpin; 908 909 size -= len; 910 next_vaddr = vaddr + len; 911 } 912 913 e_unpin: 914 /* content of memory is updated, mark pages dirty */ 915 for (i = 0; i < npages; i++) { 916 set_page_dirty_lock(inpages[i]); 917 mark_page_accessed(inpages[i]); 918 } 919 /* unlock the user pages */ 920 sev_unpin_memory(kvm, inpages, npages); 921 return ret; 922 } 923 924 static int sev_es_sync_vmsa(struct vcpu_svm *svm) 925 { 926 struct kvm_vcpu *vcpu = &svm->vcpu; 927 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 928 struct sev_es_save_area *save = svm->sev_es.vmsa; 929 struct xregs_state *xsave; 930 const u8 *s; 931 u8 *d; 932 int i; 933 934 lockdep_assert_held(&vcpu->mutex); 935 936 if (vcpu->arch.guest_state_protected) 937 return -EINVAL; 938 939 /* Check some debug related fields before encrypting the VMSA */ 940 if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) 941 return -EINVAL; 942 943 /* 944 * SEV-ES will use a VMSA that is pointed to by the VMCB, not 945 * the traditional VMSA that is part of the VMCB. Copy the 946 * traditional VMSA as it has been built so far (in prep 947 * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. 
948 */ 949 memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save)); 950 951 /* Sync registgers */ 952 save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX]; 953 save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX]; 954 save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 955 save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX]; 956 save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP]; 957 save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP]; 958 save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI]; 959 save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI]; 960 #ifdef CONFIG_X86_64 961 save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8]; 962 save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9]; 963 save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10]; 964 save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11]; 965 save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12]; 966 save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13]; 967 save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14]; 968 save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15]; 969 #endif 970 save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP]; 971 972 /* Sync some non-GPR registers before encrypting */ 973 save->xcr0 = svm->vcpu.arch.xcr0; 974 save->pkru = svm->vcpu.arch.pkru; 975 save->xss = svm->vcpu.arch.ia32_xss; 976 save->dr6 = svm->vcpu.arch.dr6; 977 978 save->sev_features = sev->vmsa_features; 979 980 /* 981 * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid 982 * breaking older measurements. 983 */ 984 if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) { 985 xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave; 986 save->x87_dp = xsave->i387.rdp; 987 save->mxcsr = xsave->i387.mxcsr; 988 save->x87_ftw = xsave->i387.twd; 989 save->x87_fsw = xsave->i387.swd; 990 save->x87_fcw = xsave->i387.cwd; 991 save->x87_fop = xsave->i387.fop; 992 save->x87_ds = 0; 993 save->x87_cs = 0; 994 save->x87_rip = xsave->i387.rip; 995 996 for (i = 0; i < 8; i++) { 997 /* 998 * The format of the x87 save area is undocumented and 999 * definitely not what you would expect. 
It consists of 1000 * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes 1001 * area with bytes 8-9 of each register. 1002 */ 1003 d = save->fpreg_x87 + i * 8; 1004 s = ((u8 *)xsave->i387.st_space) + i * 16; 1005 memcpy(d, s, 8); 1006 save->fpreg_x87[64 + i * 2] = s[8]; 1007 save->fpreg_x87[64 + i * 2 + 1] = s[9]; 1008 } 1009 memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256); 1010 1011 s = get_xsave_addr(xsave, XFEATURE_YMM); 1012 if (s) 1013 memcpy(save->fpreg_ymm, s, 256); 1014 else 1015 memset(save->fpreg_ymm, 0, 256); 1016 } 1017 1018 pr_debug("Virtual Machine Save Area (VMSA):\n"); 1019 print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); 1020 1021 return 0; 1022 } 1023 1024 static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, 1025 int *error) 1026 { 1027 struct sev_data_launch_update_vmsa vmsa; 1028 struct vcpu_svm *svm = to_svm(vcpu); 1029 int ret; 1030 1031 if (vcpu->guest_debug) { 1032 pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported"); 1033 return -EINVAL; 1034 } 1035 1036 /* Perform some pre-encryption checks against the VMSA */ 1037 ret = sev_es_sync_vmsa(svm); 1038 if (ret) 1039 return ret; 1040 1041 /* 1042 * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of 1043 * the VMSA memory content (i.e it will write the same memory region 1044 * with the guest's key), so invalidate it first. 1045 */ 1046 clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); 1047 1048 vmsa.reserved = 0; 1049 vmsa.handle = to_kvm_sev_info(kvm)->handle; 1050 vmsa.address = __sme_pa(svm->sev_es.vmsa); 1051 vmsa.len = PAGE_SIZE; 1052 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); 1053 if (ret) 1054 return ret; 1055 1056 /* 1057 * SEV-ES guests maintain an encrypted version of their FPU 1058 * state which is restored and saved on VMRUN and VMEXIT. 1059 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't 1060 * do xsave/xrstor on it. 
1061 */ 1062 fpstate_set_confidential(&vcpu->arch.guest_fpu); 1063 vcpu->arch.guest_state_protected = true; 1064 1065 /* 1066 * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it 1067 * only after setting guest_state_protected because KVM_SET_MSRS allows 1068 * dynamic toggling of LBRV (for performance reason) on write access to 1069 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. 1070 */ 1071 svm_enable_lbrv(vcpu); 1072 return 0; 1073 } 1074 1075 static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) 1076 { 1077 struct kvm_vcpu *vcpu; 1078 unsigned long i; 1079 int ret; 1080 1081 if (!sev_es_guest(kvm)) 1082 return -ENOTTY; 1083 1084 if (kvm_is_vcpu_creation_in_progress(kvm)) 1085 return -EBUSY; 1086 1087 ret = kvm_lock_all_vcpus(kvm); 1088 if (ret) 1089 return ret; 1090 1091 kvm_for_each_vcpu(i, vcpu, kvm) { 1092 ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error); 1093 if (ret) 1094 break; 1095 } 1096 1097 kvm_unlock_all_vcpus(kvm); 1098 return ret; 1099 } 1100 1101 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) 1102 { 1103 void __user *measure = u64_to_user_ptr(argp->data); 1104 struct sev_data_launch_measure data; 1105 struct kvm_sev_launch_measure params; 1106 void __user *p = NULL; 1107 void *blob = NULL; 1108 int ret; 1109 1110 if (!sev_guest(kvm)) 1111 return -ENOTTY; 1112 1113 if (copy_from_user(¶ms, measure, sizeof(params))) 1114 return -EFAULT; 1115 1116 memset(&data, 0, sizeof(data)); 1117 1118 /* User wants to query the blob length */ 1119 if (!params.len) 1120 goto cmd; 1121 1122 p = u64_to_user_ptr(params.uaddr); 1123 if (p) { 1124 if (params.len > SEV_FW_BLOB_MAX_SIZE) 1125 return -EINVAL; 1126 1127 blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT); 1128 if (!blob) 1129 return -ENOMEM; 1130 1131 data.address = __psp_pa(blob); 1132 data.len = params.len; 1133 } 1134 1135 cmd: 1136 data.handle = to_kvm_sev_info(kvm)->handle; 1137 ret = sev_issue_cmd(kvm, 
SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); 1138 1139 /* 1140 * If we query the session length, FW responded with expected data. 1141 */ 1142 if (!params.len) 1143 goto done; 1144 1145 if (ret) 1146 goto e_free_blob; 1147 1148 if (blob) { 1149 if (copy_to_user(p, blob, params.len)) 1150 ret = -EFAULT; 1151 } 1152 1153 done: 1154 params.len = data.len; 1155 if (copy_to_user(measure, ¶ms, sizeof(params))) 1156 ret = -EFAULT; 1157 e_free_blob: 1158 kfree(blob); 1159 return ret; 1160 } 1161 1162 static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1163 { 1164 struct sev_data_launch_finish data; 1165 1166 if (!sev_guest(kvm)) 1167 return -ENOTTY; 1168 1169 data.handle = to_kvm_sev_info(kvm)->handle; 1170 return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); 1171 } 1172 1173 static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) 1174 { 1175 struct kvm_sev_guest_status params; 1176 struct sev_data_guest_status data; 1177 int ret; 1178 1179 if (!sev_guest(kvm)) 1180 return -ENOTTY; 1181 1182 memset(&data, 0, sizeof(data)); 1183 1184 data.handle = to_kvm_sev_info(kvm)->handle; 1185 ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); 1186 if (ret) 1187 return ret; 1188 1189 params.policy = data.policy; 1190 params.state = data.state; 1191 params.handle = data.handle; 1192 1193 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) 1194 ret = -EFAULT; 1195 1196 return ret; 1197 } 1198 1199 static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, 1200 unsigned long dst, int size, 1201 int *error, bool enc) 1202 { 1203 struct sev_data_dbg data; 1204 1205 data.reserved = 0; 1206 data.handle = to_kvm_sev_info(kvm)->handle; 1207 data.dst_addr = dst; 1208 data.src_addr = src; 1209 data.len = size; 1210 1211 return sev_issue_cmd(kvm, 1212 enc ? 
SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT, 1213 &data, error); 1214 } 1215 1216 static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr, 1217 unsigned long dst_paddr, int sz, int *err) 1218 { 1219 int offset; 1220 1221 /* 1222 * Its safe to read more than we are asked, caller should ensure that 1223 * destination has enough space. 1224 */ 1225 offset = src_paddr & 15; 1226 src_paddr = round_down(src_paddr, 16); 1227 sz = round_up(sz + offset, 16); 1228 1229 return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false); 1230 } 1231 1232 static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, 1233 void __user *dst_uaddr, 1234 unsigned long dst_paddr, 1235 int size, int *err) 1236 { 1237 struct page *tpage = NULL; 1238 int ret, offset; 1239 1240 /* if inputs are not 16-byte then use intermediate buffer */ 1241 if (!IS_ALIGNED(dst_paddr, 16) || 1242 !IS_ALIGNED(paddr, 16) || 1243 !IS_ALIGNED(size, 16)) { 1244 tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 1245 if (!tpage) 1246 return -ENOMEM; 1247 1248 dst_paddr = __sme_page_pa(tpage); 1249 } 1250 1251 ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err); 1252 if (ret) 1253 goto e_free; 1254 1255 if (tpage) { 1256 offset = paddr & 15; 1257 if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size)) 1258 ret = -EFAULT; 1259 } 1260 1261 e_free: 1262 if (tpage) 1263 __free_page(tpage); 1264 1265 return ret; 1266 } 1267 1268 static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, 1269 void __user *vaddr, 1270 unsigned long dst_paddr, 1271 void __user *dst_vaddr, 1272 int size, int *error) 1273 { 1274 struct page *src_tpage = NULL; 1275 struct page *dst_tpage = NULL; 1276 int ret, len = size; 1277 1278 /* If source buffer is not aligned then use an intermediate buffer */ 1279 if (!IS_ALIGNED((unsigned long)vaddr, 16)) { 1280 src_tpage = alloc_page(GFP_KERNEL_ACCOUNT); 1281 if (!src_tpage) 1282 return -ENOMEM; 1283 1284 if 
(copy_from_user(page_address(src_tpage), vaddr, size)) { 1285 __free_page(src_tpage); 1286 return -EFAULT; 1287 } 1288 1289 paddr = __sme_page_pa(src_tpage); 1290 } 1291 1292 /* 1293 * If destination buffer or length is not aligned then do read-modify-write: 1294 * - decrypt destination in an intermediate buffer 1295 * - copy the source buffer in an intermediate buffer 1296 * - use the intermediate buffer as source buffer 1297 */ 1298 if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { 1299 int dst_offset; 1300 1301 dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT); 1302 if (!dst_tpage) { 1303 ret = -ENOMEM; 1304 goto e_free; 1305 } 1306 1307 ret = __sev_dbg_decrypt(kvm, dst_paddr, 1308 __sme_page_pa(dst_tpage), size, error); 1309 if (ret) 1310 goto e_free; 1311 1312 /* 1313 * If source is kernel buffer then use memcpy() otherwise 1314 * copy_from_user(). 1315 */ 1316 dst_offset = dst_paddr & 15; 1317 1318 if (src_tpage) 1319 memcpy(page_address(dst_tpage) + dst_offset, 1320 page_address(src_tpage), size); 1321 else { 1322 if (copy_from_user(page_address(dst_tpage) + dst_offset, 1323 vaddr, size)) { 1324 ret = -EFAULT; 1325 goto e_free; 1326 } 1327 } 1328 1329 paddr = __sme_page_pa(dst_tpage); 1330 dst_paddr = round_down(dst_paddr, 16); 1331 len = round_up(size, 16); 1332 } 1333 1334 ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true); 1335 1336 e_free: 1337 if (src_tpage) 1338 __free_page(src_tpage); 1339 if (dst_tpage) 1340 __free_page(dst_tpage); 1341 return ret; 1342 } 1343 1344 static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) 1345 { 1346 unsigned long vaddr, vaddr_end, next_vaddr; 1347 unsigned long dst_vaddr; 1348 struct page **src_p, **dst_p; 1349 struct kvm_sev_dbg debug; 1350 unsigned long n; 1351 unsigned int size; 1352 int ret; 1353 1354 if (!sev_guest(kvm)) 1355 return -ENOTTY; 1356 1357 if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug))) 1358 return -EFAULT; 1359 1360 if 
(!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr) 1361 return -EINVAL; 1362 if (!debug.dst_uaddr) 1363 return -EINVAL; 1364 1365 vaddr = debug.src_uaddr; 1366 size = debug.len; 1367 vaddr_end = vaddr + size; 1368 dst_vaddr = debug.dst_uaddr; 1369 1370 for (; vaddr < vaddr_end; vaddr = next_vaddr) { 1371 int len, s_off, d_off; 1372 1373 /* lock userspace source and destination page */ 1374 src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0); 1375 if (IS_ERR(src_p)) 1376 return PTR_ERR(src_p); 1377 1378 dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE); 1379 if (IS_ERR(dst_p)) { 1380 sev_unpin_memory(kvm, src_p, n); 1381 return PTR_ERR(dst_p); 1382 } 1383 1384 /* 1385 * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify 1386 * the pages; flush the destination too so that future accesses do not 1387 * see stale data. 1388 */ 1389 sev_clflush_pages(src_p, 1); 1390 sev_clflush_pages(dst_p, 1); 1391 1392 /* 1393 * Since user buffer may not be page aligned, calculate the 1394 * offset within the page. 
1395 */ 1396 s_off = vaddr & ~PAGE_MASK; 1397 d_off = dst_vaddr & ~PAGE_MASK; 1398 len = min_t(size_t, (PAGE_SIZE - s_off), size); 1399 1400 if (dec) 1401 ret = __sev_dbg_decrypt_user(kvm, 1402 __sme_page_pa(src_p[0]) + s_off, 1403 (void __user *)dst_vaddr, 1404 __sme_page_pa(dst_p[0]) + d_off, 1405 len, &argp->error); 1406 else 1407 ret = __sev_dbg_encrypt_user(kvm, 1408 __sme_page_pa(src_p[0]) + s_off, 1409 (void __user *)vaddr, 1410 __sme_page_pa(dst_p[0]) + d_off, 1411 (void __user *)dst_vaddr, 1412 len, &argp->error); 1413 1414 sev_unpin_memory(kvm, src_p, n); 1415 sev_unpin_memory(kvm, dst_p, n); 1416 1417 if (ret) 1418 goto err; 1419 1420 next_vaddr = vaddr + len; 1421 dst_vaddr = dst_vaddr + len; 1422 size -= len; 1423 } 1424 err: 1425 return ret; 1426 } 1427 1428 static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) 1429 { 1430 struct sev_data_launch_secret data; 1431 struct kvm_sev_launch_secret params; 1432 struct page **pages; 1433 void *blob, *hdr; 1434 unsigned long n, i; 1435 int ret, offset; 1436 1437 if (!sev_guest(kvm)) 1438 return -ENOTTY; 1439 1440 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 1441 return -EFAULT; 1442 1443 pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE); 1444 if (IS_ERR(pages)) 1445 return PTR_ERR(pages); 1446 1447 /* 1448 * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in 1449 * place; the cache may contain the data that was written unencrypted. 1450 */ 1451 sev_clflush_pages(pages, n); 1452 1453 /* 1454 * The secret must be copied into contiguous memory region, lets verify 1455 * that userspace memory pages are contiguous before we issue command. 
1456 */ 1457 if (get_num_contig_pages(0, pages, n) != n) { 1458 ret = -EINVAL; 1459 goto e_unpin_memory; 1460 } 1461 1462 memset(&data, 0, sizeof(data)); 1463 1464 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1465 data.guest_address = __sme_page_pa(pages[0]) + offset; 1466 data.guest_len = params.guest_len; 1467 1468 blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len); 1469 if (IS_ERR(blob)) { 1470 ret = PTR_ERR(blob); 1471 goto e_unpin_memory; 1472 } 1473 1474 data.trans_address = __psp_pa(blob); 1475 data.trans_len = params.trans_len; 1476 1477 hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); 1478 if (IS_ERR(hdr)) { 1479 ret = PTR_ERR(hdr); 1480 goto e_free_blob; 1481 } 1482 data.hdr_address = __psp_pa(hdr); 1483 data.hdr_len = params.hdr_len; 1484 1485 data.handle = to_kvm_sev_info(kvm)->handle; 1486 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); 1487 1488 kfree(hdr); 1489 1490 e_free_blob: 1491 kfree(blob); 1492 e_unpin_memory: 1493 /* content of memory is updated, mark pages dirty */ 1494 for (i = 0; i < n; i++) { 1495 set_page_dirty_lock(pages[i]); 1496 mark_page_accessed(pages[i]); 1497 } 1498 sev_unpin_memory(kvm, pages, n); 1499 return ret; 1500 } 1501 1502 static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) 1503 { 1504 void __user *report = u64_to_user_ptr(argp->data); 1505 struct sev_data_attestation_report data; 1506 struct kvm_sev_attestation_report params; 1507 void __user *p; 1508 void *blob = NULL; 1509 int ret; 1510 1511 if (!sev_guest(kvm)) 1512 return -ENOTTY; 1513 1514 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 1515 return -EFAULT; 1516 1517 memset(&data, 0, sizeof(data)); 1518 1519 /* User wants to query the blob length */ 1520 if (!params.len) 1521 goto cmd; 1522 1523 p = u64_to_user_ptr(params.uaddr); 1524 if (p) { 1525 if (params.len > SEV_FW_BLOB_MAX_SIZE) 1526 return -EINVAL; 1527 1528 blob = kzalloc(params.len, 
GFP_KERNEL_ACCOUNT); 1529 if (!blob) 1530 return -ENOMEM; 1531 1532 data.address = __psp_pa(blob); 1533 data.len = params.len; 1534 memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); 1535 } 1536 cmd: 1537 data.handle = to_kvm_sev_info(kvm)->handle; 1538 ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); 1539 /* 1540 * If we query the session length, FW responded with expected data. 1541 */ 1542 if (!params.len) 1543 goto done; 1544 1545 if (ret) 1546 goto e_free_blob; 1547 1548 if (blob) { 1549 if (copy_to_user(p, blob, params.len)) 1550 ret = -EFAULT; 1551 } 1552 1553 done: 1554 params.len = data.len; 1555 if (copy_to_user(report, ¶ms, sizeof(params))) 1556 ret = -EFAULT; 1557 e_free_blob: 1558 kfree(blob); 1559 return ret; 1560 } 1561 1562 /* Userspace wants to query session length. */ 1563 static int 1564 __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, 1565 struct kvm_sev_send_start *params) 1566 { 1567 struct sev_data_send_start data; 1568 int ret; 1569 1570 memset(&data, 0, sizeof(data)); 1571 data.handle = to_kvm_sev_info(kvm)->handle; 1572 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); 1573 1574 params->session_len = data.session_len; 1575 if (copy_to_user(u64_to_user_ptr(argp->data), params, 1576 sizeof(struct kvm_sev_send_start))) 1577 ret = -EFAULT; 1578 1579 return ret; 1580 } 1581 1582 static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 1583 { 1584 struct sev_data_send_start data; 1585 struct kvm_sev_send_start params; 1586 void *amd_certs, *session_data; 1587 void *pdh_cert, *plat_certs; 1588 int ret; 1589 1590 if (!sev_guest(kvm)) 1591 return -ENOTTY; 1592 1593 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1594 sizeof(struct kvm_sev_send_start))) 1595 return -EFAULT; 1596 1597 /* if session_len is zero, userspace wants to query the session length */ 1598 if (!params.session_len) 1599 return __sev_send_start_query_session_length(kvm, argp, 1600 
¶ms); 1601 1602 /* some sanity checks */ 1603 if (!params.pdh_cert_uaddr || !params.pdh_cert_len || 1604 !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE) 1605 return -EINVAL; 1606 1607 /* allocate the memory to hold the session data blob */ 1608 session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT); 1609 if (!session_data) 1610 return -ENOMEM; 1611 1612 /* copy the certificate blobs from userspace */ 1613 pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr, 1614 params.pdh_cert_len); 1615 if (IS_ERR(pdh_cert)) { 1616 ret = PTR_ERR(pdh_cert); 1617 goto e_free_session; 1618 } 1619 1620 plat_certs = psp_copy_user_blob(params.plat_certs_uaddr, 1621 params.plat_certs_len); 1622 if (IS_ERR(plat_certs)) { 1623 ret = PTR_ERR(plat_certs); 1624 goto e_free_pdh; 1625 } 1626 1627 amd_certs = psp_copy_user_blob(params.amd_certs_uaddr, 1628 params.amd_certs_len); 1629 if (IS_ERR(amd_certs)) { 1630 ret = PTR_ERR(amd_certs); 1631 goto e_free_plat_cert; 1632 } 1633 1634 /* populate the FW SEND_START field with system physical address */ 1635 memset(&data, 0, sizeof(data)); 1636 data.pdh_cert_address = __psp_pa(pdh_cert); 1637 data.pdh_cert_len = params.pdh_cert_len; 1638 data.plat_certs_address = __psp_pa(plat_certs); 1639 data.plat_certs_len = params.plat_certs_len; 1640 data.amd_certs_address = __psp_pa(amd_certs); 1641 data.amd_certs_len = params.amd_certs_len; 1642 data.session_address = __psp_pa(session_data); 1643 data.session_len = params.session_len; 1644 data.handle = to_kvm_sev_info(kvm)->handle; 1645 1646 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); 1647 1648 if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr), 1649 session_data, params.session_len)) { 1650 ret = -EFAULT; 1651 goto e_free_amd_cert; 1652 } 1653 1654 params.policy = data.policy; 1655 params.session_len = data.session_len; 1656 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, 1657 sizeof(struct kvm_sev_send_start))) 1658 ret = -EFAULT; 1659 1660 
e_free_amd_cert: 1661 kfree(amd_certs); 1662 e_free_plat_cert: 1663 kfree(plat_certs); 1664 e_free_pdh: 1665 kfree(pdh_cert); 1666 e_free_session: 1667 kfree(session_data); 1668 return ret; 1669 } 1670 1671 /* Userspace wants to query either header or trans length. */ 1672 static int 1673 __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, 1674 struct kvm_sev_send_update_data *params) 1675 { 1676 struct sev_data_send_update_data data; 1677 int ret; 1678 1679 memset(&data, 0, sizeof(data)); 1680 data.handle = to_kvm_sev_info(kvm)->handle; 1681 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); 1682 1683 params->hdr_len = data.hdr_len; 1684 params->trans_len = data.trans_len; 1685 1686 if (copy_to_user(u64_to_user_ptr(argp->data), params, 1687 sizeof(struct kvm_sev_send_update_data))) 1688 ret = -EFAULT; 1689 1690 return ret; 1691 } 1692 1693 static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 1694 { 1695 struct sev_data_send_update_data data; 1696 struct kvm_sev_send_update_data params; 1697 void *hdr, *trans_data; 1698 struct page **guest_page; 1699 unsigned long n; 1700 int ret, offset; 1701 1702 if (!sev_guest(kvm)) 1703 return -ENOTTY; 1704 1705 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1706 sizeof(struct kvm_sev_send_update_data))) 1707 return -EFAULT; 1708 1709 /* userspace wants to query either header or trans length */ 1710 if (!params.trans_len || !params.hdr_len) 1711 return __sev_send_update_data_query_lengths(kvm, argp, ¶ms); 1712 1713 if (!params.trans_uaddr || !params.guest_uaddr || 1714 !params.guest_len || !params.hdr_uaddr) 1715 return -EINVAL; 1716 1717 /* Check if we are crossing the page boundary */ 1718 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1719 if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) 1720 return -EINVAL; 1721 1722 /* Pin guest memory */ 1723 guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, 1724 
PAGE_SIZE, &n, 0); 1725 if (IS_ERR(guest_page)) 1726 return PTR_ERR(guest_page); 1727 1728 /* allocate memory for header and transport buffer */ 1729 ret = -ENOMEM; 1730 hdr = kzalloc(params.hdr_len, GFP_KERNEL); 1731 if (!hdr) 1732 goto e_unpin; 1733 1734 trans_data = kzalloc(params.trans_len, GFP_KERNEL); 1735 if (!trans_data) 1736 goto e_free_hdr; 1737 1738 memset(&data, 0, sizeof(data)); 1739 data.hdr_address = __psp_pa(hdr); 1740 data.hdr_len = params.hdr_len; 1741 data.trans_address = __psp_pa(trans_data); 1742 data.trans_len = params.trans_len; 1743 1744 /* The SEND_UPDATE_DATA command requires C-bit to be always set. */ 1745 data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; 1746 data.guest_address |= sev_me_mask; 1747 data.guest_len = params.guest_len; 1748 data.handle = to_kvm_sev_info(kvm)->handle; 1749 1750 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); 1751 1752 if (ret) 1753 goto e_free_trans_data; 1754 1755 /* copy transport buffer to user space */ 1756 if (copy_to_user(u64_to_user_ptr(params.trans_uaddr), 1757 trans_data, params.trans_len)) { 1758 ret = -EFAULT; 1759 goto e_free_trans_data; 1760 } 1761 1762 /* Copy packet header to userspace. 
*/ 1763 if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr, 1764 params.hdr_len)) 1765 ret = -EFAULT; 1766 1767 e_free_trans_data: 1768 kfree(trans_data); 1769 e_free_hdr: 1770 kfree(hdr); 1771 e_unpin: 1772 sev_unpin_memory(kvm, guest_page, n); 1773 1774 return ret; 1775 } 1776 1777 static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1778 { 1779 struct sev_data_send_finish data; 1780 1781 if (!sev_guest(kvm)) 1782 return -ENOTTY; 1783 1784 data.handle = to_kvm_sev_info(kvm)->handle; 1785 return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); 1786 } 1787 1788 static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) 1789 { 1790 struct sev_data_send_cancel data; 1791 1792 if (!sev_guest(kvm)) 1793 return -ENOTTY; 1794 1795 data.handle = to_kvm_sev_info(kvm)->handle; 1796 return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); 1797 } 1798 1799 static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 1800 { 1801 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 1802 struct sev_data_receive_start start; 1803 struct kvm_sev_receive_start params; 1804 int *error = &argp->error; 1805 void *session_data; 1806 void *pdh_data; 1807 int ret; 1808 1809 if (!sev_guest(kvm)) 1810 return -ENOTTY; 1811 1812 /* Get parameter from the userspace */ 1813 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1814 sizeof(struct kvm_sev_receive_start))) 1815 return -EFAULT; 1816 1817 /* some sanity checks */ 1818 if (!params.pdh_uaddr || !params.pdh_len || 1819 !params.session_uaddr || !params.session_len) 1820 return -EINVAL; 1821 1822 pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len); 1823 if (IS_ERR(pdh_data)) 1824 return PTR_ERR(pdh_data); 1825 1826 session_data = psp_copy_user_blob(params.session_uaddr, 1827 params.session_len); 1828 if (IS_ERR(session_data)) { 1829 ret = PTR_ERR(session_data); 1830 goto e_free_pdh; 1831 } 1832 1833 memset(&start, 0, sizeof(start)); 1834 start.handle = 
params.handle; 1835 start.policy = params.policy; 1836 start.pdh_cert_address = __psp_pa(pdh_data); 1837 start.pdh_cert_len = params.pdh_len; 1838 start.session_address = __psp_pa(session_data); 1839 start.session_len = params.session_len; 1840 1841 /* create memory encryption context */ 1842 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start, 1843 error); 1844 if (ret) 1845 goto e_free_session; 1846 1847 /* Bind ASID to this guest */ 1848 ret = sev_bind_asid(kvm, start.handle, error); 1849 if (ret) { 1850 sev_decommission(start.handle); 1851 goto e_free_session; 1852 } 1853 1854 params.handle = start.handle; 1855 if (copy_to_user(u64_to_user_ptr(argp->data), 1856 ¶ms, sizeof(struct kvm_sev_receive_start))) { 1857 ret = -EFAULT; 1858 sev_unbind_asid(kvm, start.handle); 1859 goto e_free_session; 1860 } 1861 1862 sev->handle = start.handle; 1863 sev->fd = argp->sev_fd; 1864 1865 e_free_session: 1866 kfree(session_data); 1867 e_free_pdh: 1868 kfree(pdh_data); 1869 1870 return ret; 1871 } 1872 1873 static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 1874 { 1875 struct kvm_sev_receive_update_data params; 1876 struct sev_data_receive_update_data data; 1877 void *hdr = NULL, *trans = NULL; 1878 struct page **guest_page; 1879 unsigned long n; 1880 int ret, offset; 1881 1882 if (!sev_guest(kvm)) 1883 return -EINVAL; 1884 1885 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1886 sizeof(struct kvm_sev_receive_update_data))) 1887 return -EFAULT; 1888 1889 if (!params.hdr_uaddr || !params.hdr_len || 1890 !params.guest_uaddr || !params.guest_len || 1891 !params.trans_uaddr || !params.trans_len) 1892 return -EINVAL; 1893 1894 /* Check if we are crossing the page boundary */ 1895 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1896 if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) 1897 return -EINVAL; 1898 1899 hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); 1900 if (IS_ERR(hdr)) 1901 return 
PTR_ERR(hdr); 1902 1903 trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len); 1904 if (IS_ERR(trans)) { 1905 ret = PTR_ERR(trans); 1906 goto e_free_hdr; 1907 } 1908 1909 memset(&data, 0, sizeof(data)); 1910 data.hdr_address = __psp_pa(hdr); 1911 data.hdr_len = params.hdr_len; 1912 data.trans_address = __psp_pa(trans); 1913 data.trans_len = params.trans_len; 1914 1915 /* Pin guest memory */ 1916 guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, 1917 PAGE_SIZE, &n, FOLL_WRITE); 1918 if (IS_ERR(guest_page)) { 1919 ret = PTR_ERR(guest_page); 1920 goto e_free_trans; 1921 } 1922 1923 /* 1924 * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP 1925 * encrypts the written data with the guest's key, and the cache may 1926 * contain dirty, unencrypted data. 1927 */ 1928 sev_clflush_pages(guest_page, n); 1929 1930 /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ 1931 data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; 1932 data.guest_address |= sev_me_mask; 1933 data.guest_len = params.guest_len; 1934 data.handle = to_kvm_sev_info(kvm)->handle; 1935 1936 ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, 1937 &argp->error); 1938 1939 sev_unpin_memory(kvm, guest_page, n); 1940 1941 e_free_trans: 1942 kfree(trans); 1943 e_free_hdr: 1944 kfree(hdr); 1945 1946 return ret; 1947 } 1948 1949 static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1950 { 1951 struct sev_data_receive_finish data; 1952 1953 if (!sev_guest(kvm)) 1954 return -ENOTTY; 1955 1956 data.handle = to_kvm_sev_info(kvm)->handle; 1957 return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); 1958 } 1959 1960 static bool is_cmd_allowed_from_mirror(u32 cmd_id) 1961 { 1962 /* 1963 * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES 1964 * active mirror VMs. Also allow the debugging and status commands. 
1965 */ 1966 if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA || 1967 cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT || 1968 cmd_id == KVM_SEV_DBG_ENCRYPT) 1969 return true; 1970 1971 return false; 1972 } 1973 1974 static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) 1975 { 1976 struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); 1977 struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); 1978 int r = -EBUSY; 1979 1980 if (dst_kvm == src_kvm) 1981 return -EINVAL; 1982 1983 /* 1984 * Bail if these VMs are already involved in a migration to avoid 1985 * deadlock between two VMs trying to migrate to/from each other. 1986 */ 1987 if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) 1988 return -EBUSY; 1989 1990 if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) 1991 goto release_dst; 1992 1993 r = -EINTR; 1994 if (mutex_lock_killable(&dst_kvm->lock)) 1995 goto release_src; 1996 if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) 1997 goto unlock_dst; 1998 return 0; 1999 2000 unlock_dst: 2001 mutex_unlock(&dst_kvm->lock); 2002 release_src: 2003 atomic_set_release(&src_sev->migration_in_progress, 0); 2004 release_dst: 2005 atomic_set_release(&dst_sev->migration_in_progress, 0); 2006 return r; 2007 } 2008 2009 static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) 2010 { 2011 struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); 2012 struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); 2013 2014 mutex_unlock(&dst_kvm->lock); 2015 mutex_unlock(&src_kvm->lock); 2016 atomic_set_release(&dst_sev->migration_in_progress, 0); 2017 atomic_set_release(&src_sev->migration_in_progress, 0); 2018 } 2019 2020 static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) 2021 { 2022 struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); 2023 struct kvm_sev_info *src = to_kvm_sev_info(src_kvm); 2024 struct kvm_vcpu *dst_vcpu, *src_vcpu; 2025 struct vcpu_svm *dst_svm, *src_svm; 
2026 struct kvm_sev_info *mirror; 2027 unsigned long i; 2028 2029 dst->active = true; 2030 dst->asid = src->asid; 2031 dst->handle = src->handle; 2032 dst->pages_locked = src->pages_locked; 2033 dst->enc_context_owner = src->enc_context_owner; 2034 dst->es_active = src->es_active; 2035 dst->vmsa_features = src->vmsa_features; 2036 2037 src->asid = 0; 2038 src->active = false; 2039 src->handle = 0; 2040 src->pages_locked = 0; 2041 src->enc_context_owner = NULL; 2042 src->es_active = false; 2043 2044 list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); 2045 2046 /* 2047 * If this VM has mirrors, "transfer" each mirror's refcount of the 2048 * source to the destination (this KVM). The caller holds a reference 2049 * to the source, so there's no danger of use-after-free. 2050 */ 2051 list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms); 2052 list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) { 2053 kvm_get_kvm(dst_kvm); 2054 kvm_put_kvm(src_kvm); 2055 mirror->enc_context_owner = dst_kvm; 2056 } 2057 2058 /* 2059 * If this VM is a mirror, remove the old mirror from the owners list 2060 * and add the new mirror to the list. 2061 */ 2062 if (is_mirroring_enc_context(dst_kvm)) { 2063 struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner); 2064 2065 list_del(&src->mirror_entry); 2066 list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); 2067 } 2068 2069 kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) { 2070 dst_svm = to_svm(dst_vcpu); 2071 2072 sev_init_vmcb(dst_svm, false); 2073 2074 if (!dst->es_active) 2075 continue; 2076 2077 /* 2078 * Note, the source is not required to have the same number of 2079 * vCPUs as the destination when migrating a vanilla SEV VM. 2080 */ 2081 src_vcpu = kvm_get_vcpu(src_kvm, i); 2082 src_svm = to_svm(src_vcpu); 2083 2084 /* 2085 * Transfer VMSA and GHCB state to the destination. 
Nullify and 2086 * clear source fields as appropriate, the state now belongs to 2087 * the destination. 2088 */ 2089 memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es)); 2090 dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa; 2091 dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa; 2092 dst_vcpu->arch.guest_state_protected = true; 2093 2094 memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es)); 2095 src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE; 2096 src_svm->vmcb->control.vmsa_pa = INVALID_PAGE; 2097 src_vcpu->arch.guest_state_protected = false; 2098 } 2099 } 2100 2101 static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) 2102 { 2103 struct kvm_vcpu *src_vcpu; 2104 unsigned long i; 2105 2106 if (kvm_is_vcpu_creation_in_progress(src) || 2107 kvm_is_vcpu_creation_in_progress(dst)) 2108 return -EBUSY; 2109 2110 if (!sev_es_guest(src)) 2111 return 0; 2112 2113 if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus)) 2114 return -EINVAL; 2115 2116 kvm_for_each_vcpu(i, src_vcpu, src) { 2117 if (!src_vcpu->arch.guest_state_protected) 2118 return -EINVAL; 2119 } 2120 2121 return 0; 2122 } 2123 2124 int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) 2125 { 2126 struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm); 2127 struct kvm_sev_info *src_sev, *cg_cleanup_sev; 2128 CLASS(fd, f)(source_fd); 2129 struct kvm *source_kvm; 2130 bool charged = false; 2131 int ret; 2132 2133 if (fd_empty(f)) 2134 return -EBADF; 2135 2136 if (!file_is_kvm(fd_file(f))) 2137 return -EBADF; 2138 2139 source_kvm = fd_file(f)->private_data; 2140 ret = sev_lock_two_vms(kvm, source_kvm); 2141 if (ret) 2142 return ret; 2143 2144 if (kvm->arch.vm_type != source_kvm->arch.vm_type || 2145 sev_guest(kvm) || !sev_guest(source_kvm)) { 2146 ret = -EINVAL; 2147 goto out_unlock; 2148 } 2149 2150 src_sev = to_kvm_sev_info(source_kvm); 2151 2152 dst_sev->misc_cg = get_current_misc_cg(); 2153 cg_cleanup_sev = dst_sev; 
2154 if (dst_sev->misc_cg != src_sev->misc_cg) { 2155 ret = sev_misc_cg_try_charge(dst_sev); 2156 if (ret) 2157 goto out_dst_cgroup; 2158 charged = true; 2159 } 2160 2161 ret = kvm_lock_all_vcpus(kvm); 2162 if (ret) 2163 goto out_dst_cgroup; 2164 ret = kvm_lock_all_vcpus(source_kvm); 2165 if (ret) 2166 goto out_dst_vcpu; 2167 2168 ret = sev_check_source_vcpus(kvm, source_kvm); 2169 if (ret) 2170 goto out_source_vcpu; 2171 2172 /* 2173 * Allocate a new have_run_cpus for the destination, i.e. don't copy 2174 * the set of CPUs from the source. If a CPU was used to run a vCPU in 2175 * the source VM but is never used for the destination VM, then the CPU 2176 * can only have cached memory that was accessible to the source VM. 2177 */ 2178 if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 2179 ret = -ENOMEM; 2180 goto out_source_vcpu; 2181 } 2182 2183 sev_migrate_from(kvm, source_kvm); 2184 kvm_vm_dead(source_kvm); 2185 cg_cleanup_sev = src_sev; 2186 ret = 0; 2187 2188 out_source_vcpu: 2189 kvm_unlock_all_vcpus(source_kvm); 2190 out_dst_vcpu: 2191 kvm_unlock_all_vcpus(kvm); 2192 out_dst_cgroup: 2193 /* Operates on the source on success, on the destination on failure. */ 2194 if (charged) 2195 sev_misc_cg_uncharge(cg_cleanup_sev); 2196 put_misc_cg(cg_cleanup_sev->misc_cg); 2197 cg_cleanup_sev->misc_cg = NULL; 2198 out_unlock: 2199 sev_unlock_two_vms(kvm, source_kvm); 2200 return ret; 2201 } 2202 2203 int sev_dev_get_attr(u32 group, u64 attr, u64 *val) 2204 { 2205 if (group != KVM_X86_GRP_SEV) 2206 return -ENXIO; 2207 2208 switch (attr) { 2209 case KVM_X86_SEV_VMSA_FEATURES: 2210 *val = sev_supported_vmsa_features; 2211 return 0; 2212 2213 case KVM_X86_SNP_POLICY_BITS: 2214 *val = snp_supported_policy_bits; 2215 return 0; 2216 2217 case KVM_X86_SEV_SNP_REQ_CERTS: 2218 *val = sev_snp_enabled ? 
1 : 0; 2219 return 0; 2220 default: 2221 return -ENXIO; 2222 } 2223 } 2224 2225 /* 2226 * The guest context contains all the information, keys and metadata 2227 * associated with the guest that the firmware tracks to implement SEV 2228 * and SNP features. The firmware stores the guest context in hypervisor 2229 * provide page via the SNP_GCTX_CREATE command. 2230 */ 2231 static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) 2232 { 2233 struct sev_data_snp_addr data = {}; 2234 void *context; 2235 int rc; 2236 2237 /* Allocate memory for context page */ 2238 context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); 2239 if (!context) 2240 return NULL; 2241 2242 data.address = __psp_pa(context); 2243 rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); 2244 if (rc) { 2245 pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d", 2246 rc, argp->error); 2247 snp_free_firmware_page(context); 2248 return NULL; 2249 } 2250 2251 return context; 2252 } 2253 2254 static int snp_bind_asid(struct kvm *kvm, int *error) 2255 { 2256 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2257 struct sev_data_snp_activate data = {0}; 2258 2259 data.gctx_paddr = __psp_pa(sev->snp_context); 2260 data.asid = sev_get_asid(kvm); 2261 return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); 2262 } 2263 2264 static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 2265 { 2266 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2267 struct sev_data_snp_launch_start start = {0}; 2268 struct kvm_sev_snp_launch_start params; 2269 int rc; 2270 2271 if (!sev_snp_guest(kvm)) 2272 return -ENOTTY; 2273 2274 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2275 return -EFAULT; 2276 2277 /* Don't allow userspace to allocate memory for more than 1 SNP context. 
*/ 2278 if (sev->snp_context) 2279 return -EINVAL; 2280 2281 if (params.flags) 2282 return -EINVAL; 2283 2284 if (params.policy & ~snp_supported_policy_bits) 2285 return -EINVAL; 2286 2287 /* Check for policy bits that must be set */ 2288 if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO)) 2289 return -EINVAL; 2290 2291 if (snp_is_secure_tsc_enabled(kvm)) { 2292 if (WARN_ON_ONCE(!kvm->arch.default_tsc_khz)) 2293 return -EINVAL; 2294 2295 start.desired_tsc_khz = kvm->arch.default_tsc_khz; 2296 } 2297 2298 sev->snp_context = snp_context_create(kvm, argp); 2299 if (!sev->snp_context) 2300 return -ENOTTY; 2301 2302 start.gctx_paddr = __psp_pa(sev->snp_context); 2303 start.policy = params.policy; 2304 2305 memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); 2306 rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); 2307 if (rc) { 2308 pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n", 2309 __func__, rc); 2310 goto e_free_context; 2311 } 2312 2313 sev->policy = params.policy; 2314 sev->fd = argp->sev_fd; 2315 rc = snp_bind_asid(kvm, &argp->error); 2316 if (rc) { 2317 pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n", 2318 __func__, rc); 2319 goto e_free_context; 2320 } 2321 2322 return 0; 2323 2324 e_free_context: 2325 snp_decommission_context(kvm); 2326 2327 return rc; 2328 } 2329 2330 struct sev_gmem_populate_args { 2331 __u8 type; 2332 int sev_fd; 2333 int fw_error; 2334 }; 2335 2336 static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 2337 struct page *src_page, void *opaque) 2338 { 2339 struct sev_gmem_populate_args *sev_populate_args = opaque; 2340 struct sev_data_snp_launch_update fw_args = {0}; 2341 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2342 bool assigned = false; 2343 int level; 2344 int ret; 2345 2346 if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page)) 2347 return -EINVAL; 2348 2349 ret = snp_lookup_rmpentry((u64)pfn, 
&assigned, &level); 2350 if (ret || assigned) { 2351 pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", 2352 __func__, gfn, ret, assigned); 2353 ret = ret ? -EINVAL : -EEXIST; 2354 goto out; 2355 } 2356 2357 if (src_page) { 2358 void *src_vaddr = kmap_local_page(src_page); 2359 void *dst_vaddr = kmap_local_pfn(pfn); 2360 2361 memcpy(dst_vaddr, src_vaddr, PAGE_SIZE); 2362 2363 kunmap_local(src_vaddr); 2364 kunmap_local(dst_vaddr); 2365 } 2366 2367 ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K, 2368 sev_get_asid(kvm), true); 2369 if (ret) 2370 goto out; 2371 2372 fw_args.gctx_paddr = __psp_pa(sev->snp_context); 2373 fw_args.address = __sme_set(pfn_to_hpa(pfn)); 2374 fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); 2375 fw_args.page_type = sev_populate_args->type; 2376 2377 ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, 2378 &fw_args, &sev_populate_args->fw_error); 2379 /* 2380 * If the firmware command failed handle the reclaim and cleanup of that 2381 * PFN before reporting an error. 2382 * 2383 * Additionally, when invalid CPUID function entries are detected, 2384 * firmware writes the expected values into the page and leaves it 2385 * unencrypted so it can be used for debugging and error-reporting. 2386 * 2387 * Copy this page back into the source buffer so userspace can use this 2388 * information to provide information on which CPUID leaves/fields 2389 * failed CPUID validation. 
2390 */ 2391 if (ret && !snp_page_reclaim(kvm, pfn) && 2392 sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && 2393 sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { 2394 void *src_vaddr = kmap_local_page(src_page); 2395 void *dst_vaddr = kmap_local_pfn(pfn); 2396 2397 memcpy(src_vaddr, dst_vaddr, PAGE_SIZE); 2398 2399 kunmap_local(src_vaddr); 2400 kunmap_local(dst_vaddr); 2401 } 2402 2403 out: 2404 if (ret) 2405 pr_debug("%s: error updating GFN %llx, return code %d (fw_error %d)\n", 2406 __func__, gfn, ret, sev_populate_args->fw_error); 2407 return ret; 2408 } 2409 2410 static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) 2411 { 2412 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2413 struct sev_gmem_populate_args sev_populate_args = {0}; 2414 struct kvm_sev_snp_launch_update params; 2415 struct kvm_memory_slot *memslot; 2416 long npages, count; 2417 void __user *src; 2418 2419 if (!sev_snp_guest(kvm) || !sev->snp_context) 2420 return -EINVAL; 2421 2422 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2423 return -EFAULT; 2424 2425 pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, 2426 params.gfn_start, params.len, params.type, params.flags); 2427 2428 if (!params.len || !PAGE_ALIGNED(params.len) || params.flags || 2429 (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && 2430 params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && 2431 params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && 2432 params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && 2433 params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) 2434 return -EINVAL; 2435 2436 src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? 
NULL : u64_to_user_ptr(params.uaddr); 2437 2438 if (!PAGE_ALIGNED(src)) 2439 return -EINVAL; 2440 2441 npages = params.len / PAGE_SIZE; 2442 2443 /* 2444 * For each GFN that's being prepared as part of the initial guest 2445 * state, the following pre-conditions are verified: 2446 * 2447 * 1) The backing memslot is a valid private memslot. 2448 * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES 2449 * beforehand. 2450 * 3) The PFN of the guest_memfd has not already been set to private 2451 * in the RMP table. 2452 * 2453 * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page 2454 * faults if there's a race between a fault and an attribute update via 2455 * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized 2456 * here. However, kvm->slots_lock guards against both this as well as 2457 * concurrent memslot updates occurring while these checks are being 2458 * performed, so use that here to make it easier to reason about the 2459 * initial expected state and better guard against unexpected 2460 * situations. 
2461 */ 2462 guard(mutex)(&kvm->slots_lock); 2463 2464 memslot = gfn_to_memslot(kvm, params.gfn_start); 2465 if (!kvm_slot_has_gmem(memslot)) 2466 return -EINVAL; 2467 2468 sev_populate_args.sev_fd = argp->sev_fd; 2469 sev_populate_args.type = params.type; 2470 2471 count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, 2472 sev_gmem_post_populate, &sev_populate_args); 2473 if (count < 0) { 2474 argp->error = sev_populate_args.fw_error; 2475 pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", 2476 __func__, count, argp->error); 2477 return -EIO; 2478 } 2479 2480 params.gfn_start += count; 2481 params.len -= count * PAGE_SIZE; 2482 if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) 2483 params.uaddr += count * PAGE_SIZE; 2484 2485 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) 2486 return -EFAULT; 2487 2488 return 0; 2489 } 2490 2491 static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) 2492 { 2493 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2494 struct sev_data_snp_launch_update data = {}; 2495 struct kvm_vcpu *vcpu; 2496 unsigned long i; 2497 int ret; 2498 2499 if (kvm_is_vcpu_creation_in_progress(kvm)) 2500 return -EBUSY; 2501 2502 ret = kvm_lock_all_vcpus(kvm); 2503 if (ret) 2504 return ret; 2505 2506 data.gctx_paddr = __psp_pa(sev->snp_context); 2507 data.page_type = SNP_PAGE_TYPE_VMSA; 2508 2509 kvm_for_each_vcpu(i, vcpu, kvm) { 2510 struct vcpu_svm *svm = to_svm(vcpu); 2511 u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; 2512 2513 ret = sev_es_sync_vmsa(svm); 2514 if (ret) 2515 goto out; 2516 2517 /* Transition the VMSA page to a firmware state. 
*/ 2518 ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); 2519 if (ret) 2520 goto out; 2521 2522 /* Issue the SNP command to encrypt the VMSA */ 2523 data.address = __sme_pa(svm->sev_es.vmsa); 2524 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, 2525 &data, &argp->error); 2526 if (ret) { 2527 snp_page_reclaim(kvm, pfn); 2528 2529 goto out; 2530 } 2531 2532 svm->vcpu.arch.guest_state_protected = true; 2533 /* 2534 * SEV-ES (and thus SNP) guest mandates LBR Virtualization to 2535 * be _always_ ON. Enable it only after setting 2536 * guest_state_protected because KVM_SET_MSRS allows dynamic 2537 * toggling of LBRV (for performance reason) on write access to 2538 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. 2539 */ 2540 svm_enable_lbrv(vcpu); 2541 } 2542 2543 out: 2544 kvm_unlock_all_vcpus(kvm); 2545 return ret; 2546 } 2547 2548 static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 2549 { 2550 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2551 struct kvm_sev_snp_launch_finish params; 2552 struct sev_data_snp_launch_finish *data; 2553 void *id_block = NULL, *id_auth = NULL; 2554 int ret; 2555 2556 if (!sev_snp_guest(kvm)) 2557 return -ENOTTY; 2558 2559 if (!sev->snp_context) 2560 return -EINVAL; 2561 2562 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2563 return -EFAULT; 2564 2565 if (params.flags) 2566 return -EINVAL; 2567 2568 /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. 
*/ 2569 ret = snp_launch_update_vmsa(kvm, argp); 2570 if (ret) 2571 return ret; 2572 2573 data = kzalloc_obj(*data, GFP_KERNEL_ACCOUNT); 2574 if (!data) 2575 return -ENOMEM; 2576 2577 if (params.id_block_en) { 2578 id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); 2579 if (IS_ERR(id_block)) { 2580 ret = PTR_ERR(id_block); 2581 goto e_free; 2582 } 2583 2584 data->id_block_en = 1; 2585 data->id_block_paddr = __sme_pa(id_block); 2586 2587 id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); 2588 if (IS_ERR(id_auth)) { 2589 ret = PTR_ERR(id_auth); 2590 goto e_free_id_block; 2591 } 2592 2593 data->id_auth_paddr = __sme_pa(id_auth); 2594 2595 if (params.auth_key_en) 2596 data->auth_key_en = 1; 2597 } 2598 2599 data->vcek_disabled = params.vcek_disabled; 2600 2601 memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE); 2602 data->gctx_paddr = __psp_pa(sev->snp_context); 2603 ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); 2604 2605 /* 2606 * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages 2607 * can be given to the guest simply by marking the RMP entry as private. 2608 * This can happen on first access and also with KVM_PRE_FAULT_MEMORY. 
2609 */ 2610 if (!ret) 2611 kvm->arch.pre_fault_allowed = true; 2612 2613 kfree(id_auth); 2614 2615 e_free_id_block: 2616 kfree(id_block); 2617 2618 e_free: 2619 kfree(data); 2620 2621 return ret; 2622 } 2623 2624 static int snp_enable_certs(struct kvm *kvm) 2625 { 2626 if (kvm->created_vcpus || !sev_snp_guest(kvm)) 2627 return -EINVAL; 2628 2629 to_kvm_sev_info(kvm)->snp_certs_enabled = true; 2630 2631 return 0; 2632 } 2633 2634 int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) 2635 { 2636 struct kvm_sev_cmd sev_cmd; 2637 int r; 2638 2639 if (!sev_enabled) 2640 return -ENOTTY; 2641 2642 if (!argp) 2643 return 0; 2644 2645 if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd))) 2646 return -EFAULT; 2647 2648 guard(mutex)(&kvm->lock); 2649 2650 /* Only the enc_context_owner handles some memory enc operations. */ 2651 if (is_mirroring_enc_context(kvm) && 2652 !is_cmd_allowed_from_mirror(sev_cmd.id)) 2653 return -EINVAL; 2654 2655 /* 2656 * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only 2657 * allow the use of SNP-specific commands. 
2658 */ 2659 if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) 2660 return -EPERM; 2661 2662 switch (sev_cmd.id) { 2663 case KVM_SEV_ES_INIT: 2664 if (!sev_es_enabled) 2665 return -ENOTTY; 2666 fallthrough; 2667 case KVM_SEV_INIT: 2668 r = sev_guest_init(kvm, &sev_cmd); 2669 break; 2670 case KVM_SEV_INIT2: 2671 r = sev_guest_init2(kvm, &sev_cmd); 2672 break; 2673 case KVM_SEV_LAUNCH_START: 2674 r = sev_launch_start(kvm, &sev_cmd); 2675 break; 2676 case KVM_SEV_LAUNCH_UPDATE_DATA: 2677 r = sev_launch_update_data(kvm, &sev_cmd); 2678 break; 2679 case KVM_SEV_LAUNCH_UPDATE_VMSA: 2680 r = sev_launch_update_vmsa(kvm, &sev_cmd); 2681 break; 2682 case KVM_SEV_LAUNCH_MEASURE: 2683 r = sev_launch_measure(kvm, &sev_cmd); 2684 break; 2685 case KVM_SEV_LAUNCH_FINISH: 2686 r = sev_launch_finish(kvm, &sev_cmd); 2687 break; 2688 case KVM_SEV_GUEST_STATUS: 2689 r = sev_guest_status(kvm, &sev_cmd); 2690 break; 2691 case KVM_SEV_DBG_DECRYPT: 2692 r = sev_dbg_crypt(kvm, &sev_cmd, true); 2693 break; 2694 case KVM_SEV_DBG_ENCRYPT: 2695 r = sev_dbg_crypt(kvm, &sev_cmd, false); 2696 break; 2697 case KVM_SEV_LAUNCH_SECRET: 2698 r = sev_launch_secret(kvm, &sev_cmd); 2699 break; 2700 case KVM_SEV_GET_ATTESTATION_REPORT: 2701 r = sev_get_attestation_report(kvm, &sev_cmd); 2702 break; 2703 case KVM_SEV_SEND_START: 2704 r = sev_send_start(kvm, &sev_cmd); 2705 break; 2706 case KVM_SEV_SEND_UPDATE_DATA: 2707 r = sev_send_update_data(kvm, &sev_cmd); 2708 break; 2709 case KVM_SEV_SEND_FINISH: 2710 r = sev_send_finish(kvm, &sev_cmd); 2711 break; 2712 case KVM_SEV_SEND_CANCEL: 2713 r = sev_send_cancel(kvm, &sev_cmd); 2714 break; 2715 case KVM_SEV_RECEIVE_START: 2716 r = sev_receive_start(kvm, &sev_cmd); 2717 break; 2718 case KVM_SEV_RECEIVE_UPDATE_DATA: 2719 r = sev_receive_update_data(kvm, &sev_cmd); 2720 break; 2721 case KVM_SEV_RECEIVE_FINISH: 2722 r = sev_receive_finish(kvm, &sev_cmd); 2723 break; 2724 case KVM_SEV_SNP_LAUNCH_START: 2725 r = snp_launch_start(kvm, &sev_cmd); 2726 
break; 2727 case KVM_SEV_SNP_LAUNCH_UPDATE: 2728 r = snp_launch_update(kvm, &sev_cmd); 2729 break; 2730 case KVM_SEV_SNP_LAUNCH_FINISH: 2731 r = snp_launch_finish(kvm, &sev_cmd); 2732 break; 2733 case KVM_SEV_SNP_ENABLE_REQ_CERTS: 2734 r = snp_enable_certs(kvm); 2735 break; 2736 default: 2737 return -EINVAL; 2738 } 2739 2740 if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd))) 2741 r = -EFAULT; 2742 2743 return r; 2744 } 2745 2746 int sev_mem_enc_register_region(struct kvm *kvm, 2747 struct kvm_enc_region *range) 2748 { 2749 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2750 struct enc_region *region; 2751 int ret = 0; 2752 2753 guard(mutex)(&kvm->lock); 2754 2755 if (!sev_guest(kvm)) 2756 return -ENOTTY; 2757 2758 /* If kvm is mirroring encryption context it isn't responsible for it */ 2759 if (is_mirroring_enc_context(kvm)) 2760 return -EINVAL; 2761 2762 region = kzalloc_obj(*region, GFP_KERNEL_ACCOUNT); 2763 if (!region) 2764 return -ENOMEM; 2765 2766 region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 2767 FOLL_WRITE | FOLL_LONGTERM); 2768 if (IS_ERR(region->pages)) { 2769 ret = PTR_ERR(region->pages); 2770 goto e_free; 2771 } 2772 2773 /* 2774 * The guest may change the memory encryption attribute from C=0 -> C=1 2775 * or vice versa for this memory range. Lets make sure caches are 2776 * flushed to ensure that guest data gets written into memory with 2777 * correct C-bit. Note, this must be done before dropping kvm->lock, 2778 * as region and its array of pages can be freed by a different task 2779 * once kvm->lock is released. 
2780 */ 2781 sev_clflush_pages(region->pages, region->npages); 2782 2783 region->uaddr = range->addr; 2784 region->size = range->size; 2785 2786 list_add_tail(®ion->list, &sev->regions_list); 2787 return ret; 2788 2789 e_free: 2790 kfree(region); 2791 return ret; 2792 } 2793 2794 static struct enc_region * 2795 find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) 2796 { 2797 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2798 struct list_head *head = &sev->regions_list; 2799 struct enc_region *i; 2800 2801 list_for_each_entry(i, head, list) { 2802 if (i->uaddr == range->addr && 2803 i->size == range->size) 2804 return i; 2805 } 2806 2807 return NULL; 2808 } 2809 2810 static void __unregister_enc_region_locked(struct kvm *kvm, 2811 struct enc_region *region) 2812 { 2813 sev_unpin_memory(kvm, region->pages, region->npages); 2814 list_del(®ion->list); 2815 kfree(region); 2816 } 2817 2818 int sev_mem_enc_unregister_region(struct kvm *kvm, 2819 struct kvm_enc_region *range) 2820 { 2821 struct enc_region *region; 2822 2823 /* If kvm is mirroring encryption context it isn't responsible for it */ 2824 if (is_mirroring_enc_context(kvm)) 2825 return -EINVAL; 2826 2827 guard(mutex)(&kvm->lock); 2828 2829 if (!sev_guest(kvm)) 2830 return -ENOTTY; 2831 2832 region = find_enc_region(kvm, range); 2833 if (!region) 2834 return -EINVAL; 2835 2836 sev_writeback_caches(kvm); 2837 2838 __unregister_enc_region_locked(kvm, region); 2839 2840 return 0; 2841 } 2842 2843 int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) 2844 { 2845 CLASS(fd, f)(source_fd); 2846 struct kvm *source_kvm; 2847 struct kvm_sev_info *source_sev, *mirror_sev; 2848 int ret; 2849 2850 if (fd_empty(f)) 2851 return -EBADF; 2852 2853 if (!file_is_kvm(fd_file(f))) 2854 return -EBADF; 2855 2856 source_kvm = fd_file(f)->private_data; 2857 ret = sev_lock_two_vms(kvm, source_kvm); 2858 if (ret) 2859 return ret; 2860 2861 /* 2862 * Mirrors of mirrors should work, but let's not get silly. 
Also 2863 * disallow out-of-band SEV/SEV-ES init if the target is already an 2864 * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being 2865 * created after SEV/SEV-ES initialization, e.g. to init intercepts. 2866 */ 2867 if (sev_guest(kvm) || !sev_guest(source_kvm) || 2868 is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) { 2869 ret = -EINVAL; 2870 goto e_unlock; 2871 } 2872 2873 mirror_sev = to_kvm_sev_info(kvm); 2874 if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 2875 ret = -ENOMEM; 2876 goto e_unlock; 2877 } 2878 2879 /* 2880 * The mirror kvm holds an enc_context_owner ref so its asid can't 2881 * disappear until we're done with it 2882 */ 2883 source_sev = to_kvm_sev_info(source_kvm); 2884 kvm_get_kvm(source_kvm); 2885 list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); 2886 2887 /* Set enc_context_owner and copy its encryption context over */ 2888 mirror_sev->enc_context_owner = source_kvm; 2889 mirror_sev->active = true; 2890 mirror_sev->asid = source_sev->asid; 2891 mirror_sev->fd = source_sev->fd; 2892 mirror_sev->es_active = source_sev->es_active; 2893 mirror_sev->need_init = false; 2894 mirror_sev->handle = source_sev->handle; 2895 INIT_LIST_HEAD(&mirror_sev->regions_list); 2896 INIT_LIST_HEAD(&mirror_sev->mirror_vms); 2897 ret = 0; 2898 2899 /* 2900 * Do not copy ap_jump_table. Since the mirror does not share the same 2901 * KVM contexts as the original, and they may have different 2902 * memory-views. 
2903 */ 2904 2905 e_unlock: 2906 sev_unlock_two_vms(kvm, source_kvm); 2907 return ret; 2908 } 2909 2910 static int snp_decommission_context(struct kvm *kvm) 2911 { 2912 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2913 struct sev_data_snp_addr data = {}; 2914 int ret; 2915 2916 /* If context is not created then do nothing */ 2917 if (!sev->snp_context) 2918 return 0; 2919 2920 /* Do the decommision, which will unbind the ASID from the SNP context */ 2921 data.address = __sme_pa(sev->snp_context); 2922 down_write(&sev_deactivate_lock); 2923 ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); 2924 up_write(&sev_deactivate_lock); 2925 2926 if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) 2927 return ret; 2928 2929 snp_free_firmware_page(sev->snp_context); 2930 sev->snp_context = NULL; 2931 2932 return 0; 2933 } 2934 2935 void sev_vm_init(struct kvm *kvm) 2936 { 2937 switch (kvm->arch.vm_type) { 2938 case KVM_X86_DEFAULT_VM: 2939 case KVM_X86_SW_PROTECTED_VM: 2940 break; 2941 case KVM_X86_SNP_VM: 2942 kvm->arch.has_private_mem = true; 2943 fallthrough; 2944 case KVM_X86_SEV_ES_VM: 2945 kvm->arch.has_protected_state = true; 2946 fallthrough; 2947 case KVM_X86_SEV_VM: 2948 kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; 2949 to_kvm_sev_info(kvm)->need_init = true; 2950 break; 2951 default: 2952 WARN_ONCE(1, "Unsupported VM type %u", kvm->arch.vm_type); 2953 break; 2954 } 2955 } 2956 2957 void sev_vm_destroy(struct kvm *kvm) 2958 { 2959 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2960 struct list_head *head = &sev->regions_list; 2961 struct list_head *pos, *q; 2962 2963 if (!sev_guest(kvm)) 2964 return; 2965 2966 WARN_ON(!list_empty(&sev->mirror_vms)); 2967 2968 free_cpumask_var(sev->have_run_cpus); 2969 2970 /* 2971 * If this is a mirror VM, remove it from the owner's list of a mirrors 2972 * and skip ASID cleanup (the ASID is tied to the lifetime of the owner). 
2973 * Note, mirror VMs don't support registering encrypted regions. 2974 */ 2975 if (is_mirroring_enc_context(kvm)) { 2976 struct kvm *owner_kvm = sev->enc_context_owner; 2977 2978 mutex_lock(&owner_kvm->lock); 2979 list_del(&sev->mirror_entry); 2980 mutex_unlock(&owner_kvm->lock); 2981 kvm_put_kvm(owner_kvm); 2982 return; 2983 } 2984 2985 2986 /* 2987 * if userspace was terminated before unregistering the memory regions 2988 * then lets unpin all the registered memory. 2989 */ 2990 if (!list_empty(head)) { 2991 list_for_each_safe(pos, q, head) { 2992 __unregister_enc_region_locked(kvm, 2993 list_entry(pos, struct enc_region, list)); 2994 cond_resched(); 2995 } 2996 } 2997 2998 if (sev_snp_guest(kvm)) { 2999 snp_guest_req_cleanup(kvm); 3000 3001 /* 3002 * Decomission handles unbinding of the ASID. If it fails for 3003 * some unexpected reason, just leak the ASID. 3004 */ 3005 if (snp_decommission_context(kvm)) 3006 return; 3007 } else { 3008 sev_unbind_asid(kvm, sev->handle); 3009 } 3010 3011 sev_asid_free(sev); 3012 } 3013 3014 void __init sev_set_cpu_caps(void) 3015 { 3016 if (sev_enabled) { 3017 kvm_cpu_cap_set(X86_FEATURE_SEV); 3018 kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM); 3019 } 3020 if (sev_es_enabled) { 3021 kvm_cpu_cap_set(X86_FEATURE_SEV_ES); 3022 kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); 3023 } 3024 if (sev_snp_enabled) { 3025 kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); 3026 kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM); 3027 } 3028 } 3029 3030 static bool is_sev_snp_initialized(void) 3031 { 3032 struct sev_user_data_snp_status *status; 3033 struct sev_data_snp_addr buf; 3034 bool initialized = false; 3035 int ret, error = 0; 3036 3037 status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); 3038 if (!status) 3039 return false; 3040 3041 buf.address = __psp_pa(status); 3042 ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); 3043 if (ret) { 3044 pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", 3045 
ret, error, error); 3046 goto out; 3047 } 3048 3049 initialized = !!status->state; 3050 3051 out: 3052 snp_free_firmware_page(status); 3053 3054 return initialized; 3055 } 3056 3057 void __init sev_hardware_setup(void) 3058 { 3059 unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; 3060 struct sev_platform_init_args init_args = {0}; 3061 bool sev_snp_supported = false; 3062 bool sev_es_supported = false; 3063 bool sev_supported = false; 3064 3065 if (!sev_enabled || !npt_enabled || !nrips) 3066 goto out; 3067 3068 /* 3069 * SEV must obviously be supported in hardware. Sanity check that the 3070 * CPU supports decode assists, which is mandatory for SEV guests to 3071 * support instruction emulation. Ditto for flushing by ASID, as SEV 3072 * guests are bound to a single ASID, i.e. KVM can't rotate to a new 3073 * ASID to effect a TLB flush. 3074 */ 3075 if (!boot_cpu_has(X86_FEATURE_SEV) || 3076 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) || 3077 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) 3078 goto out; 3079 3080 /* 3081 * The kernel's initcall infrastructure lacks the ability to express 3082 * dependencies between initcalls, whereas the modules infrastructure 3083 * automatically handles dependencies via symbol loading. Ensure the 3084 * PSP SEV driver is initialized before proceeding if KVM is built-in, 3085 * as the dependency isn't handled by the initcall infrastructure. 
3086 */ 3087 if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) 3088 goto out; 3089 3090 /* Retrieve SEV CPUID information */ 3091 cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); 3092 3093 /* Set encryption bit location for SEV-ES guests */ 3094 sev_enc_bit = ebx & 0x3f; 3095 3096 /* Maximum number of encrypted guests supported simultaneously */ 3097 max_sev_asid = ecx; 3098 if (!max_sev_asid) 3099 goto out; 3100 3101 /* Minimum ASID value that should be used for SEV guest */ 3102 min_sev_asid = edx; 3103 sev_me_mask = 1UL << (ebx & 0x3f); 3104 3105 /* 3106 * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap, 3107 * even though it's never used, so that the bitmap is indexed by the 3108 * actual ASID. 3109 */ 3110 nr_asids = max_sev_asid + 1; 3111 sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); 3112 if (!sev_asid_bitmap) 3113 goto out; 3114 3115 sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); 3116 if (!sev_reclaim_asid_bitmap) { 3117 bitmap_free(sev_asid_bitmap); 3118 sev_asid_bitmap = NULL; 3119 goto out; 3120 } 3121 3122 if (min_sev_asid <= max_sev_asid) { 3123 sev_asid_count = max_sev_asid - min_sev_asid + 1; 3124 WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); 3125 } 3126 sev_supported = true; 3127 3128 /* SEV-ES support requested? */ 3129 if (!sev_es_enabled) 3130 goto out; 3131 3132 /* 3133 * SEV-ES requires MMIO caching as KVM doesn't have access to the guest 3134 * instruction stream, i.e. can't emulate in response to a #NPF and 3135 * instead relies on #NPF(RSVD) being reflected into the guest as #VC 3136 * (the guest can then do a #VMGEXIT to request MMIO emulation). 3137 */ 3138 if (!enable_mmio_caching) 3139 goto out; 3140 3141 /* Does the CPU support SEV-ES? 
*/ 3142 if (!boot_cpu_has(X86_FEATURE_SEV_ES)) 3143 goto out; 3144 3145 if (!lbrv) { 3146 WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), 3147 "LBRV must be present for SEV-ES support"); 3148 goto out; 3149 } 3150 3151 /* Has the system been allocated ASIDs for SEV-ES? */ 3152 if (min_sev_asid == 1) 3153 goto out; 3154 3155 min_sev_es_asid = min_snp_asid = 1; 3156 max_sev_es_asid = max_snp_asid = min_sev_asid - 1; 3157 3158 sev_es_asid_count = min_sev_asid - 1; 3159 WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); 3160 sev_es_supported = true; 3161 sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); 3162 3163 out: 3164 if (sev_enabled) { 3165 init_args.probe = true; 3166 3167 if (sev_is_snp_ciphertext_hiding_supported()) 3168 init_args.max_snp_asid = min(nr_ciphertext_hiding_asids, 3169 min_sev_asid - 1); 3170 3171 if (sev_platform_init(&init_args)) 3172 sev_supported = sev_es_supported = sev_snp_supported = false; 3173 else if (sev_snp_supported) 3174 sev_snp_supported = is_sev_snp_initialized(); 3175 3176 if (sev_snp_supported) { 3177 snp_supported_policy_bits = sev_get_snp_policy_bits() & 3178 KVM_SNP_POLICY_MASK_VALID; 3179 nr_ciphertext_hiding_asids = init_args.max_snp_asid; 3180 } 3181 3182 /* 3183 * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP 3184 * ASID range is partitioned into separate SEV-ES and SEV-SNP 3185 * ASID ranges, with the SEV-SNP range being [1..max_snp_asid] 3186 * and the SEV-ES range being (max_snp_asid..max_sev_es_asid]. 3187 * Note, SEV-ES may effectively be disabled if all ASIDs from 3188 * the joint range are assigned to SEV-SNP. 3189 */ 3190 if (nr_ciphertext_hiding_asids) { 3191 max_snp_asid = nr_ciphertext_hiding_asids; 3192 min_sev_es_asid = max_snp_asid + 1; 3193 pr_info("SEV-SNP ciphertext hiding enabled\n"); 3194 } 3195 } 3196 3197 if (boot_cpu_has(X86_FEATURE_SEV)) 3198 pr_info("SEV %s (ASIDs %u - %u)\n", 3199 sev_supported ? min_sev_asid <= max_sev_asid ? 
"enabled" : 3200 "unusable" : 3201 "disabled", 3202 min_sev_asid, max_sev_asid); 3203 if (boot_cpu_has(X86_FEATURE_SEV_ES)) 3204 pr_info("SEV-ES %s (ASIDs %u - %u)\n", 3205 sev_es_supported ? min_sev_es_asid <= max_sev_es_asid ? "enabled" : 3206 "unusable" : 3207 "disabled", 3208 min_sev_es_asid, max_sev_es_asid); 3209 if (boot_cpu_has(X86_FEATURE_SEV_SNP)) 3210 pr_info("SEV-SNP %s (ASIDs %u - %u)\n", 3211 str_enabled_disabled(sev_snp_supported), 3212 min_snp_asid, max_snp_asid); 3213 3214 sev_enabled = sev_supported; 3215 sev_es_enabled = sev_es_supported; 3216 sev_snp_enabled = sev_snp_supported; 3217 3218 sev_supported_vmsa_features = 0; 3219 3220 if (sev_es_enabled && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) && 3221 cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) 3222 sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; 3223 3224 if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC)) 3225 sev_supported_vmsa_features |= SVM_SEV_FEAT_SECURE_TSC; 3226 } 3227 3228 void sev_hardware_unsetup(void) 3229 { 3230 if (!sev_enabled) 3231 return; 3232 3233 /* No need to take sev_bitmap_lock, all VMs have been destroyed. */ 3234 sev_flush_asids(1, max_sev_asid); 3235 3236 bitmap_free(sev_asid_bitmap); 3237 bitmap_free(sev_reclaim_asid_bitmap); 3238 3239 misc_cg_set_capacity(MISC_CG_RES_SEV, 0); 3240 misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); 3241 3242 sev_platform_shutdown(); 3243 } 3244 3245 int sev_cpu_init(struct svm_cpu_data *sd) 3246 { 3247 if (!sev_enabled) 3248 return 0; 3249 3250 sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL); 3251 if (!sd->sev_vmcbs) 3252 return -ENOMEM; 3253 3254 return 0; 3255 } 3256 3257 /* 3258 * Pages used by hardware to hold guest encrypted state must be flushed before 3259 * returning them to the system. 3260 */ 3261 static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) 3262 { 3263 unsigned int asid = sev_get_asid(vcpu->kvm); 3264 3265 /* 3266 * Note! 
The address must be a kernel address, as regular page walk 3267 * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user 3268 * address is non-deterministic and unsafe. This function deliberately 3269 * takes a pointer to deter passing in a user address. 3270 */ 3271 unsigned long addr = (unsigned long)va; 3272 3273 /* 3274 * If CPU enforced cache coherency for encrypted mappings of the 3275 * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache 3276 * flush is still needed in order to work properly with DMA devices. 3277 */ 3278 if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) { 3279 clflush_cache_range(va, PAGE_SIZE); 3280 return; 3281 } 3282 3283 /* 3284 * VM Page Flush takes a host virtual address and a guest ASID. Fall 3285 * back to full writeback of caches if this faults so as not to make 3286 * any problems worse by leaving stale encrypted data in the cache. 3287 */ 3288 if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) 3289 goto do_sev_writeback_caches; 3290 3291 return; 3292 3293 do_sev_writeback_caches: 3294 sev_writeback_caches(vcpu->kvm); 3295 } 3296 3297 void sev_guest_memory_reclaimed(struct kvm *kvm) 3298 { 3299 /* 3300 * With SNP+gmem, private/encrypted memory is unreachable via the 3301 * hva-based mmu notifiers, i.e. these events are explicitly scoped to 3302 * shared pages, where there's no need to flush caches. 3303 * 3304 * Checking for SEV+ outside of kvm->lock is safe as __sev_guest_init() 3305 * can only be done before vCPUs are created, caches can be incoherent 3306 * if and only if a vCPU was run, and either this task will see the VM 3307 * as being SEV+ or the vCPU won't be to access the memory (because of 3308 * the in-progress invalidation). 
3309 */ 3310 if (!____sev_guest(kvm) || ____sev_snp_guest(kvm)) 3311 return; 3312 3313 sev_writeback_caches(kvm); 3314 } 3315 3316 void sev_free_vcpu(struct kvm_vcpu *vcpu) 3317 { 3318 struct vcpu_svm *svm; 3319 3320 if (!is_sev_es_guest(vcpu)) 3321 return; 3322 3323 svm = to_svm(vcpu); 3324 3325 /* 3326 * If it's an SNP guest, then the VMSA was marked in the RMP table as 3327 * a guest-owned page. Transition the page to hypervisor state before 3328 * releasing it back to the system. 3329 */ 3330 if (is_sev_snp_guest(vcpu)) { 3331 u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; 3332 3333 if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) 3334 goto skip_vmsa_free; 3335 } 3336 3337 if (vcpu->arch.guest_state_protected) 3338 sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); 3339 3340 __free_page(virt_to_page(svm->sev_es.vmsa)); 3341 3342 skip_vmsa_free: 3343 if (svm->sev_es.ghcb_sa_free) 3344 kvfree(svm->sev_es.ghcb_sa); 3345 } 3346 3347 static void dump_ghcb(struct vcpu_svm *svm) 3348 { 3349 struct vmcb_control_area *control = &svm->vmcb->control; 3350 unsigned int nbits; 3351 3352 /* Re-use the dump_invalid_vmcb module parameter */ 3353 if (!dump_invalid_vmcb) { 3354 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 3355 return; 3356 } 3357 3358 nbits = sizeof(svm->sev_es.valid_bitmap) * 8; 3359 3360 /* 3361 * Print KVM's snapshot of the GHCB values that were (unsuccessfully) 3362 * used to handle the exit. If the guest has since modified the GHCB 3363 * itself, dumping the raw GHCB won't help debug why KVM was unable to 3364 * handle the VMGEXIT that KVM observed. 
3365 */ 3366 pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); 3367 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", 3368 control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm)); 3369 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", 3370 control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); 3371 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", 3372 control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); 3373 pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", 3374 svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); 3375 pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); 3376 } 3377 3378 static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) 3379 { 3380 struct kvm_vcpu *vcpu = &svm->vcpu; 3381 struct ghcb *ghcb = svm->sev_es.ghcb; 3382 3383 /* 3384 * The GHCB protocol so far allows for the following data 3385 * to be returned: 3386 * GPRs RAX, RBX, RCX, RDX 3387 * 3388 * Copy their values, even if they may not have been written during the 3389 * VM-Exit. It's the guest's responsibility to not consume random data. 3390 */ 3391 ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]); 3392 ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]); 3393 ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]); 3394 ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]); 3395 } 3396 3397 static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) 3398 { 3399 struct vmcb_control_area *control = &svm->vmcb->control; 3400 struct kvm_vcpu *vcpu = &svm->vcpu; 3401 struct ghcb *ghcb = svm->sev_es.ghcb; 3402 3403 /* 3404 * The GHCB protocol so far allows for the following data 3405 * to be supplied: 3406 * GPRs RAX, RBX, RCX, RDX 3407 * XCR0 3408 * CPL 3409 * 3410 * VMMCALL allows the guest to provide extra registers. KVM also 3411 * expects RSI for hypercalls, so include that, too. 3412 * 3413 * Copy their values to the appropriate location if supplied. 
3414 */ 3415 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 3416 3417 BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap)); 3418 memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap)); 3419 3420 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm); 3421 vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm); 3422 vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm); 3423 vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm); 3424 vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm); 3425 3426 svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm); 3427 3428 if (kvm_ghcb_xcr0_is_valid(svm)) 3429 __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm)); 3430 3431 if (kvm_ghcb_xss_is_valid(svm)) 3432 __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); 3433 3434 /* Copy the GHCB exit information into the VMCB fields */ 3435 control->exit_code = kvm_ghcb_get_sw_exit_code(svm); 3436 control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); 3437 control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); 3438 svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm); 3439 3440 /* Clear the valid entries fields */ 3441 memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); 3442 } 3443 3444 static int sev_es_validate_vmgexit(struct vcpu_svm *svm) 3445 { 3446 struct vmcb_control_area *control = &svm->vmcb->control; 3447 struct kvm_vcpu *vcpu = &svm->vcpu; 3448 u64 reason; 3449 3450 /* Only GHCB Usage code 0 is supported */ 3451 if (svm->sev_es.ghcb->ghcb_usage) { 3452 reason = GHCB_ERR_INVALID_USAGE; 3453 goto vmgexit_err; 3454 } 3455 3456 reason = GHCB_ERR_MISSING_INPUT; 3457 3458 if (!kvm_ghcb_sw_exit_code_is_valid(svm) || 3459 !kvm_ghcb_sw_exit_info_1_is_valid(svm) || 3460 !kvm_ghcb_sw_exit_info_2_is_valid(svm)) 3461 goto vmgexit_err; 3462 3463 switch (control->exit_code) { 3464 case SVM_EXIT_READ_DR7: 3465 break; 3466 case 
SVM_EXIT_WRITE_DR7: 3467 if (!kvm_ghcb_rax_is_valid(svm)) 3468 goto vmgexit_err; 3469 break; 3470 case SVM_EXIT_RDTSC: 3471 break; 3472 case SVM_EXIT_RDPMC: 3473 if (!kvm_ghcb_rcx_is_valid(svm)) 3474 goto vmgexit_err; 3475 break; 3476 case SVM_EXIT_CPUID: 3477 if (!kvm_ghcb_rax_is_valid(svm) || 3478 !kvm_ghcb_rcx_is_valid(svm)) 3479 goto vmgexit_err; 3480 if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd) 3481 if (!kvm_ghcb_xcr0_is_valid(svm)) 3482 goto vmgexit_err; 3483 break; 3484 case SVM_EXIT_INVD: 3485 break; 3486 case SVM_EXIT_IOIO: 3487 if (control->exit_info_1 & SVM_IOIO_STR_MASK) { 3488 if (!kvm_ghcb_sw_scratch_is_valid(svm)) 3489 goto vmgexit_err; 3490 } else { 3491 if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK)) 3492 if (!kvm_ghcb_rax_is_valid(svm)) 3493 goto vmgexit_err; 3494 } 3495 break; 3496 case SVM_EXIT_MSR: 3497 if (!kvm_ghcb_rcx_is_valid(svm)) 3498 goto vmgexit_err; 3499 if (control->exit_info_1) { 3500 if (!kvm_ghcb_rax_is_valid(svm) || 3501 !kvm_ghcb_rdx_is_valid(svm)) 3502 goto vmgexit_err; 3503 } 3504 break; 3505 case SVM_EXIT_VMMCALL: 3506 if (!kvm_ghcb_rax_is_valid(svm) || 3507 !kvm_ghcb_cpl_is_valid(svm)) 3508 goto vmgexit_err; 3509 break; 3510 case SVM_EXIT_RDTSCP: 3511 break; 3512 case SVM_EXIT_WBINVD: 3513 break; 3514 case SVM_EXIT_MONITOR: 3515 if (!kvm_ghcb_rax_is_valid(svm) || 3516 !kvm_ghcb_rcx_is_valid(svm) || 3517 !kvm_ghcb_rdx_is_valid(svm)) 3518 goto vmgexit_err; 3519 break; 3520 case SVM_EXIT_MWAIT: 3521 if (!kvm_ghcb_rax_is_valid(svm) || 3522 !kvm_ghcb_rcx_is_valid(svm)) 3523 goto vmgexit_err; 3524 break; 3525 case SVM_VMGEXIT_MMIO_READ: 3526 case SVM_VMGEXIT_MMIO_WRITE: 3527 if (!kvm_ghcb_sw_scratch_is_valid(svm)) 3528 goto vmgexit_err; 3529 break; 3530 case SVM_VMGEXIT_AP_CREATION: 3531 if (!is_sev_snp_guest(vcpu)) 3532 goto vmgexit_err; 3533 if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) 3534 if (!kvm_ghcb_rax_is_valid(svm)) 3535 goto vmgexit_err; 3536 break; 3537 case SVM_VMGEXIT_NMI_COMPLETE: 3538 case 
SVM_VMGEXIT_AP_HLT_LOOP: 3539 case SVM_VMGEXIT_AP_JUMP_TABLE: 3540 case SVM_VMGEXIT_UNSUPPORTED_EVENT: 3541 case SVM_VMGEXIT_HV_FEATURES: 3542 case SVM_VMGEXIT_TERM_REQUEST: 3543 break; 3544 case SVM_VMGEXIT_PSC: 3545 if (!is_sev_snp_guest(vcpu) || !kvm_ghcb_sw_scratch_is_valid(svm)) 3546 goto vmgexit_err; 3547 break; 3548 case SVM_VMGEXIT_GUEST_REQUEST: 3549 case SVM_VMGEXIT_EXT_GUEST_REQUEST: 3550 if (!is_sev_snp_guest(vcpu) || 3551 !PAGE_ALIGNED(control->exit_info_1) || 3552 !PAGE_ALIGNED(control->exit_info_2) || 3553 control->exit_info_1 == control->exit_info_2) 3554 goto vmgexit_err; 3555 break; 3556 default: 3557 reason = GHCB_ERR_INVALID_EVENT; 3558 goto vmgexit_err; 3559 } 3560 3561 return 0; 3562 3563 vmgexit_err: 3564 /* 3565 * Print the exit code even though it may not be marked valid as it 3566 * could help with debugging. 3567 */ 3568 if (reason == GHCB_ERR_INVALID_USAGE) { 3569 vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", 3570 svm->sev_es.ghcb->ghcb_usage); 3571 } else if (reason == GHCB_ERR_INVALID_EVENT) { 3572 vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", 3573 control->exit_code); 3574 } else { 3575 vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", 3576 control->exit_code); 3577 dump_ghcb(svm); 3578 } 3579 3580 svm_vmgexit_bad_input(svm, reason); 3581 3582 /* Resume the guest to "return" the error code. */ 3583 return 1; 3584 } 3585 3586 void sev_es_unmap_ghcb(struct vcpu_svm *svm) 3587 { 3588 /* Clear any indication that the vCPU is in a type of AP Reset Hold */ 3589 svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; 3590 3591 if (!svm->sev_es.ghcb) 3592 return; 3593 3594 if (svm->sev_es.ghcb_sa_free) { 3595 /* 3596 * The scratch area lives outside the GHCB, so there is a 3597 * buffer that, depending on the operation performed, may 3598 * need to be synced, then freed. 
3599 */ 3600 if (svm->sev_es.ghcb_sa_sync) { 3601 kvm_write_guest(svm->vcpu.kvm, 3602 svm->sev_es.sw_scratch, 3603 svm->sev_es.ghcb_sa, 3604 svm->sev_es.ghcb_sa_len); 3605 svm->sev_es.ghcb_sa_sync = false; 3606 } 3607 3608 kvfree(svm->sev_es.ghcb_sa); 3609 svm->sev_es.ghcb_sa = NULL; 3610 svm->sev_es.ghcb_sa_free = false; 3611 } 3612 3613 trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); 3614 3615 sev_es_sync_to_ghcb(svm); 3616 3617 kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); 3618 svm->sev_es.ghcb = NULL; 3619 } 3620 3621 int pre_sev_run(struct vcpu_svm *svm, int cpu) 3622 { 3623 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 3624 struct kvm_vcpu *vcpu = &svm->vcpu; 3625 struct kvm *kvm = vcpu->kvm; 3626 unsigned int asid = sev_get_asid(kvm); 3627 3628 /* 3629 * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid 3630 * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP 3631 * AP Destroy event. 3632 */ 3633 if (is_sev_es_guest(vcpu) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) 3634 return -EINVAL; 3635 3636 /* 3637 * To optimize cache flushes when memory is reclaimed from an SEV VM, 3638 * track physical CPUs that enter the guest for SEV VMs and thus can 3639 * have encrypted, dirty data in the cache, and flush caches only for 3640 * CPUs that have entered the guest. 3641 */ 3642 if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus)) 3643 cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus); 3644 3645 /* Assign the asid allocated with this SEV guest */ 3646 svm->asid = asid; 3647 3648 /* 3649 * Flush guest TLB: 3650 * 3651 * 1) when different VMCB for the same ASID is to be run on the same host CPU. 3652 * 2) or this VMCB was executed on different host CPU in previous VMRUNs. 
3653 */ 3654 if (sd->sev_vmcbs[asid] == svm->vmcb && 3655 svm->vcpu.arch.last_vmentry_cpu == cpu) 3656 return 0; 3657 3658 sd->sev_vmcbs[asid] = svm->vmcb; 3659 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 3660 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 3661 return 0; 3662 } 3663 3664 #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) 3665 static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) 3666 { 3667 struct vmcb_control_area *control = &svm->vmcb->control; 3668 u64 ghcb_scratch_beg, ghcb_scratch_end; 3669 u64 scratch_gpa_beg, scratch_gpa_end; 3670 void *scratch_va; 3671 3672 scratch_gpa_beg = svm->sev_es.sw_scratch; 3673 if (!scratch_gpa_beg) { 3674 pr_err("vmgexit: scratch gpa not provided\n"); 3675 goto e_scratch; 3676 } 3677 3678 scratch_gpa_end = scratch_gpa_beg + len; 3679 if (scratch_gpa_end < scratch_gpa_beg) { 3680 pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", 3681 len, scratch_gpa_beg); 3682 goto e_scratch; 3683 } 3684 3685 if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { 3686 /* Scratch area begins within GHCB */ 3687 ghcb_scratch_beg = control->ghcb_gpa + 3688 offsetof(struct ghcb, shared_buffer); 3689 ghcb_scratch_end = control->ghcb_gpa + 3690 offsetof(struct ghcb, reserved_0xff0); 3691 3692 /* 3693 * If the scratch area begins within the GHCB, it must be 3694 * completely contained in the GHCB shared buffer area. 
3695 */ 3696 if (scratch_gpa_beg < ghcb_scratch_beg || 3697 scratch_gpa_end > ghcb_scratch_end) { 3698 pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n", 3699 scratch_gpa_beg, scratch_gpa_end); 3700 goto e_scratch; 3701 } 3702 3703 scratch_va = (void *)svm->sev_es.ghcb; 3704 scratch_va += (scratch_gpa_beg - control->ghcb_gpa); 3705 } else { 3706 /* 3707 * The guest memory must be read into a kernel buffer, so 3708 * limit the size 3709 */ 3710 if (len > GHCB_SCRATCH_AREA_LIMIT) { 3711 pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", 3712 len, GHCB_SCRATCH_AREA_LIMIT); 3713 goto e_scratch; 3714 } 3715 scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT); 3716 if (!scratch_va) 3717 return -ENOMEM; 3718 3719 if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { 3720 /* Unable to copy scratch area from guest */ 3721 pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); 3722 3723 kvfree(scratch_va); 3724 return -EFAULT; 3725 } 3726 3727 /* 3728 * The scratch area is outside the GHCB. The operation will 3729 * dictate whether the buffer needs to be synced before running 3730 * the vCPU next time (i.e. a read was requested so the data 3731 * must be written back to the guest memory). 
3732 */ 3733 svm->sev_es.ghcb_sa_sync = sync; 3734 svm->sev_es.ghcb_sa_free = true; 3735 } 3736 3737 svm->sev_es.ghcb_sa = scratch_va; 3738 svm->sev_es.ghcb_sa_len = len; 3739 3740 return 0; 3741 3742 e_scratch: 3743 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA); 3744 3745 return 1; 3746 } 3747 3748 static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask, 3749 unsigned int pos) 3750 { 3751 svm->vmcb->control.ghcb_gpa &= ~(mask << pos); 3752 svm->vmcb->control.ghcb_gpa |= (value & mask) << pos; 3753 } 3754 3755 static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos) 3756 { 3757 return (svm->vmcb->control.ghcb_gpa >> pos) & mask; 3758 } 3759 3760 static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) 3761 { 3762 svm->vmcb->control.ghcb_gpa = value; 3763 } 3764 3765 static int snp_rmptable_psmash(kvm_pfn_t pfn) 3766 { 3767 int ret; 3768 3769 pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); 3770 3771 /* 3772 * PSMASH_FAIL_INUSE indicates another processor is modifying the 3773 * entry, so retry until that's no longer the case. 
3774 */ 3775 do { 3776 ret = psmash(pfn); 3777 } while (ret == PSMASH_FAIL_INUSE); 3778 3779 return ret; 3780 } 3781 3782 static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) 3783 { 3784 struct vcpu_svm *svm = to_svm(vcpu); 3785 3786 if (vcpu->run->hypercall.ret) 3787 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3788 else 3789 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); 3790 3791 return 1; /* resume guest */ 3792 } 3793 3794 static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) 3795 { 3796 u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); 3797 u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); 3798 struct kvm_vcpu *vcpu = &svm->vcpu; 3799 3800 if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { 3801 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3802 return 1; /* resume guest */ 3803 } 3804 3805 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { 3806 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3807 return 1; /* resume guest */ 3808 } 3809 3810 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; 3811 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 3812 /* 3813 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 3814 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 3815 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 3816 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 3817 */ 3818 vcpu->run->hypercall.ret = 0; 3819 vcpu->run->hypercall.args[0] = gpa; 3820 vcpu->run->hypercall.args[1] = 1; 3821 vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) 3822 ? 
KVM_MAP_GPA_RANGE_ENCRYPTED 3823 : KVM_MAP_GPA_RANGE_DECRYPTED; 3824 vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K; 3825 3826 vcpu->arch.complete_userspace_io = snp_complete_psc_msr; 3827 3828 return 0; /* forward request to userspace */ 3829 } 3830 3831 struct psc_buffer { 3832 struct psc_hdr hdr; 3833 struct psc_entry entries[]; 3834 } __packed; 3835 3836 static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc); 3837 3838 static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) 3839 { 3840 svm->sev_es.psc_inflight = 0; 3841 svm->sev_es.psc_idx = 0; 3842 svm->sev_es.psc_2m = false; 3843 3844 /* 3845 * PSC requests always get a "no action" response in SW_EXITINFO1, with 3846 * a PSC-specific return code in SW_EXITINFO2 that provides the "real" 3847 * return code. E.g. if the PSC request was interrupted, the need to 3848 * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1. 3849 */ 3850 svm_vmgexit_no_action(svm, psc_ret); 3851 } 3852 3853 static void __snp_complete_one_psc(struct vcpu_svm *svm) 3854 { 3855 struct psc_buffer *psc = svm->sev_es.ghcb_sa; 3856 struct psc_entry *entries = psc->entries; 3857 struct psc_hdr *hdr = &psc->hdr; 3858 __u16 idx; 3859 3860 /* 3861 * Everything in-flight has been processed successfully. Update the 3862 * corresponding entries in the guest's PSC buffer and zero out the 3863 * count of in-flight PSC entries. 3864 */ 3865 for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight; 3866 svm->sev_es.psc_inflight--, idx++) { 3867 struct psc_entry *entry = &entries[idx]; 3868 3869 entry->cur_page = entry->pagesize ? 
512 : 1; 3870 } 3871 3872 hdr->cur_entry = idx; 3873 } 3874 3875 static int snp_complete_one_psc(struct kvm_vcpu *vcpu) 3876 { 3877 struct vcpu_svm *svm = to_svm(vcpu); 3878 struct psc_buffer *psc = svm->sev_es.ghcb_sa; 3879 3880 if (vcpu->run->hypercall.ret) { 3881 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3882 return 1; /* resume guest */ 3883 } 3884 3885 __snp_complete_one_psc(svm); 3886 3887 /* Handle the next range (if any). */ 3888 return snp_begin_psc(svm, psc); 3889 } 3890 3891 static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) 3892 { 3893 struct psc_entry *entries = psc->entries; 3894 struct kvm_vcpu *vcpu = &svm->vcpu; 3895 struct psc_hdr *hdr = &psc->hdr; 3896 struct psc_entry entry_start; 3897 u16 idx, idx_start, idx_end; 3898 int npages; 3899 bool huge; 3900 u64 gfn; 3901 3902 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { 3903 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3904 return 1; 3905 } 3906 3907 next_range: 3908 /* There should be no other PSCs in-flight at this point. */ 3909 if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) { 3910 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3911 return 1; 3912 } 3913 3914 /* 3915 * The PSC descriptor buffer can be modified by a misbehaved guest after 3916 * validation, so take care to only use validated copies of values used 3917 * for things like array indexing. 3918 */ 3919 idx_start = hdr->cur_entry; 3920 idx_end = hdr->end_entry; 3921 3922 if (idx_end >= VMGEXIT_PSC_MAX_COUNT) { 3923 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); 3924 return 1; 3925 } 3926 3927 /* Find the start of the next range which needs processing. */ 3928 for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) { 3929 entry_start = entries[idx]; 3930 3931 gfn = entry_start.gfn; 3932 huge = entry_start.pagesize; 3933 npages = huge ? 
512 : 1; 3934 3935 if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) { 3936 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY); 3937 return 1; 3938 } 3939 3940 if (entry_start.cur_page) { 3941 /* 3942 * If this is a partially-completed 2M range, force 4K handling 3943 * for the remaining pages since they're effectively split at 3944 * this point. Subsequent code should ensure this doesn't get 3945 * combined with adjacent PSC entries where 2M handling is still 3946 * possible. 3947 */ 3948 npages -= entry_start.cur_page; 3949 gfn += entry_start.cur_page; 3950 huge = false; 3951 } 3952 3953 if (npages) 3954 break; 3955 } 3956 3957 if (idx > idx_end) { 3958 /* Nothing more to process. */ 3959 snp_complete_psc(svm, 0); 3960 return 1; 3961 } 3962 3963 svm->sev_es.psc_2m = huge; 3964 svm->sev_es.psc_idx = idx; 3965 svm->sev_es.psc_inflight = 1; 3966 3967 /* 3968 * Find all subsequent PSC entries that contain adjacent GPA 3969 * ranges/operations and can be combined into a single 3970 * KVM_HC_MAP_GPA_RANGE exit. 3971 */ 3972 while (++idx <= idx_end) { 3973 struct psc_entry entry = entries[idx]; 3974 3975 if (entry.operation != entry_start.operation || 3976 entry.gfn != entry_start.gfn + npages || 3977 entry.cur_page || !!entry.pagesize != huge) 3978 break; 3979 3980 svm->sev_es.psc_inflight++; 3981 npages += huge ? 512 : 1; 3982 } 3983 3984 switch (entry_start.operation) { 3985 case VMGEXIT_PSC_OP_PRIVATE: 3986 case VMGEXIT_PSC_OP_SHARED: 3987 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; 3988 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 3989 /* 3990 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 3991 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 3992 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 3993 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 
3994 */ 3995 vcpu->run->hypercall.ret = 0; 3996 vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); 3997 vcpu->run->hypercall.args[1] = npages; 3998 vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE 3999 ? KVM_MAP_GPA_RANGE_ENCRYPTED 4000 : KVM_MAP_GPA_RANGE_DECRYPTED; 4001 vcpu->run->hypercall.args[2] |= entry_start.pagesize 4002 ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M 4003 : KVM_MAP_GPA_RANGE_PAGE_SZ_4K; 4004 vcpu->arch.complete_userspace_io = snp_complete_one_psc; 4005 return 0; /* forward request to userspace */ 4006 default: 4007 /* 4008 * Only shared/private PSC operations are currently supported, so if the 4009 * entire range consists of unsupported operations (e.g. SMASH/UNSMASH), 4010 * then consider the entire range completed and avoid exiting to 4011 * userspace. In theory snp_complete_psc() can always be called directly 4012 * at this point to complete the current range and start the next one, 4013 * but that could lead to unexpected levels of recursion. 4014 */ 4015 __snp_complete_one_psc(svm); 4016 goto next_range; 4017 } 4018 4019 BUG(); 4020 } 4021 4022 /* 4023 * Invoked as part of svm_vcpu_reset() processing of an init event. 4024 */ 4025 static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) 4026 { 4027 struct vcpu_svm *svm = to_svm(vcpu); 4028 struct kvm_memory_slot *slot; 4029 struct page *page; 4030 kvm_pfn_t pfn; 4031 gfn_t gfn; 4032 4033 guard(mutex)(&svm->sev_es.snp_vmsa_mutex); 4034 4035 if (!svm->sev_es.snp_ap_waiting_for_reset) 4036 return; 4037 4038 svm->sev_es.snp_ap_waiting_for_reset = false; 4039 4040 /* Mark the vCPU as offline and not runnable */ 4041 vcpu->arch.pv.pv_unhalted = false; 4042 kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); 4043 4044 /* Clear use of the VMSA */ 4045 svm->vmcb->control.vmsa_pa = INVALID_PAGE; 4046 4047 /* 4048 * When replacing the VMSA during SEV-SNP AP creation, 4049 * mark the VMCB dirty so that full state is always reloaded. 
4050 */ 4051 vmcb_mark_all_dirty(svm->vmcb); 4052 4053 if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) 4054 return; 4055 4056 gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); 4057 svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; 4058 4059 slot = gfn_to_memslot(vcpu->kvm, gfn); 4060 if (!slot) 4061 return; 4062 4063 /* 4064 * The new VMSA will be private memory guest memory, so retrieve the 4065 * PFN from the gmem backend. 4066 */ 4067 if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) 4068 return; 4069 4070 /* 4071 * From this point forward, the VMSA will always be a guest-mapped page 4072 * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In 4073 * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but 4074 * that involves cleanups like flushing caches, which would ideally be 4075 * handled during teardown rather than guest boot. Deferring that also 4076 * allows the existing logic for SEV-ES VMSAs to be re-used with 4077 * minimal SNP-specific changes. 4078 */ 4079 svm->sev_es.snp_has_guest_vmsa = true; 4080 4081 /* Use the new VMSA */ 4082 svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); 4083 4084 /* Mark the vCPU as runnable */ 4085 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 4086 4087 /* 4088 * gmem pages aren't currently migratable, but if this ever changes 4089 * then care should be taken to ensure svm->sev_es.vmsa is pinned 4090 * through some other means. 
4091 */ 4092 kvm_release_page_clean(page); 4093 } 4094 4095 static int sev_snp_ap_creation(struct vcpu_svm *svm) 4096 { 4097 struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); 4098 struct kvm_vcpu *vcpu = &svm->vcpu; 4099 struct kvm_vcpu *target_vcpu; 4100 struct vcpu_svm *target_svm; 4101 unsigned int request; 4102 unsigned int apic_id; 4103 4104 request = lower_32_bits(svm->vmcb->control.exit_info_1); 4105 apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); 4106 4107 /* Validate the APIC ID */ 4108 target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); 4109 if (!target_vcpu) { 4110 vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", 4111 apic_id); 4112 return -EINVAL; 4113 } 4114 4115 target_svm = to_svm(target_vcpu); 4116 4117 guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex); 4118 4119 switch (request) { 4120 case SVM_VMGEXIT_AP_CREATE_ON_INIT: 4121 case SVM_VMGEXIT_AP_CREATE: 4122 if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) { 4123 vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n", 4124 vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); 4125 return -EINVAL; 4126 } 4127 4128 if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { 4129 vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", 4130 svm->vmcb->control.exit_info_2); 4131 return -EINVAL; 4132 } 4133 4134 /* 4135 * Malicious guest can RMPADJUST a large page into VMSA which 4136 * will hit the SNP erratum where the CPU will incorrectly signal 4137 * an RMP violation #PF if a hugepage collides with the RMP entry 4138 * of VMSA page, reject the AP CREATE request if VMSA address from 4139 * guest is 2M aligned. 
4140 */ 4141 if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) { 4142 vcpu_unimpl(vcpu, 4143 "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", 4144 svm->vmcb->control.exit_info_2); 4145 return -EINVAL; 4146 } 4147 4148 target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; 4149 break; 4150 case SVM_VMGEXIT_AP_DESTROY: 4151 target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; 4152 break; 4153 default: 4154 vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", 4155 request); 4156 return -EINVAL; 4157 } 4158 4159 target_svm->sev_es.snp_ap_waiting_for_reset = true; 4160 4161 /* 4162 * Unless Creation is deferred until INIT, signal the vCPU to update 4163 * its state. 4164 */ 4165 if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) 4166 kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); 4167 4168 return 0; 4169 } 4170 4171 static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) 4172 { 4173 struct sev_data_snp_guest_request data = {0}; 4174 struct kvm *kvm = svm->vcpu.kvm; 4175 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 4176 sev_ret_code fw_err = 0; 4177 int ret; 4178 4179 if (!is_sev_snp_guest(&svm->vcpu)) 4180 return -EINVAL; 4181 4182 guard(mutex)(&sev->guest_req_mutex); 4183 4184 if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) 4185 return -EIO; 4186 4187 data.gctx_paddr = __psp_pa(sev->snp_context); 4188 data.req_paddr = __psp_pa(sev->guest_req_buf); 4189 data.res_paddr = __psp_pa(sev->guest_resp_buf); 4190 4191 /* 4192 * Firmware failures are propagated on to guest, but any other failure 4193 * condition along the way should be reported to userspace. E.g. if 4194 * the PSP is dead and commands are timing out. 
4195 */ 4196 ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err); 4197 if (ret && !fw_err) 4198 return ret; 4199 4200 if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) 4201 return -EIO; 4202 4203 /* No action is requested *from KVM* if there was a firmware error. */ 4204 svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err)); 4205 4206 /* resume guest */ 4207 return 1; 4208 } 4209 4210 static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error) 4211 { 4212 ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0)); 4213 4214 return 1; /* resume guest */ 4215 } 4216 4217 static int snp_complete_req_certs(struct kvm_vcpu *vcpu) 4218 { 4219 struct vcpu_svm *svm = to_svm(vcpu); 4220 struct vmcb_control_area *control = &svm->vmcb->control; 4221 4222 switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) { 4223 case 0: 4224 return snp_handle_guest_req(svm, control->exit_info_1, 4225 control->exit_info_2); 4226 case ENOSPC: 4227 vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages; 4228 return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN); 4229 case EAGAIN: 4230 return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY); 4231 case EIO: 4232 return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC); 4233 default: 4234 break; 4235 } 4236 4237 return -EINVAL; 4238 } 4239 4240 static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) 4241 { 4242 struct kvm_vcpu *vcpu = &svm->vcpu; 4243 struct kvm *kvm = vcpu->kvm; 4244 4245 u8 msg_type; 4246 4247 if (!is_sev_snp_guest(vcpu)) 4248 return -EINVAL; 4249 4250 if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type), 4251 &msg_type, 1)) 4252 return -EIO; 4253 4254 /* 4255 * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for 4256 * additional certificate data to be provided alongside the attestation 4257 * report via the guest-provided data pages indicated by RAX/RBX. 
If 4258 * userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace 4259 * to give userspace an opportunity to provide the certificate data 4260 * before issuing/completing the attestation request. Otherwise, return 4261 * an empty certificate table in the guest-provided data pages and 4262 * handle the attestation request immediately. 4263 */ 4264 if (msg_type == SNP_MSG_REPORT_REQ) { 4265 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; 4266 u64 data_npages; 4267 gpa_t data_gpa; 4268 4269 if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm)) 4270 goto request_invalid; 4271 4272 data_gpa = vcpu->arch.regs[VCPU_REGS_RAX]; 4273 data_npages = vcpu->arch.regs[VCPU_REGS_RBX]; 4274 4275 if (!PAGE_ALIGNED(data_gpa)) 4276 goto request_invalid; 4277 4278 if (sev->snp_certs_enabled) { 4279 vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS; 4280 vcpu->run->snp_req_certs.gpa = data_gpa; 4281 vcpu->run->snp_req_certs.npages = data_npages; 4282 vcpu->run->snp_req_certs.ret = 0; 4283 vcpu->arch.complete_userspace_io = snp_complete_req_certs; 4284 return 0; 4285 } 4286 4287 /* 4288 * As per GHCB spec (see "SNP Extended Guest Request"), the 4289 * certificate table is terminated by 24-bytes of zeroes. 
4290 */ 4291 if (data_npages && kvm_clear_guest(kvm, data_gpa, 24)) 4292 return -EIO; 4293 } 4294 4295 return snp_handle_guest_req(svm, req_gpa, resp_gpa); 4296 4297 request_invalid: 4298 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4299 return 1; /* resume guest */ 4300 } 4301 4302 static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) 4303 { 4304 struct vmcb_control_area *control = &svm->vmcb->control; 4305 struct kvm_vcpu *vcpu = &svm->vcpu; 4306 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 4307 u64 ghcb_info; 4308 int ret = 1; 4309 4310 ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK; 4311 4312 trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id, 4313 control->ghcb_gpa); 4314 4315 switch (ghcb_info) { 4316 case GHCB_MSR_SEV_INFO_REQ: 4317 set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, 4318 GHCB_VERSION_MIN, 4319 sev_enc_bit)); 4320 break; 4321 case GHCB_MSR_CPUID_REQ: { 4322 u64 cpuid_fn, cpuid_reg, cpuid_value; 4323 4324 cpuid_fn = get_ghcb_msr_bits(svm, 4325 GHCB_MSR_CPUID_FUNC_MASK, 4326 GHCB_MSR_CPUID_FUNC_POS); 4327 4328 /* Initialize the registers needed by the CPUID intercept */ 4329 vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn; 4330 vcpu->arch.regs[VCPU_REGS_RCX] = 0; 4331 4332 ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID); 4333 if (!ret) { 4334 /* Error, keep GHCB MSR value as-is */ 4335 break; 4336 } 4337 4338 cpuid_reg = get_ghcb_msr_bits(svm, 4339 GHCB_MSR_CPUID_REG_MASK, 4340 GHCB_MSR_CPUID_REG_POS); 4341 if (cpuid_reg == 0) 4342 cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX]; 4343 else if (cpuid_reg == 1) 4344 cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX]; 4345 else if (cpuid_reg == 2) 4346 cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX]; 4347 else 4348 cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX]; 4349 4350 set_ghcb_msr_bits(svm, cpuid_value, 4351 GHCB_MSR_CPUID_VALUE_MASK, 4352 GHCB_MSR_CPUID_VALUE_POS); 4353 4354 set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP, 4355 GHCB_MSR_INFO_MASK, 4356 
GHCB_MSR_INFO_POS); 4357 break; 4358 } 4359 case GHCB_MSR_AP_RESET_HOLD_REQ: 4360 svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO; 4361 ret = kvm_emulate_ap_reset_hold(&svm->vcpu); 4362 4363 /* 4364 * Preset the result to a non-SIPI return and then only set 4365 * the result to non-zero when delivering a SIPI. 4366 */ 4367 set_ghcb_msr_bits(svm, 0, 4368 GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, 4369 GHCB_MSR_AP_RESET_HOLD_RESULT_POS); 4370 4371 set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, 4372 GHCB_MSR_INFO_MASK, 4373 GHCB_MSR_INFO_POS); 4374 break; 4375 case GHCB_MSR_HV_FT_REQ: 4376 set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED, 4377 GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS); 4378 set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, 4379 GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); 4380 break; 4381 case GHCB_MSR_PREF_GPA_REQ: 4382 if (!is_sev_snp_guest(vcpu)) 4383 goto out_terminate; 4384 4385 set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, 4386 GHCB_MSR_GPA_VALUE_POS); 4387 set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, 4388 GHCB_MSR_INFO_POS); 4389 break; 4390 case GHCB_MSR_REG_GPA_REQ: { 4391 u64 gfn; 4392 4393 if (!is_sev_snp_guest(vcpu)) 4394 goto out_terminate; 4395 4396 gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, 4397 GHCB_MSR_GPA_VALUE_POS); 4398 4399 svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); 4400 4401 set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, 4402 GHCB_MSR_GPA_VALUE_POS); 4403 set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, 4404 GHCB_MSR_INFO_POS); 4405 break; 4406 } 4407 case GHCB_MSR_PSC_REQ: 4408 if (!is_sev_snp_guest(vcpu)) 4409 goto out_terminate; 4410 4411 ret = snp_begin_psc_msr(svm, control->ghcb_gpa); 4412 break; 4413 case GHCB_MSR_TERM_REQ: { 4414 u64 reason_set, reason_code; 4415 4416 reason_set = get_ghcb_msr_bits(svm, 4417 GHCB_MSR_TERM_REASON_SET_MASK, 4418 GHCB_MSR_TERM_REASON_SET_POS); 4419 reason_code = get_ghcb_msr_bits(svm, 4420 
GHCB_MSR_TERM_REASON_MASK, 4421 GHCB_MSR_TERM_REASON_POS); 4422 pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", 4423 reason_set, reason_code); 4424 4425 goto out_terminate; 4426 } 4427 default: 4428 /* Error, keep GHCB MSR value as-is */ 4429 break; 4430 } 4431 4432 trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id, 4433 control->ghcb_gpa, ret); 4434 4435 return ret; 4436 4437 out_terminate: 4438 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 4439 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; 4440 vcpu->run->system_event.ndata = 1; 4441 vcpu->run->system_event.data[0] = control->ghcb_gpa; 4442 4443 return 0; 4444 } 4445 4446 int sev_handle_vmgexit(struct kvm_vcpu *vcpu) 4447 { 4448 struct vcpu_svm *svm = to_svm(vcpu); 4449 struct vmcb_control_area *control = &svm->vmcb->control; 4450 u64 ghcb_gpa; 4451 int ret; 4452 4453 /* Validate the GHCB */ 4454 ghcb_gpa = control->ghcb_gpa; 4455 if (ghcb_gpa & GHCB_MSR_INFO_MASK) 4456 return sev_handle_vmgexit_msr_protocol(svm); 4457 4458 if (!ghcb_gpa) { 4459 vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n"); 4460 4461 /* Without a GHCB, just return right back to the guest */ 4462 return 1; 4463 } 4464 4465 if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) { 4466 /* Unable to map GHCB from guest */ 4467 vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", 4468 ghcb_gpa); 4469 4470 /* Without a GHCB, just return right back to the guest */ 4471 return 1; 4472 } 4473 4474 svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; 4475 4476 trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); 4477 4478 sev_es_sync_from_ghcb(svm); 4479 4480 /* SEV-SNP guest requires that the GHCB GPA must be registered */ 4481 if (is_sev_snp_guest(vcpu) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { 4482 vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); 4483 return -EINVAL; 4484 } 4485 4486 ret = sev_es_validate_vmgexit(svm); 4487 if (ret) 4488 return 
ret; 4489 4490 svm_vmgexit_success(svm, 0); 4491 4492 switch (control->exit_code) { 4493 case SVM_VMGEXIT_MMIO_READ: 4494 case SVM_VMGEXIT_MMIO_WRITE: { 4495 bool is_write = control->exit_code == SVM_VMGEXIT_MMIO_WRITE; 4496 4497 ret = setup_vmgexit_scratch(svm, !is_write, control->exit_info_2); 4498 if (ret) 4499 break; 4500 4501 ret = kvm_sev_es_mmio(vcpu, is_write, control->exit_info_1, 4502 control->exit_info_2, svm->sev_es.ghcb_sa); 4503 break; 4504 } 4505 case SVM_VMGEXIT_NMI_COMPLETE: 4506 ++vcpu->stat.nmi_window_exits; 4507 svm->nmi_masked = false; 4508 kvm_make_request(KVM_REQ_EVENT, vcpu); 4509 ret = 1; 4510 break; 4511 case SVM_VMGEXIT_AP_HLT_LOOP: 4512 svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT; 4513 ret = kvm_emulate_ap_reset_hold(vcpu); 4514 break; 4515 case SVM_VMGEXIT_AP_JUMP_TABLE: { 4516 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 4517 4518 switch (control->exit_info_1) { 4519 case 0: 4520 /* Set AP jump table address */ 4521 sev->ap_jump_table = control->exit_info_2; 4522 break; 4523 case 1: 4524 /* Get AP jump table address */ 4525 svm_vmgexit_success(svm, sev->ap_jump_table); 4526 break; 4527 default: 4528 pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", 4529 control->exit_info_1); 4530 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4531 } 4532 4533 ret = 1; 4534 break; 4535 } 4536 case SVM_VMGEXIT_HV_FEATURES: 4537 svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED); 4538 ret = 1; 4539 break; 4540 case SVM_VMGEXIT_TERM_REQUEST: 4541 pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n", 4542 control->exit_info_1, control->exit_info_2); 4543 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 4544 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; 4545 vcpu->run->system_event.ndata = 1; 4546 vcpu->run->system_event.data[0] = control->ghcb_gpa; 4547 break; 4548 case SVM_VMGEXIT_PSC: 4549 ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); 4550 if (ret) 4551 
break; 4552 4553 ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa); 4554 break; 4555 case SVM_VMGEXIT_AP_CREATION: 4556 ret = sev_snp_ap_creation(svm); 4557 if (ret) { 4558 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4559 } 4560 4561 ret = 1; 4562 break; 4563 case SVM_VMGEXIT_GUEST_REQUEST: 4564 ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); 4565 break; 4566 case SVM_VMGEXIT_EXT_GUEST_REQUEST: 4567 ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2); 4568 break; 4569 case SVM_VMGEXIT_UNSUPPORTED_EVENT: 4570 vcpu_unimpl(vcpu, 4571 "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", 4572 control->exit_info_1, control->exit_info_2); 4573 ret = -EINVAL; 4574 break; 4575 default: 4576 ret = svm_invoke_exit_handler(vcpu, control->exit_code); 4577 } 4578 4579 return ret; 4580 } 4581 4582 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) 4583 { 4584 int count; 4585 int bytes; 4586 int r; 4587 4588 if (svm->vmcb->control.exit_info_2 > INT_MAX) 4589 return -EINVAL; 4590 4591 count = svm->vmcb->control.exit_info_2; 4592 if (unlikely(check_mul_overflow(count, size, &bytes))) 4593 return -EINVAL; 4594 4595 r = setup_vmgexit_scratch(svm, in, bytes); 4596 if (r) 4597 return r; 4598 4599 return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, 4600 count, in); 4601 } 4602 4603 void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4604 { 4605 /* Clear intercepts on MSRs that are context switched by hardware. 
*/ 4606 svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW); 4607 svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW); 4608 svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW); 4609 4610 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) 4611 svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW, 4612 !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && 4613 !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)); 4614 4615 svm_set_intercept_for_msr(vcpu, MSR_AMD64_GUEST_TSC_FREQ, MSR_TYPE_R, 4616 !snp_is_secure_tsc_enabled(vcpu->kvm)); 4617 4618 /* 4619 * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if 4620 * the host/guest supports its use. 4621 * 4622 * KVM treats the guest as being capable of using XSAVES even if XSAVES 4623 * isn't enabled in guest CPUID as there is no intercept for XSAVES, 4624 * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is 4625 * exposed to the guest and XSAVES is supported in hardware. Condition 4626 * full XSS passthrough on the guest being able to use XSAVES *and* 4627 * XSAVES being exposed to the guest so that KVM can at least honor 4628 * guest CPUID for RDMSR and WRMSR. 4629 */ 4630 svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW, 4631 !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) || 4632 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)); 4633 } 4634 4635 void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) 4636 { 4637 struct kvm_vcpu *vcpu = &svm->vcpu; 4638 struct kvm_cpuid_entry2 *best; 4639 4640 /* For sev guests, the memory encryption bit is not reserved in CR3. 
*/ 4641 best = kvm_find_cpuid_entry(vcpu, 0x8000001F); 4642 if (best) 4643 vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); 4644 } 4645 4646 static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event) 4647 { 4648 struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); 4649 struct vmcb *vmcb = svm->vmcb01.ptr; 4650 4651 svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV_ES; 4652 4653 /* 4654 * An SEV-ES guest requires a VMSA area that is a separate from the 4655 * VMCB page. Do not include the encryption mask on the VMSA physical 4656 * address since hardware will access it using the guest key. Note, 4657 * the VMSA will be NULL if this vCPU is the destination for intrahost 4658 * migration, and will be copied later. 4659 */ 4660 if (!svm->sev_es.snp_has_guest_vmsa) { 4661 if (svm->sev_es.vmsa) 4662 svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); 4663 else 4664 svm->vmcb->control.vmsa_pa = INVALID_PAGE; 4665 } 4666 4667 if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) 4668 svm->vmcb->control.allowed_sev_features = sev->vmsa_features | 4669 VMCB_ALLOWED_SEV_FEATURES_VALID; 4670 4671 /* Can't intercept CR register access, HV can't modify CR registers */ 4672 svm_clr_intercept(svm, INTERCEPT_CR0_READ); 4673 svm_clr_intercept(svm, INTERCEPT_CR4_READ); 4674 svm_clr_intercept(svm, INTERCEPT_CR8_READ); 4675 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); 4676 svm_clr_intercept(svm, INTERCEPT_CR4_WRITE); 4677 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 4678 4679 svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0); 4680 4681 /* Track EFER/CR register changes */ 4682 svm_set_intercept(svm, TRAP_EFER_WRITE); 4683 svm_set_intercept(svm, TRAP_CR0_WRITE); 4684 svm_set_intercept(svm, TRAP_CR4_WRITE); 4685 svm_set_intercept(svm, TRAP_CR8_WRITE); 4686 4687 vmcb->control.intercepts[INTERCEPT_DR] = 0; 4688 if (!sev_vcpu_has_debug_swap(svm)) { 4689 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); 4690 vmcb_set_intercept(&vmcb->control, 
INTERCEPT_DR7_WRITE); 4691 svm_mark_intercepts_dirty(svm); 4692 } else { 4693 /* 4694 * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't 4695 * allow debugging SEV-ES guests, and enables DebugSwap iff 4696 * NO_NESTED_DATA_BP is supported, so there's no reason to 4697 * intercept #DB when DebugSwap is enabled. For simplicity 4698 * with respect to guest debug, intercept #DB for other VMs 4699 * even if NO_NESTED_DATA_BP is supported, i.e. even if the 4700 * guest can't DoS the CPU with infinite #DB vectoring. 4701 */ 4702 clr_exception_intercept(svm, DB_VECTOR); 4703 } 4704 4705 /* Can't intercept XSETBV, HV can't modify XCR0 directly */ 4706 svm_clr_intercept(svm, INTERCEPT_XSETBV); 4707 4708 /* 4709 * Set the GHCB MSR value as per the GHCB specification when emulating 4710 * vCPU RESET for an SEV-ES guest. 4711 */ 4712 if (!init_event) 4713 set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, 4714 GHCB_VERSION_MIN, 4715 sev_enc_bit)); 4716 } 4717 4718 void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) 4719 { 4720 struct kvm_vcpu *vcpu = &svm->vcpu; 4721 4722 svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV; 4723 clr_exception_intercept(svm, UD_VECTOR); 4724 4725 /* 4726 * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as 4727 * KVM can't decrypt guest memory to decode the faulting instruction. 4728 */ 4729 clr_exception_intercept(svm, GP_VECTOR); 4730 4731 if (init_event && is_sev_snp_guest(vcpu)) 4732 sev_snp_init_protected_guest_state(vcpu); 4733 4734 if (is_sev_es_guest(vcpu)) 4735 sev_es_init_vmcb(svm, init_event); 4736 } 4737 4738 int sev_vcpu_create(struct kvm_vcpu *vcpu) 4739 { 4740 struct vcpu_svm *svm = to_svm(vcpu); 4741 struct page *vmsa_page; 4742 4743 mutex_init(&svm->sev_es.snp_vmsa_mutex); 4744 4745 if (!is_sev_es_guest(vcpu)) 4746 return 0; 4747 4748 /* 4749 * SEV-ES guests require a separate (from the VMCB) VMSA page used to 4750 * contain the encrypted register state of the guest. 
4751 */ 4752 vmsa_page = snp_safe_alloc_page(); 4753 if (!vmsa_page) 4754 return -ENOMEM; 4755 4756 svm->sev_es.vmsa = page_address(vmsa_page); 4757 4758 vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm); 4759 4760 return 0; 4761 } 4762 4763 void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) 4764 { 4765 /* 4766 * All host state for SEV-ES guests is categorized into three swap types 4767 * based on how it is handled by hardware during a world switch: 4768 * 4769 * A: VMRUN: Host state saved in host save area 4770 * VMEXIT: Host state loaded from host save area 4771 * 4772 * B: VMRUN: Host state _NOT_ saved in host save area 4773 * VMEXIT: Host state loaded from host save area 4774 * 4775 * C: VMRUN: Host state _NOT_ saved in host save area 4776 * VMEXIT: Host state initialized to default(reset) values 4777 * 4778 * Manually save type-B state, i.e. state that is loaded by VMEXIT but 4779 * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed 4780 * by common SVM code). 4781 */ 4782 hostsa->xcr0 = kvm_host.xcr0; 4783 hostsa->pkru = read_pkru(); 4784 hostsa->xss = kvm_host.xss; 4785 4786 /* 4787 * If DebugSwap is enabled, debug registers are loaded but NOT saved by 4788 * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does 4789 * not save or load debug registers. Sadly, KVM can't prevent SNP 4790 * guests from lying about DebugSwap on secondary vCPUs, i.e. the 4791 * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what 4792 * the guest has actually enabled (or not!) in the VMSA. 4793 * 4794 * If DebugSwap is *possible*, save the masks so that they're restored 4795 * if the guest enables DebugSwap. But for the DRs themselves, do NOT 4796 * rely on the CPU to restore the host values; KVM will restore them as 4797 * needed in common code, via hw_breakpoint_restore(). Note, KVM does 4798 * NOT support virtualizing Breakpoint Extensions, i.e. 
the mask MSRs 4799 * don't need to be restored per se, KVM just needs to ensure they are 4800 * loaded with the correct values *if* the CPU writes the MSRs. 4801 */ 4802 if (sev_vcpu_has_debug_swap(svm) || 4803 (cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) && 4804 is_sev_snp_guest(&svm->vcpu))) { 4805 hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); 4806 hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); 4807 hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); 4808 hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3); 4809 } 4810 4811 /* 4812 * TSC_AUX is always virtualized for SEV-ES guests when the feature is 4813 * available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area. 4814 * Set the save area to the current hardware value, i.e. the current 4815 * user return value, so that the correct value is restored on #VMEXIT. 4816 */ 4817 if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) && 4818 !WARN_ON_ONCE(tsc_aux_uret_slot < 0)) 4819 hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot); 4820 } 4821 4822 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 4823 { 4824 struct vcpu_svm *svm = to_svm(vcpu); 4825 4826 /* First SIPI: Use the values as initially set by the VMM */ 4827 if (!svm->sev_es.received_first_sipi) { 4828 svm->sev_es.received_first_sipi = true; 4829 return; 4830 } 4831 4832 /* Subsequent SIPI */ 4833 switch (svm->sev_es.ap_reset_hold_type) { 4834 case AP_RESET_HOLD_NAE_EVENT: 4835 /* 4836 * Return from an AP Reset Hold VMGEXIT, where the guest will 4837 * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. 4838 */ 4839 svm_vmgexit_success(svm, 1); 4840 break; 4841 case AP_RESET_HOLD_MSR_PROTO: 4842 /* 4843 * Return from an AP Reset Hold VMGEXIT, where the guest will 4844 * set the CS and RIP. Set GHCB data field to a non-zero value. 
4845 */ 4846 set_ghcb_msr_bits(svm, 1, 4847 GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, 4848 GHCB_MSR_AP_RESET_HOLD_RESULT_POS); 4849 4850 set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, 4851 GHCB_MSR_INFO_MASK, 4852 GHCB_MSR_INFO_POS); 4853 break; 4854 default: 4855 break; 4856 } 4857 } 4858 4859 struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) 4860 { 4861 unsigned long pfn; 4862 struct page *p; 4863 4864 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 4865 return alloc_pages_node(node, gfp | __GFP_ZERO, 0); 4866 4867 /* 4868 * Allocate an SNP-safe page to workaround the SNP erratum where 4869 * the CPU will incorrectly signal an RMP violation #PF if a 4870 * hugepage (2MB or 1GB) collides with the RMP entry of a 4871 * 2MB-aligned VMCB, VMSA, or AVIC backing page. 4872 * 4873 * Allocate one extra page, choose a page which is not 4874 * 2MB-aligned, and free the other. 4875 */ 4876 p = alloc_pages_node(node, gfp | __GFP_ZERO, 1); 4877 if (!p) 4878 return NULL; 4879 4880 split_page(p, 1); 4881 4882 pfn = page_to_pfn(p); 4883 if (IS_ALIGNED(pfn, PTRS_PER_PMD)) 4884 __free_page(p++); 4885 else 4886 __free_page(p + 1); 4887 4888 return p; 4889 } 4890 4891 void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) 4892 { 4893 struct kvm_memory_slot *slot; 4894 struct kvm *kvm = vcpu->kvm; 4895 int order, rmp_level, ret; 4896 struct page *page; 4897 bool assigned; 4898 kvm_pfn_t pfn; 4899 gfn_t gfn; 4900 4901 gfn = gpa >> PAGE_SHIFT; 4902 4903 /* 4904 * The only time RMP faults occur for shared pages is when the guest is 4905 * triggering an RMP fault for an implicit page-state change from 4906 * shared->private. Implicit page-state changes are forwarded to 4907 * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults 4908 * for shared pages should not end up here. 
4909 */ 4910 if (!kvm_mem_is_private(kvm, gfn)) { 4911 pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n", 4912 gpa); 4913 return; 4914 } 4915 4916 slot = gfn_to_memslot(kvm, gfn); 4917 if (!kvm_slot_has_gmem(slot)) { 4918 pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", 4919 gpa); 4920 return; 4921 } 4922 4923 ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); 4924 if (ret) { 4925 pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", 4926 gpa); 4927 return; 4928 } 4929 4930 ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); 4931 if (ret || !assigned) { 4932 pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n", 4933 gpa, pfn, ret); 4934 goto out_no_trace; 4935 } 4936 4937 /* 4938 * There are 2 cases where a PSMASH may be needed to resolve an #NPF 4939 * with PFERR_GUEST_RMP_BIT set: 4940 * 4941 * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM 4942 * bit set if the guest issues them with a smaller granularity than 4943 * what is indicated by the page-size bit in the 2MB RMP entry for 4944 * the PFN that backs the GPA. 4945 * 4946 * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is 4947 * smaller than what is indicated by the 2MB RMP entry for the PFN 4948 * that backs the GPA. 4949 * 4950 * In both these cases, the corresponding 2M RMP entry needs to 4951 * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already 4952 * split into 4K RMP entries, then this is likely a spurious case which 4953 * can occur when there are concurrent accesses by the guest to a 2MB 4954 * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in 4955 * the process of being PMASH'd into 4K entries. These cases should 4956 * resolve automatically on subsequent accesses, so just ignore them 4957 * here. 
4958 */ 4959 if (rmp_level == PG_LEVEL_4K) 4960 goto out; 4961 4962 ret = snp_rmptable_psmash(pfn); 4963 if (ret) { 4964 /* 4965 * Look it up again. If it's 4K now then the PSMASH may have 4966 * raced with another process and the issue has already resolved 4967 * itself. 4968 */ 4969 if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) && 4970 assigned && rmp_level == PG_LEVEL_4K) 4971 goto out; 4972 4973 pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n", 4974 gpa, pfn, ret); 4975 } 4976 4977 kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); 4978 out: 4979 trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); 4980 out_no_trace: 4981 kvm_release_page_unused(page); 4982 } 4983 4984 static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) 4985 { 4986 kvm_pfn_t pfn = start; 4987 4988 while (pfn < end) { 4989 int ret, rmp_level; 4990 bool assigned; 4991 4992 ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); 4993 if (ret) { 4994 pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n", 4995 pfn, start, end, rmp_level, ret); 4996 return false; 4997 } 4998 4999 if (assigned) { 5000 pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n", 5001 __func__, pfn, start, end, rmp_level); 5002 return false; 5003 } 5004 5005 pfn++; 5006 } 5007 5008 return true; 5009 } 5010 5011 static u8 max_level_for_order(int order) 5012 { 5013 if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) 5014 return PG_LEVEL_2M; 5015 5016 return PG_LEVEL_4K; 5017 } 5018 5019 static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) 5020 { 5021 kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); 5022 5023 /* 5024 * If this is a large folio, and the entire 2M range containing the 5025 * PFN is currently shared, then the entire 2M-aligned range can be 5026 * set to private via a single 2M RMP entry. 
5027 */ 5028 if (max_level_for_order(order) > PG_LEVEL_4K && 5029 is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) 5030 return true; 5031 5032 return false; 5033 } 5034 5035 int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) 5036 { 5037 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 5038 kvm_pfn_t pfn_aligned; 5039 gfn_t gfn_aligned; 5040 int level, rc; 5041 bool assigned; 5042 5043 if (!sev_snp_guest(kvm)) 5044 return 0; 5045 5046 rc = snp_lookup_rmpentry(pfn, &assigned, &level); 5047 if (rc) { 5048 pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n", 5049 gfn, pfn, rc); 5050 return -ENOENT; 5051 } 5052 5053 if (assigned) { 5054 pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n", 5055 __func__, gfn, pfn, max_order, level); 5056 return 0; 5057 } 5058 5059 if (is_large_rmp_possible(kvm, pfn, max_order)) { 5060 level = PG_LEVEL_2M; 5061 pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); 5062 gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD); 5063 } else { 5064 level = PG_LEVEL_4K; 5065 pfn_aligned = pfn; 5066 gfn_aligned = gfn; 5067 } 5068 5069 rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false); 5070 if (rc) { 5071 pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n", 5072 gfn, pfn, level, rc); 5073 return -EINVAL; 5074 } 5075 5076 pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n", 5077 __func__, gfn, pfn, pfn_aligned, max_order, level); 5078 5079 return 0; 5080 } 5081 5082 void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) 5083 { 5084 kvm_pfn_t pfn; 5085 5086 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 5087 return; 5088 5089 pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); 5090 5091 for (pfn = start; pfn < end;) { 5092 bool use_2m_update = false; 5093 int rc, rmp_level; 5094 bool assigned; 5095 5096 rc = snp_lookup_rmpentry(pfn, &assigned, 
&rmp_level); 5097 if (rc || !assigned) 5098 goto next_pfn; 5099 5100 use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && 5101 end >= (pfn + PTRS_PER_PMD) && 5102 rmp_level > PG_LEVEL_4K; 5103 5104 /* 5105 * If an unaligned PFN corresponds to a 2M region assigned as a 5106 * large page in the RMP table, PSMASH the region into individual 5107 * 4K RMP entries before attempting to convert a 4K sub-page. 5108 */ 5109 if (!use_2m_update && rmp_level > PG_LEVEL_4K) { 5110 /* 5111 * This shouldn't fail, but if it does, report it, but 5112 * still try to update RMP entry to shared and pray this 5113 * was a spurious error that can be addressed later. 5114 */ 5115 rc = snp_rmptable_psmash(pfn); 5116 WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n", 5117 pfn, rc); 5118 } 5119 5120 rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K); 5121 if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n", 5122 pfn, rc)) 5123 goto next_pfn; 5124 5125 /* 5126 * SEV-ES avoids host/guest cache coherency issues through 5127 * WBNOINVD hooks issued via MMU notifiers during run-time, and 5128 * KVM's VM destroy path at shutdown. Those MMU notifier events 5129 * don't cover gmem since there is no requirement to map pages 5130 * to a HVA in order to use them for a running guest. While the 5131 * shutdown path would still likely cover things for SNP guests, 5132 * userspace may also free gmem pages during run-time via 5133 * hole-punching operations on the guest_memfd, so flush the 5134 * cache entries for these pages before free'ing them back to 5135 * the host. 5136 */ 5137 clflush_cache_range(__va(pfn_to_hpa(pfn)), 5138 use_2m_update ? PMD_SIZE : PAGE_SIZE); 5139 next_pfn: 5140 pfn += use_2m_update ? 
PTRS_PER_PMD : 1; 5141 cond_resched(); 5142 } 5143 } 5144 5145 int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) 5146 { 5147 int level, rc; 5148 bool assigned; 5149 5150 if (!sev_snp_guest(kvm)) 5151 return 0; 5152 5153 rc = snp_lookup_rmpentry(pfn, &assigned, &level); 5154 if (rc || !assigned) 5155 return PG_LEVEL_4K; 5156 5157 return level; 5158 } 5159 5160 struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) 5161 { 5162 struct vcpu_svm *svm = to_svm(vcpu); 5163 struct vmcb_save_area *vmsa; 5164 struct kvm_sev_info *sev; 5165 int error = 0; 5166 int ret; 5167 5168 if (!is_sev_es_guest(vcpu)) 5169 return NULL; 5170 5171 /* 5172 * If the VMSA has not yet been encrypted, return a pointer to the 5173 * current un-encrypted VMSA. 5174 */ 5175 if (!vcpu->arch.guest_state_protected) 5176 return (struct vmcb_save_area *)svm->sev_es.vmsa; 5177 5178 sev = to_kvm_sev_info(vcpu->kvm); 5179 5180 /* Check if the SEV policy allows debugging */ 5181 if (is_sev_snp_guest(vcpu)) { 5182 if (!(sev->policy & SNP_POLICY_MASK_DEBUG)) 5183 return NULL; 5184 } else { 5185 if (sev->policy & SEV_POLICY_MASK_NODBG) 5186 return NULL; 5187 } 5188 5189 if (is_sev_snp_guest(vcpu)) { 5190 struct sev_data_snp_dbg dbg = {0}; 5191 5192 vmsa = snp_alloc_firmware_page(__GFP_ZERO); 5193 if (!vmsa) 5194 return NULL; 5195 5196 dbg.gctx_paddr = __psp_pa(sev->snp_context); 5197 dbg.src_addr = svm->vmcb->control.vmsa_pa; 5198 dbg.dst_addr = __psp_pa(vmsa); 5199 5200 ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); 5201 5202 /* 5203 * Return the target page to a hypervisor page no matter what. 5204 * If this fails, the page can't be used, so leak it and don't 5205 * try to use it. 
5206 */ 5207 if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) 5208 return NULL; 5209 5210 if (ret) { 5211 pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", 5212 ret, error, error); 5213 free_page((unsigned long)vmsa); 5214 5215 return NULL; 5216 } 5217 } else { 5218 struct sev_data_dbg dbg = {0}; 5219 struct page *vmsa_page; 5220 5221 vmsa_page = alloc_page(GFP_KERNEL); 5222 if (!vmsa_page) 5223 return NULL; 5224 5225 vmsa = page_address(vmsa_page); 5226 5227 dbg.handle = sev->handle; 5228 dbg.src_addr = svm->vmcb->control.vmsa_pa; 5229 dbg.dst_addr = __psp_pa(vmsa); 5230 dbg.len = PAGE_SIZE; 5231 5232 ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); 5233 if (ret) { 5234 pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", 5235 ret, error, error); 5236 __free_page(vmsa_page); 5237 5238 return NULL; 5239 } 5240 } 5241 5242 return vmsa; 5243 } 5244 5245 void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) 5246 { 5247 /* If the VMSA has not yet been encrypted, nothing was allocated */ 5248 if (!vcpu->arch.guest_state_protected || !vmsa) 5249 return; 5250 5251 free_page((unsigned long)vmsa); 5252 } 5253