// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM-SEV support
 *
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/psp.h>
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
#include <uapi/linux/sev-guest.h>

#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/cpuid/api.h>
#include <asm/fpu/xcr.h>
#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/sev.h>

#include "mmu.h"
#include "x86.h"
#include "svm.h"
#include "svm_ops.h"
#include "cpuid.h"
#include "trace.h"

/*
 * Bounds on the GHCB protocol version.  In this file, __sev_guest_init()
 * rejects a requested version above GHCB_VERSION_MAX, and sev_guest_init()
 * pins the deprecated KVM_SEV_ES_INIT path to GHCB_VERSION_MIN.
 */
#define GHCB_VERSION_MAX	2ULL
#define GHCB_VERSION_MIN	1ULL

/*
 * NOTE(review): presumably the GHCB hypervisor-feature bits KVM reports to
 * guests; the consumer is not visible in this part of the file - confirm.
 */
#define GHCB_HV_FT_SUPPORTED	(GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)

/*
 * The GHCB spec essentially states that all non-zero error codes other than
 * those explicitly defined above should be treated as an error by the guest.
 * Define a generic error to cover that case, and choose a value that is not
 * likely to overlap with new explicit error codes should more be added to
 * the GHCB spec later. KVM will use this to report generic errors when
 * handling SNP guest requests.
 */
#define SNP_GUEST_VMM_ERR_GENERIC	(~0U)

/* enable/disable SEV support */
static bool __ro_after_init sev_enabled = true;
module_param_named(sev, sev_enabled, bool, 0444);

/* enable/disable SEV-ES support */
static bool __ro_after_init sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);

/* enable/disable SEV-SNP support */
static bool __ro_after_init sev_snp_enabled = true;
module_param_named(sev_snp, sev_snp_enabled, bool, 0444);

/*
 * Read-only (0444) "ciphertext_hiding_asids" module parameter.
 * NOTE(review): presumably the number of ASIDs set aside for ciphertext
 * hiding; the code that consumes it is outside this view - confirm.
 */
static unsigned int __ro_after_init nr_ciphertext_hiding_asids;
module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444);

/*
 * NOTE(review): presumably records how a vCPU entered AP reset hold (not at
 * all, via a NAE event, or via the MSR protocol); the users of these values
 * are outside this view - confirm.
 */
#define AP_RESET_HOLD_NONE		0
#define AP_RESET_HOLD_NAE_EVENT		1
#define AP_RESET_HOLD_MSR_PROTO		2

/*
 * SEV-SNP policy bits that can be supported by KVM. These include policy bits
 * that have implementation support within KVM or policy bits that do not
 * require implementation support within KVM to enforce the policy.
 */
#define KVM_SNP_POLICY_MASK_VALID	(SNP_POLICY_MASK_API_MINOR		| \
					 SNP_POLICY_MASK_API_MAJOR		| \
					 SNP_POLICY_MASK_SMT			| \
					 SNP_POLICY_MASK_RSVD_MBO		| \
					 SNP_POLICY_MASK_DEBUG			| \
					 SNP_POLICY_MASK_SINGLE_SOCKET		| \
					 SNP_POLICY_MASK_CXL_ALLOW		| \
					 SNP_POLICY_MASK_MEM_AES_256_XTS	| \
					 SNP_POLICY_MASK_RAPL_DIS		| \
					 SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM	| \
					 SNP_POLICY_MASK_PAGE_SWAP_DISABLE)

static u64 snp_supported_policy_bits __ro_after_init;

/* Upper bound on the VMSA features __sev_guest_init() accepts for ES/SNP VMs. */
static u64 sev_supported_vmsa_features __ro_after_init;

#define INITIAL_VMSA_GPA 0xFFFFFFFFF000

static u8 sev_enc_bit;
/*
 * Taken for read around DEACTIVATE (sev_unbind_asid()) and for write around
 * WBINVD + DF_FLUSH (sev_flush_asids()).
 */
static DECLARE_RWSEM(sev_deactivate_lock);
/* Serializes updates to sev_asid_bitmap and sev_reclaim_asid_bitmap. */
static DEFINE_MUTEX(sev_bitmap_lock);
/* Inclusive [min, max] ASID ranges; sev_asid_new() picks one by VM type. */
unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned int max_sev_es_asid;
static unsigned int min_sev_es_asid;
static unsigned int max_snp_asid;
static unsigned int min_snp_asid;
static unsigned long sev_me_mask;
/* Size, in bits, of the two ASID bitmaps below. */
static unsigned int nr_asids;
/* ASIDs currently handed out (set by sev_alloc_asid()). */
static unsigned long *sev_asid_bitmap;
/* ASIDs released by sev_asid_free() that still need a flush before reuse. */
static unsigned long *sev_reclaim_asid_bitmap;

/*
 * Lockdep-only sanity check for the sev_*guest() queries below: asserts that
 * kvm->lock is held unless one of the two exemptions documented in the body
 * applies.  Compiles to nothing without CONFIG_PROVE_LOCKING.
 */
static __always_inline void kvm_lockdep_assert_sev_lock_held(struct kvm *kvm)
{
#ifdef CONFIG_PROVE_LOCKING
	/*
	 * Querying SEV+ support is safe if there are no other references, i.e.
	 * if concurrent initialization of SEV+ is impossible.
	 */
	if (!refcount_read(&kvm->users_count))
		return;

	/*
	 * Querying SEV+ support from vCPU context is always safe, as vCPUs can
	 * only be created after SEV+ is initialized (and KVM disallows all SEV
	 * sub-ioctls while vCPU creation is in-progress).
	 */
	if (kvm_get_running_vcpu())
		return;

	lockdep_assert_held(&kvm->lock);
#endif
}

/*
 * Lock-checked wrappers: each runs the lockdep assertion above and then
 * returns the result of the corresponding ____sev*_guest() helper (defined
 * outside this file section).
 */
static bool sev_guest(struct kvm *kvm)
{
	kvm_lockdep_assert_sev_lock_held(kvm);
	return ____sev_guest(kvm);
}
static bool sev_es_guest(struct kvm *kvm)
{
	kvm_lockdep_assert_sev_lock_held(kvm);
	return ____sev_es_guest(kvm);
}

static bool sev_snp_guest(struct kvm *kvm)
{
	kvm_lockdep_assert_sev_lock_held(kvm);
	return ____sev_snp_guest(kvm);
}

static int snp_decommission_context(struct kvm *kvm);

/*
 * List node describing one region of user memory.
 * NOTE(review): presumably a registered, pinned encrypted region linked on
 * kvm_sev_info.regions_list; the code using it is outside this view - confirm.
 */
struct enc_region {
	struct list_head list;
	unsigned long npages;
	struct page **pages;
	unsigned long uaddr;
	unsigned long size;
};

/*
 * Flush firmware/cache state so that reclaimable ASIDs can be reused.
 *
 * Returns -EBUSY without flushing when the first reclaimable ASID at or above
 * @min_asid is beyond @max_asid (including when there is none); otherwise
 * returns the result of the DF_FLUSH firmware command (0 on success).
 */
/* Called with the sev_bitmap_lock held, or on shutdown */
static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
{
	int ret, error = 0;
	unsigned int asid;

	/* Check if there are any ASIDs to reclaim before performing a flush */
	asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
	if (asid > max_asid)
		return -EBUSY;

	/*
	 * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
	 * so it must be guarded.
	 */
	down_write(&sev_deactivate_lock);

	/* SNP firmware requires use of WBINVD for ASID recycling. */
	wbinvd_on_all_cpus();

	if (sev_snp_enabled)
		ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
	else
		ret = sev_guest_df_flush(&error);

	up_write(&sev_deactivate_lock);

	if (ret)
		pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
		       sev_snp_enabled ? "-SNP" : "", ret, error);

	return ret;
}

/* True when this VM's SEV info names another VM as its encryption context owner. */
static inline bool is_mirroring_enc_context(struct kvm *kvm)
{
	return !!to_kvm_sev_info(kvm)->enc_context_owner;
}

/* True when the VM's VMSA features include SVM_SEV_FEAT_DEBUG_SWAP. */
static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);

	return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
}

/*
 * True when SVM_SEV_FEAT_SECURE_TSC is set in the VM's VMSA features and the
 * VM is an SNP guest; warns once (and returns false) if the feature is set on
 * a non-SNP VM.
 */
static bool snp_is_secure_tsc_enabled(struct kvm *kvm)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);

	return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) &&
	       !WARN_ON_ONCE(!sev_snp_guest(kvm));
}

/*
 * Try to make reclaimable ASIDs allocatable again.  Returns false when the
 * flush did not happen or failed, true once the bitmaps have been updated.
 */
/* Must be called with the sev_bitmap_lock held */
static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
{
	if (sev_flush_asids(min_asid, max_asid))
		return false;

	/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
	bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
		   nr_asids);
	bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);

	return true;
}

/*
 * Charge one unit of the SEV-ES or SEV misc cgroup resource (chosen by
 * sev->es_active) to sev->misc_cg.  Returns misc_cg_try_charge()'s result.
 */
static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
{
	enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
	return misc_cg_try_charge(type, sev->misc_cg, 1);
}

/* Undo sev_misc_cg_try_charge(): release one unit of the same resource type. */
static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
{
	enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
	misc_cg_uncharge(type, sev->misc_cg, 1);
}

/*
 * Allocate a free ASID in the inclusive range [@min_asid, @max_asid].
 *
 * If the range is exhausted, attempt a single recycle of reclaimable ASIDs
 * and search again.  Returns the allocated ASID, or a value greater than
 * @max_asid if none is available.  sev_bitmap_lock is held (via guard) for
 * the whole function.
 */
static unsigned int sev_alloc_asid(unsigned int min_asid, unsigned int max_asid)
{
	unsigned int asid;
	bool retry = true;

	guard(mutex)(&sev_bitmap_lock);

again:
	asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
	if (asid > max_asid) {
		if (retry && __sev_recycle_asids(min_asid, max_asid)) {
			retry = false;
			goto again;
		}

		return asid;
	}

	__set_bit(asid, sev_asid_bitmap);
	return asid;
}

/*
 * Charge the current misc cgroup and assign a new ASID to @sev.
 *
 * Returns 0 on success (sev->asid and sev->misc_cg set), -ENOTTY if the ASID
 * range for @vm_type is empty, the charge error if the cgroup charge fails,
 * or -EBUSY if no ASID is available.  On failure the cgroup reference is
 * dropped and sev->misc_cg is reset to NULL.
 */
static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type)
{
	/*
	 * Each guest type allocates from its own ASID range: SNP VMs use
	 * min_snp_asid..max_snp_asid, SEV-ES guests use
	 * min_sev_es_asid..max_sev_es_asid, and plain SEV guests use
	 * min_sev_asid..max_sev_asid.
	 */
	unsigned int min_asid, max_asid, asid;
	int ret;

	if (vm_type == KVM_X86_SNP_VM) {
		min_asid = min_snp_asid;
		max_asid = max_snp_asid;
	} else if (sev->es_active) {
		min_asid = min_sev_es_asid;
		max_asid = max_sev_es_asid;
	} else {
		min_asid = min_sev_asid;
		max_asid = max_sev_asid;
	}

	/*
	 * The min ASID can end up larger than the max if basic SEV support is
	 * effectively disabled by disallowing use of ASIDs for SEV guests.
	 * Similarly for SEV-ES guests the min ASID can end up larger than the
	 * max when ciphertext hiding is enabled, effectively disabling SEV-ES
	 * support.
	 */
	if (min_asid > max_asid)
		return -ENOTTY;

	WARN_ON_ONCE(sev->misc_cg);
	sev->misc_cg = get_current_misc_cg();
	ret = sev_misc_cg_try_charge(sev);
	if (ret)
		goto e_put_cg;

	asid = sev_alloc_asid(min_asid, max_asid);
	if (asid > max_asid) {
		ret = -EBUSY;
		goto e_uncharge;
	}

	sev->asid = asid;
	return 0;

e_uncharge:
	sev_misc_cg_uncharge(sev);
e_put_cg:
	put_misc_cg(sev->misc_cg);
	sev->misc_cg = NULL;
	return ret;
}

/* Return the ASID recorded in the VM's SEV info. */
static unsigned int sev_get_asid(struct kvm *kvm)
{
	return to_kvm_sev_info(kvm)->asid;
}

/*
 * Release @sev's ASID: mark it reclaimable (it is not reusable until a
 * recycle clears it from sev_asid_bitmap), clear every possible CPU's
 * sev_vmcbs[] slot for the ASID, then uncharge and drop the misc cgroup.
 */
static void sev_asid_free(struct kvm_sev_info *sev)
{
	struct svm_cpu_data *sd;
	int cpu;

	mutex_lock(&sev_bitmap_lock);

	__set_bit(sev->asid, sev_reclaim_asid_bitmap);

	for_each_possible_cpu(cpu) {
		sd = per_cpu_ptr(&svm_data, cpu);
		sd->sev_vmcbs[sev->asid] = NULL;
	}

	mutex_unlock(&sev_bitmap_lock);

	sev_misc_cg_uncharge(sev);
	put_misc_cg(sev->misc_cg);
	sev->misc_cg = NULL;
}

/*
 * Issue DECOMMISSION for firmware handle @handle; a zero handle is a no-op.
 * The firmware error code is not collected (NULL is passed) and the return
 * value is ignored.
 */
static void sev_decommission(unsigned int handle)
{
	struct sev_data_decommission decommission;

	if (!handle)
		return;

	decommission.handle = handle;
	sev_guest_decommission(&decommission, NULL);
}

/*
 * Transition a page to hypervisor-owned/shared state in the RMP table. This
 * should not fail under normal conditions, but leak the page should that
 * happen since it will no longer be usable by the host due to RMP protections.
357 */ 358 static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level) 359 { 360 if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) { 361 snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); 362 return -EIO; 363 } 364 365 return 0; 366 } 367 368 /* 369 * Certain page-states, such as Pre-Guest and Firmware pages (as documented 370 * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be 371 * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE 372 * unless they are reclaimed first. 373 * 374 * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they 375 * might not be usable by the host due to being set as immutable or still 376 * being associated with a guest ASID. 377 * 378 * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be 379 * converted back to shared, as the page is no longer usable due to RMP 380 * protections, and it's infeasible for the guest to continue on. 381 */ 382 static int snp_page_reclaim(struct kvm *kvm, u64 pfn) 383 { 384 struct sev_data_snp_page_reclaim data = {0}; 385 int fw_err, rc; 386 387 data.paddr = __sme_set(pfn << PAGE_SHIFT); 388 rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err); 389 if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) { 390 snp_leak_pages(pfn, 1); 391 return -EIO; 392 } 393 394 if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K)) 395 return -EIO; 396 397 return rc; 398 } 399 400 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) 401 { 402 struct sev_data_deactivate deactivate; 403 404 if (!handle) 405 return; 406 407 deactivate.handle = handle; 408 409 /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */ 410 down_read(&sev_deactivate_lock); 411 sev_guest_deactivate(&deactivate, NULL); 412 up_read(&sev_deactivate_lock); 413 414 sev_decommission(handle); 415 } 416 417 /* 418 * This sets up bounce buffers/firmware pages to handle SNP Guest Request 419 
* messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB 420 * 2.0 specification for more details. 421 * 422 * Technically, when an SNP Guest Request is issued, the guest will provide its 423 * own request/response pages, which could in theory be passed along directly 424 * to firmware rather than using bounce pages. However, these pages would need 425 * special care: 426 * 427 * - Both pages are from shared guest memory, so they need to be protected 428 * from migration/etc. occurring while firmware reads/writes to them. At a 429 * minimum, this requires elevating the ref counts and potentially needing 430 * an explicit pinning of the memory. This places additional restrictions 431 * on what type of memory backends userspace can use for shared guest 432 * memory since there is some reliance on using refcounted pages. 433 * 434 * - The response page needs to be switched to Firmware-owned[1] state 435 * before the firmware can write to it, which can lead to potential 436 * host RMP #PFs if the guest is misbehaved and hands the host a 437 * guest page that KVM might write to for other reasons (e.g. virtio 438 * buffers/etc.). 439 * 440 * Both of these issues can be avoided completely by using separately-allocated 441 * bounce pages for both the request/response pages and passing those to 442 * firmware instead. So that's what is being set up here. 443 * 444 * Guest requests rely on message sequence numbers to ensure requests are 445 * issued to firmware in the order the guest issues them, so concurrent guest 446 * requests generally shouldn't happen. But a misbehaved guest could issue 447 * concurrent guest requests in theory, so a mutex is used to serialize 448 * access to the bounce buffers. 449 * 450 * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more 451 * details on Firmware-owned pages, along with "RMP and VMPL Access Checks" 452 * in the APM for details on the related RMP restrictions. 
453 */ 454 static int snp_guest_req_init(struct kvm *kvm) 455 { 456 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 457 struct page *req_page; 458 459 req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 460 if (!req_page) 461 return -ENOMEM; 462 463 sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 464 if (!sev->guest_resp_buf) { 465 __free_page(req_page); 466 return -EIO; 467 } 468 469 sev->guest_req_buf = page_address(req_page); 470 mutex_init(&sev->guest_req_mutex); 471 472 return 0; 473 } 474 475 static void snp_guest_req_cleanup(struct kvm *kvm) 476 { 477 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 478 479 if (sev->guest_resp_buf) 480 snp_free_firmware_page(sev->guest_resp_buf); 481 482 if (sev->guest_req_buf) 483 __free_page(virt_to_page(sev->guest_req_buf)); 484 485 sev->guest_req_buf = NULL; 486 sev->guest_resp_buf = NULL; 487 } 488 489 static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, 490 struct kvm_sev_init *data, 491 unsigned long vm_type) 492 { 493 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 494 struct sev_platform_init_args init_args = {0}; 495 bool es_active = vm_type != KVM_X86_SEV_VM; 496 bool snp_active = vm_type == KVM_X86_SNP_VM; 497 u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; 498 int ret; 499 500 if (kvm->created_vcpus) 501 return -EINVAL; 502 503 if (data->flags) 504 return -EINVAL; 505 506 if (!snp_active) 507 valid_vmsa_features &= ~SVM_SEV_FEAT_SECURE_TSC; 508 509 if (data->vmsa_features & ~valid_vmsa_features) 510 return -EINVAL; 511 512 if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) 513 return -EINVAL; 514 515 /* 516 * KVM supports the full range of mandatory features defined by version 517 * 2 of the GHCB protocol, so default to that for SEV-ES guests created 518 * via KVM_SEV_INIT2 (KVM_SEV_INIT forces version 1). 
519 */ 520 if (es_active && !data->ghcb_version) 521 data->ghcb_version = 2; 522 523 if (snp_active && data->ghcb_version < 2) 524 return -EINVAL; 525 526 if (unlikely(sev->active)) 527 return -EINVAL; 528 529 sev->active = true; 530 sev->es_active = es_active; 531 sev->vmsa_features = data->vmsa_features; 532 sev->ghcb_version = data->ghcb_version; 533 534 if (snp_active) 535 sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; 536 537 ret = sev_asid_new(sev, vm_type); 538 if (ret) 539 goto e_no_asid; 540 541 init_args.probe = false; 542 ret = sev_platform_init(&init_args); 543 if (ret) 544 goto e_free_asid; 545 546 if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 547 ret = -ENOMEM; 548 goto e_free_asid; 549 } 550 551 /* This needs to happen after SEV/SNP firmware initialization. */ 552 if (snp_active) { 553 ret = snp_guest_req_init(kvm); 554 if (ret) 555 goto e_free; 556 } 557 558 INIT_LIST_HEAD(&sev->regions_list); 559 INIT_LIST_HEAD(&sev->mirror_vms); 560 sev->need_init = false; 561 562 kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV); 563 564 return 0; 565 566 e_free: 567 free_cpumask_var(sev->have_run_cpus); 568 e_free_asid: 569 argp->error = init_args.error; 570 sev_asid_free(sev); 571 sev->asid = 0; 572 e_no_asid: 573 sev->vmsa_features = 0; 574 sev->es_active = false; 575 sev->active = false; 576 return ret; 577 } 578 579 static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) 580 { 581 struct kvm_sev_init data = { 582 .vmsa_features = 0, 583 .ghcb_version = 0, 584 }; 585 unsigned long vm_type; 586 587 if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM) 588 return -EINVAL; 589 590 vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM); 591 592 /* 593 * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will 594 * continue to only ever support the minimal GHCB protocol version. 
595 */ 596 if (vm_type == KVM_X86_SEV_ES_VM) 597 data.ghcb_version = GHCB_VERSION_MIN; 598 599 return __sev_guest_init(kvm, argp, &data, vm_type); 600 } 601 602 static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) 603 { 604 struct kvm_sev_init data; 605 606 if (!to_kvm_sev_info(kvm)->need_init) 607 return -EINVAL; 608 609 if (kvm->arch.vm_type != KVM_X86_SEV_VM && 610 kvm->arch.vm_type != KVM_X86_SEV_ES_VM && 611 kvm->arch.vm_type != KVM_X86_SNP_VM) 612 return -EINVAL; 613 614 if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) 615 return -EFAULT; 616 617 return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type); 618 } 619 620 static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) 621 { 622 unsigned int asid = sev_get_asid(kvm); 623 struct sev_data_activate activate; 624 int ret; 625 626 /* activate ASID on the given handle */ 627 activate.handle = handle; 628 activate.asid = asid; 629 ret = sev_guest_activate(&activate, error); 630 631 return ret; 632 } 633 634 static int __sev_issue_cmd(int fd, int id, void *data, int *error) 635 { 636 CLASS(fd, f)(fd); 637 638 if (fd_empty(f)) 639 return -EBADF; 640 641 return sev_issue_cmd_external_user(fd_file(f), id, data, error); 642 } 643 644 static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) 645 { 646 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 647 648 return __sev_issue_cmd(sev->fd, id, data, error); 649 } 650 651 static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 652 { 653 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 654 struct sev_data_launch_start start; 655 struct kvm_sev_launch_start params; 656 void *dh_blob, *session_blob; 657 int *error = &argp->error; 658 int ret; 659 660 if (!sev_guest(kvm)) 661 return -ENOTTY; 662 663 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 664 return -EFAULT; 665 666 memset(&start, 0, sizeof(start)); 667 668 dh_blob = NULL; 669 if (params.dh_uaddr) { 670 
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len); 671 if (IS_ERR(dh_blob)) 672 return PTR_ERR(dh_blob); 673 674 start.dh_cert_address = __sme_set(__pa(dh_blob)); 675 start.dh_cert_len = params.dh_len; 676 } 677 678 session_blob = NULL; 679 if (params.session_uaddr) { 680 session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len); 681 if (IS_ERR(session_blob)) { 682 ret = PTR_ERR(session_blob); 683 goto e_free_dh; 684 } 685 686 start.session_address = __sme_set(__pa(session_blob)); 687 start.session_len = params.session_len; 688 } 689 690 start.handle = params.handle; 691 start.policy = params.policy; 692 693 /* create memory encryption context */ 694 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error); 695 if (ret) 696 goto e_free_session; 697 698 /* Bind ASID to this guest */ 699 ret = sev_bind_asid(kvm, start.handle, error); 700 if (ret) { 701 sev_decommission(start.handle); 702 goto e_free_session; 703 } 704 705 /* return handle to userspace */ 706 params.handle = start.handle; 707 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) { 708 sev_unbind_asid(kvm, start.handle); 709 ret = -EFAULT; 710 goto e_free_session; 711 } 712 713 sev->policy = params.policy; 714 sev->handle = start.handle; 715 sev->fd = argp->sev_fd; 716 717 e_free_session: 718 kfree(session_blob); 719 e_free_dh: 720 kfree(dh_blob); 721 return ret; 722 } 723 724 static int sev_check_pin_count(struct kvm *kvm, unsigned long npages) 725 { 726 unsigned long total_npages, lock_limit; 727 728 total_npages = to_kvm_sev_info(kvm)->pages_locked + npages; 729 if (total_npages > totalram_pages()) 730 return -EINVAL; 731 732 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 733 if (total_npages > lock_limit && !capable(CAP_IPC_LOCK)) { 734 pr_err_ratelimited("SEV: %lu total pages would exceed the lock limit of %lu.\n", 735 total_npages, lock_limit); 736 return -ENOMEM; 737 } 738 739 return 0; 740 } 741 742 static int 
sev_pin_user_pages(struct kvm *kvm, unsigned long addr, int npages, 743 unsigned int gup_flags, struct page **pages) 744 { 745 int npinned; 746 747 lockdep_assert_held(&kvm->lock); 748 749 npinned = pin_user_pages_fast(addr, npages, gup_flags, pages); 750 if (npinned != npages) { 751 if (npinned > 0) 752 unpin_user_pages(pages, npinned); 753 pr_err_ratelimited("SEV: Failure locking %u pages.\n", npages); 754 return -ENOMEM; 755 } 756 757 to_kvm_sev_info(kvm)->pages_locked += npages; 758 return 0; 759 } 760 761 static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, 762 unsigned long ulen, unsigned long *n, 763 unsigned int flags) 764 { 765 unsigned long npages; 766 struct page **pages; 767 int ret; 768 769 lockdep_assert_held(&kvm->lock); 770 771 if (ulen == 0 || uaddr + ulen < uaddr) 772 return ERR_PTR(-EINVAL); 773 774 /* 775 * Calculate the number of pages that need to be pinned to cover the 776 * entire range. Note! This isn't simply PFN_DOWN(ulen), as KVM 777 * doesn't require the incoming address+size to be page aligned! 778 */ 779 npages = PFN_DOWN(uaddr + ulen - 1) - PFN_DOWN(uaddr) + 1; 780 if (npages > INT_MAX) 781 return ERR_PTR(-EINVAL); 782 783 ret = sev_check_pin_count(kvm, npages); 784 if (ret) 785 return ERR_PTR(ret); 786 787 /* 788 * Don't WARN if the kernel (rightly) thinks the total size is absurd, 789 * i.e. rely on the kernel to reject outrageous range sizes. The above 790 * check on the number of pages is purely to avoid truncation as 791 * pin_user_pages_fast() takes the number of pages as a 32-bit int. 
792 */ 793 pages = kvzalloc_objs(*pages, npages, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 794 if (!pages) 795 return ERR_PTR(-ENOMEM); 796 797 ret = sev_pin_user_pages(kvm, uaddr, npages, flags, pages); 798 if (ret) { 799 kvfree(pages); 800 return ERR_PTR(ret); 801 } 802 803 *n = npages; 804 return pages; 805 } 806 807 static void sev_unpin_memory(struct kvm *kvm, struct page **pages, 808 unsigned long npages) 809 { 810 unpin_user_pages(pages, npages); 811 kvfree(pages); 812 to_kvm_sev_info(kvm)->pages_locked -= npages; 813 } 814 815 static struct page *sev_pin_page(struct kvm *kvm, unsigned long addr, 816 unsigned int flags) 817 { 818 struct page *page; 819 int r; 820 821 r = sev_check_pin_count(kvm, 1); 822 if (r) 823 return ERR_PTR(r); 824 825 r = sev_pin_user_pages(kvm, addr, 1, flags, &page); 826 if (r) 827 return ERR_PTR(r); 828 829 return page; 830 } 831 832 static void sev_unpin_page(struct kvm *kvm, struct page *page) 833 { 834 unpin_user_pages(&page, 1); 835 to_kvm_sev_info(kvm)->pages_locked -= 1; 836 } 837 838 static void sev_clflush_pages(struct page *pages[], unsigned long npages) 839 { 840 uint8_t *page_virtual; 841 unsigned long i; 842 843 if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 || 844 pages == NULL) 845 return; 846 847 for (i = 0; i < npages; i++) { 848 page_virtual = kmap_local_page(pages[i]); 849 clflush_cache_range(page_virtual, PAGE_SIZE); 850 kunmap_local(page_virtual); 851 cond_resched(); 852 } 853 } 854 855 static void sev_writeback_caches(struct kvm *kvm) 856 { 857 /* 858 * Ensure that all dirty guest tagged cache entries are written back 859 * before releasing the pages back to the system for use. CLFLUSH will 860 * not do this without SME_COHERENT, and flushing many cache lines 861 * individually is slower than blasting WBINVD for large VMs, so issue 862 * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported) 863 * on CPUs that have done VMRUN, i.e. may have dirtied data using the 864 * VM's ASID. 
865 * 866 * For simplicity, never remove CPUs from the bitmap. Ideally, KVM 867 * would clear the mask when flushing caches, but doing so requires 868 * serializing multiple calls and having responding CPUs (to the IPI) 869 * mark themselves as still running if they are running (or about to 870 * run) a vCPU for the VM. 871 * 872 * Note, the caller is responsible for ensuring correctness if the mask 873 * can be modified, e.g. if a CPU could be doing VMRUN. 874 */ 875 wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus); 876 } 877 878 static unsigned long get_num_contig_pages(unsigned long idx, 879 struct page **inpages, unsigned long npages) 880 { 881 unsigned long paddr, next_paddr; 882 unsigned long i = idx + 1, pages = 1; 883 884 /* find the number of contiguous pages starting from idx */ 885 paddr = __sme_page_pa(inpages[idx]); 886 while (i < npages) { 887 next_paddr = __sme_page_pa(inpages[i++]); 888 if ((paddr + PAGE_SIZE) == next_paddr) { 889 pages++; 890 paddr = next_paddr; 891 continue; 892 } 893 break; 894 } 895 896 return pages; 897 } 898 899 static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 900 { 901 unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; 902 struct kvm_sev_launch_update_data params; 903 struct sev_data_launch_update_data data; 904 struct page **inpages; 905 int ret; 906 907 if (!sev_guest(kvm)) 908 return -ENOTTY; 909 910 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 911 return -EFAULT; 912 913 vaddr = params.uaddr; 914 size = params.len; 915 vaddr_end = vaddr + size; 916 917 /* Lock the user memory. */ 918 inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); 919 if (IS_ERR(inpages)) 920 return PTR_ERR(inpages); 921 922 /* 923 * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in 924 * place; the cache may contain the data that was written unencrypted. 
925 */ 926 sev_clflush_pages(inpages, npages); 927 928 data.reserved = 0; 929 data.handle = to_kvm_sev_info(kvm)->handle; 930 931 for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { 932 int offset, len; 933 934 /* 935 * If the user buffer is not page-aligned, calculate the offset 936 * within the page. 937 */ 938 offset = vaddr & (PAGE_SIZE - 1); 939 940 /* Calculate the number of pages that can be encrypted in one go. */ 941 pages = get_num_contig_pages(i, inpages, npages); 942 943 len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size); 944 945 data.len = len; 946 data.address = __sme_page_pa(inpages[i]) + offset; 947 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error); 948 if (ret) 949 goto e_unpin; 950 951 size -= len; 952 next_vaddr = vaddr + len; 953 } 954 955 e_unpin: 956 /* content of memory is updated, mark pages dirty */ 957 for (i = 0; i < npages; i++) { 958 set_page_dirty_lock(inpages[i]); 959 mark_page_accessed(inpages[i]); 960 } 961 /* unlock the user pages */ 962 sev_unpin_memory(kvm, inpages, npages); 963 return ret; 964 } 965 966 static int sev_es_sync_vmsa(struct vcpu_svm *svm) 967 { 968 struct kvm_vcpu *vcpu = &svm->vcpu; 969 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 970 struct sev_es_save_area *save = svm->sev_es.vmsa; 971 struct xregs_state *xsave; 972 const u8 *s; 973 u8 *d; 974 int i; 975 976 lockdep_assert_held(&vcpu->mutex); 977 978 if (vcpu->arch.guest_state_protected) 979 return -EINVAL; 980 981 /* Check some debug related fields before encrypting the VMSA */ 982 if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) 983 return -EINVAL; 984 985 /* 986 * SEV-ES will use a VMSA that is pointed to by the VMCB, not 987 * the traditional VMSA that is part of the VMCB. Copy the 988 * traditional VMSA as it has been built so far (in prep 989 * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. 
990 */ 991 memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save)); 992 993 /* Sync registgers */ 994 save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX]; 995 save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX]; 996 save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 997 save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX]; 998 save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP]; 999 save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP]; 1000 save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI]; 1001 save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI]; 1002 #ifdef CONFIG_X86_64 1003 save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8]; 1004 save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9]; 1005 save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10]; 1006 save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11]; 1007 save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12]; 1008 save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13]; 1009 save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14]; 1010 save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15]; 1011 #endif 1012 save->rip = svm->vcpu.arch.rip; 1013 1014 /* Sync some non-GPR registers before encrypting */ 1015 save->xcr0 = svm->vcpu.arch.xcr0; 1016 save->pkru = svm->vcpu.arch.pkru; 1017 save->xss = svm->vcpu.arch.ia32_xss; 1018 save->dr6 = svm->vcpu.arch.dr6; 1019 1020 save->sev_features = sev->vmsa_features; 1021 1022 /* 1023 * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid 1024 * breaking older measurements. 1025 */ 1026 if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) { 1027 xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave; 1028 save->x87_dp = xsave->i387.rdp; 1029 save->mxcsr = xsave->i387.mxcsr; 1030 save->x87_ftw = xsave->i387.twd; 1031 save->x87_fsw = xsave->i387.swd; 1032 save->x87_fcw = xsave->i387.cwd; 1033 save->x87_fop = xsave->i387.fop; 1034 save->x87_ds = 0; 1035 save->x87_cs = 0; 1036 save->x87_rip = xsave->i387.rip; 1037 1038 for (i = 0; i < 8; i++) { 1039 /* 1040 * The format of the x87 save area is undocumented and 1041 * definitely not what you would expect. 
It consists of 1042 * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes 1043 * area with bytes 8-9 of each register. 1044 */ 1045 d = save->fpreg_x87 + i * 8; 1046 s = ((u8 *)xsave->i387.st_space) + i * 16; 1047 memcpy(d, s, 8); 1048 save->fpreg_x87[64 + i * 2] = s[8]; 1049 save->fpreg_x87[64 + i * 2 + 1] = s[9]; 1050 } 1051 memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256); 1052 1053 s = get_xsave_addr(xsave, XFEATURE_YMM); 1054 if (s) 1055 memcpy(save->fpreg_ymm, s, 256); 1056 else 1057 memset(save->fpreg_ymm, 0, 256); 1058 } 1059 1060 pr_debug("Virtual Machine Save Area (VMSA):\n"); 1061 print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); 1062 1063 return 0; 1064 } 1065 1066 static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, 1067 int *error) 1068 { 1069 struct sev_data_launch_update_vmsa vmsa; 1070 struct vcpu_svm *svm = to_svm(vcpu); 1071 int ret; 1072 1073 if (vcpu->guest_debug) { 1074 pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported"); 1075 return -EINVAL; 1076 } 1077 1078 /* Perform some pre-encryption checks against the VMSA */ 1079 ret = sev_es_sync_vmsa(svm); 1080 if (ret) 1081 return ret; 1082 1083 /* 1084 * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of 1085 * the VMSA memory content (i.e it will write the same memory region 1086 * with the guest's key), so invalidate it first. 1087 */ 1088 clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); 1089 1090 vmsa.reserved = 0; 1091 vmsa.handle = to_kvm_sev_info(kvm)->handle; 1092 vmsa.address = __sme_pa(svm->sev_es.vmsa); 1093 vmsa.len = PAGE_SIZE; 1094 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); 1095 if (ret) 1096 return ret; 1097 1098 /* 1099 * SEV-ES guests maintain an encrypted version of their FPU 1100 * state which is restored and saved on VMRUN and VMEXIT. 1101 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't 1102 * do xsave/xrstor on it. 
1103 */ 1104 fpstate_set_confidential(&vcpu->arch.guest_fpu); 1105 vcpu->arch.guest_state_protected = true; 1106 1107 /* 1108 * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it 1109 * only after setting guest_state_protected because KVM_SET_MSRS allows 1110 * dynamic toggling of LBRV (for performance reason) on write access to 1111 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. 1112 */ 1113 svm_enable_lbrv(vcpu); 1114 return 0; 1115 } 1116 1117 static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) 1118 { 1119 struct kvm_vcpu *vcpu; 1120 unsigned long i; 1121 int ret; 1122 1123 if (!sev_es_guest(kvm)) 1124 return -ENOTTY; 1125 1126 if (kvm_is_vcpu_creation_in_progress(kvm)) 1127 return -EBUSY; 1128 1129 ret = kvm_lock_all_vcpus(kvm); 1130 if (ret) 1131 return ret; 1132 1133 kvm_for_each_vcpu(i, vcpu, kvm) { 1134 ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error); 1135 if (ret) 1136 break; 1137 } 1138 1139 kvm_unlock_all_vcpus(kvm); 1140 return ret; 1141 } 1142 1143 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) 1144 { 1145 void __user *measure = u64_to_user_ptr(argp->data); 1146 struct sev_data_launch_measure data; 1147 struct kvm_sev_launch_measure params; 1148 void __user *p = NULL; 1149 void *blob = NULL; 1150 int ret; 1151 1152 if (!sev_guest(kvm)) 1153 return -ENOTTY; 1154 1155 if (copy_from_user(¶ms, measure, sizeof(params))) 1156 return -EFAULT; 1157 1158 memset(&data, 0, sizeof(data)); 1159 1160 /* User wants to query the blob length */ 1161 if (!params.len) 1162 goto cmd; 1163 1164 p = u64_to_user_ptr(params.uaddr); 1165 if (p) { 1166 if (params.len > SEV_FW_BLOB_MAX_SIZE) 1167 return -EINVAL; 1168 1169 blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT); 1170 if (!blob) 1171 return -ENOMEM; 1172 1173 data.address = __psp_pa(blob); 1174 data.len = params.len; 1175 } 1176 1177 cmd: 1178 data.handle = to_kvm_sev_info(kvm)->handle; 1179 ret = sev_issue_cmd(kvm, 
SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); 1180 1181 /* 1182 * If we query the session length, FW responded with expected data. 1183 */ 1184 if (!params.len) 1185 goto done; 1186 1187 if (ret) 1188 goto e_free_blob; 1189 1190 if (blob) { 1191 if (copy_to_user(p, blob, params.len)) 1192 ret = -EFAULT; 1193 } 1194 1195 done: 1196 params.len = data.len; 1197 if (copy_to_user(measure, ¶ms, sizeof(params))) 1198 ret = -EFAULT; 1199 e_free_blob: 1200 kfree(blob); 1201 return ret; 1202 } 1203 1204 static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1205 { 1206 struct sev_data_launch_finish data; 1207 1208 if (!sev_guest(kvm)) 1209 return -ENOTTY; 1210 1211 data.handle = to_kvm_sev_info(kvm)->handle; 1212 return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); 1213 } 1214 1215 static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) 1216 { 1217 struct kvm_sev_guest_status params; 1218 struct sev_data_guest_status data; 1219 int ret; 1220 1221 if (!sev_guest(kvm)) 1222 return -ENOTTY; 1223 1224 memset(&data, 0, sizeof(data)); 1225 1226 data.handle = to_kvm_sev_info(kvm)->handle; 1227 ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); 1228 if (ret) 1229 return ret; 1230 1231 params.policy = data.policy; 1232 params.state = data.state; 1233 params.handle = data.handle; 1234 1235 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) 1236 ret = -EFAULT; 1237 1238 return ret; 1239 } 1240 1241 static int sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src_pa, 1242 unsigned long dst_pa, unsigned int size, 1243 unsigned int ioctl, int *error) 1244 { 1245 int cmd = ioctl == KVM_SEV_DBG_DECRYPT ? 
SEV_CMD_DBG_DECRYPT : 1246 SEV_CMD_DBG_ENCRYPT; 1247 struct sev_data_dbg data = { 1248 .handle = to_kvm_sev_info(kvm)->handle, 1249 .dst_addr = dst_pa, 1250 .src_addr = src_pa, 1251 .len = size, 1252 }; 1253 1254 return sev_issue_cmd(kvm, cmd, &data, error); 1255 } 1256 1257 static void *sev_dbg_crypt_slow_alloc(struct page *page, unsigned long __va, 1258 unsigned int len, unsigned long *pa, 1259 unsigned int *nr_bytes) 1260 { 1261 unsigned long va = ALIGN_DOWN(__va, 16); 1262 1263 /* The number of bytes to {de,en}crypt must be 16-byte aligned. */ 1264 *nr_bytes = round_up(len, 16); 1265 1266 /* 1267 * Increase the number of bytes to {de,en}crypt by one chunk (16 bytes) 1268 * if the aligned address and length doesn't cover the unaligned range, 1269 * e.g. if the address is unaligned _and_ the access will split a chunk 1270 * at the tail. 1271 */ 1272 if (va + *nr_bytes < __va + len) 1273 *nr_bytes += 16; 1274 1275 *pa = __sme_page_pa(page) + (va & ~PAGE_MASK); 1276 1277 /* 1278 * Sanity check that the new access won't split a page. This should 1279 * never happen; just pretend the allocation failed. 
1280 */ 1281 if (WARN_ON_ONCE((*pa & PAGE_MASK) != ((*pa + *nr_bytes - 1) & PAGE_MASK))) 1282 return NULL; 1283 1284 return kmalloc(*nr_bytes, GFP_KERNEL); 1285 } 1286 1287 static int sev_dbg_decrypt_slow(struct kvm *kvm, unsigned long src, 1288 struct page *src_p, unsigned long dst, 1289 unsigned int len, int *err) 1290 { 1291 unsigned int nr_bytes; 1292 unsigned long src_pa; 1293 void *buf; 1294 int r; 1295 1296 buf = sev_dbg_crypt_slow_alloc(src_p, src, len, &src_pa, &nr_bytes); 1297 if (!buf) 1298 return -ENOMEM; 1299 1300 r = sev_issue_dbg_cmd(kvm, src_pa, __sme_set(__pa(buf)), 1301 nr_bytes, KVM_SEV_DBG_DECRYPT, err); 1302 if (r) 1303 goto out; 1304 1305 if (copy_to_user((void __user *)dst, buf + (src & 15), len)) 1306 r = -EFAULT; 1307 out: 1308 kfree(buf); 1309 return r; 1310 } 1311 1312 static int sev_dbg_encrypt_slow(struct kvm *kvm, unsigned long src, 1313 unsigned long dst, struct page *dst_p, 1314 unsigned int len, int *err) 1315 { 1316 unsigned int nr_bytes; 1317 unsigned long dst_pa; 1318 void *buf; 1319 int r; 1320 1321 /* Decrypt the _destination_ to do a RMW on plaintext. */ 1322 buf = sev_dbg_crypt_slow_alloc(dst_p, dst, len, &dst_pa, &nr_bytes); 1323 if (!buf) 1324 return -ENOMEM; 1325 1326 r = sev_issue_dbg_cmd(kvm, dst_pa, __sme_set(__pa(buf)), 1327 nr_bytes, KVM_SEV_DBG_DECRYPT, err); 1328 if (r) 1329 goto out; 1330 1331 /* 1332 * Copy from the source into the intermediate buffer, and then 1333 * re-encrypt the buffer into the destination. 
1334 */ 1335 if (copy_from_user(buf + (dst & 15), (void __user *)src, len)) 1336 r = -EFAULT; 1337 else 1338 r = sev_issue_dbg_cmd(kvm, __sme_set(__pa(buf)), dst_pa, 1339 nr_bytes, KVM_SEV_DBG_ENCRYPT, err); 1340 out: 1341 kfree(buf); 1342 return r; 1343 } 1344 1345 static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, 1346 unsigned int cmd) 1347 { 1348 struct kvm_sev_dbg debug; 1349 unsigned int i, len; 1350 1351 if (!sev_guest(kvm)) 1352 return -ENOTTY; 1353 1354 if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug))) 1355 return -EFAULT; 1356 1357 if (!debug.len || !debug.src_uaddr || !debug.dst_uaddr) 1358 return -EINVAL; 1359 1360 if (debug.src_uaddr + debug.len < debug.src_uaddr || 1361 debug.dst_uaddr + debug.len < debug.dst_uaddr) 1362 return -EINVAL; 1363 1364 for (i = 0; i < debug.len; i += len) { 1365 unsigned long src = debug.src_uaddr + i; 1366 unsigned long dst = debug.dst_uaddr + i; 1367 unsigned long s_off = src & ~PAGE_MASK; 1368 unsigned long d_off = dst & ~PAGE_MASK; 1369 struct page *src_p, *dst_p; 1370 int ret; 1371 1372 /* 1373 * Copy as many remaining bytes as possible while staying in a 1374 * single page for both the source and destination. 1375 */ 1376 len = min3(debug.len - i, PAGE_SIZE - s_off, PAGE_SIZE - d_off); 1377 1378 /* 1379 * Pin the source and destination pages; firmware operates on 1380 * physical addresses. 1381 */ 1382 src_p = sev_pin_page(kvm, src & PAGE_MASK, 0); 1383 if (IS_ERR(src_p)) 1384 return PTR_ERR(src_p); 1385 1386 dst_p = sev_pin_page(kvm, dst & PAGE_MASK, FOLL_WRITE); 1387 if (IS_ERR(dst_p)) { 1388 sev_unpin_page(kvm, src_p); 1389 return PTR_ERR(dst_p); 1390 } 1391 1392 /* 1393 * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify 1394 * the pages; flush the destination too so that future accesses do not 1395 * see stale data. 
1396 */ 1397 sev_clflush_pages(&src_p, 1); 1398 sev_clflush_pages(&dst_p, 1); 1399 1400 if (IS_ALIGNED(src, 16) && IS_ALIGNED(dst, 16) && IS_ALIGNED(len, 16)) 1401 ret = sev_issue_dbg_cmd(kvm, 1402 __sme_page_pa(src_p) + s_off, 1403 __sme_page_pa(dst_p) + d_off, 1404 len, cmd, &argp->error); 1405 else if (cmd == KVM_SEV_DBG_DECRYPT) 1406 ret = sev_dbg_decrypt_slow(kvm, src, src_p, dst, 1407 len, &argp->error); 1408 else 1409 ret = sev_dbg_encrypt_slow(kvm, src, dst, dst_p, 1410 len, &argp->error); 1411 1412 sev_unpin_page(kvm, src_p); 1413 sev_unpin_page(kvm, dst_p); 1414 1415 if (ret) 1416 return ret; 1417 } 1418 return 0; 1419 } 1420 1421 static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) 1422 { 1423 struct sev_data_launch_secret data; 1424 struct kvm_sev_launch_secret params; 1425 struct page **pages; 1426 void *blob, *hdr; 1427 unsigned long n, i; 1428 int ret, offset; 1429 1430 if (!sev_guest(kvm)) 1431 return -ENOTTY; 1432 1433 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 1434 return -EFAULT; 1435 1436 pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE); 1437 if (IS_ERR(pages)) 1438 return PTR_ERR(pages); 1439 1440 /* 1441 * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in 1442 * place; the cache may contain the data that was written unencrypted. 1443 */ 1444 sev_clflush_pages(pages, n); 1445 1446 /* 1447 * The secret must be copied into contiguous memory region, lets verify 1448 * that userspace memory pages are contiguous before we issue command. 
1449 */ 1450 if (get_num_contig_pages(0, pages, n) != n) { 1451 ret = -EINVAL; 1452 goto e_unpin_memory; 1453 } 1454 1455 memset(&data, 0, sizeof(data)); 1456 1457 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1458 data.guest_address = __sme_page_pa(pages[0]) + offset; 1459 data.guest_len = params.guest_len; 1460 1461 blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len); 1462 if (IS_ERR(blob)) { 1463 ret = PTR_ERR(blob); 1464 goto e_unpin_memory; 1465 } 1466 1467 data.trans_address = __psp_pa(blob); 1468 data.trans_len = params.trans_len; 1469 1470 hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); 1471 if (IS_ERR(hdr)) { 1472 ret = PTR_ERR(hdr); 1473 goto e_free_blob; 1474 } 1475 data.hdr_address = __psp_pa(hdr); 1476 data.hdr_len = params.hdr_len; 1477 1478 data.handle = to_kvm_sev_info(kvm)->handle; 1479 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); 1480 1481 kfree(hdr); 1482 1483 e_free_blob: 1484 kfree(blob); 1485 e_unpin_memory: 1486 /* content of memory is updated, mark pages dirty */ 1487 for (i = 0; i < n; i++) { 1488 set_page_dirty_lock(pages[i]); 1489 mark_page_accessed(pages[i]); 1490 } 1491 sev_unpin_memory(kvm, pages, n); 1492 return ret; 1493 } 1494 1495 static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) 1496 { 1497 void __user *report = u64_to_user_ptr(argp->data); 1498 struct sev_data_attestation_report data; 1499 struct kvm_sev_attestation_report params; 1500 void __user *p; 1501 void *blob = NULL; 1502 int ret; 1503 1504 if (!sev_guest(kvm)) 1505 return -ENOTTY; 1506 1507 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 1508 return -EFAULT; 1509 1510 memset(&data, 0, sizeof(data)); 1511 1512 /* User wants to query the blob length */ 1513 if (!params.len) 1514 goto cmd; 1515 1516 p = u64_to_user_ptr(params.uaddr); 1517 if (p) { 1518 if (params.len > SEV_FW_BLOB_MAX_SIZE) 1519 return -EINVAL; 1520 1521 blob = kzalloc(params.len, 
GFP_KERNEL_ACCOUNT); 1522 if (!blob) 1523 return -ENOMEM; 1524 1525 data.address = __psp_pa(blob); 1526 data.len = params.len; 1527 memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); 1528 } 1529 cmd: 1530 data.handle = to_kvm_sev_info(kvm)->handle; 1531 ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); 1532 /* 1533 * If we query the session length, FW responded with expected data. 1534 */ 1535 if (!params.len) 1536 goto done; 1537 1538 if (ret) 1539 goto e_free_blob; 1540 1541 if (blob) { 1542 if (copy_to_user(p, blob, params.len)) 1543 ret = -EFAULT; 1544 } 1545 1546 done: 1547 params.len = data.len; 1548 if (copy_to_user(report, ¶ms, sizeof(params))) 1549 ret = -EFAULT; 1550 e_free_blob: 1551 kfree(blob); 1552 return ret; 1553 } 1554 1555 /* Userspace wants to query session length. */ 1556 static int 1557 __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, 1558 struct kvm_sev_send_start *params) 1559 { 1560 struct sev_data_send_start data; 1561 int ret; 1562 1563 memset(&data, 0, sizeof(data)); 1564 data.handle = to_kvm_sev_info(kvm)->handle; 1565 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); 1566 1567 params->session_len = data.session_len; 1568 if (copy_to_user(u64_to_user_ptr(argp->data), params, 1569 sizeof(struct kvm_sev_send_start))) 1570 ret = -EFAULT; 1571 1572 return ret; 1573 } 1574 1575 static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 1576 { 1577 struct sev_data_send_start data; 1578 struct kvm_sev_send_start params; 1579 void *amd_certs, *session_data; 1580 void *pdh_cert, *plat_certs; 1581 int ret; 1582 1583 if (!sev_guest(kvm)) 1584 return -ENOTTY; 1585 1586 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1587 sizeof(struct kvm_sev_send_start))) 1588 return -EFAULT; 1589 1590 /* if session_len is zero, userspace wants to query the session length */ 1591 if (!params.session_len) 1592 return __sev_send_start_query_session_length(kvm, argp, 1593 
¶ms); 1594 1595 /* some sanity checks */ 1596 if (!params.pdh_cert_uaddr || !params.pdh_cert_len || 1597 !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE) 1598 return -EINVAL; 1599 1600 /* allocate the memory to hold the session data blob */ 1601 session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT); 1602 if (!session_data) 1603 return -ENOMEM; 1604 1605 /* copy the certificate blobs from userspace */ 1606 pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr, 1607 params.pdh_cert_len); 1608 if (IS_ERR(pdh_cert)) { 1609 ret = PTR_ERR(pdh_cert); 1610 goto e_free_session; 1611 } 1612 1613 plat_certs = psp_copy_user_blob(params.plat_certs_uaddr, 1614 params.plat_certs_len); 1615 if (IS_ERR(plat_certs)) { 1616 ret = PTR_ERR(plat_certs); 1617 goto e_free_pdh; 1618 } 1619 1620 amd_certs = psp_copy_user_blob(params.amd_certs_uaddr, 1621 params.amd_certs_len); 1622 if (IS_ERR(amd_certs)) { 1623 ret = PTR_ERR(amd_certs); 1624 goto e_free_plat_cert; 1625 } 1626 1627 /* populate the FW SEND_START field with system physical address */ 1628 memset(&data, 0, sizeof(data)); 1629 data.pdh_cert_address = __psp_pa(pdh_cert); 1630 data.pdh_cert_len = params.pdh_cert_len; 1631 data.plat_certs_address = __psp_pa(plat_certs); 1632 data.plat_certs_len = params.plat_certs_len; 1633 data.amd_certs_address = __psp_pa(amd_certs); 1634 data.amd_certs_len = params.amd_certs_len; 1635 data.session_address = __psp_pa(session_data); 1636 data.session_len = params.session_len; 1637 data.handle = to_kvm_sev_info(kvm)->handle; 1638 1639 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); 1640 1641 if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr), 1642 session_data, params.session_len)) { 1643 ret = -EFAULT; 1644 goto e_free_amd_cert; 1645 } 1646 1647 params.policy = data.policy; 1648 params.session_len = data.session_len; 1649 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, 1650 sizeof(struct kvm_sev_send_start))) 1651 ret = -EFAULT; 1652 1653 
e_free_amd_cert: 1654 kfree(amd_certs); 1655 e_free_plat_cert: 1656 kfree(plat_certs); 1657 e_free_pdh: 1658 kfree(pdh_cert); 1659 e_free_session: 1660 kfree(session_data); 1661 return ret; 1662 } 1663 1664 /* Userspace wants to query either header or trans length. */ 1665 static int 1666 __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, 1667 struct kvm_sev_send_update_data *params) 1668 { 1669 struct sev_data_send_update_data data; 1670 int ret; 1671 1672 memset(&data, 0, sizeof(data)); 1673 data.handle = to_kvm_sev_info(kvm)->handle; 1674 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); 1675 1676 params->hdr_len = data.hdr_len; 1677 params->trans_len = data.trans_len; 1678 1679 if (copy_to_user(u64_to_user_ptr(argp->data), params, 1680 sizeof(struct kvm_sev_send_update_data))) 1681 ret = -EFAULT; 1682 1683 return ret; 1684 } 1685 1686 static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 1687 { 1688 struct sev_data_send_update_data data; 1689 struct kvm_sev_send_update_data params; 1690 void *hdr, *trans_data; 1691 struct page *guest_page; 1692 int ret, offset; 1693 1694 if (!sev_guest(kvm)) 1695 return -ENOTTY; 1696 1697 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1698 sizeof(struct kvm_sev_send_update_data))) 1699 return -EFAULT; 1700 1701 /* userspace wants to query either header or trans length */ 1702 if (!params.trans_len || !params.hdr_len) 1703 return __sev_send_update_data_query_lengths(kvm, argp, ¶ms); 1704 1705 if (!params.trans_uaddr || !params.guest_uaddr || 1706 !params.guest_len || !params.hdr_uaddr) 1707 return -EINVAL; 1708 1709 /* Check if we are crossing the page boundary */ 1710 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1711 if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) 1712 return -EINVAL; 1713 1714 /* Pin guest memory */ 1715 guest_page = sev_pin_page(kvm, params.guest_uaddr & PAGE_MASK, 0); 1716 if (IS_ERR(guest_page)) 
1717 return PTR_ERR(guest_page); 1718 1719 /* allocate memory for header and transport buffer */ 1720 ret = -ENOMEM; 1721 hdr = kzalloc(params.hdr_len, GFP_KERNEL); 1722 if (!hdr) 1723 goto e_unpin; 1724 1725 trans_data = kzalloc(params.trans_len, GFP_KERNEL); 1726 if (!trans_data) 1727 goto e_free_hdr; 1728 1729 memset(&data, 0, sizeof(data)); 1730 data.hdr_address = __psp_pa(hdr); 1731 data.hdr_len = params.hdr_len; 1732 data.trans_address = __psp_pa(trans_data); 1733 data.trans_len = params.trans_len; 1734 1735 /* The SEND_UPDATE_DATA command requires C-bit to be always set. */ 1736 data.guest_address = page_to_phys(guest_page) + offset; 1737 data.guest_address |= sev_me_mask; 1738 data.guest_len = params.guest_len; 1739 data.handle = to_kvm_sev_info(kvm)->handle; 1740 1741 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); 1742 1743 if (ret) 1744 goto e_free_trans_data; 1745 1746 /* copy transport buffer to user space */ 1747 if (copy_to_user(u64_to_user_ptr(params.trans_uaddr), 1748 trans_data, params.trans_len)) { 1749 ret = -EFAULT; 1750 goto e_free_trans_data; 1751 } 1752 1753 /* Copy packet header to userspace. 
*/ 1754 if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr, 1755 params.hdr_len)) 1756 ret = -EFAULT; 1757 1758 e_free_trans_data: 1759 kfree(trans_data); 1760 e_free_hdr: 1761 kfree(hdr); 1762 e_unpin: 1763 sev_unpin_page(kvm, guest_page); 1764 return ret; 1765 } 1766 1767 static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1768 { 1769 struct sev_data_send_finish data; 1770 1771 if (!sev_guest(kvm)) 1772 return -ENOTTY; 1773 1774 data.handle = to_kvm_sev_info(kvm)->handle; 1775 return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); 1776 } 1777 1778 static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) 1779 { 1780 struct sev_data_send_cancel data; 1781 1782 if (!sev_guest(kvm)) 1783 return -ENOTTY; 1784 1785 data.handle = to_kvm_sev_info(kvm)->handle; 1786 return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); 1787 } 1788 1789 static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 1790 { 1791 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 1792 struct sev_data_receive_start start; 1793 struct kvm_sev_receive_start params; 1794 int *error = &argp->error; 1795 void *session_data; 1796 void *pdh_data; 1797 int ret; 1798 1799 if (!sev_guest(kvm)) 1800 return -ENOTTY; 1801 1802 /* Get parameter from the userspace */ 1803 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1804 sizeof(struct kvm_sev_receive_start))) 1805 return -EFAULT; 1806 1807 /* some sanity checks */ 1808 if (!params.pdh_uaddr || !params.pdh_len || 1809 !params.session_uaddr || !params.session_len) 1810 return -EINVAL; 1811 1812 pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len); 1813 if (IS_ERR(pdh_data)) 1814 return PTR_ERR(pdh_data); 1815 1816 session_data = psp_copy_user_blob(params.session_uaddr, 1817 params.session_len); 1818 if (IS_ERR(session_data)) { 1819 ret = PTR_ERR(session_data); 1820 goto e_free_pdh; 1821 } 1822 1823 memset(&start, 0, sizeof(start)); 1824 start.handle = params.handle; 1825 
start.policy = params.policy; 1826 start.pdh_cert_address = __psp_pa(pdh_data); 1827 start.pdh_cert_len = params.pdh_len; 1828 start.session_address = __psp_pa(session_data); 1829 start.session_len = params.session_len; 1830 1831 /* create memory encryption context */ 1832 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start, 1833 error); 1834 if (ret) 1835 goto e_free_session; 1836 1837 /* Bind ASID to this guest */ 1838 ret = sev_bind_asid(kvm, start.handle, error); 1839 if (ret) { 1840 sev_decommission(start.handle); 1841 goto e_free_session; 1842 } 1843 1844 params.handle = start.handle; 1845 if (copy_to_user(u64_to_user_ptr(argp->data), 1846 ¶ms, sizeof(struct kvm_sev_receive_start))) { 1847 ret = -EFAULT; 1848 sev_unbind_asid(kvm, start.handle); 1849 goto e_free_session; 1850 } 1851 1852 sev->handle = start.handle; 1853 sev->fd = argp->sev_fd; 1854 1855 e_free_session: 1856 kfree(session_data); 1857 e_free_pdh: 1858 kfree(pdh_data); 1859 1860 return ret; 1861 } 1862 1863 static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 1864 { 1865 struct kvm_sev_receive_update_data params; 1866 struct sev_data_receive_update_data data; 1867 void *hdr = NULL, *trans = NULL; 1868 struct page *guest_page; 1869 int ret, offset; 1870 1871 if (!sev_guest(kvm)) 1872 return -EINVAL; 1873 1874 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1875 sizeof(struct kvm_sev_receive_update_data))) 1876 return -EFAULT; 1877 1878 if (!params.hdr_uaddr || !params.hdr_len || 1879 !params.guest_uaddr || !params.guest_len || 1880 !params.trans_uaddr || !params.trans_len) 1881 return -EINVAL; 1882 1883 /* Check if we are crossing the page boundary */ 1884 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1885 if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) 1886 return -EINVAL; 1887 1888 hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); 1889 if (IS_ERR(hdr)) 1890 return PTR_ERR(hdr); 1891 1892 trans = 
psp_copy_user_blob(params.trans_uaddr, params.trans_len); 1893 if (IS_ERR(trans)) { 1894 ret = PTR_ERR(trans); 1895 goto e_free_hdr; 1896 } 1897 1898 memset(&data, 0, sizeof(data)); 1899 data.hdr_address = __psp_pa(hdr); 1900 data.hdr_len = params.hdr_len; 1901 data.trans_address = __psp_pa(trans); 1902 data.trans_len = params.trans_len; 1903 1904 /* Pin guest memory */ 1905 guest_page = sev_pin_page(kvm, params.guest_uaddr & PAGE_MASK, FOLL_WRITE); 1906 if (IS_ERR(guest_page)) { 1907 ret = PTR_ERR(guest_page); 1908 goto e_free_trans; 1909 } 1910 1911 /* 1912 * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP 1913 * encrypts the written data with the guest's key, and the cache may 1914 * contain dirty, unencrypted data. 1915 */ 1916 sev_clflush_pages(&guest_page, 1); 1917 1918 /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ 1919 data.guest_address = page_to_phys(guest_page) + offset; 1920 data.guest_address |= sev_me_mask; 1921 data.guest_len = params.guest_len; 1922 data.handle = to_kvm_sev_info(kvm)->handle; 1923 1924 ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, 1925 &argp->error); 1926 1927 sev_unpin_page(kvm, guest_page); 1928 1929 e_free_trans: 1930 kfree(trans); 1931 e_free_hdr: 1932 kfree(hdr); 1933 1934 return ret; 1935 } 1936 1937 static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1938 { 1939 struct sev_data_receive_finish data; 1940 1941 if (!sev_guest(kvm)) 1942 return -ENOTTY; 1943 1944 data.handle = to_kvm_sev_info(kvm)->handle; 1945 return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); 1946 } 1947 1948 static bool is_cmd_allowed_from_mirror(u32 cmd_id) 1949 { 1950 /* 1951 * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES 1952 * active mirror VMs. Also allow the debugging and status commands. 
1953 */ 1954 if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA || 1955 cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT || 1956 cmd_id == KVM_SEV_DBG_ENCRYPT) 1957 return true; 1958 1959 return false; 1960 } 1961 1962 static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) 1963 { 1964 struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); 1965 struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); 1966 int r = -EBUSY; 1967 1968 if (dst_kvm == src_kvm) 1969 return -EINVAL; 1970 1971 /* 1972 * Bail if these VMs are already involved in a migration to avoid 1973 * deadlock between two VMs trying to migrate to/from each other. 1974 */ 1975 if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) 1976 return -EBUSY; 1977 1978 if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) 1979 goto release_dst; 1980 1981 r = -EINTR; 1982 if (mutex_lock_killable(&dst_kvm->lock)) 1983 goto release_src; 1984 if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) 1985 goto unlock_dst; 1986 return 0; 1987 1988 unlock_dst: 1989 mutex_unlock(&dst_kvm->lock); 1990 release_src: 1991 atomic_set_release(&src_sev->migration_in_progress, 0); 1992 release_dst: 1993 atomic_set_release(&dst_sev->migration_in_progress, 0); 1994 return r; 1995 } 1996 1997 static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) 1998 { 1999 struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); 2000 struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); 2001 2002 mutex_unlock(&dst_kvm->lock); 2003 mutex_unlock(&src_kvm->lock); 2004 atomic_set_release(&dst_sev->migration_in_progress, 0); 2005 atomic_set_release(&src_sev->migration_in_progress, 0); 2006 } 2007 2008 static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) 2009 { 2010 struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); 2011 struct kvm_sev_info *src = to_kvm_sev_info(src_kvm); 2012 struct kvm_vcpu *dst_vcpu, *src_vcpu; 2013 struct vcpu_svm *dst_svm, *src_svm; 
2014 struct kvm_sev_info *mirror; 2015 unsigned long i; 2016 2017 dst->active = true; 2018 dst->asid = src->asid; 2019 dst->handle = src->handle; 2020 dst->pages_locked = src->pages_locked; 2021 dst->enc_context_owner = src->enc_context_owner; 2022 dst->es_active = src->es_active; 2023 dst->vmsa_features = src->vmsa_features; 2024 2025 src->asid = 0; 2026 src->active = false; 2027 src->handle = 0; 2028 src->pages_locked = 0; 2029 src->enc_context_owner = NULL; 2030 src->es_active = false; 2031 2032 list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); 2033 2034 /* 2035 * If this VM has mirrors, "transfer" each mirror's refcount of the 2036 * source to the destination (this KVM). The caller holds a reference 2037 * to the source, so there's no danger of use-after-free. 2038 */ 2039 list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms); 2040 list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) { 2041 kvm_get_kvm(dst_kvm); 2042 kvm_put_kvm(src_kvm); 2043 mirror->enc_context_owner = dst_kvm; 2044 } 2045 2046 /* 2047 * If this VM is a mirror, remove the old mirror from the owners list 2048 * and add the new mirror to the list. 2049 */ 2050 if (is_mirroring_enc_context(dst_kvm)) { 2051 struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner); 2052 2053 list_del(&src->mirror_entry); 2054 list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); 2055 } 2056 2057 kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) { 2058 dst_svm = to_svm(dst_vcpu); 2059 2060 sev_init_vmcb(dst_svm, false); 2061 2062 if (!dst->es_active) 2063 continue; 2064 2065 /* 2066 * Note, the source is not required to have the same number of 2067 * vCPUs as the destination when migrating a vanilla SEV VM. 2068 */ 2069 src_vcpu = kvm_get_vcpu(src_kvm, i); 2070 src_svm = to_svm(src_vcpu); 2071 2072 /* 2073 * Transfer VMSA and GHCB state to the destination. 
Nullify and 2074 * clear source fields as appropriate, the state now belongs to 2075 * the destination. 2076 */ 2077 memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es)); 2078 dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa; 2079 dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa; 2080 dst_vcpu->arch.guest_state_protected = true; 2081 2082 memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es)); 2083 src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE; 2084 src_svm->vmcb->control.vmsa_pa = INVALID_PAGE; 2085 src_vcpu->arch.guest_state_protected = false; 2086 } 2087 } 2088 2089 static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) 2090 { 2091 struct kvm_vcpu *src_vcpu; 2092 unsigned long i; 2093 2094 if (kvm_is_vcpu_creation_in_progress(src) || 2095 kvm_is_vcpu_creation_in_progress(dst)) 2096 return -EBUSY; 2097 2098 if (!sev_es_guest(src)) 2099 return 0; 2100 2101 if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus)) 2102 return -EINVAL; 2103 2104 kvm_for_each_vcpu(i, src_vcpu, src) { 2105 if (!src_vcpu->arch.guest_state_protected) 2106 return -EINVAL; 2107 } 2108 2109 return 0; 2110 } 2111 2112 int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) 2113 { 2114 struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm); 2115 struct kvm_sev_info *src_sev, *cg_cleanup_sev; 2116 CLASS(fd, f)(source_fd); 2117 struct kvm *source_kvm; 2118 bool charged = false; 2119 int ret; 2120 2121 if (fd_empty(f)) 2122 return -EBADF; 2123 2124 if (!file_is_kvm(fd_file(f))) 2125 return -EBADF; 2126 2127 source_kvm = fd_file(f)->private_data; 2128 ret = sev_lock_two_vms(kvm, source_kvm); 2129 if (ret) 2130 return ret; 2131 2132 if (kvm->arch.vm_type != source_kvm->arch.vm_type || 2133 sev_guest(kvm) || !sev_guest(source_kvm)) { 2134 ret = -EINVAL; 2135 goto out_unlock; 2136 } 2137 2138 src_sev = to_kvm_sev_info(source_kvm); 2139 2140 dst_sev->misc_cg = get_current_misc_cg(); 2141 cg_cleanup_sev = dst_sev; 
2142 if (dst_sev->misc_cg != src_sev->misc_cg) { 2143 ret = sev_misc_cg_try_charge(dst_sev); 2144 if (ret) 2145 goto out_dst_cgroup; 2146 charged = true; 2147 } 2148 2149 ret = kvm_lock_all_vcpus(kvm); 2150 if (ret) 2151 goto out_dst_cgroup; 2152 ret = kvm_lock_all_vcpus(source_kvm); 2153 if (ret) 2154 goto out_dst_vcpu; 2155 2156 ret = sev_check_source_vcpus(kvm, source_kvm); 2157 if (ret) 2158 goto out_source_vcpu; 2159 2160 /* 2161 * Allocate a new have_run_cpus for the destination, i.e. don't copy 2162 * the set of CPUs from the source. If a CPU was used to run a vCPU in 2163 * the source VM but is never used for the destination VM, then the CPU 2164 * can only have cached memory that was accessible to the source VM. 2165 */ 2166 if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 2167 ret = -ENOMEM; 2168 goto out_source_vcpu; 2169 } 2170 2171 sev_migrate_from(kvm, source_kvm); 2172 kvm_vm_dead(source_kvm); 2173 cg_cleanup_sev = src_sev; 2174 ret = 0; 2175 2176 out_source_vcpu: 2177 kvm_unlock_all_vcpus(source_kvm); 2178 out_dst_vcpu: 2179 kvm_unlock_all_vcpus(kvm); 2180 out_dst_cgroup: 2181 /* Operates on the source on success, on the destination on failure. */ 2182 if (charged) 2183 sev_misc_cg_uncharge(cg_cleanup_sev); 2184 put_misc_cg(cg_cleanup_sev->misc_cg); 2185 cg_cleanup_sev->misc_cg = NULL; 2186 out_unlock: 2187 sev_unlock_two_vms(kvm, source_kvm); 2188 return ret; 2189 } 2190 2191 int sev_dev_get_attr(u32 group, u64 attr, u64 *val) 2192 { 2193 if (group != KVM_X86_GRP_SEV) 2194 return -ENXIO; 2195 2196 switch (attr) { 2197 case KVM_X86_SEV_VMSA_FEATURES: 2198 *val = sev_supported_vmsa_features; 2199 return 0; 2200 2201 case KVM_X86_SNP_POLICY_BITS: 2202 *val = snp_supported_policy_bits; 2203 return 0; 2204 2205 case KVM_X86_SEV_SNP_REQ_CERTS: 2206 *val = sev_snp_enabled ? 
1 : 0; 2207 return 0; 2208 default: 2209 return -ENXIO; 2210 } 2211 } 2212 2213 /* 2214 * The guest context contains all the information, keys and metadata 2215 * associated with the guest that the firmware tracks to implement SEV 2216 * and SNP features. The firmware stores the guest context in hypervisor 2217 * provide page via the SNP_GCTX_CREATE command. 2218 */ 2219 static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) 2220 { 2221 struct sev_data_snp_addr data = {}; 2222 void *context; 2223 int rc; 2224 2225 /* Allocate memory for context page */ 2226 context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); 2227 if (!context) 2228 return NULL; 2229 2230 data.address = __psp_pa(context); 2231 rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); 2232 if (rc) { 2233 pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d", 2234 rc, argp->error); 2235 snp_free_firmware_page(context); 2236 return NULL; 2237 } 2238 2239 return context; 2240 } 2241 2242 static int snp_bind_asid(struct kvm *kvm, int *error) 2243 { 2244 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2245 struct sev_data_snp_activate data = {0}; 2246 2247 data.gctx_paddr = __psp_pa(sev->snp_context); 2248 data.asid = sev_get_asid(kvm); 2249 return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); 2250 } 2251 2252 static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 2253 { 2254 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2255 struct sev_data_snp_launch_start start = {0}; 2256 struct kvm_sev_snp_launch_start params; 2257 int rc; 2258 2259 if (!sev_snp_guest(kvm)) 2260 return -ENOTTY; 2261 2262 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2263 return -EFAULT; 2264 2265 /* Don't allow userspace to allocate memory for more than 1 SNP context. 
*/ 2266 if (sev->snp_context) 2267 return -EINVAL; 2268 2269 if (params.flags) 2270 return -EINVAL; 2271 2272 if (params.policy & ~snp_supported_policy_bits) 2273 return -EINVAL; 2274 2275 /* Check for policy bits that must be set */ 2276 if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO)) 2277 return -EINVAL; 2278 2279 if (snp_is_secure_tsc_enabled(kvm)) { 2280 if (WARN_ON_ONCE(!kvm->arch.default_tsc_khz)) 2281 return -EINVAL; 2282 2283 start.desired_tsc_khz = kvm->arch.default_tsc_khz; 2284 } 2285 2286 sev->snp_context = snp_context_create(kvm, argp); 2287 if (!sev->snp_context) 2288 return -ENOTTY; 2289 2290 start.gctx_paddr = __psp_pa(sev->snp_context); 2291 start.policy = params.policy; 2292 2293 memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); 2294 rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); 2295 if (rc) { 2296 pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n", 2297 __func__, rc); 2298 goto e_free_context; 2299 } 2300 2301 sev->policy = params.policy; 2302 sev->fd = argp->sev_fd; 2303 rc = snp_bind_asid(kvm, &argp->error); 2304 if (rc) { 2305 pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n", 2306 __func__, rc); 2307 goto e_free_context; 2308 } 2309 2310 return 0; 2311 2312 e_free_context: 2313 snp_decommission_context(kvm); 2314 2315 return rc; 2316 } 2317 2318 struct sev_gmem_populate_args { 2319 __u8 type; 2320 int sev_fd; 2321 int fw_error; 2322 }; 2323 2324 static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 2325 struct page *src_page, void *opaque) 2326 { 2327 struct sev_gmem_populate_args *sev_populate_args = opaque; 2328 struct sev_data_snp_launch_update fw_args = {0}; 2329 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2330 bool assigned = false; 2331 int level; 2332 int ret; 2333 2334 if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page)) 2335 return -EINVAL; 2336 2337 ret = snp_lookup_rmpentry((u64)pfn, 
&assigned, &level); 2338 if (ret || assigned) { 2339 pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", 2340 __func__, gfn, ret, assigned); 2341 ret = ret ? -EINVAL : -EEXIST; 2342 goto out; 2343 } 2344 2345 if (src_page) { 2346 void *src_vaddr = kmap_local_page(src_page); 2347 void *dst_vaddr = kmap_local_pfn(pfn); 2348 2349 memcpy(dst_vaddr, src_vaddr, PAGE_SIZE); 2350 2351 kunmap_local(dst_vaddr); 2352 kunmap_local(src_vaddr); 2353 } 2354 2355 ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K, 2356 sev_get_asid(kvm), true); 2357 if (ret) 2358 goto out; 2359 2360 fw_args.gctx_paddr = __psp_pa(sev->snp_context); 2361 fw_args.address = __sme_set(pfn_to_hpa(pfn)); 2362 fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); 2363 fw_args.page_type = sev_populate_args->type; 2364 2365 ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, 2366 &fw_args, &sev_populate_args->fw_error); 2367 /* 2368 * If the firmware command failed handle the reclaim and cleanup of that 2369 * PFN before reporting an error. 2370 * 2371 * Additionally, when invalid CPUID function entries are detected, 2372 * firmware writes the expected values into the page and leaves it 2373 * unencrypted so it can be used for debugging and error-reporting. 2374 * 2375 * Copy this page back into the source buffer so userspace can use this 2376 * information to provide information on which CPUID leaves/fields 2377 * failed CPUID validation. 
2378 */ 2379 if (ret && !snp_page_reclaim(kvm, pfn) && 2380 sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && 2381 sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { 2382 void *src_vaddr = kmap_local_page(src_page); 2383 void *dst_vaddr = kmap_local_pfn(pfn); 2384 2385 memcpy(src_vaddr, dst_vaddr, PAGE_SIZE); 2386 set_page_dirty(src_page); 2387 2388 kunmap_local(dst_vaddr); 2389 kunmap_local(src_vaddr); 2390 } 2391 2392 out: 2393 if (ret) 2394 pr_debug("%s: error updating GFN %llx, return code %d (fw_error %d)\n", 2395 __func__, gfn, ret, sev_populate_args->fw_error); 2396 return ret; 2397 } 2398 2399 static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) 2400 { 2401 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2402 struct sev_gmem_populate_args sev_populate_args = {0}; 2403 struct kvm_sev_snp_launch_update params; 2404 struct kvm_memory_slot *memslot; 2405 long npages, count; 2406 void __user *src; 2407 2408 if (!sev_snp_guest(kvm) || !sev->snp_context) 2409 return -EINVAL; 2410 2411 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2412 return -EFAULT; 2413 2414 pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, 2415 params.gfn_start, params.len, params.type, params.flags); 2416 2417 if (!params.len || !PAGE_ALIGNED(params.len) || params.flags || 2418 (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && 2419 params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && 2420 params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && 2421 params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && 2422 params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) 2423 return -EINVAL; 2424 2425 src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? 
NULL : u64_to_user_ptr(params.uaddr); 2426 2427 if (!PAGE_ALIGNED(src)) 2428 return -EINVAL; 2429 2430 npages = params.len / PAGE_SIZE; 2431 2432 /* 2433 * For each GFN that's being prepared as part of the initial guest 2434 * state, the following pre-conditions are verified: 2435 * 2436 * 1) The backing memslot is a valid private memslot. 2437 * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES 2438 * beforehand. 2439 * 3) The PFN of the guest_memfd has not already been set to private 2440 * in the RMP table. 2441 * 2442 * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page 2443 * faults if there's a race between a fault and an attribute update via 2444 * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized 2445 * here. However, kvm->slots_lock guards against both this as well as 2446 * concurrent memslot updates occurring while these checks are being 2447 * performed, so use that here to make it easier to reason about the 2448 * initial expected state and better guard against unexpected 2449 * situations. 
2450 */ 2451 guard(mutex)(&kvm->slots_lock); 2452 2453 memslot = gfn_to_memslot(kvm, params.gfn_start); 2454 if (!kvm_slot_has_gmem(memslot)) 2455 return -EINVAL; 2456 2457 sev_populate_args.sev_fd = argp->sev_fd; 2458 sev_populate_args.type = params.type; 2459 2460 count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, 2461 params.type == KVM_SEV_SNP_PAGE_TYPE_CPUID, 2462 sev_gmem_post_populate, &sev_populate_args); 2463 if (count < 0) { 2464 argp->error = sev_populate_args.fw_error; 2465 pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", 2466 __func__, count, argp->error); 2467 return -EIO; 2468 } 2469 2470 params.gfn_start += count; 2471 params.len -= count * PAGE_SIZE; 2472 if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) 2473 params.uaddr += count * PAGE_SIZE; 2474 2475 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) 2476 return -EFAULT; 2477 2478 return 0; 2479 } 2480 2481 static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) 2482 { 2483 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2484 struct sev_data_snp_launch_update data = {}; 2485 struct kvm_vcpu *vcpu; 2486 unsigned long i; 2487 int ret; 2488 2489 if (kvm_is_vcpu_creation_in_progress(kvm)) 2490 return -EBUSY; 2491 2492 ret = kvm_lock_all_vcpus(kvm); 2493 if (ret) 2494 return ret; 2495 2496 data.gctx_paddr = __psp_pa(sev->snp_context); 2497 data.page_type = SNP_PAGE_TYPE_VMSA; 2498 2499 kvm_for_each_vcpu(i, vcpu, kvm) { 2500 struct vcpu_svm *svm = to_svm(vcpu); 2501 u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; 2502 2503 ret = sev_es_sync_vmsa(svm); 2504 if (ret) 2505 goto out; 2506 2507 /* Transition the VMSA page to a firmware state. 
*/ 2508 ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); 2509 if (ret) 2510 goto out; 2511 2512 /* Issue the SNP command to encrypt the VMSA */ 2513 data.address = __sme_pa(svm->sev_es.vmsa); 2514 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, 2515 &data, &argp->error); 2516 if (ret) { 2517 snp_page_reclaim(kvm, pfn); 2518 2519 goto out; 2520 } 2521 2522 svm->vcpu.arch.guest_state_protected = true; 2523 /* 2524 * SEV-ES (and thus SNP) guest mandates LBR Virtualization to 2525 * be _always_ ON. Enable it only after setting 2526 * guest_state_protected because KVM_SET_MSRS allows dynamic 2527 * toggling of LBRV (for performance reason) on write access to 2528 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. 2529 */ 2530 svm_enable_lbrv(vcpu); 2531 } 2532 2533 out: 2534 kvm_unlock_all_vcpus(kvm); 2535 return ret; 2536 } 2537 2538 static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 2539 { 2540 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2541 struct kvm_sev_snp_launch_finish params; 2542 struct sev_data_snp_launch_finish *data; 2543 void *id_block = NULL, *id_auth = NULL; 2544 int ret; 2545 2546 if (!sev_snp_guest(kvm)) 2547 return -ENOTTY; 2548 2549 if (!sev->snp_context) 2550 return -EINVAL; 2551 2552 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2553 return -EFAULT; 2554 2555 if (params.flags) 2556 return -EINVAL; 2557 2558 /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. 
*/ 2559 ret = snp_launch_update_vmsa(kvm, argp); 2560 if (ret) 2561 return ret; 2562 2563 data = kzalloc_obj(*data, GFP_KERNEL_ACCOUNT); 2564 if (!data) 2565 return -ENOMEM; 2566 2567 if (params.id_block_en) { 2568 id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); 2569 if (IS_ERR(id_block)) { 2570 ret = PTR_ERR(id_block); 2571 goto e_free; 2572 } 2573 2574 data->id_block_en = 1; 2575 data->id_block_paddr = __sme_pa(id_block); 2576 2577 id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); 2578 if (IS_ERR(id_auth)) { 2579 ret = PTR_ERR(id_auth); 2580 goto e_free_id_block; 2581 } 2582 2583 data->id_auth_paddr = __sme_pa(id_auth); 2584 2585 if (params.auth_key_en) 2586 data->auth_key_en = 1; 2587 } 2588 2589 data->vcek_disabled = params.vcek_disabled; 2590 2591 memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE); 2592 data->gctx_paddr = __psp_pa(sev->snp_context); 2593 ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); 2594 2595 /* 2596 * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages 2597 * can be given to the guest simply by marking the RMP entry as private. 2598 * This can happen on first access and also with KVM_PRE_FAULT_MEMORY. 
2599 */ 2600 if (!ret) 2601 kvm->arch.pre_fault_allowed = true; 2602 2603 kfree(id_auth); 2604 2605 e_free_id_block: 2606 kfree(id_block); 2607 2608 e_free: 2609 kfree(data); 2610 2611 return ret; 2612 } 2613 2614 static int snp_enable_certs(struct kvm *kvm) 2615 { 2616 if (kvm->created_vcpus || !sev_snp_guest(kvm)) 2617 return -EINVAL; 2618 2619 to_kvm_sev_info(kvm)->snp_certs_enabled = true; 2620 2621 return 0; 2622 } 2623 2624 int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) 2625 { 2626 struct kvm_sev_cmd sev_cmd; 2627 int r; 2628 2629 if (!sev_enabled) 2630 return -ENOTTY; 2631 2632 if (!argp) 2633 return 0; 2634 2635 if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd))) 2636 return -EFAULT; 2637 2638 guard(mutex)(&kvm->lock); 2639 2640 /* Only the enc_context_owner handles some memory enc operations. */ 2641 if (is_mirroring_enc_context(kvm) && 2642 !is_cmd_allowed_from_mirror(sev_cmd.id)) 2643 return -EINVAL; 2644 2645 /* 2646 * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only 2647 * allow the use of SNP-specific commands. 
2648 */ 2649 if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) 2650 return -EPERM; 2651 2652 switch (sev_cmd.id) { 2653 case KVM_SEV_ES_INIT: 2654 if (!sev_es_enabled) 2655 return -ENOTTY; 2656 fallthrough; 2657 case KVM_SEV_INIT: 2658 r = sev_guest_init(kvm, &sev_cmd); 2659 break; 2660 case KVM_SEV_INIT2: 2661 r = sev_guest_init2(kvm, &sev_cmd); 2662 break; 2663 case KVM_SEV_LAUNCH_START: 2664 r = sev_launch_start(kvm, &sev_cmd); 2665 break; 2666 case KVM_SEV_LAUNCH_UPDATE_DATA: 2667 r = sev_launch_update_data(kvm, &sev_cmd); 2668 break; 2669 case KVM_SEV_LAUNCH_UPDATE_VMSA: 2670 r = sev_launch_update_vmsa(kvm, &sev_cmd); 2671 break; 2672 case KVM_SEV_LAUNCH_MEASURE: 2673 r = sev_launch_measure(kvm, &sev_cmd); 2674 break; 2675 case KVM_SEV_LAUNCH_FINISH: 2676 r = sev_launch_finish(kvm, &sev_cmd); 2677 break; 2678 case KVM_SEV_GUEST_STATUS: 2679 r = sev_guest_status(kvm, &sev_cmd); 2680 break; 2681 case KVM_SEV_DBG_DECRYPT: 2682 case KVM_SEV_DBG_ENCRYPT: 2683 r = sev_dbg_crypt(kvm, &sev_cmd, sev_cmd.id); 2684 break; 2685 case KVM_SEV_LAUNCH_SECRET: 2686 r = sev_launch_secret(kvm, &sev_cmd); 2687 break; 2688 case KVM_SEV_GET_ATTESTATION_REPORT: 2689 r = sev_get_attestation_report(kvm, &sev_cmd); 2690 break; 2691 case KVM_SEV_SEND_START: 2692 r = sev_send_start(kvm, &sev_cmd); 2693 break; 2694 case KVM_SEV_SEND_UPDATE_DATA: 2695 r = sev_send_update_data(kvm, &sev_cmd); 2696 break; 2697 case KVM_SEV_SEND_FINISH: 2698 r = sev_send_finish(kvm, &sev_cmd); 2699 break; 2700 case KVM_SEV_SEND_CANCEL: 2701 r = sev_send_cancel(kvm, &sev_cmd); 2702 break; 2703 case KVM_SEV_RECEIVE_START: 2704 r = sev_receive_start(kvm, &sev_cmd); 2705 break; 2706 case KVM_SEV_RECEIVE_UPDATE_DATA: 2707 r = sev_receive_update_data(kvm, &sev_cmd); 2708 break; 2709 case KVM_SEV_RECEIVE_FINISH: 2710 r = sev_receive_finish(kvm, &sev_cmd); 2711 break; 2712 case KVM_SEV_SNP_LAUNCH_START: 2713 r = snp_launch_start(kvm, &sev_cmd); 2714 break; 2715 case KVM_SEV_SNP_LAUNCH_UPDATE: 2716 r = 
snp_launch_update(kvm, &sev_cmd); 2717 break; 2718 case KVM_SEV_SNP_LAUNCH_FINISH: 2719 r = snp_launch_finish(kvm, &sev_cmd); 2720 break; 2721 case KVM_SEV_SNP_ENABLE_REQ_CERTS: 2722 r = snp_enable_certs(kvm); 2723 break; 2724 default: 2725 return -EINVAL; 2726 } 2727 2728 if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd))) 2729 r = -EFAULT; 2730 2731 return r; 2732 } 2733 2734 int sev_mem_enc_register_region(struct kvm *kvm, 2735 struct kvm_enc_region *range) 2736 { 2737 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2738 struct enc_region *region; 2739 int ret = 0; 2740 2741 guard(mutex)(&kvm->lock); 2742 2743 if (!sev_guest(kvm)) 2744 return -ENOTTY; 2745 2746 /* If kvm is mirroring encryption context it isn't responsible for it */ 2747 if (is_mirroring_enc_context(kvm)) 2748 return -EINVAL; 2749 2750 region = kzalloc_obj(*region, GFP_KERNEL_ACCOUNT); 2751 if (!region) 2752 return -ENOMEM; 2753 2754 region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 2755 FOLL_WRITE | FOLL_LONGTERM); 2756 if (IS_ERR(region->pages)) { 2757 ret = PTR_ERR(region->pages); 2758 goto e_free; 2759 } 2760 2761 /* 2762 * The guest may change the memory encryption attribute from C=0 -> C=1 2763 * or vice versa for this memory range. Lets make sure caches are 2764 * flushed to ensure that guest data gets written into memory with 2765 * correct C-bit. Note, this must be done before dropping kvm->lock, 2766 * as region and its array of pages can be freed by a different task 2767 * once kvm->lock is released. 
2768 */ 2769 sev_clflush_pages(region->pages, region->npages); 2770 2771 region->uaddr = range->addr; 2772 region->size = range->size; 2773 2774 list_add_tail(®ion->list, &sev->regions_list); 2775 return ret; 2776 2777 e_free: 2778 kfree(region); 2779 return ret; 2780 } 2781 2782 static struct enc_region * 2783 find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) 2784 { 2785 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2786 struct list_head *head = &sev->regions_list; 2787 struct enc_region *i; 2788 2789 list_for_each_entry(i, head, list) { 2790 if (i->uaddr == range->addr && 2791 i->size == range->size) 2792 return i; 2793 } 2794 2795 return NULL; 2796 } 2797 2798 static void __unregister_enc_region_locked(struct kvm *kvm, 2799 struct enc_region *region) 2800 { 2801 sev_unpin_memory(kvm, region->pages, region->npages); 2802 list_del(®ion->list); 2803 kfree(region); 2804 } 2805 2806 int sev_mem_enc_unregister_region(struct kvm *kvm, 2807 struct kvm_enc_region *range) 2808 { 2809 struct enc_region *region; 2810 2811 /* If kvm is mirroring encryption context it isn't responsible for it */ 2812 if (is_mirroring_enc_context(kvm)) 2813 return -EINVAL; 2814 2815 guard(mutex)(&kvm->lock); 2816 2817 if (!sev_guest(kvm)) 2818 return -ENOTTY; 2819 2820 region = find_enc_region(kvm, range); 2821 if (!region) 2822 return -EINVAL; 2823 2824 sev_writeback_caches(kvm); 2825 2826 __unregister_enc_region_locked(kvm, region); 2827 2828 return 0; 2829 } 2830 2831 int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) 2832 { 2833 CLASS(fd, f)(source_fd); 2834 struct kvm *source_kvm; 2835 struct kvm_sev_info *source_sev, *mirror_sev; 2836 int ret; 2837 2838 if (fd_empty(f)) 2839 return -EBADF; 2840 2841 if (!file_is_kvm(fd_file(f))) 2842 return -EBADF; 2843 2844 source_kvm = fd_file(f)->private_data; 2845 ret = sev_lock_two_vms(kvm, source_kvm); 2846 if (ret) 2847 return ret; 2848 2849 /* 2850 * Mirrors of mirrors should work, but let's not get silly. 
Also 2851 * disallow out-of-band SEV/SEV-ES init if the target is already an 2852 * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being 2853 * created after SEV/SEV-ES initialization, e.g. to init intercepts. 2854 */ 2855 if (sev_guest(kvm) || !sev_guest(source_kvm) || 2856 is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) { 2857 ret = -EINVAL; 2858 goto e_unlock; 2859 } 2860 2861 mirror_sev = to_kvm_sev_info(kvm); 2862 if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 2863 ret = -ENOMEM; 2864 goto e_unlock; 2865 } 2866 2867 /* 2868 * The mirror kvm holds an enc_context_owner ref so its asid can't 2869 * disappear until we're done with it 2870 */ 2871 source_sev = to_kvm_sev_info(source_kvm); 2872 kvm_get_kvm(source_kvm); 2873 list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); 2874 2875 /* Set enc_context_owner and copy its encryption context over */ 2876 mirror_sev->enc_context_owner = source_kvm; 2877 mirror_sev->active = true; 2878 mirror_sev->asid = source_sev->asid; 2879 mirror_sev->fd = source_sev->fd; 2880 mirror_sev->es_active = source_sev->es_active; 2881 mirror_sev->need_init = false; 2882 mirror_sev->handle = source_sev->handle; 2883 INIT_LIST_HEAD(&mirror_sev->regions_list); 2884 INIT_LIST_HEAD(&mirror_sev->mirror_vms); 2885 ret = 0; 2886 2887 /* 2888 * Do not copy ap_jump_table. Since the mirror does not share the same 2889 * KVM contexts as the original, and they may have different 2890 * memory-views. 
2891 */ 2892 2893 e_unlock: 2894 sev_unlock_two_vms(kvm, source_kvm); 2895 return ret; 2896 } 2897 2898 static int snp_decommission_context(struct kvm *kvm) 2899 { 2900 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2901 struct sev_data_snp_addr data = {}; 2902 int ret; 2903 2904 /* If context is not created then do nothing */ 2905 if (!sev->snp_context) 2906 return 0; 2907 2908 /* Do the decommision, which will unbind the ASID from the SNP context */ 2909 data.address = __sme_pa(sev->snp_context); 2910 down_write(&sev_deactivate_lock); 2911 ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); 2912 up_write(&sev_deactivate_lock); 2913 2914 if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) 2915 return ret; 2916 2917 snp_free_firmware_page(sev->snp_context); 2918 sev->snp_context = NULL; 2919 2920 return 0; 2921 } 2922 2923 void sev_vm_init(struct kvm *kvm) 2924 { 2925 switch (kvm->arch.vm_type) { 2926 case KVM_X86_DEFAULT_VM: 2927 case KVM_X86_SW_PROTECTED_VM: 2928 break; 2929 case KVM_X86_SNP_VM: 2930 kvm->arch.has_private_mem = true; 2931 fallthrough; 2932 case KVM_X86_SEV_ES_VM: 2933 kvm->arch.has_protected_state = true; 2934 fallthrough; 2935 case KVM_X86_SEV_VM: 2936 kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; 2937 to_kvm_sev_info(kvm)->need_init = true; 2938 break; 2939 default: 2940 WARN_ONCE(1, "Unsupported VM type %u", kvm->arch.vm_type); 2941 break; 2942 } 2943 } 2944 2945 void sev_vm_destroy(struct kvm *kvm) 2946 { 2947 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2948 struct list_head *head = &sev->regions_list; 2949 struct list_head *pos, *q; 2950 2951 if (!sev_guest(kvm)) 2952 return; 2953 2954 WARN_ON(!list_empty(&sev->mirror_vms)); 2955 2956 free_cpumask_var(sev->have_run_cpus); 2957 2958 /* 2959 * If this is a mirror VM, remove it from the owner's list of a mirrors 2960 * and skip ASID cleanup (the ASID is tied to the lifetime of the owner). 
2961 * Note, mirror VMs don't support registering encrypted regions. 2962 */ 2963 if (is_mirroring_enc_context(kvm)) { 2964 struct kvm *owner_kvm = sev->enc_context_owner; 2965 2966 mutex_lock(&owner_kvm->lock); 2967 list_del(&sev->mirror_entry); 2968 mutex_unlock(&owner_kvm->lock); 2969 kvm_put_kvm(owner_kvm); 2970 return; 2971 } 2972 2973 2974 /* 2975 * if userspace was terminated before unregistering the memory regions 2976 * then lets unpin all the registered memory. 2977 */ 2978 if (!list_empty(head)) { 2979 list_for_each_safe(pos, q, head) { 2980 __unregister_enc_region_locked(kvm, 2981 list_entry(pos, struct enc_region, list)); 2982 cond_resched(); 2983 } 2984 } 2985 2986 if (sev_snp_guest(kvm)) { 2987 snp_guest_req_cleanup(kvm); 2988 2989 /* 2990 * Decomission handles unbinding of the ASID. If it fails for 2991 * some unexpected reason, just leak the ASID. 2992 */ 2993 if (snp_decommission_context(kvm)) 2994 return; 2995 } else { 2996 sev_unbind_asid(kvm, sev->handle); 2997 } 2998 2999 sev_asid_free(sev); 3000 } 3001 3002 void __init sev_set_cpu_caps(void) 3003 { 3004 if (sev_enabled) 3005 kvm_cpu_cap_set(X86_FEATURE_SEV); 3006 3007 if (sev_es_enabled) 3008 kvm_cpu_cap_set(X86_FEATURE_SEV_ES); 3009 3010 if (sev_snp_enabled) 3011 kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); 3012 } 3013 3014 static bool is_sev_snp_initialized(void) 3015 { 3016 struct sev_user_data_snp_status *status; 3017 struct sev_data_snp_addr buf; 3018 bool initialized = false; 3019 int ret, error = 0; 3020 3021 status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); 3022 if (!status) 3023 return false; 3024 3025 buf.address = __psp_pa(status); 3026 ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); 3027 if (ret) { 3028 pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", 3029 ret, error, error); 3030 goto out; 3031 } 3032 3033 initialized = !!status->state; 3034 3035 out: 3036 snp_free_firmware_page(status); 3037 3038 return initialized; 3039 } 3040 3041 static 
const char * __init sev_str_feature_state(bool is_supported, bool is_usable) 3042 { 3043 return is_supported ? is_usable ? "enabled" : "unusable" : "disabled"; 3044 } 3045 3046 void __init sev_hardware_setup(void) 3047 { 3048 unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; 3049 struct sev_platform_init_args init_args = {0}; 3050 bool sev_snp_supported = false; 3051 bool sev_es_supported = false; 3052 bool sev_supported = false; 3053 u32 vm_types = 0; 3054 3055 if (!sev_enabled || !npt_enabled || !nrips) 3056 goto out; 3057 3058 /* 3059 * SEV must obviously be supported in hardware. Sanity check that the 3060 * CPU supports decode assists, which is mandatory for SEV guests to 3061 * support instruction emulation. Ditto for flushing by ASID, as SEV 3062 * guests are bound to a single ASID, i.e. KVM can't rotate to a new 3063 * ASID to effect a TLB flush. 3064 */ 3065 if (!boot_cpu_has(X86_FEATURE_SEV) || 3066 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) || 3067 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) 3068 goto out; 3069 3070 /* 3071 * The kernel's initcall infrastructure lacks the ability to express 3072 * dependencies between initcalls, whereas the modules infrastructure 3073 * automatically handles dependencies via symbol loading. Ensure the 3074 * PSP SEV driver is initialized before proceeding if KVM is built-in, 3075 * as the dependency isn't handled by the initcall infrastructure. 
3076 */ 3077 if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) 3078 goto out; 3079 3080 /* Retrieve SEV CPUID information */ 3081 cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); 3082 3083 /* Set encryption bit location for SEV-ES guests */ 3084 sev_enc_bit = ebx & 0x3f; 3085 3086 /* Maximum number of encrypted guests supported simultaneously */ 3087 max_sev_asid = ecx; 3088 if (!max_sev_asid) 3089 goto out; 3090 3091 /* Minimum ASID value that should be used for SEV guest */ 3092 min_sev_asid = edx; 3093 sev_me_mask = 1UL << (ebx & 0x3f); 3094 3095 /* 3096 * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap, 3097 * even though it's never used, so that the bitmap is indexed by the 3098 * actual ASID. 3099 */ 3100 nr_asids = max_sev_asid + 1; 3101 sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); 3102 if (!sev_asid_bitmap) 3103 goto out; 3104 3105 sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); 3106 if (!sev_reclaim_asid_bitmap) { 3107 bitmap_free(sev_asid_bitmap); 3108 sev_asid_bitmap = NULL; 3109 goto out; 3110 } 3111 3112 if (min_sev_asid <= max_sev_asid) { 3113 sev_asid_count = max_sev_asid - min_sev_asid + 1; 3114 WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); 3115 } 3116 sev_supported = true; 3117 3118 /* SEV-ES support requested? */ 3119 if (!sev_es_enabled) 3120 goto out; 3121 3122 /* 3123 * SEV-ES requires MMIO caching as KVM doesn't have access to the guest 3124 * instruction stream, i.e. can't emulate in response to a #NPF and 3125 * instead relies on #NPF(RSVD) being reflected into the guest as #VC 3126 * (the guest can then do a #VMGEXIT to request MMIO emulation). 3127 */ 3128 if (!enable_mmio_caching) 3129 goto out; 3130 3131 /* Does the CPU support SEV-ES? 
*/ 3132 if (!boot_cpu_has(X86_FEATURE_SEV_ES)) 3133 goto out; 3134 3135 if (!lbrv) { 3136 WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), 3137 "LBRV must be present for SEV-ES support"); 3138 goto out; 3139 } 3140 3141 /* Has the system been allocated ASIDs for SEV-ES? */ 3142 if (min_sev_asid == 1) 3143 goto out; 3144 3145 min_sev_es_asid = min_snp_asid = 1; 3146 max_sev_es_asid = max_snp_asid = min_sev_asid - 1; 3147 3148 sev_es_asid_count = min_sev_asid - 1; 3149 WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); 3150 sev_es_supported = true; 3151 sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); 3152 3153 out: 3154 if (sev_enabled) { 3155 init_args.probe = true; 3156 3157 if (sev_is_snp_ciphertext_hiding_supported()) 3158 init_args.max_snp_asid = min(nr_ciphertext_hiding_asids, 3159 min_sev_asid - 1); 3160 3161 if (sev_platform_init(&init_args)) 3162 sev_supported = sev_es_supported = sev_snp_supported = false; 3163 else if (sev_snp_supported) 3164 sev_snp_supported = is_sev_snp_initialized(); 3165 3166 if (sev_snp_supported) { 3167 snp_supported_policy_bits = sev_get_snp_policy_bits() & 3168 KVM_SNP_POLICY_MASK_VALID; 3169 nr_ciphertext_hiding_asids = init_args.max_snp_asid; 3170 } 3171 3172 /* 3173 * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP 3174 * ASID range is partitioned into separate SEV-ES and SEV-SNP 3175 * ASID ranges, with the SEV-SNP range being [1..max_snp_asid] 3176 * and the SEV-ES range being (max_snp_asid..max_sev_es_asid]. 3177 * Note, SEV-ES may effectively be disabled if all ASIDs from 3178 * the joint range are assigned to SEV-SNP. 
3179 */ 3180 if (nr_ciphertext_hiding_asids) { 3181 max_snp_asid = nr_ciphertext_hiding_asids; 3182 min_sev_es_asid = max_snp_asid + 1; 3183 pr_info("SEV-SNP ciphertext hiding enabled\n"); 3184 } 3185 } 3186 3187 if (sev_supported && min_sev_asid <= max_sev_asid) 3188 vm_types |= BIT(KVM_X86_SEV_VM); 3189 if (sev_es_supported && min_sev_es_asid <= max_sev_es_asid) 3190 vm_types |= BIT(KVM_X86_SEV_ES_VM); 3191 if (sev_snp_supported) 3192 vm_types |= BIT(KVM_X86_SNP_VM); 3193 vm_types &= sev_firmware_supported_vm_types(); 3194 3195 kvm_caps.supported_vm_types |= vm_types; 3196 3197 if (boot_cpu_has(X86_FEATURE_SEV)) 3198 pr_info("SEV %s (ASIDs %u - %u)\n", 3199 sev_str_feature_state(sev_supported, vm_types & BIT(KVM_X86_SEV_VM)), 3200 min_sev_asid, max_sev_asid); 3201 if (boot_cpu_has(X86_FEATURE_SEV_ES)) 3202 pr_info("SEV-ES %s (ASIDs %u - %u)\n", 3203 sev_str_feature_state(sev_es_supported, vm_types & BIT(KVM_X86_SEV_ES_VM)), 3204 min_sev_es_asid, max_sev_es_asid); 3205 if (boot_cpu_has(X86_FEATURE_SEV_SNP)) 3206 pr_info("SEV-SNP %s (ASIDs %u - %u)\n", 3207 sev_str_feature_state(sev_snp_supported, vm_types & BIT(KVM_X86_SNP_VM)), 3208 min_snp_asid, max_snp_asid); 3209 3210 sev_enabled = sev_supported; 3211 sev_es_enabled = sev_es_supported; 3212 sev_snp_enabled = sev_snp_supported; 3213 3214 sev_supported_vmsa_features = 0; 3215 3216 if (sev_es_enabled && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) && 3217 cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) 3218 sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; 3219 3220 if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC)) 3221 sev_supported_vmsa_features |= SVM_SEV_FEAT_SECURE_TSC; 3222 } 3223 3224 void sev_hardware_unsetup(void) 3225 { 3226 if (!sev_enabled) 3227 return; 3228 3229 /* No need to take sev_bitmap_lock, all VMs have been destroyed. 
*/ 3230 sev_flush_asids(1, max_sev_asid); 3231 3232 bitmap_free(sev_asid_bitmap); 3233 bitmap_free(sev_reclaim_asid_bitmap); 3234 3235 misc_cg_set_capacity(MISC_CG_RES_SEV, 0); 3236 misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); 3237 3238 sev_platform_shutdown(); 3239 } 3240 3241 int sev_cpu_init(struct svm_cpu_data *sd) 3242 { 3243 if (!sev_enabled) 3244 return 0; 3245 3246 sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL); 3247 if (!sd->sev_vmcbs) 3248 return -ENOMEM; 3249 3250 return 0; 3251 } 3252 3253 /* 3254 * Pages used by hardware to hold guest encrypted state must be flushed before 3255 * returning them to the system. 3256 */ 3257 static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) 3258 { 3259 unsigned int asid = sev_get_asid(vcpu->kvm); 3260 3261 /* 3262 * Note! The address must be a kernel address, as regular page walk 3263 * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user 3264 * address is non-deterministic and unsafe. This function deliberately 3265 * takes a pointer to deter passing in a user address. 3266 */ 3267 unsigned long addr = (unsigned long)va; 3268 3269 /* 3270 * If CPU enforced cache coherency for encrypted mappings of the 3271 * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache 3272 * flush is still needed in order to work properly with DMA devices. 3273 */ 3274 if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) { 3275 clflush_cache_range(va, PAGE_SIZE); 3276 return; 3277 } 3278 3279 /* 3280 * VM Page Flush takes a host virtual address and a guest ASID. Fall 3281 * back to full writeback of caches if this faults so as not to make 3282 * any problems worse by leaving stale encrypted data in the cache. 
/*
 * Write back caches for @kvm, unless the VM is not an SEV guest at all or is
 * an SNP guest.
 */
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
	/*
	 * With SNP+gmem, private/encrypted memory is unreachable via the
	 * hva-based mmu notifiers, i.e. these events are explicitly scoped to
	 * shared pages, where there's no need to flush caches.
	 *
	 * Checking for SEV+ outside of kvm->lock is safe: __sev_guest_init()
	 * can only be done before vCPUs are created, caches can be incoherent
	 * if and only if a vCPU was run, and either this task will see the VM
	 * as being SEV+ or the vCPU won't be able to access the memory
	 * (because of the in-progress invalidation).
	 */
	if (____sev_guest(kvm) && !____sev_snp_guest(kvm))
		sev_writeback_caches(kvm);
}
3330 */ 3331 pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); 3332 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", 3333 control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm)); 3334 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", 3335 control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); 3336 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", 3337 control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); 3338 pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", 3339 svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); 3340 pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); 3341 } 3342 3343 static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) 3344 { 3345 struct kvm_vcpu *vcpu = &svm->vcpu; 3346 struct ghcb *ghcb = svm->sev_es.ghcb; 3347 3348 /* 3349 * The GHCB protocol so far allows for the following data 3350 * to be returned: 3351 * GPRs RAX, RBX, RCX, RDX 3352 * 3353 * Copy their values, even if they may not have been written during the 3354 * VM-Exit. It's the guest's responsibility to not consume random data. 3355 */ 3356 ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]); 3357 ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]); 3358 ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]); 3359 ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]); 3360 } 3361 3362 static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) 3363 { 3364 struct vmcb_control_area *control = &svm->vmcb->control; 3365 struct kvm_vcpu *vcpu = &svm->vcpu; 3366 struct ghcb *ghcb = svm->sev_es.ghcb; 3367 3368 /* 3369 * The GHCB protocol so far allows for the following data 3370 * to be supplied: 3371 * GPRs RAX, RBX, RCX, RDX 3372 * XCR0 3373 * CPL 3374 * 3375 * VMMCALL allows the guest to provide extra registers. KVM also 3376 * expects RSI for hypercalls, so include that, too. 3377 * 3378 * Copy their values to the appropriate location if supplied. 
3379 */ 3380 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 3381 3382 BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap)); 3383 memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap)); 3384 3385 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm); 3386 vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm); 3387 vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm); 3388 vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm); 3389 vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm); 3390 3391 svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm); 3392 3393 if (kvm_ghcb_xcr0_is_valid(svm)) 3394 __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm)); 3395 3396 if (kvm_ghcb_xss_is_valid(svm)) 3397 __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); 3398 3399 /* Copy the GHCB exit information into the VMCB fields */ 3400 control->exit_code = kvm_ghcb_get_sw_exit_code(svm); 3401 control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); 3402 control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); 3403 svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm); 3404 3405 /* Clear the valid entries fields */ 3406 memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); 3407 } 3408 3409 static bool sev_es_are_required_ghcb_fields_valid(struct vcpu_svm *svm) 3410 { 3411 struct vmcb_control_area *control = &svm->vmcb->control; 3412 struct kvm_vcpu *vcpu = &svm->vcpu; 3413 3414 if (!kvm_ghcb_sw_exit_code_is_valid(svm) || 3415 !kvm_ghcb_sw_exit_info_1_is_valid(svm) || 3416 !kvm_ghcb_sw_exit_info_2_is_valid(svm)) 3417 return false; 3418 3419 switch (control->exit_code) { 3420 case SVM_EXIT_WRITE_DR7: 3421 return kvm_ghcb_rax_is_valid(svm); 3422 case SVM_EXIT_RDPMC: 3423 return kvm_ghcb_rcx_is_valid(svm); 3424 case SVM_EXIT_CPUID: 3425 if (!kvm_ghcb_rax_is_valid(svm) || 3426 !kvm_ghcb_rcx_is_valid(svm)) 3427 return false; 3428 3429 
return vcpu->arch.regs[VCPU_REGS_RAX] != 0xd || 3430 kvm_ghcb_xcr0_is_valid(svm); 3431 case SVM_EXIT_IOIO: 3432 if (control->exit_info_1 & SVM_IOIO_STR_MASK) 3433 return kvm_ghcb_sw_scratch_is_valid(svm); 3434 3435 if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK)) 3436 return kvm_ghcb_rax_is_valid(svm); 3437 3438 return true; 3439 case SVM_EXIT_MSR: 3440 if (!kvm_ghcb_rcx_is_valid(svm)) 3441 return false; 3442 3443 return !control->exit_info_1 || 3444 (kvm_ghcb_rax_is_valid(svm) && kvm_ghcb_rdx_is_valid(svm)); 3445 case SVM_EXIT_VMMCALL: 3446 return kvm_ghcb_rax_is_valid(svm) && kvm_ghcb_cpl_is_valid(svm); 3447 case SVM_EXIT_MONITOR: 3448 return kvm_ghcb_rax_is_valid(svm) && 3449 kvm_ghcb_rcx_is_valid(svm) && 3450 kvm_ghcb_rdx_is_valid(svm); 3451 case SVM_EXIT_MWAIT: 3452 return kvm_ghcb_rax_is_valid(svm) && kvm_ghcb_rcx_is_valid(svm); 3453 case SVM_VMGEXIT_AP_CREATION: 3454 return kvm_ghcb_rax_is_valid(svm) || 3455 lower_32_bits(control->exit_info_1) == SVM_VMGEXIT_AP_DESTROY; 3456 break; 3457 case SVM_VMGEXIT_MMIO_READ: 3458 case SVM_VMGEXIT_MMIO_WRITE: 3459 case SVM_VMGEXIT_PSC: 3460 return kvm_ghcb_sw_scratch_is_valid(svm); 3461 default: 3462 return true; 3463 } 3464 } 3465 3466 static void __sev_es_unmap_ghcb(struct vcpu_svm *svm) 3467 { 3468 if (svm->sev_es.ghcb_sa_free) { 3469 kvfree(svm->sev_es.ghcb_sa); 3470 svm->sev_es.ghcb_sa = NULL; 3471 svm->sev_es.ghcb_sa_free = false; 3472 } 3473 3474 if (svm->sev_es.ghcb) { 3475 kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); 3476 svm->sev_es.ghcb = NULL; 3477 } 3478 } 3479 3480 void sev_es_unmap_ghcb(struct vcpu_svm *svm) 3481 { 3482 /* Clear any indication that the vCPU is in a type of AP Reset Hold */ 3483 svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; 3484 3485 if (!svm->sev_es.ghcb) 3486 return; 3487 3488 /* 3489 * If the scratch area lives outside the GHCB, there's a buffer that, 3490 * depending on the operation performed, may need to be synced. 
3491 */ 3492 if (svm->sev_es.ghcb_sa_sync) { 3493 kvm_write_guest(svm->vcpu.kvm, svm->sev_es.sw_scratch, 3494 svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); 3495 svm->sev_es.ghcb_sa_sync = false; 3496 } 3497 3498 trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); 3499 3500 sev_es_sync_to_ghcb(svm); 3501 3502 __sev_es_unmap_ghcb(svm); 3503 } 3504 3505 void sev_free_vcpu(struct kvm_vcpu *vcpu) 3506 { 3507 struct vcpu_svm *svm; 3508 3509 if (!is_sev_es_guest(vcpu)) 3510 return; 3511 3512 svm = to_svm(vcpu); 3513 3514 /* 3515 * If it's an SNP guest, then the VMSA was marked in the RMP table as 3516 * a guest-owned page. Transition the page to hypervisor state before 3517 * releasing it back to the system. 3518 */ 3519 if (is_sev_snp_guest(vcpu)) { 3520 u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; 3521 3522 if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) 3523 goto skip_vmsa_free; 3524 } 3525 3526 if (vcpu->arch.guest_state_protected) 3527 sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); 3528 3529 __free_page(virt_to_page(svm->sev_es.vmsa)); 3530 3531 skip_vmsa_free: 3532 __sev_es_unmap_ghcb(svm); 3533 } 3534 3535 int pre_sev_run(struct vcpu_svm *svm, int cpu) 3536 { 3537 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 3538 struct kvm_vcpu *vcpu = &svm->vcpu; 3539 struct kvm *kvm = vcpu->kvm; 3540 unsigned int asid = sev_get_asid(kvm); 3541 3542 /* 3543 * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid 3544 * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP 3545 * AP Destroy event. 3546 */ 3547 if (is_sev_es_guest(vcpu) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) 3548 return -EINVAL; 3549 3550 /* 3551 * To optimize cache flushes when memory is reclaimed from an SEV VM, 3552 * track physical CPUs that enter the guest for SEV VMs and thus can 3553 * have encrypted, dirty data in the cache, and flush caches only for 3554 * CPUs that have entered the guest. 
3555 */ 3556 if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus)) 3557 cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus); 3558 3559 /* Assign the asid allocated with this SEV guest */ 3560 svm->asid = asid; 3561 3562 /* 3563 * Flush guest TLB: 3564 * 3565 * 1) when different VMCB for the same ASID is to be run on the same host CPU. 3566 * 2) or this VMCB was executed on different host CPU in previous VMRUNs. 3567 */ 3568 if (sd->sev_vmcbs[asid] == svm->vmcb && 3569 svm->vcpu.arch.last_vmentry_cpu == cpu) 3570 return 0; 3571 3572 sd->sev_vmcbs[asid] = svm->vmcb; 3573 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 3574 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 3575 return 0; 3576 } 3577 3578 #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) 3579 static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 min_len) 3580 { 3581 struct vmcb_control_area *control = &svm->vmcb->control; 3582 u64 ghcb_scratch_beg, ghcb_scratch_end; 3583 u64 scratch_gpa_beg, scratch_gpa_end; 3584 void *scratch_va; 3585 3586 if (WARN_ON_ONCE(!min_len)) 3587 goto e_scratch; 3588 3589 scratch_gpa_beg = svm->sev_es.sw_scratch; 3590 if (!scratch_gpa_beg) { 3591 pr_err("vmgexit: scratch gpa not provided\n"); 3592 goto e_scratch; 3593 } 3594 3595 scratch_gpa_end = scratch_gpa_beg + min_len; 3596 if (scratch_gpa_end < scratch_gpa_beg) { 3597 pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", 3598 min_len, scratch_gpa_beg); 3599 goto e_scratch; 3600 } 3601 3602 WARN_ON_ONCE(svm->sev_es.ghcb_sa_sync || svm->sev_es.ghcb_sa_free); 3603 3604 if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { 3605 /* Scratch area begins within GHCB */ 3606 ghcb_scratch_beg = control->ghcb_gpa + 3607 offsetof(struct ghcb, shared_buffer); 3608 ghcb_scratch_end = control->ghcb_gpa + 3609 offsetof(struct ghcb, reserved_0xff0); 3610 3611 /* 3612 * If the scratch area begins within the GHCB, it must be 3613 * completely contained in the GHCB shared buffer 
area. 3614 */ 3615 if (scratch_gpa_beg < ghcb_scratch_beg || 3616 scratch_gpa_end > ghcb_scratch_end) { 3617 pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n", 3618 scratch_gpa_beg, scratch_gpa_end); 3619 goto e_scratch; 3620 } 3621 3622 scratch_va = (void *)svm->sev_es.ghcb; 3623 scratch_va += (scratch_gpa_beg - control->ghcb_gpa); 3624 3625 svm->sev_es.ghcb_sa_sync = false; 3626 svm->sev_es.ghcb_sa_free = false; 3627 svm->sev_es.ghcb_sa_len = ghcb_scratch_end - scratch_gpa_beg; 3628 } else { 3629 /* GHCB v2 requires the scratch area to be within the GHCB. */ 3630 if (to_kvm_sev_info(svm->vcpu.kvm)->ghcb_version >= 2) 3631 goto e_scratch; 3632 3633 /* 3634 * The guest memory must be read into a kernel buffer, so 3635 * limit the size 3636 */ 3637 if (min_len > GHCB_SCRATCH_AREA_LIMIT) { 3638 pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", 3639 min_len, GHCB_SCRATCH_AREA_LIMIT); 3640 goto e_scratch; 3641 } 3642 scratch_va = kvzalloc(min_len, GFP_KERNEL_ACCOUNT); 3643 if (!scratch_va) 3644 return -ENOMEM; 3645 3646 if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, min_len)) { 3647 /* Unable to copy scratch area from guest */ 3648 pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); 3649 3650 kvfree(scratch_va); 3651 return -EFAULT; 3652 } 3653 3654 /* 3655 * The scratch area is outside the GHCB. The operation will 3656 * dictate whether the buffer needs to be synced before running 3657 * the vCPU next time (i.e. a read was requested so the data 3658 * must be written back to the guest memory). 
3659 */ 3660 svm->sev_es.ghcb_sa_sync = sync; 3661 svm->sev_es.ghcb_sa_free = true; 3662 svm->sev_es.ghcb_sa_len = min_len; 3663 } 3664 3665 svm->sev_es.ghcb_sa = scratch_va; 3666 return 0; 3667 3668 e_scratch: 3669 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA); 3670 3671 return 1; 3672 } 3673 3674 static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask, 3675 unsigned int pos) 3676 { 3677 svm->vmcb->control.ghcb_gpa &= ~(mask << pos); 3678 svm->vmcb->control.ghcb_gpa |= (value & mask) << pos; 3679 } 3680 3681 static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos) 3682 { 3683 return (svm->vmcb->control.ghcb_gpa >> pos) & mask; 3684 } 3685 3686 static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) 3687 { 3688 svm->vmcb->control.ghcb_gpa = value; 3689 } 3690 3691 static int snp_rmptable_psmash(kvm_pfn_t pfn) 3692 { 3693 int ret; 3694 3695 pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); 3696 3697 /* 3698 * PSMASH_FAIL_INUSE indicates another processor is modifying the 3699 * entry, so retry until that's no longer the case. 
3700 */ 3701 do { 3702 ret = psmash(pfn); 3703 } while (ret == PSMASH_FAIL_INUSE); 3704 3705 return ret; 3706 } 3707 3708 static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) 3709 { 3710 u64 hypercall_ret = READ_ONCE(vcpu->run->hypercall.ret); 3711 struct vcpu_svm *svm = to_svm(vcpu); 3712 3713 if (!kvm_is_valid_map_gpa_range_ret(hypercall_ret)) 3714 return -EINVAL; 3715 3716 if (hypercall_ret) 3717 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3718 else 3719 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); 3720 3721 return 1; /* resume guest */ 3722 } 3723 3724 static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) 3725 { 3726 u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); 3727 u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); 3728 struct kvm_vcpu *vcpu = &svm->vcpu; 3729 3730 if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { 3731 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3732 return 1; /* resume guest */ 3733 } 3734 3735 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { 3736 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3737 return 1; /* resume guest */ 3738 } 3739 3740 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; 3741 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 3742 /* 3743 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 3744 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 3745 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 3746 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 3747 */ 3748 vcpu->run->hypercall.ret = 0; 3749 vcpu->run->hypercall.args[0] = gpa; 3750 vcpu->run->hypercall.args[1] = 1; 3751 vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) 3752 ? 
KVM_MAP_GPA_RANGE_ENCRYPTED 3753 : KVM_MAP_GPA_RANGE_DECRYPTED; 3754 vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K; 3755 3756 vcpu->arch.complete_userspace_io = snp_complete_psc_msr; 3757 3758 return 0; /* forward request to userspace */ 3759 } 3760 3761 struct psc_buffer { 3762 struct psc_hdr hdr; 3763 struct psc_entry entries[]; 3764 } __packed; 3765 3766 static int snp_do_psc(struct vcpu_svm *svm); 3767 3768 static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) 3769 { 3770 memset(&svm->sev_es.psc, 0, sizeof(svm->sev_es.psc)); 3771 3772 /* 3773 * PSC requests always get a "no action" response in SW_EXITINFO1, with 3774 * a PSC-specific return code in SW_EXITINFO2 that provides the "real" 3775 * return code. E.g. if the PSC request was interrupted, the need to 3776 * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1. 3777 */ 3778 svm_vmgexit_no_action(svm, psc_ret); 3779 } 3780 3781 static void __snp_complete_one_psc(struct vcpu_svm *svm) 3782 { 3783 struct vcpu_sev_es_state *sev_es = &svm->sev_es; 3784 struct psc_buffer *guest_psc = sev_es->ghcb_sa; 3785 __u16 idx; 3786 3787 /* 3788 * Everything in-flight has been processed successfully. Update the 3789 * corresponding entries in the guest's PSC buffer and zero out the 3790 * count of in-flight PSC entries. 3791 */ 3792 for (idx = sev_es->psc.cur_idx; sev_es->psc.batch_size; 3793 sev_es->psc.batch_size--, idx++) { 3794 struct psc_entry entry = READ_ONCE(guest_psc->entries[idx]); 3795 3796 guest_psc->entries[idx].cur_page = entry.pagesize ? 
512 : 1; 3797 } 3798 3799 sev_es->psc.cur_idx = idx; 3800 guest_psc->hdr.cur_entry = idx; 3801 } 3802 3803 static int snp_complete_one_psc(struct kvm_vcpu *vcpu) 3804 { 3805 u64 hypercall_ret = READ_ONCE(vcpu->run->hypercall.ret); 3806 struct vcpu_svm *svm = to_svm(vcpu); 3807 3808 if (!kvm_is_valid_map_gpa_range_ret(hypercall_ret)) 3809 return -EINVAL; 3810 3811 if (hypercall_ret) { 3812 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3813 return 1; /* resume guest */ 3814 } 3815 3816 __snp_complete_one_psc(svm); 3817 3818 /* Handle the next range (if any). */ 3819 return snp_do_psc(svm); 3820 } 3821 3822 static int snp_do_psc(struct vcpu_svm *svm) 3823 { 3824 struct vcpu_sev_es_state *sev_es = &svm->sev_es; 3825 struct psc_buffer *guest_psc = sev_es->ghcb_sa; 3826 struct kvm_vcpu *vcpu = &svm->vcpu; 3827 struct psc_entry entry_start; 3828 int npages; 3829 bool huge; 3830 u64 gfn; 3831 u16 idx; 3832 3833 next_range: 3834 /* There should be no other PSCs in-flight at this point. */ 3835 if (WARN_ON_ONCE(svm->sev_es.psc.batch_size)) { 3836 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3837 return 1; 3838 } 3839 3840 /* Find the start of the next range which needs processing. */ 3841 for (idx = sev_es->psc.cur_idx; idx <= sev_es->psc.end_idx; idx++) { 3842 entry_start = READ_ONCE(guest_psc->entries[idx]); 3843 3844 gfn = entry_start.gfn; 3845 huge = entry_start.pagesize; 3846 npages = huge ? 512 : 1; 3847 3848 if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) { 3849 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY); 3850 return 1; 3851 } 3852 3853 if (entry_start.cur_page) { 3854 /* 3855 * If this is a partially-completed 2M range, force 4K handling 3856 * for the remaining pages since they're effectively split at 3857 * this point. Subsequent code should ensure this doesn't get 3858 * combined with adjacent PSC entries where 2M handling is still 3859 * possible. 
3860 */ 3861 npages -= entry_start.cur_page; 3862 gfn += entry_start.cur_page; 3863 huge = false; 3864 } 3865 3866 if (npages) 3867 break; 3868 3869 /* 3870 * Increment the guest-visible index to communicate the current 3871 * entry back to the guest, e.g. in case of failure. No need 3872 * for READ_ONCE() as KVM doesn't consume the field, i.e. a 3873 * misbehaving guest can only break itself. 3874 */ 3875 guest_psc->hdr.cur_entry++; 3876 } 3877 3878 if (idx > sev_es->psc.end_idx) { 3879 /* Nothing more to process. */ 3880 snp_complete_psc(svm, 0); 3881 return 1; 3882 } 3883 3884 sev_es->psc.is_2m = huge; 3885 sev_es->psc.cur_idx = idx; 3886 sev_es->psc.batch_size = 1; 3887 3888 /* 3889 * Find all subsequent PSC entries that contain adjacent GPA 3890 * ranges/operations and can be combined into a single 3891 * KVM_HC_MAP_GPA_RANGE exit. 3892 */ 3893 while (++idx <= sev_es->psc.end_idx) { 3894 struct psc_entry entry = READ_ONCE(guest_psc->entries[idx]); 3895 3896 if (entry.operation != entry_start.operation || 3897 entry.gfn != entry_start.gfn + npages || 3898 entry.cur_page || !!entry.pagesize != huge) 3899 break; 3900 3901 sev_es->psc.batch_size++; 3902 npages += huge ? 512 : 1; 3903 } 3904 3905 switch (entry_start.operation) { 3906 case VMGEXIT_PSC_OP_PRIVATE: 3907 case VMGEXIT_PSC_OP_SHARED: 3908 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; 3909 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 3910 /* 3911 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 3912 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 3913 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 3914 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 3915 */ 3916 vcpu->run->hypercall.ret = 0; 3917 vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); 3918 vcpu->run->hypercall.args[1] = npages; 3919 vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE 3920 ? 
KVM_MAP_GPA_RANGE_ENCRYPTED 3921 : KVM_MAP_GPA_RANGE_DECRYPTED; 3922 vcpu->run->hypercall.args[2] |= entry_start.pagesize 3923 ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M 3924 : KVM_MAP_GPA_RANGE_PAGE_SZ_4K; 3925 vcpu->arch.complete_userspace_io = snp_complete_one_psc; 3926 return 0; /* forward request to userspace */ 3927 default: 3928 /* 3929 * Only shared/private PSC operations are currently supported, so if the 3930 * entire range consists of unsupported operations (e.g. SMASH/UNSMASH), 3931 * then consider the entire range completed and avoid exiting to 3932 * userspace. In theory snp_complete_psc() can always be called directly 3933 * at this point to complete the current range and start the next one, 3934 * but that could lead to unexpected levels of recursion. 3935 */ 3936 __snp_complete_one_psc(svm); 3937 goto next_range; 3938 } 3939 3940 BUG(); 3941 } 3942 3943 static int snp_begin_psc(struct vcpu_svm *svm) 3944 { 3945 struct vcpu_sev_es_state *sev_es = &svm->sev_es; 3946 struct psc_buffer *guest_psc = sev_es->ghcb_sa; 3947 u16 max_nr_entries; 3948 3949 if (!user_exit_on_hypercall(svm->vcpu.kvm, KVM_HC_MAP_GPA_RANGE)) { 3950 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3951 return 1; 3952 } 3953 3954 /* 3955 * GHCB v2 requires the scratch area to reside within the GHCB itself, 3956 * and PSC requests are only supported for GHCB v2+. Thus it should be 3957 * impossible to exceed the max PSC entry count (which is derived from 3958 * the size of the shared GHCB buffer). 3959 */ 3960 max_nr_entries = (sev_es->ghcb_sa_len - sizeof(struct psc_hdr)) / 3961 sizeof(struct psc_entry); 3962 if (WARN_ON_ONCE(max_nr_entries > VMGEXIT_PSC_MAX_COUNT)) { 3963 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3964 return 1; 3965 } 3966 3967 /* 3968 * The PSC descriptor buffer can be modified by a misbehaved guest after 3969 * validation, so take care to only use validated copies of values used 3970 * for things like array indexing. 
3971 */ 3972 sev_es->psc.cur_idx = READ_ONCE(guest_psc->hdr.cur_entry); 3973 sev_es->psc.end_idx = READ_ONCE(guest_psc->hdr.end_entry); 3974 3975 if (sev_es->psc.end_idx >= max_nr_entries) { 3976 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); 3977 return 1; 3978 } 3979 3980 return snp_do_psc(svm); 3981 } 3982 3983 /* 3984 * Invoked as part of svm_vcpu_reset() processing of an init event. 3985 */ 3986 static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) 3987 { 3988 struct vcpu_svm *svm = to_svm(vcpu); 3989 struct kvm_memory_slot *slot; 3990 struct page *page; 3991 kvm_pfn_t pfn; 3992 gfn_t gfn; 3993 3994 guard(mutex)(&svm->sev_es.snp_vmsa_mutex); 3995 3996 if (!svm->sev_es.snp_ap_waiting_for_reset) 3997 return; 3998 3999 svm->sev_es.snp_ap_waiting_for_reset = false; 4000 4001 /* Mark the vCPU as offline and not runnable */ 4002 vcpu->arch.pv.pv_unhalted = false; 4003 kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); 4004 4005 /* Clear use of the VMSA */ 4006 svm->vmcb->control.vmsa_pa = INVALID_PAGE; 4007 4008 /* 4009 * When replacing the VMSA during SEV-SNP AP creation, 4010 * mark the VMCB dirty so that full state is always reloaded. 4011 */ 4012 vmcb_mark_all_dirty(svm->vmcb); 4013 4014 if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) 4015 return; 4016 4017 gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); 4018 svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; 4019 4020 slot = gfn_to_memslot(vcpu->kvm, gfn); 4021 if (!slot) 4022 return; 4023 4024 /* 4025 * The new VMSA will be private memory guest memory, so retrieve the 4026 * PFN from the gmem backend. 4027 */ 4028 if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) 4029 return; 4030 4031 /* 4032 * From this point forward, the VMSA will always be a guest-mapped page 4033 * rather than the initial one allocated by KVM in svm->sev_es.vmsa. 
In 4034 * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but 4035 * that involves cleanups like flushing caches, which would ideally be 4036 * handled during teardown rather than guest boot. Deferring that also 4037 * allows the existing logic for SEV-ES VMSAs to be re-used with 4038 * minimal SNP-specific changes. 4039 */ 4040 svm->sev_es.snp_has_guest_vmsa = true; 4041 4042 /* Use the new VMSA */ 4043 svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); 4044 4045 /* Mark the vCPU as runnable */ 4046 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 4047 4048 /* 4049 * gmem pages aren't currently migratable, but if this ever changes 4050 * then care should be taken to ensure svm->sev_es.vmsa is pinned 4051 * through some other means. 4052 */ 4053 kvm_release_page_clean(page); 4054 } 4055 4056 static int sev_snp_ap_creation(struct vcpu_svm *svm) 4057 { 4058 struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); 4059 struct kvm_vcpu *vcpu = &svm->vcpu; 4060 struct kvm_vcpu *target_vcpu; 4061 struct vcpu_svm *target_svm; 4062 unsigned int request; 4063 unsigned int apic_id; 4064 4065 request = lower_32_bits(svm->vmcb->control.exit_info_1); 4066 apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); 4067 4068 /* Validate the APIC ID */ 4069 target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); 4070 if (!target_vcpu) { 4071 vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", 4072 apic_id); 4073 return -EINVAL; 4074 } 4075 4076 target_svm = to_svm(target_vcpu); 4077 4078 guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex); 4079 4080 switch (request) { 4081 case SVM_VMGEXIT_AP_CREATE_ON_INIT: 4082 case SVM_VMGEXIT_AP_CREATE: 4083 if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) { 4084 vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n", 4085 vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); 4086 return -EINVAL; 4087 } 4088 4089 if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { 4090 
vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", 4091 svm->vmcb->control.exit_info_2); 4092 return -EINVAL; 4093 } 4094 4095 /* 4096 * Malicious guest can RMPADJUST a large page into VMSA which 4097 * will hit the SNP erratum where the CPU will incorrectly signal 4098 * an RMP violation #PF if a hugepage collides with the RMP entry 4099 * of VMSA page, reject the AP CREATE request if VMSA address from 4100 * guest is 2M aligned. 4101 */ 4102 if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) { 4103 vcpu_unimpl(vcpu, 4104 "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", 4105 svm->vmcb->control.exit_info_2); 4106 return -EINVAL; 4107 } 4108 4109 target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; 4110 break; 4111 case SVM_VMGEXIT_AP_DESTROY: 4112 target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; 4113 break; 4114 default: 4115 vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", 4116 request); 4117 return -EINVAL; 4118 } 4119 4120 target_svm->sev_es.snp_ap_waiting_for_reset = true; 4121 4122 /* 4123 * Unless Creation is deferred until INIT, signal the vCPU to update 4124 * its state. 
4125 */ 4126 if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) 4127 kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); 4128 4129 return 0; 4130 } 4131 4132 static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) 4133 { 4134 struct sev_data_snp_guest_request data = {0}; 4135 struct kvm *kvm = svm->vcpu.kvm; 4136 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 4137 sev_ret_code fw_err = 0; 4138 int ret; 4139 4140 if (!is_sev_snp_guest(&svm->vcpu)) 4141 return -EINVAL; 4142 4143 guard(mutex)(&sev->guest_req_mutex); 4144 4145 if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) 4146 return -EIO; 4147 4148 data.gctx_paddr = __psp_pa(sev->snp_context); 4149 data.req_paddr = __psp_pa(sev->guest_req_buf); 4150 data.res_paddr = __psp_pa(sev->guest_resp_buf); 4151 4152 /* 4153 * Firmware failures are propagated on to guest, but any other failure 4154 * condition along the way should be reported to userspace. E.g. if 4155 * the PSP is dead and commands are timing out. 4156 */ 4157 ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err); 4158 if (ret && !fw_err) 4159 return ret; 4160 4161 if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) 4162 return -EIO; 4163 4164 /* No action is requested *from KVM* if there was a firmware error. 
*/ 4165 svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err)); 4166 4167 /* resume guest */ 4168 return 1; 4169 } 4170 4171 static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error) 4172 { 4173 ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0)); 4174 4175 return 1; /* resume guest */ 4176 } 4177 4178 static int snp_complete_req_certs(struct kvm_vcpu *vcpu) 4179 { 4180 struct vcpu_svm *svm = to_svm(vcpu); 4181 struct vmcb_control_area *control = &svm->vmcb->control; 4182 4183 switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) { 4184 case 0: 4185 return snp_handle_guest_req(svm, control->exit_info_1, 4186 control->exit_info_2); 4187 case ENOSPC: 4188 vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages; 4189 return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN); 4190 case EAGAIN: 4191 return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY); 4192 case EIO: 4193 return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC); 4194 default: 4195 break; 4196 } 4197 4198 return -EINVAL; 4199 } 4200 4201 static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) 4202 { 4203 struct kvm_vcpu *vcpu = &svm->vcpu; 4204 struct kvm *kvm = vcpu->kvm; 4205 4206 u8 msg_type; 4207 4208 if (!is_sev_snp_guest(vcpu)) 4209 return -EINVAL; 4210 4211 if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type), 4212 &msg_type, 1)) 4213 return -EIO; 4214 4215 /* 4216 * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for 4217 * additional certificate data to be provided alongside the attestation 4218 * report via the guest-provided data pages indicated by RAX/RBX. If 4219 * userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace 4220 * to give userspace an opportunity to provide the certificate data 4221 * before issuing/completing the attestation request. 
Otherwise, return 4222 * an empty certificate table in the guest-provided data pages and 4223 * handle the attestation request immediately. 4224 */ 4225 if (msg_type == SNP_MSG_REPORT_REQ) { 4226 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; 4227 u64 data_npages; 4228 gpa_t data_gpa; 4229 4230 if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm)) 4231 goto request_invalid; 4232 4233 data_gpa = vcpu->arch.regs[VCPU_REGS_RAX]; 4234 data_npages = vcpu->arch.regs[VCPU_REGS_RBX]; 4235 4236 if (!PAGE_ALIGNED(data_gpa)) 4237 goto request_invalid; 4238 4239 if (sev->snp_certs_enabled) { 4240 vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS; 4241 vcpu->run->snp_req_certs.gpa = data_gpa; 4242 vcpu->run->snp_req_certs.npages = data_npages; 4243 vcpu->run->snp_req_certs.ret = 0; 4244 vcpu->arch.complete_userspace_io = snp_complete_req_certs; 4245 return 0; 4246 } 4247 4248 /* 4249 * As per GHCB spec (see "SNP Extended Guest Request"), the 4250 * certificate table is terminated by 24-bytes of zeroes. 
4251 */ 4252 if (data_npages && kvm_clear_guest(kvm, data_gpa, 24)) 4253 return -EIO; 4254 } 4255 4256 return snp_handle_guest_req(svm, req_gpa, resp_gpa); 4257 4258 request_invalid: 4259 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4260 return 1; /* resume guest */ 4261 } 4262 4263 static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) 4264 { 4265 struct vmcb_control_area *control = &svm->vmcb->control; 4266 struct kvm_vcpu *vcpu = &svm->vcpu; 4267 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 4268 u64 ghcb_info; 4269 int ret = 1; 4270 4271 ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK; 4272 4273 trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id, 4274 control->ghcb_gpa); 4275 4276 switch (ghcb_info) { 4277 case GHCB_MSR_SEV_INFO_REQ: 4278 set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, 4279 GHCB_VERSION_MIN, 4280 sev_enc_bit)); 4281 break; 4282 case GHCB_MSR_CPUID_REQ: { 4283 u64 cpuid_fn, cpuid_reg, cpuid_value; 4284 4285 cpuid_fn = get_ghcb_msr_bits(svm, 4286 GHCB_MSR_CPUID_FUNC_MASK, 4287 GHCB_MSR_CPUID_FUNC_POS); 4288 4289 /* Initialize the registers needed by the CPUID intercept */ 4290 vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn; 4291 vcpu->arch.regs[VCPU_REGS_RCX] = 0; 4292 4293 ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID); 4294 if (!ret) { 4295 /* Error, keep GHCB MSR value as-is */ 4296 break; 4297 } 4298 4299 cpuid_reg = get_ghcb_msr_bits(svm, 4300 GHCB_MSR_CPUID_REG_MASK, 4301 GHCB_MSR_CPUID_REG_POS); 4302 if (cpuid_reg == 0) 4303 cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX]; 4304 else if (cpuid_reg == 1) 4305 cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX]; 4306 else if (cpuid_reg == 2) 4307 cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX]; 4308 else 4309 cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX]; 4310 4311 set_ghcb_msr_bits(svm, cpuid_value, 4312 GHCB_MSR_CPUID_VALUE_MASK, 4313 GHCB_MSR_CPUID_VALUE_POS); 4314 4315 set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP, 4316 GHCB_MSR_INFO_MASK, 4317 
GHCB_MSR_INFO_POS); 4318 break; 4319 } 4320 case GHCB_MSR_AP_RESET_HOLD_REQ: 4321 svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO; 4322 ret = kvm_emulate_ap_reset_hold(&svm->vcpu); 4323 4324 /* 4325 * Preset the result to a non-SIPI return and then only set 4326 * the result to non-zero when delivering a SIPI. 4327 */ 4328 set_ghcb_msr_bits(svm, 0, 4329 GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, 4330 GHCB_MSR_AP_RESET_HOLD_RESULT_POS); 4331 4332 set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, 4333 GHCB_MSR_INFO_MASK, 4334 GHCB_MSR_INFO_POS); 4335 break; 4336 case GHCB_MSR_HV_FT_REQ: 4337 set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED, 4338 GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS); 4339 set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, 4340 GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); 4341 break; 4342 case GHCB_MSR_PREF_GPA_REQ: 4343 if (!is_sev_snp_guest(vcpu)) 4344 goto out_terminate; 4345 4346 set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, 4347 GHCB_MSR_GPA_VALUE_POS); 4348 set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, 4349 GHCB_MSR_INFO_POS); 4350 break; 4351 case GHCB_MSR_REG_GPA_REQ: { 4352 u64 gfn; 4353 4354 if (!is_sev_snp_guest(vcpu)) 4355 goto out_terminate; 4356 4357 gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, 4358 GHCB_MSR_GPA_VALUE_POS); 4359 4360 svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); 4361 4362 set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, 4363 GHCB_MSR_GPA_VALUE_POS); 4364 set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, 4365 GHCB_MSR_INFO_POS); 4366 break; 4367 } 4368 case GHCB_MSR_PSC_REQ: 4369 if (!is_sev_snp_guest(vcpu)) 4370 goto out_terminate; 4371 4372 ret = snp_begin_psc_msr(svm, control->ghcb_gpa); 4373 break; 4374 case GHCB_MSR_TERM_REQ: { 4375 u64 reason_set, reason_code; 4376 4377 reason_set = get_ghcb_msr_bits(svm, 4378 GHCB_MSR_TERM_REASON_SET_MASK, 4379 GHCB_MSR_TERM_REASON_SET_POS); 4380 reason_code = get_ghcb_msr_bits(svm, 4381 
GHCB_MSR_TERM_REASON_MASK, 4382 GHCB_MSR_TERM_REASON_POS); 4383 pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", 4384 reason_set, reason_code); 4385 4386 goto out_terminate; 4387 } 4388 default: 4389 /* Error, keep GHCB MSR value as-is */ 4390 break; 4391 } 4392 4393 trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id, 4394 control->ghcb_gpa, ret); 4395 4396 return ret; 4397 4398 out_terminate: 4399 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 4400 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; 4401 vcpu->run->system_event.ndata = 1; 4402 vcpu->run->system_event.data[0] = control->ghcb_gpa; 4403 4404 return 0; 4405 } 4406 4407 static bool is_snp_only_vmgexit(u64 exit_code) 4408 { 4409 switch (exit_code) { 4410 case SVM_VMGEXIT_AP_CREATION: 4411 case SVM_VMGEXIT_GUEST_REQUEST: 4412 case SVM_VMGEXIT_EXT_GUEST_REQUEST: 4413 case SVM_VMGEXIT_PSC: 4414 return true; 4415 default: 4416 return false; 4417 } 4418 } 4419 4420 int sev_handle_vmgexit(struct kvm_vcpu *vcpu) 4421 { 4422 struct vcpu_svm *svm = to_svm(vcpu); 4423 struct vmcb_control_area *control = &svm->vmcb->control; 4424 u64 ghcb_gpa; 4425 4426 /* Validate the GHCB */ 4427 ghcb_gpa = control->ghcb_gpa; 4428 if (ghcb_gpa & GHCB_MSR_INFO_MASK) 4429 return sev_handle_vmgexit_msr_protocol(svm); 4430 4431 if (!ghcb_gpa) { 4432 vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n"); 4433 4434 /* Without a GHCB, just return right back to the guest */ 4435 return 1; 4436 } 4437 4438 if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) { 4439 /* Unable to map GHCB from guest */ 4440 vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", 4441 ghcb_gpa); 4442 4443 /* Without a GHCB, just return right back to the guest */ 4444 return 1; 4445 } 4446 4447 svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; 4448 4449 trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); 4450 4451 sev_es_sync_from_ghcb(svm); 4452 4453 /* SEV-SNP guest requires that the GHCB GPA must be 
registered */ 4454 if (is_sev_snp_guest(vcpu) && 4455 !ghcb_gpa_is_registered(svm, control->ghcb_gpa)) { 4456 vcpu_unimpl(vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", 4457 control->ghcb_gpa); 4458 svm_vmgexit_bad_input(svm, GHCB_ERR_NOT_REGISTERED); 4459 return 1; 4460 } 4461 4462 /* Only GHCB Usage code 0 is supported */ 4463 if (svm->sev_es.ghcb->ghcb_usage) { 4464 vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", 4465 svm->sev_es.ghcb->ghcb_usage); 4466 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_USAGE); 4467 return 1; 4468 } 4469 4470 if (is_snp_only_vmgexit(control->exit_code) && !is_sev_snp_guest(vcpu)) { 4471 vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is SNP-only\n", 4472 control->exit_code); 4473 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_EVENT); 4474 return 1; 4475 } 4476 4477 if (!sev_es_are_required_ghcb_fields_valid(svm)) { 4478 /* 4479 * Print the exit code even though it may not be marked valid 4480 * as it could help with debugging. 4481 */ 4482 vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", 4483 control->exit_code); 4484 dump_ghcb(svm); 4485 svm_vmgexit_bad_input(svm, GHCB_ERR_MISSING_INPUT); 4486 return 1; 4487 } 4488 4489 svm_vmgexit_success(svm, 0); 4490 4491 switch (control->exit_code) { 4492 case SVM_EXIT_IOIO: 4493 if (!((control->exit_info_1 & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT)) 4494 return 1; 4495 4496 fallthrough; 4497 case SVM_EXIT_READ_DR7: 4498 case SVM_EXIT_WRITE_DR7: 4499 case SVM_EXIT_RDTSC: 4500 case SVM_EXIT_RDTSCP: 4501 case SVM_EXIT_RDPMC: 4502 case SVM_EXIT_CPUID: 4503 case SVM_EXIT_INVD: 4504 case SVM_EXIT_MSR: 4505 case SVM_EXIT_VMMCALL: 4506 case SVM_EXIT_WBINVD: 4507 case SVM_EXIT_MONITOR: 4508 case SVM_EXIT_MWAIT: 4509 return svm_invoke_exit_handler(vcpu, control->exit_code); 4510 case SVM_VMGEXIT_MMIO_READ: 4511 case SVM_VMGEXIT_MMIO_WRITE: { 4512 bool is_write = control->exit_code == SVM_VMGEXIT_MMIO_WRITE; 4513 u64 len = control->exit_info_2; 4514 int r; 4515 4516 if 
(!len) 4517 return 1; 4518 4519 if (to_kvm_sev_info(vcpu->kvm)->ghcb_version >= 2 && len > 8) { 4520 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4521 return 1; 4522 } 4523 4524 r = setup_vmgexit_scratch(svm, !is_write, len); 4525 if (r) 4526 return r; 4527 4528 return kvm_sev_es_mmio(vcpu, is_write, control->exit_info_1, len, 4529 svm->sev_es.ghcb_sa); 4530 } 4531 case SVM_VMGEXIT_NMI_COMPLETE: 4532 ++vcpu->stat.nmi_window_exits; 4533 svm->nmi_masked = false; 4534 kvm_make_request(KVM_REQ_EVENT, vcpu); 4535 return 1; 4536 case SVM_VMGEXIT_AP_HLT_LOOP: 4537 svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT; 4538 return kvm_emulate_ap_reset_hold(vcpu); 4539 case SVM_VMGEXIT_AP_JUMP_TABLE: { 4540 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 4541 4542 switch (control->exit_info_1) { 4543 case 0: 4544 /* Set AP jump table address */ 4545 sev->ap_jump_table = control->exit_info_2; 4546 break; 4547 case 1: 4548 /* Get AP jump table address */ 4549 svm_vmgexit_success(svm, sev->ap_jump_table); 4550 break; 4551 default: 4552 pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", 4553 control->exit_info_1); 4554 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4555 } 4556 return 1; 4557 } 4558 case SVM_VMGEXIT_HV_FEATURES: 4559 svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED); 4560 return 1; 4561 case SVM_VMGEXIT_TERM_REQUEST: 4562 pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n", 4563 control->exit_info_1, control->exit_info_2); 4564 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 4565 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; 4566 vcpu->run->system_event.ndata = 1; 4567 vcpu->run->system_event.data[0] = control->ghcb_gpa; 4568 return 0; 4569 case SVM_VMGEXIT_PSC: { 4570 int r; 4571 4572 r = setup_vmgexit_scratch(svm, true, sizeof(struct psc_hdr)); 4573 if (r) 4574 return r; 4575 4576 return snp_begin_psc(svm); 4577 } 4578 case SVM_VMGEXIT_AP_CREATION: 4579 if 
(sev_snp_ap_creation(svm)) 4580 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4581 return 1; 4582 case SVM_VMGEXIT_GUEST_REQUEST: 4583 case SVM_VMGEXIT_EXT_GUEST_REQUEST: 4584 if (!PAGE_ALIGNED(control->exit_info_1) || 4585 !PAGE_ALIGNED(control->exit_info_2) || 4586 control->exit_info_1 == control->exit_info_2) { 4587 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4588 return 1; 4589 } 4590 4591 if (control->exit_code == SVM_VMGEXIT_GUEST_REQUEST) 4592 return snp_handle_guest_req(svm, control->exit_info_1, 4593 control->exit_info_2); 4594 4595 return snp_handle_ext_guest_req(svm, control->exit_info_1, 4596 control->exit_info_2); 4597 case SVM_VMGEXIT_UNSUPPORTED_EVENT: 4598 /* 4599 * Note, the _guest_ is reporting an unsupported #VC, i.e. this 4600 * isn't the same thing as KVM getting an unsupported #VMGEXIT. 4601 */ 4602 vcpu_unimpl(vcpu, 4603 "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", 4604 control->exit_info_1, control->exit_info_2); 4605 return -EINVAL; 4606 default: 4607 vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", 4608 control->exit_code); 4609 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_EVENT); 4610 return 1; 4611 } 4612 4613 KVM_BUG_ON(1, vcpu->kvm); 4614 return -EIO; 4615 } 4616 4617 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) 4618 { 4619 int count; 4620 int bytes; 4621 int r; 4622 4623 if (svm->vmcb->control.exit_info_2 > INT_MAX) 4624 return -EINVAL; 4625 4626 count = svm->vmcb->control.exit_info_2; 4627 if (unlikely(check_mul_overflow(count, size, &bytes))) 4628 return -EINVAL; 4629 4630 if (!bytes) 4631 return 1; 4632 4633 r = setup_vmgexit_scratch(svm, in, bytes); 4634 if (r) 4635 return r; 4636 4637 return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, 4638 count, in); 4639 } 4640 4641 void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4642 { 4643 /* Clear intercepts on MSRs that are context switched by hardware. 
*/ 4644 svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW); 4645 svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW); 4646 svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW); 4647 4648 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) 4649 svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW, 4650 !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && 4651 !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)); 4652 4653 svm_set_intercept_for_msr(vcpu, MSR_AMD64_GUEST_TSC_FREQ, MSR_TYPE_R, 4654 !snp_is_secure_tsc_enabled(vcpu->kvm)); 4655 4656 /* 4657 * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if 4658 * the host/guest supports its use. 4659 * 4660 * KVM treats the guest as being capable of using XSAVES even if XSAVES 4661 * isn't enabled in guest CPUID as there is no intercept for XSAVES, 4662 * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is 4663 * exposed to the guest and XSAVES is supported in hardware. Condition 4664 * full XSS passthrough on the guest being able to use XSAVES *and* 4665 * XSAVES being exposed to the guest so that KVM can at least honor 4666 * guest CPUID for RDMSR and WRMSR. 4667 */ 4668 svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW, 4669 !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) || 4670 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)); 4671 } 4672 4673 void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) 4674 { 4675 struct kvm_vcpu *vcpu = &svm->vcpu; 4676 struct kvm_cpuid_entry2 *best; 4677 4678 /* For sev guests, the memory encryption bit is not reserved in CR3. 
*/ 4679 best = kvm_find_cpuid_entry(vcpu, 0x8000001F); 4680 if (best) 4681 vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); 4682 } 4683 4684 static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event) 4685 { 4686 struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); 4687 struct vmcb *vmcb = svm->vmcb01.ptr; 4688 4689 svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV_ES; 4690 4691 /* 4692 * An SEV-ES guest requires a VMSA area that is a separate from the 4693 * VMCB page. Do not include the encryption mask on the VMSA physical 4694 * address since hardware will access it using the guest key. Note, 4695 * the VMSA will be NULL if this vCPU is the destination for intrahost 4696 * migration, and will be copied later. 4697 */ 4698 if (!svm->sev_es.snp_has_guest_vmsa) { 4699 if (svm->sev_es.vmsa) 4700 svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); 4701 else 4702 svm->vmcb->control.vmsa_pa = INVALID_PAGE; 4703 } 4704 4705 if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) 4706 svm->vmcb->control.allowed_sev_features = sev->vmsa_features | 4707 VMCB_ALLOWED_SEV_FEATURES_VALID; 4708 4709 /* Can't intercept CR register access, HV can't modify CR registers */ 4710 svm_clr_intercept(svm, INTERCEPT_CR0_READ); 4711 svm_clr_intercept(svm, INTERCEPT_CR4_READ); 4712 svm_clr_intercept(svm, INTERCEPT_CR8_READ); 4713 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); 4714 svm_clr_intercept(svm, INTERCEPT_CR4_WRITE); 4715 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 4716 4717 svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0); 4718 4719 /* Track EFER/CR register changes */ 4720 svm_set_intercept(svm, TRAP_EFER_WRITE); 4721 svm_set_intercept(svm, TRAP_CR0_WRITE); 4722 svm_set_intercept(svm, TRAP_CR4_WRITE); 4723 svm_set_intercept(svm, TRAP_CR8_WRITE); 4724 4725 vmcb->control.intercepts[INTERCEPT_DR] = 0; 4726 if (!sev_vcpu_has_debug_swap(svm)) { 4727 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); 4728 vmcb_set_intercept(&vmcb->control, 
INTERCEPT_DR7_WRITE); 4729 svm_mark_intercepts_dirty(svm); 4730 } else { 4731 /* 4732 * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't 4733 * allow debugging SEV-ES guests, and enables DebugSwap iff 4734 * NO_NESTED_DATA_BP is supported, so there's no reason to 4735 * intercept #DB when DebugSwap is enabled. For simplicity 4736 * with respect to guest debug, intercept #DB for other VMs 4737 * even if NO_NESTED_DATA_BP is supported, i.e. even if the 4738 * guest can't DoS the CPU with infinite #DB vectoring. 4739 */ 4740 clr_exception_intercept(svm, DB_VECTOR); 4741 } 4742 4743 /* Can't intercept XSETBV, HV can't modify XCR0 directly */ 4744 svm_clr_intercept(svm, INTERCEPT_XSETBV); 4745 4746 /* 4747 * Set the GHCB MSR value as per the GHCB specification when emulating 4748 * vCPU RESET for an SEV-ES guest. 4749 */ 4750 if (!init_event) 4751 set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, 4752 GHCB_VERSION_MIN, 4753 sev_enc_bit)); 4754 } 4755 4756 void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) 4757 { 4758 struct kvm_vcpu *vcpu = &svm->vcpu; 4759 4760 svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV; 4761 clr_exception_intercept(svm, UD_VECTOR); 4762 4763 /* 4764 * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as 4765 * KVM can't decrypt guest memory to decode the faulting instruction. 4766 */ 4767 clr_exception_intercept(svm, GP_VECTOR); 4768 4769 if (init_event && is_sev_snp_guest(vcpu)) 4770 sev_snp_init_protected_guest_state(vcpu); 4771 4772 if (is_sev_es_guest(vcpu)) 4773 sev_es_init_vmcb(svm, init_event); 4774 } 4775 4776 int sev_vcpu_create(struct kvm_vcpu *vcpu) 4777 { 4778 struct vcpu_svm *svm = to_svm(vcpu); 4779 struct page *vmsa_page; 4780 4781 mutex_init(&svm->sev_es.snp_vmsa_mutex); 4782 4783 if (!is_sev_es_guest(vcpu)) 4784 return 0; 4785 4786 /* 4787 * SEV-ES guests require a separate (from the VMCB) VMSA page used to 4788 * contain the encrypted register state of the guest. 
4789 */ 4790 vmsa_page = snp_safe_alloc_page(); 4791 if (!vmsa_page) 4792 return -ENOMEM; 4793 4794 svm->sev_es.vmsa = page_address(vmsa_page); 4795 4796 vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm); 4797 4798 return 0; 4799 } 4800 4801 void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) 4802 { 4803 /* 4804 * All host state for SEV-ES guests is categorized into three swap types 4805 * based on how it is handled by hardware during a world switch: 4806 * 4807 * A: VMRUN: Host state saved in host save area 4808 * VMEXIT: Host state loaded from host save area 4809 * 4810 * B: VMRUN: Host state _NOT_ saved in host save area 4811 * VMEXIT: Host state loaded from host save area 4812 * 4813 * C: VMRUN: Host state _NOT_ saved in host save area 4814 * VMEXIT: Host state initialized to default(reset) values 4815 * 4816 * Manually save type-B state, i.e. state that is loaded by VMEXIT but 4817 * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed 4818 * by common SVM code). 4819 */ 4820 hostsa->xcr0 = kvm_host.xcr0; 4821 hostsa->pkru = read_pkru(); 4822 hostsa->xss = kvm_host.xss; 4823 4824 /* 4825 * If DebugSwap is enabled, debug registers are loaded but NOT saved by 4826 * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does 4827 * not save or load debug registers. Sadly, KVM can't prevent SNP 4828 * guests from lying about DebugSwap on secondary vCPUs, i.e. the 4829 * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what 4830 * the guest has actually enabled (or not!) in the VMSA. 4831 * 4832 * If DebugSwap is *possible*, save the masks so that they're restored 4833 * if the guest enables DebugSwap. But for the DRs themselves, do NOT 4834 * rely on the CPU to restore the host values; KVM will restore them as 4835 * needed in common code, via hw_breakpoint_restore(). Note, KVM does 4836 * NOT support virtualizing Breakpoint Extensions, i.e. 
the mask MSRs 4837 * don't need to be restored per se, KVM just needs to ensure they are 4838 * loaded with the correct values *if* the CPU writes the MSRs. 4839 */ 4840 if (sev_vcpu_has_debug_swap(svm) || 4841 (cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) && 4842 is_sev_snp_guest(&svm->vcpu))) { 4843 hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); 4844 hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); 4845 hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); 4846 hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3); 4847 } 4848 4849 /* 4850 * TSC_AUX is always virtualized for SEV-ES guests when the feature is 4851 * available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area. 4852 * Set the save area to the current hardware value, i.e. the current 4853 * user return value, so that the correct value is restored on #VMEXIT. 4854 */ 4855 if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) && 4856 !WARN_ON_ONCE(tsc_aux_uret_slot < 0)) 4857 hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot); 4858 } 4859 4860 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 4861 { 4862 struct vcpu_svm *svm = to_svm(vcpu); 4863 4864 /* First SIPI: Use the values as initially set by the VMM */ 4865 if (!svm->sev_es.received_first_sipi) { 4866 svm->sev_es.received_first_sipi = true; 4867 return; 4868 } 4869 4870 /* Subsequent SIPI */ 4871 switch (svm->sev_es.ap_reset_hold_type) { 4872 case AP_RESET_HOLD_NAE_EVENT: 4873 /* 4874 * Return from an AP Reset Hold VMGEXIT, where the guest will 4875 * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. 4876 */ 4877 svm_vmgexit_success(svm, 1); 4878 break; 4879 case AP_RESET_HOLD_MSR_PROTO: 4880 /* 4881 * Return from an AP Reset Hold VMGEXIT, where the guest will 4882 * set the CS and RIP. Set GHCB data field to a non-zero value. 
4883 */ 4884 set_ghcb_msr_bits(svm, 1, 4885 GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, 4886 GHCB_MSR_AP_RESET_HOLD_RESULT_POS); 4887 4888 set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, 4889 GHCB_MSR_INFO_MASK, 4890 GHCB_MSR_INFO_POS); 4891 break; 4892 default: 4893 break; 4894 } 4895 } 4896 4897 struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) 4898 { 4899 unsigned long pfn; 4900 struct page *p; 4901 4902 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 4903 return alloc_pages_node(node, gfp | __GFP_ZERO, 0); 4904 4905 /* 4906 * Allocate an SNP-safe page to workaround the SNP erratum where 4907 * the CPU will incorrectly signal an RMP violation #PF if a 4908 * hugepage (2MB or 1GB) collides with the RMP entry of a 4909 * 2MB-aligned VMCB, VMSA, or AVIC backing page. 4910 * 4911 * Allocate one extra page, choose a page which is not 4912 * 2MB-aligned, and free the other. 4913 */ 4914 p = alloc_pages_node(node, gfp | __GFP_ZERO, 1); 4915 if (!p) 4916 return NULL; 4917 4918 split_page(p, 1); 4919 4920 pfn = page_to_pfn(p); 4921 if (IS_ALIGNED(pfn, PTRS_PER_PMD)) 4922 __free_page(p++); 4923 else 4924 __free_page(p + 1); 4925 4926 return p; 4927 } 4928 4929 void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) 4930 { 4931 struct kvm_memory_slot *slot; 4932 struct kvm *kvm = vcpu->kvm; 4933 int order, rmp_level, ret; 4934 struct page *page; 4935 bool assigned; 4936 kvm_pfn_t pfn; 4937 gfn_t gfn; 4938 4939 gfn = gpa >> PAGE_SHIFT; 4940 4941 /* 4942 * The only time RMP faults occur for shared pages is when the guest is 4943 * triggering an RMP fault for an implicit page-state change from 4944 * shared->private. Implicit page-state changes are forwarded to 4945 * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults 4946 * for shared pages should not end up here. 
4947 */ 4948 if (!kvm_mem_is_private(kvm, gfn)) { 4949 pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n", 4950 gpa); 4951 return; 4952 } 4953 4954 slot = gfn_to_memslot(kvm, gfn); 4955 if (!kvm_slot_has_gmem(slot)) { 4956 pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", 4957 gpa); 4958 return; 4959 } 4960 4961 ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); 4962 if (ret) { 4963 pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", 4964 gpa); 4965 return; 4966 } 4967 4968 ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); 4969 if (ret || !assigned) { 4970 pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n", 4971 gpa, pfn, ret); 4972 goto out_no_trace; 4973 } 4974 4975 /* 4976 * There are 2 cases where a PSMASH may be needed to resolve an #NPF 4977 * with PFERR_GUEST_RMP_BIT set: 4978 * 4979 * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM 4980 * bit set if the guest issues them with a smaller granularity than 4981 * what is indicated by the page-size bit in the 2MB RMP entry for 4982 * the PFN that backs the GPA. 4983 * 4984 * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is 4985 * smaller than what is indicated by the 2MB RMP entry for the PFN 4986 * that backs the GPA. 4987 * 4988 * In both these cases, the corresponding 2M RMP entry needs to 4989 * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already 4990 * split into 4K RMP entries, then this is likely a spurious case which 4991 * can occur when there are concurrent accesses by the guest to a 2MB 4992 * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in 4993 * the process of being PMASH'd into 4K entries. These cases should 4994 * resolve automatically on subsequent accesses, so just ignore them 4995 * here. 
4996 */ 4997 if (rmp_level == PG_LEVEL_4K) 4998 goto out; 4999 5000 ret = snp_rmptable_psmash(pfn); 5001 if (ret) { 5002 /* 5003 * Look it up again. If it's 4K now then the PSMASH may have 5004 * raced with another process and the issue has already resolved 5005 * itself. 5006 */ 5007 if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) && 5008 assigned && rmp_level == PG_LEVEL_4K) 5009 goto out; 5010 5011 pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n", 5012 gpa, pfn, ret); 5013 } 5014 5015 kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); 5016 out: 5017 trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); 5018 out_no_trace: 5019 kvm_release_page_unused(page); 5020 } 5021 5022 static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) 5023 { 5024 kvm_pfn_t pfn = start; 5025 5026 while (pfn < end) { 5027 int ret, rmp_level; 5028 bool assigned; 5029 5030 ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); 5031 if (ret) { 5032 pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n", 5033 pfn, start, end, rmp_level, ret); 5034 return false; 5035 } 5036 5037 if (assigned) { 5038 pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n", 5039 __func__, pfn, start, end, rmp_level); 5040 return false; 5041 } 5042 5043 pfn++; 5044 } 5045 5046 return true; 5047 } 5048 5049 static u8 max_level_for_order(int order) 5050 { 5051 if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) 5052 return PG_LEVEL_2M; 5053 5054 return PG_LEVEL_4K; 5055 } 5056 5057 static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) 5058 { 5059 kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); 5060 5061 /* 5062 * If this is a large folio, and the entire 2M range containing the 5063 * PFN is currently shared, then the entire 2M-aligned range can be 5064 * set to private via a single 2M RMP entry. 
5065 */ 5066 if (max_level_for_order(order) > PG_LEVEL_4K && 5067 is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) 5068 return true; 5069 5070 return false; 5071 } 5072 5073 int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) 5074 { 5075 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 5076 kvm_pfn_t pfn_aligned; 5077 gfn_t gfn_aligned; 5078 int level, rc; 5079 bool assigned; 5080 5081 if (!sev_snp_guest(kvm)) 5082 return 0; 5083 5084 rc = snp_lookup_rmpentry(pfn, &assigned, &level); 5085 if (rc) { 5086 pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n", 5087 gfn, pfn, rc); 5088 return -ENOENT; 5089 } 5090 5091 if (assigned) { 5092 pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n", 5093 __func__, gfn, pfn, max_order, level); 5094 return 0; 5095 } 5096 5097 if (is_large_rmp_possible(kvm, pfn, max_order)) { 5098 level = PG_LEVEL_2M; 5099 pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); 5100 gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD); 5101 } else { 5102 level = PG_LEVEL_4K; 5103 pfn_aligned = pfn; 5104 gfn_aligned = gfn; 5105 } 5106 5107 rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false); 5108 if (rc) { 5109 pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n", 5110 gfn, pfn, level, rc); 5111 return -EINVAL; 5112 } 5113 5114 pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n", 5115 __func__, gfn, pfn, pfn_aligned, max_order, level); 5116 5117 return 0; 5118 } 5119 5120 void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) 5121 { 5122 kvm_pfn_t pfn; 5123 5124 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 5125 return; 5126 5127 pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); 5128 5129 for (pfn = start; pfn < end;) { 5130 bool use_2m_update = false; 5131 int rc, rmp_level; 5132 bool assigned; 5133 5134 rc = snp_lookup_rmpentry(pfn, &assigned, 
&rmp_level); 5135 if (rc || !assigned) 5136 goto next_pfn; 5137 5138 use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && 5139 end >= (pfn + PTRS_PER_PMD) && 5140 rmp_level > PG_LEVEL_4K; 5141 5142 /* 5143 * If an unaligned PFN corresponds to a 2M region assigned as a 5144 * large page in the RMP table, PSMASH the region into individual 5145 * 4K RMP entries before attempting to convert a 4K sub-page. 5146 */ 5147 if (!use_2m_update && rmp_level > PG_LEVEL_4K) { 5148 /* 5149 * This shouldn't fail, but if it does, report it, but 5150 * still try to update RMP entry to shared and pray this 5151 * was a spurious error that can be addressed later. 5152 */ 5153 rc = snp_rmptable_psmash(pfn); 5154 WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n", 5155 pfn, rc); 5156 } 5157 5158 rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K); 5159 if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n", 5160 pfn, rc)) 5161 goto next_pfn; 5162 5163 /* 5164 * SEV-ES avoids host/guest cache coherency issues through 5165 * WBNOINVD hooks issued via MMU notifiers during run-time, and 5166 * KVM's VM destroy path at shutdown. Those MMU notifier events 5167 * don't cover gmem since there is no requirement to map pages 5168 * to a HVA in order to use them for a running guest. While the 5169 * shutdown path would still likely cover things for SNP guests, 5170 * userspace may also free gmem pages during run-time via 5171 * hole-punching operations on the guest_memfd, so flush the 5172 * cache entries for these pages before free'ing them back to 5173 * the host. 5174 */ 5175 clflush_cache_range(__va(pfn_to_hpa(pfn)), 5176 use_2m_update ? PMD_SIZE : PAGE_SIZE); 5177 next_pfn: 5178 pfn += use_2m_update ? 
PTRS_PER_PMD : 1; 5179 cond_resched(); 5180 } 5181 } 5182 5183 int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) 5184 { 5185 int level, rc; 5186 bool assigned; 5187 5188 if (!sev_snp_guest(kvm)) 5189 return 0; 5190 5191 rc = snp_lookup_rmpentry(pfn, &assigned, &level); 5192 if (rc || !assigned) 5193 return PG_LEVEL_4K; 5194 5195 return level; 5196 } 5197 5198 struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) 5199 { 5200 struct vcpu_svm *svm = to_svm(vcpu); 5201 struct vmcb_save_area *vmsa; 5202 struct kvm_sev_info *sev; 5203 int error = 0; 5204 int ret; 5205 5206 if (!is_sev_es_guest(vcpu)) 5207 return NULL; 5208 5209 /* 5210 * If the VMSA has not yet been encrypted, return a pointer to the 5211 * current un-encrypted VMSA. 5212 */ 5213 if (!vcpu->arch.guest_state_protected) 5214 return (struct vmcb_save_area *)svm->sev_es.vmsa; 5215 5216 sev = to_kvm_sev_info(vcpu->kvm); 5217 5218 /* Check if the SEV policy allows debugging */ 5219 if (is_sev_snp_guest(vcpu)) { 5220 if (!(sev->policy & SNP_POLICY_MASK_DEBUG)) 5221 return NULL; 5222 } else { 5223 if (sev->policy & SEV_POLICY_MASK_NODBG) 5224 return NULL; 5225 } 5226 5227 if (is_sev_snp_guest(vcpu)) { 5228 struct sev_data_snp_dbg dbg = {0}; 5229 5230 vmsa = snp_alloc_firmware_page(__GFP_ZERO); 5231 if (!vmsa) 5232 return NULL; 5233 5234 dbg.gctx_paddr = __psp_pa(sev->snp_context); 5235 dbg.src_addr = svm->vmcb->control.vmsa_pa; 5236 dbg.dst_addr = __psp_pa(vmsa); 5237 5238 ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); 5239 5240 /* 5241 * Return the target page to a hypervisor page no matter what. 5242 * If this fails, the page can't be used, so leak it and don't 5243 * try to use it. 
5244 */ 5245 if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) 5246 return NULL; 5247 5248 if (ret) { 5249 pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", 5250 ret, error, error); 5251 free_page((unsigned long)vmsa); 5252 5253 return NULL; 5254 } 5255 } else { 5256 struct sev_data_dbg dbg = {0}; 5257 struct page *vmsa_page; 5258 5259 vmsa_page = alloc_page(GFP_KERNEL); 5260 if (!vmsa_page) 5261 return NULL; 5262 5263 vmsa = page_address(vmsa_page); 5264 5265 dbg.handle = sev->handle; 5266 dbg.src_addr = svm->vmcb->control.vmsa_pa; 5267 dbg.dst_addr = __psp_pa(vmsa); 5268 dbg.len = PAGE_SIZE; 5269 5270 ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); 5271 if (ret) { 5272 pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", 5273 ret, error, error); 5274 __free_page(vmsa_page); 5275 5276 return NULL; 5277 } 5278 } 5279 5280 return vmsa; 5281 } 5282 5283 void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) 5284 { 5285 /* If the VMSA has not yet been encrypted, nothing was allocated */ 5286 if (!vcpu->arch.guest_state_protected || !vmsa) 5287 return; 5288 5289 free_page((unsigned long)vmsa); 5290 } 5291