1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * AMD SVM-SEV support 6 * 7 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 8 */ 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/kvm_types.h> 12 #include <linux/kvm_host.h> 13 #include <linux/kernel.h> 14 #include <linux/highmem.h> 15 #include <linux/psp.h> 16 #include <linux/psp-sev.h> 17 #include <linux/pagemap.h> 18 #include <linux/swap.h> 19 #include <linux/misc_cgroup.h> 20 #include <linux/processor.h> 21 #include <linux/trace_events.h> 22 #include <uapi/linux/sev-guest.h> 23 24 #include <asm/pkru.h> 25 #include <asm/trapnr.h> 26 #include <asm/fpu/xcr.h> 27 #include <asm/fpu/xstate.h> 28 #include <asm/debugreg.h> 29 #include <asm/msr.h> 30 #include <asm/sev.h> 31 32 #include "mmu.h" 33 #include "x86.h" 34 #include "svm.h" 35 #include "svm_ops.h" 36 #include "cpuid.h" 37 #include "trace.h" 38 39 #define GHCB_VERSION_MAX 2ULL 40 #define GHCB_VERSION_MIN 1ULL 41 42 #define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) 43 44 /* 45 * The GHCB spec essentially states that all non-zero error codes other than 46 * those explicitly defined above should be treated as an error by the guest. 47 * Define a generic error to cover that case, and choose a value that is not 48 * likely to overlap with new explicit error codes should more be added to 49 * the GHCB spec later. KVM will use this to report generic errors when 50 * handling SNP guest requests. 
51 */ 52 #define SNP_GUEST_VMM_ERR_GENERIC (~0U) 53 54 /* enable/disable SEV support */ 55 static bool __ro_after_init sev_enabled = true; 56 module_param_named(sev, sev_enabled, bool, 0444); 57 58 /* enable/disable SEV-ES support */ 59 static bool __ro_after_init sev_es_enabled = true; 60 module_param_named(sev_es, sev_es_enabled, bool, 0444); 61 62 /* enable/disable SEV-SNP support */ 63 static bool __ro_after_init sev_snp_enabled = true; 64 module_param_named(sev_snp, sev_snp_enabled, bool, 0444); 65 66 static unsigned int __ro_after_init nr_ciphertext_hiding_asids; 67 module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444); 68 69 #define AP_RESET_HOLD_NONE 0 70 #define AP_RESET_HOLD_NAE_EVENT 1 71 #define AP_RESET_HOLD_MSR_PROTO 2 72 73 /* 74 * SEV-SNP policy bits that can be supported by KVM. These include policy bits 75 * that have implementation support within KVM or policy bits that do not 76 * require implementation support within KVM to enforce the policy. 
77 */ 78 #define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ 79 SNP_POLICY_MASK_API_MAJOR | \ 80 SNP_POLICY_MASK_SMT | \ 81 SNP_POLICY_MASK_RSVD_MBO | \ 82 SNP_POLICY_MASK_DEBUG | \ 83 SNP_POLICY_MASK_SINGLE_SOCKET | \ 84 SNP_POLICY_MASK_CXL_ALLOW | \ 85 SNP_POLICY_MASK_MEM_AES_256_XTS | \ 86 SNP_POLICY_MASK_RAPL_DIS | \ 87 SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \ 88 SNP_POLICY_MASK_PAGE_SWAP_DISABLE) 89 90 static u64 snp_supported_policy_bits __ro_after_init; 91 92 static u64 sev_supported_vmsa_features __ro_after_init; 93 94 #define INITIAL_VMSA_GPA 0xFFFFFFFFF000 95 96 static u8 sev_enc_bit; 97 static DECLARE_RWSEM(sev_deactivate_lock); 98 static DEFINE_MUTEX(sev_bitmap_lock); 99 unsigned int max_sev_asid; 100 static unsigned int min_sev_asid; 101 static unsigned int max_sev_es_asid; 102 static unsigned int min_sev_es_asid; 103 static unsigned int max_snp_asid; 104 static unsigned int min_snp_asid; 105 static unsigned long sev_me_mask; 106 static unsigned int nr_asids; 107 static unsigned long *sev_asid_bitmap; 108 static unsigned long *sev_reclaim_asid_bitmap; 109 110 static __always_inline void kvm_lockdep_assert_sev_lock_held(struct kvm *kvm) 111 { 112 #ifdef CONFIG_PROVE_LOCKING 113 /* 114 * Querying SEV+ support is safe if there are no other references, i.e. 115 * if concurrent initialization of SEV+ is impossible. 116 */ 117 if (!refcount_read(&kvm->users_count)) 118 return; 119 120 /* 121 * Querying SEV+ support from vCPU context is always safe, as vCPUs can 122 * only be created after SEV+ is initialized (and KVM disallows all SEV 123 * sub-ioctls while vCPU creation is in-progress). 
124 */ 125 if (kvm_get_running_vcpu()) 126 return; 127 128 lockdep_assert_held(&kvm->lock); 129 #endif 130 } 131 132 static bool sev_guest(struct kvm *kvm) 133 { 134 kvm_lockdep_assert_sev_lock_held(kvm); 135 return ____sev_guest(kvm); 136 } 137 static bool sev_es_guest(struct kvm *kvm) 138 { 139 kvm_lockdep_assert_sev_lock_held(kvm); 140 return ____sev_es_guest(kvm); 141 } 142 143 static bool sev_snp_guest(struct kvm *kvm) 144 { 145 kvm_lockdep_assert_sev_lock_held(kvm); 146 return ____sev_snp_guest(kvm); 147 } 148 149 static int snp_decommission_context(struct kvm *kvm); 150 151 struct enc_region { 152 struct list_head list; 153 unsigned long npages; 154 struct page **pages; 155 unsigned long uaddr; 156 unsigned long size; 157 }; 158 159 /* Called with the sev_bitmap_lock held, or on shutdown */ 160 static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) 161 { 162 int ret, error = 0; 163 unsigned int asid; 164 165 /* Check if there are any ASIDs to reclaim before performing a flush */ 166 asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); 167 if (asid > max_asid) 168 return -EBUSY; 169 170 /* 171 * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail, 172 * so it must be guarded. 173 */ 174 down_write(&sev_deactivate_lock); 175 176 /* SNP firmware requires use of WBINVD for ASID recycling. */ 177 wbinvd_on_all_cpus(); 178 179 if (sev_snp_enabled) 180 ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error); 181 else 182 ret = sev_guest_df_flush(&error); 183 184 up_write(&sev_deactivate_lock); 185 186 if (ret) 187 pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n", 188 sev_snp_enabled ? 
"-SNP" : "", ret, error); 189 190 return ret; 191 } 192 193 static inline bool is_mirroring_enc_context(struct kvm *kvm) 194 { 195 return !!to_kvm_sev_info(kvm)->enc_context_owner; 196 } 197 198 static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) 199 { 200 struct kvm_vcpu *vcpu = &svm->vcpu; 201 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 202 203 return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; 204 } 205 206 static bool snp_is_secure_tsc_enabled(struct kvm *kvm) 207 { 208 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 209 210 return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) && 211 !WARN_ON_ONCE(!sev_snp_guest(kvm)); 212 } 213 214 /* Must be called with the sev_bitmap_lock held */ 215 static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) 216 { 217 if (sev_flush_asids(min_asid, max_asid)) 218 return false; 219 220 /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */ 221 bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, 222 nr_asids); 223 bitmap_zero(sev_reclaim_asid_bitmap, nr_asids); 224 225 return true; 226 } 227 228 static int sev_misc_cg_try_charge(struct kvm_sev_info *sev) 229 { 230 enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; 231 return misc_cg_try_charge(type, sev->misc_cg, 1); 232 } 233 234 static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) 235 { 236 enum misc_res_type type = sev->es_active ? 
MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; 237 misc_cg_uncharge(type, sev->misc_cg, 1); 238 } 239 240 static unsigned int sev_alloc_asid(unsigned int min_asid, unsigned int max_asid) 241 { 242 unsigned int asid; 243 bool retry = true; 244 245 guard(mutex)(&sev_bitmap_lock); 246 247 again: 248 asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); 249 if (asid > max_asid) { 250 if (retry && __sev_recycle_asids(min_asid, max_asid)) { 251 retry = false; 252 goto again; 253 } 254 255 return asid; 256 } 257 258 __set_bit(asid, sev_asid_bitmap); 259 return asid; 260 } 261 262 static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type) 263 { 264 /* 265 * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. 266 * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. 267 */ 268 unsigned int min_asid, max_asid, asid; 269 int ret; 270 271 if (vm_type == KVM_X86_SNP_VM) { 272 min_asid = min_snp_asid; 273 max_asid = max_snp_asid; 274 } else if (sev->es_active) { 275 min_asid = min_sev_es_asid; 276 max_asid = max_sev_es_asid; 277 } else { 278 min_asid = min_sev_asid; 279 max_asid = max_sev_asid; 280 } 281 282 /* 283 * The min ASID can end up larger than the max if basic SEV support is 284 * effectively disabled by disallowing use of ASIDs for SEV guests. 285 * Similarly for SEV-ES guests the min ASID can end up larger than the 286 * max when ciphertext hiding is enabled, effectively disabling SEV-ES 287 * support. 
288 */ 289 if (min_asid > max_asid) 290 return -ENOTTY; 291 292 WARN_ON_ONCE(sev->misc_cg); 293 sev->misc_cg = get_current_misc_cg(); 294 ret = sev_misc_cg_try_charge(sev); 295 if (ret) 296 goto e_put_cg; 297 298 asid = sev_alloc_asid(min_asid, max_asid); 299 if (asid > max_asid) { 300 ret = -EBUSY; 301 goto e_uncharge; 302 } 303 304 sev->asid = asid; 305 return 0; 306 307 e_uncharge: 308 sev_misc_cg_uncharge(sev); 309 e_put_cg: 310 put_misc_cg(sev->misc_cg); 311 sev->misc_cg = NULL; 312 return ret; 313 } 314 315 static unsigned int sev_get_asid(struct kvm *kvm) 316 { 317 return to_kvm_sev_info(kvm)->asid; 318 } 319 320 static void sev_asid_free(struct kvm_sev_info *sev) 321 { 322 struct svm_cpu_data *sd; 323 int cpu; 324 325 mutex_lock(&sev_bitmap_lock); 326 327 __set_bit(sev->asid, sev_reclaim_asid_bitmap); 328 329 for_each_possible_cpu(cpu) { 330 sd = per_cpu_ptr(&svm_data, cpu); 331 sd->sev_vmcbs[sev->asid] = NULL; 332 } 333 334 mutex_unlock(&sev_bitmap_lock); 335 336 sev_misc_cg_uncharge(sev); 337 put_misc_cg(sev->misc_cg); 338 sev->misc_cg = NULL; 339 } 340 341 static void sev_decommission(unsigned int handle) 342 { 343 struct sev_data_decommission decommission; 344 345 if (!handle) 346 return; 347 348 decommission.handle = handle; 349 sev_guest_decommission(&decommission, NULL); 350 } 351 352 /* 353 * Transition a page to hypervisor-owned/shared state in the RMP table. This 354 * should not fail under normal conditions, but leak the page should that 355 * happen since it will no longer be usable by the host due to RMP protections. 
356 */ 357 static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level) 358 { 359 if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) { 360 snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); 361 return -EIO; 362 } 363 364 return 0; 365 } 366 367 /* 368 * Certain page-states, such as Pre-Guest and Firmware pages (as documented 369 * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be 370 * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE 371 * unless they are reclaimed first. 372 * 373 * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they 374 * might not be usable by the host due to being set as immutable or still 375 * being associated with a guest ASID. 376 * 377 * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be 378 * converted back to shared, as the page is no longer usable due to RMP 379 * protections, and it's infeasible for the guest to continue on. 380 */ 381 static int snp_page_reclaim(struct kvm *kvm, u64 pfn) 382 { 383 struct sev_data_snp_page_reclaim data = {0}; 384 int fw_err, rc; 385 386 data.paddr = __sme_set(pfn << PAGE_SHIFT); 387 rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err); 388 if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) { 389 snp_leak_pages(pfn, 1); 390 return -EIO; 391 } 392 393 if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K)) 394 return -EIO; 395 396 return rc; 397 } 398 399 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) 400 { 401 struct sev_data_deactivate deactivate; 402 403 if (!handle) 404 return; 405 406 deactivate.handle = handle; 407 408 /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */ 409 down_read(&sev_deactivate_lock); 410 sev_guest_deactivate(&deactivate, NULL); 411 up_read(&sev_deactivate_lock); 412 413 sev_decommission(handle); 414 } 415 416 /* 417 * This sets up bounce buffers/firmware pages to handle SNP Guest Request 418 
 * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
 * 2.0 specification for more details.
 *
 * Technically, when an SNP Guest Request is issued, the guest will provide its
 * own request/response pages, which could in theory be passed along directly
 * to firmware rather than using bounce pages. However, these pages would need
 * special care:
 *
 *   - Both pages are from shared guest memory, so they need to be protected
 *     from migration/etc. occurring while firmware reads/writes to them. At a
 *     minimum, this requires elevating the ref counts and potentially needing
 *     an explicit pinning of the memory. This places additional restrictions
 *     on what type of memory backends userspace can use for shared guest
 *     memory since there is some reliance on using refcounted pages.
 *
 *   - The response page needs to be switched to Firmware-owned[1] state
 *     before the firmware can write to it, which can lead to potential
 *     host RMP #PFs if the guest misbehaves and hands the host a
 *     guest page that KVM might write to for other reasons (e.g. virtio
 *     buffers/etc.).
 *
 * Both of these issues can be avoided completely by using separately-allocated
 * bounce pages for both the request/response pages and passing those to
 * firmware instead. So that's what is being set up here.
 *
 * Guest requests rely on message sequence numbers to ensure requests are
 * issued to firmware in the order the guest issues them, so concurrent guest
 * requests generally shouldn't happen. But a misbehaving guest could issue
 * concurrent guest requests in theory, so a mutex is used to serialize
 * access to the bounce buffers.
 *
 * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
 *     details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
 *     in the APM for details on the related RMP restrictions.
452 */ 453 static int snp_guest_req_init(struct kvm *kvm) 454 { 455 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 456 struct page *req_page; 457 458 req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 459 if (!req_page) 460 return -ENOMEM; 461 462 sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 463 if (!sev->guest_resp_buf) { 464 __free_page(req_page); 465 return -EIO; 466 } 467 468 sev->guest_req_buf = page_address(req_page); 469 mutex_init(&sev->guest_req_mutex); 470 471 return 0; 472 } 473 474 static void snp_guest_req_cleanup(struct kvm *kvm) 475 { 476 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 477 478 if (sev->guest_resp_buf) 479 snp_free_firmware_page(sev->guest_resp_buf); 480 481 if (sev->guest_req_buf) 482 __free_page(virt_to_page(sev->guest_req_buf)); 483 484 sev->guest_req_buf = NULL; 485 sev->guest_resp_buf = NULL; 486 } 487 488 static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, 489 struct kvm_sev_init *data, 490 unsigned long vm_type) 491 { 492 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 493 struct sev_platform_init_args init_args = {0}; 494 bool es_active = vm_type != KVM_X86_SEV_VM; 495 bool snp_active = vm_type == KVM_X86_SNP_VM; 496 u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; 497 int ret; 498 499 if (kvm->created_vcpus) 500 return -EINVAL; 501 502 if (data->flags) 503 return -EINVAL; 504 505 if (!snp_active) 506 valid_vmsa_features &= ~SVM_SEV_FEAT_SECURE_TSC; 507 508 if (data->vmsa_features & ~valid_vmsa_features) 509 return -EINVAL; 510 511 if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) 512 return -EINVAL; 513 514 /* 515 * KVM supports the full range of mandatory features defined by version 516 * 2 of the GHCB protocol, so default to that for SEV-ES guests created 517 * via KVM_SEV_INIT2 (KVM_SEV_INIT forces version 1). 
518 */ 519 if (es_active && !data->ghcb_version) 520 data->ghcb_version = 2; 521 522 if (snp_active && data->ghcb_version < 2) 523 return -EINVAL; 524 525 if (unlikely(sev->active)) 526 return -EINVAL; 527 528 sev->active = true; 529 sev->es_active = es_active; 530 sev->vmsa_features = data->vmsa_features; 531 sev->ghcb_version = data->ghcb_version; 532 533 if (snp_active) 534 sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; 535 536 ret = sev_asid_new(sev, vm_type); 537 if (ret) 538 goto e_no_asid; 539 540 init_args.probe = false; 541 ret = sev_platform_init(&init_args); 542 if (ret) 543 goto e_free_asid; 544 545 if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 546 ret = -ENOMEM; 547 goto e_free_asid; 548 } 549 550 /* This needs to happen after SEV/SNP firmware initialization. */ 551 if (snp_active) { 552 ret = snp_guest_req_init(kvm); 553 if (ret) 554 goto e_free; 555 } 556 557 INIT_LIST_HEAD(&sev->regions_list); 558 INIT_LIST_HEAD(&sev->mirror_vms); 559 sev->need_init = false; 560 561 kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV); 562 563 return 0; 564 565 e_free: 566 free_cpumask_var(sev->have_run_cpus); 567 e_free_asid: 568 argp->error = init_args.error; 569 sev_asid_free(sev); 570 sev->asid = 0; 571 e_no_asid: 572 sev->vmsa_features = 0; 573 sev->es_active = false; 574 sev->active = false; 575 return ret; 576 } 577 578 static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) 579 { 580 struct kvm_sev_init data = { 581 .vmsa_features = 0, 582 .ghcb_version = 0, 583 }; 584 unsigned long vm_type; 585 586 if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM) 587 return -EINVAL; 588 589 vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM); 590 591 /* 592 * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will 593 * continue to only ever support the minimal GHCB protocol version. 
594 */ 595 if (vm_type == KVM_X86_SEV_ES_VM) 596 data.ghcb_version = GHCB_VERSION_MIN; 597 598 return __sev_guest_init(kvm, argp, &data, vm_type); 599 } 600 601 static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) 602 { 603 struct kvm_sev_init data; 604 605 if (!to_kvm_sev_info(kvm)->need_init) 606 return -EINVAL; 607 608 if (kvm->arch.vm_type != KVM_X86_SEV_VM && 609 kvm->arch.vm_type != KVM_X86_SEV_ES_VM && 610 kvm->arch.vm_type != KVM_X86_SNP_VM) 611 return -EINVAL; 612 613 if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) 614 return -EFAULT; 615 616 return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type); 617 } 618 619 static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) 620 { 621 unsigned int asid = sev_get_asid(kvm); 622 struct sev_data_activate activate; 623 int ret; 624 625 /* activate ASID on the given handle */ 626 activate.handle = handle; 627 activate.asid = asid; 628 ret = sev_guest_activate(&activate, error); 629 630 return ret; 631 } 632 633 static int __sev_issue_cmd(int fd, int id, void *data, int *error) 634 { 635 CLASS(fd, f)(fd); 636 637 if (fd_empty(f)) 638 return -EBADF; 639 640 return sev_issue_cmd_external_user(fd_file(f), id, data, error); 641 } 642 643 static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) 644 { 645 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 646 647 return __sev_issue_cmd(sev->fd, id, data, error); 648 } 649 650 static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 651 { 652 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 653 struct sev_data_launch_start start; 654 struct kvm_sev_launch_start params; 655 void *dh_blob, *session_blob; 656 int *error = &argp->error; 657 int ret; 658 659 if (!sev_guest(kvm)) 660 return -ENOTTY; 661 662 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 663 return -EFAULT; 664 665 memset(&start, 0, sizeof(start)); 666 667 dh_blob = NULL; 668 if (params.dh_uaddr) { 669 
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len); 670 if (IS_ERR(dh_blob)) 671 return PTR_ERR(dh_blob); 672 673 start.dh_cert_address = __sme_set(__pa(dh_blob)); 674 start.dh_cert_len = params.dh_len; 675 } 676 677 session_blob = NULL; 678 if (params.session_uaddr) { 679 session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len); 680 if (IS_ERR(session_blob)) { 681 ret = PTR_ERR(session_blob); 682 goto e_free_dh; 683 } 684 685 start.session_address = __sme_set(__pa(session_blob)); 686 start.session_len = params.session_len; 687 } 688 689 start.handle = params.handle; 690 start.policy = params.policy; 691 692 /* create memory encryption context */ 693 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error); 694 if (ret) 695 goto e_free_session; 696 697 /* Bind ASID to this guest */ 698 ret = sev_bind_asid(kvm, start.handle, error); 699 if (ret) { 700 sev_decommission(start.handle); 701 goto e_free_session; 702 } 703 704 /* return handle to userspace */ 705 params.handle = start.handle; 706 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) { 707 sev_unbind_asid(kvm, start.handle); 708 ret = -EFAULT; 709 goto e_free_session; 710 } 711 712 sev->policy = params.policy; 713 sev->handle = start.handle; 714 sev->fd = argp->sev_fd; 715 716 e_free_session: 717 kfree(session_blob); 718 e_free_dh: 719 kfree(dh_blob); 720 return ret; 721 } 722 723 static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, 724 unsigned long ulen, unsigned long *n, 725 unsigned int flags) 726 { 727 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 728 unsigned long npages, total_npages, lock_limit; 729 struct page **pages; 730 int npinned, ret; 731 732 lockdep_assert_held(&kvm->lock); 733 734 if (ulen == 0 || uaddr + ulen < uaddr) 735 return ERR_PTR(-EINVAL); 736 737 /* 738 * Calculate the number of pages that need to be pinned to cover the 739 * entire range. Note! 
This isn't simply PFN_DOWN(ulen), as KVM 740 * doesn't require the incoming address+size to be page aligned! 741 */ 742 npages = PFN_DOWN(uaddr + ulen - 1) - PFN_DOWN(uaddr) + 1; 743 if (npages > INT_MAX) 744 return ERR_PTR(-EINVAL); 745 746 total_npages = sev->pages_locked + npages; 747 if (total_npages > totalram_pages()) 748 return ERR_PTR(-EINVAL); 749 750 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 751 if (total_npages > lock_limit && !capable(CAP_IPC_LOCK)) { 752 pr_err("SEV: %lu total pages would exceed the lock limit of %lu.\n", 753 total_npages, lock_limit); 754 return ERR_PTR(-ENOMEM); 755 } 756 757 /* 758 * Don't WARN if the kernel (rightly) thinks the total size is absurd, 759 * i.e. rely on the kernel to reject outrageous range sizes. The above 760 * check on the number of pages is purely to avoid truncation as 761 * pin_user_pages_fast() takes the number of pages as a 32-bit int. 762 */ 763 pages = kvzalloc_objs(*pages, npages, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 764 if (!pages) 765 return ERR_PTR(-ENOMEM); 766 767 /* Pin the user virtual address. 
*/ 768 npinned = pin_user_pages_fast(uaddr, npages, flags, pages); 769 if (npinned != npages) { 770 pr_err("SEV: Failure locking %lu pages.\n", npages); 771 ret = -ENOMEM; 772 goto err; 773 } 774 775 *n = npages; 776 sev->pages_locked = total_npages; 777 778 return pages; 779 780 err: 781 if (npinned > 0) 782 unpin_user_pages(pages, npinned); 783 784 kvfree(pages); 785 return ERR_PTR(ret); 786 } 787 788 static void sev_unpin_memory(struct kvm *kvm, struct page **pages, 789 unsigned long npages) 790 { 791 unpin_user_pages(pages, npages); 792 kvfree(pages); 793 to_kvm_sev_info(kvm)->pages_locked -= npages; 794 } 795 796 static void sev_clflush_pages(struct page *pages[], unsigned long npages) 797 { 798 uint8_t *page_virtual; 799 unsigned long i; 800 801 if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 || 802 pages == NULL) 803 return; 804 805 for (i = 0; i < npages; i++) { 806 page_virtual = kmap_local_page(pages[i]); 807 clflush_cache_range(page_virtual, PAGE_SIZE); 808 kunmap_local(page_virtual); 809 cond_resched(); 810 } 811 } 812 813 static void sev_writeback_caches(struct kvm *kvm) 814 { 815 /* 816 * Ensure that all dirty guest tagged cache entries are written back 817 * before releasing the pages back to the system for use. CLFLUSH will 818 * not do this without SME_COHERENT, and flushing many cache lines 819 * individually is slower than blasting WBINVD for large VMs, so issue 820 * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported) 821 * on CPUs that have done VMRUN, i.e. may have dirtied data using the 822 * VM's ASID. 823 * 824 * For simplicity, never remove CPUs from the bitmap. Ideally, KVM 825 * would clear the mask when flushing caches, but doing so requires 826 * serializing multiple calls and having responding CPUs (to the IPI) 827 * mark themselves as still running if they are running (or about to 828 * run) a vCPU for the VM. 
829 * 830 * Note, the caller is responsible for ensuring correctness if the mask 831 * can be modified, e.g. if a CPU could be doing VMRUN. 832 */ 833 wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus); 834 } 835 836 static unsigned long get_num_contig_pages(unsigned long idx, 837 struct page **inpages, unsigned long npages) 838 { 839 unsigned long paddr, next_paddr; 840 unsigned long i = idx + 1, pages = 1; 841 842 /* find the number of contiguous pages starting from idx */ 843 paddr = __sme_page_pa(inpages[idx]); 844 while (i < npages) { 845 next_paddr = __sme_page_pa(inpages[i++]); 846 if ((paddr + PAGE_SIZE) == next_paddr) { 847 pages++; 848 paddr = next_paddr; 849 continue; 850 } 851 break; 852 } 853 854 return pages; 855 } 856 857 static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 858 { 859 unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; 860 struct kvm_sev_launch_update_data params; 861 struct sev_data_launch_update_data data; 862 struct page **inpages; 863 int ret; 864 865 if (!sev_guest(kvm)) 866 return -ENOTTY; 867 868 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 869 return -EFAULT; 870 871 vaddr = params.uaddr; 872 size = params.len; 873 vaddr_end = vaddr + size; 874 875 /* Lock the user memory. */ 876 inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); 877 if (IS_ERR(inpages)) 878 return PTR_ERR(inpages); 879 880 /* 881 * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in 882 * place; the cache may contain the data that was written unencrypted. 883 */ 884 sev_clflush_pages(inpages, npages); 885 886 data.reserved = 0; 887 data.handle = to_kvm_sev_info(kvm)->handle; 888 889 for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { 890 int offset, len; 891 892 /* 893 * If the user buffer is not page-aligned, calculate the offset 894 * within the page. 
895 */ 896 offset = vaddr & (PAGE_SIZE - 1); 897 898 /* Calculate the number of pages that can be encrypted in one go. */ 899 pages = get_num_contig_pages(i, inpages, npages); 900 901 len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size); 902 903 data.len = len; 904 data.address = __sme_page_pa(inpages[i]) + offset; 905 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error); 906 if (ret) 907 goto e_unpin; 908 909 size -= len; 910 next_vaddr = vaddr + len; 911 } 912 913 e_unpin: 914 /* content of memory is updated, mark pages dirty */ 915 for (i = 0; i < npages; i++) { 916 set_page_dirty_lock(inpages[i]); 917 mark_page_accessed(inpages[i]); 918 } 919 /* unlock the user pages */ 920 sev_unpin_memory(kvm, inpages, npages); 921 return ret; 922 } 923 924 static int sev_es_sync_vmsa(struct vcpu_svm *svm) 925 { 926 struct kvm_vcpu *vcpu = &svm->vcpu; 927 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 928 struct sev_es_save_area *save = svm->sev_es.vmsa; 929 struct xregs_state *xsave; 930 const u8 *s; 931 u8 *d; 932 int i; 933 934 lockdep_assert_held(&vcpu->mutex); 935 936 if (vcpu->arch.guest_state_protected) 937 return -EINVAL; 938 939 /* Check some debug related fields before encrypting the VMSA */ 940 if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) 941 return -EINVAL; 942 943 /* 944 * SEV-ES will use a VMSA that is pointed to by the VMCB, not 945 * the traditional VMSA that is part of the VMCB. Copy the 946 * traditional VMSA as it has been built so far (in prep 947 * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. 
948 */ 949 memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save)); 950 951 /* Sync registgers */ 952 save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX]; 953 save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX]; 954 save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 955 save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX]; 956 save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP]; 957 save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP]; 958 save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI]; 959 save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI]; 960 #ifdef CONFIG_X86_64 961 save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8]; 962 save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9]; 963 save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10]; 964 save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11]; 965 save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12]; 966 save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13]; 967 save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14]; 968 save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15]; 969 #endif 970 save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP]; 971 972 /* Sync some non-GPR registers before encrypting */ 973 save->xcr0 = svm->vcpu.arch.xcr0; 974 save->pkru = svm->vcpu.arch.pkru; 975 save->xss = svm->vcpu.arch.ia32_xss; 976 save->dr6 = svm->vcpu.arch.dr6; 977 978 save->sev_features = sev->vmsa_features; 979 980 /* 981 * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid 982 * breaking older measurements. 983 */ 984 if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) { 985 xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave; 986 save->x87_dp = xsave->i387.rdp; 987 save->mxcsr = xsave->i387.mxcsr; 988 save->x87_ftw = xsave->i387.twd; 989 save->x87_fsw = xsave->i387.swd; 990 save->x87_fcw = xsave->i387.cwd; 991 save->x87_fop = xsave->i387.fop; 992 save->x87_ds = 0; 993 save->x87_cs = 0; 994 save->x87_rip = xsave->i387.rip; 995 996 for (i = 0; i < 8; i++) { 997 /* 998 * The format of the x87 save area is undocumented and 999 * definitely not what you would expect. 
It consists of 1000 * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes 1001 * area with bytes 8-9 of each register. 1002 */ 1003 d = save->fpreg_x87 + i * 8; 1004 s = ((u8 *)xsave->i387.st_space) + i * 16; 1005 memcpy(d, s, 8); 1006 save->fpreg_x87[64 + i * 2] = s[8]; 1007 save->fpreg_x87[64 + i * 2 + 1] = s[9]; 1008 } 1009 memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256); 1010 1011 s = get_xsave_addr(xsave, XFEATURE_YMM); 1012 if (s) 1013 memcpy(save->fpreg_ymm, s, 256); 1014 else 1015 memset(save->fpreg_ymm, 0, 256); 1016 } 1017 1018 pr_debug("Virtual Machine Save Area (VMSA):\n"); 1019 print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); 1020 1021 return 0; 1022 } 1023 1024 static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, 1025 int *error) 1026 { 1027 struct sev_data_launch_update_vmsa vmsa; 1028 struct vcpu_svm *svm = to_svm(vcpu); 1029 int ret; 1030 1031 if (vcpu->guest_debug) { 1032 pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported"); 1033 return -EINVAL; 1034 } 1035 1036 /* Perform some pre-encryption checks against the VMSA */ 1037 ret = sev_es_sync_vmsa(svm); 1038 if (ret) 1039 return ret; 1040 1041 /* 1042 * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of 1043 * the VMSA memory content (i.e it will write the same memory region 1044 * with the guest's key), so invalidate it first. 1045 */ 1046 clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); 1047 1048 vmsa.reserved = 0; 1049 vmsa.handle = to_kvm_sev_info(kvm)->handle; 1050 vmsa.address = __sme_pa(svm->sev_es.vmsa); 1051 vmsa.len = PAGE_SIZE; 1052 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); 1053 if (ret) 1054 return ret; 1055 1056 /* 1057 * SEV-ES guests maintain an encrypted version of their FPU 1058 * state which is restored and saved on VMRUN and VMEXIT. 1059 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't 1060 * do xsave/xrstor on it. 
1061 */ 1062 fpstate_set_confidential(&vcpu->arch.guest_fpu); 1063 vcpu->arch.guest_state_protected = true; 1064 1065 /* 1066 * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it 1067 * only after setting guest_state_protected because KVM_SET_MSRS allows 1068 * dynamic toggling of LBRV (for performance reason) on write access to 1069 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. 1070 */ 1071 svm_enable_lbrv(vcpu); 1072 return 0; 1073 } 1074 1075 static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) 1076 { 1077 struct kvm_vcpu *vcpu; 1078 unsigned long i; 1079 int ret; 1080 1081 if (!sev_es_guest(kvm)) 1082 return -ENOTTY; 1083 1084 if (kvm_is_vcpu_creation_in_progress(kvm)) 1085 return -EBUSY; 1086 1087 ret = kvm_lock_all_vcpus(kvm); 1088 if (ret) 1089 return ret; 1090 1091 kvm_for_each_vcpu(i, vcpu, kvm) { 1092 ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error); 1093 if (ret) 1094 break; 1095 } 1096 1097 kvm_unlock_all_vcpus(kvm); 1098 return ret; 1099 } 1100 1101 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) 1102 { 1103 void __user *measure = u64_to_user_ptr(argp->data); 1104 struct sev_data_launch_measure data; 1105 struct kvm_sev_launch_measure params; 1106 void __user *p = NULL; 1107 void *blob = NULL; 1108 int ret; 1109 1110 if (!sev_guest(kvm)) 1111 return -ENOTTY; 1112 1113 if (copy_from_user(¶ms, measure, sizeof(params))) 1114 return -EFAULT; 1115 1116 memset(&data, 0, sizeof(data)); 1117 1118 /* User wants to query the blob length */ 1119 if (!params.len) 1120 goto cmd; 1121 1122 p = u64_to_user_ptr(params.uaddr); 1123 if (p) { 1124 if (params.len > SEV_FW_BLOB_MAX_SIZE) 1125 return -EINVAL; 1126 1127 blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT); 1128 if (!blob) 1129 return -ENOMEM; 1130 1131 data.address = __psp_pa(blob); 1132 data.len = params.len; 1133 } 1134 1135 cmd: 1136 data.handle = to_kvm_sev_info(kvm)->handle; 1137 ret = sev_issue_cmd(kvm, 
SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); 1138 1139 /* 1140 * If we query the session length, FW responded with expected data. 1141 */ 1142 if (!params.len) 1143 goto done; 1144 1145 if (ret) 1146 goto e_free_blob; 1147 1148 if (blob) { 1149 if (copy_to_user(p, blob, params.len)) 1150 ret = -EFAULT; 1151 } 1152 1153 done: 1154 params.len = data.len; 1155 if (copy_to_user(measure, ¶ms, sizeof(params))) 1156 ret = -EFAULT; 1157 e_free_blob: 1158 kfree(blob); 1159 return ret; 1160 } 1161 1162 static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1163 { 1164 struct sev_data_launch_finish data; 1165 1166 if (!sev_guest(kvm)) 1167 return -ENOTTY; 1168 1169 data.handle = to_kvm_sev_info(kvm)->handle; 1170 return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); 1171 } 1172 1173 static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) 1174 { 1175 struct kvm_sev_guest_status params; 1176 struct sev_data_guest_status data; 1177 int ret; 1178 1179 if (!sev_guest(kvm)) 1180 return -ENOTTY; 1181 1182 memset(&data, 0, sizeof(data)); 1183 1184 data.handle = to_kvm_sev_info(kvm)->handle; 1185 ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); 1186 if (ret) 1187 return ret; 1188 1189 params.policy = data.policy; 1190 params.state = data.state; 1191 params.handle = data.handle; 1192 1193 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) 1194 ret = -EFAULT; 1195 1196 return ret; 1197 } 1198 1199 static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, 1200 unsigned long dst, int size, 1201 int *error, bool enc) 1202 { 1203 struct sev_data_dbg data; 1204 1205 data.reserved = 0; 1206 data.handle = to_kvm_sev_info(kvm)->handle; 1207 data.dst_addr = dst; 1208 data.src_addr = src; 1209 data.len = size; 1210 1211 return sev_issue_cmd(kvm, 1212 enc ? 
SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT, 1213 &data, error); 1214 } 1215 1216 static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr, 1217 unsigned long dst_paddr, int sz, int *err) 1218 { 1219 int offset; 1220 1221 /* 1222 * Its safe to read more than we are asked, caller should ensure that 1223 * destination has enough space. 1224 */ 1225 offset = src_paddr & 15; 1226 src_paddr = round_down(src_paddr, 16); 1227 sz = round_up(sz + offset, 16); 1228 1229 return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false); 1230 } 1231 1232 static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, 1233 void __user *dst_uaddr, 1234 unsigned long dst_paddr, 1235 int size, int *err) 1236 { 1237 struct page *tpage = NULL; 1238 int ret, offset; 1239 1240 /* if inputs are not 16-byte then use intermediate buffer */ 1241 if (!IS_ALIGNED(dst_paddr, 16) || 1242 !IS_ALIGNED(paddr, 16) || 1243 !IS_ALIGNED(size, 16)) { 1244 tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 1245 if (!tpage) 1246 return -ENOMEM; 1247 1248 dst_paddr = __sme_page_pa(tpage); 1249 } 1250 1251 ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err); 1252 if (ret) 1253 goto e_free; 1254 1255 if (tpage) { 1256 offset = paddr & 15; 1257 if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size)) 1258 ret = -EFAULT; 1259 } 1260 1261 e_free: 1262 if (tpage) 1263 __free_page(tpage); 1264 1265 return ret; 1266 } 1267 1268 static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, 1269 void __user *vaddr, 1270 unsigned long dst_paddr, 1271 void __user *dst_vaddr, 1272 int size, int *error) 1273 { 1274 struct page *src_tpage = NULL; 1275 struct page *dst_tpage = NULL; 1276 int ret, len = size; 1277 1278 /* If source buffer is not aligned then use an intermediate buffer */ 1279 if (!IS_ALIGNED((unsigned long)vaddr, 16)) { 1280 src_tpage = alloc_page(GFP_KERNEL_ACCOUNT); 1281 if (!src_tpage) 1282 return -ENOMEM; 1283 1284 if 
(copy_from_user(page_address(src_tpage), vaddr, size)) { 1285 __free_page(src_tpage); 1286 return -EFAULT; 1287 } 1288 1289 paddr = __sme_page_pa(src_tpage); 1290 } 1291 1292 /* 1293 * If destination buffer or length is not aligned then do read-modify-write: 1294 * - decrypt destination in an intermediate buffer 1295 * - copy the source buffer in an intermediate buffer 1296 * - use the intermediate buffer as source buffer 1297 */ 1298 if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { 1299 int dst_offset; 1300 1301 dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT); 1302 if (!dst_tpage) { 1303 ret = -ENOMEM; 1304 goto e_free; 1305 } 1306 1307 ret = __sev_dbg_decrypt(kvm, dst_paddr, 1308 __sme_page_pa(dst_tpage), size, error); 1309 if (ret) 1310 goto e_free; 1311 1312 /* 1313 * If source is kernel buffer then use memcpy() otherwise 1314 * copy_from_user(). 1315 */ 1316 dst_offset = dst_paddr & 15; 1317 1318 if (src_tpage) 1319 memcpy(page_address(dst_tpage) + dst_offset, 1320 page_address(src_tpage), size); 1321 else { 1322 if (copy_from_user(page_address(dst_tpage) + dst_offset, 1323 vaddr, size)) { 1324 ret = -EFAULT; 1325 goto e_free; 1326 } 1327 } 1328 1329 paddr = __sme_page_pa(dst_tpage); 1330 dst_paddr = round_down(dst_paddr, 16); 1331 len = round_up(size, 16); 1332 } 1333 1334 ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true); 1335 1336 e_free: 1337 if (src_tpage) 1338 __free_page(src_tpage); 1339 if (dst_tpage) 1340 __free_page(dst_tpage); 1341 return ret; 1342 } 1343 1344 static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) 1345 { 1346 unsigned long vaddr, vaddr_end, next_vaddr; 1347 unsigned long dst_vaddr; 1348 struct page **src_p, **dst_p; 1349 struct kvm_sev_dbg debug; 1350 unsigned long n; 1351 unsigned int size; 1352 int ret; 1353 1354 if (!sev_guest(kvm)) 1355 return -ENOTTY; 1356 1357 if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug))) 1358 return -EFAULT; 1359 1360 if 
(!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr) 1361 return -EINVAL; 1362 if (!debug.dst_uaddr) 1363 return -EINVAL; 1364 1365 vaddr = debug.src_uaddr; 1366 size = debug.len; 1367 vaddr_end = vaddr + size; 1368 dst_vaddr = debug.dst_uaddr; 1369 1370 for (; vaddr < vaddr_end; vaddr = next_vaddr) { 1371 int len, s_off, d_off; 1372 1373 /* lock userspace source and destination page */ 1374 src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0); 1375 if (IS_ERR(src_p)) 1376 return PTR_ERR(src_p); 1377 1378 dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE); 1379 if (IS_ERR(dst_p)) { 1380 sev_unpin_memory(kvm, src_p, n); 1381 return PTR_ERR(dst_p); 1382 } 1383 1384 /* 1385 * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify 1386 * the pages; flush the destination too so that future accesses do not 1387 * see stale data. 1388 */ 1389 sev_clflush_pages(src_p, 1); 1390 sev_clflush_pages(dst_p, 1); 1391 1392 /* 1393 * Since user buffer may not be page aligned, calculate the 1394 * offset within the page. 
1395 */ 1396 s_off = vaddr & ~PAGE_MASK; 1397 d_off = dst_vaddr & ~PAGE_MASK; 1398 len = min_t(size_t, (PAGE_SIZE - s_off), size); 1399 1400 if (dec) 1401 ret = __sev_dbg_decrypt_user(kvm, 1402 __sme_page_pa(src_p[0]) + s_off, 1403 (void __user *)dst_vaddr, 1404 __sme_page_pa(dst_p[0]) + d_off, 1405 len, &argp->error); 1406 else 1407 ret = __sev_dbg_encrypt_user(kvm, 1408 __sme_page_pa(src_p[0]) + s_off, 1409 (void __user *)vaddr, 1410 __sme_page_pa(dst_p[0]) + d_off, 1411 (void __user *)dst_vaddr, 1412 len, &argp->error); 1413 1414 sev_unpin_memory(kvm, src_p, n); 1415 sev_unpin_memory(kvm, dst_p, n); 1416 1417 if (ret) 1418 goto err; 1419 1420 next_vaddr = vaddr + len; 1421 dst_vaddr = dst_vaddr + len; 1422 size -= len; 1423 } 1424 err: 1425 return ret; 1426 } 1427 1428 static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) 1429 { 1430 struct sev_data_launch_secret data; 1431 struct kvm_sev_launch_secret params; 1432 struct page **pages; 1433 void *blob, *hdr; 1434 unsigned long n, i; 1435 int ret, offset; 1436 1437 if (!sev_guest(kvm)) 1438 return -ENOTTY; 1439 1440 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 1441 return -EFAULT; 1442 1443 pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE); 1444 if (IS_ERR(pages)) 1445 return PTR_ERR(pages); 1446 1447 /* 1448 * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in 1449 * place; the cache may contain the data that was written unencrypted. 1450 */ 1451 sev_clflush_pages(pages, n); 1452 1453 /* 1454 * The secret must be copied into contiguous memory region, lets verify 1455 * that userspace memory pages are contiguous before we issue command. 
1456 */ 1457 if (get_num_contig_pages(0, pages, n) != n) { 1458 ret = -EINVAL; 1459 goto e_unpin_memory; 1460 } 1461 1462 memset(&data, 0, sizeof(data)); 1463 1464 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1465 data.guest_address = __sme_page_pa(pages[0]) + offset; 1466 data.guest_len = params.guest_len; 1467 1468 blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len); 1469 if (IS_ERR(blob)) { 1470 ret = PTR_ERR(blob); 1471 goto e_unpin_memory; 1472 } 1473 1474 data.trans_address = __psp_pa(blob); 1475 data.trans_len = params.trans_len; 1476 1477 hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); 1478 if (IS_ERR(hdr)) { 1479 ret = PTR_ERR(hdr); 1480 goto e_free_blob; 1481 } 1482 data.hdr_address = __psp_pa(hdr); 1483 data.hdr_len = params.hdr_len; 1484 1485 data.handle = to_kvm_sev_info(kvm)->handle; 1486 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); 1487 1488 kfree(hdr); 1489 1490 e_free_blob: 1491 kfree(blob); 1492 e_unpin_memory: 1493 /* content of memory is updated, mark pages dirty */ 1494 for (i = 0; i < n; i++) { 1495 set_page_dirty_lock(pages[i]); 1496 mark_page_accessed(pages[i]); 1497 } 1498 sev_unpin_memory(kvm, pages, n); 1499 return ret; 1500 } 1501 1502 static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) 1503 { 1504 void __user *report = u64_to_user_ptr(argp->data); 1505 struct sev_data_attestation_report data; 1506 struct kvm_sev_attestation_report params; 1507 void __user *p; 1508 void *blob = NULL; 1509 int ret; 1510 1511 if (!sev_guest(kvm)) 1512 return -ENOTTY; 1513 1514 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 1515 return -EFAULT; 1516 1517 memset(&data, 0, sizeof(data)); 1518 1519 /* User wants to query the blob length */ 1520 if (!params.len) 1521 goto cmd; 1522 1523 p = u64_to_user_ptr(params.uaddr); 1524 if (p) { 1525 if (params.len > SEV_FW_BLOB_MAX_SIZE) 1526 return -EINVAL; 1527 1528 blob = kzalloc(params.len, 
GFP_KERNEL_ACCOUNT); 1529 if (!blob) 1530 return -ENOMEM; 1531 1532 data.address = __psp_pa(blob); 1533 data.len = params.len; 1534 memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); 1535 } 1536 cmd: 1537 data.handle = to_kvm_sev_info(kvm)->handle; 1538 ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); 1539 /* 1540 * If we query the session length, FW responded with expected data. 1541 */ 1542 if (!params.len) 1543 goto done; 1544 1545 if (ret) 1546 goto e_free_blob; 1547 1548 if (blob) { 1549 if (copy_to_user(p, blob, params.len)) 1550 ret = -EFAULT; 1551 } 1552 1553 done: 1554 params.len = data.len; 1555 if (copy_to_user(report, ¶ms, sizeof(params))) 1556 ret = -EFAULT; 1557 e_free_blob: 1558 kfree(blob); 1559 return ret; 1560 } 1561 1562 /* Userspace wants to query session length. */ 1563 static int 1564 __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, 1565 struct kvm_sev_send_start *params) 1566 { 1567 struct sev_data_send_start data; 1568 int ret; 1569 1570 memset(&data, 0, sizeof(data)); 1571 data.handle = to_kvm_sev_info(kvm)->handle; 1572 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); 1573 1574 params->session_len = data.session_len; 1575 if (copy_to_user(u64_to_user_ptr(argp->data), params, 1576 sizeof(struct kvm_sev_send_start))) 1577 ret = -EFAULT; 1578 1579 return ret; 1580 } 1581 1582 static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 1583 { 1584 struct sev_data_send_start data; 1585 struct kvm_sev_send_start params; 1586 void *amd_certs, *session_data; 1587 void *pdh_cert, *plat_certs; 1588 int ret; 1589 1590 if (!sev_guest(kvm)) 1591 return -ENOTTY; 1592 1593 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1594 sizeof(struct kvm_sev_send_start))) 1595 return -EFAULT; 1596 1597 /* if session_len is zero, userspace wants to query the session length */ 1598 if (!params.session_len) 1599 return __sev_send_start_query_session_length(kvm, argp, 1600 
¶ms); 1601 1602 /* some sanity checks */ 1603 if (!params.pdh_cert_uaddr || !params.pdh_cert_len || 1604 !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE) 1605 return -EINVAL; 1606 1607 /* allocate the memory to hold the session data blob */ 1608 session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT); 1609 if (!session_data) 1610 return -ENOMEM; 1611 1612 /* copy the certificate blobs from userspace */ 1613 pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr, 1614 params.pdh_cert_len); 1615 if (IS_ERR(pdh_cert)) { 1616 ret = PTR_ERR(pdh_cert); 1617 goto e_free_session; 1618 } 1619 1620 plat_certs = psp_copy_user_blob(params.plat_certs_uaddr, 1621 params.plat_certs_len); 1622 if (IS_ERR(plat_certs)) { 1623 ret = PTR_ERR(plat_certs); 1624 goto e_free_pdh; 1625 } 1626 1627 amd_certs = psp_copy_user_blob(params.amd_certs_uaddr, 1628 params.amd_certs_len); 1629 if (IS_ERR(amd_certs)) { 1630 ret = PTR_ERR(amd_certs); 1631 goto e_free_plat_cert; 1632 } 1633 1634 /* populate the FW SEND_START field with system physical address */ 1635 memset(&data, 0, sizeof(data)); 1636 data.pdh_cert_address = __psp_pa(pdh_cert); 1637 data.pdh_cert_len = params.pdh_cert_len; 1638 data.plat_certs_address = __psp_pa(plat_certs); 1639 data.plat_certs_len = params.plat_certs_len; 1640 data.amd_certs_address = __psp_pa(amd_certs); 1641 data.amd_certs_len = params.amd_certs_len; 1642 data.session_address = __psp_pa(session_data); 1643 data.session_len = params.session_len; 1644 data.handle = to_kvm_sev_info(kvm)->handle; 1645 1646 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); 1647 1648 if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr), 1649 session_data, params.session_len)) { 1650 ret = -EFAULT; 1651 goto e_free_amd_cert; 1652 } 1653 1654 params.policy = data.policy; 1655 params.session_len = data.session_len; 1656 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, 1657 sizeof(struct kvm_sev_send_start))) 1658 ret = -EFAULT; 1659 1660 
e_free_amd_cert: 1661 kfree(amd_certs); 1662 e_free_plat_cert: 1663 kfree(plat_certs); 1664 e_free_pdh: 1665 kfree(pdh_cert); 1666 e_free_session: 1667 kfree(session_data); 1668 return ret; 1669 } 1670 1671 /* Userspace wants to query either header or trans length. */ 1672 static int 1673 __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, 1674 struct kvm_sev_send_update_data *params) 1675 { 1676 struct sev_data_send_update_data data; 1677 int ret; 1678 1679 memset(&data, 0, sizeof(data)); 1680 data.handle = to_kvm_sev_info(kvm)->handle; 1681 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); 1682 1683 params->hdr_len = data.hdr_len; 1684 params->trans_len = data.trans_len; 1685 1686 if (copy_to_user(u64_to_user_ptr(argp->data), params, 1687 sizeof(struct kvm_sev_send_update_data))) 1688 ret = -EFAULT; 1689 1690 return ret; 1691 } 1692 1693 static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 1694 { 1695 struct sev_data_send_update_data data; 1696 struct kvm_sev_send_update_data params; 1697 void *hdr, *trans_data; 1698 struct page **guest_page; 1699 unsigned long n; 1700 int ret, offset; 1701 1702 if (!sev_guest(kvm)) 1703 return -ENOTTY; 1704 1705 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1706 sizeof(struct kvm_sev_send_update_data))) 1707 return -EFAULT; 1708 1709 /* userspace wants to query either header or trans length */ 1710 if (!params.trans_len || !params.hdr_len) 1711 return __sev_send_update_data_query_lengths(kvm, argp, ¶ms); 1712 1713 if (!params.trans_uaddr || !params.guest_uaddr || 1714 !params.guest_len || !params.hdr_uaddr) 1715 return -EINVAL; 1716 1717 /* Check if we are crossing the page boundary */ 1718 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1719 if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) 1720 return -EINVAL; 1721 1722 /* Pin guest memory */ 1723 guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, 1724 
PAGE_SIZE, &n, 0); 1725 if (IS_ERR(guest_page)) 1726 return PTR_ERR(guest_page); 1727 1728 /* allocate memory for header and transport buffer */ 1729 ret = -ENOMEM; 1730 hdr = kzalloc(params.hdr_len, GFP_KERNEL); 1731 if (!hdr) 1732 goto e_unpin; 1733 1734 trans_data = kzalloc(params.trans_len, GFP_KERNEL); 1735 if (!trans_data) 1736 goto e_free_hdr; 1737 1738 memset(&data, 0, sizeof(data)); 1739 data.hdr_address = __psp_pa(hdr); 1740 data.hdr_len = params.hdr_len; 1741 data.trans_address = __psp_pa(trans_data); 1742 data.trans_len = params.trans_len; 1743 1744 /* The SEND_UPDATE_DATA command requires C-bit to be always set. */ 1745 data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; 1746 data.guest_address |= sev_me_mask; 1747 data.guest_len = params.guest_len; 1748 data.handle = to_kvm_sev_info(kvm)->handle; 1749 1750 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); 1751 1752 if (ret) 1753 goto e_free_trans_data; 1754 1755 /* copy transport buffer to user space */ 1756 if (copy_to_user(u64_to_user_ptr(params.trans_uaddr), 1757 trans_data, params.trans_len)) { 1758 ret = -EFAULT; 1759 goto e_free_trans_data; 1760 } 1761 1762 /* Copy packet header to userspace. 
*/ 1763 if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr, 1764 params.hdr_len)) 1765 ret = -EFAULT; 1766 1767 e_free_trans_data: 1768 kfree(trans_data); 1769 e_free_hdr: 1770 kfree(hdr); 1771 e_unpin: 1772 sev_unpin_memory(kvm, guest_page, n); 1773 1774 return ret; 1775 } 1776 1777 static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1778 { 1779 struct sev_data_send_finish data; 1780 1781 if (!sev_guest(kvm)) 1782 return -ENOTTY; 1783 1784 data.handle = to_kvm_sev_info(kvm)->handle; 1785 return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); 1786 } 1787 1788 static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) 1789 { 1790 struct sev_data_send_cancel data; 1791 1792 if (!sev_guest(kvm)) 1793 return -ENOTTY; 1794 1795 data.handle = to_kvm_sev_info(kvm)->handle; 1796 return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); 1797 } 1798 1799 static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 1800 { 1801 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 1802 struct sev_data_receive_start start; 1803 struct kvm_sev_receive_start params; 1804 int *error = &argp->error; 1805 void *session_data; 1806 void *pdh_data; 1807 int ret; 1808 1809 if (!sev_guest(kvm)) 1810 return -ENOTTY; 1811 1812 /* Get parameter from the userspace */ 1813 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1814 sizeof(struct kvm_sev_receive_start))) 1815 return -EFAULT; 1816 1817 /* some sanity checks */ 1818 if (!params.pdh_uaddr || !params.pdh_len || 1819 !params.session_uaddr || !params.session_len) 1820 return -EINVAL; 1821 1822 pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len); 1823 if (IS_ERR(pdh_data)) 1824 return PTR_ERR(pdh_data); 1825 1826 session_data = psp_copy_user_blob(params.session_uaddr, 1827 params.session_len); 1828 if (IS_ERR(session_data)) { 1829 ret = PTR_ERR(session_data); 1830 goto e_free_pdh; 1831 } 1832 1833 memset(&start, 0, sizeof(start)); 1834 start.handle = 
params.handle; 1835 start.policy = params.policy; 1836 start.pdh_cert_address = __psp_pa(pdh_data); 1837 start.pdh_cert_len = params.pdh_len; 1838 start.session_address = __psp_pa(session_data); 1839 start.session_len = params.session_len; 1840 1841 /* create memory encryption context */ 1842 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start, 1843 error); 1844 if (ret) 1845 goto e_free_session; 1846 1847 /* Bind ASID to this guest */ 1848 ret = sev_bind_asid(kvm, start.handle, error); 1849 if (ret) { 1850 sev_decommission(start.handle); 1851 goto e_free_session; 1852 } 1853 1854 params.handle = start.handle; 1855 if (copy_to_user(u64_to_user_ptr(argp->data), 1856 ¶ms, sizeof(struct kvm_sev_receive_start))) { 1857 ret = -EFAULT; 1858 sev_unbind_asid(kvm, start.handle); 1859 goto e_free_session; 1860 } 1861 1862 sev->handle = start.handle; 1863 sev->fd = argp->sev_fd; 1864 1865 e_free_session: 1866 kfree(session_data); 1867 e_free_pdh: 1868 kfree(pdh_data); 1869 1870 return ret; 1871 } 1872 1873 static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 1874 { 1875 struct kvm_sev_receive_update_data params; 1876 struct sev_data_receive_update_data data; 1877 void *hdr = NULL, *trans = NULL; 1878 struct page **guest_page; 1879 unsigned long n; 1880 int ret, offset; 1881 1882 if (!sev_guest(kvm)) 1883 return -EINVAL; 1884 1885 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1886 sizeof(struct kvm_sev_receive_update_data))) 1887 return -EFAULT; 1888 1889 if (!params.hdr_uaddr || !params.hdr_len || 1890 !params.guest_uaddr || !params.guest_len || 1891 !params.trans_uaddr || !params.trans_len) 1892 return -EINVAL; 1893 1894 /* Check if we are crossing the page boundary */ 1895 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1896 if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) 1897 return -EINVAL; 1898 1899 hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); 1900 if (IS_ERR(hdr)) 1901 return 
PTR_ERR(hdr); 1902 1903 trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len); 1904 if (IS_ERR(trans)) { 1905 ret = PTR_ERR(trans); 1906 goto e_free_hdr; 1907 } 1908 1909 memset(&data, 0, sizeof(data)); 1910 data.hdr_address = __psp_pa(hdr); 1911 data.hdr_len = params.hdr_len; 1912 data.trans_address = __psp_pa(trans); 1913 data.trans_len = params.trans_len; 1914 1915 /* Pin guest memory */ 1916 guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, 1917 PAGE_SIZE, &n, FOLL_WRITE); 1918 if (IS_ERR(guest_page)) { 1919 ret = PTR_ERR(guest_page); 1920 goto e_free_trans; 1921 } 1922 1923 /* 1924 * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP 1925 * encrypts the written data with the guest's key, and the cache may 1926 * contain dirty, unencrypted data. 1927 */ 1928 sev_clflush_pages(guest_page, n); 1929 1930 /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ 1931 data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; 1932 data.guest_address |= sev_me_mask; 1933 data.guest_len = params.guest_len; 1934 data.handle = to_kvm_sev_info(kvm)->handle; 1935 1936 ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, 1937 &argp->error); 1938 1939 sev_unpin_memory(kvm, guest_page, n); 1940 1941 e_free_trans: 1942 kfree(trans); 1943 e_free_hdr: 1944 kfree(hdr); 1945 1946 return ret; 1947 } 1948 1949 static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1950 { 1951 struct sev_data_receive_finish data; 1952 1953 if (!sev_guest(kvm)) 1954 return -ENOTTY; 1955 1956 data.handle = to_kvm_sev_info(kvm)->handle; 1957 return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); 1958 } 1959 1960 static bool is_cmd_allowed_from_mirror(u32 cmd_id) 1961 { 1962 /* 1963 * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES 1964 * active mirror VMs. Also allow the debugging and status commands. 
1965 */ 1966 if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA || 1967 cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT || 1968 cmd_id == KVM_SEV_DBG_ENCRYPT) 1969 return true; 1970 1971 return false; 1972 } 1973 1974 static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) 1975 { 1976 struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); 1977 struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); 1978 int r = -EBUSY; 1979 1980 if (dst_kvm == src_kvm) 1981 return -EINVAL; 1982 1983 /* 1984 * Bail if these VMs are already involved in a migration to avoid 1985 * deadlock between two VMs trying to migrate to/from each other. 1986 */ 1987 if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) 1988 return -EBUSY; 1989 1990 if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) 1991 goto release_dst; 1992 1993 r = -EINTR; 1994 if (mutex_lock_killable(&dst_kvm->lock)) 1995 goto release_src; 1996 if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) 1997 goto unlock_dst; 1998 return 0; 1999 2000 unlock_dst: 2001 mutex_unlock(&dst_kvm->lock); 2002 release_src: 2003 atomic_set_release(&src_sev->migration_in_progress, 0); 2004 release_dst: 2005 atomic_set_release(&dst_sev->migration_in_progress, 0); 2006 return r; 2007 } 2008 2009 static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) 2010 { 2011 struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); 2012 struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); 2013 2014 mutex_unlock(&dst_kvm->lock); 2015 mutex_unlock(&src_kvm->lock); 2016 atomic_set_release(&dst_sev->migration_in_progress, 0); 2017 atomic_set_release(&src_sev->migration_in_progress, 0); 2018 } 2019 2020 static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) 2021 { 2022 struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); 2023 struct kvm_sev_info *src = to_kvm_sev_info(src_kvm); 2024 struct kvm_vcpu *dst_vcpu, *src_vcpu; 2025 struct vcpu_svm *dst_svm, *src_svm; 
2026 struct kvm_sev_info *mirror; 2027 unsigned long i; 2028 2029 dst->active = true; 2030 dst->asid = src->asid; 2031 dst->handle = src->handle; 2032 dst->pages_locked = src->pages_locked; 2033 dst->enc_context_owner = src->enc_context_owner; 2034 dst->es_active = src->es_active; 2035 dst->vmsa_features = src->vmsa_features; 2036 2037 src->asid = 0; 2038 src->active = false; 2039 src->handle = 0; 2040 src->pages_locked = 0; 2041 src->enc_context_owner = NULL; 2042 src->es_active = false; 2043 2044 list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); 2045 2046 /* 2047 * If this VM has mirrors, "transfer" each mirror's refcount of the 2048 * source to the destination (this KVM). The caller holds a reference 2049 * to the source, so there's no danger of use-after-free. 2050 */ 2051 list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms); 2052 list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) { 2053 kvm_get_kvm(dst_kvm); 2054 kvm_put_kvm(src_kvm); 2055 mirror->enc_context_owner = dst_kvm; 2056 } 2057 2058 /* 2059 * If this VM is a mirror, remove the old mirror from the owners list 2060 * and add the new mirror to the list. 2061 */ 2062 if (is_mirroring_enc_context(dst_kvm)) { 2063 struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner); 2064 2065 list_del(&src->mirror_entry); 2066 list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); 2067 } 2068 2069 kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) { 2070 dst_svm = to_svm(dst_vcpu); 2071 2072 sev_init_vmcb(dst_svm, false); 2073 2074 if (!dst->es_active) 2075 continue; 2076 2077 /* 2078 * Note, the source is not required to have the same number of 2079 * vCPUs as the destination when migrating a vanilla SEV VM. 2080 */ 2081 src_vcpu = kvm_get_vcpu(src_kvm, i); 2082 src_svm = to_svm(src_vcpu); 2083 2084 /* 2085 * Transfer VMSA and GHCB state to the destination. 
Nullify and 2086 * clear source fields as appropriate, the state now belongs to 2087 * the destination. 2088 */ 2089 memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es)); 2090 dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa; 2091 dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa; 2092 dst_vcpu->arch.guest_state_protected = true; 2093 2094 memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es)); 2095 src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE; 2096 src_svm->vmcb->control.vmsa_pa = INVALID_PAGE; 2097 src_vcpu->arch.guest_state_protected = false; 2098 } 2099 } 2100 2101 static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) 2102 { 2103 struct kvm_vcpu *src_vcpu; 2104 unsigned long i; 2105 2106 if (kvm_is_vcpu_creation_in_progress(src) || 2107 kvm_is_vcpu_creation_in_progress(dst)) 2108 return -EBUSY; 2109 2110 if (!sev_es_guest(src)) 2111 return 0; 2112 2113 if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus)) 2114 return -EINVAL; 2115 2116 kvm_for_each_vcpu(i, src_vcpu, src) { 2117 if (!src_vcpu->arch.guest_state_protected) 2118 return -EINVAL; 2119 } 2120 2121 return 0; 2122 } 2123 2124 int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) 2125 { 2126 struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm); 2127 struct kvm_sev_info *src_sev, *cg_cleanup_sev; 2128 CLASS(fd, f)(source_fd); 2129 struct kvm *source_kvm; 2130 bool charged = false; 2131 int ret; 2132 2133 if (fd_empty(f)) 2134 return -EBADF; 2135 2136 if (!file_is_kvm(fd_file(f))) 2137 return -EBADF; 2138 2139 source_kvm = fd_file(f)->private_data; 2140 ret = sev_lock_two_vms(kvm, source_kvm); 2141 if (ret) 2142 return ret; 2143 2144 if (kvm->arch.vm_type != source_kvm->arch.vm_type || 2145 sev_guest(kvm) || !sev_guest(source_kvm)) { 2146 ret = -EINVAL; 2147 goto out_unlock; 2148 } 2149 2150 src_sev = to_kvm_sev_info(source_kvm); 2151 2152 dst_sev->misc_cg = get_current_misc_cg(); 2153 cg_cleanup_sev = dst_sev; 
2154 if (dst_sev->misc_cg != src_sev->misc_cg) { 2155 ret = sev_misc_cg_try_charge(dst_sev); 2156 if (ret) 2157 goto out_dst_cgroup; 2158 charged = true; 2159 } 2160 2161 ret = kvm_lock_all_vcpus(kvm); 2162 if (ret) 2163 goto out_dst_cgroup; 2164 ret = kvm_lock_all_vcpus(source_kvm); 2165 if (ret) 2166 goto out_dst_vcpu; 2167 2168 ret = sev_check_source_vcpus(kvm, source_kvm); 2169 if (ret) 2170 goto out_source_vcpu; 2171 2172 /* 2173 * Allocate a new have_run_cpus for the destination, i.e. don't copy 2174 * the set of CPUs from the source. If a CPU was used to run a vCPU in 2175 * the source VM but is never used for the destination VM, then the CPU 2176 * can only have cached memory that was accessible to the source VM. 2177 */ 2178 if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 2179 ret = -ENOMEM; 2180 goto out_source_vcpu; 2181 } 2182 2183 sev_migrate_from(kvm, source_kvm); 2184 kvm_vm_dead(source_kvm); 2185 cg_cleanup_sev = src_sev; 2186 ret = 0; 2187 2188 out_source_vcpu: 2189 kvm_unlock_all_vcpus(source_kvm); 2190 out_dst_vcpu: 2191 kvm_unlock_all_vcpus(kvm); 2192 out_dst_cgroup: 2193 /* Operates on the source on success, on the destination on failure. */ 2194 if (charged) 2195 sev_misc_cg_uncharge(cg_cleanup_sev); 2196 put_misc_cg(cg_cleanup_sev->misc_cg); 2197 cg_cleanup_sev->misc_cg = NULL; 2198 out_unlock: 2199 sev_unlock_two_vms(kvm, source_kvm); 2200 return ret; 2201 } 2202 2203 int sev_dev_get_attr(u32 group, u64 attr, u64 *val) 2204 { 2205 if (group != KVM_X86_GRP_SEV) 2206 return -ENXIO; 2207 2208 switch (attr) { 2209 case KVM_X86_SEV_VMSA_FEATURES: 2210 *val = sev_supported_vmsa_features; 2211 return 0; 2212 2213 case KVM_X86_SNP_POLICY_BITS: 2214 *val = snp_supported_policy_bits; 2215 return 0; 2216 2217 case KVM_X86_SEV_SNP_REQ_CERTS: 2218 *val = sev_snp_enabled ? 
1 : 0; 2219 return 0; 2220 default: 2221 return -ENXIO; 2222 } 2223 } 2224 2225 /* 2226 * The guest context contains all the information, keys and metadata 2227 * associated with the guest that the firmware tracks to implement SEV 2228 * and SNP features. The firmware stores the guest context in hypervisor 2229 * provide page via the SNP_GCTX_CREATE command. 2230 */ 2231 static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) 2232 { 2233 struct sev_data_snp_addr data = {}; 2234 void *context; 2235 int rc; 2236 2237 /* Allocate memory for context page */ 2238 context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); 2239 if (!context) 2240 return NULL; 2241 2242 data.address = __psp_pa(context); 2243 rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); 2244 if (rc) { 2245 pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d", 2246 rc, argp->error); 2247 snp_free_firmware_page(context); 2248 return NULL; 2249 } 2250 2251 return context; 2252 } 2253 2254 static int snp_bind_asid(struct kvm *kvm, int *error) 2255 { 2256 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2257 struct sev_data_snp_activate data = {0}; 2258 2259 data.gctx_paddr = __psp_pa(sev->snp_context); 2260 data.asid = sev_get_asid(kvm); 2261 return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); 2262 } 2263 2264 static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 2265 { 2266 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2267 struct sev_data_snp_launch_start start = {0}; 2268 struct kvm_sev_snp_launch_start params; 2269 int rc; 2270 2271 if (!sev_snp_guest(kvm)) 2272 return -ENOTTY; 2273 2274 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2275 return -EFAULT; 2276 2277 /* Don't allow userspace to allocate memory for more than 1 SNP context. 
*/ 2278 if (sev->snp_context) 2279 return -EINVAL; 2280 2281 if (params.flags) 2282 return -EINVAL; 2283 2284 if (params.policy & ~snp_supported_policy_bits) 2285 return -EINVAL; 2286 2287 /* Check for policy bits that must be set */ 2288 if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO)) 2289 return -EINVAL; 2290 2291 if (snp_is_secure_tsc_enabled(kvm)) { 2292 if (WARN_ON_ONCE(!kvm->arch.default_tsc_khz)) 2293 return -EINVAL; 2294 2295 start.desired_tsc_khz = kvm->arch.default_tsc_khz; 2296 } 2297 2298 sev->snp_context = snp_context_create(kvm, argp); 2299 if (!sev->snp_context) 2300 return -ENOTTY; 2301 2302 start.gctx_paddr = __psp_pa(sev->snp_context); 2303 start.policy = params.policy; 2304 2305 memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); 2306 rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); 2307 if (rc) { 2308 pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n", 2309 __func__, rc); 2310 goto e_free_context; 2311 } 2312 2313 sev->policy = params.policy; 2314 sev->fd = argp->sev_fd; 2315 rc = snp_bind_asid(kvm, &argp->error); 2316 if (rc) { 2317 pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n", 2318 __func__, rc); 2319 goto e_free_context; 2320 } 2321 2322 return 0; 2323 2324 e_free_context: 2325 snp_decommission_context(kvm); 2326 2327 return rc; 2328 } 2329 2330 struct sev_gmem_populate_args { 2331 __u8 type; 2332 int sev_fd; 2333 int fw_error; 2334 }; 2335 2336 static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 2337 struct page *src_page, void *opaque) 2338 { 2339 struct sev_gmem_populate_args *sev_populate_args = opaque; 2340 struct sev_data_snp_launch_update fw_args = {0}; 2341 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2342 bool assigned = false; 2343 int level; 2344 int ret; 2345 2346 if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page)) 2347 return -EINVAL; 2348 2349 ret = snp_lookup_rmpentry((u64)pfn, 
&assigned, &level); 2350 if (ret || assigned) { 2351 pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", 2352 __func__, gfn, ret, assigned); 2353 ret = ret ? -EINVAL : -EEXIST; 2354 goto out; 2355 } 2356 2357 if (src_page) { 2358 void *src_vaddr = kmap_local_page(src_page); 2359 void *dst_vaddr = kmap_local_pfn(pfn); 2360 2361 memcpy(dst_vaddr, src_vaddr, PAGE_SIZE); 2362 2363 kunmap_local(src_vaddr); 2364 kunmap_local(dst_vaddr); 2365 } 2366 2367 ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K, 2368 sev_get_asid(kvm), true); 2369 if (ret) 2370 goto out; 2371 2372 fw_args.gctx_paddr = __psp_pa(sev->snp_context); 2373 fw_args.address = __sme_set(pfn_to_hpa(pfn)); 2374 fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); 2375 fw_args.page_type = sev_populate_args->type; 2376 2377 ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, 2378 &fw_args, &sev_populate_args->fw_error); 2379 /* 2380 * If the firmware command failed handle the reclaim and cleanup of that 2381 * PFN before reporting an error. 2382 * 2383 * Additionally, when invalid CPUID function entries are detected, 2384 * firmware writes the expected values into the page and leaves it 2385 * unencrypted so it can be used for debugging and error-reporting. 2386 * 2387 * Copy this page back into the source buffer so userspace can use this 2388 * information to provide information on which CPUID leaves/fields 2389 * failed CPUID validation. 
2390 */ 2391 if (ret && !snp_page_reclaim(kvm, pfn) && 2392 sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && 2393 sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { 2394 void *src_vaddr = kmap_local_page(src_page); 2395 void *dst_vaddr = kmap_local_pfn(pfn); 2396 2397 memcpy(src_vaddr, dst_vaddr, PAGE_SIZE); 2398 2399 kunmap_local(src_vaddr); 2400 kunmap_local(dst_vaddr); 2401 } 2402 2403 out: 2404 if (ret) 2405 pr_debug("%s: error updating GFN %llx, return code %d (fw_error %d)\n", 2406 __func__, gfn, ret, sev_populate_args->fw_error); 2407 return ret; 2408 } 2409 2410 static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) 2411 { 2412 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2413 struct sev_gmem_populate_args sev_populate_args = {0}; 2414 struct kvm_sev_snp_launch_update params; 2415 struct kvm_memory_slot *memslot; 2416 long npages, count; 2417 void __user *src; 2418 2419 if (!sev_snp_guest(kvm) || !sev->snp_context) 2420 return -EINVAL; 2421 2422 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2423 return -EFAULT; 2424 2425 pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, 2426 params.gfn_start, params.len, params.type, params.flags); 2427 2428 if (!params.len || !PAGE_ALIGNED(params.len) || params.flags || 2429 (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && 2430 params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && 2431 params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && 2432 params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && 2433 params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) 2434 return -EINVAL; 2435 2436 src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? 
NULL : u64_to_user_ptr(params.uaddr); 2437 2438 if (!PAGE_ALIGNED(src)) 2439 return -EINVAL; 2440 2441 npages = params.len / PAGE_SIZE; 2442 2443 /* 2444 * For each GFN that's being prepared as part of the initial guest 2445 * state, the following pre-conditions are verified: 2446 * 2447 * 1) The backing memslot is a valid private memslot. 2448 * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES 2449 * beforehand. 2450 * 3) The PFN of the guest_memfd has not already been set to private 2451 * in the RMP table. 2452 * 2453 * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page 2454 * faults if there's a race between a fault and an attribute update via 2455 * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized 2456 * here. However, kvm->slots_lock guards against both this as well as 2457 * concurrent memslot updates occurring while these checks are being 2458 * performed, so use that here to make it easier to reason about the 2459 * initial expected state and better guard against unexpected 2460 * situations. 
2461 */ 2462 guard(mutex)(&kvm->slots_lock); 2463 2464 memslot = gfn_to_memslot(kvm, params.gfn_start); 2465 if (!kvm_slot_has_gmem(memslot)) 2466 return -EINVAL; 2467 2468 sev_populate_args.sev_fd = argp->sev_fd; 2469 sev_populate_args.type = params.type; 2470 2471 count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, 2472 sev_gmem_post_populate, &sev_populate_args); 2473 if (count < 0) { 2474 argp->error = sev_populate_args.fw_error; 2475 pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", 2476 __func__, count, argp->error); 2477 return -EIO; 2478 } 2479 2480 params.gfn_start += count; 2481 params.len -= count * PAGE_SIZE; 2482 if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) 2483 params.uaddr += count * PAGE_SIZE; 2484 2485 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) 2486 return -EFAULT; 2487 2488 return 0; 2489 } 2490 2491 static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) 2492 { 2493 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2494 struct sev_data_snp_launch_update data = {}; 2495 struct kvm_vcpu *vcpu; 2496 unsigned long i; 2497 int ret; 2498 2499 if (kvm_is_vcpu_creation_in_progress(kvm)) 2500 return -EBUSY; 2501 2502 ret = kvm_lock_all_vcpus(kvm); 2503 if (ret) 2504 return ret; 2505 2506 data.gctx_paddr = __psp_pa(sev->snp_context); 2507 data.page_type = SNP_PAGE_TYPE_VMSA; 2508 2509 kvm_for_each_vcpu(i, vcpu, kvm) { 2510 struct vcpu_svm *svm = to_svm(vcpu); 2511 u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; 2512 2513 ret = sev_es_sync_vmsa(svm); 2514 if (ret) 2515 goto out; 2516 2517 /* Transition the VMSA page to a firmware state. 
*/ 2518 ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); 2519 if (ret) 2520 goto out; 2521 2522 /* Issue the SNP command to encrypt the VMSA */ 2523 data.address = __sme_pa(svm->sev_es.vmsa); 2524 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, 2525 &data, &argp->error); 2526 if (ret) { 2527 snp_page_reclaim(kvm, pfn); 2528 2529 goto out; 2530 } 2531 2532 svm->vcpu.arch.guest_state_protected = true; 2533 /* 2534 * SEV-ES (and thus SNP) guest mandates LBR Virtualization to 2535 * be _always_ ON. Enable it only after setting 2536 * guest_state_protected because KVM_SET_MSRS allows dynamic 2537 * toggling of LBRV (for performance reason) on write access to 2538 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. 2539 */ 2540 svm_enable_lbrv(vcpu); 2541 } 2542 2543 out: 2544 kvm_unlock_all_vcpus(kvm); 2545 return ret; 2546 } 2547 2548 static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 2549 { 2550 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2551 struct kvm_sev_snp_launch_finish params; 2552 struct sev_data_snp_launch_finish *data; 2553 void *id_block = NULL, *id_auth = NULL; 2554 int ret; 2555 2556 if (!sev_snp_guest(kvm)) 2557 return -ENOTTY; 2558 2559 if (!sev->snp_context) 2560 return -EINVAL; 2561 2562 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2563 return -EFAULT; 2564 2565 if (params.flags) 2566 return -EINVAL; 2567 2568 /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. 
*/ 2569 ret = snp_launch_update_vmsa(kvm, argp); 2570 if (ret) 2571 return ret; 2572 2573 data = kzalloc_obj(*data, GFP_KERNEL_ACCOUNT); 2574 if (!data) 2575 return -ENOMEM; 2576 2577 if (params.id_block_en) { 2578 id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); 2579 if (IS_ERR(id_block)) { 2580 ret = PTR_ERR(id_block); 2581 goto e_free; 2582 } 2583 2584 data->id_block_en = 1; 2585 data->id_block_paddr = __sme_pa(id_block); 2586 2587 id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); 2588 if (IS_ERR(id_auth)) { 2589 ret = PTR_ERR(id_auth); 2590 goto e_free_id_block; 2591 } 2592 2593 data->id_auth_paddr = __sme_pa(id_auth); 2594 2595 if (params.auth_key_en) 2596 data->auth_key_en = 1; 2597 } 2598 2599 data->vcek_disabled = params.vcek_disabled; 2600 2601 memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE); 2602 data->gctx_paddr = __psp_pa(sev->snp_context); 2603 ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); 2604 2605 /* 2606 * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages 2607 * can be given to the guest simply by marking the RMP entry as private. 2608 * This can happen on first access and also with KVM_PRE_FAULT_MEMORY. 
2609 */ 2610 if (!ret) 2611 kvm->arch.pre_fault_allowed = true; 2612 2613 kfree(id_auth); 2614 2615 e_free_id_block: 2616 kfree(id_block); 2617 2618 e_free: 2619 kfree(data); 2620 2621 return ret; 2622 } 2623 2624 static int snp_enable_certs(struct kvm *kvm) 2625 { 2626 if (kvm->created_vcpus || !sev_snp_guest(kvm)) 2627 return -EINVAL; 2628 2629 to_kvm_sev_info(kvm)->snp_certs_enabled = true; 2630 2631 return 0; 2632 } 2633 2634 int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) 2635 { 2636 struct kvm_sev_cmd sev_cmd; 2637 int r; 2638 2639 if (!sev_enabled) 2640 return -ENOTTY; 2641 2642 if (!argp) 2643 return 0; 2644 2645 if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd))) 2646 return -EFAULT; 2647 2648 guard(mutex)(&kvm->lock); 2649 2650 /* Only the enc_context_owner handles some memory enc operations. */ 2651 if (is_mirroring_enc_context(kvm) && 2652 !is_cmd_allowed_from_mirror(sev_cmd.id)) 2653 return -EINVAL; 2654 2655 /* 2656 * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only 2657 * allow the use of SNP-specific commands. 
2658 */ 2659 if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) 2660 return -EPERM; 2661 2662 switch (sev_cmd.id) { 2663 case KVM_SEV_ES_INIT: 2664 if (!sev_es_enabled) 2665 return -ENOTTY; 2666 fallthrough; 2667 case KVM_SEV_INIT: 2668 r = sev_guest_init(kvm, &sev_cmd); 2669 break; 2670 case KVM_SEV_INIT2: 2671 r = sev_guest_init2(kvm, &sev_cmd); 2672 break; 2673 case KVM_SEV_LAUNCH_START: 2674 r = sev_launch_start(kvm, &sev_cmd); 2675 break; 2676 case KVM_SEV_LAUNCH_UPDATE_DATA: 2677 r = sev_launch_update_data(kvm, &sev_cmd); 2678 break; 2679 case KVM_SEV_LAUNCH_UPDATE_VMSA: 2680 r = sev_launch_update_vmsa(kvm, &sev_cmd); 2681 break; 2682 case KVM_SEV_LAUNCH_MEASURE: 2683 r = sev_launch_measure(kvm, &sev_cmd); 2684 break; 2685 case KVM_SEV_LAUNCH_FINISH: 2686 r = sev_launch_finish(kvm, &sev_cmd); 2687 break; 2688 case KVM_SEV_GUEST_STATUS: 2689 r = sev_guest_status(kvm, &sev_cmd); 2690 break; 2691 case KVM_SEV_DBG_DECRYPT: 2692 r = sev_dbg_crypt(kvm, &sev_cmd, true); 2693 break; 2694 case KVM_SEV_DBG_ENCRYPT: 2695 r = sev_dbg_crypt(kvm, &sev_cmd, false); 2696 break; 2697 case KVM_SEV_LAUNCH_SECRET: 2698 r = sev_launch_secret(kvm, &sev_cmd); 2699 break; 2700 case KVM_SEV_GET_ATTESTATION_REPORT: 2701 r = sev_get_attestation_report(kvm, &sev_cmd); 2702 break; 2703 case KVM_SEV_SEND_START: 2704 r = sev_send_start(kvm, &sev_cmd); 2705 break; 2706 case KVM_SEV_SEND_UPDATE_DATA: 2707 r = sev_send_update_data(kvm, &sev_cmd); 2708 break; 2709 case KVM_SEV_SEND_FINISH: 2710 r = sev_send_finish(kvm, &sev_cmd); 2711 break; 2712 case KVM_SEV_SEND_CANCEL: 2713 r = sev_send_cancel(kvm, &sev_cmd); 2714 break; 2715 case KVM_SEV_RECEIVE_START: 2716 r = sev_receive_start(kvm, &sev_cmd); 2717 break; 2718 case KVM_SEV_RECEIVE_UPDATE_DATA: 2719 r = sev_receive_update_data(kvm, &sev_cmd); 2720 break; 2721 case KVM_SEV_RECEIVE_FINISH: 2722 r = sev_receive_finish(kvm, &sev_cmd); 2723 break; 2724 case KVM_SEV_SNP_LAUNCH_START: 2725 r = snp_launch_start(kvm, &sev_cmd); 2726 
break; 2727 case KVM_SEV_SNP_LAUNCH_UPDATE: 2728 r = snp_launch_update(kvm, &sev_cmd); 2729 break; 2730 case KVM_SEV_SNP_LAUNCH_FINISH: 2731 r = snp_launch_finish(kvm, &sev_cmd); 2732 break; 2733 case KVM_SEV_SNP_ENABLE_REQ_CERTS: 2734 r = snp_enable_certs(kvm); 2735 break; 2736 default: 2737 return -EINVAL; 2738 } 2739 2740 if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd))) 2741 r = -EFAULT; 2742 2743 return r; 2744 } 2745 2746 int sev_mem_enc_register_region(struct kvm *kvm, 2747 struct kvm_enc_region *range) 2748 { 2749 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2750 struct enc_region *region; 2751 int ret = 0; 2752 2753 guard(mutex)(&kvm->lock); 2754 2755 if (!sev_guest(kvm)) 2756 return -ENOTTY; 2757 2758 /* If kvm is mirroring encryption context it isn't responsible for it */ 2759 if (is_mirroring_enc_context(kvm)) 2760 return -EINVAL; 2761 2762 region = kzalloc_obj(*region, GFP_KERNEL_ACCOUNT); 2763 if (!region) 2764 return -ENOMEM; 2765 2766 region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 2767 FOLL_WRITE | FOLL_LONGTERM); 2768 if (IS_ERR(region->pages)) { 2769 ret = PTR_ERR(region->pages); 2770 goto e_free; 2771 } 2772 2773 /* 2774 * The guest may change the memory encryption attribute from C=0 -> C=1 2775 * or vice versa for this memory range. Lets make sure caches are 2776 * flushed to ensure that guest data gets written into memory with 2777 * correct C-bit. Note, this must be done before dropping kvm->lock, 2778 * as region and its array of pages can be freed by a different task 2779 * once kvm->lock is released. 
2780 */ 2781 sev_clflush_pages(region->pages, region->npages); 2782 2783 region->uaddr = range->addr; 2784 region->size = range->size; 2785 2786 list_add_tail(®ion->list, &sev->regions_list); 2787 return ret; 2788 2789 e_free: 2790 kfree(region); 2791 return ret; 2792 } 2793 2794 static struct enc_region * 2795 find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) 2796 { 2797 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2798 struct list_head *head = &sev->regions_list; 2799 struct enc_region *i; 2800 2801 list_for_each_entry(i, head, list) { 2802 if (i->uaddr == range->addr && 2803 i->size == range->size) 2804 return i; 2805 } 2806 2807 return NULL; 2808 } 2809 2810 static void __unregister_enc_region_locked(struct kvm *kvm, 2811 struct enc_region *region) 2812 { 2813 sev_unpin_memory(kvm, region->pages, region->npages); 2814 list_del(®ion->list); 2815 kfree(region); 2816 } 2817 2818 int sev_mem_enc_unregister_region(struct kvm *kvm, 2819 struct kvm_enc_region *range) 2820 { 2821 struct enc_region *region; 2822 2823 /* If kvm is mirroring encryption context it isn't responsible for it */ 2824 if (is_mirroring_enc_context(kvm)) 2825 return -EINVAL; 2826 2827 guard(mutex)(&kvm->lock); 2828 2829 if (!sev_guest(kvm)) 2830 return -ENOTTY; 2831 2832 region = find_enc_region(kvm, range); 2833 if (!region) 2834 return -EINVAL; 2835 2836 sev_writeback_caches(kvm); 2837 2838 __unregister_enc_region_locked(kvm, region); 2839 2840 return 0; 2841 } 2842 2843 int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) 2844 { 2845 CLASS(fd, f)(source_fd); 2846 struct kvm *source_kvm; 2847 struct kvm_sev_info *source_sev, *mirror_sev; 2848 int ret; 2849 2850 if (fd_empty(f)) 2851 return -EBADF; 2852 2853 if (!file_is_kvm(fd_file(f))) 2854 return -EBADF; 2855 2856 source_kvm = fd_file(f)->private_data; 2857 ret = sev_lock_two_vms(kvm, source_kvm); 2858 if (ret) 2859 return ret; 2860 2861 /* 2862 * Mirrors of mirrors should work, but let's not get silly. 
Also 2863 * disallow out-of-band SEV/SEV-ES init if the target is already an 2864 * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being 2865 * created after SEV/SEV-ES initialization, e.g. to init intercepts. 2866 */ 2867 if (sev_guest(kvm) || !sev_guest(source_kvm) || 2868 is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) { 2869 ret = -EINVAL; 2870 goto e_unlock; 2871 } 2872 2873 mirror_sev = to_kvm_sev_info(kvm); 2874 if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 2875 ret = -ENOMEM; 2876 goto e_unlock; 2877 } 2878 2879 /* 2880 * The mirror kvm holds an enc_context_owner ref so its asid can't 2881 * disappear until we're done with it 2882 */ 2883 source_sev = to_kvm_sev_info(source_kvm); 2884 kvm_get_kvm(source_kvm); 2885 list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); 2886 2887 /* Set enc_context_owner and copy its encryption context over */ 2888 mirror_sev->enc_context_owner = source_kvm; 2889 mirror_sev->active = true; 2890 mirror_sev->asid = source_sev->asid; 2891 mirror_sev->fd = source_sev->fd; 2892 mirror_sev->es_active = source_sev->es_active; 2893 mirror_sev->need_init = false; 2894 mirror_sev->handle = source_sev->handle; 2895 INIT_LIST_HEAD(&mirror_sev->regions_list); 2896 INIT_LIST_HEAD(&mirror_sev->mirror_vms); 2897 ret = 0; 2898 2899 /* 2900 * Do not copy ap_jump_table. Since the mirror does not share the same 2901 * KVM contexts as the original, and they may have different 2902 * memory-views. 
2903 */ 2904 2905 e_unlock: 2906 sev_unlock_two_vms(kvm, source_kvm); 2907 return ret; 2908 } 2909 2910 static int snp_decommission_context(struct kvm *kvm) 2911 { 2912 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2913 struct sev_data_snp_addr data = {}; 2914 int ret; 2915 2916 /* If context is not created then do nothing */ 2917 if (!sev->snp_context) 2918 return 0; 2919 2920 /* Do the decommision, which will unbind the ASID from the SNP context */ 2921 data.address = __sme_pa(sev->snp_context); 2922 down_write(&sev_deactivate_lock); 2923 ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); 2924 up_write(&sev_deactivate_lock); 2925 2926 if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) 2927 return ret; 2928 2929 snp_free_firmware_page(sev->snp_context); 2930 sev->snp_context = NULL; 2931 2932 return 0; 2933 } 2934 2935 void sev_vm_init(struct kvm *kvm) 2936 { 2937 switch (kvm->arch.vm_type) { 2938 case KVM_X86_DEFAULT_VM: 2939 case KVM_X86_SW_PROTECTED_VM: 2940 break; 2941 case KVM_X86_SNP_VM: 2942 kvm->arch.has_private_mem = true; 2943 fallthrough; 2944 case KVM_X86_SEV_ES_VM: 2945 kvm->arch.has_protected_state = true; 2946 fallthrough; 2947 case KVM_X86_SEV_VM: 2948 kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; 2949 to_kvm_sev_info(kvm)->need_init = true; 2950 break; 2951 default: 2952 WARN_ONCE(1, "Unsupported VM type %u", kvm->arch.vm_type); 2953 break; 2954 } 2955 } 2956 2957 void sev_vm_destroy(struct kvm *kvm) 2958 { 2959 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2960 struct list_head *head = &sev->regions_list; 2961 struct list_head *pos, *q; 2962 2963 if (!sev_guest(kvm)) 2964 return; 2965 2966 WARN_ON(!list_empty(&sev->mirror_vms)); 2967 2968 free_cpumask_var(sev->have_run_cpus); 2969 2970 /* 2971 * If this is a mirror VM, remove it from the owner's list of a mirrors 2972 * and skip ASID cleanup (the ASID is tied to the lifetime of the owner). 
2973 * Note, mirror VMs don't support registering encrypted regions. 2974 */ 2975 if (is_mirroring_enc_context(kvm)) { 2976 struct kvm *owner_kvm = sev->enc_context_owner; 2977 2978 mutex_lock(&owner_kvm->lock); 2979 list_del(&sev->mirror_entry); 2980 mutex_unlock(&owner_kvm->lock); 2981 kvm_put_kvm(owner_kvm); 2982 return; 2983 } 2984 2985 2986 /* 2987 * if userspace was terminated before unregistering the memory regions 2988 * then lets unpin all the registered memory. 2989 */ 2990 if (!list_empty(head)) { 2991 list_for_each_safe(pos, q, head) { 2992 __unregister_enc_region_locked(kvm, 2993 list_entry(pos, struct enc_region, list)); 2994 cond_resched(); 2995 } 2996 } 2997 2998 if (sev_snp_guest(kvm)) { 2999 snp_guest_req_cleanup(kvm); 3000 3001 /* 3002 * Decomission handles unbinding of the ASID. If it fails for 3003 * some unexpected reason, just leak the ASID. 3004 */ 3005 if (snp_decommission_context(kvm)) 3006 return; 3007 } else { 3008 sev_unbind_asid(kvm, sev->handle); 3009 } 3010 3011 sev_asid_free(sev); 3012 } 3013 3014 void __init sev_set_cpu_caps(void) 3015 { 3016 if (sev_enabled) { 3017 kvm_cpu_cap_set(X86_FEATURE_SEV); 3018 kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM); 3019 } 3020 if (sev_es_enabled) { 3021 kvm_cpu_cap_set(X86_FEATURE_SEV_ES); 3022 kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); 3023 } 3024 if (sev_snp_enabled) { 3025 kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); 3026 kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM); 3027 } 3028 } 3029 3030 static bool is_sev_snp_initialized(void) 3031 { 3032 struct sev_user_data_snp_status *status; 3033 struct sev_data_snp_addr buf; 3034 bool initialized = false; 3035 int ret, error = 0; 3036 3037 status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); 3038 if (!status) 3039 return false; 3040 3041 buf.address = __psp_pa(status); 3042 ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); 3043 if (ret) { 3044 pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", 3045 
ret, error, error); 3046 goto out; 3047 } 3048 3049 initialized = !!status->state; 3050 3051 out: 3052 snp_free_firmware_page(status); 3053 3054 return initialized; 3055 } 3056 3057 void __init sev_hardware_setup(void) 3058 { 3059 unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; 3060 struct sev_platform_init_args init_args = {0}; 3061 bool sev_snp_supported = false; 3062 bool sev_es_supported = false; 3063 bool sev_supported = false; 3064 3065 if (!sev_enabled || !npt_enabled || !nrips) 3066 goto out; 3067 3068 /* 3069 * SEV must obviously be supported in hardware. Sanity check that the 3070 * CPU supports decode assists, which is mandatory for SEV guests to 3071 * support instruction emulation. Ditto for flushing by ASID, as SEV 3072 * guests are bound to a single ASID, i.e. KVM can't rotate to a new 3073 * ASID to effect a TLB flush. 3074 */ 3075 if (!boot_cpu_has(X86_FEATURE_SEV) || 3076 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) || 3077 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) 3078 goto out; 3079 3080 /* 3081 * The kernel's initcall infrastructure lacks the ability to express 3082 * dependencies between initcalls, whereas the modules infrastructure 3083 * automatically handles dependencies via symbol loading. Ensure the 3084 * PSP SEV driver is initialized before proceeding if KVM is built-in, 3085 * as the dependency isn't handled by the initcall infrastructure. 
3086 */ 3087 if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) 3088 goto out; 3089 3090 /* Retrieve SEV CPUID information */ 3091 cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); 3092 3093 /* Set encryption bit location for SEV-ES guests */ 3094 sev_enc_bit = ebx & 0x3f; 3095 3096 /* Maximum number of encrypted guests supported simultaneously */ 3097 max_sev_asid = ecx; 3098 if (!max_sev_asid) 3099 goto out; 3100 3101 /* Minimum ASID value that should be used for SEV guest */ 3102 min_sev_asid = edx; 3103 sev_me_mask = 1UL << (ebx & 0x3f); 3104 3105 /* 3106 * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap, 3107 * even though it's never used, so that the bitmap is indexed by the 3108 * actual ASID. 3109 */ 3110 nr_asids = max_sev_asid + 1; 3111 sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); 3112 if (!sev_asid_bitmap) 3113 goto out; 3114 3115 sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); 3116 if (!sev_reclaim_asid_bitmap) { 3117 bitmap_free(sev_asid_bitmap); 3118 sev_asid_bitmap = NULL; 3119 goto out; 3120 } 3121 3122 if (min_sev_asid <= max_sev_asid) { 3123 sev_asid_count = max_sev_asid - min_sev_asid + 1; 3124 WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); 3125 } 3126 sev_supported = true; 3127 3128 /* SEV-ES support requested? */ 3129 if (!sev_es_enabled) 3130 goto out; 3131 3132 /* 3133 * SEV-ES requires MMIO caching as KVM doesn't have access to the guest 3134 * instruction stream, i.e. can't emulate in response to a #NPF and 3135 * instead relies on #NPF(RSVD) being reflected into the guest as #VC 3136 * (the guest can then do a #VMGEXIT to request MMIO emulation). 3137 */ 3138 if (!enable_mmio_caching) 3139 goto out; 3140 3141 /* Does the CPU support SEV-ES? 
*/ 3142 if (!boot_cpu_has(X86_FEATURE_SEV_ES)) 3143 goto out; 3144 3145 if (!lbrv) { 3146 WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), 3147 "LBRV must be present for SEV-ES support"); 3148 goto out; 3149 } 3150 3151 /* Has the system been allocated ASIDs for SEV-ES? */ 3152 if (min_sev_asid == 1) 3153 goto out; 3154 3155 min_sev_es_asid = min_snp_asid = 1; 3156 max_sev_es_asid = max_snp_asid = min_sev_asid - 1; 3157 3158 sev_es_asid_count = min_sev_asid - 1; 3159 WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); 3160 sev_es_supported = true; 3161 sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); 3162 3163 out: 3164 if (sev_enabled) { 3165 init_args.probe = true; 3166 3167 if (sev_is_snp_ciphertext_hiding_supported()) 3168 init_args.max_snp_asid = min(nr_ciphertext_hiding_asids, 3169 min_sev_asid - 1); 3170 3171 if (sev_platform_init(&init_args)) 3172 sev_supported = sev_es_supported = sev_snp_supported = false; 3173 else if (sev_snp_supported) 3174 sev_snp_supported = is_sev_snp_initialized(); 3175 3176 if (sev_snp_supported) { 3177 snp_supported_policy_bits = sev_get_snp_policy_bits() & 3178 KVM_SNP_POLICY_MASK_VALID; 3179 nr_ciphertext_hiding_asids = init_args.max_snp_asid; 3180 } 3181 3182 /* 3183 * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP 3184 * ASID range is partitioned into separate SEV-ES and SEV-SNP 3185 * ASID ranges, with the SEV-SNP range being [1..max_snp_asid] 3186 * and the SEV-ES range being (max_snp_asid..max_sev_es_asid]. 3187 * Note, SEV-ES may effectively be disabled if all ASIDs from 3188 * the joint range are assigned to SEV-SNP. 3189 */ 3190 if (nr_ciphertext_hiding_asids) { 3191 max_snp_asid = nr_ciphertext_hiding_asids; 3192 min_sev_es_asid = max_snp_asid + 1; 3193 pr_info("SEV-SNP ciphertext hiding enabled\n"); 3194 } 3195 } 3196 3197 if (boot_cpu_has(X86_FEATURE_SEV)) 3198 pr_info("SEV %s (ASIDs %u - %u)\n", 3199 sev_supported ? min_sev_asid <= max_sev_asid ? 
"enabled" : 3200 "unusable" : 3201 "disabled", 3202 min_sev_asid, max_sev_asid); 3203 if (boot_cpu_has(X86_FEATURE_SEV_ES)) 3204 pr_info("SEV-ES %s (ASIDs %u - %u)\n", 3205 sev_es_supported ? min_sev_es_asid <= max_sev_es_asid ? "enabled" : 3206 "unusable" : 3207 "disabled", 3208 min_sev_es_asid, max_sev_es_asid); 3209 if (boot_cpu_has(X86_FEATURE_SEV_SNP)) 3210 pr_info("SEV-SNP %s (ASIDs %u - %u)\n", 3211 str_enabled_disabled(sev_snp_supported), 3212 min_snp_asid, max_snp_asid); 3213 3214 sev_enabled = sev_supported; 3215 sev_es_enabled = sev_es_supported; 3216 sev_snp_enabled = sev_snp_supported; 3217 3218 sev_supported_vmsa_features = 0; 3219 3220 if (sev_es_enabled && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) && 3221 cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) 3222 sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; 3223 3224 if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC)) 3225 sev_supported_vmsa_features |= SVM_SEV_FEAT_SECURE_TSC; 3226 } 3227 3228 void sev_hardware_unsetup(void) 3229 { 3230 if (!sev_enabled) 3231 return; 3232 3233 /* No need to take sev_bitmap_lock, all VMs have been destroyed. */ 3234 sev_flush_asids(1, max_sev_asid); 3235 3236 bitmap_free(sev_asid_bitmap); 3237 bitmap_free(sev_reclaim_asid_bitmap); 3238 3239 misc_cg_set_capacity(MISC_CG_RES_SEV, 0); 3240 misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); 3241 3242 sev_platform_shutdown(); 3243 } 3244 3245 int sev_cpu_init(struct svm_cpu_data *sd) 3246 { 3247 if (!sev_enabled) 3248 return 0; 3249 3250 sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL); 3251 if (!sd->sev_vmcbs) 3252 return -ENOMEM; 3253 3254 return 0; 3255 } 3256 3257 /* 3258 * Pages used by hardware to hold guest encrypted state must be flushed before 3259 * returning them to the system. 3260 */ 3261 static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) 3262 { 3263 unsigned int asid = sev_get_asid(vcpu->kvm); 3264 3265 /* 3266 * Note! 
The address must be a kernel address, as regular page walk 3267 * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user 3268 * address is non-deterministic and unsafe. This function deliberately 3269 * takes a pointer to deter passing in a user address. 3270 */ 3271 unsigned long addr = (unsigned long)va; 3272 3273 /* 3274 * If CPU enforced cache coherency for encrypted mappings of the 3275 * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache 3276 * flush is still needed in order to work properly with DMA devices. 3277 */ 3278 if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) { 3279 clflush_cache_range(va, PAGE_SIZE); 3280 return; 3281 } 3282 3283 /* 3284 * VM Page Flush takes a host virtual address and a guest ASID. Fall 3285 * back to full writeback of caches if this faults so as not to make 3286 * any problems worse by leaving stale encrypted data in the cache. 3287 */ 3288 if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) 3289 goto do_sev_writeback_caches; 3290 3291 return; 3292 3293 do_sev_writeback_caches: 3294 sev_writeback_caches(vcpu->kvm); 3295 } 3296 3297 void sev_guest_memory_reclaimed(struct kvm *kvm) 3298 { 3299 /* 3300 * With SNP+gmem, private/encrypted memory is unreachable via the 3301 * hva-based mmu notifiers, i.e. these events are explicitly scoped to 3302 * shared pages, where there's no need to flush caches. 3303 * 3304 * Checking for SEV+ outside of kvm->lock is safe as __sev_guest_init() 3305 * can only be done before vCPUs are created, caches can be incoherent 3306 * if and only if a vCPU was run, and either this task will see the VM 3307 * as being SEV+ or the vCPU won't be to access the memory (because of 3308 * the in-progress invalidation). 
3309 */ 3310 if (!____sev_guest(kvm) || ____sev_snp_guest(kvm)) 3311 return; 3312 3313 sev_writeback_caches(kvm); 3314 } 3315 3316 static void dump_ghcb(struct vcpu_svm *svm) 3317 { 3318 struct vmcb_control_area *control = &svm->vmcb->control; 3319 unsigned int nbits; 3320 3321 /* Re-use the dump_invalid_vmcb module parameter */ 3322 if (!dump_invalid_vmcb) { 3323 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 3324 return; 3325 } 3326 3327 nbits = sizeof(svm->sev_es.valid_bitmap) * 8; 3328 3329 /* 3330 * Print KVM's snapshot of the GHCB values that were (unsuccessfully) 3331 * used to handle the exit. If the guest has since modified the GHCB 3332 * itself, dumping the raw GHCB won't help debug why KVM was unable to 3333 * handle the VMGEXIT that KVM observed. 3334 */ 3335 pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); 3336 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", 3337 control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm)); 3338 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", 3339 control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); 3340 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", 3341 control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); 3342 pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", 3343 svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); 3344 pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); 3345 } 3346 3347 static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) 3348 { 3349 struct kvm_vcpu *vcpu = &svm->vcpu; 3350 struct ghcb *ghcb = svm->sev_es.ghcb; 3351 3352 /* 3353 * The GHCB protocol so far allows for the following data 3354 * to be returned: 3355 * GPRs RAX, RBX, RCX, RDX 3356 * 3357 * Copy their values, even if they may not have been written during the 3358 * VM-Exit. It's the guest's responsibility to not consume random data. 
3359 */ 3360 ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]); 3361 ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]); 3362 ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]); 3363 ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]); 3364 } 3365 3366 static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) 3367 { 3368 struct vmcb_control_area *control = &svm->vmcb->control; 3369 struct kvm_vcpu *vcpu = &svm->vcpu; 3370 struct ghcb *ghcb = svm->sev_es.ghcb; 3371 3372 /* 3373 * The GHCB protocol so far allows for the following data 3374 * to be supplied: 3375 * GPRs RAX, RBX, RCX, RDX 3376 * XCR0 3377 * CPL 3378 * 3379 * VMMCALL allows the guest to provide extra registers. KVM also 3380 * expects RSI for hypercalls, so include that, too. 3381 * 3382 * Copy their values to the appropriate location if supplied. 3383 */ 3384 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 3385 3386 BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap)); 3387 memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap)); 3388 3389 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm); 3390 vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm); 3391 vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm); 3392 vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm); 3393 vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm); 3394 3395 svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm); 3396 3397 if (kvm_ghcb_xcr0_is_valid(svm)) 3398 __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm)); 3399 3400 if (kvm_ghcb_xss_is_valid(svm)) 3401 __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); 3402 3403 /* Copy the GHCB exit information into the VMCB fields */ 3404 control->exit_code = kvm_ghcb_get_sw_exit_code(svm); 3405 control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); 3406 control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); 3407 svm->sev_es.sw_scratch = 
kvm_ghcb_get_sw_scratch_if_valid(svm); 3408 3409 /* Clear the valid entries fields */ 3410 memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); 3411 } 3412 3413 static int sev_es_validate_vmgexit(struct vcpu_svm *svm) 3414 { 3415 struct vmcb_control_area *control = &svm->vmcb->control; 3416 struct kvm_vcpu *vcpu = &svm->vcpu; 3417 u64 reason; 3418 3419 /* Only GHCB Usage code 0 is supported */ 3420 if (svm->sev_es.ghcb->ghcb_usage) { 3421 reason = GHCB_ERR_INVALID_USAGE; 3422 goto vmgexit_err; 3423 } 3424 3425 reason = GHCB_ERR_MISSING_INPUT; 3426 3427 if (!kvm_ghcb_sw_exit_code_is_valid(svm) || 3428 !kvm_ghcb_sw_exit_info_1_is_valid(svm) || 3429 !kvm_ghcb_sw_exit_info_2_is_valid(svm)) 3430 goto vmgexit_err; 3431 3432 switch (control->exit_code) { 3433 case SVM_EXIT_READ_DR7: 3434 break; 3435 case SVM_EXIT_WRITE_DR7: 3436 if (!kvm_ghcb_rax_is_valid(svm)) 3437 goto vmgexit_err; 3438 break; 3439 case SVM_EXIT_RDTSC: 3440 break; 3441 case SVM_EXIT_RDPMC: 3442 if (!kvm_ghcb_rcx_is_valid(svm)) 3443 goto vmgexit_err; 3444 break; 3445 case SVM_EXIT_CPUID: 3446 if (!kvm_ghcb_rax_is_valid(svm) || 3447 !kvm_ghcb_rcx_is_valid(svm)) 3448 goto vmgexit_err; 3449 if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd) 3450 if (!kvm_ghcb_xcr0_is_valid(svm)) 3451 goto vmgexit_err; 3452 break; 3453 case SVM_EXIT_INVD: 3454 break; 3455 case SVM_EXIT_IOIO: 3456 if (control->exit_info_1 & SVM_IOIO_STR_MASK) { 3457 if (!kvm_ghcb_sw_scratch_is_valid(svm)) 3458 goto vmgexit_err; 3459 } else { 3460 if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK)) 3461 if (!kvm_ghcb_rax_is_valid(svm)) 3462 goto vmgexit_err; 3463 } 3464 break; 3465 case SVM_EXIT_MSR: 3466 if (!kvm_ghcb_rcx_is_valid(svm)) 3467 goto vmgexit_err; 3468 if (control->exit_info_1) { 3469 if (!kvm_ghcb_rax_is_valid(svm) || 3470 !kvm_ghcb_rdx_is_valid(svm)) 3471 goto vmgexit_err; 3472 } 3473 break; 3474 case SVM_EXIT_VMMCALL: 3475 if (!kvm_ghcb_rax_is_valid(svm) || 3476 !kvm_ghcb_cpl_is_valid(svm)) 3477 goto vmgexit_err; 3478 
break; 3479 case SVM_EXIT_RDTSCP: 3480 break; 3481 case SVM_EXIT_WBINVD: 3482 break; 3483 case SVM_EXIT_MONITOR: 3484 if (!kvm_ghcb_rax_is_valid(svm) || 3485 !kvm_ghcb_rcx_is_valid(svm) || 3486 !kvm_ghcb_rdx_is_valid(svm)) 3487 goto vmgexit_err; 3488 break; 3489 case SVM_EXIT_MWAIT: 3490 if (!kvm_ghcb_rax_is_valid(svm) || 3491 !kvm_ghcb_rcx_is_valid(svm)) 3492 goto vmgexit_err; 3493 break; 3494 case SVM_VMGEXIT_MMIO_READ: 3495 case SVM_VMGEXIT_MMIO_WRITE: 3496 if (!kvm_ghcb_sw_scratch_is_valid(svm)) 3497 goto vmgexit_err; 3498 break; 3499 case SVM_VMGEXIT_AP_CREATION: 3500 if (!is_sev_snp_guest(vcpu)) 3501 goto vmgexit_err; 3502 if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) 3503 if (!kvm_ghcb_rax_is_valid(svm)) 3504 goto vmgexit_err; 3505 break; 3506 case SVM_VMGEXIT_NMI_COMPLETE: 3507 case SVM_VMGEXIT_AP_HLT_LOOP: 3508 case SVM_VMGEXIT_AP_JUMP_TABLE: 3509 case SVM_VMGEXIT_UNSUPPORTED_EVENT: 3510 case SVM_VMGEXIT_HV_FEATURES: 3511 case SVM_VMGEXIT_TERM_REQUEST: 3512 break; 3513 case SVM_VMGEXIT_PSC: 3514 if (!is_sev_snp_guest(vcpu) || !kvm_ghcb_sw_scratch_is_valid(svm)) 3515 goto vmgexit_err; 3516 break; 3517 case SVM_VMGEXIT_GUEST_REQUEST: 3518 case SVM_VMGEXIT_EXT_GUEST_REQUEST: 3519 if (!is_sev_snp_guest(vcpu) || 3520 !PAGE_ALIGNED(control->exit_info_1) || 3521 !PAGE_ALIGNED(control->exit_info_2) || 3522 control->exit_info_1 == control->exit_info_2) 3523 goto vmgexit_err; 3524 break; 3525 default: 3526 reason = GHCB_ERR_INVALID_EVENT; 3527 goto vmgexit_err; 3528 } 3529 3530 return 0; 3531 3532 vmgexit_err: 3533 /* 3534 * Print the exit code even though it may not be marked valid as it 3535 * could help with debugging. 
3536 */ 3537 if (reason == GHCB_ERR_INVALID_USAGE) { 3538 vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", 3539 svm->sev_es.ghcb->ghcb_usage); 3540 } else if (reason == GHCB_ERR_INVALID_EVENT) { 3541 vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", 3542 control->exit_code); 3543 } else { 3544 vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", 3545 control->exit_code); 3546 dump_ghcb(svm); 3547 } 3548 3549 svm_vmgexit_bad_input(svm, reason); 3550 3551 /* Resume the guest to "return" the error code. */ 3552 return 1; 3553 } 3554 3555 static void __sev_es_unmap_ghcb(struct vcpu_svm *svm) 3556 { 3557 if (svm->sev_es.ghcb_sa_free) { 3558 kvfree(svm->sev_es.ghcb_sa); 3559 svm->sev_es.ghcb_sa = NULL; 3560 svm->sev_es.ghcb_sa_free = false; 3561 } 3562 3563 if (svm->sev_es.ghcb) { 3564 kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); 3565 svm->sev_es.ghcb = NULL; 3566 } 3567 } 3568 3569 void sev_es_unmap_ghcb(struct vcpu_svm *svm) 3570 { 3571 /* Clear any indication that the vCPU is in a type of AP Reset Hold */ 3572 svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; 3573 3574 if (!svm->sev_es.ghcb) 3575 return; 3576 3577 /* 3578 * If the scratch area lives outside the GHCB, there's a buffer that, 3579 * depending on the operation performed, may need to be synced. 3580 */ 3581 if (svm->sev_es.ghcb_sa_sync) { 3582 kvm_write_guest(svm->vcpu.kvm, svm->sev_es.sw_scratch, 3583 svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); 3584 svm->sev_es.ghcb_sa_sync = false; 3585 } 3586 3587 trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); 3588 3589 sev_es_sync_to_ghcb(svm); 3590 3591 __sev_es_unmap_ghcb(svm); 3592 } 3593 3594 void sev_free_vcpu(struct kvm_vcpu *vcpu) 3595 { 3596 struct vcpu_svm *svm; 3597 3598 if (!is_sev_es_guest(vcpu)) 3599 return; 3600 3601 svm = to_svm(vcpu); 3602 3603 /* 3604 * If it's an SNP guest, then the VMSA was marked in the RMP table as 3605 * a guest-owned page. 
Transition the page to hypervisor state before 3606 * releasing it back to the system. 3607 */ 3608 if (is_sev_snp_guest(vcpu)) { 3609 u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; 3610 3611 if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) 3612 goto skip_vmsa_free; 3613 } 3614 3615 if (vcpu->arch.guest_state_protected) 3616 sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); 3617 3618 __free_page(virt_to_page(svm->sev_es.vmsa)); 3619 3620 skip_vmsa_free: 3621 __sev_es_unmap_ghcb(svm); 3622 } 3623 3624 int pre_sev_run(struct vcpu_svm *svm, int cpu) 3625 { 3626 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 3627 struct kvm_vcpu *vcpu = &svm->vcpu; 3628 struct kvm *kvm = vcpu->kvm; 3629 unsigned int asid = sev_get_asid(kvm); 3630 3631 /* 3632 * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid 3633 * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP 3634 * AP Destroy event. 3635 */ 3636 if (is_sev_es_guest(vcpu) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) 3637 return -EINVAL; 3638 3639 /* 3640 * To optimize cache flushes when memory is reclaimed from an SEV VM, 3641 * track physical CPUs that enter the guest for SEV VMs and thus can 3642 * have encrypted, dirty data in the cache, and flush caches only for 3643 * CPUs that have entered the guest. 3644 */ 3645 if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus)) 3646 cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus); 3647 3648 /* Assign the asid allocated with this SEV guest */ 3649 svm->asid = asid; 3650 3651 /* 3652 * Flush guest TLB: 3653 * 3654 * 1) when different VMCB for the same ASID is to be run on the same host CPU. 3655 * 2) or this VMCB was executed on different host CPU in previous VMRUNs. 
3656 */ 3657 if (sd->sev_vmcbs[asid] == svm->vmcb && 3658 svm->vcpu.arch.last_vmentry_cpu == cpu) 3659 return 0; 3660 3661 sd->sev_vmcbs[asid] = svm->vmcb; 3662 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 3663 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 3664 return 0; 3665 } 3666 3667 #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) 3668 static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 min_len) 3669 { 3670 struct vmcb_control_area *control = &svm->vmcb->control; 3671 u64 ghcb_scratch_beg, ghcb_scratch_end; 3672 u64 scratch_gpa_beg, scratch_gpa_end; 3673 void *scratch_va; 3674 3675 if (WARN_ON_ONCE(!min_len)) 3676 goto e_scratch; 3677 3678 scratch_gpa_beg = svm->sev_es.sw_scratch; 3679 if (!scratch_gpa_beg) { 3680 pr_err("vmgexit: scratch gpa not provided\n"); 3681 goto e_scratch; 3682 } 3683 3684 scratch_gpa_end = scratch_gpa_beg + min_len; 3685 if (scratch_gpa_end < scratch_gpa_beg) { 3686 pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", 3687 min_len, scratch_gpa_beg); 3688 goto e_scratch; 3689 } 3690 3691 WARN_ON_ONCE(svm->sev_es.ghcb_sa_sync || svm->sev_es.ghcb_sa_free); 3692 3693 if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { 3694 /* Scratch area begins within GHCB */ 3695 ghcb_scratch_beg = control->ghcb_gpa + 3696 offsetof(struct ghcb, shared_buffer); 3697 ghcb_scratch_end = control->ghcb_gpa + 3698 offsetof(struct ghcb, reserved_0xff0); 3699 3700 /* 3701 * If the scratch area begins within the GHCB, it must be 3702 * completely contained in the GHCB shared buffer area. 
3703 */ 3704 if (scratch_gpa_beg < ghcb_scratch_beg || 3705 scratch_gpa_end > ghcb_scratch_end) { 3706 pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n", 3707 scratch_gpa_beg, scratch_gpa_end); 3708 goto e_scratch; 3709 } 3710 3711 scratch_va = (void *)svm->sev_es.ghcb; 3712 scratch_va += (scratch_gpa_beg - control->ghcb_gpa); 3713 3714 svm->sev_es.ghcb_sa_sync = false; 3715 svm->sev_es.ghcb_sa_free = false; 3716 svm->sev_es.ghcb_sa_len = ghcb_scratch_end - scratch_gpa_beg; 3717 } else { 3718 /* GHCB v2 requires the scratch area to be within the GHCB. */ 3719 if (to_kvm_sev_info(svm->vcpu.kvm)->ghcb_version >= 2) 3720 goto e_scratch; 3721 3722 /* 3723 * The guest memory must be read into a kernel buffer, so 3724 * limit the size 3725 */ 3726 if (min_len > GHCB_SCRATCH_AREA_LIMIT) { 3727 pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", 3728 min_len, GHCB_SCRATCH_AREA_LIMIT); 3729 goto e_scratch; 3730 } 3731 scratch_va = kvzalloc(min_len, GFP_KERNEL_ACCOUNT); 3732 if (!scratch_va) 3733 return -ENOMEM; 3734 3735 if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, min_len)) { 3736 /* Unable to copy scratch area from guest */ 3737 pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); 3738 3739 kvfree(scratch_va); 3740 return -EFAULT; 3741 } 3742 3743 /* 3744 * The scratch area is outside the GHCB. The operation will 3745 * dictate whether the buffer needs to be synced before running 3746 * the vCPU next time (i.e. a read was requested so the data 3747 * must be written back to the guest memory). 
3748 */ 3749 svm->sev_es.ghcb_sa_sync = sync; 3750 svm->sev_es.ghcb_sa_free = true; 3751 svm->sev_es.ghcb_sa_len = min_len; 3752 } 3753 3754 svm->sev_es.ghcb_sa = scratch_va; 3755 return 0; 3756 3757 e_scratch: 3758 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA); 3759 3760 return 1; 3761 } 3762 3763 static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask, 3764 unsigned int pos) 3765 { 3766 svm->vmcb->control.ghcb_gpa &= ~(mask << pos); 3767 svm->vmcb->control.ghcb_gpa |= (value & mask) << pos; 3768 } 3769 3770 static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos) 3771 { 3772 return (svm->vmcb->control.ghcb_gpa >> pos) & mask; 3773 } 3774 3775 static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) 3776 { 3777 svm->vmcb->control.ghcb_gpa = value; 3778 } 3779 3780 static int snp_rmptable_psmash(kvm_pfn_t pfn) 3781 { 3782 int ret; 3783 3784 pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); 3785 3786 /* 3787 * PSMASH_FAIL_INUSE indicates another processor is modifying the 3788 * entry, so retry until that's no longer the case. 
3789 */ 3790 do { 3791 ret = psmash(pfn); 3792 } while (ret == PSMASH_FAIL_INUSE); 3793 3794 return ret; 3795 } 3796 3797 static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) 3798 { 3799 struct vcpu_svm *svm = to_svm(vcpu); 3800 3801 if (vcpu->run->hypercall.ret) 3802 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3803 else 3804 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); 3805 3806 return 1; /* resume guest */ 3807 } 3808 3809 static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) 3810 { 3811 u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); 3812 u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); 3813 struct kvm_vcpu *vcpu = &svm->vcpu; 3814 3815 if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { 3816 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3817 return 1; /* resume guest */ 3818 } 3819 3820 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { 3821 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3822 return 1; /* resume guest */ 3823 } 3824 3825 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; 3826 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 3827 /* 3828 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 3829 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 3830 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 3831 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 3832 */ 3833 vcpu->run->hypercall.ret = 0; 3834 vcpu->run->hypercall.args[0] = gpa; 3835 vcpu->run->hypercall.args[1] = 1; 3836 vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) 3837 ? 
KVM_MAP_GPA_RANGE_ENCRYPTED 3838 : KVM_MAP_GPA_RANGE_DECRYPTED; 3839 vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K; 3840 3841 vcpu->arch.complete_userspace_io = snp_complete_psc_msr; 3842 3843 return 0; /* forward request to userspace */ 3844 } 3845 3846 struct psc_buffer { 3847 struct psc_hdr hdr; 3848 struct psc_entry entries[]; 3849 } __packed; 3850 3851 static int snp_do_psc(struct vcpu_svm *svm); 3852 3853 static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) 3854 { 3855 memset(&svm->sev_es.psc, 0, sizeof(svm->sev_es.psc)); 3856 3857 /* 3858 * PSC requests always get a "no action" response in SW_EXITINFO1, with 3859 * a PSC-specific return code in SW_EXITINFO2 that provides the "real" 3860 * return code. E.g. if the PSC request was interrupted, the need to 3861 * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1. 3862 */ 3863 svm_vmgexit_no_action(svm, psc_ret); 3864 } 3865 3866 static void __snp_complete_one_psc(struct vcpu_svm *svm) 3867 { 3868 struct vcpu_sev_es_state *sev_es = &svm->sev_es; 3869 struct psc_buffer *guest_psc = sev_es->ghcb_sa; 3870 __u16 idx; 3871 3872 /* 3873 * Everything in-flight has been processed successfully. Update the 3874 * corresponding entries in the guest's PSC buffer and zero out the 3875 * count of in-flight PSC entries. 3876 */ 3877 for (idx = sev_es->psc.cur_idx; sev_es->psc.batch_size; 3878 sev_es->psc.batch_size--, idx++) { 3879 struct psc_entry entry = READ_ONCE(guest_psc->entries[idx]); 3880 3881 guest_psc->entries[idx].cur_page = entry.pagesize ? 512 : 1; 3882 } 3883 3884 sev_es->psc.cur_idx = idx; 3885 guest_psc->hdr.cur_entry = idx; 3886 } 3887 3888 static int snp_complete_one_psc(struct kvm_vcpu *vcpu) 3889 { 3890 struct vcpu_svm *svm = to_svm(vcpu); 3891 3892 if (vcpu->run->hypercall.ret) { 3893 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3894 return 1; /* resume guest */ 3895 } 3896 3897 __snp_complete_one_psc(svm); 3898 3899 /* Handle the next range (if any). 
*/ 3900 return snp_do_psc(svm); 3901 } 3902 3903 static int snp_do_psc(struct vcpu_svm *svm) 3904 { 3905 struct vcpu_sev_es_state *sev_es = &svm->sev_es; 3906 struct psc_buffer *guest_psc = sev_es->ghcb_sa; 3907 struct kvm_vcpu *vcpu = &svm->vcpu; 3908 struct psc_entry entry_start; 3909 int npages; 3910 bool huge; 3911 u64 gfn; 3912 u16 idx; 3913 3914 next_range: 3915 /* There should be no other PSCs in-flight at this point. */ 3916 if (WARN_ON_ONCE(svm->sev_es.psc.batch_size)) { 3917 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3918 return 1; 3919 } 3920 3921 /* Find the start of the next range which needs processing. */ 3922 for (idx = sev_es->psc.cur_idx; idx <= sev_es->psc.end_idx; idx++) { 3923 entry_start = READ_ONCE(guest_psc->entries[idx]); 3924 3925 gfn = entry_start.gfn; 3926 huge = entry_start.pagesize; 3927 npages = huge ? 512 : 1; 3928 3929 if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) { 3930 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY); 3931 return 1; 3932 } 3933 3934 if (entry_start.cur_page) { 3935 /* 3936 * If this is a partially-completed 2M range, force 4K handling 3937 * for the remaining pages since they're effectively split at 3938 * this point. Subsequent code should ensure this doesn't get 3939 * combined with adjacent PSC entries where 2M handling is still 3940 * possible. 3941 */ 3942 npages -= entry_start.cur_page; 3943 gfn += entry_start.cur_page; 3944 huge = false; 3945 } 3946 3947 if (npages) 3948 break; 3949 3950 /* 3951 * Increment the guest-visible index to communicate the current 3952 * entry back to the guest, e.g. in case of failure. No need 3953 * for READ_ONCE() as KVM doesn't consume the field, i.e. a 3954 * misbehaving guest can only break itself. 3955 */ 3956 guest_psc->hdr.cur_entry++; 3957 } 3958 3959 if (idx > sev_es->psc.end_idx) { 3960 /* Nothing more to process. 
*/ 3961 snp_complete_psc(svm, 0); 3962 return 1; 3963 } 3964 3965 sev_es->psc.is_2m = huge; 3966 sev_es->psc.cur_idx = idx; 3967 sev_es->psc.batch_size = 1; 3968 3969 /* 3970 * Find all subsequent PSC entries that contain adjacent GPA 3971 * ranges/operations and can be combined into a single 3972 * KVM_HC_MAP_GPA_RANGE exit. 3973 */ 3974 while (++idx <= sev_es->psc.end_idx) { 3975 struct psc_entry entry = READ_ONCE(guest_psc->entries[idx]); 3976 3977 if (entry.operation != entry_start.operation || 3978 entry.gfn != entry_start.gfn + npages || 3979 entry.cur_page || !!entry.pagesize != huge) 3980 break; 3981 3982 sev_es->psc.batch_size++; 3983 npages += huge ? 512 : 1; 3984 } 3985 3986 switch (entry_start.operation) { 3987 case VMGEXIT_PSC_OP_PRIVATE: 3988 case VMGEXIT_PSC_OP_SHARED: 3989 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; 3990 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 3991 /* 3992 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 3993 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 3994 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 3995 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 3996 */ 3997 vcpu->run->hypercall.ret = 0; 3998 vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); 3999 vcpu->run->hypercall.args[1] = npages; 4000 vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE 4001 ? KVM_MAP_GPA_RANGE_ENCRYPTED 4002 : KVM_MAP_GPA_RANGE_DECRYPTED; 4003 vcpu->run->hypercall.args[2] |= entry_start.pagesize 4004 ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M 4005 : KVM_MAP_GPA_RANGE_PAGE_SZ_4K; 4006 vcpu->arch.complete_userspace_io = snp_complete_one_psc; 4007 return 0; /* forward request to userspace */ 4008 default: 4009 /* 4010 * Only shared/private PSC operations are currently supported, so if the 4011 * entire range consists of unsupported operations (e.g. 
SMASH/UNSMASH), 4012 * then consider the entire range completed and avoid exiting to 4013 * userspace. In theory snp_complete_psc() can always be called directly 4014 * at this point to complete the current range and start the next one, 4015 * but that could lead to unexpected levels of recursion. 4016 */ 4017 __snp_complete_one_psc(svm); 4018 goto next_range; 4019 } 4020 4021 BUG(); 4022 } 4023 4024 static int snp_begin_psc(struct vcpu_svm *svm) 4025 { 4026 struct vcpu_sev_es_state *sev_es = &svm->sev_es; 4027 struct psc_buffer *guest_psc = sev_es->ghcb_sa; 4028 u16 max_nr_entries; 4029 4030 if (!user_exit_on_hypercall(svm->vcpu.kvm, KVM_HC_MAP_GPA_RANGE)) { 4031 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 4032 return 1; 4033 } 4034 4035 /* 4036 * GHCB v2 requires the scratch area to reside within the GHCB itself, 4037 * and PSC requests are only supported for GHCB v2+. Thus it should be 4038 * impossible to exceed the max PSC entry count (which is derived from 4039 * the size of the shared GHCB buffer). 4040 */ 4041 max_nr_entries = (sev_es->ghcb_sa_len - sizeof(struct psc_hdr)) / 4042 sizeof(struct psc_entry); 4043 if (WARN_ON_ONCE(max_nr_entries > VMGEXIT_PSC_MAX_COUNT)) { 4044 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 4045 return 1; 4046 } 4047 4048 /* 4049 * The PSC descriptor buffer can be modified by a misbehaved guest after 4050 * validation, so take care to only use validated copies of values used 4051 * for things like array indexing. 4052 */ 4053 sev_es->psc.cur_idx = READ_ONCE(guest_psc->hdr.cur_entry); 4054 sev_es->psc.end_idx = READ_ONCE(guest_psc->hdr.end_entry); 4055 4056 if (sev_es->psc.end_idx >= max_nr_entries) { 4057 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); 4058 return 1; 4059 } 4060 4061 return snp_do_psc(svm); 4062 } 4063 4064 /* 4065 * Invoked as part of svm_vcpu_reset() processing of an init event. 
4066 */ 4067 static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) 4068 { 4069 struct vcpu_svm *svm = to_svm(vcpu); 4070 struct kvm_memory_slot *slot; 4071 struct page *page; 4072 kvm_pfn_t pfn; 4073 gfn_t gfn; 4074 4075 guard(mutex)(&svm->sev_es.snp_vmsa_mutex); 4076 4077 if (!svm->sev_es.snp_ap_waiting_for_reset) 4078 return; 4079 4080 svm->sev_es.snp_ap_waiting_for_reset = false; 4081 4082 /* Mark the vCPU as offline and not runnable */ 4083 vcpu->arch.pv.pv_unhalted = false; 4084 kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); 4085 4086 /* Clear use of the VMSA */ 4087 svm->vmcb->control.vmsa_pa = INVALID_PAGE; 4088 4089 /* 4090 * When replacing the VMSA during SEV-SNP AP creation, 4091 * mark the VMCB dirty so that full state is always reloaded. 4092 */ 4093 vmcb_mark_all_dirty(svm->vmcb); 4094 4095 if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) 4096 return; 4097 4098 gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); 4099 svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; 4100 4101 slot = gfn_to_memslot(vcpu->kvm, gfn); 4102 if (!slot) 4103 return; 4104 4105 /* 4106 * The new VMSA will be private memory guest memory, so retrieve the 4107 * PFN from the gmem backend. 4108 */ 4109 if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) 4110 return; 4111 4112 /* 4113 * From this point forward, the VMSA will always be a guest-mapped page 4114 * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In 4115 * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but 4116 * that involves cleanups like flushing caches, which would ideally be 4117 * handled during teardown rather than guest boot. Deferring that also 4118 * allows the existing logic for SEV-ES VMSAs to be re-used with 4119 * minimal SNP-specific changes. 
4120 */ 4121 svm->sev_es.snp_has_guest_vmsa = true; 4122 4123 /* Use the new VMSA */ 4124 svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); 4125 4126 /* Mark the vCPU as runnable */ 4127 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 4128 4129 /* 4130 * gmem pages aren't currently migratable, but if this ever changes 4131 * then care should be taken to ensure svm->sev_es.vmsa is pinned 4132 * through some other means. 4133 */ 4134 kvm_release_page_clean(page); 4135 } 4136 4137 static int sev_snp_ap_creation(struct vcpu_svm *svm) 4138 { 4139 struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); 4140 struct kvm_vcpu *vcpu = &svm->vcpu; 4141 struct kvm_vcpu *target_vcpu; 4142 struct vcpu_svm *target_svm; 4143 unsigned int request; 4144 unsigned int apic_id; 4145 4146 request = lower_32_bits(svm->vmcb->control.exit_info_1); 4147 apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); 4148 4149 /* Validate the APIC ID */ 4150 target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); 4151 if (!target_vcpu) { 4152 vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", 4153 apic_id); 4154 return -EINVAL; 4155 } 4156 4157 target_svm = to_svm(target_vcpu); 4158 4159 guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex); 4160 4161 switch (request) { 4162 case SVM_VMGEXIT_AP_CREATE_ON_INIT: 4163 case SVM_VMGEXIT_AP_CREATE: 4164 if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) { 4165 vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n", 4166 vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); 4167 return -EINVAL; 4168 } 4169 4170 if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { 4171 vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", 4172 svm->vmcb->control.exit_info_2); 4173 return -EINVAL; 4174 } 4175 4176 /* 4177 * Malicious guest can RMPADJUST a large page into VMSA which 4178 * will hit the SNP erratum where the CPU will incorrectly signal 4179 * an RMP violation #PF if a hugepage 
collides with the RMP entry
		 * of VMSA page, reject the AP CREATE request if VMSA address from
		 * guest is 2M aligned.
		 */
		if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
			vcpu_unimpl(vcpu,
				    "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
				    svm->vmcb->control.exit_info_2);
			return -EINVAL;
		}

		target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
		break;
	case SVM_VMGEXIT_AP_DESTROY:
		target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
		break;
	default:
		vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
			    request);
		return -EINVAL;
	}

	target_svm->sev_es.snp_ap_waiting_for_reset = true;

	/*
	 * Unless Creation is deferred until INIT, signal the vCPU to update
	 * its state.
	 */
	if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
		kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);

	return 0;
}

/*
 * Forward an SNP guest request to firmware.
 *
 * Copies one page from guest memory at @req_gpa into sev->guest_req_buf,
 * issues SEV_CMD_SNP_GUEST_REQUEST, and copies one page from
 * sev->guest_resp_buf back to guest memory at @resp_gpa.  The buffers are
 * used with sev->guest_req_mutex held.
 *
 * Return: 1 to resume the guest (the firmware error, if any, is reported to
 * the guest via SNP_GUEST_ERR(0, fw_err)), -EINVAL if the vCPU is not an SNP
 * guest, -EIO if guest memory could not be read/written, or the error from
 * sev_issue_cmd() when it fails without setting a firmware error.
 */
static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
{
	struct sev_data_snp_guest_request data = {0};
	struct kvm *kvm = svm->vcpu.kvm;
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
	sev_ret_code fw_err = 0;
	int ret;

	if (!is_sev_snp_guest(&svm->vcpu))
		return -EINVAL;

	guard(mutex)(&sev->guest_req_mutex);

	if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE))
		return -EIO;

	data.gctx_paddr = __psp_pa(sev->snp_context);
	data.req_paddr = __psp_pa(sev->guest_req_buf);
	data.res_paddr = __psp_pa(sev->guest_resp_buf);

	/*
	 * Firmware failures are propagated on to guest, but any other failure
	 * condition along the way should be reported to userspace. E.g. if
	 * the PSP is dead and commands are timing out.
	 */
	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
	if (ret && !fw_err)
		return ret;

	if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE))
		return -EIO;

	/* No action is requested *from KVM* if there was a firmware error. */
	svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err));

	/* resume guest */
	return 1;
}

/*
 * Store SNP_GUEST_ERR(@vmm_error, 0) in the GHCB's SW_EXIT_INFO_2 field.
 *
 * Return: always 1, i.e. resume the guest.
 */
static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error)
{
	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0));

	return 1; /* resume guest */
}

/*
 * Completion callback installed by snp_handle_ext_guest_req() before exiting
 * to userspace with KVM_EXIT_SNP_REQ_CERTS.  Dispatches on the value found
 * in run->snp_req_certs.ret:
 *
 *   0      - issue the guest request using the GPAs in exit_info_1/2
 *   ENOSPC - copy run->snp_req_certs.npages into RBX and report
 *            SNP_GUEST_VMM_ERR_INVALID_LEN to the guest
 *   EAGAIN - report SNP_GUEST_VMM_ERR_BUSY to the guest
 *   EIO    - report SNP_GUEST_VMM_ERR_GENERIC to the guest
 *
 * Return: the result of the selected handler, or -EINVAL for any other value.
 */
static int snp_complete_req_certs(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;

	switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) {
	case 0:
		return snp_handle_guest_req(svm, control->exit_info_1,
					    control->exit_info_2);
	case ENOSPC:
		vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages;
		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN);
	case EAGAIN:
		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY);
	case EIO:
		return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC);
	default:
		break;
	}

	return -EINVAL;
}

/*
 * Handle an SNP extended guest request.
 *
 * Reads the one-byte msg_type field of the request header at @req_gpa.  For
 * SNP_MSG_REPORT_REQ, the certificate data area described by RAX (GPA) and
 * RBX (page count) is either handed to userspace (when snp_certs_enabled is
 * set) or given an empty certificate table.  All other message types, and
 * report requests not forwarded to userspace, are passed on to
 * snp_handle_guest_req().
 *
 * Return: 0 to exit to userspace, 1 to resume the guest, -EINVAL if the vCPU
 * is not an SNP guest, -EIO on guest memory access failure, or the result of
 * snp_handle_guest_req().
 */
static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm *kvm = vcpu->kvm;

	u8 msg_type;

	if (!is_sev_snp_guest(vcpu))
		return -EINVAL;

	if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
			   &msg_type, 1))
		return -EIO;

	/*
	 * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
	 * additional certificate data to be provided alongside the attestation
	 * report via the guest-provided data pages indicated by RAX/RBX. If
	 * userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace
	 * to give userspace an opportunity to provide the certificate data
	 * before issuing/completing the attestation request. Otherwise, return
	 * an empty certificate table in the guest-provided data pages and
	 * handle the attestation request immediately.
	 */
	if (msg_type == SNP_MSG_REPORT_REQ) {
		struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
		u64 data_npages;
		gpa_t data_gpa;

		if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
			goto request_invalid;

		data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
		data_npages = vcpu->arch.regs[VCPU_REGS_RBX];

		if (!PAGE_ALIGNED(data_gpa))
			goto request_invalid;

		if (sev->snp_certs_enabled) {
			vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS;
			vcpu->run->snp_req_certs.gpa = data_gpa;
			vcpu->run->snp_req_certs.npages = data_npages;
			vcpu->run->snp_req_certs.ret = 0;
			vcpu->arch.complete_userspace_io = snp_complete_req_certs;
			return 0;
		}

		/*
		 * As per GHCB spec (see "SNP Extended Guest Request"), the
		 * certificate table is terminated by 24-bytes of zeroes.
		 */
		if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
			return -EIO;
	}

	return snp_handle_guest_req(svm, req_gpa, resp_gpa);

request_invalid:
	svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
	return 1; /* resume guest */
}

/*
 * Handle a VMGEXIT that uses the GHCB MSR protocol, i.e. where the request
 * is encoded directly in control->ghcb_gpa (selected by the bits covered by
 * GHCB_MSR_INFO_MASK) rather than in a mapped GHCB page.  Responses are
 * written back through set_ghcb_msr()/set_ghcb_msr_bits().
 *
 * Return: the value produced by the individual request handler (1 by
 * default), or 0 with run->exit_reason set to KVM_EXIT_SYSTEM_EVENT /
 * KVM_SYSTEM_EVENT_SEV_TERM when the request terminates the guest.  Note
 * that the termination path does not emit the "exit" tracepoint.
 */
static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
	u64 ghcb_info;
	int ret = 1;

	ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK;

	trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id,
					     control->ghcb_gpa);

	switch (ghcb_info) {
	case GHCB_MSR_SEV_INFO_REQ:
		set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
						    GHCB_VERSION_MIN,
						    sev_enc_bit));
		break;
	case GHCB_MSR_CPUID_REQ: {
		u64 cpuid_fn, cpuid_reg, cpuid_value;

		cpuid_fn = get_ghcb_msr_bits(svm,
					     GHCB_MSR_CPUID_FUNC_MASK,
					     GHCB_MSR_CPUID_FUNC_POS);

		/* Initialize the registers needed by the CPUID intercept */
		vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn;
		vcpu->arch.regs[VCPU_REGS_RCX] = 0;

		ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
		if (!ret) {
			/* Error, keep GHCB MSR value as-is */
			break;
		}

		/* Select which output register to return: 0=RAX, 1=RBX, 2=RCX, else RDX */
		cpuid_reg = get_ghcb_msr_bits(svm,
					      GHCB_MSR_CPUID_REG_MASK,
					      GHCB_MSR_CPUID_REG_POS);
		if (cpuid_reg == 0)
			cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX];
		else if (cpuid_reg == 1)
			cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX];
		else if (cpuid_reg == 2)
			cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX];
		else
			cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX];

		set_ghcb_msr_bits(svm, cpuid_value,
				  GHCB_MSR_CPUID_VALUE_MASK,
				  GHCB_MSR_CPUID_VALUE_POS);

		set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
				  GHCB_MSR_INFO_MASK,
				  GHCB_MSR_INFO_POS);
		break;
	}
	case GHCB_MSR_AP_RESET_HOLD_REQ:
		svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
		ret = kvm_emulate_ap_reset_hold(&svm->vcpu);

		/*
		 * Preset the result to a non-SIPI return and then only set
		 * the result to non-zero when delivering a SIPI.
		 */
		set_ghcb_msr_bits(svm, 0,
				  GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
				  GHCB_MSR_AP_RESET_HOLD_RESULT_POS);

		set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
				  GHCB_MSR_INFO_MASK,
				  GHCB_MSR_INFO_POS);
		break;
	case GHCB_MSR_HV_FT_REQ:
		set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
				  GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
		set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
				  GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
		break;
	case GHCB_MSR_PREF_GPA_REQ:
		if (!is_sev_snp_guest(vcpu))
			goto out_terminate;

		set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
				  GHCB_MSR_GPA_VALUE_POS);
		set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
				  GHCB_MSR_INFO_POS);
		break;
	case GHCB_MSR_REG_GPA_REQ: {
		u64 gfn;

		if (!is_sev_snp_guest(vcpu))
			goto out_terminate;

		gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
					GHCB_MSR_GPA_VALUE_POS);

		/* Remember the registered GHCB GPA and echo the GFN back */
		svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);

		set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
				  GHCB_MSR_GPA_VALUE_POS);
		set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
				  GHCB_MSR_INFO_POS);
		break;
	}
	case GHCB_MSR_PSC_REQ:
		if (!is_sev_snp_guest(vcpu))
			goto out_terminate;

		ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
		break;
	case GHCB_MSR_TERM_REQ: {
		u64 reason_set, reason_code;

		reason_set = get_ghcb_msr_bits(svm,
					       GHCB_MSR_TERM_REASON_SET_MASK,
					       GHCB_MSR_TERM_REASON_SET_POS);
		reason_code = get_ghcb_msr_bits(svm,
						GHCB_MSR_TERM_REASON_MASK,
						GHCB_MSR_TERM_REASON_POS);
		pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
			reason_set, reason_code);

		goto out_terminate;
	}
	default:
		/* Error, keep GHCB MSR value as-is */
		break;
	}

	trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
					    control->ghcb_gpa, ret);

	return ret;

out_terminate:
	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
	vcpu->run->system_event.ndata = 1;
	vcpu->run->system_event.data[0] = control->ghcb_gpa;

	return 0;
}

/*
 * Top-level VMGEXIT handler.
 *
 * If any GHCB_MSR_INFO_MASK bit is set in control->ghcb_gpa the exit is
 * handled via the MSR protocol.  Otherwise the GHCB page is mapped, synced
 * into KVM state, validated, and the exit is dispatched on
 * control->exit_code.  Exit codes without a dedicated case are passed to
 * svm_invoke_exit_handler().
 *
 * Return: the value produced by the selected handler; 1 when the guest is
 * simply resumed (e.g. no/unmappable GHCB, zero-length MMIO or IOIO), or a
 * negative errno (e.g. -EINVAL for an unregistered GHCB GPA on SNP guests
 * or for SVM_VMGEXIT_UNSUPPORTED_EVENT).
 */
int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	u64 ghcb_gpa;
	int ret;

	/* Validate the GHCB */
	ghcb_gpa = control->ghcb_gpa;
	if (ghcb_gpa & GHCB_MSR_INFO_MASK)
		return sev_handle_vmgexit_msr_protocol(svm);

	if (!ghcb_gpa) {
		vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");

		/* Without a GHCB, just return right back to the guest */
		return 1;
	}

	if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
		/* Unable to map GHCB from guest */
		vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
			    ghcb_gpa);

		/* Without a GHCB, just return right back to the guest */
		return 1;
	}

	svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;

	trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);

	sev_es_sync_from_ghcb(svm);

	/* SEV-SNP guest requires that the GHCB GPA must be registered */
	if (is_sev_snp_guest(vcpu) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
		vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
		return -EINVAL;
	}

	ret = sev_es_validate_vmgexit(svm);
	if (ret)
		return ret;

	/* Default to reporting success; individual cases may override it */
	svm_vmgexit_success(svm, 0);

	switch (control->exit_code) {
	case SVM_VMGEXIT_MMIO_READ:
	case SVM_VMGEXIT_MMIO_WRITE: {
		bool is_write = control->exit_code == SVM_VMGEXIT_MMIO_WRITE;
		u64 len = control->exit_info_2;

		/* Zero-length access: nothing to do, resume the guest */
		if (!len)
			return 1;

		if (to_kvm_sev_info(vcpu->kvm)->ghcb_version >= 2 && len > 8) {
			svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
			return 1;
		}

		ret = setup_vmgexit_scratch(svm, !is_write, len);
		if (ret)
			break;

		ret = kvm_sev_es_mmio(vcpu, is_write, control->exit_info_1, len,
				      svm->sev_es.ghcb_sa);
		break;
	}
	case SVM_VMGEXIT_NMI_COMPLETE:
		++vcpu->stat.nmi_window_exits;
		svm->nmi_masked = false;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		ret = 1;
		break;
	case SVM_VMGEXIT_AP_HLT_LOOP:
		svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
		ret = kvm_emulate_ap_reset_hold(vcpu);
		break;
	case SVM_VMGEXIT_AP_JUMP_TABLE: {
		struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);

		switch (control->exit_info_1) {
		case 0:
			/* Set AP jump table address */
			sev->ap_jump_table = control->exit_info_2;
			break;
		case 1:
			/* Get AP jump table address */
			svm_vmgexit_success(svm, sev->ap_jump_table);
			break;
		default:
			pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
			       control->exit_info_1);
			svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
		}

		ret = 1;
		break;
	}
	case SVM_VMGEXIT_HV_FEATURES:
		svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED);
		ret = 1;
		break;
	case SVM_VMGEXIT_TERM_REQUEST:
		/*
		 * ret is not assigned in this case; it still holds the zero
		 * returned by sev_es_validate_vmgexit() above.
		 */
		pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
			control->exit_info_1, control->exit_info_2);
		vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
		vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
		vcpu->run->system_event.ndata = 1;
		vcpu->run->system_event.data[0] = control->ghcb_gpa;
		break;
	case SVM_VMGEXIT_PSC:
		ret = setup_vmgexit_scratch(svm, true, sizeof(struct psc_hdr));
		if (ret)
			break;

		ret = snp_begin_psc(svm);
		break;
	case SVM_VMGEXIT_AP_CREATION:
		/* Failures are reported to the guest; the guest is always resumed */
		ret = sev_snp_ap_creation(svm);
		if (ret) {
			svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
		}

		ret = 1;
		break;
	case SVM_VMGEXIT_GUEST_REQUEST:
		ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
		break;
	case SVM_VMGEXIT_EXT_GUEST_REQUEST:
		ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
		break;
	case SVM_VMGEXIT_UNSUPPORTED_EVENT:
		vcpu_unimpl(vcpu,
			    "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
			    control->exit_info_1, control->exit_info_2);
		ret = -EINVAL;
		break;
	case SVM_EXIT_IOIO:
		/* A zero size field means there is nothing to transfer */
		if (!((control->exit_info_1 & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT))
			return 1;

		fallthrough;
	default:
		ret = svm_invoke_exit_handler(vcpu, control->exit_code);
	}

	return ret;
}

/*
 * Perform string I/O for an SEV-ES guest using the GHCB scratch area.
 *
 * The repeat count is taken from exit_info_2; @size is the size of each
 * element.  The total byte count (count * size) must fit in an int.
 *
 * Return: 1 if there is nothing to transfer, -EINVAL if the count or the
 * byte total is out of range, the error from setup_vmgexit_scratch(), or
 * the result of kvm_sev_es_string_io().
 */
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
{
	int count;
	int bytes;
	int r;

	if (svm->vmcb->control.exit_info_2 > INT_MAX)
		return -EINVAL;

	count = svm->vmcb->control.exit_info_2;
	if (unlikely(check_mul_overflow(count, size, &bytes)))
		return -EINVAL;

	if (!bytes)
		return 1;

	r = setup_vmgexit_scratch(svm, in, bytes);
	if (r)
		return r;

	return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
				    count, in);
}

/*
 * Update the MSR intercept configuration for @vcpu: the GHCB, EFER and PAT
 * MSRs have their intercepts disabled, while the TSC_AUX, GUEST_TSC_FREQ and
 * XSS intercepts depend on host features, guest CPUID and Secure TSC state.
 */
void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
	/* Clear intercepts on MSRs that are context switched by hardware. */
	svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
	svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
	svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);

	if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
		svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
					  !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));

	svm_set_intercept_for_msr(vcpu, MSR_AMD64_GUEST_TSC_FREQ, MSR_TYPE_R,
				  !snp_is_secure_tsc_enabled(vcpu->kvm));

	/*
	 * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
	 * the host/guest supports its use.
	 *
	 * KVM treats the guest as being capable of using XSAVES even if XSAVES
	 * isn't enabled in guest CPUID as there is no intercept for XSAVES,
	 * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is
	 * exposed to the guest and XSAVES is supported in hardware.  Condition
	 * full XSS passthrough on the guest being able to use XSAVES *and*
	 * XSAVES being exposed to the guest so that KVM can at least honor
	 * guest CPUID for RDMSR and WRMSR.
	 */
	svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
				  !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
				  !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
}

/*
 * Adjust vCPU state after guest CPUID has been set: if CPUID leaf 0x8000001F
 * is present, the bit whose position is given by EBX[5:0] is removed from
 * the vCPU's reserved GPA bits.
 */
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm_cpuid_entry2 *best;

	/* For sev guests, the memory encryption bit is not reserved in CR3. */
	best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
	if (best)
		vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
}

/*
 * Apply the SEV-ES specific VMCB configuration: enable SEV-ES, point the
 * VMCB at the VMSA, and adjust CR/DR/XSETBV/#DB intercepts.
 *
 * @init_event: when false, the GHCB MSR is additionally set to its SEV_INFO
 *              value (see the comment at the end of the function).
 */
static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
	struct vmcb *vmcb = svm->vmcb01.ptr;

	svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV_ES;

	/*
	 * An SEV-ES guest requires a VMSA area that is separate from the
	 * VMCB page. Do not include the encryption mask on the VMSA physical
	 * address since hardware will access it using the guest key.  Note,
	 * the VMSA will be NULL if this vCPU is the destination for intrahost
	 * migration, and will be copied later.
	 */
	if (!svm->sev_es.snp_has_guest_vmsa) {
		if (svm->sev_es.vmsa)
			svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
		else
			svm->vmcb->control.vmsa_pa = INVALID_PAGE;
	}

	if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
		svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
							  VMCB_ALLOWED_SEV_FEATURES_VALID;

	/* Can't intercept CR register access, HV can't modify CR registers */
	svm_clr_intercept(svm, INTERCEPT_CR0_READ);
	svm_clr_intercept(svm, INTERCEPT_CR4_READ);
	svm_clr_intercept(svm, INTERCEPT_CR8_READ);
	svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
	svm_clr_intercept(svm, INTERCEPT_CR4_WRITE);
	svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);

	svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0);

	/* Track EFER/CR register changes */
	svm_set_intercept(svm, TRAP_EFER_WRITE);
	svm_set_intercept(svm, TRAP_CR0_WRITE);
	svm_set_intercept(svm, TRAP_CR4_WRITE);
	svm_set_intercept(svm, TRAP_CR8_WRITE);

	/* Drop all DR intercepts, then re-add DR7 ones if DebugSwap is off */
	vmcb->control.intercepts[INTERCEPT_DR] = 0;
	if (!sev_vcpu_has_debug_swap(svm)) {
		vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
		vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
		svm_mark_intercepts_dirty(svm);
	} else {
		/*
		 * Disable #DB intercept iff DebugSwap is enabled.  KVM doesn't
		 * allow debugging SEV-ES guests, and enables DebugSwap iff
		 * NO_NESTED_DATA_BP is supported, so there's no reason to
		 * intercept #DB when DebugSwap is enabled.  For simplicity
		 * with respect to guest debug, intercept #DB for other VMs
		 * even if NO_NESTED_DATA_BP is supported, i.e. even if the
		 * guest can't DoS the CPU with infinite #DB vectoring.
		 */
		clr_exception_intercept(svm, DB_VECTOR);
	}

	/* Can't intercept XSETBV, HV can't modify XCR0 directly */
	svm_clr_intercept(svm, INTERCEPT_XSETBV);

	/*
	 * Set the GHCB MSR value as per the GHCB specification when emulating
	 * vCPU RESET for an SEV-ES guest.
	 */
	if (!init_event)
		set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
						    GHCB_VERSION_MIN,
						    sev_enc_bit));
}

/*
 * Apply SEV specific VMCB configuration: enable SEV, stop intercepting #UD
 * and #GP, and chain to the SNP/SEV-ES specific setup where applicable.
 *
 * @init_event: forwarded to sev_es_init_vmcb(); when true on an SNP guest,
 *              the protected guest state is (re)initialized first.
 */
void sev_init_vmcb(struct vcpu_svm *svm, bool init_event)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;

	svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV;
	clr_exception_intercept(svm, UD_VECTOR);

	/*
	 * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
	 * KVM can't decrypt guest memory to decode the faulting instruction.
	 */
	clr_exception_intercept(svm, GP_VECTOR);

	if (init_event && is_sev_snp_guest(vcpu))
		sev_snp_init_protected_guest_state(vcpu);

	if (is_sev_es_guest(vcpu))
		sev_es_init_vmcb(svm, init_event);
}

/*
 * Per-vCPU SEV setup at vCPU creation.  Always initializes snp_vmsa_mutex;
 * for SEV-ES guests additionally allocates the VMSA page and records whether
 * the guest TSC is protected (Secure TSC).
 *
 * Return: 0 on success, -ENOMEM if the VMSA page cannot be allocated.
 */
int sev_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct page *vmsa_page;

	mutex_init(&svm->sev_es.snp_vmsa_mutex);

	if (!is_sev_es_guest(vcpu))
		return 0;

	/*
	 * SEV-ES guests require a separate (from the VMCB) VMSA page used to
	 * contain the encrypted register state of the guest.
	 */
	vmsa_page = snp_safe_alloc_page();
	if (!vmsa_page)
		return -ENOMEM;

	svm->sev_es.vmsa = page_address(vmsa_page);

	vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm);

	return 0;
}

/*
 * Populate the fields of the host save area @hostsa that KVM must fill in
 * by hand before entering an SEV-ES guest: XCR0, PKRU, XSS, and (depending
 * on features) the DR address masks and TSC_AUX.
 */
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
{
	/*
	 * All host state for SEV-ES guests is categorized into three swap types
	 * based on how it is handled by hardware during a world switch:
	 *
	 * A: VMRUN:   Host state saved in host save area
	 *    VMEXIT:  Host state loaded from host save area
	 *
	 * B: VMRUN:   Host state _NOT_ saved in host save area
	 *    VMEXIT:  Host state loaded from host save area
	 *
	 * C: VMRUN:   Host state _NOT_ saved in host save area
	 *    VMEXIT:  Host state initialized to default(reset) values
	 *
	 * Manually save type-B state, i.e. state that is loaded by VMEXIT but
	 * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
	 * by common SVM code).
	 */
	hostsa->xcr0 = kvm_host.xcr0;
	hostsa->pkru = read_pkru();
	hostsa->xss = kvm_host.xss;

	/*
	 * If DebugSwap is enabled, debug registers are loaded but NOT saved by
	 * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
	 * not save or load debug registers.  Sadly, KVM can't prevent SNP
	 * guests from lying about DebugSwap on secondary vCPUs, i.e. the
	 * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
	 * the guest has actually enabled (or not!) in the VMSA.
	 *
	 * If DebugSwap is *possible*, save the masks so that they're restored
	 * if the guest enables DebugSwap.  But for the DRs themselves, do NOT
	 * rely on the CPU to restore the host values; KVM will restore them as
	 * needed in common code, via hw_breakpoint_restore().  Note, KVM does
	 * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
	 * don't need to be restored per se, KVM just needs to ensure they are
	 * loaded with the correct values *if* the CPU writes the MSRs.
	 */
	if (sev_vcpu_has_debug_swap(svm) ||
	    (cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) &&
	     is_sev_snp_guest(&svm->vcpu))) {
		hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
		hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
		hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
		hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
	}

	/*
	 * TSC_AUX is always virtualized for SEV-ES guests when the feature is
	 * available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area.
	 * Set the save area to the current hardware value, i.e. the current
	 * user return value, so that the correct value is restored on #VMEXIT.
	 */
	if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) &&
	    !WARN_ON_ONCE(tsc_aux_uret_slot < 0))
		hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot);
}

/*
 * Complete an AP Reset Hold when a SIPI is delivered to @vcpu.  The first
 * SIPI only records that it was received; subsequent SIPIs report a non-zero
 * result through whichever mechanism (GHCB NAE event or GHCB MSR protocol)
 * the guest used to enter the hold.  @vector is not used here.
 */
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* First SIPI: Use the values as initially set by the VMM */
	if (!svm->sev_es.received_first_sipi) {
		svm->sev_es.received_first_sipi = true;
		return;
	}

	/* Subsequent SIPI */
	switch (svm->sev_es.ap_reset_hold_type) {
	case AP_RESET_HOLD_NAE_EVENT:
		/*
		 * Return from an AP Reset Hold VMGEXIT, where the guest will
		 * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
		 */
		svm_vmgexit_success(svm, 1);
		break;
	case AP_RESET_HOLD_MSR_PROTO:
		/*
		 * Return from an AP Reset Hold VMGEXIT, where the guest will
		 * set the CS and RIP. Set GHCB data field to a non-zero value.
		 */
		set_ghcb_msr_bits(svm, 1,
				  GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
				  GHCB_MSR_AP_RESET_HOLD_RESULT_POS);

		set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
				  GHCB_MSR_INFO_MASK,
				  GHCB_MSR_INFO_POS);
		break;
	default:
		break;
	}
}

/*
 * Allocate a single zeroed page on @node that is never 2MB-aligned when the
 * host has SEV-SNP enabled; on other hosts this is a plain zeroed
 * single-page allocation.
 *
 * Return: the page, or NULL if the allocation failed.
 */
struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
	unsigned long pfn;
	struct page *p;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return alloc_pages_node(node, gfp | __GFP_ZERO, 0);

	/*
	 * Allocate an SNP-safe page to work around the SNP erratum where
	 * the CPU will incorrectly signal an RMP violation #PF if a
	 * hugepage (2MB or 1GB) collides with the RMP entry of a
	 * 2MB-aligned VMCB, VMSA, or AVIC backing page.
	 *
	 * Allocate one extra page, choose a page which is not
	 * 2MB-aligned, and free the other.
	 */
	p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
	if (!p)
		return NULL;

	/* Turn the order-1 allocation into two independent order-0 pages */
	split_page(p, 1);

	pfn = page_to_pfn(p);
	if (IS_ALIGNED(pfn, PTRS_PER_PMD))
		__free_page(p++);
	else
		__free_page(p + 1);

	return p;
}

/*
 * Handle an RMP fault on @gpa by splitting (PSMASH) a 2M RMP entry backing
 * the faulting private page into 4K entries, then zapping the corresponding
 * GFN range.  Unexpected conditions are logged (rate-limited) and otherwise
 * ignored; the function has no return value.
 */
void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
{
	struct kvm_memory_slot *slot;
	struct kvm *kvm = vcpu->kvm;
	int order, rmp_level, ret;
	struct page *page;
	bool assigned;
	kvm_pfn_t pfn;
	gfn_t gfn;

	gfn = gpa >> PAGE_SHIFT;

	/*
	 * The only time RMP faults occur for shared pages is when the guest is
	 * triggering an RMP fault for an implicit page-state change from
	 * shared->private. Implicit page-state changes are forwarded to
	 * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
	 * for shared pages should not end up here.
	 */
	if (!kvm_mem_is_private(kvm, gfn)) {
		pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
				    gpa);
		return;
	}

	slot = gfn_to_memslot(kvm, gfn);
	if (!kvm_slot_has_gmem(slot)) {
		pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
				    gpa);
		return;
	}

	ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order);
	if (ret) {
		pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
				    gpa);
		return;
	}

	/* From here on, @page must be released on every exit path */
	ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
	if (ret || !assigned) {
		pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
				    gpa, pfn, ret);
		goto out_no_trace;
	}

	/*
	 * There are 2 cases where a PSMASH may be needed to resolve an #NPF
	 * with PFERR_GUEST_RMP_BIT set:
	 *
	 * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
	 *    bit set if the guest issues them with a smaller granularity than
	 *    what is indicated by the page-size bit in the 2MB RMP entry for
	 *    the PFN that backs the GPA.
	 *
	 * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
	 *    smaller than what is indicated by the 2MB RMP entry for the PFN
	 *    that backs the GPA.
	 *
	 * In both these cases, the corresponding 2M RMP entry needs to
	 * be PSMASH'd to 512 4K RMP entries.  If the RMP entry is already
	 * split into 4K RMP entries, then this is likely a spurious case which
	 * can occur when there are concurrent accesses by the guest to a 2MB
	 * GPA range that is backed by a 2MB-aligned PFN whose RMP entry is in
	 * the process of being PSMASH'd into 4K entries. These cases should
	 * resolve automatically on subsequent accesses, so just ignore them
	 * here.
	 */
	if (rmp_level == PG_LEVEL_4K)
		goto out;

	ret = snp_rmptable_psmash(pfn);
	if (ret) {
		/*
		 * Look it up again. If it's 4K now then the PSMASH may have
		 * raced with another process and the issue has already resolved
		 * itself.
		 */
		if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
		    assigned && rmp_level == PG_LEVEL_4K)
			goto out;

		pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
				    gpa, pfn, ret);
	}

	/*
	 * NOTE(review): the zapped range starts at the faulting gfn, which is
	 * not aligned down to a 2M boundary here - confirm this covers the
	 * intended mappings.
	 */
	kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
out:
	trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
out_no_trace:
	kvm_release_page_unused(page);
}

/*
 * Check whether every PFN in [@start, @end) is unassigned in the RMP table.
 *
 * Return: true if all lookups succeed and no PFN is assigned; false as soon
 * as a lookup fails or an assigned PFN is found.
 */
static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
{
	kvm_pfn_t pfn = start;

	while (pfn < end) {
		int ret, rmp_level;
		bool assigned;

		ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
		if (ret) {
			/*
			 * NOTE(review): rmp_level is logged on the failure
			 * path - confirm snp_lookup_rmpentry() writes *level
			 * when it fails, otherwise this prints an
			 * uninitialized value.  The values labelled "GFN
			 * start/end" in the message are the PFN bounds
			 * passed to this function.
			 */
			pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
					    pfn, start, end, rmp_level, ret);
			return false;
		}

		if (assigned) {
			pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
				 __func__, pfn, start, end, rmp_level);
			return false;
		}

		pfn++;
	}

	return true;
}

/*
 * Map an allocation @order to a page level: PG_LEVEL_2M when the order is
 * at least the 2M GFN shift, PG_LEVEL_4K otherwise.
 */
static u8 max_level_for_order(int order)
{
	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
		return PG_LEVEL_2M;

	return PG_LEVEL_4K;
}

/*
 * Decide whether the 2M-aligned PFN range containing @pfn can be covered by
 * a single 2M RMP entry.  @kvm is not used here.
 *
 * Return: true if @order allows a 2M mapping and the whole aligned range is
 * currently unassigned in the RMP table; false otherwise.
 */
static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
{
	kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);

	/*
	 * If this is a large folio, and the entire 2M range containing the
	 * PFN is currently shared, then the entire 2M-aligned range can be
	 * set to private via a single 2M RMP entry.
	 */
	if (max_level_for_order(order) > PG_LEVEL_4K &&
	    is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
		return true;

	return false;
}

/*
 * Prepare a guest_memfd page for use by an SNP guest by marking @pfn (or the
 * whole 2M-aligned range containing it, when possible) as private in the RMP
 * table for @gfn.  A no-op for non-SNP guests and for PFNs that are already
 * assigned.
 *
 * Return: 0 on success or when nothing needs to be done, -ENOENT if the RMP
 * lookup fails, -EINVAL if the RMP update fails.
 */
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
	kvm_pfn_t pfn_aligned;
	gfn_t gfn_aligned;
	int level, rc;
	bool assigned;

	if (!sev_snp_guest(kvm))
		return 0;

	rc = snp_lookup_rmpentry(pfn, &assigned, &level);
	if (rc) {
		pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
				   gfn, pfn, rc);
		return -ENOENT;
	}

	if (assigned) {
		pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
			 __func__, gfn, pfn, max_order, level);
		return 0;
	}

	if (is_large_rmp_possible(kvm, pfn, max_order)) {
		level = PG_LEVEL_2M;
		pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
		gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
	} else {
		level = PG_LEVEL_4K;
		pfn_aligned = pfn;
		gfn_aligned = gfn;
	}

	rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
	if (rc) {
		pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
				   gfn, pfn, level, rc);
		return -EINVAL;
	}

	pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
		 __func__, gfn, pfn, pfn_aligned, max_order, level);

	return 0;
}

/*
 * Return the PFNs in [@start, @end) to the shared state in the RMP table and
 * flush their cache lines.  Does nothing unless the host has SEV-SNP
 * enabled.  PFNs that are unassigned, or whose lookup/update fails, are
 * skipped.
 */
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
{
	kvm_pfn_t pfn;

	if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
		return;

	pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);

	for (pfn = start; pfn < end;) {
		bool use_2m_update = false;
		int rc, rmp_level;
		bool assigned;

		rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
		if (rc || !assigned)
			goto next_pfn;

		/* A 2M update needs an aligned PFN, a full 2M left in range, and a large RMP entry */
		use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
				end >= (pfn + PTRS_PER_PMD) &&
				rmp_level > PG_LEVEL_4K;

		/*
		 * If an unaligned PFN corresponds to a 2M region assigned as a
		 * large page in the RMP table, PSMASH the region into individual
		 * 4K RMP entries before attempting to convert a 4K sub-page.
		 */
		if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
			/*
			 * This shouldn't fail, but if it does, report it, but
			 * still try to update RMP entry to shared and pray this
			 * was a spurious error that can be addressed later.
			 */
			rc = snp_rmptable_psmash(pfn);
			WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
				  pfn, rc);
		}

		rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
		if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
			      pfn, rc))
			goto next_pfn;

		/*
		 * SEV-ES avoids host/guest cache coherency issues through
		 * WBNOINVD hooks issued via MMU notifiers during run-time, and
		 * KVM's VM destroy path at shutdown. Those MMU notifier events
		 * don't cover gmem since there is no requirement to map pages
		 * to a HVA in order to use them for a running guest. While the
		 * shutdown path would still likely cover things for SNP guests,
		 * userspace may also free gmem pages during run-time via
		 * hole-punching operations on the guest_memfd, so flush the
		 * cache entries for these pages before free'ing them back to
		 * the host.
		 */
		clflush_cache_range(__va(pfn_to_hpa(pfn)),
				    use_2m_update ? PMD_SIZE : PAGE_SIZE);
next_pfn:
		pfn += use_2m_update ? PTRS_PER_PMD : 1;
		cond_resched();
	}
}

/*
 * Report the page level recorded in the RMP table for @pfn.  @is_private is
 * not used here.
 *
 * Return: 0 for non-SNP guests, PG_LEVEL_4K if the RMP lookup fails or the
 * PFN is unassigned, otherwise the level returned by the lookup.
 */
int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
{
	int level, rc;
	bool assigned;

	if (!sev_snp_guest(kvm))
		return 0;

	rc = snp_lookup_rmpentry(pfn, &assigned, &level);
	if (rc || !assigned)
		return PG_LEVEL_4K;

	return level;
}

/*
 * Obtain a readable copy of the vCPU's VMSA.
 *
 * If guest state is not yet protected, the live (unencrypted) VMSA pointer
 * is returned directly.  Otherwise, provided the guest policy permits
 * debugging, a new page is allocated and firmware is asked to decrypt the
 * VMSA into it (SNP_DBG_DECRYPT for SNP guests, DBG_DECRYPT otherwise).
 * Pair with sev_free_decrypted_vmsa().
 *
 * Return: pointer to the VMSA contents, or NULL if the vCPU is not an SEV-ES
 * guest, debugging is disallowed by policy, allocation fails, or the
 * firmware command (or SNP page reclaim) fails.
 */
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_save_area *vmsa;
	struct kvm_sev_info *sev;
	int error = 0;
	int ret;

	if (!is_sev_es_guest(vcpu))
		return NULL;

	/*
	 * If the VMSA has not yet been encrypted, return a pointer to the
	 * current un-encrypted VMSA.
	 */
	if (!vcpu->arch.guest_state_protected)
		return (struct vmcb_save_area *)svm->sev_es.vmsa;

	sev = to_kvm_sev_info(vcpu->kvm);

	/* Check if the SEV policy allows debugging */
	if (is_sev_snp_guest(vcpu)) {
		if (!(sev->policy & SNP_POLICY_MASK_DEBUG))
			return NULL;
	} else {
		if (sev->policy & SEV_POLICY_MASK_NODBG)
			return NULL;
	}

	if (is_sev_snp_guest(vcpu)) {
		struct sev_data_snp_dbg dbg = {0};

		vmsa = snp_alloc_firmware_page(__GFP_ZERO);
		if (!vmsa)
			return NULL;

		dbg.gctx_paddr = __psp_pa(sev->snp_context);
		dbg.src_addr = svm->vmcb->control.vmsa_pa;
		dbg.dst_addr = __psp_pa(vmsa);

		ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);

		/*
		 * Return the target page to a hypervisor page no matter what.
		 * If this fails, the page can't be used, so leak it and don't
		 * try to use it.
		 */
		if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
			return NULL;

		if (ret) {
			pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
			       ret, error, error);
			free_page((unsigned long)vmsa);

			return NULL;
		}
	} else {
		struct sev_data_dbg dbg = {0};
		struct page *vmsa_page;

		vmsa_page = alloc_page(GFP_KERNEL);
		if (!vmsa_page)
			return NULL;

		vmsa = page_address(vmsa_page);

		dbg.handle = sev->handle;
		dbg.src_addr = svm->vmcb->control.vmsa_pa;
		dbg.dst_addr = __psp_pa(vmsa);
		dbg.len = PAGE_SIZE;

		ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
		if (ret) {
			pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
			       ret, error, error);
			__free_page(vmsa_page);

			return NULL;
		}
	}

	return vmsa;
}

/*
 * Release a VMSA copy obtained from sev_decrypt_vmsa().  Nothing is freed
 * when guest state is not protected (in which case sev_decrypt_vmsa()
 * returned the live VMSA) or when @vmsa is NULL.
 */
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
{
	/* If the VMSA has not yet been encrypted, nothing was allocated */
	if (!vcpu->arch.guest_state_protected || !vmsa)
		return;

	free_page((unsigned long)vmsa);
}