// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM-SEV support
 *
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/psp.h>
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
#include <uapi/linux/sev-guest.h>

#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/cpuid/api.h>
#include <asm/fpu/xcr.h>
#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/sev.h>

#include "mmu.h"
#include "x86.h"
#include "svm.h"
#include "svm_ops.h"
#include "cpuid.h"
#include "trace.h"

/*
 * Highest and lowest GHCB protocol versions referenced by the SEV init paths
 * in this file (see __sev_guest_init() and sev_guest_init()).
 */
#define GHCB_VERSION_MAX	2ULL
#define GHCB_VERSION_MIN	1ULL

/* Union of the GHCB hypervisor-feature bits named here. */
#define GHCB_HV_FT_SUPPORTED	(GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)

/*
 * The GHCB spec essentially states that all non-zero error codes other than
 * those explicitly defined above should be treated as an error by the guest.
 * Define a generic error to cover that case, and choose a value that is not
 * likely to overlap with new explicit error codes should more be added to
 * the GHCB spec later. KVM will use this to report generic errors when
 * handling SNP guest requests.
 */
#define SNP_GUEST_VMM_ERR_GENERIC	(~0U)

/* enable/disable SEV support */
static bool __ro_after_init sev_enabled = true;
module_param_named(sev, sev_enabled, bool, 0444);

/* enable/disable SEV-ES support */
static bool __ro_after_init sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);

/* enable/disable SEV-SNP support */
static bool __ro_after_init sev_snp_enabled = true;
module_param_named(sev_snp, sev_snp_enabled, bool, 0444);

/* Value of the read-only "ciphertext_hiding_asids" module parameter. */
static unsigned int __ro_after_init nr_ciphertext_hiding_asids;
module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444);

/*
 * AP reset hold types.
 * NOTE(review): presumably these record which mechanism (NAE event vs. MSR
 * protocol) was used to request an AP reset hold -- confirm against the users,
 * which are not in this part of the file.
 */
#define AP_RESET_HOLD_NONE		0
#define AP_RESET_HOLD_NAE_EVENT		1
#define AP_RESET_HOLD_MSR_PROTO		2

/*
 * SEV-SNP policy bits that can be supported by KVM. These include policy bits
 * that have implementation support within KVM or policy bits that do not
 * require implementation support within KVM to enforce the policy.
 */
#define KVM_SNP_POLICY_MASK_VALID	(SNP_POLICY_MASK_API_MINOR | \
					 SNP_POLICY_MASK_API_MAJOR | \
					 SNP_POLICY_MASK_SMT | \
					 SNP_POLICY_MASK_RSVD_MBO | \
					 SNP_POLICY_MASK_DEBUG | \
					 SNP_POLICY_MASK_SINGLE_SOCKET | \
					 SNP_POLICY_MASK_CXL_ALLOW | \
					 SNP_POLICY_MASK_MEM_AES_256_XTS | \
					 SNP_POLICY_MASK_RAPL_DIS | \
					 SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \
					 SNP_POLICY_MASK_PAGE_SWAP_DISABLE)

static u64 snp_supported_policy_bits __ro_after_init;

static u64 sev_supported_vmsa_features __ro_after_init;

#define INITIAL_VMSA_GPA 0xFFFFFFFFF000

static u8 sev_enc_bit;
/*
 * Taken for read around DEACTIVATE (sev_unbind_asid()) and for write around
 * WBINVD + DF_FLUSH (sev_flush_asids()).
 */
static DECLARE_RWSEM(sev_deactivate_lock);
/* Serializes updates to sev_asid_bitmap and sev_reclaim_asid_bitmap. */
static DEFINE_MUTEX(sev_bitmap_lock);
unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned int max_sev_es_asid;
static unsigned int min_sev_es_asid;
static unsigned int max_snp_asid;
static unsigned int min_snp_asid;
static unsigned long sev_me_mask;
static unsigned int nr_asids;
/* Bit set == ASID handed out by sev_alloc_asid(). */
static unsigned long *sev_asid_bitmap;
/* Bit set == ASID released by sev_asid_free() and not yet recycled. */
static unsigned long *sev_reclaim_asid_bitmap;

/*
 * With CONFIG_PROVE_LOCKING, assert that querying the VM's SEV state is safe:
 * either the VM has no users left, the caller runs in vCPU context, or
 * kvm->lock is held.  Compiles to nothing otherwise.
 */
static __always_inline void kvm_lockdep_assert_sev_lock_held(struct kvm *kvm)
{
#ifdef CONFIG_PROVE_LOCKING
	/*
	 * Querying SEV+ support is safe if there are no other references, i.e.
	 * if concurrent initialization of SEV+ is impossible.
	 */
	if (!refcount_read(&kvm->users_count))
		return;

	/*
	 * Querying SEV+ support from vCPU context is always safe, as vCPUs can
	 * only be created after SEV+ is initialized (and KVM disallows all SEV
	 * sub-ioctls while vCPU creation is in-progress).
	 */
	if (kvm_get_running_vcpu())
		return;

	lockdep_assert_held(&kvm->lock);
#endif
}

/*
 * Lockdep-checked wrappers around the ____sev_*guest() predicates: each runs
 * kvm_lockdep_assert_sev_lock_held() and then returns the raw predicate.
 */
static bool sev_guest(struct kvm *kvm)
{
	kvm_lockdep_assert_sev_lock_held(kvm);
	return ____sev_guest(kvm);
}

static bool sev_es_guest(struct kvm *kvm)
{
	kvm_lockdep_assert_sev_lock_held(kvm);
	return ____sev_es_guest(kvm);
}

static bool sev_snp_guest(struct kvm *kvm)
{
	kvm_lockdep_assert_sev_lock_held(kvm);
	return ____sev_snp_guest(kvm);
}

/* Defined later in the file. */
static int snp_decommission_context(struct kvm *kvm);

/*
 * One list entry describing a region of user memory.
 * NOTE(review): presumably @pages/@npages hold the pinned pages backing
 * [@uaddr, @uaddr + @size) -- the users are not in this part of the file.
 */
struct enc_region {
	struct list_head list;
	unsigned long npages;
	struct page **pages;
	unsigned long uaddr;
	unsigned long size;
};

/*
 * Called with the sev_bitmap_lock held, or on shutdown
 *
 * Returns -EBUSY without flushing if the index returned by find_next_bit()
 * on the reclaim bitmap (searching from @min_asid) is greater than @max_asid;
 * otherwise returns the result of the DF_FLUSH command.
 */
static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
{
	int ret, error = 0;
	unsigned int asid;

	/* Check if there are any ASIDs to reclaim before performing a flush */
	asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
	if (asid > max_asid)
		return -EBUSY;

	/*
	 * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
	 * so it must be guarded.
	 */
	down_write(&sev_deactivate_lock);

	/* SNP firmware requires use of WBINVD for ASID recycling. */
	wbinvd_on_all_cpus();

	/* The SNP flavor of the command is used whenever SNP is enabled. */
	if (sev_snp_enabled)
		ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
	else
		ret = sev_guest_df_flush(&error);

	up_write(&sev_deactivate_lock);

	if (ret)
		pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
		       sev_snp_enabled ? "-SNP" : "", ret, error);

	return ret;
}

/* True if this VM's SEV info has an enc_context_owner, i.e. it is a mirror. */
static inline bool is_mirroring_enc_context(struct kvm *kvm)
{
	return !!to_kvm_sev_info(kvm)->enc_context_owner;
}

/* True if SVM_SEV_FEAT_DEBUG_SWAP is set in the VM's VMSA features. */
static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);

	return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
}

/*
 * True if SVM_SEV_FEAT_SECURE_TSC is set in the VM's VMSA features and the VM
 * is an SNP guest.  WARNs (once) and returns false if the feature bit is set
 * on a non-SNP VM.
 */
static bool snp_is_secure_tsc_enabled(struct kvm *kvm)
{
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);

	return (sev->vmsa_features & SVM_SEV_FEAT_SECURE_TSC) &&
	       !WARN_ON_ONCE(!sev_snp_guest(kvm));
}

/*
 * Must be called with the sev_bitmap_lock held
 *
 * Returns false if sev_flush_asids() fails; otherwise clears every ASID in the
 * reclaim bitmap from the allocation bitmap, empties the reclaim bitmap and
 * returns true.
 */
static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
{
	if (sev_flush_asids(min_asid, max_asid))
		return false;

	/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
	bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
		   nr_asids);
	bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);

	return true;
}

/*
 * Charge one unit of the SEV-ES (if sev->es_active) or SEV misc cgroup
 * resource against sev->misc_cg.  Returns misc_cg_try_charge()'s result.
 */
static int sev_misc_cg_try_charge(struct kvm_sev_info *sev)
{
	enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
	return misc_cg_try_charge(type, sev->misc_cg, 1);
}

/* Undo sev_misc_cg_try_charge(): uncharge one unit of the same resource. */
static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
{
	enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
	misc_cg_uncharge(type, sev->misc_cg, 1);
}

/*
 * Allocate an ASID in [@min_asid, @max_asid] under sev_bitmap_lock.  If no
 * free bit is found, try recycling reclaimed ASIDs once and search again.
 *
 * Returns the allocated ASID, or a value greater than @max_asid on failure.
 */
static unsigned int sev_alloc_asid(unsigned int min_asid, unsigned int max_asid)
{
	unsigned int asid;
	bool retry = true;

	/* Scope-based lock: released automatically on every return below. */
	guard(mutex)(&sev_bitmap_lock);

again:
	asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
	if (asid > max_asid) {
		if (retry && __sev_recycle_asids(min_asid, max_asid)) {
			retry = false;
			goto again;
		}

		return asid;
	}

	__set_bit(asid, sev_asid_bitmap);
	return asid;
}

/*
 * Pick the ASID range for @vm_type, charge the misc cgroup and allocate an
 * ASID into sev->asid.
 *
 * Returns 0 on success, -ENOTTY if the selected range is empty, the error from
 * the cgroup charge, or -EBUSY if no ASID is available.  On failure the cgroup
 * reference taken here is dropped again.
 */
static int sev_asid_new(struct kvm_sev_info *sev, unsigned long vm_type)
{
	/*
	 * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
	 * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
	 */
	unsigned int min_asid, max_asid, asid;
	int ret;

	if (vm_type == KVM_X86_SNP_VM) {
		min_asid = min_snp_asid;
		max_asid = max_snp_asid;
	} else if (sev->es_active) {
		min_asid = min_sev_es_asid;
		max_asid = max_sev_es_asid;
	} else {
		min_asid = min_sev_asid;
		max_asid = max_sev_asid;
	}

	/*
	 * The min ASID can end up larger than the max if basic SEV support is
	 * effectively disabled by disallowing use of ASIDs for SEV guests.
	 * Similarly for SEV-ES guests the min ASID can end up larger than the
	 * max when ciphertext hiding is enabled, effectively disabling SEV-ES
	 * support.
	 */
	if (min_asid > max_asid)
		return -ENOTTY;

	WARN_ON_ONCE(sev->misc_cg);
	sev->misc_cg = get_current_misc_cg();
	ret = sev_misc_cg_try_charge(sev);
	if (ret)
		goto e_put_cg;

	asid = sev_alloc_asid(min_asid, max_asid);
	if (asid > max_asid) {
		ret = -EBUSY;
		goto e_uncharge;
	}

	sev->asid = asid;
	return 0;

e_uncharge:
	sev_misc_cg_uncharge(sev);
e_put_cg:
	put_misc_cg(sev->misc_cg);
	sev->misc_cg = NULL;
	return ret;
}

/* Return the ASID recorded in the VM's SEV info. */
static unsigned int sev_get_asid(struct kvm *kvm)
{
	return to_kvm_sev_info(kvm)->asid;
}

/*
 * Release the VM's ASID: mark it reclaimable, clear the per-CPU sev_vmcbs[]
 * slot for it on every possible CPU, then uncharge and drop the misc cgroup.
 * The ASID only becomes allocatable again after __sev_recycle_asids().
 */
static void sev_asid_free(struct kvm_sev_info *sev)
{
	struct svm_cpu_data *sd;
	int cpu;

	mutex_lock(&sev_bitmap_lock);

	__set_bit(sev->asid, sev_reclaim_asid_bitmap);

	for_each_possible_cpu(cpu) {
		sd = per_cpu_ptr(&svm_data, cpu);
		sd->sev_vmcbs[sev->asid] = NULL;
	}

	mutex_unlock(&sev_bitmap_lock);

	sev_misc_cg_uncharge(sev);
	put_misc_cg(sev->misc_cg);
	sev->misc_cg = NULL;
}

/* Issue DECOMMISSION for @handle; a zero handle is ignored. */
static void sev_decommission(unsigned int handle)
{
	struct sev_data_decommission decommission;

	if (!handle)
		return;

	decommission.handle = handle;
	/* Best effort: neither the return value nor a firmware error is collected. */
	sev_guest_decommission(&decommission, NULL);
}

/*
 * Transition a page to hypervisor-owned/shared state in the RMP table. This
 * should not fail under normal conditions, but leak the page should that
 * happen since it will no longer be usable by the host due to RMP protections.
357 */ 358 static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level) 359 { 360 if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) { 361 snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); 362 return -EIO; 363 } 364 365 return 0; 366 } 367 368 /* 369 * Certain page-states, such as Pre-Guest and Firmware pages (as documented 370 * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be 371 * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE 372 * unless they are reclaimed first. 373 * 374 * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they 375 * might not be usable by the host due to being set as immutable or still 376 * being associated with a guest ASID. 377 * 378 * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be 379 * converted back to shared, as the page is no longer usable due to RMP 380 * protections, and it's infeasible for the guest to continue on. 381 */ 382 static int snp_page_reclaim(struct kvm *kvm, u64 pfn) 383 { 384 struct sev_data_snp_page_reclaim data = {0}; 385 int fw_err, rc; 386 387 data.paddr = __sme_set(pfn << PAGE_SHIFT); 388 rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err); 389 if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) { 390 snp_leak_pages(pfn, 1); 391 return -EIO; 392 } 393 394 if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K)) 395 return -EIO; 396 397 return rc; 398 } 399 400 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) 401 { 402 struct sev_data_deactivate deactivate; 403 404 if (!handle) 405 return; 406 407 deactivate.handle = handle; 408 409 /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */ 410 down_read(&sev_deactivate_lock); 411 sev_guest_deactivate(&deactivate, NULL); 412 up_read(&sev_deactivate_lock); 413 414 sev_decommission(handle); 415 } 416 417 /* 418 * This sets up bounce buffers/firmware pages to handle SNP Guest Request 419 
 * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
 * 2.0 specification for more details.
 *
 * Technically, when an SNP Guest Request is issued, the guest will provide its
 * own request/response pages, which could in theory be passed along directly
 * to firmware rather than using bounce pages. However, these pages would need
 * special care:
 *
 * - Both pages are from shared guest memory, so they need to be protected
 *   from migration/etc. occurring while firmware reads/writes to them. At a
 *   minimum, this requires elevating the ref counts and potentially needing
 *   an explicit pinning of the memory. This places additional restrictions
 *   on what type of memory backends userspace can use for shared guest
 *   memory since there is some reliance on using refcounted pages.
 *
 * - The response page needs to be switched to Firmware-owned[1] state
 *   before the firmware can write to it, which can lead to potential
 *   host RMP #PFs if the guest is misbehaved and hands the host a
 *   guest page that KVM might write to for other reasons (e.g. virtio
 *   buffers/etc.).
 *
 * Both of these issues can be avoided completely by using separately-allocated
 * bounce pages for both the request/response pages and passing those to
 * firmware instead. So that's what is being set up here.
 *
 * Guest requests rely on message sequence numbers to ensure requests are
 * issued to firmware in the order the guest issues them, so concurrent guest
 * requests generally shouldn't happen. But a misbehaved guest could issue
 * concurrent guest requests in theory, so a mutex is used to serialize
 * access to the bounce buffers.
 *
 * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
 *     details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
 *     in the APM for details on the related RMP restrictions.
453 */ 454 static int snp_guest_req_init(struct kvm *kvm) 455 { 456 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 457 struct page *req_page; 458 459 req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 460 if (!req_page) 461 return -ENOMEM; 462 463 sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 464 if (!sev->guest_resp_buf) { 465 __free_page(req_page); 466 return -EIO; 467 } 468 469 sev->guest_req_buf = page_address(req_page); 470 mutex_init(&sev->guest_req_mutex); 471 472 return 0; 473 } 474 475 static void snp_guest_req_cleanup(struct kvm *kvm) 476 { 477 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 478 479 if (sev->guest_resp_buf) 480 snp_free_firmware_page(sev->guest_resp_buf); 481 482 if (sev->guest_req_buf) 483 __free_page(virt_to_page(sev->guest_req_buf)); 484 485 sev->guest_req_buf = NULL; 486 sev->guest_resp_buf = NULL; 487 } 488 489 static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, 490 struct kvm_sev_init *data, 491 unsigned long vm_type) 492 { 493 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 494 struct sev_platform_init_args init_args = {0}; 495 bool es_active = vm_type != KVM_X86_SEV_VM; 496 bool snp_active = vm_type == KVM_X86_SNP_VM; 497 u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; 498 int ret; 499 500 if (kvm->created_vcpus) 501 return -EINVAL; 502 503 if (data->flags) 504 return -EINVAL; 505 506 if (!snp_active) 507 valid_vmsa_features &= ~SVM_SEV_FEAT_SECURE_TSC; 508 509 if (data->vmsa_features & ~valid_vmsa_features) 510 return -EINVAL; 511 512 if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) 513 return -EINVAL; 514 515 /* 516 * KVM supports the full range of mandatory features defined by version 517 * 2 of the GHCB protocol, so default to that for SEV-ES guests created 518 * via KVM_SEV_INIT2 (KVM_SEV_INIT forces version 1). 
519 */ 520 if (es_active && !data->ghcb_version) 521 data->ghcb_version = 2; 522 523 if (snp_active && data->ghcb_version < 2) 524 return -EINVAL; 525 526 if (unlikely(sev->active)) 527 return -EINVAL; 528 529 sev->active = true; 530 sev->es_active = es_active; 531 sev->vmsa_features = data->vmsa_features; 532 sev->ghcb_version = data->ghcb_version; 533 534 if (snp_active) 535 sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; 536 537 ret = sev_asid_new(sev, vm_type); 538 if (ret) 539 goto e_no_asid; 540 541 init_args.probe = false; 542 ret = sev_platform_init(&init_args); 543 if (ret) 544 goto e_free_asid; 545 546 if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 547 ret = -ENOMEM; 548 goto e_free_asid; 549 } 550 551 /* This needs to happen after SEV/SNP firmware initialization. */ 552 if (snp_active) { 553 ret = snp_guest_req_init(kvm); 554 if (ret) 555 goto e_free; 556 } 557 558 INIT_LIST_HEAD(&sev->regions_list); 559 INIT_LIST_HEAD(&sev->mirror_vms); 560 sev->need_init = false; 561 562 kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV); 563 564 return 0; 565 566 e_free: 567 free_cpumask_var(sev->have_run_cpus); 568 e_free_asid: 569 argp->error = init_args.error; 570 sev_asid_free(sev); 571 sev->asid = 0; 572 e_no_asid: 573 sev->vmsa_features = 0; 574 sev->es_active = false; 575 sev->active = false; 576 return ret; 577 } 578 579 static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) 580 { 581 struct kvm_sev_init data = { 582 .vmsa_features = 0, 583 .ghcb_version = 0, 584 }; 585 unsigned long vm_type; 586 587 if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM) 588 return -EINVAL; 589 590 vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM); 591 592 /* 593 * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will 594 * continue to only ever support the minimal GHCB protocol version. 
595 */ 596 if (vm_type == KVM_X86_SEV_ES_VM) 597 data.ghcb_version = GHCB_VERSION_MIN; 598 599 return __sev_guest_init(kvm, argp, &data, vm_type); 600 } 601 602 static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) 603 { 604 struct kvm_sev_init data; 605 606 if (!to_kvm_sev_info(kvm)->need_init) 607 return -EINVAL; 608 609 if (kvm->arch.vm_type != KVM_X86_SEV_VM && 610 kvm->arch.vm_type != KVM_X86_SEV_ES_VM && 611 kvm->arch.vm_type != KVM_X86_SNP_VM) 612 return -EINVAL; 613 614 if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) 615 return -EFAULT; 616 617 return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type); 618 } 619 620 static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) 621 { 622 unsigned int asid = sev_get_asid(kvm); 623 struct sev_data_activate activate; 624 int ret; 625 626 /* activate ASID on the given handle */ 627 activate.handle = handle; 628 activate.asid = asid; 629 ret = sev_guest_activate(&activate, error); 630 631 return ret; 632 } 633 634 static int __sev_issue_cmd(int fd, int id, void *data, int *error) 635 { 636 CLASS(fd, f)(fd); 637 638 if (fd_empty(f)) 639 return -EBADF; 640 641 return sev_issue_cmd_external_user(fd_file(f), id, data, error); 642 } 643 644 static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) 645 { 646 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 647 648 return __sev_issue_cmd(sev->fd, id, data, error); 649 } 650 651 static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 652 { 653 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 654 struct sev_data_launch_start start; 655 struct kvm_sev_launch_start params; 656 void *dh_blob, *session_blob; 657 int *error = &argp->error; 658 int ret; 659 660 if (!sev_guest(kvm)) 661 return -ENOTTY; 662 663 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 664 return -EFAULT; 665 666 memset(&start, 0, sizeof(start)); 667 668 dh_blob = NULL; 669 if (params.dh_uaddr) { 670 
dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len); 671 if (IS_ERR(dh_blob)) 672 return PTR_ERR(dh_blob); 673 674 start.dh_cert_address = __sme_set(__pa(dh_blob)); 675 start.dh_cert_len = params.dh_len; 676 } 677 678 session_blob = NULL; 679 if (params.session_uaddr) { 680 session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len); 681 if (IS_ERR(session_blob)) { 682 ret = PTR_ERR(session_blob); 683 goto e_free_dh; 684 } 685 686 start.session_address = __sme_set(__pa(session_blob)); 687 start.session_len = params.session_len; 688 } 689 690 start.handle = params.handle; 691 start.policy = params.policy; 692 693 /* create memory encryption context */ 694 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, &start, error); 695 if (ret) 696 goto e_free_session; 697 698 /* Bind ASID to this guest */ 699 ret = sev_bind_asid(kvm, start.handle, error); 700 if (ret) { 701 sev_decommission(start.handle); 702 goto e_free_session; 703 } 704 705 /* return handle to userspace */ 706 params.handle = start.handle; 707 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) { 708 sev_unbind_asid(kvm, start.handle); 709 ret = -EFAULT; 710 goto e_free_session; 711 } 712 713 sev->policy = params.policy; 714 sev->handle = start.handle; 715 sev->fd = argp->sev_fd; 716 717 e_free_session: 718 kfree(session_blob); 719 e_free_dh: 720 kfree(dh_blob); 721 return ret; 722 } 723 724 static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, 725 unsigned long ulen, unsigned long *n, 726 unsigned int flags) 727 { 728 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 729 unsigned long npages, total_npages, lock_limit; 730 struct page **pages; 731 int npinned, ret; 732 733 lockdep_assert_held(&kvm->lock); 734 735 if (ulen == 0 || uaddr + ulen < uaddr) 736 return ERR_PTR(-EINVAL); 737 738 /* 739 * Calculate the number of pages that need to be pinned to cover the 740 * entire range. Note! 
This isn't simply PFN_DOWN(ulen), as KVM 741 * doesn't require the incoming address+size to be page aligned! 742 */ 743 npages = PFN_DOWN(uaddr + ulen - 1) - PFN_DOWN(uaddr) + 1; 744 if (npages > INT_MAX) 745 return ERR_PTR(-EINVAL); 746 747 total_npages = sev->pages_locked + npages; 748 if (total_npages > totalram_pages()) 749 return ERR_PTR(-EINVAL); 750 751 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 752 if (total_npages > lock_limit && !capable(CAP_IPC_LOCK)) { 753 pr_err("SEV: %lu total pages would exceed the lock limit of %lu.\n", 754 total_npages, lock_limit); 755 return ERR_PTR(-ENOMEM); 756 } 757 758 /* 759 * Don't WARN if the kernel (rightly) thinks the total size is absurd, 760 * i.e. rely on the kernel to reject outrageous range sizes. The above 761 * check on the number of pages is purely to avoid truncation as 762 * pin_user_pages_fast() takes the number of pages as a 32-bit int. 763 */ 764 pages = kvzalloc_objs(*pages, npages, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 765 if (!pages) 766 return ERR_PTR(-ENOMEM); 767 768 /* Pin the user virtual address. 
*/ 769 npinned = pin_user_pages_fast(uaddr, npages, flags, pages); 770 if (npinned != npages) { 771 pr_err("SEV: Failure locking %lu pages.\n", npages); 772 ret = -ENOMEM; 773 goto err; 774 } 775 776 *n = npages; 777 sev->pages_locked = total_npages; 778 779 return pages; 780 781 err: 782 if (npinned > 0) 783 unpin_user_pages(pages, npinned); 784 785 kvfree(pages); 786 return ERR_PTR(ret); 787 } 788 789 static void sev_unpin_memory(struct kvm *kvm, struct page **pages, 790 unsigned long npages) 791 { 792 unpin_user_pages(pages, npages); 793 kvfree(pages); 794 to_kvm_sev_info(kvm)->pages_locked -= npages; 795 } 796 797 static void sev_clflush_pages(struct page *pages[], unsigned long npages) 798 { 799 uint8_t *page_virtual; 800 unsigned long i; 801 802 if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 || 803 pages == NULL) 804 return; 805 806 for (i = 0; i < npages; i++) { 807 page_virtual = kmap_local_page(pages[i]); 808 clflush_cache_range(page_virtual, PAGE_SIZE); 809 kunmap_local(page_virtual); 810 cond_resched(); 811 } 812 } 813 814 static void sev_writeback_caches(struct kvm *kvm) 815 { 816 /* 817 * Ensure that all dirty guest tagged cache entries are written back 818 * before releasing the pages back to the system for use. CLFLUSH will 819 * not do this without SME_COHERENT, and flushing many cache lines 820 * individually is slower than blasting WBINVD for large VMs, so issue 821 * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported) 822 * on CPUs that have done VMRUN, i.e. may have dirtied data using the 823 * VM's ASID. 824 * 825 * For simplicity, never remove CPUs from the bitmap. Ideally, KVM 826 * would clear the mask when flushing caches, but doing so requires 827 * serializing multiple calls and having responding CPUs (to the IPI) 828 * mark themselves as still running if they are running (or about to 829 * run) a vCPU for the VM. 
830 * 831 * Note, the caller is responsible for ensuring correctness if the mask 832 * can be modified, e.g. if a CPU could be doing VMRUN. 833 */ 834 wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus); 835 } 836 837 static unsigned long get_num_contig_pages(unsigned long idx, 838 struct page **inpages, unsigned long npages) 839 { 840 unsigned long paddr, next_paddr; 841 unsigned long i = idx + 1, pages = 1; 842 843 /* find the number of contiguous pages starting from idx */ 844 paddr = __sme_page_pa(inpages[idx]); 845 while (i < npages) { 846 next_paddr = __sme_page_pa(inpages[i++]); 847 if ((paddr + PAGE_SIZE) == next_paddr) { 848 pages++; 849 paddr = next_paddr; 850 continue; 851 } 852 break; 853 } 854 855 return pages; 856 } 857 858 static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 859 { 860 unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; 861 struct kvm_sev_launch_update_data params; 862 struct sev_data_launch_update_data data; 863 struct page **inpages; 864 int ret; 865 866 if (!sev_guest(kvm)) 867 return -ENOTTY; 868 869 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 870 return -EFAULT; 871 872 vaddr = params.uaddr; 873 size = params.len; 874 vaddr_end = vaddr + size; 875 876 /* Lock the user memory. */ 877 inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); 878 if (IS_ERR(inpages)) 879 return PTR_ERR(inpages); 880 881 /* 882 * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in 883 * place; the cache may contain the data that was written unencrypted. 884 */ 885 sev_clflush_pages(inpages, npages); 886 887 data.reserved = 0; 888 data.handle = to_kvm_sev_info(kvm)->handle; 889 890 for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { 891 int offset, len; 892 893 /* 894 * If the user buffer is not page-aligned, calculate the offset 895 * within the page. 
896 */ 897 offset = vaddr & (PAGE_SIZE - 1); 898 899 /* Calculate the number of pages that can be encrypted in one go. */ 900 pages = get_num_contig_pages(i, inpages, npages); 901 902 len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size); 903 904 data.len = len; 905 data.address = __sme_page_pa(inpages[i]) + offset; 906 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error); 907 if (ret) 908 goto e_unpin; 909 910 size -= len; 911 next_vaddr = vaddr + len; 912 } 913 914 e_unpin: 915 /* content of memory is updated, mark pages dirty */ 916 for (i = 0; i < npages; i++) { 917 set_page_dirty_lock(inpages[i]); 918 mark_page_accessed(inpages[i]); 919 } 920 /* unlock the user pages */ 921 sev_unpin_memory(kvm, inpages, npages); 922 return ret; 923 } 924 925 static int sev_es_sync_vmsa(struct vcpu_svm *svm) 926 { 927 struct kvm_vcpu *vcpu = &svm->vcpu; 928 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 929 struct sev_es_save_area *save = svm->sev_es.vmsa; 930 struct xregs_state *xsave; 931 const u8 *s; 932 u8 *d; 933 int i; 934 935 lockdep_assert_held(&vcpu->mutex); 936 937 if (vcpu->arch.guest_state_protected) 938 return -EINVAL; 939 940 /* Check some debug related fields before encrypting the VMSA */ 941 if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) 942 return -EINVAL; 943 944 /* 945 * SEV-ES will use a VMSA that is pointed to by the VMCB, not 946 * the traditional VMSA that is part of the VMCB. Copy the 947 * traditional VMSA as it has been built so far (in prep 948 * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. 
949 */ 950 memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save)); 951 952 /* Sync registgers */ 953 save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX]; 954 save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX]; 955 save->rcx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 956 save->rdx = svm->vcpu.arch.regs[VCPU_REGS_RDX]; 957 save->rsp = svm->vcpu.arch.regs[VCPU_REGS_RSP]; 958 save->rbp = svm->vcpu.arch.regs[VCPU_REGS_RBP]; 959 save->rsi = svm->vcpu.arch.regs[VCPU_REGS_RSI]; 960 save->rdi = svm->vcpu.arch.regs[VCPU_REGS_RDI]; 961 #ifdef CONFIG_X86_64 962 save->r8 = svm->vcpu.arch.regs[VCPU_REGS_R8]; 963 save->r9 = svm->vcpu.arch.regs[VCPU_REGS_R9]; 964 save->r10 = svm->vcpu.arch.regs[VCPU_REGS_R10]; 965 save->r11 = svm->vcpu.arch.regs[VCPU_REGS_R11]; 966 save->r12 = svm->vcpu.arch.regs[VCPU_REGS_R12]; 967 save->r13 = svm->vcpu.arch.regs[VCPU_REGS_R13]; 968 save->r14 = svm->vcpu.arch.regs[VCPU_REGS_R14]; 969 save->r15 = svm->vcpu.arch.regs[VCPU_REGS_R15]; 970 #endif 971 save->rip = svm->vcpu.arch.regs[VCPU_REGS_RIP]; 972 973 /* Sync some non-GPR registers before encrypting */ 974 save->xcr0 = svm->vcpu.arch.xcr0; 975 save->pkru = svm->vcpu.arch.pkru; 976 save->xss = svm->vcpu.arch.ia32_xss; 977 save->dr6 = svm->vcpu.arch.dr6; 978 979 save->sev_features = sev->vmsa_features; 980 981 /* 982 * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid 983 * breaking older measurements. 984 */ 985 if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) { 986 xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave; 987 save->x87_dp = xsave->i387.rdp; 988 save->mxcsr = xsave->i387.mxcsr; 989 save->x87_ftw = xsave->i387.twd; 990 save->x87_fsw = xsave->i387.swd; 991 save->x87_fcw = xsave->i387.cwd; 992 save->x87_fop = xsave->i387.fop; 993 save->x87_ds = 0; 994 save->x87_cs = 0; 995 save->x87_rip = xsave->i387.rip; 996 997 for (i = 0; i < 8; i++) { 998 /* 999 * The format of the x87 save area is undocumented and 1000 * definitely not what you would expect. 
It consists of 1001 * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes 1002 * area with bytes 8-9 of each register. 1003 */ 1004 d = save->fpreg_x87 + i * 8; 1005 s = ((u8 *)xsave->i387.st_space) + i * 16; 1006 memcpy(d, s, 8); 1007 save->fpreg_x87[64 + i * 2] = s[8]; 1008 save->fpreg_x87[64 + i * 2 + 1] = s[9]; 1009 } 1010 memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256); 1011 1012 s = get_xsave_addr(xsave, XFEATURE_YMM); 1013 if (s) 1014 memcpy(save->fpreg_ymm, s, 256); 1015 else 1016 memset(save->fpreg_ymm, 0, 256); 1017 } 1018 1019 pr_debug("Virtual Machine Save Area (VMSA):\n"); 1020 print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); 1021 1022 return 0; 1023 } 1024 1025 static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, 1026 int *error) 1027 { 1028 struct sev_data_launch_update_vmsa vmsa; 1029 struct vcpu_svm *svm = to_svm(vcpu); 1030 int ret; 1031 1032 if (vcpu->guest_debug) { 1033 pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported"); 1034 return -EINVAL; 1035 } 1036 1037 /* Perform some pre-encryption checks against the VMSA */ 1038 ret = sev_es_sync_vmsa(svm); 1039 if (ret) 1040 return ret; 1041 1042 /* 1043 * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of 1044 * the VMSA memory content (i.e it will write the same memory region 1045 * with the guest's key), so invalidate it first. 1046 */ 1047 clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); 1048 1049 vmsa.reserved = 0; 1050 vmsa.handle = to_kvm_sev_info(kvm)->handle; 1051 vmsa.address = __sme_pa(svm->sev_es.vmsa); 1052 vmsa.len = PAGE_SIZE; 1053 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); 1054 if (ret) 1055 return ret; 1056 1057 /* 1058 * SEV-ES guests maintain an encrypted version of their FPU 1059 * state which is restored and saved on VMRUN and VMEXIT. 1060 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't 1061 * do xsave/xrstor on it. 
1062 */ 1063 fpstate_set_confidential(&vcpu->arch.guest_fpu); 1064 vcpu->arch.guest_state_protected = true; 1065 1066 /* 1067 * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it 1068 * only after setting guest_state_protected because KVM_SET_MSRS allows 1069 * dynamic toggling of LBRV (for performance reason) on write access to 1070 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. 1071 */ 1072 svm_enable_lbrv(vcpu); 1073 return 0; 1074 } 1075 1076 static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) 1077 { 1078 struct kvm_vcpu *vcpu; 1079 unsigned long i; 1080 int ret; 1081 1082 if (!sev_es_guest(kvm)) 1083 return -ENOTTY; 1084 1085 if (kvm_is_vcpu_creation_in_progress(kvm)) 1086 return -EBUSY; 1087 1088 ret = kvm_lock_all_vcpus(kvm); 1089 if (ret) 1090 return ret; 1091 1092 kvm_for_each_vcpu(i, vcpu, kvm) { 1093 ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error); 1094 if (ret) 1095 break; 1096 } 1097 1098 kvm_unlock_all_vcpus(kvm); 1099 return ret; 1100 } 1101 1102 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) 1103 { 1104 void __user *measure = u64_to_user_ptr(argp->data); 1105 struct sev_data_launch_measure data; 1106 struct kvm_sev_launch_measure params; 1107 void __user *p = NULL; 1108 void *blob = NULL; 1109 int ret; 1110 1111 if (!sev_guest(kvm)) 1112 return -ENOTTY; 1113 1114 if (copy_from_user(¶ms, measure, sizeof(params))) 1115 return -EFAULT; 1116 1117 memset(&data, 0, sizeof(data)); 1118 1119 /* User wants to query the blob length */ 1120 if (!params.len) 1121 goto cmd; 1122 1123 p = u64_to_user_ptr(params.uaddr); 1124 if (p) { 1125 if (params.len > SEV_FW_BLOB_MAX_SIZE) 1126 return -EINVAL; 1127 1128 blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT); 1129 if (!blob) 1130 return -ENOMEM; 1131 1132 data.address = __psp_pa(blob); 1133 data.len = params.len; 1134 } 1135 1136 cmd: 1137 data.handle = to_kvm_sev_info(kvm)->handle; 1138 ret = sev_issue_cmd(kvm, 
SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); 1139 1140 /* 1141 * If we query the session length, FW responded with expected data. 1142 */ 1143 if (!params.len) 1144 goto done; 1145 1146 if (ret) 1147 goto e_free_blob; 1148 1149 if (blob) { 1150 if (copy_to_user(p, blob, params.len)) 1151 ret = -EFAULT; 1152 } 1153 1154 done: 1155 params.len = data.len; 1156 if (copy_to_user(measure, ¶ms, sizeof(params))) 1157 ret = -EFAULT; 1158 e_free_blob: 1159 kfree(blob); 1160 return ret; 1161 } 1162 1163 static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1164 { 1165 struct sev_data_launch_finish data; 1166 1167 if (!sev_guest(kvm)) 1168 return -ENOTTY; 1169 1170 data.handle = to_kvm_sev_info(kvm)->handle; 1171 return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); 1172 } 1173 1174 static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) 1175 { 1176 struct kvm_sev_guest_status params; 1177 struct sev_data_guest_status data; 1178 int ret; 1179 1180 if (!sev_guest(kvm)) 1181 return -ENOTTY; 1182 1183 memset(&data, 0, sizeof(data)); 1184 1185 data.handle = to_kvm_sev_info(kvm)->handle; 1186 ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); 1187 if (ret) 1188 return ret; 1189 1190 params.policy = data.policy; 1191 params.state = data.state; 1192 params.handle = data.handle; 1193 1194 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) 1195 ret = -EFAULT; 1196 1197 return ret; 1198 } 1199 1200 static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, 1201 unsigned long dst, int size, 1202 int *error, bool enc) 1203 { 1204 struct sev_data_dbg data; 1205 1206 data.reserved = 0; 1207 data.handle = to_kvm_sev_info(kvm)->handle; 1208 data.dst_addr = dst; 1209 data.src_addr = src; 1210 data.len = size; 1211 1212 return sev_issue_cmd(kvm, 1213 enc ? 
SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT, 1214 &data, error); 1215 } 1216 1217 static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr, 1218 unsigned long dst_paddr, int sz, int *err) 1219 { 1220 int offset; 1221 1222 /* 1223 * Its safe to read more than we are asked, caller should ensure that 1224 * destination has enough space. 1225 */ 1226 offset = src_paddr & 15; 1227 src_paddr = round_down(src_paddr, 16); 1228 sz = round_up(sz + offset, 16); 1229 1230 return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false); 1231 } 1232 1233 static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, 1234 void __user *dst_uaddr, 1235 unsigned long dst_paddr, 1236 int size, int *err) 1237 { 1238 struct page *tpage = NULL; 1239 int ret, offset; 1240 1241 /* if inputs are not 16-byte then use intermediate buffer */ 1242 if (!IS_ALIGNED(dst_paddr, 16) || 1243 !IS_ALIGNED(paddr, 16) || 1244 !IS_ALIGNED(size, 16)) { 1245 tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 1246 if (!tpage) 1247 return -ENOMEM; 1248 1249 dst_paddr = __sme_page_pa(tpage); 1250 } 1251 1252 ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err); 1253 if (ret) 1254 goto e_free; 1255 1256 if (tpage) { 1257 offset = paddr & 15; 1258 if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size)) 1259 ret = -EFAULT; 1260 } 1261 1262 e_free: 1263 if (tpage) 1264 __free_page(tpage); 1265 1266 return ret; 1267 } 1268 1269 static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, 1270 void __user *vaddr, 1271 unsigned long dst_paddr, 1272 void __user *dst_vaddr, 1273 int size, int *error) 1274 { 1275 struct page *src_tpage = NULL; 1276 struct page *dst_tpage = NULL; 1277 int ret, len = size; 1278 1279 /* If source buffer is not aligned then use an intermediate buffer */ 1280 if (!IS_ALIGNED((unsigned long)vaddr, 16)) { 1281 src_tpage = alloc_page(GFP_KERNEL_ACCOUNT); 1282 if (!src_tpage) 1283 return -ENOMEM; 1284 1285 if 
(copy_from_user(page_address(src_tpage), vaddr, size)) { 1286 __free_page(src_tpage); 1287 return -EFAULT; 1288 } 1289 1290 paddr = __sme_page_pa(src_tpage); 1291 } 1292 1293 /* 1294 * If destination buffer or length is not aligned then do read-modify-write: 1295 * - decrypt destination in an intermediate buffer 1296 * - copy the source buffer in an intermediate buffer 1297 * - use the intermediate buffer as source buffer 1298 */ 1299 if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { 1300 int dst_offset; 1301 1302 dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT); 1303 if (!dst_tpage) { 1304 ret = -ENOMEM; 1305 goto e_free; 1306 } 1307 1308 ret = __sev_dbg_decrypt(kvm, dst_paddr, 1309 __sme_page_pa(dst_tpage), size, error); 1310 if (ret) 1311 goto e_free; 1312 1313 /* 1314 * If source is kernel buffer then use memcpy() otherwise 1315 * copy_from_user(). 1316 */ 1317 dst_offset = dst_paddr & 15; 1318 1319 if (src_tpage) 1320 memcpy(page_address(dst_tpage) + dst_offset, 1321 page_address(src_tpage), size); 1322 else { 1323 if (copy_from_user(page_address(dst_tpage) + dst_offset, 1324 vaddr, size)) { 1325 ret = -EFAULT; 1326 goto e_free; 1327 } 1328 } 1329 1330 paddr = __sme_page_pa(dst_tpage); 1331 dst_paddr = round_down(dst_paddr, 16); 1332 len = round_up(size, 16); 1333 } 1334 1335 ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true); 1336 1337 e_free: 1338 if (src_tpage) 1339 __free_page(src_tpage); 1340 if (dst_tpage) 1341 __free_page(dst_tpage); 1342 return ret; 1343 } 1344 1345 static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) 1346 { 1347 unsigned long vaddr, vaddr_end, next_vaddr; 1348 unsigned long dst_vaddr; 1349 struct page **src_p, **dst_p; 1350 struct kvm_sev_dbg debug; 1351 unsigned long n; 1352 unsigned int size; 1353 int ret; 1354 1355 if (!sev_guest(kvm)) 1356 return -ENOTTY; 1357 1358 if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug))) 1359 return -EFAULT; 1360 1361 if 
(!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr) 1362 return -EINVAL; 1363 if (!debug.dst_uaddr) 1364 return -EINVAL; 1365 1366 vaddr = debug.src_uaddr; 1367 size = debug.len; 1368 vaddr_end = vaddr + size; 1369 dst_vaddr = debug.dst_uaddr; 1370 1371 for (; vaddr < vaddr_end; vaddr = next_vaddr) { 1372 int len, s_off, d_off; 1373 1374 /* lock userspace source and destination page */ 1375 src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0); 1376 if (IS_ERR(src_p)) 1377 return PTR_ERR(src_p); 1378 1379 dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE); 1380 if (IS_ERR(dst_p)) { 1381 sev_unpin_memory(kvm, src_p, n); 1382 return PTR_ERR(dst_p); 1383 } 1384 1385 /* 1386 * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify 1387 * the pages; flush the destination too so that future accesses do not 1388 * see stale data. 1389 */ 1390 sev_clflush_pages(src_p, 1); 1391 sev_clflush_pages(dst_p, 1); 1392 1393 /* 1394 * Since user buffer may not be page aligned, calculate the 1395 * offset within the page. 
1396 */ 1397 s_off = vaddr & ~PAGE_MASK; 1398 d_off = dst_vaddr & ~PAGE_MASK; 1399 len = min_t(size_t, (PAGE_SIZE - s_off), size); 1400 1401 if (dec) 1402 ret = __sev_dbg_decrypt_user(kvm, 1403 __sme_page_pa(src_p[0]) + s_off, 1404 (void __user *)dst_vaddr, 1405 __sme_page_pa(dst_p[0]) + d_off, 1406 len, &argp->error); 1407 else 1408 ret = __sev_dbg_encrypt_user(kvm, 1409 __sme_page_pa(src_p[0]) + s_off, 1410 (void __user *)vaddr, 1411 __sme_page_pa(dst_p[0]) + d_off, 1412 (void __user *)dst_vaddr, 1413 len, &argp->error); 1414 1415 sev_unpin_memory(kvm, src_p, n); 1416 sev_unpin_memory(kvm, dst_p, n); 1417 1418 if (ret) 1419 goto err; 1420 1421 next_vaddr = vaddr + len; 1422 dst_vaddr = dst_vaddr + len; 1423 size -= len; 1424 } 1425 err: 1426 return ret; 1427 } 1428 1429 static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) 1430 { 1431 struct sev_data_launch_secret data; 1432 struct kvm_sev_launch_secret params; 1433 struct page **pages; 1434 void *blob, *hdr; 1435 unsigned long n, i; 1436 int ret, offset; 1437 1438 if (!sev_guest(kvm)) 1439 return -ENOTTY; 1440 1441 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 1442 return -EFAULT; 1443 1444 pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE); 1445 if (IS_ERR(pages)) 1446 return PTR_ERR(pages); 1447 1448 /* 1449 * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in 1450 * place; the cache may contain the data that was written unencrypted. 1451 */ 1452 sev_clflush_pages(pages, n); 1453 1454 /* 1455 * The secret must be copied into contiguous memory region, lets verify 1456 * that userspace memory pages are contiguous before we issue command. 
1457 */ 1458 if (get_num_contig_pages(0, pages, n) != n) { 1459 ret = -EINVAL; 1460 goto e_unpin_memory; 1461 } 1462 1463 memset(&data, 0, sizeof(data)); 1464 1465 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1466 data.guest_address = __sme_page_pa(pages[0]) + offset; 1467 data.guest_len = params.guest_len; 1468 1469 blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len); 1470 if (IS_ERR(blob)) { 1471 ret = PTR_ERR(blob); 1472 goto e_unpin_memory; 1473 } 1474 1475 data.trans_address = __psp_pa(blob); 1476 data.trans_len = params.trans_len; 1477 1478 hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); 1479 if (IS_ERR(hdr)) { 1480 ret = PTR_ERR(hdr); 1481 goto e_free_blob; 1482 } 1483 data.hdr_address = __psp_pa(hdr); 1484 data.hdr_len = params.hdr_len; 1485 1486 data.handle = to_kvm_sev_info(kvm)->handle; 1487 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); 1488 1489 kfree(hdr); 1490 1491 e_free_blob: 1492 kfree(blob); 1493 e_unpin_memory: 1494 /* content of memory is updated, mark pages dirty */ 1495 for (i = 0; i < n; i++) { 1496 set_page_dirty_lock(pages[i]); 1497 mark_page_accessed(pages[i]); 1498 } 1499 sev_unpin_memory(kvm, pages, n); 1500 return ret; 1501 } 1502 1503 static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) 1504 { 1505 void __user *report = u64_to_user_ptr(argp->data); 1506 struct sev_data_attestation_report data; 1507 struct kvm_sev_attestation_report params; 1508 void __user *p; 1509 void *blob = NULL; 1510 int ret; 1511 1512 if (!sev_guest(kvm)) 1513 return -ENOTTY; 1514 1515 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 1516 return -EFAULT; 1517 1518 memset(&data, 0, sizeof(data)); 1519 1520 /* User wants to query the blob length */ 1521 if (!params.len) 1522 goto cmd; 1523 1524 p = u64_to_user_ptr(params.uaddr); 1525 if (p) { 1526 if (params.len > SEV_FW_BLOB_MAX_SIZE) 1527 return -EINVAL; 1528 1529 blob = kzalloc(params.len, 
GFP_KERNEL_ACCOUNT); 1530 if (!blob) 1531 return -ENOMEM; 1532 1533 data.address = __psp_pa(blob); 1534 data.len = params.len; 1535 memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); 1536 } 1537 cmd: 1538 data.handle = to_kvm_sev_info(kvm)->handle; 1539 ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); 1540 /* 1541 * If we query the session length, FW responded with expected data. 1542 */ 1543 if (!params.len) 1544 goto done; 1545 1546 if (ret) 1547 goto e_free_blob; 1548 1549 if (blob) { 1550 if (copy_to_user(p, blob, params.len)) 1551 ret = -EFAULT; 1552 } 1553 1554 done: 1555 params.len = data.len; 1556 if (copy_to_user(report, ¶ms, sizeof(params))) 1557 ret = -EFAULT; 1558 e_free_blob: 1559 kfree(blob); 1560 return ret; 1561 } 1562 1563 /* Userspace wants to query session length. */ 1564 static int 1565 __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, 1566 struct kvm_sev_send_start *params) 1567 { 1568 struct sev_data_send_start data; 1569 int ret; 1570 1571 memset(&data, 0, sizeof(data)); 1572 data.handle = to_kvm_sev_info(kvm)->handle; 1573 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); 1574 1575 params->session_len = data.session_len; 1576 if (copy_to_user(u64_to_user_ptr(argp->data), params, 1577 sizeof(struct kvm_sev_send_start))) 1578 ret = -EFAULT; 1579 1580 return ret; 1581 } 1582 1583 static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 1584 { 1585 struct sev_data_send_start data; 1586 struct kvm_sev_send_start params; 1587 void *amd_certs, *session_data; 1588 void *pdh_cert, *plat_certs; 1589 int ret; 1590 1591 if (!sev_guest(kvm)) 1592 return -ENOTTY; 1593 1594 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1595 sizeof(struct kvm_sev_send_start))) 1596 return -EFAULT; 1597 1598 /* if session_len is zero, userspace wants to query the session length */ 1599 if (!params.session_len) 1600 return __sev_send_start_query_session_length(kvm, argp, 1601 
¶ms); 1602 1603 /* some sanity checks */ 1604 if (!params.pdh_cert_uaddr || !params.pdh_cert_len || 1605 !params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE) 1606 return -EINVAL; 1607 1608 /* allocate the memory to hold the session data blob */ 1609 session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT); 1610 if (!session_data) 1611 return -ENOMEM; 1612 1613 /* copy the certificate blobs from userspace */ 1614 pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr, 1615 params.pdh_cert_len); 1616 if (IS_ERR(pdh_cert)) { 1617 ret = PTR_ERR(pdh_cert); 1618 goto e_free_session; 1619 } 1620 1621 plat_certs = psp_copy_user_blob(params.plat_certs_uaddr, 1622 params.plat_certs_len); 1623 if (IS_ERR(plat_certs)) { 1624 ret = PTR_ERR(plat_certs); 1625 goto e_free_pdh; 1626 } 1627 1628 amd_certs = psp_copy_user_blob(params.amd_certs_uaddr, 1629 params.amd_certs_len); 1630 if (IS_ERR(amd_certs)) { 1631 ret = PTR_ERR(amd_certs); 1632 goto e_free_plat_cert; 1633 } 1634 1635 /* populate the FW SEND_START field with system physical address */ 1636 memset(&data, 0, sizeof(data)); 1637 data.pdh_cert_address = __psp_pa(pdh_cert); 1638 data.pdh_cert_len = params.pdh_cert_len; 1639 data.plat_certs_address = __psp_pa(plat_certs); 1640 data.plat_certs_len = params.plat_certs_len; 1641 data.amd_certs_address = __psp_pa(amd_certs); 1642 data.amd_certs_len = params.amd_certs_len; 1643 data.session_address = __psp_pa(session_data); 1644 data.session_len = params.session_len; 1645 data.handle = to_kvm_sev_info(kvm)->handle; 1646 1647 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); 1648 1649 if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr), 1650 session_data, params.session_len)) { 1651 ret = -EFAULT; 1652 goto e_free_amd_cert; 1653 } 1654 1655 params.policy = data.policy; 1656 params.session_len = data.session_len; 1657 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, 1658 sizeof(struct kvm_sev_send_start))) 1659 ret = -EFAULT; 1660 1661 
e_free_amd_cert: 1662 kfree(amd_certs); 1663 e_free_plat_cert: 1664 kfree(plat_certs); 1665 e_free_pdh: 1666 kfree(pdh_cert); 1667 e_free_session: 1668 kfree(session_data); 1669 return ret; 1670 } 1671 1672 /* Userspace wants to query either header or trans length. */ 1673 static int 1674 __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, 1675 struct kvm_sev_send_update_data *params) 1676 { 1677 struct sev_data_send_update_data data; 1678 int ret; 1679 1680 memset(&data, 0, sizeof(data)); 1681 data.handle = to_kvm_sev_info(kvm)->handle; 1682 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); 1683 1684 params->hdr_len = data.hdr_len; 1685 params->trans_len = data.trans_len; 1686 1687 if (copy_to_user(u64_to_user_ptr(argp->data), params, 1688 sizeof(struct kvm_sev_send_update_data))) 1689 ret = -EFAULT; 1690 1691 return ret; 1692 } 1693 1694 static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 1695 { 1696 struct sev_data_send_update_data data; 1697 struct kvm_sev_send_update_data params; 1698 void *hdr, *trans_data; 1699 struct page **guest_page; 1700 unsigned long n; 1701 int ret, offset; 1702 1703 if (!sev_guest(kvm)) 1704 return -ENOTTY; 1705 1706 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1707 sizeof(struct kvm_sev_send_update_data))) 1708 return -EFAULT; 1709 1710 /* userspace wants to query either header or trans length */ 1711 if (!params.trans_len || !params.hdr_len) 1712 return __sev_send_update_data_query_lengths(kvm, argp, ¶ms); 1713 1714 if (!params.trans_uaddr || !params.guest_uaddr || 1715 !params.guest_len || !params.hdr_uaddr) 1716 return -EINVAL; 1717 1718 /* Check if we are crossing the page boundary */ 1719 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1720 if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) 1721 return -EINVAL; 1722 1723 /* Pin guest memory */ 1724 guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, 1725 
PAGE_SIZE, &n, 0); 1726 if (IS_ERR(guest_page)) 1727 return PTR_ERR(guest_page); 1728 1729 /* allocate memory for header and transport buffer */ 1730 ret = -ENOMEM; 1731 hdr = kzalloc(params.hdr_len, GFP_KERNEL); 1732 if (!hdr) 1733 goto e_unpin; 1734 1735 trans_data = kzalloc(params.trans_len, GFP_KERNEL); 1736 if (!trans_data) 1737 goto e_free_hdr; 1738 1739 memset(&data, 0, sizeof(data)); 1740 data.hdr_address = __psp_pa(hdr); 1741 data.hdr_len = params.hdr_len; 1742 data.trans_address = __psp_pa(trans_data); 1743 data.trans_len = params.trans_len; 1744 1745 /* The SEND_UPDATE_DATA command requires C-bit to be always set. */ 1746 data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; 1747 data.guest_address |= sev_me_mask; 1748 data.guest_len = params.guest_len; 1749 data.handle = to_kvm_sev_info(kvm)->handle; 1750 1751 ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); 1752 1753 if (ret) 1754 goto e_free_trans_data; 1755 1756 /* copy transport buffer to user space */ 1757 if (copy_to_user(u64_to_user_ptr(params.trans_uaddr), 1758 trans_data, params.trans_len)) { 1759 ret = -EFAULT; 1760 goto e_free_trans_data; 1761 } 1762 1763 /* Copy packet header to userspace. 
*/ 1764 if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr, 1765 params.hdr_len)) 1766 ret = -EFAULT; 1767 1768 e_free_trans_data: 1769 kfree(trans_data); 1770 e_free_hdr: 1771 kfree(hdr); 1772 e_unpin: 1773 sev_unpin_memory(kvm, guest_page, n); 1774 1775 return ret; 1776 } 1777 1778 static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1779 { 1780 struct sev_data_send_finish data; 1781 1782 if (!sev_guest(kvm)) 1783 return -ENOTTY; 1784 1785 data.handle = to_kvm_sev_info(kvm)->handle; 1786 return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); 1787 } 1788 1789 static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) 1790 { 1791 struct sev_data_send_cancel data; 1792 1793 if (!sev_guest(kvm)) 1794 return -ENOTTY; 1795 1796 data.handle = to_kvm_sev_info(kvm)->handle; 1797 return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); 1798 } 1799 1800 static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 1801 { 1802 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 1803 struct sev_data_receive_start start; 1804 struct kvm_sev_receive_start params; 1805 int *error = &argp->error; 1806 void *session_data; 1807 void *pdh_data; 1808 int ret; 1809 1810 if (!sev_guest(kvm)) 1811 return -ENOTTY; 1812 1813 /* Get parameter from the userspace */ 1814 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1815 sizeof(struct kvm_sev_receive_start))) 1816 return -EFAULT; 1817 1818 /* some sanity checks */ 1819 if (!params.pdh_uaddr || !params.pdh_len || 1820 !params.session_uaddr || !params.session_len) 1821 return -EINVAL; 1822 1823 pdh_data = psp_copy_user_blob(params.pdh_uaddr, params.pdh_len); 1824 if (IS_ERR(pdh_data)) 1825 return PTR_ERR(pdh_data); 1826 1827 session_data = psp_copy_user_blob(params.session_uaddr, 1828 params.session_len); 1829 if (IS_ERR(session_data)) { 1830 ret = PTR_ERR(session_data); 1831 goto e_free_pdh; 1832 } 1833 1834 memset(&start, 0, sizeof(start)); 1835 start.handle = 
params.handle; 1836 start.policy = params.policy; 1837 start.pdh_cert_address = __psp_pa(pdh_data); 1838 start.pdh_cert_len = params.pdh_len; 1839 start.session_address = __psp_pa(session_data); 1840 start.session_len = params.session_len; 1841 1842 /* create memory encryption context */ 1843 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_RECEIVE_START, &start, 1844 error); 1845 if (ret) 1846 goto e_free_session; 1847 1848 /* Bind ASID to this guest */ 1849 ret = sev_bind_asid(kvm, start.handle, error); 1850 if (ret) { 1851 sev_decommission(start.handle); 1852 goto e_free_session; 1853 } 1854 1855 params.handle = start.handle; 1856 if (copy_to_user(u64_to_user_ptr(argp->data), 1857 ¶ms, sizeof(struct kvm_sev_receive_start))) { 1858 ret = -EFAULT; 1859 sev_unbind_asid(kvm, start.handle); 1860 goto e_free_session; 1861 } 1862 1863 sev->handle = start.handle; 1864 sev->fd = argp->sev_fd; 1865 1866 e_free_session: 1867 kfree(session_data); 1868 e_free_pdh: 1869 kfree(pdh_data); 1870 1871 return ret; 1872 } 1873 1874 static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) 1875 { 1876 struct kvm_sev_receive_update_data params; 1877 struct sev_data_receive_update_data data; 1878 void *hdr = NULL, *trans = NULL; 1879 struct page **guest_page; 1880 unsigned long n; 1881 int ret, offset; 1882 1883 if (!sev_guest(kvm)) 1884 return -EINVAL; 1885 1886 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), 1887 sizeof(struct kvm_sev_receive_update_data))) 1888 return -EFAULT; 1889 1890 if (!params.hdr_uaddr || !params.hdr_len || 1891 !params.guest_uaddr || !params.guest_len || 1892 !params.trans_uaddr || !params.trans_len) 1893 return -EINVAL; 1894 1895 /* Check if we are crossing the page boundary */ 1896 offset = params.guest_uaddr & (PAGE_SIZE - 1); 1897 if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) 1898 return -EINVAL; 1899 1900 hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); 1901 if (IS_ERR(hdr)) 1902 return 
PTR_ERR(hdr); 1903 1904 trans = psp_copy_user_blob(params.trans_uaddr, params.trans_len); 1905 if (IS_ERR(trans)) { 1906 ret = PTR_ERR(trans); 1907 goto e_free_hdr; 1908 } 1909 1910 memset(&data, 0, sizeof(data)); 1911 data.hdr_address = __psp_pa(hdr); 1912 data.hdr_len = params.hdr_len; 1913 data.trans_address = __psp_pa(trans); 1914 data.trans_len = params.trans_len; 1915 1916 /* Pin guest memory */ 1917 guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, 1918 PAGE_SIZE, &n, FOLL_WRITE); 1919 if (IS_ERR(guest_page)) { 1920 ret = PTR_ERR(guest_page); 1921 goto e_free_trans; 1922 } 1923 1924 /* 1925 * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP 1926 * encrypts the written data with the guest's key, and the cache may 1927 * contain dirty, unencrypted data. 1928 */ 1929 sev_clflush_pages(guest_page, n); 1930 1931 /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ 1932 data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; 1933 data.guest_address |= sev_me_mask; 1934 data.guest_len = params.guest_len; 1935 data.handle = to_kvm_sev_info(kvm)->handle; 1936 1937 ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, 1938 &argp->error); 1939 1940 sev_unpin_memory(kvm, guest_page, n); 1941 1942 e_free_trans: 1943 kfree(trans); 1944 e_free_hdr: 1945 kfree(hdr); 1946 1947 return ret; 1948 } 1949 1950 static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 1951 { 1952 struct sev_data_receive_finish data; 1953 1954 if (!sev_guest(kvm)) 1955 return -ENOTTY; 1956 1957 data.handle = to_kvm_sev_info(kvm)->handle; 1958 return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); 1959 } 1960 1961 static bool is_cmd_allowed_from_mirror(u32 cmd_id) 1962 { 1963 /* 1964 * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES 1965 * active mirror VMs. Also allow the debugging and status commands. 
1966 */ 1967 if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA || 1968 cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT || 1969 cmd_id == KVM_SEV_DBG_ENCRYPT) 1970 return true; 1971 1972 return false; 1973 } 1974 1975 static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) 1976 { 1977 struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); 1978 struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); 1979 int r = -EBUSY; 1980 1981 if (dst_kvm == src_kvm) 1982 return -EINVAL; 1983 1984 /* 1985 * Bail if these VMs are already involved in a migration to avoid 1986 * deadlock between two VMs trying to migrate to/from each other. 1987 */ 1988 if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) 1989 return -EBUSY; 1990 1991 if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) 1992 goto release_dst; 1993 1994 r = -EINTR; 1995 if (mutex_lock_killable(&dst_kvm->lock)) 1996 goto release_src; 1997 if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) 1998 goto unlock_dst; 1999 return 0; 2000 2001 unlock_dst: 2002 mutex_unlock(&dst_kvm->lock); 2003 release_src: 2004 atomic_set_release(&src_sev->migration_in_progress, 0); 2005 release_dst: 2006 atomic_set_release(&dst_sev->migration_in_progress, 0); 2007 return r; 2008 } 2009 2010 static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) 2011 { 2012 struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); 2013 struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); 2014 2015 mutex_unlock(&dst_kvm->lock); 2016 mutex_unlock(&src_kvm->lock); 2017 atomic_set_release(&dst_sev->migration_in_progress, 0); 2018 atomic_set_release(&src_sev->migration_in_progress, 0); 2019 } 2020 2021 static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) 2022 { 2023 struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); 2024 struct kvm_sev_info *src = to_kvm_sev_info(src_kvm); 2025 struct kvm_vcpu *dst_vcpu, *src_vcpu; 2026 struct vcpu_svm *dst_svm, *src_svm; 
2027 struct kvm_sev_info *mirror; 2028 unsigned long i; 2029 2030 dst->active = true; 2031 dst->asid = src->asid; 2032 dst->handle = src->handle; 2033 dst->pages_locked = src->pages_locked; 2034 dst->enc_context_owner = src->enc_context_owner; 2035 dst->es_active = src->es_active; 2036 dst->vmsa_features = src->vmsa_features; 2037 2038 src->asid = 0; 2039 src->active = false; 2040 src->handle = 0; 2041 src->pages_locked = 0; 2042 src->enc_context_owner = NULL; 2043 src->es_active = false; 2044 2045 list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); 2046 2047 /* 2048 * If this VM has mirrors, "transfer" each mirror's refcount of the 2049 * source to the destination (this KVM). The caller holds a reference 2050 * to the source, so there's no danger of use-after-free. 2051 */ 2052 list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms); 2053 list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) { 2054 kvm_get_kvm(dst_kvm); 2055 kvm_put_kvm(src_kvm); 2056 mirror->enc_context_owner = dst_kvm; 2057 } 2058 2059 /* 2060 * If this VM is a mirror, remove the old mirror from the owners list 2061 * and add the new mirror to the list. 2062 */ 2063 if (is_mirroring_enc_context(dst_kvm)) { 2064 struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner); 2065 2066 list_del(&src->mirror_entry); 2067 list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); 2068 } 2069 2070 kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) { 2071 dst_svm = to_svm(dst_vcpu); 2072 2073 sev_init_vmcb(dst_svm, false); 2074 2075 if (!dst->es_active) 2076 continue; 2077 2078 /* 2079 * Note, the source is not required to have the same number of 2080 * vCPUs as the destination when migrating a vanilla SEV VM. 2081 */ 2082 src_vcpu = kvm_get_vcpu(src_kvm, i); 2083 src_svm = to_svm(src_vcpu); 2084 2085 /* 2086 * Transfer VMSA and GHCB state to the destination. 
Nullify and 2087 * clear source fields as appropriate, the state now belongs to 2088 * the destination. 2089 */ 2090 memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es)); 2091 dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa; 2092 dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa; 2093 dst_vcpu->arch.guest_state_protected = true; 2094 2095 memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es)); 2096 src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE; 2097 src_svm->vmcb->control.vmsa_pa = INVALID_PAGE; 2098 src_vcpu->arch.guest_state_protected = false; 2099 } 2100 } 2101 2102 static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) 2103 { 2104 struct kvm_vcpu *src_vcpu; 2105 unsigned long i; 2106 2107 if (kvm_is_vcpu_creation_in_progress(src) || 2108 kvm_is_vcpu_creation_in_progress(dst)) 2109 return -EBUSY; 2110 2111 if (!sev_es_guest(src)) 2112 return 0; 2113 2114 if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus)) 2115 return -EINVAL; 2116 2117 kvm_for_each_vcpu(i, src_vcpu, src) { 2118 if (!src_vcpu->arch.guest_state_protected) 2119 return -EINVAL; 2120 } 2121 2122 return 0; 2123 } 2124 2125 int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) 2126 { 2127 struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm); 2128 struct kvm_sev_info *src_sev, *cg_cleanup_sev; 2129 CLASS(fd, f)(source_fd); 2130 struct kvm *source_kvm; 2131 bool charged = false; 2132 int ret; 2133 2134 if (fd_empty(f)) 2135 return -EBADF; 2136 2137 if (!file_is_kvm(fd_file(f))) 2138 return -EBADF; 2139 2140 source_kvm = fd_file(f)->private_data; 2141 ret = sev_lock_two_vms(kvm, source_kvm); 2142 if (ret) 2143 return ret; 2144 2145 if (kvm->arch.vm_type != source_kvm->arch.vm_type || 2146 sev_guest(kvm) || !sev_guest(source_kvm)) { 2147 ret = -EINVAL; 2148 goto out_unlock; 2149 } 2150 2151 src_sev = to_kvm_sev_info(source_kvm); 2152 2153 dst_sev->misc_cg = get_current_misc_cg(); 2154 cg_cleanup_sev = dst_sev; 
2155 if (dst_sev->misc_cg != src_sev->misc_cg) { 2156 ret = sev_misc_cg_try_charge(dst_sev); 2157 if (ret) 2158 goto out_dst_cgroup; 2159 charged = true; 2160 } 2161 2162 ret = kvm_lock_all_vcpus(kvm); 2163 if (ret) 2164 goto out_dst_cgroup; 2165 ret = kvm_lock_all_vcpus(source_kvm); 2166 if (ret) 2167 goto out_dst_vcpu; 2168 2169 ret = sev_check_source_vcpus(kvm, source_kvm); 2170 if (ret) 2171 goto out_source_vcpu; 2172 2173 /* 2174 * Allocate a new have_run_cpus for the destination, i.e. don't copy 2175 * the set of CPUs from the source. If a CPU was used to run a vCPU in 2176 * the source VM but is never used for the destination VM, then the CPU 2177 * can only have cached memory that was accessible to the source VM. 2178 */ 2179 if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 2180 ret = -ENOMEM; 2181 goto out_source_vcpu; 2182 } 2183 2184 sev_migrate_from(kvm, source_kvm); 2185 kvm_vm_dead(source_kvm); 2186 cg_cleanup_sev = src_sev; 2187 ret = 0; 2188 2189 out_source_vcpu: 2190 kvm_unlock_all_vcpus(source_kvm); 2191 out_dst_vcpu: 2192 kvm_unlock_all_vcpus(kvm); 2193 out_dst_cgroup: 2194 /* Operates on the source on success, on the destination on failure. */ 2195 if (charged) 2196 sev_misc_cg_uncharge(cg_cleanup_sev); 2197 put_misc_cg(cg_cleanup_sev->misc_cg); 2198 cg_cleanup_sev->misc_cg = NULL; 2199 out_unlock: 2200 sev_unlock_two_vms(kvm, source_kvm); 2201 return ret; 2202 } 2203 2204 int sev_dev_get_attr(u32 group, u64 attr, u64 *val) 2205 { 2206 if (group != KVM_X86_GRP_SEV) 2207 return -ENXIO; 2208 2209 switch (attr) { 2210 case KVM_X86_SEV_VMSA_FEATURES: 2211 *val = sev_supported_vmsa_features; 2212 return 0; 2213 2214 case KVM_X86_SNP_POLICY_BITS: 2215 *val = snp_supported_policy_bits; 2216 return 0; 2217 2218 case KVM_X86_SEV_SNP_REQ_CERTS: 2219 *val = sev_snp_enabled ? 
1 : 0; 2220 return 0; 2221 default: 2222 return -ENXIO; 2223 } 2224 } 2225 2226 /* 2227 * The guest context contains all the information, keys and metadata 2228 * associated with the guest that the firmware tracks to implement SEV 2229 * and SNP features. The firmware stores the guest context in hypervisor 2230 * provide page via the SNP_GCTX_CREATE command. 2231 */ 2232 static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) 2233 { 2234 struct sev_data_snp_addr data = {}; 2235 void *context; 2236 int rc; 2237 2238 /* Allocate memory for context page */ 2239 context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); 2240 if (!context) 2241 return NULL; 2242 2243 data.address = __psp_pa(context); 2244 rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); 2245 if (rc) { 2246 pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d", 2247 rc, argp->error); 2248 snp_free_firmware_page(context); 2249 return NULL; 2250 } 2251 2252 return context; 2253 } 2254 2255 static int snp_bind_asid(struct kvm *kvm, int *error) 2256 { 2257 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2258 struct sev_data_snp_activate data = {0}; 2259 2260 data.gctx_paddr = __psp_pa(sev->snp_context); 2261 data.asid = sev_get_asid(kvm); 2262 return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); 2263 } 2264 2265 static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) 2266 { 2267 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2268 struct sev_data_snp_launch_start start = {0}; 2269 struct kvm_sev_snp_launch_start params; 2270 int rc; 2271 2272 if (!sev_snp_guest(kvm)) 2273 return -ENOTTY; 2274 2275 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2276 return -EFAULT; 2277 2278 /* Don't allow userspace to allocate memory for more than 1 SNP context. 
*/ 2279 if (sev->snp_context) 2280 return -EINVAL; 2281 2282 if (params.flags) 2283 return -EINVAL; 2284 2285 if (params.policy & ~snp_supported_policy_bits) 2286 return -EINVAL; 2287 2288 /* Check for policy bits that must be set */ 2289 if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO)) 2290 return -EINVAL; 2291 2292 if (snp_is_secure_tsc_enabled(kvm)) { 2293 if (WARN_ON_ONCE(!kvm->arch.default_tsc_khz)) 2294 return -EINVAL; 2295 2296 start.desired_tsc_khz = kvm->arch.default_tsc_khz; 2297 } 2298 2299 sev->snp_context = snp_context_create(kvm, argp); 2300 if (!sev->snp_context) 2301 return -ENOTTY; 2302 2303 start.gctx_paddr = __psp_pa(sev->snp_context); 2304 start.policy = params.policy; 2305 2306 memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); 2307 rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); 2308 if (rc) { 2309 pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n", 2310 __func__, rc); 2311 goto e_free_context; 2312 } 2313 2314 sev->policy = params.policy; 2315 sev->fd = argp->sev_fd; 2316 rc = snp_bind_asid(kvm, &argp->error); 2317 if (rc) { 2318 pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n", 2319 __func__, rc); 2320 goto e_free_context; 2321 } 2322 2323 return 0; 2324 2325 e_free_context: 2326 snp_decommission_context(kvm); 2327 2328 return rc; 2329 } 2330 2331 struct sev_gmem_populate_args { 2332 __u8 type; 2333 int sev_fd; 2334 int fw_error; 2335 }; 2336 2337 static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 2338 struct page *src_page, void *opaque) 2339 { 2340 struct sev_gmem_populate_args *sev_populate_args = opaque; 2341 struct sev_data_snp_launch_update fw_args = {0}; 2342 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2343 bool assigned = false; 2344 int level; 2345 int ret; 2346 2347 if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_page)) 2348 return -EINVAL; 2349 2350 ret = snp_lookup_rmpentry((u64)pfn, 
&assigned, &level); 2351 if (ret || assigned) { 2352 pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", 2353 __func__, gfn, ret, assigned); 2354 ret = ret ? -EINVAL : -EEXIST; 2355 goto out; 2356 } 2357 2358 if (src_page) { 2359 void *src_vaddr = kmap_local_page(src_page); 2360 void *dst_vaddr = kmap_local_pfn(pfn); 2361 2362 memcpy(dst_vaddr, src_vaddr, PAGE_SIZE); 2363 2364 kunmap_local(src_vaddr); 2365 kunmap_local(dst_vaddr); 2366 } 2367 2368 ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, PG_LEVEL_4K, 2369 sev_get_asid(kvm), true); 2370 if (ret) 2371 goto out; 2372 2373 fw_args.gctx_paddr = __psp_pa(sev->snp_context); 2374 fw_args.address = __sme_set(pfn_to_hpa(pfn)); 2375 fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); 2376 fw_args.page_type = sev_populate_args->type; 2377 2378 ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, 2379 &fw_args, &sev_populate_args->fw_error); 2380 /* 2381 * If the firmware command failed handle the reclaim and cleanup of that 2382 * PFN before reporting an error. 2383 * 2384 * Additionally, when invalid CPUID function entries are detected, 2385 * firmware writes the expected values into the page and leaves it 2386 * unencrypted so it can be used for debugging and error-reporting. 2387 * 2388 * Copy this page back into the source buffer so userspace can use this 2389 * information to provide information on which CPUID leaves/fields 2390 * failed CPUID validation. 
2391 */ 2392 if (ret && !snp_page_reclaim(kvm, pfn) && 2393 sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && 2394 sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { 2395 void *src_vaddr = kmap_local_page(src_page); 2396 void *dst_vaddr = kmap_local_pfn(pfn); 2397 2398 memcpy(src_vaddr, dst_vaddr, PAGE_SIZE); 2399 2400 kunmap_local(src_vaddr); 2401 kunmap_local(dst_vaddr); 2402 } 2403 2404 out: 2405 if (ret) 2406 pr_debug("%s: error updating GFN %llx, return code %d (fw_error %d)\n", 2407 __func__, gfn, ret, sev_populate_args->fw_error); 2408 return ret; 2409 } 2410 2411 static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) 2412 { 2413 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2414 struct sev_gmem_populate_args sev_populate_args = {0}; 2415 struct kvm_sev_snp_launch_update params; 2416 struct kvm_memory_slot *memslot; 2417 long npages, count; 2418 void __user *src; 2419 2420 if (!sev_snp_guest(kvm) || !sev->snp_context) 2421 return -EINVAL; 2422 2423 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2424 return -EFAULT; 2425 2426 pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, 2427 params.gfn_start, params.len, params.type, params.flags); 2428 2429 if (!params.len || !PAGE_ALIGNED(params.len) || params.flags || 2430 (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && 2431 params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && 2432 params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && 2433 params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && 2434 params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) 2435 return -EINVAL; 2436 2437 src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? 
NULL : u64_to_user_ptr(params.uaddr); 2438 2439 if (!PAGE_ALIGNED(src)) 2440 return -EINVAL; 2441 2442 npages = params.len / PAGE_SIZE; 2443 2444 /* 2445 * For each GFN that's being prepared as part of the initial guest 2446 * state, the following pre-conditions are verified: 2447 * 2448 * 1) The backing memslot is a valid private memslot. 2449 * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES 2450 * beforehand. 2451 * 3) The PFN of the guest_memfd has not already been set to private 2452 * in the RMP table. 2453 * 2454 * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page 2455 * faults if there's a race between a fault and an attribute update via 2456 * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized 2457 * here. However, kvm->slots_lock guards against both this as well as 2458 * concurrent memslot updates occurring while these checks are being 2459 * performed, so use that here to make it easier to reason about the 2460 * initial expected state and better guard against unexpected 2461 * situations. 
2462 */ 2463 guard(mutex)(&kvm->slots_lock); 2464 2465 memslot = gfn_to_memslot(kvm, params.gfn_start); 2466 if (!kvm_slot_has_gmem(memslot)) 2467 return -EINVAL; 2468 2469 sev_populate_args.sev_fd = argp->sev_fd; 2470 sev_populate_args.type = params.type; 2471 2472 count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, 2473 sev_gmem_post_populate, &sev_populate_args); 2474 if (count < 0) { 2475 argp->error = sev_populate_args.fw_error; 2476 pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", 2477 __func__, count, argp->error); 2478 return -EIO; 2479 } 2480 2481 params.gfn_start += count; 2482 params.len -= count * PAGE_SIZE; 2483 if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) 2484 params.uaddr += count * PAGE_SIZE; 2485 2486 if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) 2487 return -EFAULT; 2488 2489 return 0; 2490 } 2491 2492 static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) 2493 { 2494 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2495 struct sev_data_snp_launch_update data = {}; 2496 struct kvm_vcpu *vcpu; 2497 unsigned long i; 2498 int ret; 2499 2500 if (kvm_is_vcpu_creation_in_progress(kvm)) 2501 return -EBUSY; 2502 2503 ret = kvm_lock_all_vcpus(kvm); 2504 if (ret) 2505 return ret; 2506 2507 data.gctx_paddr = __psp_pa(sev->snp_context); 2508 data.page_type = SNP_PAGE_TYPE_VMSA; 2509 2510 kvm_for_each_vcpu(i, vcpu, kvm) { 2511 struct vcpu_svm *svm = to_svm(vcpu); 2512 u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; 2513 2514 ret = sev_es_sync_vmsa(svm); 2515 if (ret) 2516 goto out; 2517 2518 /* Transition the VMSA page to a firmware state. 
*/ 2519 ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); 2520 if (ret) 2521 goto out; 2522 2523 /* Issue the SNP command to encrypt the VMSA */ 2524 data.address = __sme_pa(svm->sev_es.vmsa); 2525 ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, 2526 &data, &argp->error); 2527 if (ret) { 2528 snp_page_reclaim(kvm, pfn); 2529 2530 goto out; 2531 } 2532 2533 svm->vcpu.arch.guest_state_protected = true; 2534 /* 2535 * SEV-ES (and thus SNP) guest mandates LBR Virtualization to 2536 * be _always_ ON. Enable it only after setting 2537 * guest_state_protected because KVM_SET_MSRS allows dynamic 2538 * toggling of LBRV (for performance reason) on write access to 2539 * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. 2540 */ 2541 svm_enable_lbrv(vcpu); 2542 } 2543 2544 out: 2545 kvm_unlock_all_vcpus(kvm); 2546 return ret; 2547 } 2548 2549 static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) 2550 { 2551 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2552 struct kvm_sev_snp_launch_finish params; 2553 struct sev_data_snp_launch_finish *data; 2554 void *id_block = NULL, *id_auth = NULL; 2555 int ret; 2556 2557 if (!sev_snp_guest(kvm)) 2558 return -ENOTTY; 2559 2560 if (!sev->snp_context) 2561 return -EINVAL; 2562 2563 if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) 2564 return -EFAULT; 2565 2566 if (params.flags) 2567 return -EINVAL; 2568 2569 /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. 
*/ 2570 ret = snp_launch_update_vmsa(kvm, argp); 2571 if (ret) 2572 return ret; 2573 2574 data = kzalloc_obj(*data, GFP_KERNEL_ACCOUNT); 2575 if (!data) 2576 return -ENOMEM; 2577 2578 if (params.id_block_en) { 2579 id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); 2580 if (IS_ERR(id_block)) { 2581 ret = PTR_ERR(id_block); 2582 goto e_free; 2583 } 2584 2585 data->id_block_en = 1; 2586 data->id_block_paddr = __sme_pa(id_block); 2587 2588 id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); 2589 if (IS_ERR(id_auth)) { 2590 ret = PTR_ERR(id_auth); 2591 goto e_free_id_block; 2592 } 2593 2594 data->id_auth_paddr = __sme_pa(id_auth); 2595 2596 if (params.auth_key_en) 2597 data->auth_key_en = 1; 2598 } 2599 2600 data->vcek_disabled = params.vcek_disabled; 2601 2602 memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE); 2603 data->gctx_paddr = __psp_pa(sev->snp_context); 2604 ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); 2605 2606 /* 2607 * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages 2608 * can be given to the guest simply by marking the RMP entry as private. 2609 * This can happen on first access and also with KVM_PRE_FAULT_MEMORY. 
2610 */ 2611 if (!ret) 2612 kvm->arch.pre_fault_allowed = true; 2613 2614 kfree(id_auth); 2615 2616 e_free_id_block: 2617 kfree(id_block); 2618 2619 e_free: 2620 kfree(data); 2621 2622 return ret; 2623 } 2624 2625 static int snp_enable_certs(struct kvm *kvm) 2626 { 2627 if (kvm->created_vcpus || !sev_snp_guest(kvm)) 2628 return -EINVAL; 2629 2630 to_kvm_sev_info(kvm)->snp_certs_enabled = true; 2631 2632 return 0; 2633 } 2634 2635 int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) 2636 { 2637 struct kvm_sev_cmd sev_cmd; 2638 int r; 2639 2640 if (!sev_enabled) 2641 return -ENOTTY; 2642 2643 if (!argp) 2644 return 0; 2645 2646 if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd))) 2647 return -EFAULT; 2648 2649 guard(mutex)(&kvm->lock); 2650 2651 /* Only the enc_context_owner handles some memory enc operations. */ 2652 if (is_mirroring_enc_context(kvm) && 2653 !is_cmd_allowed_from_mirror(sev_cmd.id)) 2654 return -EINVAL; 2655 2656 /* 2657 * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only 2658 * allow the use of SNP-specific commands. 
2659 */ 2660 if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) 2661 return -EPERM; 2662 2663 switch (sev_cmd.id) { 2664 case KVM_SEV_ES_INIT: 2665 if (!sev_es_enabled) 2666 return -ENOTTY; 2667 fallthrough; 2668 case KVM_SEV_INIT: 2669 r = sev_guest_init(kvm, &sev_cmd); 2670 break; 2671 case KVM_SEV_INIT2: 2672 r = sev_guest_init2(kvm, &sev_cmd); 2673 break; 2674 case KVM_SEV_LAUNCH_START: 2675 r = sev_launch_start(kvm, &sev_cmd); 2676 break; 2677 case KVM_SEV_LAUNCH_UPDATE_DATA: 2678 r = sev_launch_update_data(kvm, &sev_cmd); 2679 break; 2680 case KVM_SEV_LAUNCH_UPDATE_VMSA: 2681 r = sev_launch_update_vmsa(kvm, &sev_cmd); 2682 break; 2683 case KVM_SEV_LAUNCH_MEASURE: 2684 r = sev_launch_measure(kvm, &sev_cmd); 2685 break; 2686 case KVM_SEV_LAUNCH_FINISH: 2687 r = sev_launch_finish(kvm, &sev_cmd); 2688 break; 2689 case KVM_SEV_GUEST_STATUS: 2690 r = sev_guest_status(kvm, &sev_cmd); 2691 break; 2692 case KVM_SEV_DBG_DECRYPT: 2693 r = sev_dbg_crypt(kvm, &sev_cmd, true); 2694 break; 2695 case KVM_SEV_DBG_ENCRYPT: 2696 r = sev_dbg_crypt(kvm, &sev_cmd, false); 2697 break; 2698 case KVM_SEV_LAUNCH_SECRET: 2699 r = sev_launch_secret(kvm, &sev_cmd); 2700 break; 2701 case KVM_SEV_GET_ATTESTATION_REPORT: 2702 r = sev_get_attestation_report(kvm, &sev_cmd); 2703 break; 2704 case KVM_SEV_SEND_START: 2705 r = sev_send_start(kvm, &sev_cmd); 2706 break; 2707 case KVM_SEV_SEND_UPDATE_DATA: 2708 r = sev_send_update_data(kvm, &sev_cmd); 2709 break; 2710 case KVM_SEV_SEND_FINISH: 2711 r = sev_send_finish(kvm, &sev_cmd); 2712 break; 2713 case KVM_SEV_SEND_CANCEL: 2714 r = sev_send_cancel(kvm, &sev_cmd); 2715 break; 2716 case KVM_SEV_RECEIVE_START: 2717 r = sev_receive_start(kvm, &sev_cmd); 2718 break; 2719 case KVM_SEV_RECEIVE_UPDATE_DATA: 2720 r = sev_receive_update_data(kvm, &sev_cmd); 2721 break; 2722 case KVM_SEV_RECEIVE_FINISH: 2723 r = sev_receive_finish(kvm, &sev_cmd); 2724 break; 2725 case KVM_SEV_SNP_LAUNCH_START: 2726 r = snp_launch_start(kvm, &sev_cmd); 2727 
break; 2728 case KVM_SEV_SNP_LAUNCH_UPDATE: 2729 r = snp_launch_update(kvm, &sev_cmd); 2730 break; 2731 case KVM_SEV_SNP_LAUNCH_FINISH: 2732 r = snp_launch_finish(kvm, &sev_cmd); 2733 break; 2734 case KVM_SEV_SNP_ENABLE_REQ_CERTS: 2735 r = snp_enable_certs(kvm); 2736 break; 2737 default: 2738 return -EINVAL; 2739 } 2740 2741 if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd))) 2742 r = -EFAULT; 2743 2744 return r; 2745 } 2746 2747 int sev_mem_enc_register_region(struct kvm *kvm, 2748 struct kvm_enc_region *range) 2749 { 2750 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2751 struct enc_region *region; 2752 int ret = 0; 2753 2754 guard(mutex)(&kvm->lock); 2755 2756 if (!sev_guest(kvm)) 2757 return -ENOTTY; 2758 2759 /* If kvm is mirroring encryption context it isn't responsible for it */ 2760 if (is_mirroring_enc_context(kvm)) 2761 return -EINVAL; 2762 2763 region = kzalloc_obj(*region, GFP_KERNEL_ACCOUNT); 2764 if (!region) 2765 return -ENOMEM; 2766 2767 region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 2768 FOLL_WRITE | FOLL_LONGTERM); 2769 if (IS_ERR(region->pages)) { 2770 ret = PTR_ERR(region->pages); 2771 goto e_free; 2772 } 2773 2774 /* 2775 * The guest may change the memory encryption attribute from C=0 -> C=1 2776 * or vice versa for this memory range. Lets make sure caches are 2777 * flushed to ensure that guest data gets written into memory with 2778 * correct C-bit. Note, this must be done before dropping kvm->lock, 2779 * as region and its array of pages can be freed by a different task 2780 * once kvm->lock is released. 
2781 */ 2782 sev_clflush_pages(region->pages, region->npages); 2783 2784 region->uaddr = range->addr; 2785 region->size = range->size; 2786 2787 list_add_tail(®ion->list, &sev->regions_list); 2788 return ret; 2789 2790 e_free: 2791 kfree(region); 2792 return ret; 2793 } 2794 2795 static struct enc_region * 2796 find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) 2797 { 2798 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2799 struct list_head *head = &sev->regions_list; 2800 struct enc_region *i; 2801 2802 list_for_each_entry(i, head, list) { 2803 if (i->uaddr == range->addr && 2804 i->size == range->size) 2805 return i; 2806 } 2807 2808 return NULL; 2809 } 2810 2811 static void __unregister_enc_region_locked(struct kvm *kvm, 2812 struct enc_region *region) 2813 { 2814 sev_unpin_memory(kvm, region->pages, region->npages); 2815 list_del(®ion->list); 2816 kfree(region); 2817 } 2818 2819 int sev_mem_enc_unregister_region(struct kvm *kvm, 2820 struct kvm_enc_region *range) 2821 { 2822 struct enc_region *region; 2823 2824 /* If kvm is mirroring encryption context it isn't responsible for it */ 2825 if (is_mirroring_enc_context(kvm)) 2826 return -EINVAL; 2827 2828 guard(mutex)(&kvm->lock); 2829 2830 if (!sev_guest(kvm)) 2831 return -ENOTTY; 2832 2833 region = find_enc_region(kvm, range); 2834 if (!region) 2835 return -EINVAL; 2836 2837 sev_writeback_caches(kvm); 2838 2839 __unregister_enc_region_locked(kvm, region); 2840 2841 return 0; 2842 } 2843 2844 int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) 2845 { 2846 CLASS(fd, f)(source_fd); 2847 struct kvm *source_kvm; 2848 struct kvm_sev_info *source_sev, *mirror_sev; 2849 int ret; 2850 2851 if (fd_empty(f)) 2852 return -EBADF; 2853 2854 if (!file_is_kvm(fd_file(f))) 2855 return -EBADF; 2856 2857 source_kvm = fd_file(f)->private_data; 2858 ret = sev_lock_two_vms(kvm, source_kvm); 2859 if (ret) 2860 return ret; 2861 2862 /* 2863 * Mirrors of mirrors should work, but let's not get silly. 
Also 2864 * disallow out-of-band SEV/SEV-ES init if the target is already an 2865 * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being 2866 * created after SEV/SEV-ES initialization, e.g. to init intercepts. 2867 */ 2868 if (sev_guest(kvm) || !sev_guest(source_kvm) || 2869 is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) { 2870 ret = -EINVAL; 2871 goto e_unlock; 2872 } 2873 2874 mirror_sev = to_kvm_sev_info(kvm); 2875 if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) { 2876 ret = -ENOMEM; 2877 goto e_unlock; 2878 } 2879 2880 /* 2881 * The mirror kvm holds an enc_context_owner ref so its asid can't 2882 * disappear until we're done with it 2883 */ 2884 source_sev = to_kvm_sev_info(source_kvm); 2885 kvm_get_kvm(source_kvm); 2886 list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); 2887 2888 /* Set enc_context_owner and copy its encryption context over */ 2889 mirror_sev->enc_context_owner = source_kvm; 2890 mirror_sev->active = true; 2891 mirror_sev->asid = source_sev->asid; 2892 mirror_sev->fd = source_sev->fd; 2893 mirror_sev->es_active = source_sev->es_active; 2894 mirror_sev->need_init = false; 2895 mirror_sev->handle = source_sev->handle; 2896 INIT_LIST_HEAD(&mirror_sev->regions_list); 2897 INIT_LIST_HEAD(&mirror_sev->mirror_vms); 2898 ret = 0; 2899 2900 /* 2901 * Do not copy ap_jump_table. Since the mirror does not share the same 2902 * KVM contexts as the original, and they may have different 2903 * memory-views. 
2904 */ 2905 2906 e_unlock: 2907 sev_unlock_two_vms(kvm, source_kvm); 2908 return ret; 2909 } 2910 2911 static int snp_decommission_context(struct kvm *kvm) 2912 { 2913 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2914 struct sev_data_snp_addr data = {}; 2915 int ret; 2916 2917 /* If context is not created then do nothing */ 2918 if (!sev->snp_context) 2919 return 0; 2920 2921 /* Do the decommision, which will unbind the ASID from the SNP context */ 2922 data.address = __sme_pa(sev->snp_context); 2923 down_write(&sev_deactivate_lock); 2924 ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); 2925 up_write(&sev_deactivate_lock); 2926 2927 if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) 2928 return ret; 2929 2930 snp_free_firmware_page(sev->snp_context); 2931 sev->snp_context = NULL; 2932 2933 return 0; 2934 } 2935 2936 void sev_vm_init(struct kvm *kvm) 2937 { 2938 switch (kvm->arch.vm_type) { 2939 case KVM_X86_DEFAULT_VM: 2940 case KVM_X86_SW_PROTECTED_VM: 2941 break; 2942 case KVM_X86_SNP_VM: 2943 kvm->arch.has_private_mem = true; 2944 fallthrough; 2945 case KVM_X86_SEV_ES_VM: 2946 kvm->arch.has_protected_state = true; 2947 fallthrough; 2948 case KVM_X86_SEV_VM: 2949 kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; 2950 to_kvm_sev_info(kvm)->need_init = true; 2951 break; 2952 default: 2953 WARN_ONCE(1, "Unsupported VM type %u", kvm->arch.vm_type); 2954 break; 2955 } 2956 } 2957 2958 void sev_vm_destroy(struct kvm *kvm) 2959 { 2960 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 2961 struct list_head *head = &sev->regions_list; 2962 struct list_head *pos, *q; 2963 2964 if (!sev_guest(kvm)) 2965 return; 2966 2967 WARN_ON(!list_empty(&sev->mirror_vms)); 2968 2969 free_cpumask_var(sev->have_run_cpus); 2970 2971 /* 2972 * If this is a mirror VM, remove it from the owner's list of a mirrors 2973 * and skip ASID cleanup (the ASID is tied to the lifetime of the owner). 
2974 * Note, mirror VMs don't support registering encrypted regions. 2975 */ 2976 if (is_mirroring_enc_context(kvm)) { 2977 struct kvm *owner_kvm = sev->enc_context_owner; 2978 2979 mutex_lock(&owner_kvm->lock); 2980 list_del(&sev->mirror_entry); 2981 mutex_unlock(&owner_kvm->lock); 2982 kvm_put_kvm(owner_kvm); 2983 return; 2984 } 2985 2986 2987 /* 2988 * if userspace was terminated before unregistering the memory regions 2989 * then lets unpin all the registered memory. 2990 */ 2991 if (!list_empty(head)) { 2992 list_for_each_safe(pos, q, head) { 2993 __unregister_enc_region_locked(kvm, 2994 list_entry(pos, struct enc_region, list)); 2995 cond_resched(); 2996 } 2997 } 2998 2999 if (sev_snp_guest(kvm)) { 3000 snp_guest_req_cleanup(kvm); 3001 3002 /* 3003 * Decomission handles unbinding of the ASID. If it fails for 3004 * some unexpected reason, just leak the ASID. 3005 */ 3006 if (snp_decommission_context(kvm)) 3007 return; 3008 } else { 3009 sev_unbind_asid(kvm, sev->handle); 3010 } 3011 3012 sev_asid_free(sev); 3013 } 3014 3015 void __init sev_set_cpu_caps(void) 3016 { 3017 if (sev_enabled) { 3018 kvm_cpu_cap_set(X86_FEATURE_SEV); 3019 kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM); 3020 } 3021 if (sev_es_enabled) { 3022 kvm_cpu_cap_set(X86_FEATURE_SEV_ES); 3023 kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); 3024 } 3025 if (sev_snp_enabled) { 3026 kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); 3027 kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM); 3028 } 3029 } 3030 3031 static bool is_sev_snp_initialized(void) 3032 { 3033 struct sev_user_data_snp_status *status; 3034 struct sev_data_snp_addr buf; 3035 bool initialized = false; 3036 int ret, error = 0; 3037 3038 status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); 3039 if (!status) 3040 return false; 3041 3042 buf.address = __psp_pa(status); 3043 ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); 3044 if (ret) { 3045 pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", 3046 
ret, error, error); 3047 goto out; 3048 } 3049 3050 initialized = !!status->state; 3051 3052 out: 3053 snp_free_firmware_page(status); 3054 3055 return initialized; 3056 } 3057 3058 void __init sev_hardware_setup(void) 3059 { 3060 unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; 3061 struct sev_platform_init_args init_args = {0}; 3062 bool sev_snp_supported = false; 3063 bool sev_es_supported = false; 3064 bool sev_supported = false; 3065 3066 if (!sev_enabled || !npt_enabled || !nrips) 3067 goto out; 3068 3069 /* 3070 * SEV must obviously be supported in hardware. Sanity check that the 3071 * CPU supports decode assists, which is mandatory for SEV guests to 3072 * support instruction emulation. Ditto for flushing by ASID, as SEV 3073 * guests are bound to a single ASID, i.e. KVM can't rotate to a new 3074 * ASID to effect a TLB flush. 3075 */ 3076 if (!boot_cpu_has(X86_FEATURE_SEV) || 3077 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) || 3078 WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) 3079 goto out; 3080 3081 /* 3082 * The kernel's initcall infrastructure lacks the ability to express 3083 * dependencies between initcalls, whereas the modules infrastructure 3084 * automatically handles dependencies via symbol loading. Ensure the 3085 * PSP SEV driver is initialized before proceeding if KVM is built-in, 3086 * as the dependency isn't handled by the initcall infrastructure. 
3087 */ 3088 if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) 3089 goto out; 3090 3091 /* Retrieve SEV CPUID information */ 3092 cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); 3093 3094 /* Set encryption bit location for SEV-ES guests */ 3095 sev_enc_bit = ebx & 0x3f; 3096 3097 /* Maximum number of encrypted guests supported simultaneously */ 3098 max_sev_asid = ecx; 3099 if (!max_sev_asid) 3100 goto out; 3101 3102 /* Minimum ASID value that should be used for SEV guest */ 3103 min_sev_asid = edx; 3104 sev_me_mask = 1UL << (ebx & 0x3f); 3105 3106 /* 3107 * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap, 3108 * even though it's never used, so that the bitmap is indexed by the 3109 * actual ASID. 3110 */ 3111 nr_asids = max_sev_asid + 1; 3112 sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); 3113 if (!sev_asid_bitmap) 3114 goto out; 3115 3116 sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); 3117 if (!sev_reclaim_asid_bitmap) { 3118 bitmap_free(sev_asid_bitmap); 3119 sev_asid_bitmap = NULL; 3120 goto out; 3121 } 3122 3123 if (min_sev_asid <= max_sev_asid) { 3124 sev_asid_count = max_sev_asid - min_sev_asid + 1; 3125 WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); 3126 } 3127 sev_supported = true; 3128 3129 /* SEV-ES support requested? */ 3130 if (!sev_es_enabled) 3131 goto out; 3132 3133 /* 3134 * SEV-ES requires MMIO caching as KVM doesn't have access to the guest 3135 * instruction stream, i.e. can't emulate in response to a #NPF and 3136 * instead relies on #NPF(RSVD) being reflected into the guest as #VC 3137 * (the guest can then do a #VMGEXIT to request MMIO emulation). 3138 */ 3139 if (!enable_mmio_caching) 3140 goto out; 3141 3142 /* Does the CPU support SEV-ES? 
*/ 3143 if (!boot_cpu_has(X86_FEATURE_SEV_ES)) 3144 goto out; 3145 3146 if (!lbrv) { 3147 WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), 3148 "LBRV must be present for SEV-ES support"); 3149 goto out; 3150 } 3151 3152 /* Has the system been allocated ASIDs for SEV-ES? */ 3153 if (min_sev_asid == 1) 3154 goto out; 3155 3156 min_sev_es_asid = min_snp_asid = 1; 3157 max_sev_es_asid = max_snp_asid = min_sev_asid - 1; 3158 3159 sev_es_asid_count = min_sev_asid - 1; 3160 WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); 3161 sev_es_supported = true; 3162 sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); 3163 3164 out: 3165 if (sev_enabled) { 3166 init_args.probe = true; 3167 3168 if (sev_is_snp_ciphertext_hiding_supported()) 3169 init_args.max_snp_asid = min(nr_ciphertext_hiding_asids, 3170 min_sev_asid - 1); 3171 3172 if (sev_platform_init(&init_args)) 3173 sev_supported = sev_es_supported = sev_snp_supported = false; 3174 else if (sev_snp_supported) 3175 sev_snp_supported = is_sev_snp_initialized(); 3176 3177 if (sev_snp_supported) { 3178 snp_supported_policy_bits = sev_get_snp_policy_bits() & 3179 KVM_SNP_POLICY_MASK_VALID; 3180 nr_ciphertext_hiding_asids = init_args.max_snp_asid; 3181 } 3182 3183 /* 3184 * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP 3185 * ASID range is partitioned into separate SEV-ES and SEV-SNP 3186 * ASID ranges, with the SEV-SNP range being [1..max_snp_asid] 3187 * and the SEV-ES range being (max_snp_asid..max_sev_es_asid]. 3188 * Note, SEV-ES may effectively be disabled if all ASIDs from 3189 * the joint range are assigned to SEV-SNP. 3190 */ 3191 if (nr_ciphertext_hiding_asids) { 3192 max_snp_asid = nr_ciphertext_hiding_asids; 3193 min_sev_es_asid = max_snp_asid + 1; 3194 pr_info("SEV-SNP ciphertext hiding enabled\n"); 3195 } 3196 } 3197 3198 if (boot_cpu_has(X86_FEATURE_SEV)) 3199 pr_info("SEV %s (ASIDs %u - %u)\n", 3200 sev_supported ? min_sev_asid <= max_sev_asid ? 
"enabled" : 3201 "unusable" : 3202 "disabled", 3203 min_sev_asid, max_sev_asid); 3204 if (boot_cpu_has(X86_FEATURE_SEV_ES)) 3205 pr_info("SEV-ES %s (ASIDs %u - %u)\n", 3206 sev_es_supported ? min_sev_es_asid <= max_sev_es_asid ? "enabled" : 3207 "unusable" : 3208 "disabled", 3209 min_sev_es_asid, max_sev_es_asid); 3210 if (boot_cpu_has(X86_FEATURE_SEV_SNP)) 3211 pr_info("SEV-SNP %s (ASIDs %u - %u)\n", 3212 str_enabled_disabled(sev_snp_supported), 3213 min_snp_asid, max_snp_asid); 3214 3215 sev_enabled = sev_supported; 3216 sev_es_enabled = sev_es_supported; 3217 sev_snp_enabled = sev_snp_supported; 3218 3219 sev_supported_vmsa_features = 0; 3220 3221 if (sev_es_enabled && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) && 3222 cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) 3223 sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; 3224 3225 if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC)) 3226 sev_supported_vmsa_features |= SVM_SEV_FEAT_SECURE_TSC; 3227 } 3228 3229 void sev_hardware_unsetup(void) 3230 { 3231 if (!sev_enabled) 3232 return; 3233 3234 /* No need to take sev_bitmap_lock, all VMs have been destroyed. */ 3235 sev_flush_asids(1, max_sev_asid); 3236 3237 bitmap_free(sev_asid_bitmap); 3238 bitmap_free(sev_reclaim_asid_bitmap); 3239 3240 misc_cg_set_capacity(MISC_CG_RES_SEV, 0); 3241 misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); 3242 3243 sev_platform_shutdown(); 3244 } 3245 3246 int sev_cpu_init(struct svm_cpu_data *sd) 3247 { 3248 if (!sev_enabled) 3249 return 0; 3250 3251 sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL); 3252 if (!sd->sev_vmcbs) 3253 return -ENOMEM; 3254 3255 return 0; 3256 } 3257 3258 /* 3259 * Pages used by hardware to hold guest encrypted state must be flushed before 3260 * returning them to the system. 3261 */ 3262 static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) 3263 { 3264 unsigned int asid = sev_get_asid(vcpu->kvm); 3265 3266 /* 3267 * Note! 
The address must be a kernel address, as regular page walk 3268 * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user 3269 * address is non-deterministic and unsafe. This function deliberately 3270 * takes a pointer to deter passing in a user address. 3271 */ 3272 unsigned long addr = (unsigned long)va; 3273 3274 /* 3275 * If CPU enforced cache coherency for encrypted mappings of the 3276 * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache 3277 * flush is still needed in order to work properly with DMA devices. 3278 */ 3279 if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) { 3280 clflush_cache_range(va, PAGE_SIZE); 3281 return; 3282 } 3283 3284 /* 3285 * VM Page Flush takes a host virtual address and a guest ASID. Fall 3286 * back to full writeback of caches if this faults so as not to make 3287 * any problems worse by leaving stale encrypted data in the cache. 3288 */ 3289 if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) 3290 goto do_sev_writeback_caches; 3291 3292 return; 3293 3294 do_sev_writeback_caches: 3295 sev_writeback_caches(vcpu->kvm); 3296 } 3297 3298 void sev_guest_memory_reclaimed(struct kvm *kvm) 3299 { 3300 /* 3301 * With SNP+gmem, private/encrypted memory is unreachable via the 3302 * hva-based mmu notifiers, i.e. these events are explicitly scoped to 3303 * shared pages, where there's no need to flush caches. 3304 * 3305 * Checking for SEV+ outside of kvm->lock is safe as __sev_guest_init() 3306 * can only be done before vCPUs are created, caches can be incoherent 3307 * if and only if a vCPU was run, and either this task will see the VM 3308 * as being SEV+ or the vCPU won't be to access the memory (because of 3309 * the in-progress invalidation). 
3310 */ 3311 if (!____sev_guest(kvm) || ____sev_snp_guest(kvm)) 3312 return; 3313 3314 sev_writeback_caches(kvm); 3315 } 3316 3317 static void dump_ghcb(struct vcpu_svm *svm) 3318 { 3319 struct vmcb_control_area *control = &svm->vmcb->control; 3320 unsigned int nbits; 3321 3322 /* Re-use the dump_invalid_vmcb module parameter */ 3323 if (!dump_invalid_vmcb) { 3324 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 3325 return; 3326 } 3327 3328 nbits = sizeof(svm->sev_es.valid_bitmap) * 8; 3329 3330 /* 3331 * Print KVM's snapshot of the GHCB values that were (unsuccessfully) 3332 * used to handle the exit. If the guest has since modified the GHCB 3333 * itself, dumping the raw GHCB won't help debug why KVM was unable to 3334 * handle the VMGEXIT that KVM observed. 3335 */ 3336 pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); 3337 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", 3338 control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm)); 3339 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", 3340 control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); 3341 pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", 3342 control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); 3343 pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", 3344 svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); 3345 pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); 3346 } 3347 3348 static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) 3349 { 3350 struct kvm_vcpu *vcpu = &svm->vcpu; 3351 struct ghcb *ghcb = svm->sev_es.ghcb; 3352 3353 /* 3354 * The GHCB protocol so far allows for the following data 3355 * to be returned: 3356 * GPRs RAX, RBX, RCX, RDX 3357 * 3358 * Copy their values, even if they may not have been written during the 3359 * VM-Exit. It's the guest's responsibility to not consume random data. 
3360 */ 3361 ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]); 3362 ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]); 3363 ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]); 3364 ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]); 3365 } 3366 3367 static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) 3368 { 3369 struct vmcb_control_area *control = &svm->vmcb->control; 3370 struct kvm_vcpu *vcpu = &svm->vcpu; 3371 struct ghcb *ghcb = svm->sev_es.ghcb; 3372 3373 /* 3374 * The GHCB protocol so far allows for the following data 3375 * to be supplied: 3376 * GPRs RAX, RBX, RCX, RDX 3377 * XCR0 3378 * CPL 3379 * 3380 * VMMCALL allows the guest to provide extra registers. KVM also 3381 * expects RSI for hypercalls, so include that, too. 3382 * 3383 * Copy their values to the appropriate location if supplied. 3384 */ 3385 memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 3386 3387 BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap)); 3388 memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap)); 3389 3390 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm); 3391 vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm); 3392 vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm); 3393 vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm); 3394 vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm); 3395 3396 svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm); 3397 3398 if (kvm_ghcb_xcr0_is_valid(svm)) 3399 __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm)); 3400 3401 if (kvm_ghcb_xss_is_valid(svm)) 3402 __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); 3403 3404 /* Copy the GHCB exit information into the VMCB fields */ 3405 control->exit_code = kvm_ghcb_get_sw_exit_code(svm); 3406 control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); 3407 control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); 3408 svm->sev_es.sw_scratch = 
kvm_ghcb_get_sw_scratch_if_valid(svm); 3409 3410 /* Clear the valid entries fields */ 3411 memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); 3412 } 3413 3414 static int sev_es_validate_vmgexit(struct vcpu_svm *svm) 3415 { 3416 struct vmcb_control_area *control = &svm->vmcb->control; 3417 struct kvm_vcpu *vcpu = &svm->vcpu; 3418 u64 reason; 3419 3420 /* Only GHCB Usage code 0 is supported */ 3421 if (svm->sev_es.ghcb->ghcb_usage) { 3422 reason = GHCB_ERR_INVALID_USAGE; 3423 goto vmgexit_err; 3424 } 3425 3426 reason = GHCB_ERR_MISSING_INPUT; 3427 3428 if (!kvm_ghcb_sw_exit_code_is_valid(svm) || 3429 !kvm_ghcb_sw_exit_info_1_is_valid(svm) || 3430 !kvm_ghcb_sw_exit_info_2_is_valid(svm)) 3431 goto vmgexit_err; 3432 3433 switch (control->exit_code) { 3434 case SVM_EXIT_READ_DR7: 3435 break; 3436 case SVM_EXIT_WRITE_DR7: 3437 if (!kvm_ghcb_rax_is_valid(svm)) 3438 goto vmgexit_err; 3439 break; 3440 case SVM_EXIT_RDTSC: 3441 break; 3442 case SVM_EXIT_RDPMC: 3443 if (!kvm_ghcb_rcx_is_valid(svm)) 3444 goto vmgexit_err; 3445 break; 3446 case SVM_EXIT_CPUID: 3447 if (!kvm_ghcb_rax_is_valid(svm) || 3448 !kvm_ghcb_rcx_is_valid(svm)) 3449 goto vmgexit_err; 3450 if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd) 3451 if (!kvm_ghcb_xcr0_is_valid(svm)) 3452 goto vmgexit_err; 3453 break; 3454 case SVM_EXIT_INVD: 3455 break; 3456 case SVM_EXIT_IOIO: 3457 if (control->exit_info_1 & SVM_IOIO_STR_MASK) { 3458 if (!kvm_ghcb_sw_scratch_is_valid(svm)) 3459 goto vmgexit_err; 3460 } else { 3461 if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK)) 3462 if (!kvm_ghcb_rax_is_valid(svm)) 3463 goto vmgexit_err; 3464 } 3465 break; 3466 case SVM_EXIT_MSR: 3467 if (!kvm_ghcb_rcx_is_valid(svm)) 3468 goto vmgexit_err; 3469 if (control->exit_info_1) { 3470 if (!kvm_ghcb_rax_is_valid(svm) || 3471 !kvm_ghcb_rdx_is_valid(svm)) 3472 goto vmgexit_err; 3473 } 3474 break; 3475 case SVM_EXIT_VMMCALL: 3476 if (!kvm_ghcb_rax_is_valid(svm) || 3477 !kvm_ghcb_cpl_is_valid(svm)) 3478 goto vmgexit_err; 3479 
break; 3480 case SVM_EXIT_RDTSCP: 3481 break; 3482 case SVM_EXIT_WBINVD: 3483 break; 3484 case SVM_EXIT_MONITOR: 3485 if (!kvm_ghcb_rax_is_valid(svm) || 3486 !kvm_ghcb_rcx_is_valid(svm) || 3487 !kvm_ghcb_rdx_is_valid(svm)) 3488 goto vmgexit_err; 3489 break; 3490 case SVM_EXIT_MWAIT: 3491 if (!kvm_ghcb_rax_is_valid(svm) || 3492 !kvm_ghcb_rcx_is_valid(svm)) 3493 goto vmgexit_err; 3494 break; 3495 case SVM_VMGEXIT_MMIO_READ: 3496 case SVM_VMGEXIT_MMIO_WRITE: 3497 if (!kvm_ghcb_sw_scratch_is_valid(svm)) 3498 goto vmgexit_err; 3499 break; 3500 case SVM_VMGEXIT_AP_CREATION: 3501 if (!is_sev_snp_guest(vcpu)) 3502 goto vmgexit_err; 3503 if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) 3504 if (!kvm_ghcb_rax_is_valid(svm)) 3505 goto vmgexit_err; 3506 break; 3507 case SVM_VMGEXIT_NMI_COMPLETE: 3508 case SVM_VMGEXIT_AP_HLT_LOOP: 3509 case SVM_VMGEXIT_AP_JUMP_TABLE: 3510 case SVM_VMGEXIT_UNSUPPORTED_EVENT: 3511 case SVM_VMGEXIT_HV_FEATURES: 3512 case SVM_VMGEXIT_TERM_REQUEST: 3513 break; 3514 case SVM_VMGEXIT_PSC: 3515 if (!is_sev_snp_guest(vcpu) || !kvm_ghcb_sw_scratch_is_valid(svm)) 3516 goto vmgexit_err; 3517 break; 3518 case SVM_VMGEXIT_GUEST_REQUEST: 3519 case SVM_VMGEXIT_EXT_GUEST_REQUEST: 3520 if (!is_sev_snp_guest(vcpu) || 3521 !PAGE_ALIGNED(control->exit_info_1) || 3522 !PAGE_ALIGNED(control->exit_info_2) || 3523 control->exit_info_1 == control->exit_info_2) 3524 goto vmgexit_err; 3525 break; 3526 default: 3527 reason = GHCB_ERR_INVALID_EVENT; 3528 goto vmgexit_err; 3529 } 3530 3531 return 0; 3532 3533 vmgexit_err: 3534 /* 3535 * Print the exit code even though it may not be marked valid as it 3536 * could help with debugging. 
3537 */ 3538 if (reason == GHCB_ERR_INVALID_USAGE) { 3539 vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", 3540 svm->sev_es.ghcb->ghcb_usage); 3541 } else if (reason == GHCB_ERR_INVALID_EVENT) { 3542 vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", 3543 control->exit_code); 3544 } else { 3545 vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", 3546 control->exit_code); 3547 dump_ghcb(svm); 3548 } 3549 3550 svm_vmgexit_bad_input(svm, reason); 3551 3552 /* Resume the guest to "return" the error code. */ 3553 return 1; 3554 } 3555 3556 static void __sev_es_unmap_ghcb(struct vcpu_svm *svm) 3557 { 3558 if (svm->sev_es.ghcb_sa_free) { 3559 kvfree(svm->sev_es.ghcb_sa); 3560 svm->sev_es.ghcb_sa = NULL; 3561 svm->sev_es.ghcb_sa_free = false; 3562 } 3563 3564 if (svm->sev_es.ghcb) { 3565 kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); 3566 svm->sev_es.ghcb = NULL; 3567 } 3568 } 3569 3570 void sev_es_unmap_ghcb(struct vcpu_svm *svm) 3571 { 3572 /* Clear any indication that the vCPU is in a type of AP Reset Hold */ 3573 svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; 3574 3575 if (!svm->sev_es.ghcb) 3576 return; 3577 3578 /* 3579 * If the scratch area lives outside the GHCB, there's a buffer that, 3580 * depending on the operation performed, may need to be synced. 3581 */ 3582 if (svm->sev_es.ghcb_sa_sync) { 3583 kvm_write_guest(svm->vcpu.kvm, svm->sev_es.sw_scratch, 3584 svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); 3585 svm->sev_es.ghcb_sa_sync = false; 3586 } 3587 3588 trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); 3589 3590 sev_es_sync_to_ghcb(svm); 3591 3592 __sev_es_unmap_ghcb(svm); 3593 } 3594 3595 void sev_free_vcpu(struct kvm_vcpu *vcpu) 3596 { 3597 struct vcpu_svm *svm; 3598 3599 if (!is_sev_es_guest(vcpu)) 3600 return; 3601 3602 svm = to_svm(vcpu); 3603 3604 /* 3605 * If it's an SNP guest, then the VMSA was marked in the RMP table as 3606 * a guest-owned page. 
Transition the page to hypervisor state before 3607 * releasing it back to the system. 3608 */ 3609 if (is_sev_snp_guest(vcpu)) { 3610 u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; 3611 3612 if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) 3613 goto skip_vmsa_free; 3614 } 3615 3616 if (vcpu->arch.guest_state_protected) 3617 sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); 3618 3619 __free_page(virt_to_page(svm->sev_es.vmsa)); 3620 3621 skip_vmsa_free: 3622 __sev_es_unmap_ghcb(svm); 3623 } 3624 3625 int pre_sev_run(struct vcpu_svm *svm, int cpu) 3626 { 3627 struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); 3628 struct kvm_vcpu *vcpu = &svm->vcpu; 3629 struct kvm *kvm = vcpu->kvm; 3630 unsigned int asid = sev_get_asid(kvm); 3631 3632 /* 3633 * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid 3634 * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP 3635 * AP Destroy event. 3636 */ 3637 if (is_sev_es_guest(vcpu) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) 3638 return -EINVAL; 3639 3640 /* 3641 * To optimize cache flushes when memory is reclaimed from an SEV VM, 3642 * track physical CPUs that enter the guest for SEV VMs and thus can 3643 * have encrypted, dirty data in the cache, and flush caches only for 3644 * CPUs that have entered the guest. 3645 */ 3646 if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus)) 3647 cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus); 3648 3649 /* Assign the asid allocated with this SEV guest */ 3650 svm->asid = asid; 3651 3652 /* 3653 * Flush guest TLB: 3654 * 3655 * 1) when different VMCB for the same ASID is to be run on the same host CPU. 3656 * 2) or this VMCB was executed on different host CPU in previous VMRUNs. 
3657 */ 3658 if (sd->sev_vmcbs[asid] == svm->vmcb && 3659 svm->vcpu.arch.last_vmentry_cpu == cpu) 3660 return 0; 3661 3662 sd->sev_vmcbs[asid] = svm->vmcb; 3663 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 3664 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 3665 return 0; 3666 } 3667 3668 #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) 3669 static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 min_len) 3670 { 3671 struct vmcb_control_area *control = &svm->vmcb->control; 3672 u64 ghcb_scratch_beg, ghcb_scratch_end; 3673 u64 scratch_gpa_beg, scratch_gpa_end; 3674 void *scratch_va; 3675 3676 if (WARN_ON_ONCE(!min_len)) 3677 goto e_scratch; 3678 3679 scratch_gpa_beg = svm->sev_es.sw_scratch; 3680 if (!scratch_gpa_beg) { 3681 pr_err("vmgexit: scratch gpa not provided\n"); 3682 goto e_scratch; 3683 } 3684 3685 scratch_gpa_end = scratch_gpa_beg + min_len; 3686 if (scratch_gpa_end < scratch_gpa_beg) { 3687 pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", 3688 min_len, scratch_gpa_beg); 3689 goto e_scratch; 3690 } 3691 3692 WARN_ON_ONCE(svm->sev_es.ghcb_sa_sync || svm->sev_es.ghcb_sa_free); 3693 3694 if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { 3695 /* Scratch area begins within GHCB */ 3696 ghcb_scratch_beg = control->ghcb_gpa + 3697 offsetof(struct ghcb, shared_buffer); 3698 ghcb_scratch_end = control->ghcb_gpa + 3699 offsetof(struct ghcb, reserved_0xff0); 3700 3701 /* 3702 * If the scratch area begins within the GHCB, it must be 3703 * completely contained in the GHCB shared buffer area. 
3704 */ 3705 if (scratch_gpa_beg < ghcb_scratch_beg || 3706 scratch_gpa_end > ghcb_scratch_end) { 3707 pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n", 3708 scratch_gpa_beg, scratch_gpa_end); 3709 goto e_scratch; 3710 } 3711 3712 scratch_va = (void *)svm->sev_es.ghcb; 3713 scratch_va += (scratch_gpa_beg - control->ghcb_gpa); 3714 3715 svm->sev_es.ghcb_sa_sync = false; 3716 svm->sev_es.ghcb_sa_free = false; 3717 svm->sev_es.ghcb_sa_len = ghcb_scratch_end - scratch_gpa_beg; 3718 } else { 3719 /* GHCB v2 requires the scratch area to be within the GHCB. */ 3720 if (to_kvm_sev_info(svm->vcpu.kvm)->ghcb_version >= 2) 3721 goto e_scratch; 3722 3723 /* 3724 * The guest memory must be read into a kernel buffer, so 3725 * limit the size 3726 */ 3727 if (min_len > GHCB_SCRATCH_AREA_LIMIT) { 3728 pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", 3729 min_len, GHCB_SCRATCH_AREA_LIMIT); 3730 goto e_scratch; 3731 } 3732 scratch_va = kvzalloc(min_len, GFP_KERNEL_ACCOUNT); 3733 if (!scratch_va) 3734 return -ENOMEM; 3735 3736 if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, min_len)) { 3737 /* Unable to copy scratch area from guest */ 3738 pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); 3739 3740 kvfree(scratch_va); 3741 return -EFAULT; 3742 } 3743 3744 /* 3745 * The scratch area is outside the GHCB. The operation will 3746 * dictate whether the buffer needs to be synced before running 3747 * the vCPU next time (i.e. a read was requested so the data 3748 * must be written back to the guest memory). 
3749 */ 3750 svm->sev_es.ghcb_sa_sync = sync; 3751 svm->sev_es.ghcb_sa_free = true; 3752 svm->sev_es.ghcb_sa_len = min_len; 3753 } 3754 3755 svm->sev_es.ghcb_sa = scratch_va; 3756 return 0; 3757 3758 e_scratch: 3759 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA); 3760 3761 return 1; 3762 } 3763 3764 static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask, 3765 unsigned int pos) 3766 { 3767 svm->vmcb->control.ghcb_gpa &= ~(mask << pos); 3768 svm->vmcb->control.ghcb_gpa |= (value & mask) << pos; 3769 } 3770 3771 static u64 get_ghcb_msr_bits(struct vcpu_svm *svm, u64 mask, unsigned int pos) 3772 { 3773 return (svm->vmcb->control.ghcb_gpa >> pos) & mask; 3774 } 3775 3776 static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) 3777 { 3778 svm->vmcb->control.ghcb_gpa = value; 3779 } 3780 3781 static int snp_rmptable_psmash(kvm_pfn_t pfn) 3782 { 3783 int ret; 3784 3785 pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); 3786 3787 /* 3788 * PSMASH_FAIL_INUSE indicates another processor is modifying the 3789 * entry, so retry until that's no longer the case. 
3790 */ 3791 do { 3792 ret = psmash(pfn); 3793 } while (ret == PSMASH_FAIL_INUSE); 3794 3795 return ret; 3796 } 3797 3798 static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) 3799 { 3800 struct vcpu_svm *svm = to_svm(vcpu); 3801 3802 if (vcpu->run->hypercall.ret) 3803 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3804 else 3805 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); 3806 3807 return 1; /* resume guest */ 3808 } 3809 3810 static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) 3811 { 3812 u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); 3813 u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); 3814 struct kvm_vcpu *vcpu = &svm->vcpu; 3815 3816 if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { 3817 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3818 return 1; /* resume guest */ 3819 } 3820 3821 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { 3822 set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); 3823 return 1; /* resume guest */ 3824 } 3825 3826 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; 3827 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 3828 /* 3829 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 3830 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 3831 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 3832 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 3833 */ 3834 vcpu->run->hypercall.ret = 0; 3835 vcpu->run->hypercall.args[0] = gpa; 3836 vcpu->run->hypercall.args[1] = 1; 3837 vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) 3838 ? 
KVM_MAP_GPA_RANGE_ENCRYPTED 3839 : KVM_MAP_GPA_RANGE_DECRYPTED; 3840 vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K; 3841 3842 vcpu->arch.complete_userspace_io = snp_complete_psc_msr; 3843 3844 return 0; /* forward request to userspace */ 3845 } 3846 3847 struct psc_buffer { 3848 struct psc_hdr hdr; 3849 struct psc_entry entries[]; 3850 } __packed; 3851 3852 static int snp_do_psc(struct vcpu_svm *svm); 3853 3854 static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) 3855 { 3856 memset(&svm->sev_es.psc, 0, sizeof(svm->sev_es.psc)); 3857 3858 /* 3859 * PSC requests always get a "no action" response in SW_EXITINFO1, with 3860 * a PSC-specific return code in SW_EXITINFO2 that provides the "real" 3861 * return code. E.g. if the PSC request was interrupted, the need to 3862 * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1. 3863 */ 3864 svm_vmgexit_no_action(svm, psc_ret); 3865 } 3866 3867 static void __snp_complete_one_psc(struct vcpu_svm *svm) 3868 { 3869 struct vcpu_sev_es_state *sev_es = &svm->sev_es; 3870 struct psc_buffer *guest_psc = sev_es->ghcb_sa; 3871 __u16 idx; 3872 3873 /* 3874 * Everything in-flight has been processed successfully. Update the 3875 * corresponding entries in the guest's PSC buffer and zero out the 3876 * count of in-flight PSC entries. 3877 */ 3878 for (idx = sev_es->psc.cur_idx; sev_es->psc.batch_size; 3879 sev_es->psc.batch_size--, idx++) { 3880 struct psc_entry entry = READ_ONCE(guest_psc->entries[idx]); 3881 3882 guest_psc->entries[idx].cur_page = entry.pagesize ? 512 : 1; 3883 } 3884 3885 sev_es->psc.cur_idx = idx; 3886 guest_psc->hdr.cur_entry = idx; 3887 } 3888 3889 static int snp_complete_one_psc(struct kvm_vcpu *vcpu) 3890 { 3891 struct vcpu_svm *svm = to_svm(vcpu); 3892 3893 if (vcpu->run->hypercall.ret) { 3894 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3895 return 1; /* resume guest */ 3896 } 3897 3898 __snp_complete_one_psc(svm); 3899 3900 /* Handle the next range (if any). 
*/ 3901 return snp_do_psc(svm); 3902 } 3903 3904 static int snp_do_psc(struct vcpu_svm *svm) 3905 { 3906 struct vcpu_sev_es_state *sev_es = &svm->sev_es; 3907 struct psc_buffer *guest_psc = sev_es->ghcb_sa; 3908 struct kvm_vcpu *vcpu = &svm->vcpu; 3909 struct psc_entry entry_start; 3910 int npages; 3911 bool huge; 3912 u64 gfn; 3913 u16 idx; 3914 3915 next_range: 3916 /* There should be no other PSCs in-flight at this point. */ 3917 if (WARN_ON_ONCE(svm->sev_es.psc.batch_size)) { 3918 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 3919 return 1; 3920 } 3921 3922 /* Find the start of the next range which needs processing. */ 3923 for (idx = sev_es->psc.cur_idx; idx <= sev_es->psc.end_idx; idx++) { 3924 entry_start = READ_ONCE(guest_psc->entries[idx]); 3925 3926 gfn = entry_start.gfn; 3927 huge = entry_start.pagesize; 3928 npages = huge ? 512 : 1; 3929 3930 if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) { 3931 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY); 3932 return 1; 3933 } 3934 3935 if (entry_start.cur_page) { 3936 /* 3937 * If this is a partially-completed 2M range, force 4K handling 3938 * for the remaining pages since they're effectively split at 3939 * this point. Subsequent code should ensure this doesn't get 3940 * combined with adjacent PSC entries where 2M handling is still 3941 * possible. 3942 */ 3943 npages -= entry_start.cur_page; 3944 gfn += entry_start.cur_page; 3945 huge = false; 3946 } 3947 3948 if (npages) 3949 break; 3950 3951 /* 3952 * Increment the guest-visible index to communicate the current 3953 * entry back to the guest, e.g. in case of failure. No need 3954 * for READ_ONCE() as KVM doesn't consume the field, i.e. a 3955 * misbehaving guest can only break itself. 3956 */ 3957 guest_psc->hdr.cur_entry++; 3958 } 3959 3960 if (idx > sev_es->psc.end_idx) { 3961 /* Nothing more to process. 
*/ 3962 snp_complete_psc(svm, 0); 3963 return 1; 3964 } 3965 3966 sev_es->psc.is_2m = huge; 3967 sev_es->psc.cur_idx = idx; 3968 sev_es->psc.batch_size = 1; 3969 3970 /* 3971 * Find all subsequent PSC entries that contain adjacent GPA 3972 * ranges/operations and can be combined into a single 3973 * KVM_HC_MAP_GPA_RANGE exit. 3974 */ 3975 while (++idx <= sev_es->psc.end_idx) { 3976 struct psc_entry entry = READ_ONCE(guest_psc->entries[idx]); 3977 3978 if (entry.operation != entry_start.operation || 3979 entry.gfn != entry_start.gfn + npages || 3980 entry.cur_page || !!entry.pagesize != huge) 3981 break; 3982 3983 sev_es->psc.batch_size++; 3984 npages += huge ? 512 : 1; 3985 } 3986 3987 switch (entry_start.operation) { 3988 case VMGEXIT_PSC_OP_PRIVATE: 3989 case VMGEXIT_PSC_OP_SHARED: 3990 vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; 3991 vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; 3992 /* 3993 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) 3994 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that 3995 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting 3996 * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. 3997 */ 3998 vcpu->run->hypercall.ret = 0; 3999 vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); 4000 vcpu->run->hypercall.args[1] = npages; 4001 vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE 4002 ? KVM_MAP_GPA_RANGE_ENCRYPTED 4003 : KVM_MAP_GPA_RANGE_DECRYPTED; 4004 vcpu->run->hypercall.args[2] |= entry_start.pagesize 4005 ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M 4006 : KVM_MAP_GPA_RANGE_PAGE_SZ_4K; 4007 vcpu->arch.complete_userspace_io = snp_complete_one_psc; 4008 return 0; /* forward request to userspace */ 4009 default: 4010 /* 4011 * Only shared/private PSC operations are currently supported, so if the 4012 * entire range consists of unsupported operations (e.g. 
SMASH/UNSMASH), 4013 * then consider the entire range completed and avoid exiting to 4014 * userspace. In theory snp_complete_psc() can always be called directly 4015 * at this point to complete the current range and start the next one, 4016 * but that could lead to unexpected levels of recursion. 4017 */ 4018 __snp_complete_one_psc(svm); 4019 goto next_range; 4020 } 4021 4022 BUG(); 4023 } 4024 4025 static int snp_begin_psc(struct vcpu_svm *svm) 4026 { 4027 struct vcpu_sev_es_state *sev_es = &svm->sev_es; 4028 struct psc_buffer *guest_psc = sev_es->ghcb_sa; 4029 u16 max_nr_entries; 4030 4031 if (!user_exit_on_hypercall(svm->vcpu.kvm, KVM_HC_MAP_GPA_RANGE)) { 4032 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 4033 return 1; 4034 } 4035 4036 /* 4037 * GHCB v2 requires the scratch area to reside within the GHCB itself, 4038 * and PSC requests are only supported for GHCB v2+. Thus it should be 4039 * impossible to exceed the max PSC entry count (which is derived from 4040 * the size of the shared GHCB buffer). 4041 */ 4042 max_nr_entries = (sev_es->ghcb_sa_len - sizeof(struct psc_hdr)) / 4043 sizeof(struct psc_entry); 4044 if (WARN_ON_ONCE(max_nr_entries > VMGEXIT_PSC_MAX_COUNT)) { 4045 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); 4046 return 1; 4047 } 4048 4049 /* 4050 * The PSC descriptor buffer can be modified by a misbehaved guest after 4051 * validation, so take care to only use validated copies of values used 4052 * for things like array indexing. 4053 */ 4054 sev_es->psc.cur_idx = READ_ONCE(guest_psc->hdr.cur_entry); 4055 sev_es->psc.end_idx = READ_ONCE(guest_psc->hdr.end_entry); 4056 4057 if (sev_es->psc.end_idx >= max_nr_entries) { 4058 snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); 4059 return 1; 4060 } 4061 4062 return snp_do_psc(svm); 4063 } 4064 4065 /* 4066 * Invoked as part of svm_vcpu_reset() processing of an init event. 
4067 */ 4068 static void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) 4069 { 4070 struct vcpu_svm *svm = to_svm(vcpu); 4071 struct kvm_memory_slot *slot; 4072 struct page *page; 4073 kvm_pfn_t pfn; 4074 gfn_t gfn; 4075 4076 guard(mutex)(&svm->sev_es.snp_vmsa_mutex); 4077 4078 if (!svm->sev_es.snp_ap_waiting_for_reset) 4079 return; 4080 4081 svm->sev_es.snp_ap_waiting_for_reset = false; 4082 4083 /* Mark the vCPU as offline and not runnable */ 4084 vcpu->arch.pv.pv_unhalted = false; 4085 kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); 4086 4087 /* Clear use of the VMSA */ 4088 svm->vmcb->control.vmsa_pa = INVALID_PAGE; 4089 4090 /* 4091 * When replacing the VMSA during SEV-SNP AP creation, 4092 * mark the VMCB dirty so that full state is always reloaded. 4093 */ 4094 vmcb_mark_all_dirty(svm->vmcb); 4095 4096 if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) 4097 return; 4098 4099 gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); 4100 svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; 4101 4102 slot = gfn_to_memslot(vcpu->kvm, gfn); 4103 if (!slot) 4104 return; 4105 4106 /* 4107 * The new VMSA will be private memory guest memory, so retrieve the 4108 * PFN from the gmem backend. 4109 */ 4110 if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) 4111 return; 4112 4113 /* 4114 * From this point forward, the VMSA will always be a guest-mapped page 4115 * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In 4116 * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but 4117 * that involves cleanups like flushing caches, which would ideally be 4118 * handled during teardown rather than guest boot. Deferring that also 4119 * allows the existing logic for SEV-ES VMSAs to be re-used with 4120 * minimal SNP-specific changes. 
4121 */ 4122 svm->sev_es.snp_has_guest_vmsa = true; 4123 4124 /* Use the new VMSA */ 4125 svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); 4126 4127 /* Mark the vCPU as runnable */ 4128 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 4129 4130 /* 4131 * gmem pages aren't currently migratable, but if this ever changes 4132 * then care should be taken to ensure svm->sev_es.vmsa is pinned 4133 * through some other means. 4134 */ 4135 kvm_release_page_clean(page); 4136 } 4137 4138 static int sev_snp_ap_creation(struct vcpu_svm *svm) 4139 { 4140 struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); 4141 struct kvm_vcpu *vcpu = &svm->vcpu; 4142 struct kvm_vcpu *target_vcpu; 4143 struct vcpu_svm *target_svm; 4144 unsigned int request; 4145 unsigned int apic_id; 4146 4147 request = lower_32_bits(svm->vmcb->control.exit_info_1); 4148 apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); 4149 4150 /* Validate the APIC ID */ 4151 target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); 4152 if (!target_vcpu) { 4153 vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", 4154 apic_id); 4155 return -EINVAL; 4156 } 4157 4158 target_svm = to_svm(target_vcpu); 4159 4160 guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex); 4161 4162 switch (request) { 4163 case SVM_VMGEXIT_AP_CREATE_ON_INIT: 4164 case SVM_VMGEXIT_AP_CREATE: 4165 if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) { 4166 vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n", 4167 vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); 4168 return -EINVAL; 4169 } 4170 4171 if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { 4172 vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", 4173 svm->vmcb->control.exit_info_2); 4174 return -EINVAL; 4175 } 4176 4177 /* 4178 * Malicious guest can RMPADJUST a large page into VMSA which 4179 * will hit the SNP erratum where the CPU will incorrectly signal 4180 * an RMP violation #PF if a hugepage 
collides with the RMP entry 4181 * of VMSA page, reject the AP CREATE request if VMSA address from 4182 * guest is 2M aligned. 4183 */ 4184 if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) { 4185 vcpu_unimpl(vcpu, 4186 "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", 4187 svm->vmcb->control.exit_info_2); 4188 return -EINVAL; 4189 } 4190 4191 target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; 4192 break; 4193 case SVM_VMGEXIT_AP_DESTROY: 4194 target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; 4195 break; 4196 default: 4197 vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", 4198 request); 4199 return -EINVAL; 4200 } 4201 4202 target_svm->sev_es.snp_ap_waiting_for_reset = true; 4203 4204 /* 4205 * Unless Creation is deferred until INIT, signal the vCPU to update 4206 * its state. 4207 */ 4208 if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) 4209 kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); 4210 4211 return 0; 4212 } 4213 4214 static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) 4215 { 4216 struct sev_data_snp_guest_request data = {0}; 4217 struct kvm *kvm = svm->vcpu.kvm; 4218 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 4219 sev_ret_code fw_err = 0; 4220 int ret; 4221 4222 if (!is_sev_snp_guest(&svm->vcpu)) 4223 return -EINVAL; 4224 4225 guard(mutex)(&sev->guest_req_mutex); 4226 4227 if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) 4228 return -EIO; 4229 4230 data.gctx_paddr = __psp_pa(sev->snp_context); 4231 data.req_paddr = __psp_pa(sev->guest_req_buf); 4232 data.res_paddr = __psp_pa(sev->guest_resp_buf); 4233 4234 /* 4235 * Firmware failures are propagated on to guest, but any other failure 4236 * condition along the way should be reported to userspace. E.g. if 4237 * the PSP is dead and commands are timing out. 
4238 */ 4239 ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err); 4240 if (ret && !fw_err) 4241 return ret; 4242 4243 if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) 4244 return -EIO; 4245 4246 /* No action is requested *from KVM* if there was a firmware error. */ 4247 svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err)); 4248 4249 /* resume guest */ 4250 return 1; 4251 } 4252 4253 static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error) 4254 { 4255 ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0)); 4256 4257 return 1; /* resume guest */ 4258 } 4259 4260 static int snp_complete_req_certs(struct kvm_vcpu *vcpu) 4261 { 4262 struct vcpu_svm *svm = to_svm(vcpu); 4263 struct vmcb_control_area *control = &svm->vmcb->control; 4264 4265 switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) { 4266 case 0: 4267 return snp_handle_guest_req(svm, control->exit_info_1, 4268 control->exit_info_2); 4269 case ENOSPC: 4270 vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages; 4271 return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN); 4272 case EAGAIN: 4273 return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY); 4274 case EIO: 4275 return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC); 4276 default: 4277 break; 4278 } 4279 4280 return -EINVAL; 4281 } 4282 4283 static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) 4284 { 4285 struct kvm_vcpu *vcpu = &svm->vcpu; 4286 struct kvm *kvm = vcpu->kvm; 4287 4288 u8 msg_type; 4289 4290 if (!is_sev_snp_guest(vcpu)) 4291 return -EINVAL; 4292 4293 if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type), 4294 &msg_type, 1)) 4295 return -EIO; 4296 4297 /* 4298 * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for 4299 * additional certificate data to be provided alongside the attestation 4300 * report via the guest-provided data pages indicated by RAX/RBX. 
If 4301 * userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace 4302 * to give userspace an opportunity to provide the certificate data 4303 * before issuing/completing the attestation request. Otherwise, return 4304 * an empty certificate table in the guest-provided data pages and 4305 * handle the attestation request immediately. 4306 */ 4307 if (msg_type == SNP_MSG_REPORT_REQ) { 4308 struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; 4309 u64 data_npages; 4310 gpa_t data_gpa; 4311 4312 if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm)) 4313 goto request_invalid; 4314 4315 data_gpa = vcpu->arch.regs[VCPU_REGS_RAX]; 4316 data_npages = vcpu->arch.regs[VCPU_REGS_RBX]; 4317 4318 if (!PAGE_ALIGNED(data_gpa)) 4319 goto request_invalid; 4320 4321 if (sev->snp_certs_enabled) { 4322 vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS; 4323 vcpu->run->snp_req_certs.gpa = data_gpa; 4324 vcpu->run->snp_req_certs.npages = data_npages; 4325 vcpu->run->snp_req_certs.ret = 0; 4326 vcpu->arch.complete_userspace_io = snp_complete_req_certs; 4327 return 0; 4328 } 4329 4330 /* 4331 * As per GHCB spec (see "SNP Extended Guest Request"), the 4332 * certificate table is terminated by 24-bytes of zeroes. 
4333 */ 4334 if (data_npages && kvm_clear_guest(kvm, data_gpa, 24)) 4335 return -EIO; 4336 } 4337 4338 return snp_handle_guest_req(svm, req_gpa, resp_gpa); 4339 4340 request_invalid: 4341 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4342 return 1; /* resume guest */ 4343 } 4344 4345 static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) 4346 { 4347 struct vmcb_control_area *control = &svm->vmcb->control; 4348 struct kvm_vcpu *vcpu = &svm->vcpu; 4349 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 4350 u64 ghcb_info; 4351 int ret = 1; 4352 4353 ghcb_info = control->ghcb_gpa & GHCB_MSR_INFO_MASK; 4354 4355 trace_kvm_vmgexit_msr_protocol_enter(svm->vcpu.vcpu_id, 4356 control->ghcb_gpa); 4357 4358 switch (ghcb_info) { 4359 case GHCB_MSR_SEV_INFO_REQ: 4360 set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, 4361 GHCB_VERSION_MIN, 4362 sev_enc_bit)); 4363 break; 4364 case GHCB_MSR_CPUID_REQ: { 4365 u64 cpuid_fn, cpuid_reg, cpuid_value; 4366 4367 cpuid_fn = get_ghcb_msr_bits(svm, 4368 GHCB_MSR_CPUID_FUNC_MASK, 4369 GHCB_MSR_CPUID_FUNC_POS); 4370 4371 /* Initialize the registers needed by the CPUID intercept */ 4372 vcpu->arch.regs[VCPU_REGS_RAX] = cpuid_fn; 4373 vcpu->arch.regs[VCPU_REGS_RCX] = 0; 4374 4375 ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID); 4376 if (!ret) { 4377 /* Error, keep GHCB MSR value as-is */ 4378 break; 4379 } 4380 4381 cpuid_reg = get_ghcb_msr_bits(svm, 4382 GHCB_MSR_CPUID_REG_MASK, 4383 GHCB_MSR_CPUID_REG_POS); 4384 if (cpuid_reg == 0) 4385 cpuid_value = vcpu->arch.regs[VCPU_REGS_RAX]; 4386 else if (cpuid_reg == 1) 4387 cpuid_value = vcpu->arch.regs[VCPU_REGS_RBX]; 4388 else if (cpuid_reg == 2) 4389 cpuid_value = vcpu->arch.regs[VCPU_REGS_RCX]; 4390 else 4391 cpuid_value = vcpu->arch.regs[VCPU_REGS_RDX]; 4392 4393 set_ghcb_msr_bits(svm, cpuid_value, 4394 GHCB_MSR_CPUID_VALUE_MASK, 4395 GHCB_MSR_CPUID_VALUE_POS); 4396 4397 set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP, 4398 GHCB_MSR_INFO_MASK, 4399 
GHCB_MSR_INFO_POS); 4400 break; 4401 } 4402 case GHCB_MSR_AP_RESET_HOLD_REQ: 4403 svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO; 4404 ret = kvm_emulate_ap_reset_hold(&svm->vcpu); 4405 4406 /* 4407 * Preset the result to a non-SIPI return and then only set 4408 * the result to non-zero when delivering a SIPI. 4409 */ 4410 set_ghcb_msr_bits(svm, 0, 4411 GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, 4412 GHCB_MSR_AP_RESET_HOLD_RESULT_POS); 4413 4414 set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, 4415 GHCB_MSR_INFO_MASK, 4416 GHCB_MSR_INFO_POS); 4417 break; 4418 case GHCB_MSR_HV_FT_REQ: 4419 set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED, 4420 GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS); 4421 set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, 4422 GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); 4423 break; 4424 case GHCB_MSR_PREF_GPA_REQ: 4425 if (!is_sev_snp_guest(vcpu)) 4426 goto out_terminate; 4427 4428 set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, 4429 GHCB_MSR_GPA_VALUE_POS); 4430 set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, 4431 GHCB_MSR_INFO_POS); 4432 break; 4433 case GHCB_MSR_REG_GPA_REQ: { 4434 u64 gfn; 4435 4436 if (!is_sev_snp_guest(vcpu)) 4437 goto out_terminate; 4438 4439 gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, 4440 GHCB_MSR_GPA_VALUE_POS); 4441 4442 svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); 4443 4444 set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, 4445 GHCB_MSR_GPA_VALUE_POS); 4446 set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, 4447 GHCB_MSR_INFO_POS); 4448 break; 4449 } 4450 case GHCB_MSR_PSC_REQ: 4451 if (!is_sev_snp_guest(vcpu)) 4452 goto out_terminate; 4453 4454 ret = snp_begin_psc_msr(svm, control->ghcb_gpa); 4455 break; 4456 case GHCB_MSR_TERM_REQ: { 4457 u64 reason_set, reason_code; 4458 4459 reason_set = get_ghcb_msr_bits(svm, 4460 GHCB_MSR_TERM_REASON_SET_MASK, 4461 GHCB_MSR_TERM_REASON_SET_POS); 4462 reason_code = get_ghcb_msr_bits(svm, 4463 
GHCB_MSR_TERM_REASON_MASK, 4464 GHCB_MSR_TERM_REASON_POS); 4465 pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", 4466 reason_set, reason_code); 4467 4468 goto out_terminate; 4469 } 4470 default: 4471 /* Error, keep GHCB MSR value as-is */ 4472 break; 4473 } 4474 4475 trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id, 4476 control->ghcb_gpa, ret); 4477 4478 return ret; 4479 4480 out_terminate: 4481 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 4482 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; 4483 vcpu->run->system_event.ndata = 1; 4484 vcpu->run->system_event.data[0] = control->ghcb_gpa; 4485 4486 return 0; 4487 } 4488 4489 int sev_handle_vmgexit(struct kvm_vcpu *vcpu) 4490 { 4491 struct vcpu_svm *svm = to_svm(vcpu); 4492 struct vmcb_control_area *control = &svm->vmcb->control; 4493 u64 ghcb_gpa; 4494 int ret; 4495 4496 /* Validate the GHCB */ 4497 ghcb_gpa = control->ghcb_gpa; 4498 if (ghcb_gpa & GHCB_MSR_INFO_MASK) 4499 return sev_handle_vmgexit_msr_protocol(svm); 4500 4501 if (!ghcb_gpa) { 4502 vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n"); 4503 4504 /* Without a GHCB, just return right back to the guest */ 4505 return 1; 4506 } 4507 4508 if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) { 4509 /* Unable to map GHCB from guest */ 4510 vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", 4511 ghcb_gpa); 4512 4513 /* Without a GHCB, just return right back to the guest */ 4514 return 1; 4515 } 4516 4517 svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; 4518 4519 trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); 4520 4521 sev_es_sync_from_ghcb(svm); 4522 4523 /* SEV-SNP guest requires that the GHCB GPA must be registered */ 4524 if (is_sev_snp_guest(vcpu) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { 4525 vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); 4526 return -EINVAL; 4527 } 4528 4529 ret = sev_es_validate_vmgexit(svm); 4530 if (ret) 4531 return 
ret; 4532 4533 svm_vmgexit_success(svm, 0); 4534 4535 switch (control->exit_code) { 4536 case SVM_VMGEXIT_MMIO_READ: 4537 case SVM_VMGEXIT_MMIO_WRITE: { 4538 bool is_write = control->exit_code == SVM_VMGEXIT_MMIO_WRITE; 4539 u64 len = control->exit_info_2; 4540 4541 if (!len) 4542 return 1; 4543 4544 if (to_kvm_sev_info(vcpu->kvm)->ghcb_version >= 2 && len > 8) { 4545 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4546 return 1; 4547 } 4548 4549 ret = setup_vmgexit_scratch(svm, !is_write, len); 4550 if (ret) 4551 break; 4552 4553 ret = kvm_sev_es_mmio(vcpu, is_write, control->exit_info_1, len, 4554 svm->sev_es.ghcb_sa); 4555 break; 4556 } 4557 case SVM_VMGEXIT_NMI_COMPLETE: 4558 ++vcpu->stat.nmi_window_exits; 4559 svm->nmi_masked = false; 4560 kvm_make_request(KVM_REQ_EVENT, vcpu); 4561 ret = 1; 4562 break; 4563 case SVM_VMGEXIT_AP_HLT_LOOP: 4564 svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT; 4565 ret = kvm_emulate_ap_reset_hold(vcpu); 4566 break; 4567 case SVM_VMGEXIT_AP_JUMP_TABLE: { 4568 struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); 4569 4570 switch (control->exit_info_1) { 4571 case 0: 4572 /* Set AP jump table address */ 4573 sev->ap_jump_table = control->exit_info_2; 4574 break; 4575 case 1: 4576 /* Get AP jump table address */ 4577 svm_vmgexit_success(svm, sev->ap_jump_table); 4578 break; 4579 default: 4580 pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", 4581 control->exit_info_1); 4582 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4583 } 4584 4585 ret = 1; 4586 break; 4587 } 4588 case SVM_VMGEXIT_HV_FEATURES: 4589 svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED); 4590 ret = 1; 4591 break; 4592 case SVM_VMGEXIT_TERM_REQUEST: 4593 pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n", 4594 control->exit_info_1, control->exit_info_2); 4595 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; 4596 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; 4597 vcpu->run->system_event.ndata 
= 1; 4598 vcpu->run->system_event.data[0] = control->ghcb_gpa; 4599 break; 4600 case SVM_VMGEXIT_PSC: 4601 ret = setup_vmgexit_scratch(svm, true, sizeof(struct psc_hdr)); 4602 if (ret) 4603 break; 4604 4605 ret = snp_begin_psc(svm); 4606 break; 4607 case SVM_VMGEXIT_AP_CREATION: 4608 ret = sev_snp_ap_creation(svm); 4609 if (ret) { 4610 svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); 4611 } 4612 4613 ret = 1; 4614 break; 4615 case SVM_VMGEXIT_GUEST_REQUEST: 4616 ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); 4617 break; 4618 case SVM_VMGEXIT_EXT_GUEST_REQUEST: 4619 ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2); 4620 break; 4621 case SVM_VMGEXIT_UNSUPPORTED_EVENT: 4622 vcpu_unimpl(vcpu, 4623 "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", 4624 control->exit_info_1, control->exit_info_2); 4625 ret = -EINVAL; 4626 break; 4627 case SVM_EXIT_IOIO: 4628 if (!((control->exit_info_1 & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT)) 4629 return 1; 4630 4631 fallthrough; 4632 default: 4633 ret = svm_invoke_exit_handler(vcpu, control->exit_code); 4634 } 4635 4636 return ret; 4637 } 4638 4639 int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) 4640 { 4641 int count; 4642 int bytes; 4643 int r; 4644 4645 if (svm->vmcb->control.exit_info_2 > INT_MAX) 4646 return -EINVAL; 4647 4648 count = svm->vmcb->control.exit_info_2; 4649 if (unlikely(check_mul_overflow(count, size, &bytes))) 4650 return -EINVAL; 4651 4652 if (!bytes) 4653 return 1; 4654 4655 r = setup_vmgexit_scratch(svm, in, bytes); 4656 if (r) 4657 return r; 4658 4659 return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, 4660 count, in); 4661 } 4662 4663 void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4664 { 4665 /* Clear intercepts on MSRs that are context switched by hardware. 
*/ 4666 svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW); 4667 svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW); 4668 svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW); 4669 4670 if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) 4671 svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW, 4672 !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && 4673 !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)); 4674 4675 svm_set_intercept_for_msr(vcpu, MSR_AMD64_GUEST_TSC_FREQ, MSR_TYPE_R, 4676 !snp_is_secure_tsc_enabled(vcpu->kvm)); 4677 4678 /* 4679 * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if 4680 * the host/guest supports its use. 4681 * 4682 * KVM treats the guest as being capable of using XSAVES even if XSAVES 4683 * isn't enabled in guest CPUID as there is no intercept for XSAVES, 4684 * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is 4685 * exposed to the guest and XSAVES is supported in hardware. Condition 4686 * full XSS passthrough on the guest being able to use XSAVES *and* 4687 * XSAVES being exposed to the guest so that KVM can at least honor 4688 * guest CPUID for RDMSR and WRMSR. 4689 */ 4690 svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW, 4691 !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) || 4692 !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)); 4693 } 4694 4695 void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) 4696 { 4697 struct kvm_vcpu *vcpu = &svm->vcpu; 4698 struct kvm_cpuid_entry2 *best; 4699 4700 /* For sev guests, the memory encryption bit is not reserved in CR3. 
*/ 4701 best = kvm_find_cpuid_entry(vcpu, 0x8000001F); 4702 if (best) 4703 vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); 4704 } 4705 4706 static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event) 4707 { 4708 struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); 4709 struct vmcb *vmcb = svm->vmcb01.ptr; 4710 4711 svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV_ES; 4712 4713 /* 4714 * An SEV-ES guest requires a VMSA area that is a separate from the 4715 * VMCB page. Do not include the encryption mask on the VMSA physical 4716 * address since hardware will access it using the guest key. Note, 4717 * the VMSA will be NULL if this vCPU is the destination for intrahost 4718 * migration, and will be copied later. 4719 */ 4720 if (!svm->sev_es.snp_has_guest_vmsa) { 4721 if (svm->sev_es.vmsa) 4722 svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); 4723 else 4724 svm->vmcb->control.vmsa_pa = INVALID_PAGE; 4725 } 4726 4727 if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) 4728 svm->vmcb->control.allowed_sev_features = sev->vmsa_features | 4729 VMCB_ALLOWED_SEV_FEATURES_VALID; 4730 4731 /* Can't intercept CR register access, HV can't modify CR registers */ 4732 svm_clr_intercept(svm, INTERCEPT_CR0_READ); 4733 svm_clr_intercept(svm, INTERCEPT_CR4_READ); 4734 svm_clr_intercept(svm, INTERCEPT_CR8_READ); 4735 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); 4736 svm_clr_intercept(svm, INTERCEPT_CR4_WRITE); 4737 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 4738 4739 svm_clr_intercept(svm, INTERCEPT_SELECTIVE_CR0); 4740 4741 /* Track EFER/CR register changes */ 4742 svm_set_intercept(svm, TRAP_EFER_WRITE); 4743 svm_set_intercept(svm, TRAP_CR0_WRITE); 4744 svm_set_intercept(svm, TRAP_CR4_WRITE); 4745 svm_set_intercept(svm, TRAP_CR8_WRITE); 4746 4747 vmcb->control.intercepts[INTERCEPT_DR] = 0; 4748 if (!sev_vcpu_has_debug_swap(svm)) { 4749 vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); 4750 vmcb_set_intercept(&vmcb->control, 
INTERCEPT_DR7_WRITE); 4751 svm_mark_intercepts_dirty(svm); 4752 } else { 4753 /* 4754 * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't 4755 * allow debugging SEV-ES guests, and enables DebugSwap iff 4756 * NO_NESTED_DATA_BP is supported, so there's no reason to 4757 * intercept #DB when DebugSwap is enabled. For simplicity 4758 * with respect to guest debug, intercept #DB for other VMs 4759 * even if NO_NESTED_DATA_BP is supported, i.e. even if the 4760 * guest can't DoS the CPU with infinite #DB vectoring. 4761 */ 4762 clr_exception_intercept(svm, DB_VECTOR); 4763 } 4764 4765 /* Can't intercept XSETBV, HV can't modify XCR0 directly */ 4766 svm_clr_intercept(svm, INTERCEPT_XSETBV); 4767 4768 /* 4769 * Set the GHCB MSR value as per the GHCB specification when emulating 4770 * vCPU RESET for an SEV-ES guest. 4771 */ 4772 if (!init_event) 4773 set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, 4774 GHCB_VERSION_MIN, 4775 sev_enc_bit)); 4776 } 4777 4778 void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) 4779 { 4780 struct kvm_vcpu *vcpu = &svm->vcpu; 4781 4782 svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV; 4783 clr_exception_intercept(svm, UD_VECTOR); 4784 4785 /* 4786 * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as 4787 * KVM can't decrypt guest memory to decode the faulting instruction. 4788 */ 4789 clr_exception_intercept(svm, GP_VECTOR); 4790 4791 if (init_event && is_sev_snp_guest(vcpu)) 4792 sev_snp_init_protected_guest_state(vcpu); 4793 4794 if (is_sev_es_guest(vcpu)) 4795 sev_es_init_vmcb(svm, init_event); 4796 } 4797 4798 int sev_vcpu_create(struct kvm_vcpu *vcpu) 4799 { 4800 struct vcpu_svm *svm = to_svm(vcpu); 4801 struct page *vmsa_page; 4802 4803 mutex_init(&svm->sev_es.snp_vmsa_mutex); 4804 4805 if (!is_sev_es_guest(vcpu)) 4806 return 0; 4807 4808 /* 4809 * SEV-ES guests require a separate (from the VMCB) VMSA page used to 4810 * contain the encrypted register state of the guest. 
4811 */ 4812 vmsa_page = snp_safe_alloc_page(); 4813 if (!vmsa_page) 4814 return -ENOMEM; 4815 4816 svm->sev_es.vmsa = page_address(vmsa_page); 4817 4818 vcpu->arch.guest_tsc_protected = snp_is_secure_tsc_enabled(vcpu->kvm); 4819 4820 return 0; 4821 } 4822 4823 void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) 4824 { 4825 /* 4826 * All host state for SEV-ES guests is categorized into three swap types 4827 * based on how it is handled by hardware during a world switch: 4828 * 4829 * A: VMRUN: Host state saved in host save area 4830 * VMEXIT: Host state loaded from host save area 4831 * 4832 * B: VMRUN: Host state _NOT_ saved in host save area 4833 * VMEXIT: Host state loaded from host save area 4834 * 4835 * C: VMRUN: Host state _NOT_ saved in host save area 4836 * VMEXIT: Host state initialized to default(reset) values 4837 * 4838 * Manually save type-B state, i.e. state that is loaded by VMEXIT but 4839 * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed 4840 * by common SVM code). 4841 */ 4842 hostsa->xcr0 = kvm_host.xcr0; 4843 hostsa->pkru = read_pkru(); 4844 hostsa->xss = kvm_host.xss; 4845 4846 /* 4847 * If DebugSwap is enabled, debug registers are loaded but NOT saved by 4848 * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does 4849 * not save or load debug registers. Sadly, KVM can't prevent SNP 4850 * guests from lying about DebugSwap on secondary vCPUs, i.e. the 4851 * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what 4852 * the guest has actually enabled (or not!) in the VMSA. 4853 * 4854 * If DebugSwap is *possible*, save the masks so that they're restored 4855 * if the guest enables DebugSwap. But for the DRs themselves, do NOT 4856 * rely on the CPU to restore the host values; KVM will restore them as 4857 * needed in common code, via hw_breakpoint_restore(). Note, KVM does 4858 * NOT support virtualizing Breakpoint Extensions, i.e. 
the mask MSRs 4859 * don't need to be restored per se, KVM just needs to ensure they are 4860 * loaded with the correct values *if* the CPU writes the MSRs. 4861 */ 4862 if (sev_vcpu_has_debug_swap(svm) || 4863 (cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) && 4864 is_sev_snp_guest(&svm->vcpu))) { 4865 hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); 4866 hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); 4867 hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); 4868 hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3); 4869 } 4870 4871 /* 4872 * TSC_AUX is always virtualized for SEV-ES guests when the feature is 4873 * available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area. 4874 * Set the save area to the current hardware value, i.e. the current 4875 * user return value, so that the correct value is restored on #VMEXIT. 4876 */ 4877 if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) && 4878 !WARN_ON_ONCE(tsc_aux_uret_slot < 0)) 4879 hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot); 4880 } 4881 4882 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 4883 { 4884 struct vcpu_svm *svm = to_svm(vcpu); 4885 4886 /* First SIPI: Use the values as initially set by the VMM */ 4887 if (!svm->sev_es.received_first_sipi) { 4888 svm->sev_es.received_first_sipi = true; 4889 return; 4890 } 4891 4892 /* Subsequent SIPI */ 4893 switch (svm->sev_es.ap_reset_hold_type) { 4894 case AP_RESET_HOLD_NAE_EVENT: 4895 /* 4896 * Return from an AP Reset Hold VMGEXIT, where the guest will 4897 * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. 4898 */ 4899 svm_vmgexit_success(svm, 1); 4900 break; 4901 case AP_RESET_HOLD_MSR_PROTO: 4902 /* 4903 * Return from an AP Reset Hold VMGEXIT, where the guest will 4904 * set the CS and RIP. Set GHCB data field to a non-zero value. 
4905 */ 4906 set_ghcb_msr_bits(svm, 1, 4907 GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, 4908 GHCB_MSR_AP_RESET_HOLD_RESULT_POS); 4909 4910 set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, 4911 GHCB_MSR_INFO_MASK, 4912 GHCB_MSR_INFO_POS); 4913 break; 4914 default: 4915 break; 4916 } 4917 } 4918 4919 struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) 4920 { 4921 unsigned long pfn; 4922 struct page *p; 4923 4924 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 4925 return alloc_pages_node(node, gfp | __GFP_ZERO, 0); 4926 4927 /* 4928 * Allocate an SNP-safe page to workaround the SNP erratum where 4929 * the CPU will incorrectly signal an RMP violation #PF if a 4930 * hugepage (2MB or 1GB) collides with the RMP entry of a 4931 * 2MB-aligned VMCB, VMSA, or AVIC backing page. 4932 * 4933 * Allocate one extra page, choose a page which is not 4934 * 2MB-aligned, and free the other. 4935 */ 4936 p = alloc_pages_node(node, gfp | __GFP_ZERO, 1); 4937 if (!p) 4938 return NULL; 4939 4940 split_page(p, 1); 4941 4942 pfn = page_to_pfn(p); 4943 if (IS_ALIGNED(pfn, PTRS_PER_PMD)) 4944 __free_page(p++); 4945 else 4946 __free_page(p + 1); 4947 4948 return p; 4949 } 4950 4951 void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) 4952 { 4953 struct kvm_memory_slot *slot; 4954 struct kvm *kvm = vcpu->kvm; 4955 int order, rmp_level, ret; 4956 struct page *page; 4957 bool assigned; 4958 kvm_pfn_t pfn; 4959 gfn_t gfn; 4960 4961 gfn = gpa >> PAGE_SHIFT; 4962 4963 /* 4964 * The only time RMP faults occur for shared pages is when the guest is 4965 * triggering an RMP fault for an implicit page-state change from 4966 * shared->private. Implicit page-state changes are forwarded to 4967 * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults 4968 * for shared pages should not end up here. 
4969 */ 4970 if (!kvm_mem_is_private(kvm, gfn)) { 4971 pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n", 4972 gpa); 4973 return; 4974 } 4975 4976 slot = gfn_to_memslot(kvm, gfn); 4977 if (!kvm_slot_has_gmem(slot)) { 4978 pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", 4979 gpa); 4980 return; 4981 } 4982 4983 ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); 4984 if (ret) { 4985 pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", 4986 gpa); 4987 return; 4988 } 4989 4990 ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); 4991 if (ret || !assigned) { 4992 pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n", 4993 gpa, pfn, ret); 4994 goto out_no_trace; 4995 } 4996 4997 /* 4998 * There are 2 cases where a PSMASH may be needed to resolve an #NPF 4999 * with PFERR_GUEST_RMP_BIT set: 5000 * 5001 * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM 5002 * bit set if the guest issues them with a smaller granularity than 5003 * what is indicated by the page-size bit in the 2MB RMP entry for 5004 * the PFN that backs the GPA. 5005 * 5006 * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is 5007 * smaller than what is indicated by the 2MB RMP entry for the PFN 5008 * that backs the GPA. 5009 * 5010 * In both these cases, the corresponding 2M RMP entry needs to 5011 * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already 5012 * split into 4K RMP entries, then this is likely a spurious case which 5013 * can occur when there are concurrent accesses by the guest to a 2MB 5014 * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in 5015 * the process of being PMASH'd into 4K entries. These cases should 5016 * resolve automatically on subsequent accesses, so just ignore them 5017 * here. 
5018 */ 5019 if (rmp_level == PG_LEVEL_4K) 5020 goto out; 5021 5022 ret = snp_rmptable_psmash(pfn); 5023 if (ret) { 5024 /* 5025 * Look it up again. If it's 4K now then the PSMASH may have 5026 * raced with another process and the issue has already resolved 5027 * itself. 5028 */ 5029 if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) && 5030 assigned && rmp_level == PG_LEVEL_4K) 5031 goto out; 5032 5033 pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n", 5034 gpa, pfn, ret); 5035 } 5036 5037 kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); 5038 out: 5039 trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); 5040 out_no_trace: 5041 kvm_release_page_unused(page); 5042 } 5043 5044 static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) 5045 { 5046 kvm_pfn_t pfn = start; 5047 5048 while (pfn < end) { 5049 int ret, rmp_level; 5050 bool assigned; 5051 5052 ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); 5053 if (ret) { 5054 pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n", 5055 pfn, start, end, rmp_level, ret); 5056 return false; 5057 } 5058 5059 if (assigned) { 5060 pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n", 5061 __func__, pfn, start, end, rmp_level); 5062 return false; 5063 } 5064 5065 pfn++; 5066 } 5067 5068 return true; 5069 } 5070 5071 static u8 max_level_for_order(int order) 5072 { 5073 if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) 5074 return PG_LEVEL_2M; 5075 5076 return PG_LEVEL_4K; 5077 } 5078 5079 static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) 5080 { 5081 kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); 5082 5083 /* 5084 * If this is a large folio, and the entire 2M range containing the 5085 * PFN is currently shared, then the entire 2M-aligned range can be 5086 * set to private via a single 2M RMP entry. 
5087 */ 5088 if (max_level_for_order(order) > PG_LEVEL_4K && 5089 is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) 5090 return true; 5091 5092 return false; 5093 } 5094 5095 int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) 5096 { 5097 struct kvm_sev_info *sev = to_kvm_sev_info(kvm); 5098 kvm_pfn_t pfn_aligned; 5099 gfn_t gfn_aligned; 5100 int level, rc; 5101 bool assigned; 5102 5103 if (!sev_snp_guest(kvm)) 5104 return 0; 5105 5106 rc = snp_lookup_rmpentry(pfn, &assigned, &level); 5107 if (rc) { 5108 pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n", 5109 gfn, pfn, rc); 5110 return -ENOENT; 5111 } 5112 5113 if (assigned) { 5114 pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n", 5115 __func__, gfn, pfn, max_order, level); 5116 return 0; 5117 } 5118 5119 if (is_large_rmp_possible(kvm, pfn, max_order)) { 5120 level = PG_LEVEL_2M; 5121 pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); 5122 gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD); 5123 } else { 5124 level = PG_LEVEL_4K; 5125 pfn_aligned = pfn; 5126 gfn_aligned = gfn; 5127 } 5128 5129 rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false); 5130 if (rc) { 5131 pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n", 5132 gfn, pfn, level, rc); 5133 return -EINVAL; 5134 } 5135 5136 pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n", 5137 __func__, gfn, pfn, pfn_aligned, max_order, level); 5138 5139 return 0; 5140 } 5141 5142 void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) 5143 { 5144 kvm_pfn_t pfn; 5145 5146 if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) 5147 return; 5148 5149 pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); 5150 5151 for (pfn = start; pfn < end;) { 5152 bool use_2m_update = false; 5153 int rc, rmp_level; 5154 bool assigned; 5155 5156 rc = snp_lookup_rmpentry(pfn, &assigned, 
&rmp_level); 5157 if (rc || !assigned) 5158 goto next_pfn; 5159 5160 use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && 5161 end >= (pfn + PTRS_PER_PMD) && 5162 rmp_level > PG_LEVEL_4K; 5163 5164 /* 5165 * If an unaligned PFN corresponds to a 2M region assigned as a 5166 * large page in the RMP table, PSMASH the region into individual 5167 * 4K RMP entries before attempting to convert a 4K sub-page. 5168 */ 5169 if (!use_2m_update && rmp_level > PG_LEVEL_4K) { 5170 /* 5171 * This shouldn't fail, but if it does, report it, but 5172 * still try to update RMP entry to shared and pray this 5173 * was a spurious error that can be addressed later. 5174 */ 5175 rc = snp_rmptable_psmash(pfn); 5176 WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n", 5177 pfn, rc); 5178 } 5179 5180 rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K); 5181 if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n", 5182 pfn, rc)) 5183 goto next_pfn; 5184 5185 /* 5186 * SEV-ES avoids host/guest cache coherency issues through 5187 * WBNOINVD hooks issued via MMU notifiers during run-time, and 5188 * KVM's VM destroy path at shutdown. Those MMU notifier events 5189 * don't cover gmem since there is no requirement to map pages 5190 * to a HVA in order to use them for a running guest. While the 5191 * shutdown path would still likely cover things for SNP guests, 5192 * userspace may also free gmem pages during run-time via 5193 * hole-punching operations on the guest_memfd, so flush the 5194 * cache entries for these pages before free'ing them back to 5195 * the host. 5196 */ 5197 clflush_cache_range(__va(pfn_to_hpa(pfn)), 5198 use_2m_update ? PMD_SIZE : PAGE_SIZE); 5199 next_pfn: 5200 pfn += use_2m_update ? 
PTRS_PER_PMD : 1; 5201 cond_resched(); 5202 } 5203 } 5204 5205 int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) 5206 { 5207 int level, rc; 5208 bool assigned; 5209 5210 if (!sev_snp_guest(kvm)) 5211 return 0; 5212 5213 rc = snp_lookup_rmpentry(pfn, &assigned, &level); 5214 if (rc || !assigned) 5215 return PG_LEVEL_4K; 5216 5217 return level; 5218 } 5219 5220 struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) 5221 { 5222 struct vcpu_svm *svm = to_svm(vcpu); 5223 struct vmcb_save_area *vmsa; 5224 struct kvm_sev_info *sev; 5225 int error = 0; 5226 int ret; 5227 5228 if (!is_sev_es_guest(vcpu)) 5229 return NULL; 5230 5231 /* 5232 * If the VMSA has not yet been encrypted, return a pointer to the 5233 * current un-encrypted VMSA. 5234 */ 5235 if (!vcpu->arch.guest_state_protected) 5236 return (struct vmcb_save_area *)svm->sev_es.vmsa; 5237 5238 sev = to_kvm_sev_info(vcpu->kvm); 5239 5240 /* Check if the SEV policy allows debugging */ 5241 if (is_sev_snp_guest(vcpu)) { 5242 if (!(sev->policy & SNP_POLICY_MASK_DEBUG)) 5243 return NULL; 5244 } else { 5245 if (sev->policy & SEV_POLICY_MASK_NODBG) 5246 return NULL; 5247 } 5248 5249 if (is_sev_snp_guest(vcpu)) { 5250 struct sev_data_snp_dbg dbg = {0}; 5251 5252 vmsa = snp_alloc_firmware_page(__GFP_ZERO); 5253 if (!vmsa) 5254 return NULL; 5255 5256 dbg.gctx_paddr = __psp_pa(sev->snp_context); 5257 dbg.src_addr = svm->vmcb->control.vmsa_pa; 5258 dbg.dst_addr = __psp_pa(vmsa); 5259 5260 ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); 5261 5262 /* 5263 * Return the target page to a hypervisor page no matter what. 5264 * If this fails, the page can't be used, so leak it and don't 5265 * try to use it. 
5266 */ 5267 if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) 5268 return NULL; 5269 5270 if (ret) { 5271 pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", 5272 ret, error, error); 5273 free_page((unsigned long)vmsa); 5274 5275 return NULL; 5276 } 5277 } else { 5278 struct sev_data_dbg dbg = {0}; 5279 struct page *vmsa_page; 5280 5281 vmsa_page = alloc_page(GFP_KERNEL); 5282 if (!vmsa_page) 5283 return NULL; 5284 5285 vmsa = page_address(vmsa_page); 5286 5287 dbg.handle = sev->handle; 5288 dbg.src_addr = svm->vmcb->control.vmsa_pa; 5289 dbg.dst_addr = __psp_pa(vmsa); 5290 dbg.len = PAGE_SIZE; 5291 5292 ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); 5293 if (ret) { 5294 pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", 5295 ret, error, error); 5296 __free_page(vmsa_page); 5297 5298 return NULL; 5299 } 5300 } 5301 5302 return vmsa; 5303 } 5304 5305 void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) 5306 { 5307 /* If the VMSA has not yet been encrypted, nothing was allocated */ 5308 if (!vcpu->arch.guest_state_protected || !vmsa) 5309 return; 5310 5311 free_page((unsigned long)vmsa); 5312 } 5313