1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * Architecture neutral utility routines for interacting with 5 * Hyper-V. This file is specifically for code that must be 6 * built-in to the kernel image when CONFIG_HYPERV is set 7 * (vs. being in a module) because it is called from architecture 8 * specific code under arch/. 9 * 10 * Copyright (C) 2021, Microsoft, Inc. 11 * 12 * Author : Michael Kelley <mikelley@microsoft.com> 13 */ 14 15 #include <linux/types.h> 16 #include <linux/acpi.h> 17 #include <linux/export.h> 18 #include <linux/bitfield.h> 19 #include <linux/cpumask.h> 20 #include <linux/sched/task_stack.h> 21 #include <linux/panic_notifier.h> 22 #include <linux/ptrace.h> 23 #include <linux/random.h> 24 #include <linux/efi.h> 25 #include <linux/kdebug.h> 26 #include <linux/kmsg_dump.h> 27 #include <linux/sizes.h> 28 #include <linux/slab.h> 29 #include <linux/dma-map-ops.h> 30 #include <linux/set_memory.h> 31 #include <asm/hyperv-tlfs.h> 32 #include <asm/mshyperv.h> 33 34 /* 35 * hv_root_partition, ms_hyperv and hv_nested are defined here with other 36 * Hyper-V specific globals so they are shared across all architectures and are 37 * built only when CONFIG_HYPERV is defined. But on x86, 38 * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not 39 * defined, and it uses these three variables. So mark them as __weak 40 * here, allowing for an overriding definition in the module containing 41 * ms_hyperv_init_platform(). 42 */ 43 bool __weak hv_root_partition; 44 EXPORT_SYMBOL_GPL(hv_root_partition); 45 46 bool __weak hv_nested; 47 EXPORT_SYMBOL_GPL(hv_nested); 48 49 struct ms_hyperv_info __weak ms_hyperv; 50 EXPORT_SYMBOL_GPL(ms_hyperv); 51 52 u32 *hv_vp_index; 53 EXPORT_SYMBOL_GPL(hv_vp_index); 54 55 u32 hv_max_vp_index; 56 EXPORT_SYMBOL_GPL(hv_max_vp_index); 57 58 void * __percpu *hyperv_pcpu_input_arg; 59 EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); 60 61 void * __percpu *hyperv_pcpu_output_arg; 62 EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); 63 64 static void hv_kmsg_dump_unregister(void); 65 66 static struct ctl_table_header *hv_ctl_table_hdr; 67 68 /* 69 * Hyper-V specific initialization and shutdown code that is 70 * common across all architectures. Called from architecture 71 * specific initialization functions. 72 */ 73 74 void __init hv_common_free(void) 75 { 76 unregister_sysctl_table(hv_ctl_table_hdr); 77 hv_ctl_table_hdr = NULL; 78 79 if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) 80 hv_kmsg_dump_unregister(); 81 82 kfree(hv_vp_index); 83 hv_vp_index = NULL; 84 85 free_percpu(hyperv_pcpu_output_arg); 86 hyperv_pcpu_output_arg = NULL; 87 88 free_percpu(hyperv_pcpu_input_arg); 89 hyperv_pcpu_input_arg = NULL; 90 } 91 92 /* 93 * Functions for allocating and freeing memory with size and 94 * alignment HV_HYP_PAGE_SIZE. These functions are needed because 95 * the guest page size may not be the same as the Hyper-V page 96 * size. We depend upon kmalloc() aligning power-of-two size 97 * allocations to the allocation size boundary, so that the 98 * allocated memory appears to Hyper-V as a page of the size 99 * it expects. 100 */ 101 102 void *hv_alloc_hyperv_page(void) 103 { 104 BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); 105 106 if (PAGE_SIZE == HV_HYP_PAGE_SIZE) 107 return (void *)__get_free_page(GFP_KERNEL); 108 else 109 return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); 110 } 111 EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page); 112 113 void *hv_alloc_hyperv_zeroed_page(void) 114 { 115 if (PAGE_SIZE == HV_HYP_PAGE_SIZE) 116 return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 117 else 118 return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); 119 } 120 EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page); 121 122 void hv_free_hyperv_page(void *addr) 123 { 124 if (PAGE_SIZE == HV_HYP_PAGE_SIZE) 125 free_page((unsigned long)addr); 126 else 127 kfree(addr); 128 } 129 EXPORT_SYMBOL_GPL(hv_free_hyperv_page); 130 131 static void *hv_panic_page; 132 133 /* 134 * Boolean to control whether to report panic messages over Hyper-V. 135 * 136 * It can be set via /proc/sys/kernel/hyperv_record_panic_msg 137 */ 138 static int sysctl_record_panic_msg = 1; 139 140 /* 141 * sysctl option to allow the user to control whether kmsg data should be 142 * reported to Hyper-V on panic. 143 */ 144 static struct ctl_table hv_ctl_table[] = { 145 { 146 .procname = "hyperv_record_panic_msg", 147 .data = &sysctl_record_panic_msg, 148 .maxlen = sizeof(int), 149 .mode = 0644, 150 .proc_handler = proc_dointvec_minmax, 151 .extra1 = SYSCTL_ZERO, 152 .extra2 = SYSCTL_ONE 153 }, 154 }; 155 156 static int hv_die_panic_notify_crash(struct notifier_block *self, 157 unsigned long val, void *args); 158 159 static struct notifier_block hyperv_die_report_block = { 160 .notifier_call = hv_die_panic_notify_crash, 161 }; 162 163 static struct notifier_block hyperv_panic_report_block = { 164 .notifier_call = hv_die_panic_notify_crash, 165 }; 166 167 /* 168 * The following callback works both as die and panic notifier; its 169 * goal is to provide panic information to the hypervisor unless the 170 * kmsg dumper is used [see hv_kmsg_dump()], which provides more 171 * information but isn't always available. 172 * 173 * Notice that both the panic/die report notifiers are registered only 174 * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set. 175 */ 176 static int hv_die_panic_notify_crash(struct notifier_block *self, 177 unsigned long val, void *args) 178 { 179 struct pt_regs *regs; 180 bool is_die; 181 182 /* Don't notify Hyper-V unless we have a die oops event or panic. */ 183 if (self == &hyperv_panic_report_block) { 184 is_die = false; 185 regs = current_pt_regs(); 186 } else { /* die event */ 187 if (val != DIE_OOPS) 188 return NOTIFY_DONE; 189 190 is_die = true; 191 regs = ((struct die_args *)args)->regs; 192 } 193 194 /* 195 * Hyper-V should be notified only once about a panic/die. If we will 196 * be calling hv_kmsg_dump() later with kmsg data, don't do the 197 * notification here. 198 */ 199 if (!sysctl_record_panic_msg || !hv_panic_page) 200 hyperv_report_panic(regs, val, is_die); 201 202 return NOTIFY_DONE; 203 } 204 205 /* 206 * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg 207 * buffer and call into Hyper-V to transfer the data. 208 */ 209 static void hv_kmsg_dump(struct kmsg_dumper *dumper, 210 enum kmsg_dump_reason reason) 211 { 212 struct kmsg_dump_iter iter; 213 size_t bytes_written; 214 215 /* We are only interested in panics. */ 216 if (reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg) 217 return; 218 219 /* 220 * Write dump contents to the page. No need to synchronize; panic should 221 * be single-threaded. 222 */ 223 kmsg_dump_rewind(&iter); 224 kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, 225 &bytes_written); 226 if (!bytes_written) 227 return; 228 /* 229 * P3 to contain the physical address of the panic page & P4 to 230 * contain the size of the panic data in that page. Rest of the 231 * registers are no-op when the NOTIFY_MSG flag is set. 232 */ 233 hv_set_msr(HV_MSR_CRASH_P0, 0); 234 hv_set_msr(HV_MSR_CRASH_P1, 0); 235 hv_set_msr(HV_MSR_CRASH_P2, 0); 236 hv_set_msr(HV_MSR_CRASH_P3, virt_to_phys(hv_panic_page)); 237 hv_set_msr(HV_MSR_CRASH_P4, bytes_written); 238 239 /* 240 * Let Hyper-V know there is crash data available along with 241 * the panic message. 242 */ 243 hv_set_msr(HV_MSR_CRASH_CTL, 244 (HV_CRASH_CTL_CRASH_NOTIFY | 245 HV_CRASH_CTL_CRASH_NOTIFY_MSG)); 246 } 247 248 static struct kmsg_dumper hv_kmsg_dumper = { 249 .dump = hv_kmsg_dump, 250 }; 251 252 static void hv_kmsg_dump_unregister(void) 253 { 254 kmsg_dump_unregister(&hv_kmsg_dumper); 255 unregister_die_notifier(&hyperv_die_report_block); 256 atomic_notifier_chain_unregister(&panic_notifier_list, 257 &hyperv_panic_report_block); 258 259 hv_free_hyperv_page(hv_panic_page); 260 hv_panic_page = NULL; 261 } 262 263 static void hv_kmsg_dump_register(void) 264 { 265 int ret; 266 267 hv_panic_page = hv_alloc_hyperv_zeroed_page(); 268 if (!hv_panic_page) { 269 pr_err("Hyper-V: panic message page memory allocation failed\n"); 270 return; 271 } 272 273 ret = kmsg_dump_register(&hv_kmsg_dumper); 274 if (ret) { 275 pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); 276 hv_free_hyperv_page(hv_panic_page); 277 hv_panic_page = NULL; 278 } 279 } 280 281 int __init hv_common_init(void) 282 { 283 int i; 284 union hv_hypervisor_version_info version; 285 286 /* Get information about the Hyper-V host version */ 287 if (!hv_get_hypervisor_version(&version)) 288 pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", 289 version.major_version, version.minor_version, 290 version.build_number, version.service_number, 291 version.service_pack, version.service_branch); 292 293 if (hv_is_isolation_supported()) 294 sysctl_record_panic_msg = 0; 295 296 /* 297 * Hyper-V expects to get crash register data or kmsg when 298 * crash enlightment is available and system crashes. Set 299 * crash_kexec_post_notifiers to be true to make sure that 300 * calling crash enlightment interface before running kdump 301 * kernel. 302 */ 303 if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { 304 u64 hyperv_crash_ctl; 305 306 crash_kexec_post_notifiers = true; 307 pr_info("Hyper-V: enabling crash_kexec_post_notifiers\n"); 308 309 /* 310 * Panic message recording (sysctl_record_panic_msg) 311 * is enabled by default in non-isolated guests and 312 * disabled by default in isolated guests; the panic 313 * message recording won't be available in isolated 314 * guests should the following registration fail. 315 */ 316 hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table); 317 if (!hv_ctl_table_hdr) 318 pr_err("Hyper-V: sysctl table register error"); 319 320 /* 321 * Register for panic kmsg callback only if the right 322 * capability is supported by the hypervisor. 323 */ 324 hyperv_crash_ctl = hv_get_msr(HV_MSR_CRASH_CTL); 325 if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) 326 hv_kmsg_dump_register(); 327 328 register_die_notifier(&hyperv_die_report_block); 329 atomic_notifier_chain_register(&panic_notifier_list, 330 &hyperv_panic_report_block); 331 } 332 333 /* 334 * Allocate the per-CPU state for the hypercall input arg. 335 * If this allocation fails, we will not be able to setup 336 * (per-CPU) hypercall input page and thus this failure is 337 * fatal on Hyper-V. 338 */ 339 hyperv_pcpu_input_arg = alloc_percpu(void *); 340 BUG_ON(!hyperv_pcpu_input_arg); 341 342 /* Allocate the per-CPU state for output arg for root */ 343 if (hv_root_partition) { 344 hyperv_pcpu_output_arg = alloc_percpu(void *); 345 BUG_ON(!hyperv_pcpu_output_arg); 346 } 347 348 hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), 349 GFP_KERNEL); 350 if (!hv_vp_index) { 351 hv_common_free(); 352 return -ENOMEM; 353 } 354 355 for (i = 0; i < num_possible_cpus(); i++) 356 hv_vp_index[i] = VP_INVAL; 357 358 return 0; 359 } 360 361 void __init ms_hyperv_late_init(void) 362 { 363 struct acpi_table_header *header; 364 acpi_status status; 365 u8 *randomdata; 366 u32 length, i; 367 368 /* 369 * Seed the Linux random number generator with entropy provided by 370 * the Hyper-V host in ACPI table OEM0. 371 */ 372 if (!IS_ENABLED(CONFIG_ACPI)) 373 return; 374 375 status = acpi_get_table("OEM0", 0, &header); 376 if (ACPI_FAILURE(status) || !header) 377 return; 378 379 /* 380 * Since the "OEM0" table name is for OEM specific usage, verify 381 * that what we're seeing purports to be from Microsoft. 382 */ 383 if (strncmp(header->oem_table_id, "MICROSFT", 8)) 384 goto error; 385 386 /* 387 * Ensure the length is reasonable. Requiring at least 8 bytes and 388 * no more than 4K bytes is somewhat arbitrary and just protects 389 * against a malformed table. Hyper-V currently provides 64 bytes, 390 * but allow for a change in a later version. 391 */ 392 if (header->length < sizeof(*header) + 8 || 393 header->length > sizeof(*header) + SZ_4K) 394 goto error; 395 396 length = header->length - sizeof(*header); 397 randomdata = (u8 *)(header + 1); 398 399 pr_debug("Hyper-V: Seeding rng with %d random bytes from ACPI table OEM0\n", 400 length); 401 402 add_bootloader_randomness(randomdata, length); 403 404 /* 405 * To prevent the seed data from being visible in /sys/firmware/acpi, 406 * zero out the random data in the ACPI table and fixup the checksum. 407 * The zero'ing is done out of an abundance of caution in avoiding 408 * potential security risks to the rng. Similarly, reset the table 409 * length to just the header size so that a subsequent kexec doesn't 410 * try to use the zero'ed out random data. 411 */ 412 for (i = 0; i < length; i++) { 413 header->checksum += randomdata[i]; 414 randomdata[i] = 0; 415 } 416 417 for (i = 0; i < sizeof(header->length); i++) 418 header->checksum += ((u8 *)&header->length)[i]; 419 header->length = sizeof(*header); 420 for (i = 0; i < sizeof(header->length); i++) 421 header->checksum -= ((u8 *)&header->length)[i]; 422 423 error: 424 acpi_put_table(header); 425 } 426 427 /* 428 * Hyper-V specific initialization and die code for 429 * individual CPUs that is common across all architectures. 430 * Called by the CPU hotplug mechanism. 431 */ 432 433 int hv_common_cpu_init(unsigned int cpu) 434 { 435 void **inputarg, **outputarg; 436 u64 msr_vp_index; 437 gfp_t flags; 438 int pgcount = hv_root_partition ? 2 : 1; 439 void *mem; 440 int ret; 441 442 /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ 443 flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL; 444 445 inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); 446 447 /* 448 * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already 449 * allocated if this CPU was previously online and then taken offline 450 */ 451 if (!*inputarg) { 452 mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); 453 if (!mem) 454 return -ENOMEM; 455 456 if (hv_root_partition) { 457 outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); 458 *outputarg = (char *)mem + HV_HYP_PAGE_SIZE; 459 } 460 461 if (!ms_hyperv.paravisor_present && 462 (hv_isolation_type_snp() || hv_isolation_type_tdx())) { 463 ret = set_memory_decrypted((unsigned long)mem, pgcount); 464 if (ret) { 465 /* It may be unsafe to free 'mem' */ 466 return ret; 467 } 468 469 memset(mem, 0x00, pgcount * HV_HYP_PAGE_SIZE); 470 } 471 472 /* 473 * In a fully enlightened TDX/SNP VM with more than 64 VPs, if 474 * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() -> 475 * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to 476 * use hyperv_pcpu_input_arg as the hypercall input page, which 477 * must be a decrypted page in such a VM, but the page is still 478 * encrypted before set_memory_decrypted() returns. Fix this by 479 * setting *inputarg after the above set_memory_decrypted(): if 480 * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns 481 * HV_STATUS_INVALID_PARAMETER immediately, and the function 482 * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(), 483 * which may be slightly slower than the hypercall, but still 484 * works correctly in such a VM. 485 */ 486 *inputarg = mem; 487 } 488 489 msr_vp_index = hv_get_msr(HV_MSR_VP_INDEX); 490 491 hv_vp_index[cpu] = msr_vp_index; 492 493 if (msr_vp_index > hv_max_vp_index) 494 hv_max_vp_index = msr_vp_index; 495 496 return 0; 497 } 498 499 int hv_common_cpu_die(unsigned int cpu) 500 { 501 /* 502 * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory 503 * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg 504 * may be used by the Hyper-V vPCI driver in reassigning interrupts 505 * as part of the offlining process. The interrupt reassignment 506 * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and 507 * called this function. 508 * 509 * If a previously offlined CPU is brought back online again, the 510 * originally allocated memory is reused in hv_common_cpu_init(). 511 */ 512 513 return 0; 514 } 515 516 /* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */ 517 bool hv_query_ext_cap(u64 cap_query) 518 { 519 /* 520 * The address of the 'hv_extended_cap' variable will be used as an 521 * output parameter to the hypercall below and so it should be 522 * compatible with 'virt_to_phys'. Which means, it's address should be 523 * directly mapped. Use 'static' to keep it compatible; stack variables 524 * can be virtually mapped, making them incompatible with 525 * 'virt_to_phys'. 526 * Hypercall input/output addresses should also be 8-byte aligned. 527 */ 528 static u64 hv_extended_cap __aligned(8); 529 static bool hv_extended_cap_queried; 530 u64 status; 531 532 /* 533 * Querying extended capabilities is an extended hypercall. Check if the 534 * partition supports extended hypercall, first. 535 */ 536 if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS)) 537 return false; 538 539 /* Extended capabilities do not change at runtime. */ 540 if (hv_extended_cap_queried) 541 return hv_extended_cap & cap_query; 542 543 status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, 544 &hv_extended_cap); 545 546 /* 547 * The query extended capabilities hypercall should not fail under 548 * any normal circumstances. Avoid repeatedly making the hypercall, on 549 * error. 550 */ 551 hv_extended_cap_queried = true; 552 if (!hv_result_success(status)) { 553 pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n", 554 status); 555 return false; 556 } 557 558 return hv_extended_cap & cap_query; 559 } 560 EXPORT_SYMBOL_GPL(hv_query_ext_cap); 561 562 void hv_setup_dma_ops(struct device *dev, bool coherent) 563 { 564 /* 565 * Hyper-V does not offer a vIOMMU in the guest 566 * VM, so pass 0/NULL for the IOMMU settings 567 */ 568 arch_setup_dma_ops(dev, 0, 0, coherent); 569 } 570 EXPORT_SYMBOL_GPL(hv_setup_dma_ops); 571 572 bool hv_is_hibernation_supported(void) 573 { 574 return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4); 575 } 576 EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); 577 578 /* 579 * Default function to read the Hyper-V reference counter, independent 580 * of whether Hyper-V enlightened clocks/timers are being used. But on 581 * architectures where it is used, Hyper-V enlightenment code in 582 * hyperv_timer.c may override this function. 583 */ 584 static u64 __hv_read_ref_counter(void) 585 { 586 return hv_get_msr(HV_MSR_TIME_REF_COUNT); 587 } 588 589 u64 (*hv_read_reference_counter)(void) = __hv_read_ref_counter; 590 EXPORT_SYMBOL_GPL(hv_read_reference_counter); 591 592 /* These __weak functions provide default "no-op" behavior and 593 * may be overridden by architecture specific versions. Architectures 594 * for which the default "no-op" behavior is sufficient can leave 595 * them unimplemented and not be cluttered with a bunch of stub 596 * functions in arch-specific code. 597 */ 598 599 bool __weak hv_is_isolation_supported(void) 600 { 601 return false; 602 } 603 EXPORT_SYMBOL_GPL(hv_is_isolation_supported); 604 605 bool __weak hv_isolation_type_snp(void) 606 { 607 return false; 608 } 609 EXPORT_SYMBOL_GPL(hv_isolation_type_snp); 610 611 bool __weak hv_isolation_type_tdx(void) 612 { 613 return false; 614 } 615 EXPORT_SYMBOL_GPL(hv_isolation_type_tdx); 616 617 void __weak hv_setup_vmbus_handler(void (*handler)(void)) 618 { 619 } 620 EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler); 621 622 void __weak hv_remove_vmbus_handler(void) 623 { 624 } 625 EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler); 626 627 void __weak hv_setup_kexec_handler(void (*handler)(void)) 628 { 629 } 630 EXPORT_SYMBOL_GPL(hv_setup_kexec_handler); 631 632 void __weak hv_remove_kexec_handler(void) 633 { 634 } 635 EXPORT_SYMBOL_GPL(hv_remove_kexec_handler); 636 637 void __weak hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) 638 { 639 } 640 EXPORT_SYMBOL_GPL(hv_setup_crash_handler); 641 642 void __weak hv_remove_crash_handler(void) 643 { 644 } 645 EXPORT_SYMBOL_GPL(hv_remove_crash_handler); 646 647 void __weak hyperv_cleanup(void) 648 { 649 } 650 EXPORT_SYMBOL_GPL(hyperv_cleanup); 651 652 u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size) 653 { 654 return HV_STATUS_INVALID_PARAMETER; 655 } 656 EXPORT_SYMBOL_GPL(hv_ghcb_hypercall); 657 658 u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) 659 { 660 return HV_STATUS_INVALID_PARAMETER; 661 } 662 EXPORT_SYMBOL_GPL(hv_tdx_hypercall); 663