1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * Architecture neutral utility routines for interacting with 5 * Hyper-V. This file is specifically for code that must be 6 * built-in to the kernel image when CONFIG_HYPERV is set 7 * (vs. being in a module) because it is called from architecture 8 * specific code under arch/. 9 * 10 * Copyright (C) 2021, Microsoft, Inc. 11 * 12 * Author : Michael Kelley <mikelley@microsoft.com> 13 */ 14 15 #include <linux/types.h> 16 #include <linux/acpi.h> 17 #include <linux/export.h> 18 #include <linux/bitfield.h> 19 #include <linux/cpumask.h> 20 #include <linux/sched/task_stack.h> 21 #include <linux/panic_notifier.h> 22 #include <linux/ptrace.h> 23 #include <linux/kdebug.h> 24 #include <linux/kmsg_dump.h> 25 #include <linux/slab.h> 26 #include <linux/dma-map-ops.h> 27 #include <linux/set_memory.h> 28 #include <asm/hyperv-tlfs.h> 29 #include <asm/mshyperv.h> 30 31 /* 32 * hv_root_partition, ms_hyperv and hv_nested are defined here with other 33 * Hyper-V specific globals so they are shared across all architectures and are 34 * built only when CONFIG_HYPERV is defined. But on x86, 35 * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not 36 * defined, and it uses these three variables. So mark them as __weak 37 * here, allowing for an overriding definition in the module containing 38 * ms_hyperv_init_platform(). 39 */ 40 bool __weak hv_root_partition; 41 EXPORT_SYMBOL_GPL(hv_root_partition); 42 43 bool __weak hv_nested; 44 EXPORT_SYMBOL_GPL(hv_nested); 45 46 struct ms_hyperv_info __weak ms_hyperv; 47 EXPORT_SYMBOL_GPL(ms_hyperv); 48 49 u32 *hv_vp_index; 50 EXPORT_SYMBOL_GPL(hv_vp_index); 51 52 u32 hv_max_vp_index; 53 EXPORT_SYMBOL_GPL(hv_max_vp_index); 54 55 void * __percpu *hyperv_pcpu_input_arg; 56 EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); 57 58 void * __percpu *hyperv_pcpu_output_arg; 59 EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); 60 61 static void hv_kmsg_dump_unregister(void); 62 63 static struct ctl_table_header *hv_ctl_table_hdr; 64 65 /* 66 * Hyper-V specific initialization and shutdown code that is 67 * common across all architectures. Called from architecture 68 * specific initialization functions. 69 */ 70 71 void __init hv_common_free(void) 72 { 73 unregister_sysctl_table(hv_ctl_table_hdr); 74 hv_ctl_table_hdr = NULL; 75 76 if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) 77 hv_kmsg_dump_unregister(); 78 79 kfree(hv_vp_index); 80 hv_vp_index = NULL; 81 82 free_percpu(hyperv_pcpu_output_arg); 83 hyperv_pcpu_output_arg = NULL; 84 85 free_percpu(hyperv_pcpu_input_arg); 86 hyperv_pcpu_input_arg = NULL; 87 } 88 89 /* 90 * Functions for allocating and freeing memory with size and 91 * alignment HV_HYP_PAGE_SIZE. These functions are needed because 92 * the guest page size may not be the same as the Hyper-V page 93 * size. We depend upon kmalloc() aligning power-of-two size 94 * allocations to the allocation size boundary, so that the 95 * allocated memory appears to Hyper-V as a page of the size 96 * it expects. 97 */ 98 99 void *hv_alloc_hyperv_page(void) 100 { 101 BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); 102 103 if (PAGE_SIZE == HV_HYP_PAGE_SIZE) 104 return (void *)__get_free_page(GFP_KERNEL); 105 else 106 return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); 107 } 108 EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page); 109 110 void *hv_alloc_hyperv_zeroed_page(void) 111 { 112 if (PAGE_SIZE == HV_HYP_PAGE_SIZE) 113 return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 114 else 115 return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); 116 } 117 EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page); 118 119 void hv_free_hyperv_page(void *addr) 120 { 121 if (PAGE_SIZE == HV_HYP_PAGE_SIZE) 122 free_page((unsigned long)addr); 123 else 124 kfree(addr); 125 } 126 EXPORT_SYMBOL_GPL(hv_free_hyperv_page); 127 128 static void *hv_panic_page; 129 130 /* 131 * Boolean to control whether to report panic messages over Hyper-V. 132 * 133 * It can be set via /proc/sys/kernel/hyperv_record_panic_msg 134 */ 135 static int sysctl_record_panic_msg = 1; 136 137 /* 138 * sysctl option to allow the user to control whether kmsg data should be 139 * reported to Hyper-V on panic. 140 */ 141 static struct ctl_table hv_ctl_table[] = { 142 { 143 .procname = "hyperv_record_panic_msg", 144 .data = &sysctl_record_panic_msg, 145 .maxlen = sizeof(int), 146 .mode = 0644, 147 .proc_handler = proc_dointvec_minmax, 148 .extra1 = SYSCTL_ZERO, 149 .extra2 = SYSCTL_ONE 150 }, 151 {} 152 }; 153 154 static int hv_die_panic_notify_crash(struct notifier_block *self, 155 unsigned long val, void *args); 156 157 static struct notifier_block hyperv_die_report_block = { 158 .notifier_call = hv_die_panic_notify_crash, 159 }; 160 161 static struct notifier_block hyperv_panic_report_block = { 162 .notifier_call = hv_die_panic_notify_crash, 163 }; 164 165 /* 166 * The following callback works both as die and panic notifier; its 167 * goal is to provide panic information to the hypervisor unless the 168 * kmsg dumper is used [see hv_kmsg_dump()], which provides more 169 * information but isn't always available. 170 * 171 * Notice that both the panic/die report notifiers are registered only 172 * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set. 173 */ 174 static int hv_die_panic_notify_crash(struct notifier_block *self, 175 unsigned long val, void *args) 176 { 177 struct pt_regs *regs; 178 bool is_die; 179 180 /* Don't notify Hyper-V unless we have a die oops event or panic. */ 181 if (self == &hyperv_panic_report_block) { 182 is_die = false; 183 regs = current_pt_regs(); 184 } else { /* die event */ 185 if (val != DIE_OOPS) 186 return NOTIFY_DONE; 187 188 is_die = true; 189 regs = ((struct die_args *)args)->regs; 190 } 191 192 /* 193 * Hyper-V should be notified only once about a panic/die. If we will 194 * be calling hv_kmsg_dump() later with kmsg data, don't do the 195 * notification here. 196 */ 197 if (!sysctl_record_panic_msg || !hv_panic_page) 198 hyperv_report_panic(regs, val, is_die); 199 200 return NOTIFY_DONE; 201 } 202 203 /* 204 * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg 205 * buffer and call into Hyper-V to transfer the data. 206 */ 207 static void hv_kmsg_dump(struct kmsg_dumper *dumper, 208 enum kmsg_dump_reason reason) 209 { 210 struct kmsg_dump_iter iter; 211 size_t bytes_written; 212 213 /* We are only interested in panics. */ 214 if (reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg) 215 return; 216 217 /* 218 * Write dump contents to the page. No need to synchronize; panic should 219 * be single-threaded. 220 */ 221 kmsg_dump_rewind(&iter); 222 kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, 223 &bytes_written); 224 if (!bytes_written) 225 return; 226 /* 227 * P3 to contain the physical address of the panic page & P4 to 228 * contain the size of the panic data in that page. Rest of the 229 * registers are no-op when the NOTIFY_MSG flag is set. 230 */ 231 hv_set_register(HV_REGISTER_CRASH_P0, 0); 232 hv_set_register(HV_REGISTER_CRASH_P1, 0); 233 hv_set_register(HV_REGISTER_CRASH_P2, 0); 234 hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page)); 235 hv_set_register(HV_REGISTER_CRASH_P4, bytes_written); 236 237 /* 238 * Let Hyper-V know there is crash data available along with 239 * the panic message. 240 */ 241 hv_set_register(HV_REGISTER_CRASH_CTL, 242 (HV_CRASH_CTL_CRASH_NOTIFY | 243 HV_CRASH_CTL_CRASH_NOTIFY_MSG)); 244 } 245 246 static struct kmsg_dumper hv_kmsg_dumper = { 247 .dump = hv_kmsg_dump, 248 }; 249 250 static void hv_kmsg_dump_unregister(void) 251 { 252 kmsg_dump_unregister(&hv_kmsg_dumper); 253 unregister_die_notifier(&hyperv_die_report_block); 254 atomic_notifier_chain_unregister(&panic_notifier_list, 255 &hyperv_panic_report_block); 256 257 hv_free_hyperv_page(hv_panic_page); 258 hv_panic_page = NULL; 259 } 260 261 static void hv_kmsg_dump_register(void) 262 { 263 int ret; 264 265 hv_panic_page = hv_alloc_hyperv_zeroed_page(); 266 if (!hv_panic_page) { 267 pr_err("Hyper-V: panic message page memory allocation failed\n"); 268 return; 269 } 270 271 ret = kmsg_dump_register(&hv_kmsg_dumper); 272 if (ret) { 273 pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); 274 hv_free_hyperv_page(hv_panic_page); 275 hv_panic_page = NULL; 276 } 277 } 278 279 int __init hv_common_init(void) 280 { 281 int i; 282 283 if (hv_is_isolation_supported()) 284 sysctl_record_panic_msg = 0; 285 286 /* 287 * Hyper-V expects to get crash register data or kmsg when 288 * crash enlightment is available and system crashes. Set 289 * crash_kexec_post_notifiers to be true to make sure that 290 * calling crash enlightment interface before running kdump 291 * kernel. 292 */ 293 if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { 294 u64 hyperv_crash_ctl; 295 296 crash_kexec_post_notifiers = true; 297 pr_info("Hyper-V: enabling crash_kexec_post_notifiers\n"); 298 299 /* 300 * Panic message recording (sysctl_record_panic_msg) 301 * is enabled by default in non-isolated guests and 302 * disabled by default in isolated guests; the panic 303 * message recording won't be available in isolated 304 * guests should the following registration fail. 305 */ 306 hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table); 307 if (!hv_ctl_table_hdr) 308 pr_err("Hyper-V: sysctl table register error"); 309 310 /* 311 * Register for panic kmsg callback only if the right 312 * capability is supported by the hypervisor. 313 */ 314 hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL); 315 if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) 316 hv_kmsg_dump_register(); 317 318 register_die_notifier(&hyperv_die_report_block); 319 atomic_notifier_chain_register(&panic_notifier_list, 320 &hyperv_panic_report_block); 321 } 322 323 /* 324 * Allocate the per-CPU state for the hypercall input arg. 325 * If this allocation fails, we will not be able to setup 326 * (per-CPU) hypercall input page and thus this failure is 327 * fatal on Hyper-V. 328 */ 329 hyperv_pcpu_input_arg = alloc_percpu(void *); 330 BUG_ON(!hyperv_pcpu_input_arg); 331 332 /* Allocate the per-CPU state for output arg for root */ 333 if (hv_root_partition) { 334 hyperv_pcpu_output_arg = alloc_percpu(void *); 335 BUG_ON(!hyperv_pcpu_output_arg); 336 } 337 338 hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), 339 GFP_KERNEL); 340 if (!hv_vp_index) { 341 hv_common_free(); 342 return -ENOMEM; 343 } 344 345 for (i = 0; i < num_possible_cpus(); i++) 346 hv_vp_index[i] = VP_INVAL; 347 348 return 0; 349 } 350 351 /* 352 * Hyper-V specific initialization and die code for 353 * individual CPUs that is common across all architectures. 354 * Called by the CPU hotplug mechanism. 355 */ 356 357 int hv_common_cpu_init(unsigned int cpu) 358 { 359 void **inputarg, **outputarg; 360 u64 msr_vp_index; 361 gfp_t flags; 362 int pgcount = hv_root_partition ? 2 : 1; 363 void *mem; 364 int ret; 365 366 /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ 367 flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL; 368 369 inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); 370 371 /* 372 * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already 373 * allocated if this CPU was previously online and then taken offline 374 */ 375 if (!*inputarg) { 376 mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); 377 if (!mem) 378 return -ENOMEM; 379 380 if (hv_root_partition) { 381 outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); 382 *outputarg = (char *)mem + HV_HYP_PAGE_SIZE; 383 } 384 385 if (!ms_hyperv.paravisor_present && 386 (hv_isolation_type_snp() || hv_isolation_type_tdx())) { 387 ret = set_memory_decrypted((unsigned long)mem, pgcount); 388 if (ret) { 389 /* It may be unsafe to free 'mem' */ 390 return ret; 391 } 392 393 memset(mem, 0x00, pgcount * HV_HYP_PAGE_SIZE); 394 } 395 396 /* 397 * In a fully enlightened TDX/SNP VM with more than 64 VPs, if 398 * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() -> 399 * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to 400 * use hyperv_pcpu_input_arg as the hypercall input page, which 401 * must be a decrypted page in such a VM, but the page is still 402 * encrypted before set_memory_decrypted() returns. Fix this by 403 * setting *inputarg after the above set_memory_decrypted(): if 404 * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns 405 * HV_STATUS_INVALID_PARAMETER immediately, and the function 406 * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(), 407 * which may be slightly slower than the hypercall, but still 408 * works correctly in such a VM. 409 */ 410 *inputarg = mem; 411 } 412 413 msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX); 414 415 hv_vp_index[cpu] = msr_vp_index; 416 417 if (msr_vp_index > hv_max_vp_index) 418 hv_max_vp_index = msr_vp_index; 419 420 return 0; 421 } 422 423 int hv_common_cpu_die(unsigned int cpu) 424 { 425 /* 426 * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory 427 * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg 428 * may be used by the Hyper-V vPCI driver in reassigning interrupts 429 * as part of the offlining process. The interrupt reassignment 430 * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and 431 * called this function. 432 * 433 * If a previously offlined CPU is brought back online again, the 434 * originally allocated memory is reused in hv_common_cpu_init(). 435 */ 436 437 return 0; 438 } 439 440 /* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */ 441 bool hv_query_ext_cap(u64 cap_query) 442 { 443 /* 444 * The address of the 'hv_extended_cap' variable will be used as an 445 * output parameter to the hypercall below and so it should be 446 * compatible with 'virt_to_phys'. Which means, it's address should be 447 * directly mapped. Use 'static' to keep it compatible; stack variables 448 * can be virtually mapped, making them incompatible with 449 * 'virt_to_phys'. 450 * Hypercall input/output addresses should also be 8-byte aligned. 451 */ 452 static u64 hv_extended_cap __aligned(8); 453 static bool hv_extended_cap_queried; 454 u64 status; 455 456 /* 457 * Querying extended capabilities is an extended hypercall. Check if the 458 * partition supports extended hypercall, first. 459 */ 460 if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS)) 461 return false; 462 463 /* Extended capabilities do not change at runtime. */ 464 if (hv_extended_cap_queried) 465 return hv_extended_cap & cap_query; 466 467 status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, 468 &hv_extended_cap); 469 470 /* 471 * The query extended capabilities hypercall should not fail under 472 * any normal circumstances. Avoid repeatedly making the hypercall, on 473 * error. 474 */ 475 hv_extended_cap_queried = true; 476 if (!hv_result_success(status)) { 477 pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n", 478 status); 479 return false; 480 } 481 482 return hv_extended_cap & cap_query; 483 } 484 EXPORT_SYMBOL_GPL(hv_query_ext_cap); 485 486 void hv_setup_dma_ops(struct device *dev, bool coherent) 487 { 488 /* 489 * Hyper-V does not offer a vIOMMU in the guest 490 * VM, so pass 0/NULL for the IOMMU settings 491 */ 492 arch_setup_dma_ops(dev, 0, 0, NULL, coherent); 493 } 494 EXPORT_SYMBOL_GPL(hv_setup_dma_ops); 495 496 bool hv_is_hibernation_supported(void) 497 { 498 return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4); 499 } 500 EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); 501 502 /* 503 * Default function to read the Hyper-V reference counter, independent 504 * of whether Hyper-V enlightened clocks/timers are being used. But on 505 * architectures where it is used, Hyper-V enlightenment code in 506 * hyperv_timer.c may override this function. 507 */ 508 static u64 __hv_read_ref_counter(void) 509 { 510 return hv_get_register(HV_REGISTER_TIME_REF_COUNT); 511 } 512 513 u64 (*hv_read_reference_counter)(void) = __hv_read_ref_counter; 514 EXPORT_SYMBOL_GPL(hv_read_reference_counter); 515 516 /* These __weak functions provide default "no-op" behavior and 517 * may be overridden by architecture specific versions. Architectures 518 * for which the default "no-op" behavior is sufficient can leave 519 * them unimplemented and not be cluttered with a bunch of stub 520 * functions in arch-specific code. 521 */ 522 523 bool __weak hv_is_isolation_supported(void) 524 { 525 return false; 526 } 527 EXPORT_SYMBOL_GPL(hv_is_isolation_supported); 528 529 bool __weak hv_isolation_type_snp(void) 530 { 531 return false; 532 } 533 EXPORT_SYMBOL_GPL(hv_isolation_type_snp); 534 535 bool __weak hv_isolation_type_tdx(void) 536 { 537 return false; 538 } 539 EXPORT_SYMBOL_GPL(hv_isolation_type_tdx); 540 541 void __weak hv_setup_vmbus_handler(void (*handler)(void)) 542 { 543 } 544 EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler); 545 546 void __weak hv_remove_vmbus_handler(void) 547 { 548 } 549 EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler); 550 551 void __weak hv_setup_kexec_handler(void (*handler)(void)) 552 { 553 } 554 EXPORT_SYMBOL_GPL(hv_setup_kexec_handler); 555 556 void __weak hv_remove_kexec_handler(void) 557 { 558 } 559 EXPORT_SYMBOL_GPL(hv_remove_kexec_handler); 560 561 void __weak hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) 562 { 563 } 564 EXPORT_SYMBOL_GPL(hv_setup_crash_handler); 565 566 void __weak hv_remove_crash_handler(void) 567 { 568 } 569 EXPORT_SYMBOL_GPL(hv_remove_crash_handler); 570 571 void __weak hyperv_cleanup(void) 572 { 573 } 574 EXPORT_SYMBOL_GPL(hyperv_cleanup); 575 576 u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size) 577 { 578 return HV_STATUS_INVALID_PARAMETER; 579 } 580 EXPORT_SYMBOL_GPL(hv_ghcb_hypercall); 581 582 u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) 583 { 584 return HV_STATUS_INVALID_PARAMETER; 585 } 586 EXPORT_SYMBOL_GPL(hv_tdx_hypercall); 587