// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/hyperv.h>
#include <linux/random.h>
#include <linux/clockchips.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include <linux/set_memory.h>
#include "hyperv_vmbus.h"

/* The one and only */
struct hv_context hv_context;

/*
 * hv_init - Main initialization routine.
 *
 * This routine must be called before any other routines in here are called.
 */
int hv_init(void)
{
        hv_context.cpu_context = alloc_percpu(struct hv_per_cpu_context);
        if (!hv_context.cpu_context)
                return -ENOMEM;
        return 0;
}

/*
 * hv_post_message - Post a message using the hypervisor message IPC.
 *
 * This involves a hypercall.
 */
int hv_post_message(union hv_connection_id connection_id,
                    enum hv_message_type message_type,
                    void *payload, size_t payload_size)
{
        struct hv_input_post_message *aligned_msg;
        unsigned long flags;
        u64 status;

        if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
                return -EMSGSIZE;

        local_irq_save(flags);

        /*
         * A TDX VM with the paravisor must use the decrypted post_msg_page: see
         * the comment in struct hv_per_cpu_context. An SNP VM with the paravisor
         * can use the encrypted hyperv_pcpu_input_arg because it copies the
         * input into the GHCB page, which has been decrypted by the paravisor.
         */
        if (hv_isolation_type_tdx() && ms_hyperv.paravisor_present)
                aligned_msg = this_cpu_ptr(hv_context.cpu_context)->post_msg_page;
        else
                aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg);

        aligned_msg->connectionid = connection_id;
        aligned_msg->reserved = 0;
        aligned_msg->message_type = message_type;
        aligned_msg->payload_size = payload_size;
        memcpy((void *)aligned_msg->payload, payload, payload_size);

        if (ms_hyperv.paravisor_present) {
                if (hv_isolation_type_tdx())
                        status = hv_tdx_hypercall(HVCALL_POST_MESSAGE,
                                                  virt_to_phys(aligned_msg), 0);
                else if (hv_isolation_type_snp())
                        status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE,
                                                   aligned_msg, NULL,
                                                   sizeof(*aligned_msg));
                else
                        status = HV_STATUS_INVALID_PARAMETER;
        } else {
                u64 control = HVCALL_POST_MESSAGE;

                control |= hv_nested ? HV_HYPERCALL_NESTED : 0;
                status = hv_do_hypercall(control, aligned_msg, NULL);
        }

        local_irq_restore(flags);

        return hv_result(status);
}

int hv_synic_alloc(void)
{
        int cpu, ret = -ENOMEM;
        struct hv_per_cpu_context *hv_cpu;

        /*
         * First, zero all per-cpu memory areas so hv_synic_free() can
         * detect what memory has been allocated and clean up properly
         * after any failures.
         */
        for_each_present_cpu(cpu) {
                hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
                memset(hv_cpu, 0, sizeof(*hv_cpu));
        }

        hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
                                         GFP_KERNEL);
        if (!hv_context.hv_numa_map) {
                pr_err("Unable to allocate NUMA map\n");
                goto err;
        }

        for_each_present_cpu(cpu) {
                hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);

                tasklet_init(&hv_cpu->msg_dpc,
                             vmbus_on_msg_dpc, (unsigned long)hv_cpu);

                if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
                        hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
                        if (!hv_cpu->post_msg_page) {
                                pr_err("Unable to allocate post msg page\n");
                                goto err;
                        }

                        ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1);
                        if (ret) {
                                pr_err("Failed to decrypt post msg page: %d\n", ret);
                                /* Just leak the page, as it's unsafe to free the page. */
                                hv_cpu->post_msg_page = NULL;
                                goto err;
                        }

                        memset(hv_cpu->post_msg_page, 0, PAGE_SIZE);
                }

                /*
                 * SynIC message and event pages are allocated by the paravisor.
                 * Skip allocating these pages here.
                 */
                if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
                        hv_cpu->synic_message_page =
                                (void *)get_zeroed_page(GFP_ATOMIC);
                        if (!hv_cpu->synic_message_page) {
                                pr_err("Unable to allocate SYNIC message page\n");
                                goto err;
                        }

                        hv_cpu->synic_event_page =
                                (void *)get_zeroed_page(GFP_ATOMIC);
                        if (!hv_cpu->synic_event_page) {
                                pr_err("Unable to allocate SYNIC event page\n");

                                free_page((unsigned long)hv_cpu->synic_message_page);
                                hv_cpu->synic_message_page = NULL;
                                goto err;
                        }
                }

                if (!ms_hyperv.paravisor_present &&
                    (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
                        ret = set_memory_decrypted((unsigned long)
                                hv_cpu->synic_message_page, 1);
                        if (ret) {
                                pr_err("Failed to decrypt SYNIC msg page: %d\n", ret);
                                hv_cpu->synic_message_page = NULL;

                                /*
                                 * Free the event page here so that hv_synic_free()
                                 * won't later try to re-encrypt it.
                                 */
                                free_page((unsigned long)hv_cpu->synic_event_page);
                                hv_cpu->synic_event_page = NULL;
                                goto err;
                        }

                        ret = set_memory_decrypted((unsigned long)
                                hv_cpu->synic_event_page, 1);
                        if (ret) {
                                pr_err("Failed to decrypt SYNIC event page: %d\n", ret);
                                hv_cpu->synic_event_page = NULL;
                                goto err;
                        }

                        memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
                        memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
                }
        }

        return 0;

err:
        /*
         * Any memory allocations that succeeded will be freed when
         * the caller cleans up by calling hv_synic_free().
         */
        return ret;
}

/*
 * hv_synic_free - Free the per-cpu pages allocated in hv_synic_alloc().
 *
 * Pages that were shared with the hypervisor are re-encrypted first; if
 * re-encryption fails, the page is intentionally leaked rather than freed.
 */
void hv_synic_free(void)
{
        int cpu, ret;

        for_each_present_cpu(cpu) {
                struct hv_per_cpu_context *hv_cpu =
                        per_cpu_ptr(hv_context.cpu_context, cpu);

                /* It's better to leak the page if the encryption fails. */
                if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
                        if (hv_cpu->post_msg_page) {
                                ret = set_memory_encrypted((unsigned long)
                                        hv_cpu->post_msg_page, 1);
                                if (ret) {
                                        pr_err("Failed to encrypt post msg page: %d\n", ret);
                                        hv_cpu->post_msg_page = NULL;
                                }
                        }
                }

                if (!ms_hyperv.paravisor_present &&
                    (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
                        if (hv_cpu->synic_message_page) {
                                ret = set_memory_encrypted((unsigned long)
                                        hv_cpu->synic_message_page, 1);
                                if (ret) {
                                        pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
                                        hv_cpu->synic_message_page = NULL;
                                }
                        }

                        if (hv_cpu->synic_event_page) {
                                ret = set_memory_encrypted((unsigned long)
                                        hv_cpu->synic_event_page, 1);
                                if (ret) {
                                        pr_err("Failed to encrypt SYNIC event page: %d\n", ret);
                                        hv_cpu->synic_event_page = NULL;
                                }
                        }
                }

                free_page((unsigned long)hv_cpu->post_msg_page);
                free_page((unsigned long)hv_cpu->synic_event_page);
                free_page((unsigned long)hv_cpu->synic_message_page);
        }

        kfree(hv_context.hv_numa_map);
}

/*
 * hv_synic_init - Initialize the Synthetic Interrupt Controller.
 *
 * If it is already initialized by another entity (i.e., the x2v shim), we
 * need to retrieve the initialized message and event pages. Otherwise, we
 * create and initialize the message and event pages.
 */
void hv_synic_enable_regs(unsigned int cpu)
{
        struct hv_per_cpu_context *hv_cpu =
                per_cpu_ptr(hv_context.cpu_context, cpu);
        union hv_synic_simp simp;
        union hv_synic_siefp siefp;
        union hv_synic_sint shared_sint;
        union hv_synic_scontrol sctrl;

        /* Set up the SynIC message page */
        simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
        simp.simp_enabled = 1;

        if (ms_hyperv.paravisor_present || hv_root_partition()) {
                /* Mask out the vTOM bit. ioremap_cache() maps decrypted. */
                u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
                                ~ms_hyperv.shared_gpa_boundary;
                hv_cpu->synic_message_page =
                        (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
                if (!hv_cpu->synic_message_page)
                        pr_err("Failed to map SynIC message page.\n");
        } else {
                simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
                        >> HV_HYP_PAGE_SHIFT;
        }

        hv_set_msr(HV_MSR_SIMP, simp.as_uint64);

        /* Set up the SynIC event page */
        siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
        siefp.siefp_enabled = 1;

        if (ms_hyperv.paravisor_present || hv_root_partition()) {
                /* Mask out the vTOM bit. ioremap_cache() maps decrypted. */
                u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
                                ~ms_hyperv.shared_gpa_boundary;
                hv_cpu->synic_event_page =
                        (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
                if (!hv_cpu->synic_event_page)
                        pr_err("Failed to map SynIC event page.\n");
        } else {
                siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
                        >> HV_HYP_PAGE_SHIFT;
        }

        hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);

        /* Set up the shared SINT. */
        if (vmbus_irq != -1)
                enable_percpu_irq(vmbus_irq, 0);
        shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);

        shared_sint.vector = vmbus_interrupt;
        shared_sint.masked = false;
        shared_sint.auto_eoi = hv_recommend_using_aeoi();
        hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);

        /* Enable the global synic bit */
        sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
        sctrl.enable = 1;

        hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
}

int hv_synic_init(unsigned int cpu)
{
        hv_synic_enable_regs(cpu);

        hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT);

        return 0;
}

void hv_synic_disable_regs(unsigned int cpu)
{
        struct hv_per_cpu_context *hv_cpu =
                per_cpu_ptr(hv_context.cpu_context, cpu);
        union hv_synic_sint shared_sint;
        union hv_synic_simp simp;
        union hv_synic_siefp siefp;
        union hv_synic_scontrol sctrl;

        shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);

        shared_sint.masked = 1;

        /* Need to correctly clean up in the case of SMP!!! */
        /* Disable the interrupt */
        hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);

        simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
        /*
         * In an isolation VM, the SIMP and SIEFP pages are allocated by the
         * paravisor. These pages will also be used by the kdump kernel, so
         * just reset the enable bit here and keep the page addresses.
         */
        simp.simp_enabled = 0;
        if (ms_hyperv.paravisor_present || hv_root_partition()) {
                iounmap(hv_cpu->synic_message_page);
                hv_cpu->synic_message_page = NULL;
        } else {
                simp.base_simp_gpa = 0;
        }

        hv_set_msr(HV_MSR_SIMP, simp.as_uint64);

        siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
        siefp.siefp_enabled = 0;

        if (ms_hyperv.paravisor_present || hv_root_partition()) {
                iounmap(hv_cpu->synic_event_page);
                hv_cpu->synic_event_page = NULL;
        } else {
                siefp.base_siefp_gpa = 0;
        }

        hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);

        /* Disable the global synic bit */
        sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
        sctrl.enable = 0;
        hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);

        if (vmbus_irq != -1)
                disable_percpu_irq(vmbus_irq);
}

#define HV_MAX_TRIES 3
/*
 * Scan the event flags page of 'this' CPU looking for any bit that is set. If we find one
 * bit set, then wait for a few milliseconds. Repeat these steps a maximum of HV_MAX_TRIES
 * times. Return 'true' if any bit is still set after this operation; 'false' otherwise.
 *
 * If a bit is set, that means there is a pending channel interrupt. The expectation is
 * that the normal interrupt handling mechanism will find and process the channel interrupt
 * "very soon", and in the process clear the bit.
 */
static bool hv_synic_event_pending(void)
{
        struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
        union hv_synic_event_flags *event =
                (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
        unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
        bool pending;
        u32 relid;
        int tries = 0;

retry:
        pending = false;
        for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
                /* Special case - VMBus channel protocol messages */
                if (relid == 0)
                        continue;
                pending = true;
                break;
        }
        if (pending && tries++ < HV_MAX_TRIES) {
                usleep_range(10000, 20000);
                goto retry;
        }
        return pending;
}

/*
 * Rebind 'channel' to a new target CPU, starting the search at a random
 * online CPU and falling back to VMBUS_CONNECT_CPU if nothing else works.
 */
static int hv_pick_new_cpu(struct vmbus_channel *channel)
{
        int ret = -EBUSY;
        int start;
        int cpu;

        lockdep_assert_cpus_held();
        lockdep_assert_held(&vmbus_connection.channel_mutex);

        /*
         * On older versions of Hyper-V, we can't assume that the relevant
         * interrupts will be sent before the CPU is offlined.
         */
        if (vmbus_proto_version < VERSION_WIN10_V5_3)
                return -EBUSY;

        start = get_random_u32_below(nr_cpu_ids);

        for_each_cpu_wrap(cpu, cpu_online_mask, start) {
                if (channel->target_cpu == cpu ||
                    channel->target_cpu == VMBUS_CONNECT_CPU)
                        continue;

                ret = vmbus_channel_set_cpu(channel, cpu);
                if (!ret)
                        break;
        }

        if (ret)
                ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU);

        return ret;
}

/*
 * hv_synic_cleanup - Cleanup routine for hv_synic_init().
 */
int hv_synic_cleanup(unsigned int cpu)
{
        struct vmbus_channel *channel, *sc;
        int ret = 0;

        if (vmbus_connection.conn_state != CONNECTED)
                goto always_cleanup;

        /*
         * Hyper-V does not provide a way to change the connect CPU once
         * it is set; we must prevent the connect CPU from going offline
         * while the VM is running normally. But in the panic or kexec()
         * path where the vmbus is already disconnected, the CPU must be
         * allowed to shut down.
         */
        if (cpu == VMBUS_CONNECT_CPU)
                return -EBUSY;

        /*
         * Search for channels which are bound to the CPU we're about to
         * clean up.
         */
        mutex_lock(&vmbus_connection.channel_mutex);
        list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
                if (channel->target_cpu == cpu) {
                        ret = hv_pick_new_cpu(channel);
                        if (ret) {
                                mutex_unlock(&vmbus_connection.channel_mutex);
                                return ret;
                        }
                }
                list_for_each_entry(sc, &channel->sc_list, sc_list) {
                        if (sc->target_cpu == cpu) {
                                ret = hv_pick_new_cpu(sc);
                                if (ret) {
                                        mutex_unlock(&vmbus_connection.channel_mutex);
                                        return ret;
                                }
                        }
                }
        }
        mutex_unlock(&vmbus_connection.channel_mutex);

        /*
         * Scan the event flags page looking for bits that are set, and wait
         * with a timeout for vmbus_chan_sched() to process such bits. If bits
         * are still set after this operation and VMBus is connected, fail the
         * CPU offlining operation.
         */
        if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
                return -EBUSY;

always_cleanup:
        hv_stimer_legacy_cleanup(cpu);

        hv_synic_disable_regs(cpu);

        return ret;
}