// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 *   K. Y. Srinivasan <kys@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/acpi.h>
#include <linux/completion.h>
#include <linux/hyperv.h>
#include <linux/kernel_stat.h>
#include <linux/of_address.h>
#include <linux/clockchips.h>
#include <linux/cpu.h>
#include <linux/sched/isolation.h>
#include <linux/sched/task_stack.h>

#include <linux/delay.h>
#include <linux/panic_notifier.h>
#include <linux/ptrace.h>
#include <linux/screen_info.h>
#include <linux/efi.h>
#include <linux/random.h>
#include <linux/kernel.h>
#include <linux/syscore_ops.h>
#include <linux/dma-map-ops.h>
#include <linux/pci.h>
#include <linux/export.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"

struct vmbus_dynid {
	struct list_head node;
	struct hv_vmbus_device_id id;
};

/* VMBus Root Device */
static struct device *vmbus_root_device;

static int hyperv_cpuhp_online;

static long __percpu *vmbus_evt;

/* Values parsed from ACPI DSDT */
int vmbus_irq;
int vmbus_interrupt;

/*
 * If the Confidential VMBus is used, the data on the "wire" is not
 * visible to either the host or the hypervisor.
 */
static bool is_confidential;

bool vmbus_is_confidential(void)
{
	return is_confidential;
}
EXPORT_SYMBOL_GPL(vmbus_is_confidential);

/*
 * The panic notifier below is responsible solely for unloading the
 * vmbus connection, which is necessary in a panic event.
 *
 * Note that an intricate relation exists between this notifier and the
 * Hyper-V framebuffer panic notifier: the vmbus connection must still be
 * alive when the latter runs, so the two are ordered against each other
 * [see hvfb_on_panic()] using the notifiers' priorities.
 */
static int hv_panic_vmbus_unload(struct notifier_block *nb, unsigned long val,
				 void *args)
{
	vmbus_initiate_unload(true);
	return NOTIFY_DONE;
}
static struct notifier_block hyperv_panic_vmbus_unload_block = {
	.notifier_call	= hv_panic_vmbus_unload,
	.priority	= INT_MIN + 1, /* almost the latest one to execute */
};

static const char *fb_mmio_name = "fb_range";
static struct resource *fb_mmio;
static struct resource *hyperv_mmio;
static DEFINE_MUTEX(hyperv_mmio_lock);

struct device *hv_get_vmbus_root_device(void)
{
	return vmbus_root_device;
}
EXPORT_SYMBOL_GPL(hv_get_vmbus_root_device);

static int vmbus_exists(void)
{
	if (vmbus_root_device == NULL)
		return -ENODEV;

	return 0;
}

static u8 channel_monitor_group(const struct vmbus_channel *channel)
{
	return (u8)channel->offermsg.monitorid / 32;
}

static u8 channel_monitor_offset(const struct vmbus_channel *channel)
{
	return (u8)channel->offermsg.monitorid % 32;
}

static u32 channel_pending(const struct vmbus_channel *channel,
			   const struct hv_monitor_page *monitor_page)
{
	u8 monitor_group = channel_monitor_group(channel);

	return monitor_page->trigger_group[monitor_group].pending;
}

static u32 channel_latency(const struct vmbus_channel *channel,
			   const struct hv_monitor_page *monitor_page)
{
	u8 monitor_group = channel_monitor_group(channel);
	u8 monitor_offset = channel_monitor_offset(channel);

	return monitor_page->latency[monitor_group][monitor_offset];
}

static u32 channel_conn_id(struct vmbus_channel *channel,
			   struct hv_monitor_page *monitor_page)
{
	u8 monitor_group = channel_monitor_group(channel);
	u8 monitor_offset = channel_monitor_offset(channel);

	return monitor_page->parameter[monitor_group][monitor_offset].connectionid.u.id;
}
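/*
 * Worked example of the monitor-ID arithmetic above (illustrative only,
 * not part of the driver): a channel with offermsg.monitorid == 75 falls
 * in trigger group 75 / 32 == 2 at bit/slot 75 % 32 == 11, so e.g. its
 * latency value lives at monitor_page->latency[2][11].
 */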
static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr,
		       char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.child_relid);
}
static DEVICE_ATTR_RO(id);

static ssize_t state_show(struct device *dev, struct device_attribute *dev_attr,
			  char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", hv_dev->channel->state);
}
static DEVICE_ATTR_RO(state);

static ssize_t monitor_id_show(struct device *dev,
			       struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.monitorid);
}
static DEVICE_ATTR_RO(monitor_id);

static ssize_t class_id_show(struct device *dev,
			     struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "{%pUl}\n",
			  &hv_dev->channel->offermsg.offer.if_type);
}
static DEVICE_ATTR_RO(class_id);

static ssize_t device_id_show(struct device *dev,
			      struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "{%pUl}\n",
			  &hv_dev->channel->offermsg.offer.if_instance);
}
static DEVICE_ATTR_RO(device_id);

static ssize_t modalias_show(struct device *dev,
			     struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	return sysfs_emit(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type);
}
static DEVICE_ATTR_RO(modalias);

#ifdef CONFIG_NUMA
static ssize_t numa_node_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;

	return sysfs_emit(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu));
}
static DEVICE_ATTR_RO(numa_node);
#endif

static ssize_t server_monitor_pending_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel,
			  vmbus_connection.monitor_pages[0]));
}
static DEVICE_ATTR_RO(server_monitor_pending);

static ssize_t client_monitor_pending_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel,
			  vmbus_connection.monitor_pages[1]));
}
static DEVICE_ATTR_RO(client_monitor_pending);

static ssize_t server_monitor_latency_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel,
			  vmbus_connection.monitor_pages[0]));
}
static DEVICE_ATTR_RO(server_monitor_latency);

static ssize_t client_monitor_latency_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel,
			  vmbus_connection.monitor_pages[1]));
}
static DEVICE_ATTR_RO(client_monitor_latency);

static ssize_t server_monitor_conn_id_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel,
			  vmbus_connection.monitor_pages[0]));
}
static DEVICE_ATTR_RO(server_monitor_conn_id);

static ssize_t client_monitor_conn_id_show(struct device *dev,
					   struct device_attribute *dev_attr,
					   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	if (!hv_dev->channel)
		return -ENODEV;
	return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel,
			  vmbus_connection.monitor_pages[1]));
}
static DEVICE_ATTR_RO(client_monitor_conn_id);
static ssize_t out_intr_mask_show(struct device *dev,
				  struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info outbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
					  &outbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", outbound.current_interrupt_mask);
}
static DEVICE_ATTR_RO(out_intr_mask);

static ssize_t out_read_index_show(struct device *dev,
				   struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info outbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
					  &outbound);
	if (ret < 0)
		return ret;
	return sysfs_emit(buf, "%u\n", outbound.current_read_index);
}
static DEVICE_ATTR_RO(out_read_index);

static ssize_t out_write_index_show(struct device *dev,
				    struct device_attribute *dev_attr,
				    char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info outbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
					  &outbound);
	if (ret < 0)
		return ret;
	return sysfs_emit(buf, "%u\n", outbound.current_write_index);
}
static DEVICE_ATTR_RO(out_write_index);

static ssize_t out_read_bytes_avail_show(struct device *dev,
					 struct device_attribute *dev_attr,
					 char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info outbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
					  &outbound);
	if (ret < 0)
		return ret;
	return sysfs_emit(buf, "%d\n", outbound.bytes_avail_toread);
}
static DEVICE_ATTR_RO(out_read_bytes_avail);

static ssize_t out_write_bytes_avail_show(struct device *dev,
					  struct device_attribute *dev_attr,
					  char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info outbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->outbound,
					  &outbound);
	if (ret < 0)
		return ret;
	return sysfs_emit(buf, "%d\n", outbound.bytes_avail_towrite);
}
static DEVICE_ATTR_RO(out_write_bytes_avail);

static ssize_t in_intr_mask_show(struct device *dev,
				 struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info inbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", inbound.current_interrupt_mask);
}
static DEVICE_ATTR_RO(in_intr_mask);

static ssize_t in_read_index_show(struct device *dev,
				  struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info inbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", inbound.current_read_index);
}
static DEVICE_ATTR_RO(in_read_index);

static ssize_t in_write_index_show(struct device *dev,
				   struct device_attribute *dev_attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info inbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", inbound.current_write_index);
}
static DEVICE_ATTR_RO(in_write_index);

static ssize_t in_read_bytes_avail_show(struct device *dev,
					struct device_attribute *dev_attr,
					char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info inbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", inbound.bytes_avail_toread);
}
static DEVICE_ATTR_RO(in_read_bytes_avail);

static ssize_t in_write_bytes_avail_show(struct device *dev,
					 struct device_attribute *dev_attr,
					 char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct hv_ring_buffer_debug_info inbound;
	int ret;

	if (!hv_dev->channel)
		return -ENODEV;

	ret = hv_ringbuffer_get_debuginfo(&hv_dev->channel->inbound, &inbound);
	if (ret < 0)
		return ret;

	return sysfs_emit(buf, "%d\n", inbound.bytes_avail_towrite);
}
static DEVICE_ATTR_RO(in_write_bytes_avail);
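/*
 * Illustrative only: the ring-buffer debug attributes above appear in the
 * per-device sysfs directory and can be sampled from userspace, e.g.
 * (the instance GUID in the path is hypothetical):
 *
 *   $ cd /sys/bus/vmbus/devices/<instance-guid>
 *   $ cat out_read_index out_write_index in_read_bytes_avail
 */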
static ssize_t channel_vp_mapping_show(struct device *dev,
				       struct device_attribute *dev_attr,
				       char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	struct vmbus_channel *channel = hv_dev->channel, *cur_sc;
	int n_written;
	struct list_head *cur;

	if (!channel)
		return -ENODEV;

	mutex_lock(&vmbus_connection.channel_mutex);

	n_written = sysfs_emit(buf, "%u:%u\n",
			       channel->offermsg.child_relid,
			       channel->target_cpu);

	list_for_each(cur, &channel->sc_list) {

		cur_sc = list_entry(cur, struct vmbus_channel, sc_list);
		n_written += sysfs_emit_at(buf, n_written, "%u:%u\n",
					   cur_sc->offermsg.child_relid,
					   cur_sc->target_cpu);
	}

	mutex_unlock(&vmbus_connection.channel_mutex);

	return n_written;
}
static DEVICE_ATTR_RO(channel_vp_mapping);

static ssize_t vendor_show(struct device *dev,
			   struct device_attribute *dev_attr,
			   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	return sysfs_emit(buf, "0x%x\n", hv_dev->vendor_id);
}
static DEVICE_ATTR_RO(vendor);

static ssize_t device_show(struct device *dev,
			   struct device_attribute *dev_attr,
			   char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);

	return sysfs_emit(buf, "0x%x\n", hv_dev->device_id);
}
static DEVICE_ATTR_RO(device);

static ssize_t driver_override_store(struct device *dev,
				     struct device_attribute *attr,
				     const char *buf, size_t count)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	int ret;

	ret = driver_set_override(dev, &hv_dev->driver_override, buf, count);
	if (ret)
		return ret;

	return count;
}

static ssize_t driver_override_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct hv_device *hv_dev = device_to_hv_device(dev);
	ssize_t len;

	device_lock(dev);
	len = sysfs_emit(buf, "%s\n", hv_dev->driver_override);
	device_unlock(dev);

	return len;
}
static DEVICE_ATTR_RW(driver_override);
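/*
 * Illustrative only: like the equivalent PCI mechanism, driver_override
 * forces a device to bind to one specific driver. A hypothetical sequence
 * steering a device to uio_hv_generic might look like:
 *
 *   $ echo uio_hv_generic > /sys/bus/vmbus/devices/<instance-guid>/driver_override
 *   $ echo <instance-guid> > /sys/bus/vmbus/drivers/hv_netvsc/unbind
 *   $ echo <instance-guid> > /sys/bus/vmbus/drivers/uio_hv_generic/bind
 */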
/* Set up per device attributes in /sys/bus/vmbus/devices/<bus device> */
static struct attribute *vmbus_dev_attrs[] = {
	&dev_attr_id.attr,
	&dev_attr_state.attr,
	&dev_attr_monitor_id.attr,
	&dev_attr_class_id.attr,
	&dev_attr_device_id.attr,
	&dev_attr_modalias.attr,
#ifdef CONFIG_NUMA
	&dev_attr_numa_node.attr,
#endif
	&dev_attr_server_monitor_pending.attr,
	&dev_attr_client_monitor_pending.attr,
	&dev_attr_server_monitor_latency.attr,
	&dev_attr_client_monitor_latency.attr,
	&dev_attr_server_monitor_conn_id.attr,
	&dev_attr_client_monitor_conn_id.attr,
	&dev_attr_out_intr_mask.attr,
	&dev_attr_out_read_index.attr,
	&dev_attr_out_write_index.attr,
	&dev_attr_out_read_bytes_avail.attr,
	&dev_attr_out_write_bytes_avail.attr,
	&dev_attr_in_intr_mask.attr,
	&dev_attr_in_read_index.attr,
	&dev_attr_in_write_index.attr,
	&dev_attr_in_read_bytes_avail.attr,
	&dev_attr_in_write_bytes_avail.attr,
	&dev_attr_channel_vp_mapping.attr,
	&dev_attr_vendor.attr,
	&dev_attr_device.attr,
	&dev_attr_driver_override.attr,
	NULL,
};

/*
 * Device-level attribute_group callback function. Returns the permission for
 * each attribute, and returns 0 if an attribute is not visible.
 */
static umode_t vmbus_dev_attr_is_visible(struct kobject *kobj,
					 struct attribute *attr, int idx)
{
	struct device *dev = kobj_to_dev(kobj);
	const struct hv_device *hv_dev = device_to_hv_device(dev);

	/* Hide the monitor attributes if the monitor mechanism is not used. */
	if (!hv_dev->channel->offermsg.monitor_allocated &&
	    (attr == &dev_attr_monitor_id.attr ||
	     attr == &dev_attr_server_monitor_pending.attr ||
	     attr == &dev_attr_client_monitor_pending.attr ||
	     attr == &dev_attr_server_monitor_latency.attr ||
	     attr == &dev_attr_client_monitor_latency.attr ||
	     attr == &dev_attr_server_monitor_conn_id.attr ||
	     attr == &dev_attr_client_monitor_conn_id.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group vmbus_dev_group = {
	.attrs = vmbus_dev_attrs,
	.is_visible = vmbus_dev_attr_is_visible
};
__ATTRIBUTE_GROUPS(vmbus_dev);

/* Set up the attribute for /sys/bus/vmbus/hibernation */
static ssize_t hibernation_show(const struct bus_type *bus, char *buf)
{
	return sprintf(buf, "%d\n", !!hv_is_hibernation_supported());
}

static BUS_ATTR_RO(hibernation);

static struct attribute *vmbus_bus_attrs[] = {
	&bus_attr_hibernation.attr,
	NULL,
};
static const struct attribute_group vmbus_bus_group = {
	.attrs = vmbus_bus_attrs,
};
__ATTRIBUTE_GROUPS(vmbus_bus);

/*
 * vmbus_uevent - add uevent for our device
 *
 * This routine is invoked when a device is added or removed on the vmbus to
 * generate a uevent to udev in the userspace. udev will then match its rules
 * against the uevent generated here to load the appropriate driver.
 *
 * The alias string will be of the form vmbus:guid where guid is the string
 * representation of the device guid (each byte of the guid will be
 * represented with two hex characters).
 */
static int vmbus_uevent(const struct device *device, struct kobj_uevent_env *env)
{
	const struct hv_device *dev = device_to_hv_device(device);
	const char *format = "MODALIAS=vmbus:%*phN";

	return add_uevent_var(env, format, UUID_SIZE, &dev->dev_type);
}
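/*
 * Illustrative only: for the well-known synthetic network class GUID
 * f8615163-df3e-46c5-913f-f2d2f965ed0e, dumping the raw guid_t bytes with
 * %*phN would produce a uevent string along the lines of
 *
 *   MODALIAS=vmbus:635161f83edfc546913ff2d2f965ed0e
 *
 * which udev then matches against the alias table generated from each
 * driver's MODULE_DEVICE_TABLE(vmbus, ...).
 */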
static const struct hv_vmbus_device_id *
hv_vmbus_dev_match(const struct hv_vmbus_device_id *id, const guid_t *guid)
{
	if (id == NULL)
		return NULL; /* empty device table */

	for (; !guid_is_null(&id->guid); id++)
		if (guid_equal(&id->guid, guid))
			return id;

	return NULL;
}

static const struct hv_vmbus_device_id *
hv_vmbus_dynid_match(struct hv_driver *drv, const guid_t *guid)
{
	const struct hv_vmbus_device_id *id = NULL;
	struct vmbus_dynid *dynid;

	spin_lock(&drv->dynids.lock);
	list_for_each_entry(dynid, &drv->dynids.list, node) {
		if (guid_equal(&dynid->id.guid, guid)) {
			id = &dynid->id;
			break;
		}
	}
	spin_unlock(&drv->dynids.lock);

	return id;
}

static const struct hv_vmbus_device_id vmbus_device_null;

/*
 * Return a matching hv_vmbus_device_id pointer.
 * If there is no match, return NULL.
 */
static const struct hv_vmbus_device_id *hv_vmbus_get_id(const struct hv_driver *drv,
							struct hv_device *dev)
{
	const guid_t *guid = &dev->dev_type;
	const struct hv_vmbus_device_id *id;

	/* When driver_override is set, only bind to the matching driver */
	if (dev->driver_override && strcmp(dev->driver_override, drv->name))
		return NULL;

	/* Look at the dynamic ids first, before the static ones */
	id = hv_vmbus_dynid_match((struct hv_driver *)drv, guid);
	if (!id)
		id = hv_vmbus_dev_match(drv->id_table, guid);

	/* driver_override will always match, send a dummy id */
	if (!id && dev->driver_override)
		id = &vmbus_device_null;

	return id;
}

/* vmbus_add_dynid - add a new device ID to this driver and re-probe devices
 *
 * This function can race with vmbus_device_register(). This function is
 * typically running on a user thread in response to writing to the "new_id"
 * sysfs entry for a driver. vmbus_device_register() is running on a
 * workqueue thread in response to the Hyper-V host offering a device to the
 * guest. This function calls driver_attach(), which looks for an existing
 * device matching the new id, and attaches the driver to which the new id
 * has been assigned. vmbus_device_register() calls device_register(), which
 * looks for a driver that matches the device being registered. If both
 * operations are running simultaneously, the device driver probe function runs
 * on whichever thread establishes the linkage between the driver and device.
 *
 * In most cases, it doesn't matter which thread runs the driver probe
 * function. But if vmbus_device_register() does not find a matching driver,
 * it proceeds to create the "channels" subdirectory and numbered per-channel
 * subdirectory in sysfs. While that multi-step creation is in progress, this
 * function could run the driver probe function. If the probe function checks
 * for, or operates on, entries in the "channels" subdirectory, including by
 * calling hv_create_ring_sysfs(), the operation may or may not succeed
 * depending on the race. The race can't create a kernel failure in VMBus
 * or device subsystem code, but probe functions in VMBus drivers doing such
 * operations must be prepared for the failure case.
 */
static int vmbus_add_dynid(struct hv_driver *drv, guid_t *guid)
{
	struct vmbus_dynid *dynid;

	dynid = kzalloc(sizeof(*dynid), GFP_KERNEL);
	if (!dynid)
		return -ENOMEM;

	dynid->id.guid = *guid;

	spin_lock(&drv->dynids.lock);
	list_add_tail(&dynid->node, &drv->dynids.list);
	spin_unlock(&drv->dynids.lock);

	return driver_attach(&drv->driver);
}

static void vmbus_free_dynids(struct hv_driver *drv)
{
	struct vmbus_dynid *dynid, *n;

	spin_lock(&drv->dynids.lock);
	list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
		list_del(&dynid->node);
		kfree(dynid);
	}
	spin_unlock(&drv->dynids.lock);
}

/*
 * new_id_store - sysfs frontend to vmbus_add_dynid()
 *
 * Allow GUIDs to be added to an existing driver via sysfs.
 */
static ssize_t new_id_store(struct device_driver *driver, const char *buf,
			    size_t count)
{
	struct hv_driver *drv = drv_to_hv_drv(driver);
	guid_t guid;
	ssize_t retval;

	retval = guid_parse(buf, &guid);
	if (retval)
		return retval;

	if (hv_vmbus_dynid_match(drv, &guid))
		return -EEXIST;

	retval = vmbus_add_dynid(drv, &guid);
	if (retval)
		return retval;
	return count;
}
static DRIVER_ATTR_WO(new_id);

/*
 * remove_id_store - remove a device GUID from this driver
 *
 * Removes a dynamic device GUID from this driver.
 */
static ssize_t remove_id_store(struct device_driver *driver, const char *buf,
			       size_t count)
{
	struct hv_driver *drv = drv_to_hv_drv(driver);
	struct vmbus_dynid *dynid, *n;
	guid_t guid;
	ssize_t retval;

	retval = guid_parse(buf, &guid);
	if (retval)
		return retval;

	retval = -ENODEV;
	spin_lock(&drv->dynids.lock);
	list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
		struct hv_vmbus_device_id *id = &dynid->id;

		if (guid_equal(&id->guid, &guid)) {
			list_del(&dynid->node);
			kfree(dynid);
			retval = count;
			break;
		}
	}
	spin_unlock(&drv->dynids.lock);

	return retval;
}
static DRIVER_ATTR_WO(remove_id);
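/*
 * Illustrative only: new_id/remove_id let userspace extend a driver's GUID
 * table at runtime, e.g. to hand a device class to uio_hv_generic (the
 * GUID below is just an example value):
 *
 *   $ echo "f8615163-df3e-46c5-913f-f2d2f965ed0e" \
 *       > /sys/bus/vmbus/drivers/uio_hv_generic/new_id
 */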
static struct attribute *vmbus_drv_attrs[] = {
	&driver_attr_new_id.attr,
	&driver_attr_remove_id.attr,
	NULL,
};
ATTRIBUTE_GROUPS(vmbus_drv);


/*
 * vmbus_match - Attempt to match the specified device to the specified driver
 */
static int vmbus_match(struct device *device, const struct device_driver *driver)
{
	const struct hv_driver *drv = drv_to_hv_drv(driver);
	struct hv_device *hv_dev = device_to_hv_device(device);

	/* The hv_sock driver handles all hv_sock offers. */
	if (is_hvsock_channel(hv_dev->channel))
		return drv->hvsock;

	if (hv_vmbus_get_id(drv, hv_dev))
		return 1;

	return 0;
}

/*
 * vmbus_probe - Add the new vmbus child device
 */
static int vmbus_probe(struct device *child_device)
{
	int ret = 0;
	struct hv_driver *drv =
			drv_to_hv_drv(child_device->driver);
	struct hv_device *dev = device_to_hv_device(child_device);
	const struct hv_vmbus_device_id *dev_id;

	dev_id = hv_vmbus_get_id(drv, dev);
	if (drv->probe) {
		ret = drv->probe(dev, dev_id);
		if (ret != 0)
			pr_err("probe failed for device %s (%d)\n",
			       dev_name(child_device), ret);

	} else {
		pr_err("probe not set for driver %s\n",
		       dev_name(child_device));
		ret = -ENODEV;
	}
	return ret;
}

/*
 * vmbus_dma_configure -- Configure DMA coherence for VMbus device
 */
static int vmbus_dma_configure(struct device *child_device)
{
	/*
	 * On ARM64, propagate the DMA coherence setting from the top level
	 * VMbus ACPI device to the child VMbus device being added here.
	 * On x86/x64 coherence is assumed and these calls have no effect.
	 */
	hv_setup_dma_ops(child_device,
		device_get_dma_attr(vmbus_root_device) == DEV_DMA_COHERENT);
	return 0;
}

/*
 * vmbus_remove - Remove a vmbus device
 */
static void vmbus_remove(struct device *child_device)
{
	struct hv_driver *drv;
	struct hv_device *dev = device_to_hv_device(child_device);

	if (child_device->driver) {
		drv = drv_to_hv_drv(child_device->driver);
		if (drv->remove)
			drv->remove(dev);
	}
}

/*
 * vmbus_shutdown - Shutdown a vmbus device
 */
static void vmbus_shutdown(struct device *child_device)
{
	struct hv_driver *drv;
	struct hv_device *dev = device_to_hv_device(child_device);


	/* The device may not be attached yet */
	if (!child_device->driver)
		return;

	drv = drv_to_hv_drv(child_device->driver);

	if (drv->shutdown)
		drv->shutdown(dev);
}

#ifdef CONFIG_PM_SLEEP
/*
 * vmbus_suspend - Suspend a vmbus device
 */
static int vmbus_suspend(struct device *child_device)
{
	struct hv_driver *drv;
	struct hv_device *dev = device_to_hv_device(child_device);

	/* The device may not be attached yet */
	if (!child_device->driver)
		return 0;

	drv = drv_to_hv_drv(child_device->driver);
	if (!drv->suspend)
		return -EOPNOTSUPP;

	return drv->suspend(dev);
}

/*
 * vmbus_resume - Resume a vmbus device
 */
static int vmbus_resume(struct device *child_device)
{
	struct hv_driver *drv;
	struct hv_device *dev = device_to_hv_device(child_device);

	/* The device may not be attached yet */
	if (!child_device->driver)
		return 0;

	drv = drv_to_hv_drv(child_device->driver);
	if (!drv->resume)
		return -EOPNOTSUPP;

	return drv->resume(dev);
}
#else
#define vmbus_suspend NULL
#define vmbus_resume NULL
#endif /* CONFIG_PM_SLEEP */

/*
 * vmbus_device_release - Final callback release of the vmbus child device
 */
static void vmbus_device_release(struct device *device)
{
	struct hv_device *hv_dev = device_to_hv_device(device);
	struct vmbus_channel *channel = hv_dev->channel;

	hv_debug_rm_dev_dir(hv_dev);

	mutex_lock(&vmbus_connection.channel_mutex);
	hv_process_channel_removal(channel);
	mutex_unlock(&vmbus_connection.channel_mutex);
	kfree(hv_dev);
}

/*
 * Note: we must use the "noirq" ops: see the comment before vmbus_bus_pm.
 *
 * suspend_noirq/resume_noirq are set to NULL to support Suspend-to-Idle: we
 * shouldn't suspend the vmbus devices upon Suspend-to-Idle, otherwise there
 * is no way to wake up a Generation-2 VM.
 *
 * The other 4 ops are for hibernation.
 */
static const struct dev_pm_ops vmbus_pm = {
	.suspend_noirq	= NULL,
	.resume_noirq	= NULL,
	.freeze_noirq	= vmbus_suspend,
	.thaw_noirq	= vmbus_resume,
	.poweroff_noirq	= vmbus_suspend,
	.restore_noirq	= vmbus_resume,
};

/* The one and only one */
static const struct bus_type hv_bus = {
	.name =			"vmbus",
	.match =		vmbus_match,
	.shutdown =		vmbus_shutdown,
	.remove =		vmbus_remove,
	.probe =		vmbus_probe,
	.uevent =		vmbus_uevent,
	.dma_configure =	vmbus_dma_configure,
	.dev_groups =		vmbus_dev_groups,
	.drv_groups =		vmbus_drv_groups,
	.bus_groups =		vmbus_bus_groups,
	.pm =			&vmbus_pm,
};

struct onmessage_work_context {
	struct work_struct work;
	struct {
		struct hv_message_header header;
		u8 payload[];
	} msg;
};

static void vmbus_onmessage_work(struct work_struct *work)
{
	struct onmessage_work_context *ctx;

	/* Do not process messages if we're in DISCONNECTED state */
	if (vmbus_connection.conn_state == DISCONNECTED)
		return;

	ctx = container_of(work, struct onmessage_work_context,
			   work);
	vmbus_onmessage((struct vmbus_channel_message_header *)
			&ctx->msg.payload);
	kfree(ctx);
}

static void __vmbus_on_msg_dpc(void *message_page_addr)
{
	struct hv_message msg_copy, *msg;
	struct vmbus_channel_message_header *hdr;
	enum vmbus_channel_message_type msgtype;
	const struct vmbus_channel_message_table_entry *entry;
	struct onmessage_work_context *ctx;
	__u8 payload_size;
	u32 message_type;

	if (!message_page_addr)
		return;
	msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;

	/*
	 * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
	 * it is being used in 'struct vmbus_channel_message_header' definition
	 * which is supposed to match hypervisor ABI.
	 */
	BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32));

	/*
	 * Since the message is in memory shared with the host, an erroneous or
	 * malicious Hyper-V could modify the message while vmbus_on_msg_dpc()
	 * or individual message handlers are executing; to prevent this, copy
	 * the message into private memory.
	 */
	memcpy(&msg_copy, msg, sizeof(struct hv_message));

	message_type = msg_copy.header.message_type;
	if (message_type == HVMSG_NONE)
		/* no msg */
		return;

	hdr = (struct vmbus_channel_message_header *)msg_copy.u.payload;
	msgtype = hdr->msgtype;

	trace_vmbus_on_msg_dpc(hdr);

	if (msgtype >= CHANNELMSG_COUNT) {
		WARN_ONCE(1, "unknown msgtype=%d\n", msgtype);
		goto msg_handled;
	}

	payload_size = msg_copy.header.payload_size;
	if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) {
		WARN_ONCE(1, "payload size is too large (%d)\n", payload_size);
		goto msg_handled;
	}

	entry = &channel_message_table[msgtype];

	if (!entry->message_handler)
		goto msg_handled;

	if (payload_size < entry->min_payload_len) {
		WARN_ONCE(1, "message too short: msgtype=%d len=%d\n", msgtype, payload_size);
		goto msg_handled;
	}

	if (entry->handler_type == VMHT_BLOCKING) {
		ctx = kmalloc(struct_size(ctx, msg.payload, payload_size), GFP_ATOMIC);
		if (ctx == NULL)
			return;

		INIT_WORK(&ctx->work, vmbus_onmessage_work);
		ctx->msg.header = msg_copy.header;
		memcpy(&ctx->msg.payload, msg_copy.u.payload, payload_size);

		/*
		 * The host can generate a rescind message while we
		 * may still be handling the original offer. We deal with
		 * this condition by relying on the synchronization provided
		 * by offer_in_progress and by channel_mutex. See also the
		 * inline comments in vmbus_onoffer_rescind().
		 */
		switch (msgtype) {
		case CHANNELMSG_RESCIND_CHANNELOFFER:
			/*
			 * If we are handling the rescind message;
			 * schedule the work on the global work queue.
			 *
			 * The OFFER message and the RESCIND message should
			 * not be handled by the same serialized work queue,
			 * because the OFFER handler may call vmbus_open(),
			 * which tries to open the channel by sending an
			 * OPEN_CHANNEL message to the host and waits for
			 * the host's response; however, if the host has
			 * rescinded the channel before it receives the
			 * OPEN_CHANNEL message, the host just silently
			 * ignores the OPEN_CHANNEL message; as a result,
			 * the guest's OFFER handler hangs forever, if we
			 * handle the RESCIND message in the same serialized
			 * work queue: the RESCIND handler can not start to
			 * run before the OFFER handler finishes.
			 */
			if (vmbus_connection.ignore_any_offer_msg)
				break;
			queue_work(vmbus_connection.rescind_work_queue, &ctx->work);
			break;

		case CHANNELMSG_OFFERCHANNEL:
			/*
			 * The host sends the offer message of a given channel
			 * before sending the rescind message of the same
			 * channel. These messages are sent to the guest's
			 * connect CPU; the guest then starts processing them
			 * in the tasklet handler on this CPU:
			 *
			 * VMBUS_CONNECT_CPU
			 *
			 * [vmbus_on_msg_dpc()]
			 * atomic_inc()  // CHANNELMSG_OFFERCHANNEL
			 * queue_work()
			 * ...
			 * [vmbus_on_msg_dpc()]
			 * schedule_work()  // CHANNELMSG_RESCIND_CHANNELOFFER
			 *
			 * We rely on the memory-ordering properties of the
			 * queue_work() and schedule_work() primitives, which
			 * guarantee that the atomic increment will be visible
			 * to the CPUs which will execute the offer & rescind
			 * works by the time these works will start execution.
			 */
			if (vmbus_connection.ignore_any_offer_msg)
				break;
			atomic_inc(&vmbus_connection.offer_in_progress);
			fallthrough;

		default:
			queue_work(vmbus_connection.work_queue, &ctx->work);
		}
	} else
		entry->message_handler(hdr);

msg_handled:
	vmbus_signal_eom(msg, message_type);
}

void vmbus_on_msg_dpc(unsigned long data)
{
	struct hv_per_cpu_context *hv_cpu = (void *)data;

	__vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page);
	__vmbus_on_msg_dpc(hv_cpu->para_synic_message_page);
}

#ifdef CONFIG_PM_SLEEP
/*
 * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
 * hibernation, because hv_sock connections can not persist across hibernation.
 */
static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
{
	struct onmessage_work_context *ctx;
	struct vmbus_channel_rescind_offer *rescind;

	WARN_ON(!is_hvsock_channel(channel));

	/*
	 * Allocation size is small and the allocation should really not fail,
	 * otherwise the state of the hv_sock connections ends up in limbo.
	 */
	ctx = kzalloc(sizeof(*ctx) + sizeof(*rescind),
		      GFP_KERNEL | __GFP_NOFAIL);

	/*
	 * So far, these are not really used by Linux. Just set them to the
	 * reasonable values conforming to the definitions of the fields.
	 */
	ctx->msg.header.message_type = 1;
	ctx->msg.header.payload_size = sizeof(*rescind);

	/* These values are actually used by Linux. */
	rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.payload;
	rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER;
	rescind->child_relid = channel->offermsg.child_relid;

	INIT_WORK(&ctx->work, vmbus_onmessage_work);

	queue_work(vmbus_connection.work_queue, &ctx->work);
}
#endif /* CONFIG_PM_SLEEP */

/*
 * Schedule all channels with events pending.
 * The event page can be directly checked to get the id of
 * the channel that has the interrupt pending.
 */
static void vmbus_chan_sched(void *event_page_addr)
{
	unsigned long *recv_int_page;
	u32 maxbits, relid;
	union hv_synic_event_flags *event;

	if (!event_page_addr)
		return;
	event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT;

	maxbits = HV_EVENT_FLAGS_COUNT;
	recv_int_page = event->flags;

	if (unlikely(!recv_int_page))
		return;

	/*
	 * Suggested-by: Michael Kelley <mhklinux@outlook.com>
	 * One possible optimization would be to keep track of the largest
	 * relID that's in use, and only scan up to that relID.
	 */
	for_each_set_bit(relid, recv_int_page, maxbits) {
		void (*callback_fn)(void *context);
		struct vmbus_channel *channel;

		if (!sync_test_and_clear_bit(relid, recv_int_page))
			continue;

		/* Special case - vmbus channel protocol msg */
		if (relid == 0)
			continue;

		/*
		 * Pairs with the kfree_rcu() in vmbus_chan_release().
		 * Guarantees that the channel data structure doesn't
		 * get freed while the channel pointer below is being
		 * dereferenced.
		 */
		rcu_read_lock();

		/* Find channel based on relid */
		channel = relid2channel(relid);
		if (channel == NULL)
			goto sched_unlock_rcu;

		if (channel->rescind)
			goto sched_unlock_rcu;

		/*
		 * Make sure that the ring buffer data structure doesn't get
		 * freed while we dereference the ring buffer pointer. Test
		 * for the channel's onchannel_callback being NULL within a
		 * sched_lock critical section. See also the inline comments
		 * in vmbus_reset_channel_cb().
		 */
		spin_lock(&channel->sched_lock);

		callback_fn = channel->onchannel_callback;
		if (unlikely(callback_fn == NULL))
			goto sched_unlock;

		trace_vmbus_chan_sched(channel);

		++channel->interrupts;

		switch (channel->callback_mode) {
		case HV_CALL_ISR:
			(*callback_fn)(channel->channel_callback_context);
			break;

		case HV_CALL_BATCHED:
			hv_begin_read(&channel->inbound);
			fallthrough;
		case HV_CALL_DIRECT:
			tasklet_schedule(&channel->callback_event);
		}

sched_unlock:
		spin_unlock(&channel->sched_lock);
sched_unlock_rcu:
		rcu_read_unlock();
	}
}

static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr)
{
	struct hv_message *msg;

	if (!message_page_addr)
		return;
	msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;

	/* Check if there are actual msgs to be processed */
	if (msg->header.message_type != HVMSG_NONE) {
		if (msg->header.message_type == HVMSG_TIMER_EXPIRED) {
			hv_stimer0_isr();
			vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED);
		} else {
			tasklet_schedule(&hv_cpu->msg_dpc);
		}
	}
}

void vmbus_isr(void)
{
	struct hv_per_cpu_context *hv_cpu
		= this_cpu_ptr(hv_context.cpu_context);

	vmbus_chan_sched(hv_cpu->hyp_synic_event_page);
	vmbus_chan_sched(hv_cpu->para_synic_event_page);

	vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page);
	vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page);

	add_interrupt_randomness(vmbus_interrupt);
}
EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");

static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
{
	vmbus_isr();
	return IRQ_HANDLED;
}

static void vmbus_percpu_work(struct work_struct *work)
{
	unsigned int cpu = smp_processor_id();

	hv_synic_init(cpu);
}

static int vmbus_alloc_synic_and_connect(void)
{
	int ret, cpu;
	struct work_struct __percpu *works;
	int hyperv_cpuhp_online;

	ret = hv_synic_alloc();
	if (ret < 0)
		goto err_alloc;

	works = alloc_percpu(struct work_struct);
	if (!works) {
		ret = -ENOMEM;
		goto err_alloc;
	}

	/*
	 * Initialize the per-cpu interrupt state and stimer state.
	 * Then connect to the host.
	 */
	cpus_read_lock();
	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		INIT_WORK(work, vmbus_percpu_work);
		schedule_work_on(cpu, work);
	}

	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(works, cpu));

	/* Register the callbacks for possible CPU online/offline'ing */
	ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
						   hv_synic_init, hv_synic_cleanup);
	cpus_read_unlock();
	free_percpu(works);
	if (ret < 0)
		goto err_alloc;
	hyperv_cpuhp_online = ret;

	ret = vmbus_connect();
	if (ret)
		goto err_connect;
	return 0;

err_connect:
	cpuhp_remove_state(hyperv_cpuhp_online);
	return -ENODEV;
err_alloc:
	hv_synic_free();
	return -ENOMEM;
}

/*
 * vmbus_bus_init - Main vmbus driver initialization routine.
 *
 * Here, we
 *	- initialize the vmbus driver context
 *	- invoke the vmbus hv main init routine
 *	- retrieve the channel offers
 */
static int vmbus_bus_init(void)
{
	int ret;

	ret = hv_init();
	if (ret != 0) {
		pr_err("Unable to initialize the hypervisor - 0x%x\n", ret);
		return ret;
	}

	ret = bus_register(&hv_bus);
	if (ret)
		return ret;

	/*
	 * VMbus interrupts are best modeled as per-cpu interrupts. If
	 * on an architecture with support for per-cpu IRQs (e.g. ARM64),
	 * allocate a per-cpu IRQ using standard Linux kernel functionality.
	 * If not on such an architecture (e.g., x86/x64), then rely on
	 * code in the arch-specific portion of the code tree to connect
	 * the VMbus interrupt handler.
	 */
	if (vmbus_irq == -1) {
		hv_setup_vmbus_handler(vmbus_isr);
	} else {
		vmbus_evt = alloc_percpu(long);
		ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr,
					 "Hyper-V VMbus", vmbus_evt);
		if (ret) {
			pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d",
			       vmbus_irq, ret);
			free_percpu(vmbus_evt);
			goto err_setup;
		}
	}

	/*
	 * Cache the value as getting it involves a VM exit on x86(_64), and
	 * doing that on each VP while initializing SynICs wastes time.
	 */
	is_confidential = ms_hyperv.confidential_vmbus_available;
	if (is_confidential)
		pr_info("Establishing connection to the confidential VMBus\n");
	hv_para_set_sint_proxy(!is_confidential);
	ret = vmbus_alloc_synic_and_connect();
	if (ret)
		goto err_connect;

	/*
	 * Always register the vmbus unload panic notifier because we
	 * need to shut the VMbus channel connection on panic.
	 */
	atomic_notifier_chain_register(&panic_notifier_list,
				       &hyperv_panic_vmbus_unload_block);

	vmbus_request_offers();

	return 0;

err_connect:
	if (vmbus_irq == -1) {
		hv_remove_vmbus_handler();
	} else {
		free_percpu_irq(vmbus_irq, vmbus_evt);
		free_percpu(vmbus_evt);
	}
err_setup:
	bus_unregister(&hv_bus);
	return ret;
}
/**
 * __vmbus_driver_register() - Register a vmbus driver
 * @hv_driver: Pointer to driver structure you want to register
 * @owner: owner module of the driver
 * @mod_name: module name string
 *
 * Registers the given driver with Linux through the 'driver_register()' call
 * and sets up the Hyper-V vmbus handling for this driver.
 * It will return the state of the 'driver_register()' call.
 *
 */
int __vmbus_driver_register(struct hv_driver *hv_driver, struct module *owner, const char *mod_name)
{
	int ret;

	pr_info("registering driver %s\n", hv_driver->name);

	ret = vmbus_exists();
	if (ret < 0)
		return ret;

	hv_driver->driver.name = hv_driver->name;
	hv_driver->driver.owner = owner;
	hv_driver->driver.mod_name = mod_name;
	hv_driver->driver.bus = &hv_bus;

	spin_lock_init(&hv_driver->dynids.lock);
	INIT_LIST_HEAD(&hv_driver->dynids.list);

	ret = driver_register(&hv_driver->driver);

	return ret;
}
EXPORT_SYMBOL_GPL(__vmbus_driver_register);
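/*
 * Illustrative sketch (not part of this driver): a minimal VMBus client
 * driver would typically declare a GUID table and register through the
 * vmbus_driver_register() wrapper. All names and the GUID below are
 * hypothetical placeholders.
 *
 *	static const struct hv_vmbus_device_id example_id_table[] = {
 *		{ .guid = GUID_INIT(0xf8615163, 0xdf3e, 0x46c5, 0x91, 0x3f,
 *				    0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e) },
 *		{ },
 *	};
 *	MODULE_DEVICE_TABLE(vmbus, example_id_table);
 *
 *	static struct hv_driver example_drv = {
 *		.name	  = "hv_example",
 *		.id_table = example_id_table,
 *		.probe	  = example_probe,
 *		.remove	  = example_remove,
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return vmbus_driver_register(&example_drv);
 *	}
 */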
/**
 * vmbus_driver_unregister() - Unregister a vmbus driver
 * @hv_driver: Pointer to driver structure you want to
 *             un-register
 *
 * Un-register the given driver that was previously registered with a call to
 * vmbus_driver_register()
 */
void vmbus_driver_unregister(struct hv_driver *hv_driver)
{
	pr_info("unregistering driver %s\n", hv_driver->name);

	if (!vmbus_exists()) {
		driver_unregister(&hv_driver->driver);
		vmbus_free_dynids(hv_driver);
	}
}
EXPORT_SYMBOL_GPL(vmbus_driver_unregister);


/*
 * Called when last reference to channel is gone.
 */
static void vmbus_chan_release(struct kobject *kobj)
{
	struct vmbus_channel *channel
		= container_of(kobj, struct vmbus_channel, kobj);

	kfree_rcu(channel, rcu);
}

struct vmbus_chan_attribute {
	struct attribute attr;
	ssize_t (*show)(struct vmbus_channel *chan, char *buf);
	ssize_t (*store)(struct vmbus_channel *chan,
			 const char *buf, size_t count);
};
#define VMBUS_CHAN_ATTR(_name, _mode, _show, _store) \
	struct vmbus_chan_attribute chan_attr_##_name \
		= __ATTR(_name, _mode, _show, _store)
#define VMBUS_CHAN_ATTR_RW(_name) \
	struct vmbus_chan_attribute chan_attr_##_name = __ATTR_RW(_name)
#define VMBUS_CHAN_ATTR_RO(_name) \
	struct vmbus_chan_attribute chan_attr_##_name = __ATTR_RO(_name)
#define VMBUS_CHAN_ATTR_WO(_name) \
	struct vmbus_chan_attribute chan_attr_##_name = __ATTR_WO(_name)

static ssize_t vmbus_chan_attr_show(struct kobject *kobj,
				    struct attribute *attr, char *buf)
{
	const struct vmbus_chan_attribute *attribute
		= container_of(attr, struct vmbus_chan_attribute, attr);
	struct vmbus_channel *chan
		= container_of(kobj, struct vmbus_channel, kobj);

	if (!attribute->show)
		return -EIO;

	return attribute->show(chan, buf);
}

static ssize_t vmbus_chan_attr_store(struct kobject *kobj,
				     struct attribute *attr, const char *buf,
				     size_t count)
{
	const struct vmbus_chan_attribute *attribute
		= container_of(attr, struct vmbus_chan_attribute, attr);
	struct vmbus_channel *chan
		= container_of(kobj, struct vmbus_channel, kobj);

	if (!attribute->store)
		return -EIO;

	return attribute->store(chan, buf, count);
}

static const struct sysfs_ops vmbus_chan_sysfs_ops = {
	.show = vmbus_chan_attr_show,
	.store = vmbus_chan_attr_store,
};

static ssize_t out_mask_show(struct vmbus_channel *channel, char *buf)
{
	struct hv_ring_buffer_info *rbi = &channel->outbound;
	ssize_t ret;

	mutex_lock(&rbi->ring_buffer_mutex);
	if (!rbi->ring_buffer) {
		mutex_unlock(&rbi->ring_buffer_mutex);
		return -EINVAL;
	}

	ret = sprintf(buf, "%u\n", rbi->ring_buffer->interrupt_mask);
	mutex_unlock(&rbi->ring_buffer_mutex);
	return ret;
}
static VMBUS_CHAN_ATTR_RO(out_mask);

static ssize_t in_mask_show(struct vmbus_channel *channel, char *buf)
{
	struct hv_ring_buffer_info *rbi = &channel->inbound;
	ssize_t ret;

	mutex_lock(&rbi->ring_buffer_mutex);
	if (!rbi->ring_buffer) {
		mutex_unlock(&rbi->ring_buffer_mutex);
		return -EINVAL;
	}

	ret = sprintf(buf, "%u\n", rbi->ring_buffer->interrupt_mask);
	mutex_unlock(&rbi->ring_buffer_mutex);
	return ret;
}
static VMBUS_CHAN_ATTR_RO(in_mask);

static ssize_t read_avail_show(struct vmbus_channel *channel, char *buf)
{
	struct hv_ring_buffer_info *rbi = &channel->inbound;
	ssize_t ret;

	mutex_lock(&rbi->ring_buffer_mutex);
	if (!rbi->ring_buffer) {
		mutex_unlock(&rbi->ring_buffer_mutex);
		return -EINVAL;
	}

	ret = sprintf(buf, "%u\n", hv_get_bytes_to_read(rbi));
	mutex_unlock(&rbi->ring_buffer_mutex);
	return ret;
}
static VMBUS_CHAN_ATTR_RO(read_avail);

static ssize_t write_avail_show(struct vmbus_channel *channel, char *buf)
{
	struct hv_ring_buffer_info *rbi = &channel->outbound;
	ssize_t ret;

	mutex_lock(&rbi->ring_buffer_mutex);
	if (!rbi->ring_buffer) {
		mutex_unlock(&rbi->ring_buffer_mutex);
		return -EINVAL;
	}

	ret = sprintf(buf, "%u\n", hv_get_bytes_to_write(rbi));
	mutex_unlock(&rbi->ring_buffer_mutex);
	return ret;
}
static VMBUS_CHAN_ATTR_RO(write_avail);

static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
{
	return sprintf(buf, "%u\n", channel->target_cpu);
}

int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu)
{
	u32 origin_cpu;
	int ret = 0;

	lockdep_assert_cpus_held();
	lockdep_assert_held(&vmbus_connection.channel_mutex);

	if (vmbus_proto_version < VERSION_WIN10_V4_1)
		return -EIO;

	/* Validate target_cpu for the cpumask_test_cpu() operation below. */
	if (target_cpu >= nr_cpumask_bits)
		return -EINVAL;

	if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
		return -EINVAL;

	if (!cpu_online(target_cpu))
		return -EINVAL;

	/*
	 * Synchronizes vmbus_channel_set_cpu() and channel closure:
	 *
	 * { Initially: state = CHANNEL_OPENED }
	 *
	 * CPU1				CPU2
	 *
	 * [vmbus_channel_set_cpu()]	[vmbus_disconnect_ring()]
	 *
	 * LOCK channel_mutex		LOCK channel_mutex
	 * LOAD r1 = state		LOAD r2 = state
	 * IF (r1 == CHANNEL_OPENED)	IF (r2 == CHANNEL_OPENED)
	 *   SEND MODIFYCHANNEL		  STORE state = CHANNEL_OPEN
	 *   [...]			  SEND CLOSECHANNEL
	 * UNLOCK channel_mutex		UNLOCK channel_mutex
	 *
	 * Forbids: r1 == r2 == CHANNEL_OPENED (i.e., CPU1's LOCK precedes
	 * CPU2's LOCK) && CPU2's SEND precedes CPU1's SEND
	 *
	 * Note.  The host processes the channel messages "sequentially", in
	 * the order in which they are received on a per-partition basis.
	 */

	/*
	 * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
	 * avoid sending the message and fail here for such channels.
	 */
	if (channel->state != CHANNEL_OPENED_STATE) {
		ret = -EIO;
		goto end;
	}

	origin_cpu = channel->target_cpu;
	if (target_cpu == origin_cpu)
		goto end;

	if (vmbus_send_modifychannel(channel,
				     hv_cpu_number_to_vp_number(target_cpu))) {
		ret = -EIO;
		goto end;
	}

	/*
	 * For version before VERSION_WIN10_V5_3, the following warning holds:
	 *
	 * Warning.  At this point, there is *no* guarantee that the host will
	 * have successfully processed the vmbus_send_modifychannel() request.
	 * See the header comment of vmbus_send_modifychannel() for more info.
	 *
	 * Lags in the processing of the above vmbus_send_modifychannel() can
	 * result in missed interrupts if the "old" target CPU is taken offline
	 * before Hyper-V starts sending interrupts to the "new" target CPU.
	 * But apart from this offlining scenario, the code tolerates such
	 * lags.  It will function correctly even if a channel interrupt comes
	 * in on a CPU that is different from the channel target_cpu value.
	 */

	channel->target_cpu = target_cpu;

	/* See init_vp_index(). */
	if (hv_is_perf_channel(channel))
		hv_update_allocated_cpus(origin_cpu, target_cpu);

	/* Currently set only for storvsc channels. */
	if (channel->change_target_cpu_callback) {
		(*channel->change_target_cpu_callback)(channel,
				origin_cpu, target_cpu);
	}

end:
	return ret;
}

static ssize_t target_cpu_store(struct vmbus_channel *channel,
				const char *buf, size_t count)
{
	u32 target_cpu;
	ssize_t ret;

	if (sscanf(buf, "%u", &target_cpu) != 1)
		return -EIO;

	cpus_read_lock();
	mutex_lock(&vmbus_connection.channel_mutex);
	ret = vmbus_channel_set_cpu(channel, target_cpu);
	mutex_unlock(&vmbus_connection.channel_mutex);
	cpus_read_unlock();

	return ret ?: count;
}
static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
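/*
 * Illustrative only: the "cpu" attribute above lets an administrator (or a
 * balancer daemon) retarget a channel's interrupts at runtime, e.g. (the
 * instance GUID and relid 13 are hypothetical):
 *
 *   $ echo 4 > /sys/bus/vmbus/devices/<instance-guid>/channels/13/cpu
 */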
static ssize_t channel_pending_show(struct vmbus_channel *channel,
				    char *buf)
{
	return sprintf(buf, "%d\n",
		       channel_pending(channel,
				       vmbus_connection.monitor_pages[1]));
}
static VMBUS_CHAN_ATTR(pending, 0444, channel_pending_show, NULL);

static ssize_t channel_latency_show(struct vmbus_channel *channel,
				    char *buf)
{
	return sprintf(buf, "%d\n",
		       channel_latency(channel,
				       vmbus_connection.monitor_pages[1]));
}
static VMBUS_CHAN_ATTR(latency, 0444, channel_latency_show, NULL);

static ssize_t channel_interrupts_show(struct vmbus_channel *channel, char *buf)
{
	return sprintf(buf, "%llu\n", channel->interrupts);
}
static VMBUS_CHAN_ATTR(interrupts, 0444, channel_interrupts_show, NULL);

static ssize_t channel_events_show(struct vmbus_channel *channel, char *buf)
{
	return sprintf(buf, "%llu\n", channel->sig_events);
}
static VMBUS_CHAN_ATTR(events, 0444, channel_events_show, NULL);

static ssize_t channel_intr_in_full_show(struct vmbus_channel *channel,
					 char *buf)
{
	return sprintf(buf, "%llu\n",
		       (unsigned long long)channel->intr_in_full);
}
static VMBUS_CHAN_ATTR(intr_in_full, 0444, channel_intr_in_full_show, NULL);

static ssize_t channel_intr_out_empty_show(struct vmbus_channel *channel,
					   char *buf)
{
	return sprintf(buf, "%llu\n",
		       (unsigned long long)channel->intr_out_empty);
}
static VMBUS_CHAN_ATTR(intr_out_empty, 0444, channel_intr_out_empty_show, NULL);

static ssize_t channel_out_full_first_show(struct vmbus_channel *channel,
					   char *buf)
{
	return sprintf(buf, "%llu\n",
		       (unsigned long long)channel->out_full_first);
}
static VMBUS_CHAN_ATTR(out_full_first, 0444, channel_out_full_first_show, NULL);

static ssize_t channel_out_full_total_show(struct vmbus_channel *channel,
					   char *buf)
{
	return sprintf(buf, "%llu\n",
		       (unsigned long long)channel->out_full_total);
}
static VMBUS_CHAN_ATTR(out_full_total, 0444, channel_out_full_total_show, NULL);

static ssize_t subchannel_monitor_id_show(struct vmbus_channel *channel,
					  char *buf)
{
	return sprintf(buf, "%u\n", channel->offermsg.monitorid);
}
static VMBUS_CHAN_ATTR(monitor_id, 0444, subchannel_monitor_id_show, NULL);

static ssize_t subchannel_id_show(struct vmbus_channel *channel,
				  char *buf)
{
	return sprintf(buf, "%u\n",
		       channel->offermsg.offer.sub_channel_index);
}
static VMBUS_CHAN_ATTR_RO(subchannel_id);

static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj,
				       const struct bin_attribute *attr,
				       struct vm_area_struct *vma)
{
	struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj);

	/*
	 * hv_(create|remove)_ring_sysfs implementation ensures that mmap_ring_buffer
	 * is not NULL.
	 */
	return channel->mmap_ring_buffer(channel, vma);
}

static struct bin_attribute chan_attr_ring_buffer = {
	.attr = {
		.name = "ring",
		.mode = 0600,
	},
	.mmap = hv_mmap_ring_buffer_wrapper,
};
static struct attribute *vmbus_chan_attrs[] = {
	&chan_attr_out_mask.attr,
	&chan_attr_in_mask.attr,
	&chan_attr_read_avail.attr,
	&chan_attr_write_avail.attr,
	&chan_attr_cpu.attr,
	&chan_attr_pending.attr,
	&chan_attr_latency.attr,
	&chan_attr_interrupts.attr,
	&chan_attr_events.attr,
	&chan_attr_intr_in_full.attr,
	&chan_attr_intr_out_empty.attr,
	&chan_attr_out_full_first.attr,
	&chan_attr_out_full_total.attr,
	&chan_attr_monitor_id.attr,
	&chan_attr_subchannel_id.attr,
	NULL
};

static const struct bin_attribute *vmbus_chan_bin_attrs[] = {
	&chan_attr_ring_buffer,
	NULL
};

static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj,
				       const struct bin_attribute *attr,
				       struct vm_area_struct *vma)
{
	struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj);

	/*
	 * The hv_(create|remove)_ring_sysfs implementation ensures that
	 * mmap_ring_buffer is not NULL.
	 */
	return channel->mmap_ring_buffer(channel, vma);
}

static struct bin_attribute chan_attr_ring_buffer = {
	.attr = {
		.name = "ring",
		.mode = 0600,
	},
	.mmap = hv_mmap_ring_buffer_wrapper,
};

static struct attribute *vmbus_chan_attrs[] = {
	&chan_attr_out_mask.attr,
	&chan_attr_in_mask.attr,
	&chan_attr_read_avail.attr,
	&chan_attr_write_avail.attr,
	&chan_attr_cpu.attr,
	&chan_attr_pending.attr,
	&chan_attr_latency.attr,
	&chan_attr_interrupts.attr,
	&chan_attr_events.attr,
	&chan_attr_intr_in_full.attr,
	&chan_attr_intr_out_empty.attr,
	&chan_attr_out_full_first.attr,
	&chan_attr_out_full_total.attr,
	&chan_attr_monitor_id.attr,
	&chan_attr_subchannel_id.attr,
	NULL
};

static const struct bin_attribute *vmbus_chan_bin_attrs[] = {
	&chan_attr_ring_buffer,
	NULL
};

/*
 * Channel-level attribute_group callback function. Returns the permission for
 * each attribute, and returns 0 if an attribute is not visible.
 */
static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj,
					  struct attribute *attr, int idx)
{
	const struct vmbus_channel *channel =
		container_of(kobj, struct vmbus_channel, kobj);

	/* Hide the monitor attributes if the monitor mechanism is not used. */
	if (!channel->offermsg.monitor_allocated &&
	    (attr == &chan_attr_pending.attr ||
	     attr == &chan_attr_latency.attr ||
	     attr == &chan_attr_monitor_id.attr))
		return 0;

	return attr->mode;
}

static umode_t vmbus_chan_bin_attr_is_visible(struct kobject *kobj,
					      const struct bin_attribute *attr, int idx)
{
	const struct vmbus_channel *channel =
		container_of(kobj, struct vmbus_channel, kobj);

	/* Hide the "ring" attribute if the channel's ring_sysfs_visible is false. */
	if (attr == &chan_attr_ring_buffer && !channel->ring_sysfs_visible)
		return 0;

	return attr->attr.mode;
}

static size_t vmbus_chan_bin_size(struct kobject *kobj,
				  const struct bin_attribute *bin_attr, int a)
{
	const struct vmbus_channel *channel =
		container_of(kobj, struct vmbus_channel, kobj);

	return channel->ringbuffer_pagecount << PAGE_SHIFT;
}

static const struct attribute_group vmbus_chan_group = {
	.attrs = vmbus_chan_attrs,
	.bin_attrs = vmbus_chan_bin_attrs,
	.is_visible = vmbus_chan_attr_is_visible,
	.is_bin_visible = vmbus_chan_bin_attr_is_visible,
	.bin_size = vmbus_chan_bin_size,
};

static const struct kobj_type vmbus_chan_ktype = {
	.sysfs_ops = &vmbus_chan_sysfs_ops,
	.release = vmbus_chan_release,
};

/**
 * hv_create_ring_sysfs() - create the "ring" sysfs entry for a channel's ring buffers.
 * @channel: Pointer to vmbus_channel structure
 * @hv_mmap_ring_buffer: function pointer to be called on mmap of the channel's
 *                       "ring" sysfs node, to map the ring buffer of that
 *                       channel. The function pointer has the type:
 *                       int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel,
 *                                                  struct vm_area_struct *vma)
 *                       It takes a pointer to the channel and a pointer to the
 *                       vm_area_struct used for the mmap.
 *
 * The sysfs node for a channel's ring buffer is created along with the other
 * channel attributes, but its visibility is disabled by default; it should be
 * made visible only while the use case that needs it is running.
 * For example, an HV_NIC device is used by either uio_hv_generic or hv_netvsc
 * at any given point in time, and the "ring" sysfs node is needed only while
 * uio_hv_generic is bound to the device. To avoid exposing the ring buffer by
 * default, this function is what enables the visibility of "ring" for
 * userspace to use.
 * Note: Race conditions with userspace are possible, so creating new use
 * cases for this mechanism is discouraged. It was added to maintain backward
 * compatibility while solving one of the race conditions in uio_hv_generic
 * around sysfs creation. See the comments with vmbus_add_dynid() and
 * vmbus_device_register().
 *
 * Returns 0 on success or an error code on failure.
 */
int hv_create_ring_sysfs(struct vmbus_channel *channel,
			 int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel,
						    struct vm_area_struct *vma))
{
	struct kobject *kobj = &channel->kobj;

	channel->mmap_ring_buffer = hv_mmap_ring_buffer;
	channel->ring_sysfs_visible = true;

	return sysfs_update_group(kobj, &vmbus_chan_group);
}
EXPORT_SYMBOL_GPL(hv_create_ring_sysfs);
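
/*
 * For reference, a minimal sketch of the intended calling pattern, loosely
 * modeled on the uio_hv_generic use case mentioned above (the callback shown
 * here is illustrative only, not a definitive implementation):
 *
 *	static int my_mmap_ring(struct vmbus_channel *channel,
 *				struct vm_area_struct *vma)
 *	{
 *		void *ring = page_address(channel->ringbuffer_page);
 *
 *		return vm_iomap_memory(vma, virt_to_phys(ring),
 *				channel->ringbuffer_pagecount << PAGE_SHIFT);
 *	}
 *
 * The driver enables the node only while it owns the device, and hides it
 * again before releasing the device:
 *
 *	ret = hv_create_ring_sysfs(channel, my_mmap_ring);
 *	...
 *	hv_remove_ring_sysfs(channel);
 */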

/**
 * hv_remove_ring_sysfs() - remove the "ring" sysfs entry for a channel's ring buffers.
 * @channel: Pointer to vmbus_channel structure
 *
 * Hide the "ring" sysfs node for a channel by changing its visibility
 * attribute and updating the sysfs group.
 *
 * Returns 0 on success or an error code on failure.
 */
int hv_remove_ring_sysfs(struct vmbus_channel *channel)
{
	struct kobject *kobj = &channel->kobj;
	int ret;

	channel->ring_sysfs_visible = false;
	ret = sysfs_update_group(kobj, &vmbus_chan_group);
	channel->mmap_ring_buffer = NULL;
	return ret;
}
EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs);

/*
 * vmbus_add_channel_kobj - setup a sub-directory under device/channels
 */
int vmbus_add_channel_kobj(struct hv_device *dev, struct vmbus_channel *channel)
{
	const struct device *device = &dev->device;
	struct kobject *kobj = &channel->kobj;
	u32 relid = channel->offermsg.child_relid;
	int ret;

	kobj->kset = dev->channels_kset;
	ret = kobject_init_and_add(kobj, &vmbus_chan_ktype, NULL,
				   "%u", relid);
	if (ret) {
		kobject_put(kobj);
		return ret;
	}

	ret = sysfs_create_group(kobj, &vmbus_chan_group);

	if (ret) {
		/*
		 * The calling functions' error handling paths will cleanup the
		 * empty channel directory.
		 */
		kobject_put(kobj);
		dev_err(device, "Unable to set up channel sysfs files\n");
		return ret;
	}

	kobject_uevent(kobj, KOBJ_ADD);

	return 0;
}

/*
 * vmbus_remove_channel_attr_group - remove the channel's attribute group
 */
void vmbus_remove_channel_attr_group(struct vmbus_channel *channel)
{
	sysfs_remove_group(&channel->kobj, &vmbus_chan_group);
}

/*
 * vmbus_device_create - Creates a new child device object for the vmbus;
 * registration happens separately in vmbus_device_register().
 */
struct hv_device *vmbus_device_create(const guid_t *type,
				      const guid_t *instance,
				      struct vmbus_channel *channel)
{
	struct hv_device *child_device_obj;

	child_device_obj = kzalloc(sizeof(struct hv_device), GFP_KERNEL);
	if (!child_device_obj) {
		pr_err("Unable to allocate device object for child device\n");
		return NULL;
	}

	child_device_obj->channel = channel;
	guid_copy(&child_device_obj->dev_type, type);
	guid_copy(&child_device_obj->dev_instance, instance);
	child_device_obj->vendor_id = PCI_VENDOR_ID_MICROSOFT;

	return child_device_obj;
}

/*
 * vmbus_device_register - Register the child device
 */
int vmbus_device_register(struct hv_device *child_device_obj)
{
	struct kobject *kobj = &child_device_obj->device.kobj;
	int ret;

	dev_set_name(&child_device_obj->device, "%pUl",
		     &child_device_obj->channel->offermsg.offer.if_instance);

	child_device_obj->device.bus = &hv_bus;
	child_device_obj->device.parent = vmbus_root_device;
	child_device_obj->device.release = vmbus_device_release;

	child_device_obj->device.dma_parms = &child_device_obj->dma_parms;
	child_device_obj->device.dma_mask = &child_device_obj->dma_mask;
	dma_set_mask(&child_device_obj->device, DMA_BIT_MASK(64));

	/*
	 * Register with the LDM. This will kick off the driver/device
	 * binding...which will eventually call vmbus_match() and vmbus_probe()
	 */
	ret = device_register(&child_device_obj->device);
	if (ret) {
		pr_err("Unable to register child device\n");
		put_device(&child_device_obj->device);
		return ret;
	}

	/*
	 * If device_register() found a driver to assign to the device, the
	 * driver's probe function has already run at this point. If that
	 * probe function accesses or operates on the "channels" subdirectory
	 * in sysfs, those operations will have failed because the "channels"
	 * subdirectory doesn't exist until the code below runs. Or if the
	 * probe function creates a /dev entry, a user space program could
	 * find and open the /dev entry, and then create a race by accessing
	 * the "channels" subdirectory while the creation steps are in progress
	 * here. The race can't result in a kernel failure, but the user space
	 * program may get an error in accessing "channels" or its
	 * subdirectories. See also comments with vmbus_add_dynid() about a
	 * related race condition.
	 */
	child_device_obj->channels_kset = kset_create_and_add("channels",
							      NULL, kobj);
	if (!child_device_obj->channels_kset) {
		ret = -ENOMEM;
		goto err_dev_unregister;
	}

	ret = vmbus_add_channel_kobj(child_device_obj,
				     child_device_obj->channel);
	if (ret) {
		pr_err("Unable to register primary channel\n");
		goto err_kset_unregister;
	}
	hv_debug_add_dev_dir(child_device_obj);

	return 0;

err_kset_unregister:
	kset_unregister(child_device_obj->channels_kset);

err_dev_unregister:
	device_unregister(&child_device_obj->device);
	return ret;
}

/*
 * vmbus_device_unregister - Remove the specified child device
 * from the vmbus.
 */
void vmbus_device_unregister(struct hv_device *device_obj)
{
	pr_debug("child device %s unregistered\n",
		 dev_name(&device_obj->device));

	kset_unregister(device_obj->channels_kset);

	/*
	 * Kick off the process of unregistering the device.
	 * This will call vmbus_remove() and eventually vmbus_device_release()
	 */
	device_unregister(&device_obj->device);
}
EXPORT_SYMBOL_GPL(vmbus_device_unregister);

#ifdef CONFIG_ACPI
/*
 * VMBUS is an acpi enumerated device. Get the information we
 * need from DSDT.
 */
static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
{
	resource_size_t start = 0;
	resource_size_t end = 0;
	struct resource *new_res;
	struct resource **old_res = &hyperv_mmio;
	struct resource **prev_res = NULL;
	struct resource r;

	switch (res->type) {

	/*
	 * "Address" descriptors are for bus windows. Ignore
	 * "memory" descriptors, which are for registers on
	 * devices.
	 */
	case ACPI_RESOURCE_TYPE_ADDRESS32:
		start = res->data.address32.address.minimum;
		end = res->data.address32.address.maximum;
		break;

	case ACPI_RESOURCE_TYPE_ADDRESS64:
		start = res->data.address64.address.minimum;
		end = res->data.address64.address.maximum;
		break;

	/*
	 * The IRQ information is needed only on ARM64, which Hyper-V
	 * sets up in the extended format. IRQ information is present
	 * on x86/x64 in the non-extended format but it is not used by
	 * Linux. So don't bother checking for the non-extended format.
	 */
	case ACPI_RESOURCE_TYPE_EXTENDED_IRQ:
		if (!acpi_dev_resource_interrupt(res, 0, &r)) {
			pr_err("Unable to parse Hyper-V ACPI interrupt\n");
			return AE_ERROR;
		}
		/* ARM64 INTID for VMbus */
		vmbus_interrupt = res->data.extended_irq.interrupts[0];
		/* Linux IRQ number */
		vmbus_irq = r.start;
		return AE_OK;

	default:
		/* Unused resource type */
		return AE_OK;

	}
	/*
	 * Ignore ranges that are below 1MB, as they're not
	 * necessary or useful here.
	 */
	if (end < 0x100000)
		return AE_OK;

	new_res = kzalloc(sizeof(*new_res), GFP_ATOMIC);
	if (!new_res)
		return AE_NO_MEMORY;

	/* If this range overlaps the virtual TPM, truncate it. */
	if (end > VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS)
		end = VTPM_BASE_ADDRESS;

	new_res->name = "hyperv mmio";
	new_res->flags = IORESOURCE_MEM;
	new_res->start = start;
	new_res->end = end;

	/*
	 * If two ranges are adjacent, merge them.
	 */
	do {
		if (!*old_res) {
			*old_res = new_res;
			break;
		}

		if (((*old_res)->end + 1) == new_res->start) {
			(*old_res)->end = new_res->end;
			kfree(new_res);
			break;
		}

		if ((*old_res)->start == new_res->end + 1) {
			(*old_res)->start = new_res->start;
			kfree(new_res);
			break;
		}

		if ((*old_res)->start > new_res->end) {
			new_res->sibling = *old_res;
			if (prev_res)
				(*prev_res)->sibling = new_res;
			*old_res = new_res;
			break;
		}

		prev_res = old_res;
		old_res = &(*old_res)->sibling;

	} while (1);

	return AE_OK;
}
#endif

static void vmbus_mmio_remove(void)
{
	struct resource *cur_res;
	struct resource *next_res;

	if (hyperv_mmio) {
		if (fb_mmio) {
			__release_region(hyperv_mmio, fb_mmio->start,
					 resource_size(fb_mmio));
			fb_mmio = NULL;
		}

		for (cur_res = hyperv_mmio; cur_res; cur_res = next_res) {
			next_res = cur_res->sibling;
			kfree(cur_res);
		}
	}
}

static void __maybe_unused vmbus_reserve_fb(void)
{
	resource_size_t start = 0, size;
	struct pci_dev *pdev;

	if (efi_enabled(EFI_BOOT)) {
		/* Gen2 VM: get FB base from EFI framebuffer */
		if (IS_ENABLED(CONFIG_SYSFB)) {
			start = screen_info.lfb_base;
			size = max_t(__u32, screen_info.lfb_size, 0x800000);
		}
	} else {
		/* Gen1 VM: get FB base from PCI */
		pdev = pci_get_device(PCI_VENDOR_ID_MICROSOFT,
				      PCI_DEVICE_ID_HYPERV_VIDEO, NULL);
		if (!pdev)
			return;

		if (pdev->resource[0].flags & IORESOURCE_MEM) {
			start = pci_resource_start(pdev, 0);
			size = pci_resource_len(pdev, 0);
		}

		/*
		 * Release the PCI device so the hyperv_drm or hyperv_fb
		 * driver can grab it later.
		 */
		pci_dev_put(pdev);
	}

	if (!start)
		return;

	/*
	 * Make a claim for the frame buffer in the resource tree under the
	 * first node, which will be the one below 4GB. The length seems to
	 * be underreported, particularly in a Generation 1 VM. So start out
	 * reserving a larger area and make it smaller until it succeeds.
	 */
	for (; !fb_mmio && (size >= 0x100000); size >>= 1)
		fb_mmio = __request_region(hyperv_mmio, start, size, fb_mmio_name, 0);
}

/**
 * vmbus_allocate_mmio() - Pick a memory-mapped I/O range.
 * @new:		If successful, supplies a pointer to the
 *			allocated MMIO space.
 * @device_obj:		Identifies the caller
 * @min:		Minimum guest physical address of the
 *			allocation
 * @max:		Maximum guest physical address
 * @size:		Size of the range to be allocated
 * @align:		Alignment of the range to be allocated
 * @fb_overlap_ok:	Whether this allocation can be allowed
 *			to overlap the video frame buffer.
 *
 * This function walks the resources granted to VMBus by the
 * _CRS object in the ACPI namespace underneath the parent
 * "bridge", whether that's a root PCI bus in the Generation 1
 * case or a Module Device in the Generation 2 case. It then
 * attempts to allocate from the global MMIO pool in a way that
 * matches the constraints supplied in these parameters and by
 * that _CRS.
 *
 * Return: 0 on success, -errno on failure
 */
int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
			resource_size_t min, resource_size_t max,
			resource_size_t size, resource_size_t align,
			bool fb_overlap_ok)
{
	struct resource *iter, *shadow;
	resource_size_t range_min, range_max, start, end;
	const char *dev_n = dev_name(&device_obj->device);
	int retval;

	retval = -ENXIO;
	mutex_lock(&hyperv_mmio_lock);

	/*
	 * If overlaps with frame buffers are allowed, then first attempt to
	 * make the allocation from within the reserved region. Because it
	 * is already reserved, no shadow allocation is necessary.
	 */
	if (fb_overlap_ok && fb_mmio && !(min > fb_mmio->end) &&
	    !(max < fb_mmio->start)) {

		range_min = fb_mmio->start;
		range_max = fb_mmio->end;
		start = (range_min + align - 1) & ~(align - 1);
		for (; start + size - 1 <= range_max; start += align) {
			*new = request_mem_region_exclusive(start, size, dev_n);
			if (*new) {
				retval = 0;
				goto exit;
			}
		}
	}

	for (iter = hyperv_mmio; iter; iter = iter->sibling) {
		if ((iter->start >= max) || (iter->end <= min))
			continue;

		range_min = iter->start;
		range_max = iter->end;
		start = (range_min + align - 1) & ~(align - 1);
		for (; start + size - 1 <= range_max; start += align) {
			end = start + size - 1;

			/* Skip the whole fb_mmio region if not fb_overlap_ok */
			if (!fb_overlap_ok && fb_mmio &&
			    (((start >= fb_mmio->start) && (start <= fb_mmio->end)) ||
			     ((end >= fb_mmio->start) && (end <= fb_mmio->end))))
				continue;

			shadow = __request_region(iter, start, size, NULL,
						  IORESOURCE_BUSY);
			if (!shadow)
				continue;

			*new = request_mem_region_exclusive(start, size, dev_n);
			if (*new) {
				shadow->name = (char *)*new;
				retval = 0;
				goto exit;
			}

			__release_region(iter, start, size);
		}
	}

exit:
	mutex_unlock(&hyperv_mmio_lock);
	return retval;
}
EXPORT_SYMBOL_GPL(vmbus_allocate_mmio);

/**
 * vmbus_free_mmio() - Free a memory-mapped I/O range.
 * @start:	Base address of the range to be released.
 * @size:	Size of the range to be released.
 *
 * This function releases anything requested by
 * vmbus_allocate_mmio().
 */
void vmbus_free_mmio(resource_size_t start, resource_size_t size)
{
	struct resource *iter;

	mutex_lock(&hyperv_mmio_lock);

	/*
	 * If all bytes of the MMIO range to be released are within the
	 * special case fb_mmio shadow region, skip releasing the shadow
	 * region since no corresponding __request_region() was done
	 * in vmbus_allocate_mmio().
	 */
	if (fb_mmio && start >= fb_mmio->start &&
	    (start + size - 1 <= fb_mmio->end))
		goto skip_shadow_release;

	for (iter = hyperv_mmio; iter; iter = iter->sibling) {
		if ((iter->start >= start + size) || (iter->end <= start))
			continue;

		__release_region(iter, start, size);
	}

skip_shadow_release:
	release_mem_region(start, size);
	mutex_unlock(&hyperv_mmio_lock);
}
EXPORT_SYMBOL_GPL(vmbus_free_mmio);
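
/*
 * For reference, a minimal sketch of how a VMBus driver pairs these two
 * helpers (the size and alignment values are illustrative only):
 *
 *	struct resource *mmio;
 *	int ret;
 *
 *	ret = vmbus_allocate_mmio(&mmio, hv_dev, 0, -1,
 *				  0x100000, 0x1000, false);
 *	if (ret)
 *		return ret;
 *
 *	... map and use the range mmio->start through mmio->end ...
 *
 *	vmbus_free_mmio(mmio->start, resource_size(mmio));
 */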

#ifdef CONFIG_ACPI
static int vmbus_acpi_add(struct platform_device *pdev)
{
	acpi_status result;
	int ret_val = -ENODEV;
	struct acpi_device *ancestor;
	struct acpi_device *device = ACPI_COMPANION(&pdev->dev);

	vmbus_root_device = &device->dev;

	/*
	 * Older versions of Hyper-V for ARM64 fail to include the _CCA
	 * method on the top level VMbus device in the DSDT. But devices
	 * are hardware coherent in all current Hyper-V use cases, so fix
	 * up the ACPI device to behave as if _CCA is present and indicates
	 * hardware coherence.
	 */
	ACPI_COMPANION_SET(&device->dev, device);
	if (IS_ENABLED(CONFIG_ACPI_CCA_REQUIRED) &&
	    device_get_dma_attr(&device->dev) == DEV_DMA_NOT_SUPPORTED) {
		pr_info("No ACPI _CCA found; assuming coherent device I/O\n");
		device->flags.cca_seen = true;
		device->flags.coherent_dma = true;
	}

	result = acpi_walk_resources(device->handle, METHOD_NAME__CRS,
				     vmbus_walk_resources, NULL);

	if (ACPI_FAILURE(result))
		goto acpi_walk_err;
	/*
	 * Some ancestor of the vmbus acpi device (Gen1 or Gen2
	 * firmware) is the VMOD that has the mmio ranges. Get that.
	 */
	for (ancestor = acpi_dev_parent(device);
	     ancestor && ancestor->handle != ACPI_ROOT_OBJECT;
	     ancestor = acpi_dev_parent(ancestor)) {
		result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS,
					     vmbus_walk_resources, NULL);

		if (ACPI_FAILURE(result))
			continue;
		if (hyperv_mmio) {
			vmbus_reserve_fb();
			break;
		}
	}
	ret_val = 0;

acpi_walk_err:
	if (ret_val)
		vmbus_mmio_remove();
	return ret_val;
}
#else
static int vmbus_acpi_add(struct platform_device *pdev)
{
	return 0;
}
#endif

#ifndef HYPERVISOR_CALLBACK_VECTOR
static int vmbus_set_irq(struct platform_device *pdev)
{
	struct irq_data *data;
	int irq;
	irq_hw_number_t hwirq;

	irq = platform_get_irq(pdev, 0);
	/* platform_get_irq() may not return 0. */
	if (irq < 0)
		return irq;

	data = irq_get_irq_data(irq);
	if (!data) {
		pr_err("No interrupt data for VMBus virq %d\n", irq);
		return -ENODEV;
	}
	hwirq = irqd_to_hwirq(data);

	vmbus_irq = irq;
	vmbus_interrupt = hwirq;
	pr_debug("VMBus virq %d, hwirq %d\n", vmbus_irq, vmbus_interrupt);

	return 0;
}
#endif

static int vmbus_device_add(struct platform_device *pdev)
{
	struct resource **cur_res = &hyperv_mmio;
	struct of_range range;
	struct of_range_parser parser;
	struct device_node *np = pdev->dev.of_node;
	int ret;

	vmbus_root_device = &pdev->dev;

	ret = of_range_parser_init(&parser, np);
	if (ret)
		return ret;

#ifndef HYPERVISOR_CALLBACK_VECTOR
	ret = vmbus_set_irq(pdev);
	if (ret)
		return ret;
#endif
	for_each_of_range(&parser, &range) {
		struct resource *res;

		res = kzalloc(sizeof(*res), GFP_KERNEL);
		if (!res) {
			vmbus_mmio_remove();
			return -ENOMEM;
		}

		res->name = "hyperv mmio";
		res->flags = range.flags;
		res->start = range.cpu_addr;
		res->end = range.cpu_addr + range.size;

		*cur_res = res;
		cur_res = &res->sibling;
	}

	return ret;
}

static int vmbus_platform_driver_probe(struct platform_device *pdev)
{
	if (acpi_disabled)
		return vmbus_device_add(pdev);
	else
		return vmbus_acpi_add(pdev);
}

static void vmbus_platform_driver_remove(struct platform_device *pdev)
{
	vmbus_mmio_remove();
}

#ifdef CONFIG_PM_SLEEP
static int vmbus_bus_suspend(struct device *dev)
{
	struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(
			hv_context.cpu_context, VMBUS_CONNECT_CPU);
	struct vmbus_channel *channel, *sc;

	tasklet_disable(&hv_cpu->msg_dpc);
	vmbus_connection.ignore_any_offer_msg = true;
	/* The tasklet_enable() takes care of providing a memory barrier */
	tasklet_enable(&hv_cpu->msg_dpc);

	/* Drain all the workqueues as we are in suspend */
	drain_workqueue(vmbus_connection.rescind_work_queue);
	drain_workqueue(vmbus_connection.work_queue);
	drain_workqueue(vmbus_connection.handle_primary_chan_wq);
	drain_workqueue(vmbus_connection.handle_sub_chan_wq);

	mutex_lock(&vmbus_connection.channel_mutex);
	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		if (!is_hvsock_channel(channel))
			continue;

		vmbus_force_channel_rescinded(channel);
	}
	mutex_unlock(&vmbus_connection.channel_mutex);

	/*
	 * Wait until all the sub-channels and hv_sock channels have been
	 * cleaned up. Sub-channels should be destroyed upon suspend, otherwise
	 * they would conflict with the new sub-channels that will be created
	 * in the resume path. hv_sock channels should also be destroyed, but
	 * an hv_sock channel of an established hv_sock connection cannot
	 * really be destroyed since it may still be referenced by the
	 * userspace application, so we just force the hv_sock channel to be
	 * rescinded by vmbus_force_channel_rescinded(), and the userspace
	 * application will thoroughly destroy the channel after hibernation.
	 *
	 * Note: the counter nr_chan_close_on_suspend may never go above 0 if
	 * the VM has no sub-channel and hv_sock channel, e.g. a 1-vCPU VM.
	 */
	if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0)
		wait_for_completion(&vmbus_connection.ready_for_suspend_event);

	mutex_lock(&vmbus_connection.channel_mutex);

	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		/*
		 * Remove the channel from the array of channels and invalidate
		 * the channel's relid. Upon resume, vmbus_onoffer() will fix
		 * up the relid (and other fields, if necessary) and add the
		 * channel back to the array.
		 */
		vmbus_channel_unmap_relid(channel);
		channel->offermsg.child_relid = INVALID_RELID;

		if (is_hvsock_channel(channel)) {
			if (!channel->rescind) {
				pr_err("hv_sock channel not rescinded!\n");
				WARN_ON_ONCE(1);
			}
			continue;
		}

		list_for_each_entry(sc, &channel->sc_list, sc_list) {
			pr_err("Sub-channel not deleted!\n");
			WARN_ON_ONCE(1);
		}
	}

	mutex_unlock(&vmbus_connection.channel_mutex);

	vmbus_initiate_unload(false);

	return 0;
}

static int vmbus_bus_resume(struct device *dev)
{
	struct vmbus_channel *channel;
	struct vmbus_channel_msginfo *msginfo;
	size_t msgsize;
	int ret;

	vmbus_connection.ignore_any_offer_msg = false;

	/*
	 * We only use the 'vmbus_proto_version', which was in use before
	 * hibernation, to re-negotiate with the host.
	 */
	if (!vmbus_proto_version) {
		pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version);
		return -EINVAL;
	}

	msgsize = sizeof(*msginfo) +
		  sizeof(struct vmbus_channel_initiate_contact);

	msginfo = kzalloc(msgsize, GFP_KERNEL);

	if (msginfo == NULL)
		return -ENOMEM;

	ret = vmbus_negotiate_version(msginfo, vmbus_proto_version);

	kfree(msginfo);

	if (ret != 0)
		return ret;

	vmbus_request_offers();

	mutex_lock(&vmbus_connection.channel_mutex);
	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		if (channel->offermsg.child_relid != INVALID_RELID)
			continue;

		/* hvsock channels are not expected to be present. */
		if (is_hvsock_channel(channel))
			continue;

		pr_err("channel %pUl/%pUl not present after resume.\n",
		       &channel->offermsg.offer.if_type,
		       &channel->offermsg.offer.if_instance);
		/* ToDo: Cleanup these channels here */
	}
	mutex_unlock(&vmbus_connection.channel_mutex);

	/* Reset the event for the next suspend. */
	reinit_completion(&vmbus_connection.ready_for_suspend_event);

	return 0;
}
#else
#define vmbus_bus_suspend	NULL
#define vmbus_bus_resume	NULL
#endif /* CONFIG_PM_SLEEP */

static const __maybe_unused struct of_device_id vmbus_of_match[] = {
	{
		.compatible = "microsoft,vmbus",
	},
	{
		/* sentinel */
	},
};
MODULE_DEVICE_TABLE(of, vmbus_of_match);

static const __maybe_unused struct acpi_device_id vmbus_acpi_device_ids[] = {
	{"VMBUS", 0},
	{"VMBus", 0},
	{"", 0},
};
MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids);

/*
 * Note: we must use the "noirq" ops, otherwise hibernation cannot work with
 * PCI device assignment, because "pci_dev_pm_ops" uses the "noirq" ops: in
 * the resume path, the pci "noirq" restore op runs before the "non-noirq"
 * ops (see resume_target_kernel() -> dpm_resume_start(), and
 * hibernation_restore() -> dpm_resume_end()). This means vmbus_bus_resume()
 * and pci-hyperv's resume callback must also run via the "noirq" ops.
 *
 * Set suspend_noirq/resume_noirq to NULL for Suspend-to-Idle: see the comment
 * earlier in this file before vmbus_pm.
 */
static const struct dev_pm_ops vmbus_bus_pm = {
	.suspend_noirq	= NULL,
	.resume_noirq	= NULL,
	.freeze_noirq	= vmbus_bus_suspend,
	.thaw_noirq	= vmbus_bus_resume,
	.poweroff_noirq	= vmbus_bus_suspend,
	.restore_noirq	= vmbus_bus_resume
};

static struct platform_driver vmbus_platform_driver = {
	.probe = vmbus_platform_driver_probe,
	.remove = vmbus_platform_driver_remove,
	.driver = {
		.name = "vmbus",
		.acpi_match_table = ACPI_PTR(vmbus_acpi_device_ids),
		.of_match_table = of_match_ptr(vmbus_of_match),
		.pm = &vmbus_bus_pm,
		.probe_type = PROBE_FORCE_SYNCHRONOUS,
	}
};

static void hv_kexec_handler(void)
{
	hv_stimer_global_cleanup();
	vmbus_initiate_unload(false);
	/* Make sure conn_state is set as hv_synic_cleanup checks for it */
	mb();
	cpuhp_remove_state(hyperv_cpuhp_online);
}

static void hv_crash_handler(struct pt_regs *regs)
{
	int cpu;

	vmbus_initiate_unload(true);
	/*
	 * In the crash handler we can't schedule synic cleanup for all CPUs,
	 * so do the cleanup for the current CPU only. This should be
	 * sufficient for kdump.
	 */
	cpu = smp_processor_id();
	hv_stimer_cleanup(cpu);
	hv_hyp_synic_disable_regs(cpu);
}

static int hv_synic_suspend(void *data)
{
	/*
	 * When we reach here, all the non-boot CPUs have been offlined.
	 * If we're in a legacy configuration where stimer Direct Mode is
	 * not enabled, the stimers on the non-boot CPUs have been unbound
	 * in hv_synic_cleanup() -> hv_stimer_legacy_cleanup() ->
	 * hv_stimer_cleanup() -> clockevents_unbind_device().
	 *
	 * hv_synic_suspend() only runs on CPU0 with interrupts disabled.
	 * Here we do not call hv_stimer_legacy_cleanup() on CPU0 because:
	 * 1) it's unnecessary, as interrupts remain disabled between
	 * syscore_suspend() and syscore_resume(): see create_image() and
	 * resume_target_kernel();
	 * 2) the stimer on CPU0 is automatically disabled later by
	 * syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ...
	 * -> clockevents_shutdown() -> ... -> hv_ce_shutdown();
	 * 3) a warning would be triggered if we call
	 * clockevents_unbind_device(), which may sleep, in an
	 * interrupts-disabled context.
	 */

	hv_hyp_synic_disable_regs(0);

	return 0;
}

static void hv_synic_resume(void *data)
{
	hv_hyp_synic_enable_regs(0);

	/*
	 * Note: we don't need to call hv_stimer_init(0), because the timer
	 * on CPU0 is not unbound in hv_synic_suspend(), and the timer is
	 * automatically re-enabled in timekeeping_resume().
	 */
}

/* The callbacks run only on CPU0, with irqs_disabled. */
static const struct syscore_ops hv_synic_syscore_ops = {
	.suspend = hv_synic_suspend,
	.resume = hv_synic_resume,
};

static struct syscore hv_synic_syscore = {
	.ops = &hv_synic_syscore_ops,
};

static int __init hv_acpi_init(void)
{
	int ret;

	if (!hv_is_hyperv_initialized())
		return -ENODEV;

	if (hv_root_partition() && !hv_nested)
		return 0;

	/*
	 * Get ACPI resources first.
	 */
	ret = platform_driver_register(&vmbus_platform_driver);
	if (ret)
		return ret;

	if (!vmbus_root_device) {
		ret = -ENODEV;
		goto cleanup;
	}

	/*
	 * If we're on an architecture with a hardcoded hypervisor
	 * vector (i.e. x86/x64), override the VMbus interrupt found
	 * in the ACPI tables. Ensure vmbus_irq is not set since the
	 * normal Linux IRQ mechanism is not used in this case.
	 */
#ifdef HYPERVISOR_CALLBACK_VECTOR
	vmbus_interrupt = HYPERVISOR_CALLBACK_VECTOR;
	vmbus_irq = -1;
#endif

	hv_debug_init();

	ret = vmbus_bus_init();
	if (ret)
		goto cleanup;

	hv_setup_kexec_handler(hv_kexec_handler);
	hv_setup_crash_handler(hv_crash_handler);

	register_syscore(&hv_synic_syscore);

	return 0;

cleanup:
	platform_driver_unregister(&vmbus_platform_driver);
	vmbus_root_device = NULL;
	return ret;
}

static void __exit vmbus_exit(void)
{
	int cpu;

	unregister_syscore(&hv_synic_syscore);

	hv_remove_kexec_handler();
	hv_remove_crash_handler();
	vmbus_connection.conn_state = DISCONNECTED;
	hv_stimer_global_cleanup();
	vmbus_disconnect();
	if (vmbus_irq == -1) {
		hv_remove_vmbus_handler();
	} else {
		free_percpu_irq(vmbus_irq, vmbus_evt);
		free_percpu(vmbus_evt);
	}
	for_each_online_cpu(cpu) {
		struct hv_per_cpu_context *hv_cpu
			= per_cpu_ptr(hv_context.cpu_context, cpu);

		tasklet_kill(&hv_cpu->msg_dpc);
	}
	hv_debug_rm_all_dir();

	vmbus_free_channels();
	kfree(vmbus_connection.channels);

	/*
	 * The vmbus panic notifier is always registered, hence we
	 * unconditionally unregister it here.
	 */
	atomic_notifier_chain_unregister(&panic_notifier_list,
					 &hyperv_panic_vmbus_unload_block);

	bus_unregister(&hv_bus);

	cpuhp_remove_state(hyperv_cpuhp_online);
	hv_synic_free();
	platform_driver_unregister(&vmbus_platform_driver);
}

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V VMBus Driver");

subsys_initcall(hv_acpi_init);
module_exit(vmbus_exit);