// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) Microsoft Corporation.
 *
 * Author:
 *   Jake Oshins <jakeo@microsoft.com>
 *
 * This driver acts as a paravirtual front-end for PCI Express root buses.
 * When a PCI Express function (either an entire device or an SR-IOV
 * Virtual Function) is being passed through to the VM, this driver exposes
 * a new bus to the guest VM.  This is modeled as a root PCI bus because
 * no bridges are being exposed to the VM.  In fact, with a "Generation 2"
 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
 * until a device has been exposed using this driver.
 *
 * Each root PCI bus has its own PCI domain, which is called "Segment" in
 * the PCI Firmware Specifications.  Thus while each device passed through
 * to the VM using this front-end will appear at "device 0", the domain will
 * be unique.  Typically, each bus will have one PCI function on it, though
 * this driver does support more than one.
 *
 * In order to map the interrupts from the device through to the guest VM,
 * this driver also implements an IRQ Domain, which handles interrupts (either
 * MSI or MSI-X) associated with the functions on the bus.  As interrupts are
 * set up, torn down, or reaffined, this driver communicates with the
 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
 * interrupt will be delivered to the correct virtual processor at the right
 * vector.  This driver does not support level-triggered (line-based)
 * interrupts, and will report that the Interrupt Line register in the
 * function's configuration space is zero.
 *
 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
 * facilities.  For instance, the configuration space of a function exposed
 * by Hyper-V is mapped into a single page of memory space, and the
 * read and write handlers for config space must be aware of this mechanism.
36 * Similarly, device setup and teardown involves messages sent to and from 37 * the PCI back-end driver in Hyper-V. 38 */ 39 40 #include <linux/kernel.h> 41 #include <linux/module.h> 42 #include <linux/pci.h> 43 #include <linux/pci-ecam.h> 44 #include <linux/delay.h> 45 #include <linux/semaphore.h> 46 #include <linux/irq.h> 47 #include <linux/msi.h> 48 #include <linux/hyperv.h> 49 #include <linux/refcount.h> 50 #include <linux/irqdomain.h> 51 #include <linux/acpi.h> 52 #include <linux/sizes.h> 53 #include <asm/mshyperv.h> 54 55 /* 56 * Protocol versions. The low word is the minor version, the high word the 57 * major version. 58 */ 59 60 #define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor))) 61 #define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16) 62 #define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff) 63 64 enum pci_protocol_version_t { 65 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */ 66 PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */ 67 PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3), /* Vibranium */ 68 PCI_PROTOCOL_VERSION_1_4 = PCI_MAKE_VERSION(1, 4), /* WS2022 */ 69 }; 70 71 #define CPU_AFFINITY_ALL -1ULL 72 73 /* 74 * Supported protocol versions in the order of probing - highest go 75 * first. 
76 */ 77 static enum pci_protocol_version_t pci_protocol_versions[] = { 78 PCI_PROTOCOL_VERSION_1_4, 79 PCI_PROTOCOL_VERSION_1_3, 80 PCI_PROTOCOL_VERSION_1_2, 81 PCI_PROTOCOL_VERSION_1_1, 82 }; 83 84 #define PCI_CONFIG_MMIO_LENGTH 0x2000 85 #define CFG_PAGE_OFFSET 0x1000 86 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) 87 88 #define MAX_SUPPORTED_MSI_MESSAGES 0x400 89 90 #define STATUS_REVISION_MISMATCH 0xC0000059 91 92 /* space for 32bit serial number as string */ 93 #define SLOT_NAME_SIZE 11 94 95 /* 96 * Size of requestor for VMbus; the value is based on the observation 97 * that having more than one request outstanding is 'rare', and so 64 98 * should be generous in ensuring that we don't ever run out. 99 */ 100 #define HV_PCI_RQSTOR_SIZE 64 101 102 /* 103 * Message Types 104 */ 105 106 enum pci_message_type { 107 /* 108 * Version 1.1 109 */ 110 PCI_MESSAGE_BASE = 0x42490000, 111 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, 112 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, 113 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, 114 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, 115 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, 116 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, 117 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, 118 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, 119 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, 120 PCI_EJECT = PCI_MESSAGE_BASE + 0xB, 121 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, 122 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, 123 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, 124 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, 125 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, 126 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, 127 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, 128 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, 129 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, 130 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, 131 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16, 132 
PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17, 133 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */ 134 PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19, 135 PCI_RESOURCES_ASSIGNED3 = PCI_MESSAGE_BASE + 0x1A, 136 PCI_CREATE_INTERRUPT_MESSAGE3 = PCI_MESSAGE_BASE + 0x1B, 137 PCI_MESSAGE_MAXIMUM 138 }; 139 140 /* 141 * Structures defining the virtual PCI Express protocol. 142 */ 143 144 union pci_version { 145 struct { 146 u16 minor_version; 147 u16 major_version; 148 } parts; 149 u32 version; 150 } __packed; 151 152 /* 153 * Function numbers are 8-bits wide on Express, as interpreted through ARI, 154 * which is all this driver does. This representation is the one used in 155 * Windows, which is what is expected when sending this back and forth with 156 * the Hyper-V parent partition. 157 */ 158 union win_slot_encoding { 159 struct { 160 u32 dev:5; 161 u32 func:3; 162 u32 reserved:24; 163 } bits; 164 u32 slot; 165 } __packed; 166 167 /* 168 * Pretty much as defined in the PCI Specifications. 169 */ 170 struct pci_function_description { 171 u16 v_id; /* vendor ID */ 172 u16 d_id; /* device ID */ 173 u8 rev; 174 u8 prog_intf; 175 u8 subclass; 176 u8 base_class; 177 u32 subsystem_id; 178 union win_slot_encoding win_slot; 179 u32 ser; /* serial number */ 180 } __packed; 181 182 enum pci_device_description_flags { 183 HV_PCI_DEVICE_FLAG_NONE = 0x0, 184 HV_PCI_DEVICE_FLAG_NUMA_AFFINITY = 0x1, 185 }; 186 187 struct pci_function_description2 { 188 u16 v_id; /* vendor ID */ 189 u16 d_id; /* device ID */ 190 u8 rev; 191 u8 prog_intf; 192 u8 subclass; 193 u8 base_class; 194 u32 subsystem_id; 195 union win_slot_encoding win_slot; 196 u32 ser; /* serial number */ 197 u32 flags; 198 u16 virtual_numa_node; 199 u16 reserved; 200 } __packed; 201 202 /** 203 * struct hv_msi_desc 204 * @vector: IDT entry 205 * @delivery_mode: As defined in Intel's Programmer's 206 * Reference Manual, Volume 3, Chapter 8. 
207 * @vector_count: Number of contiguous entries in the 208 * Interrupt Descriptor Table that are 209 * occupied by this Message-Signaled 210 * Interrupt. For "MSI", as first defined 211 * in PCI 2.2, this can be between 1 and 212 * 32. For "MSI-X," as first defined in PCI 213 * 3.0, this must be 1, as each MSI-X table 214 * entry would have its own descriptor. 215 * @reserved: Empty space 216 * @cpu_mask: All the target virtual processors. 217 */ 218 struct hv_msi_desc { 219 u8 vector; 220 u8 delivery_mode; 221 u16 vector_count; 222 u32 reserved; 223 u64 cpu_mask; 224 } __packed; 225 226 /** 227 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc 228 * @vector: IDT entry 229 * @delivery_mode: As defined in Intel's Programmer's 230 * Reference Manual, Volume 3, Chapter 8. 231 * @vector_count: Number of contiguous entries in the 232 * Interrupt Descriptor Table that are 233 * occupied by this Message-Signaled 234 * Interrupt. For "MSI", as first defined 235 * in PCI 2.2, this can be between 1 and 236 * 32. For "MSI-X," as first defined in PCI 237 * 3.0, this must be 1, as each MSI-X table 238 * entry would have its own descriptor. 239 * @processor_count: number of bits enabled in array. 240 * @processor_array: All the target virtual processors. 241 */ 242 struct hv_msi_desc2 { 243 u8 vector; 244 u8 delivery_mode; 245 u16 vector_count; 246 u16 processor_count; 247 u16 processor_array[32]; 248 } __packed; 249 250 /* 251 * struct hv_msi_desc3 - 1.3 version of hv_msi_desc 252 * Everything is the same as in 'hv_msi_desc2' except that the size of the 253 * 'vector' field is larger to support bigger vector values. For ex: LPI 254 * vectors on ARM. 
255 */ 256 struct hv_msi_desc3 { 257 u32 vector; 258 u8 delivery_mode; 259 u8 reserved; 260 u16 vector_count; 261 u16 processor_count; 262 u16 processor_array[32]; 263 } __packed; 264 265 /** 266 * struct tran_int_desc 267 * @reserved: unused, padding 268 * @vector_count: same as in hv_msi_desc 269 * @data: This is the "data payload" value that is 270 * written by the device when it generates 271 * a message-signaled interrupt, either MSI 272 * or MSI-X. 273 * @address: This is the address to which the data 274 * payload is written on interrupt 275 * generation. 276 */ 277 struct tran_int_desc { 278 u16 reserved; 279 u16 vector_count; 280 u32 data; 281 u64 address; 282 } __packed; 283 284 /* 285 * A generic message format for virtual PCI. 286 * Specific message formats are defined later in the file. 287 */ 288 289 struct pci_message { 290 u32 type; 291 } __packed; 292 293 struct pci_child_message { 294 struct pci_message message_type; 295 union win_slot_encoding wslot; 296 } __packed; 297 298 struct pci_incoming_message { 299 struct vmpacket_descriptor hdr; 300 struct pci_message message_type; 301 } __packed; 302 303 struct pci_response { 304 struct vmpacket_descriptor hdr; 305 s32 status; /* negative values are failures */ 306 } __packed; 307 308 struct pci_packet { 309 void (*completion_func)(void *context, struct pci_response *resp, 310 int resp_packet_size); 311 void *compl_ctxt; 312 313 struct pci_message message[]; 314 }; 315 316 /* 317 * Specific message types supporting the PCI protocol. 318 */ 319 320 /* 321 * Version negotiation message. Sent from the guest to the host. 322 * The guest is free to try different versions until the host 323 * accepts the version. 324 * 325 * pci_version: The protocol version requested. 326 * is_last_attempt: If TRUE, this is the last version guest will request. 327 * reservedz: Reserved field, set to zero. 
328 */ 329 330 struct pci_version_request { 331 struct pci_message message_type; 332 u32 protocol_version; 333 } __packed; 334 335 /* 336 * Bus D0 Entry. This is sent from the guest to the host when the virtual 337 * bus (PCI Express port) is ready for action. 338 */ 339 340 struct pci_bus_d0_entry { 341 struct pci_message message_type; 342 u32 reserved; 343 u64 mmio_base; 344 } __packed; 345 346 struct pci_bus_relations { 347 struct pci_incoming_message incoming; 348 u32 device_count; 349 struct pci_function_description func[]; 350 } __packed; 351 352 struct pci_bus_relations2 { 353 struct pci_incoming_message incoming; 354 u32 device_count; 355 struct pci_function_description2 func[]; 356 } __packed; 357 358 struct pci_q_res_req_response { 359 struct vmpacket_descriptor hdr; 360 s32 status; /* negative values are failures */ 361 u32 probed_bar[PCI_STD_NUM_BARS]; 362 } __packed; 363 364 struct pci_set_power { 365 struct pci_message message_type; 366 union win_slot_encoding wslot; 367 u32 power_state; /* In Windows terms */ 368 u32 reserved; 369 } __packed; 370 371 struct pci_set_power_response { 372 struct vmpacket_descriptor hdr; 373 s32 status; /* negative values are failures */ 374 union win_slot_encoding wslot; 375 u32 resultant_state; /* In Windows terms */ 376 u32 reserved; 377 } __packed; 378 379 struct pci_resources_assigned { 380 struct pci_message message_type; 381 union win_slot_encoding wslot; 382 u8 memory_range[0x14][6]; /* not used here */ 383 u32 msi_descriptors; 384 u32 reserved[4]; 385 } __packed; 386 387 struct pci_resources_assigned2 { 388 struct pci_message message_type; 389 union win_slot_encoding wslot; 390 u8 memory_range[0x14][6]; /* not used here */ 391 u32 msi_descriptor_count; 392 u8 reserved[70]; 393 } __packed; 394 395 struct pci_create_interrupt { 396 struct pci_message message_type; 397 union win_slot_encoding wslot; 398 struct hv_msi_desc int_desc; 399 } __packed; 400 401 struct pci_create_int_response { 402 struct pci_response 
response; 403 u32 reserved; 404 struct tran_int_desc int_desc; 405 } __packed; 406 407 struct pci_create_interrupt2 { 408 struct pci_message message_type; 409 union win_slot_encoding wslot; 410 struct hv_msi_desc2 int_desc; 411 } __packed; 412 413 struct pci_create_interrupt3 { 414 struct pci_message message_type; 415 union win_slot_encoding wslot; 416 struct hv_msi_desc3 int_desc; 417 } __packed; 418 419 struct pci_delete_interrupt { 420 struct pci_message message_type; 421 union win_slot_encoding wslot; 422 struct tran_int_desc int_desc; 423 } __packed; 424 425 /* 426 * Note: the VM must pass a valid block id, wslot and bytes_requested. 427 */ 428 struct pci_read_block { 429 struct pci_message message_type; 430 u32 block_id; 431 union win_slot_encoding wslot; 432 u32 bytes_requested; 433 } __packed; 434 435 struct pci_read_block_response { 436 struct vmpacket_descriptor hdr; 437 u32 status; 438 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 439 } __packed; 440 441 /* 442 * Note: the VM must pass a valid block id, wslot and byte_count. 443 */ 444 struct pci_write_block { 445 struct pci_message message_type; 446 u32 block_id; 447 union win_slot_encoding wslot; 448 u32 byte_count; 449 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 450 } __packed; 451 452 struct pci_dev_inval_block { 453 struct pci_incoming_message incoming; 454 union win_slot_encoding wslot; 455 u64 block_mask; 456 } __packed; 457 458 struct pci_dev_incoming { 459 struct pci_incoming_message incoming; 460 union win_slot_encoding wslot; 461 } __packed; 462 463 struct pci_eject_response { 464 struct pci_message message_type; 465 union win_slot_encoding wslot; 466 u32 status; 467 } __packed; 468 469 static int pci_ring_size = VMBUS_RING_SIZE(SZ_16K); 470 471 /* 472 * Driver specific state. 
473 */ 474 475 enum hv_pcibus_state { 476 hv_pcibus_init = 0, 477 hv_pcibus_probed, 478 hv_pcibus_installed, 479 hv_pcibus_removing, 480 hv_pcibus_maximum 481 }; 482 483 struct hv_pcibus_device { 484 #ifdef CONFIG_X86 485 struct pci_sysdata sysdata; 486 #elif defined(CONFIG_ARM64) 487 struct pci_config_window sysdata; 488 #endif 489 struct pci_host_bridge *bridge; 490 struct fwnode_handle *fwnode; 491 /* Protocol version negotiated with the host */ 492 enum pci_protocol_version_t protocol_version; 493 494 struct mutex state_lock; 495 enum hv_pcibus_state state; 496 497 struct hv_device *hdev; 498 resource_size_t low_mmio_space; 499 resource_size_t high_mmio_space; 500 struct resource *mem_config; 501 struct resource *low_mmio_res; 502 struct resource *high_mmio_res; 503 struct completion *survey_event; 504 struct pci_bus *pci_bus; 505 spinlock_t config_lock; /* Avoid two threads writing index page */ 506 spinlock_t device_list_lock; /* Protect lists below */ 507 void __iomem *cfg_addr; 508 509 struct list_head children; 510 struct list_head dr_list; 511 512 struct msi_domain_info msi_info; 513 struct irq_domain *irq_domain; 514 515 struct workqueue_struct *wq; 516 517 /* Highest slot of child device with resources allocated */ 518 int wslot_res_allocated; 519 bool use_calls; /* Use hypercalls to access mmio cfg space */ 520 }; 521 522 /* 523 * Tracks "Device Relations" messages from the host, which must be both 524 * processed in order and deferred so that they don't run in the context 525 * of the incoming packet callback. 
526 */ 527 struct hv_dr_work { 528 struct work_struct wrk; 529 struct hv_pcibus_device *bus; 530 }; 531 532 struct hv_pcidev_description { 533 u16 v_id; /* vendor ID */ 534 u16 d_id; /* device ID */ 535 u8 rev; 536 u8 prog_intf; 537 u8 subclass; 538 u8 base_class; 539 u32 subsystem_id; 540 union win_slot_encoding win_slot; 541 u32 ser; /* serial number */ 542 u32 flags; 543 u16 virtual_numa_node; 544 }; 545 546 struct hv_dr_state { 547 struct list_head list_entry; 548 u32 device_count; 549 struct hv_pcidev_description func[] __counted_by(device_count); 550 }; 551 552 struct hv_pci_dev { 553 /* List protected by pci_rescan_remove_lock */ 554 struct list_head list_entry; 555 refcount_t refs; 556 struct pci_slot *pci_slot; 557 struct hv_pcidev_description desc; 558 bool reported_missing; 559 struct hv_pcibus_device *hbus; 560 struct work_struct wrk; 561 562 void (*block_invalidate)(void *context, u64 block_mask); 563 void *invalidate_context; 564 565 /* 566 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then 567 * read it back, for each of the BAR offsets within config space. 568 */ 569 u32 probed_bar[PCI_STD_NUM_BARS]; 570 }; 571 572 struct hv_pci_compl { 573 struct completion host_event; 574 s32 completion_status; 575 }; 576 577 static void hv_pci_onchannelcallback(void *context); 578 579 #ifdef CONFIG_X86 580 #define DELIVERY_MODE APIC_DELIVERY_MODE_FIXED 581 #define FLOW_HANDLER handle_edge_irq 582 #define FLOW_NAME "edge" 583 584 static int hv_pci_irqchip_init(void) 585 { 586 return 0; 587 } 588 589 static struct irq_domain *hv_pci_get_root_domain(void) 590 { 591 return x86_vector_domain; 592 } 593 594 static unsigned int hv_msi_get_int_vector(struct irq_data *data) 595 { 596 struct irq_cfg *cfg = irqd_cfg(data); 597 598 return cfg->vector; 599 } 600 601 #define hv_msi_prepare pci_msi_prepare 602 603 /** 604 * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current 605 * affinity. 
606 * @data: Describes the IRQ 607 * 608 * Build new a destination for the MSI and make a hypercall to 609 * update the Interrupt Redirection Table. "Device Logical ID" 610 * is built out of this PCI bus's instance GUID and the function 611 * number of the device. 612 */ 613 static void hv_arch_irq_unmask(struct irq_data *data) 614 { 615 struct msi_desc *msi_desc = irq_data_get_msi_desc(data); 616 struct hv_retarget_device_interrupt *params; 617 struct tran_int_desc *int_desc; 618 struct hv_pcibus_device *hbus; 619 const struct cpumask *dest; 620 cpumask_var_t tmp; 621 struct pci_bus *pbus; 622 struct pci_dev *pdev; 623 unsigned long flags; 624 u32 var_size = 0; 625 int cpu, nr_bank; 626 u64 res; 627 628 dest = irq_data_get_effective_affinity_mask(data); 629 pdev = msi_desc_to_pci_dev(msi_desc); 630 pbus = pdev->bus; 631 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 632 int_desc = data->chip_data; 633 if (!int_desc) { 634 dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n", 635 __func__, data->irq); 636 return; 637 } 638 639 local_irq_save(flags); 640 641 params = *this_cpu_ptr(hyperv_pcpu_input_arg); 642 memset(params, 0, sizeof(*params)); 643 params->partition_id = HV_PARTITION_ID_SELF; 644 params->int_entry.source = HV_INTERRUPT_SOURCE_MSI; 645 params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff; 646 params->int_entry.msi_entry.data.as_uint32 = int_desc->data; 647 params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | 648 (hbus->hdev->dev_instance.b[4] << 16) | 649 (hbus->hdev->dev_instance.b[7] << 8) | 650 (hbus->hdev->dev_instance.b[6] & 0xf8) | 651 PCI_FUNC(pdev->devfn); 652 params->int_target.vector = hv_msi_get_int_vector(data); 653 654 if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) { 655 /* 656 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the 657 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides 658 * with >64 VP support. 
659 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED 660 * is not sufficient for this hypercall. 661 */ 662 params->int_target.flags |= 663 HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; 664 665 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) { 666 res = 1; 667 goto out; 668 } 669 670 cpumask_and(tmp, dest, cpu_online_mask); 671 nr_bank = cpumask_to_vpset(¶ms->int_target.vp_set, tmp); 672 free_cpumask_var(tmp); 673 674 if (nr_bank <= 0) { 675 res = 1; 676 goto out; 677 } 678 679 /* 680 * var-sized hypercall, var-size starts after vp_mask (thus 681 * vp_set.format does not count, but vp_set.valid_bank_mask 682 * does). 683 */ 684 var_size = 1 + nr_bank; 685 } else { 686 for_each_cpu_and(cpu, dest, cpu_online_mask) { 687 params->int_target.vp_mask |= 688 (1ULL << hv_cpu_number_to_vp_number(cpu)); 689 } 690 } 691 692 res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17), 693 params, NULL); 694 695 out: 696 local_irq_restore(flags); 697 698 /* 699 * During hibernation, when a CPU is offlined, the kernel tries 700 * to move the interrupt to the remaining CPUs that haven't 701 * been offlined yet. In this case, the below hv_do_hypercall() 702 * always fails since the vmbus channel has been closed: 703 * refer to cpu_disable_common() -> fixup_irqs() -> 704 * irq_migrate_all_off_this_cpu() -> migrate_one_irq(). 705 * 706 * Suppress the error message for hibernation because the failure 707 * during hibernation does not matter (at this time all the devices 708 * have been frozen). Note: the correct affinity info is still updated 709 * into the irqdata data structure in migrate_one_irq() -> 710 * irq_do_set_affinity(), so later when the VM resumes, 711 * hv_pci_restore_msi_state() is able to correctly restore the 712 * interrupt with the correct affinity. 
713 */ 714 if (!hv_result_success(res) && hbus->state != hv_pcibus_removing) 715 dev_err(&hbus->hdev->device, 716 "%s() failed: %#llx", __func__, res); 717 } 718 #elif defined(CONFIG_ARM64) 719 /* 720 * SPI vectors to use for vPCI; arch SPIs range is [32, 1019], but leaving a bit 721 * of room at the start to allow for SPIs to be specified through ACPI and 722 * starting with a power of two to satisfy power of 2 multi-MSI requirement. 723 */ 724 #define HV_PCI_MSI_SPI_START 64 725 #define HV_PCI_MSI_SPI_NR (1020 - HV_PCI_MSI_SPI_START) 726 #define DELIVERY_MODE 0 727 #define FLOW_HANDLER NULL 728 #define FLOW_NAME NULL 729 #define hv_msi_prepare NULL 730 731 struct hv_pci_chip_data { 732 DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR); 733 struct mutex map_lock; 734 }; 735 736 /* Hyper-V vPCI MSI GIC IRQ domain */ 737 static struct irq_domain *hv_msi_gic_irq_domain; 738 739 /* Hyper-V PCI MSI IRQ chip */ 740 static struct irq_chip hv_arm64_msi_irq_chip = { 741 .name = "MSI", 742 .irq_set_affinity = irq_chip_set_affinity_parent, 743 .irq_eoi = irq_chip_eoi_parent, 744 .irq_mask = irq_chip_mask_parent, 745 .irq_unmask = irq_chip_unmask_parent 746 }; 747 748 static unsigned int hv_msi_get_int_vector(struct irq_data *irqd) 749 { 750 return irqd->parent_data->hwirq; 751 } 752 753 /* 754 * @nr_bm_irqs: Indicates the number of IRQs that were allocated from 755 * the bitmap. 756 * @nr_dom_irqs: Indicates the number of IRQs that were allocated from 757 * the parent domain. 
758 */ 759 static void hv_pci_vec_irq_free(struct irq_domain *domain, 760 unsigned int virq, 761 unsigned int nr_bm_irqs, 762 unsigned int nr_dom_irqs) 763 { 764 struct hv_pci_chip_data *chip_data = domain->host_data; 765 struct irq_data *d = irq_domain_get_irq_data(domain, virq); 766 int first = d->hwirq - HV_PCI_MSI_SPI_START; 767 int i; 768 769 mutex_lock(&chip_data->map_lock); 770 bitmap_release_region(chip_data->spi_map, 771 first, 772 get_count_order(nr_bm_irqs)); 773 mutex_unlock(&chip_data->map_lock); 774 for (i = 0; i < nr_dom_irqs; i++) { 775 if (i) 776 d = irq_domain_get_irq_data(domain, virq + i); 777 irq_domain_reset_irq_data(d); 778 } 779 780 irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs); 781 } 782 783 static void hv_pci_vec_irq_domain_free(struct irq_domain *domain, 784 unsigned int virq, 785 unsigned int nr_irqs) 786 { 787 hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs); 788 } 789 790 static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain, 791 unsigned int nr_irqs, 792 irq_hw_number_t *hwirq) 793 { 794 struct hv_pci_chip_data *chip_data = domain->host_data; 795 int index; 796 797 /* Find and allocate region from the SPI bitmap */ 798 mutex_lock(&chip_data->map_lock); 799 index = bitmap_find_free_region(chip_data->spi_map, 800 HV_PCI_MSI_SPI_NR, 801 get_count_order(nr_irqs)); 802 mutex_unlock(&chip_data->map_lock); 803 if (index < 0) 804 return -ENOSPC; 805 806 *hwirq = index + HV_PCI_MSI_SPI_START; 807 808 return 0; 809 } 810 811 static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain, 812 unsigned int virq, 813 irq_hw_number_t hwirq) 814 { 815 struct irq_fwspec fwspec; 816 struct irq_data *d; 817 int ret; 818 819 fwspec.fwnode = domain->parent->fwnode; 820 fwspec.param_count = 2; 821 fwspec.param[0] = hwirq; 822 fwspec.param[1] = IRQ_TYPE_EDGE_RISING; 823 824 ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); 825 if (ret) 826 return ret; 827 828 /* 829 * Since the interrupt specifier is not coming from 
ACPI or DT, the 830 * trigger type will need to be set explicitly. Otherwise, it will be 831 * set to whatever is in the GIC configuration. 832 */ 833 d = irq_domain_get_irq_data(domain->parent, virq); 834 835 return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); 836 } 837 838 static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain, 839 unsigned int virq, unsigned int nr_irqs, 840 void *args) 841 { 842 irq_hw_number_t hwirq; 843 unsigned int i; 844 int ret; 845 846 ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq); 847 if (ret) 848 return ret; 849 850 for (i = 0; i < nr_irqs; i++) { 851 ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i, 852 hwirq + i); 853 if (ret) { 854 hv_pci_vec_irq_free(domain, virq, nr_irqs, i); 855 return ret; 856 } 857 858 irq_domain_set_hwirq_and_chip(domain, virq + i, 859 hwirq + i, 860 &hv_arm64_msi_irq_chip, 861 domain->host_data); 862 pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i); 863 } 864 865 return 0; 866 } 867 868 /* 869 * Pick the first cpu as the irq affinity that can be temporarily used for 870 * composing MSI from the hypervisor. GIC will eventually set the right 871 * affinity for the irq and the 'unmask' will retarget the interrupt to that 872 * cpu. 
873 */ 874 static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain, 875 struct irq_data *irqd, bool reserve) 876 { 877 int cpu = cpumask_first(cpu_present_mask); 878 879 irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); 880 881 return 0; 882 } 883 884 static const struct irq_domain_ops hv_pci_domain_ops = { 885 .alloc = hv_pci_vec_irq_domain_alloc, 886 .free = hv_pci_vec_irq_domain_free, 887 .activate = hv_pci_vec_irq_domain_activate, 888 }; 889 890 static int hv_pci_irqchip_init(void) 891 { 892 static struct hv_pci_chip_data *chip_data; 893 struct fwnode_handle *fn = NULL; 894 int ret = -ENOMEM; 895 896 chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL); 897 if (!chip_data) 898 return ret; 899 900 mutex_init(&chip_data->map_lock); 901 fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64"); 902 if (!fn) 903 goto free_chip; 904 905 /* 906 * IRQ domain once enabled, should not be removed since there is no 907 * way to ensure that all the corresponding devices are also gone and 908 * no interrupts will be generated. 909 */ 910 hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR, 911 fn, &hv_pci_domain_ops, 912 chip_data); 913 914 if (!hv_msi_gic_irq_domain) { 915 pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n"); 916 goto free_chip; 917 } 918 919 return 0; 920 921 free_chip: 922 kfree(chip_data); 923 if (fn) 924 irq_domain_free_fwnode(fn); 925 926 return ret; 927 } 928 929 static struct irq_domain *hv_pci_get_root_domain(void) 930 { 931 return hv_msi_gic_irq_domain; 932 } 933 934 /* 935 * SPIs are used for interrupts of PCI devices and SPIs is managed via GICD 936 * registers which Hyper-V already supports, so no hypercall needed. 937 */ 938 static void hv_arch_irq_unmask(struct irq_data *data) { } 939 #endif /* CONFIG_ARM64 */ 940 941 /** 942 * hv_pci_generic_compl() - Invoked for a completion packet 943 * @context: Set up by the sender of the packet. 
944 * @resp: The response packet 945 * @resp_packet_size: Size in bytes of the packet 946 * 947 * This function is used to trigger an event and report status 948 * for any message for which the completion packet contains a 949 * status and nothing else. 950 */ 951 static void hv_pci_generic_compl(void *context, struct pci_response *resp, 952 int resp_packet_size) 953 { 954 struct hv_pci_compl *comp_pkt = context; 955 956 comp_pkt->completion_status = resp->status; 957 complete(&comp_pkt->host_event); 958 } 959 960 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 961 u32 wslot); 962 963 static void get_pcichild(struct hv_pci_dev *hpdev) 964 { 965 refcount_inc(&hpdev->refs); 966 } 967 968 static void put_pcichild(struct hv_pci_dev *hpdev) 969 { 970 if (refcount_dec_and_test(&hpdev->refs)) 971 kfree(hpdev); 972 } 973 974 /* 975 * There is no good way to get notified from vmbus_onoffer_rescind(), 976 * so let's use polling here, since this is not a hot path. 977 */ 978 static int wait_for_response(struct hv_device *hdev, 979 struct completion *comp) 980 { 981 while (true) { 982 if (hdev->channel->rescind) { 983 dev_warn_once(&hdev->device, "The device is gone.\n"); 984 return -ENODEV; 985 } 986 987 if (wait_for_completion_timeout(comp, HZ / 10)) 988 break; 989 } 990 991 return 0; 992 } 993 994 /** 995 * devfn_to_wslot() - Convert from Linux PCI slot to Windows 996 * @devfn: The Linux representation of PCI slot 997 * 998 * Windows uses a slightly different representation of PCI slot. 
999 * 1000 * Return: The Windows representation 1001 */ 1002 static u32 devfn_to_wslot(int devfn) 1003 { 1004 union win_slot_encoding wslot; 1005 1006 wslot.slot = 0; 1007 wslot.bits.dev = PCI_SLOT(devfn); 1008 wslot.bits.func = PCI_FUNC(devfn); 1009 1010 return wslot.slot; 1011 } 1012 1013 /** 1014 * wslot_to_devfn() - Convert from Windows PCI slot to Linux 1015 * @wslot: The Windows representation of PCI slot 1016 * 1017 * Windows uses a slightly different representation of PCI slot. 1018 * 1019 * Return: The Linux representation 1020 */ 1021 static int wslot_to_devfn(u32 wslot) 1022 { 1023 union win_slot_encoding slot_no; 1024 1025 slot_no.slot = wslot; 1026 return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func); 1027 } 1028 1029 static void hv_pci_read_mmio(struct device *dev, phys_addr_t gpa, int size, u32 *val) 1030 { 1031 struct hv_mmio_read_input *in; 1032 struct hv_mmio_read_output *out; 1033 u64 ret; 1034 1035 /* 1036 * Must be called with interrupts disabled so it is safe 1037 * to use the per-cpu input argument page. Use it for 1038 * both input and output. 1039 */ 1040 in = *this_cpu_ptr(hyperv_pcpu_input_arg); 1041 out = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*in); 1042 in->gpa = gpa; 1043 in->size = size; 1044 1045 ret = hv_do_hypercall(HVCALL_MMIO_READ, in, out); 1046 if (hv_result_success(ret)) { 1047 switch (size) { 1048 case 1: 1049 *val = *(u8 *)(out->data); 1050 break; 1051 case 2: 1052 *val = *(u16 *)(out->data); 1053 break; 1054 default: 1055 *val = *(u32 *)(out->data); 1056 break; 1057 } 1058 } else 1059 dev_err(dev, "MMIO read hypercall error %llx addr %llx size %d\n", 1060 ret, gpa, size); 1061 } 1062 1063 static void hv_pci_write_mmio(struct device *dev, phys_addr_t gpa, int size, u32 val) 1064 { 1065 struct hv_mmio_write_input *in; 1066 u64 ret; 1067 1068 /* 1069 * Must be called with interrupts disabled so it is safe 1070 * to use the per-cpu input argument memory. 
1071 */ 1072 in = *this_cpu_ptr(hyperv_pcpu_input_arg); 1073 in->gpa = gpa; 1074 in->size = size; 1075 switch (size) { 1076 case 1: 1077 *(u8 *)(in->data) = val; 1078 break; 1079 case 2: 1080 *(u16 *)(in->data) = val; 1081 break; 1082 default: 1083 *(u32 *)(in->data) = val; 1084 break; 1085 } 1086 1087 ret = hv_do_hypercall(HVCALL_MMIO_WRITE, in, NULL); 1088 if (!hv_result_success(ret)) 1089 dev_err(dev, "MMIO write hypercall error %llx addr %llx size %d\n", 1090 ret, gpa, size); 1091 } 1092 1093 /* 1094 * PCI Configuration Space for these root PCI buses is implemented as a pair 1095 * of pages in memory-mapped I/O space. Writing to the first page chooses 1096 * the PCI function being written or read. Once the first page has been 1097 * written to, the following page maps in the entire configuration space of 1098 * the function. 1099 */ 1100 1101 /** 1102 * _hv_pcifront_read_config() - Internal PCI config read 1103 * @hpdev: The PCI driver's representation of the device 1104 * @where: Offset within config space 1105 * @size: Size of the transfer 1106 * @val: Pointer to the buffer receiving the data 1107 */ 1108 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, 1109 int size, u32 *val) 1110 { 1111 struct hv_pcibus_device *hbus = hpdev->hbus; 1112 struct device *dev = &hbus->hdev->device; 1113 int offset = where + CFG_PAGE_OFFSET; 1114 unsigned long flags; 1115 1116 /* 1117 * If the attempt is to read the IDs or the ROM BAR, simulate that. 
1118 */ 1119 if (where + size <= PCI_COMMAND) { 1120 memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size); 1121 } else if (where >= PCI_CLASS_REVISION && where + size <= 1122 PCI_CACHE_LINE_SIZE) { 1123 memcpy(val, ((u8 *)&hpdev->desc.rev) + where - 1124 PCI_CLASS_REVISION, size); 1125 } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <= 1126 PCI_ROM_ADDRESS) { 1127 memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where - 1128 PCI_SUBSYSTEM_VENDOR_ID, size); 1129 } else if (where >= PCI_ROM_ADDRESS && where + size <= 1130 PCI_CAPABILITY_LIST) { 1131 /* ROM BARs are unimplemented */ 1132 *val = 0; 1133 } else if ((where >= PCI_INTERRUPT_LINE && where + size <= PCI_INTERRUPT_PIN) || 1134 (where >= PCI_INTERRUPT_PIN && where + size <= PCI_MIN_GNT)) { 1135 /* 1136 * Interrupt Line and Interrupt PIN are hard-wired to zero 1137 * because this front-end only supports message-signaled 1138 * interrupts. 1139 */ 1140 *val = 0; 1141 } else if (where + size <= CFG_PAGE_SIZE) { 1142 1143 spin_lock_irqsave(&hbus->config_lock, flags); 1144 if (hbus->use_calls) { 1145 phys_addr_t addr = hbus->mem_config->start + offset; 1146 1147 hv_pci_write_mmio(dev, hbus->mem_config->start, 4, 1148 hpdev->desc.win_slot.slot); 1149 hv_pci_read_mmio(dev, addr, size, val); 1150 } else { 1151 void __iomem *addr = hbus->cfg_addr + offset; 1152 1153 /* Choose the function to be read. (See comment above) */ 1154 writel(hpdev->desc.win_slot.slot, hbus->cfg_addr); 1155 /* Make sure the function was chosen before reading. */ 1156 mb(); 1157 /* Read from that function's config space. */ 1158 switch (size) { 1159 case 1: 1160 *val = readb(addr); 1161 break; 1162 case 2: 1163 *val = readw(addr); 1164 break; 1165 default: 1166 *val = readl(addr); 1167 break; 1168 } 1169 /* 1170 * Make sure the read was done before we release the 1171 * spinlock allowing consecutive reads/writes. 
1172 */ 1173 mb(); 1174 } 1175 spin_unlock_irqrestore(&hbus->config_lock, flags); 1176 } else { 1177 dev_err(dev, "Attempt to read beyond a function's config space.\n"); 1178 } 1179 } 1180 1181 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev) 1182 { 1183 struct hv_pcibus_device *hbus = hpdev->hbus; 1184 struct device *dev = &hbus->hdev->device; 1185 u32 val; 1186 u16 ret; 1187 unsigned long flags; 1188 1189 spin_lock_irqsave(&hbus->config_lock, flags); 1190 1191 if (hbus->use_calls) { 1192 phys_addr_t addr = hbus->mem_config->start + 1193 CFG_PAGE_OFFSET + PCI_VENDOR_ID; 1194 1195 hv_pci_write_mmio(dev, hbus->mem_config->start, 4, 1196 hpdev->desc.win_slot.slot); 1197 hv_pci_read_mmio(dev, addr, 2, &val); 1198 ret = val; /* Truncates to 16 bits */ 1199 } else { 1200 void __iomem *addr = hbus->cfg_addr + CFG_PAGE_OFFSET + 1201 PCI_VENDOR_ID; 1202 /* Choose the function to be read. (See comment above) */ 1203 writel(hpdev->desc.win_slot.slot, hbus->cfg_addr); 1204 /* Make sure the function was chosen before we start reading. */ 1205 mb(); 1206 /* Read from that function's config space. */ 1207 ret = readw(addr); 1208 /* 1209 * mb() is not required here, because the 1210 * spin_unlock_irqrestore() is a barrier. 
1211 */ 1212 } 1213 1214 spin_unlock_irqrestore(&hbus->config_lock, flags); 1215 1216 return ret; 1217 } 1218 1219 /** 1220 * _hv_pcifront_write_config() - Internal PCI config write 1221 * @hpdev: The PCI driver's representation of the device 1222 * @where: Offset within config space 1223 * @size: Size of the transfer 1224 * @val: The data being transferred 1225 */ 1226 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, 1227 int size, u32 val) 1228 { 1229 struct hv_pcibus_device *hbus = hpdev->hbus; 1230 struct device *dev = &hbus->hdev->device; 1231 int offset = where + CFG_PAGE_OFFSET; 1232 unsigned long flags; 1233 1234 if (where >= PCI_SUBSYSTEM_VENDOR_ID && 1235 where + size <= PCI_CAPABILITY_LIST) { 1236 /* SSIDs and ROM BARs are read-only */ 1237 } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) { 1238 spin_lock_irqsave(&hbus->config_lock, flags); 1239 1240 if (hbus->use_calls) { 1241 phys_addr_t addr = hbus->mem_config->start + offset; 1242 1243 hv_pci_write_mmio(dev, hbus->mem_config->start, 4, 1244 hpdev->desc.win_slot.slot); 1245 hv_pci_write_mmio(dev, addr, size, val); 1246 } else { 1247 void __iomem *addr = hbus->cfg_addr + offset; 1248 1249 /* Choose the function to write. (See comment above) */ 1250 writel(hpdev->desc.win_slot.slot, hbus->cfg_addr); 1251 /* Make sure the function was chosen before writing. */ 1252 wmb(); 1253 /* Write to that function's config space. */ 1254 switch (size) { 1255 case 1: 1256 writeb(val, addr); 1257 break; 1258 case 2: 1259 writew(val, addr); 1260 break; 1261 default: 1262 writel(val, addr); 1263 break; 1264 } 1265 /* 1266 * Make sure the write was done before we release the 1267 * spinlock allowing consecutive reads/writes. 
1268 */ 1269 mb(); 1270 } 1271 spin_unlock_irqrestore(&hbus->config_lock, flags); 1272 } else { 1273 dev_err(dev, "Attempt to write beyond a function's config space.\n"); 1274 } 1275 } 1276 1277 /** 1278 * hv_pcifront_read_config() - Read configuration space 1279 * @bus: PCI Bus structure 1280 * @devfn: Device/function 1281 * @where: Offset from base 1282 * @size: Byte/word/dword 1283 * @val: Value to be read 1284 * 1285 * Return: PCIBIOS_SUCCESSFUL on success 1286 * PCIBIOS_DEVICE_NOT_FOUND on failure 1287 */ 1288 static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn, 1289 int where, int size, u32 *val) 1290 { 1291 struct hv_pcibus_device *hbus = 1292 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 1293 struct hv_pci_dev *hpdev; 1294 1295 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 1296 if (!hpdev) 1297 return PCIBIOS_DEVICE_NOT_FOUND; 1298 1299 _hv_pcifront_read_config(hpdev, where, size, val); 1300 1301 put_pcichild(hpdev); 1302 return PCIBIOS_SUCCESSFUL; 1303 } 1304 1305 /** 1306 * hv_pcifront_write_config() - Write configuration space 1307 * @bus: PCI Bus structure 1308 * @devfn: Device/function 1309 * @where: Offset from base 1310 * @size: Byte/word/dword 1311 * @val: Value to be written to device 1312 * 1313 * Return: PCIBIOS_SUCCESSFUL on success 1314 * PCIBIOS_DEVICE_NOT_FOUND on failure 1315 */ 1316 static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn, 1317 int where, int size, u32 val) 1318 { 1319 struct hv_pcibus_device *hbus = 1320 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 1321 struct hv_pci_dev *hpdev; 1322 1323 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 1324 if (!hpdev) 1325 return PCIBIOS_DEVICE_NOT_FOUND; 1326 1327 _hv_pcifront_write_config(hpdev, where, size, val); 1328 1329 put_pcichild(hpdev); 1330 return PCIBIOS_SUCCESSFUL; 1331 } 1332 1333 /* PCIe operations */ 1334 static struct pci_ops hv_pcifront_ops = { 1335 .read = 
hv_pcifront_read_config, 1336 .write = hv_pcifront_write_config, 1337 }; 1338 1339 /* 1340 * Paravirtual backchannel 1341 * 1342 * Hyper-V SR-IOV provides a backchannel mechanism in software for 1343 * communication between a VF driver and a PF driver. These 1344 * "configuration blocks" are similar in concept to PCI configuration space, 1345 * but instead of doing reads and writes in 32-bit chunks through a very slow 1346 * path, packets of up to 128 bytes can be sent or received asynchronously. 1347 * 1348 * Nearly every SR-IOV device contains just such a communications channel in 1349 * hardware, so using this one in software is usually optional. Using the 1350 * software channel, however, allows driver implementers to leverage software 1351 * tools that fuzz the communications channel looking for vulnerabilities. 1352 * 1353 * The usage model for these packets puts the responsibility for reading or 1354 * writing on the VF driver. The VF driver sends a read or a write packet, 1355 * indicating which "block" is being referred to by number. 1356 * 1357 * If the PF driver wishes to initiate communication, it can "invalidate" one or 1358 * more of the first 64 blocks. This invalidation is delivered via a callback 1359 * supplied by the VF driver by this driver. 1360 * 1361 * No protocol is implied, except that supplied by the PF and VF drivers. 1362 */ 1363 1364 struct hv_read_config_compl { 1365 struct hv_pci_compl comp_pkt; 1366 void *buf; 1367 unsigned int len; 1368 unsigned int bytes_returned; 1369 }; 1370 1371 /** 1372 * hv_pci_read_config_compl() - Invoked when a response packet 1373 * for a read config block operation arrives. 
1374 * @context: Identifies the read config operation 1375 * @resp: The response packet itself 1376 * @resp_packet_size: Size in bytes of the response packet 1377 */ 1378 static void hv_pci_read_config_compl(void *context, struct pci_response *resp, 1379 int resp_packet_size) 1380 { 1381 struct hv_read_config_compl *comp = context; 1382 struct pci_read_block_response *read_resp = 1383 (struct pci_read_block_response *)resp; 1384 unsigned int data_len, hdr_len; 1385 1386 hdr_len = offsetof(struct pci_read_block_response, bytes); 1387 if (resp_packet_size < hdr_len) { 1388 comp->comp_pkt.completion_status = -1; 1389 goto out; 1390 } 1391 1392 data_len = resp_packet_size - hdr_len; 1393 if (data_len > 0 && read_resp->status == 0) { 1394 comp->bytes_returned = min(comp->len, data_len); 1395 memcpy(comp->buf, read_resp->bytes, comp->bytes_returned); 1396 } else { 1397 comp->bytes_returned = 0; 1398 } 1399 1400 comp->comp_pkt.completion_status = read_resp->status; 1401 out: 1402 complete(&comp->comp_pkt.host_event); 1403 } 1404 1405 /** 1406 * hv_read_config_block() - Sends a read config block request to 1407 * the back-end driver running in the Hyper-V parent partition. 1408 * @pdev: The PCI driver's representation for this device. 1409 * @buf: Buffer into which the config block will be copied. 1410 * @len: Size in bytes of buf. 1411 * @block_id: Identifies the config block which has been requested. 1412 * @bytes_returned: Size which came back from the back-end driver. 
1413 * 1414 * Return: 0 on success, -errno on failure 1415 */ 1416 static int hv_read_config_block(struct pci_dev *pdev, void *buf, 1417 unsigned int len, unsigned int block_id, 1418 unsigned int *bytes_returned) 1419 { 1420 struct hv_pcibus_device *hbus = 1421 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1422 sysdata); 1423 struct { 1424 struct pci_packet pkt; 1425 char buf[sizeof(struct pci_read_block)]; 1426 } pkt; 1427 struct hv_read_config_compl comp_pkt; 1428 struct pci_read_block *read_blk; 1429 int ret; 1430 1431 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1432 return -EINVAL; 1433 1434 init_completion(&comp_pkt.comp_pkt.host_event); 1435 comp_pkt.buf = buf; 1436 comp_pkt.len = len; 1437 1438 memset(&pkt, 0, sizeof(pkt)); 1439 pkt.pkt.completion_func = hv_pci_read_config_compl; 1440 pkt.pkt.compl_ctxt = &comp_pkt; 1441 read_blk = (struct pci_read_block *)&pkt.pkt.message; 1442 read_blk->message_type.type = PCI_READ_BLOCK; 1443 read_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1444 read_blk->block_id = block_id; 1445 read_blk->bytes_requested = len; 1446 1447 ret = vmbus_sendpacket(hbus->hdev->channel, read_blk, 1448 sizeof(*read_blk), (unsigned long)&pkt.pkt, 1449 VM_PKT_DATA_INBAND, 1450 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1451 if (ret) 1452 return ret; 1453 1454 ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event); 1455 if (ret) 1456 return ret; 1457 1458 if (comp_pkt.comp_pkt.completion_status != 0 || 1459 comp_pkt.bytes_returned == 0) { 1460 dev_err(&hbus->hdev->device, 1461 "Read Config Block failed: 0x%x, bytes_returned=%d\n", 1462 comp_pkt.comp_pkt.completion_status, 1463 comp_pkt.bytes_returned); 1464 return -EIO; 1465 } 1466 1467 *bytes_returned = comp_pkt.bytes_returned; 1468 return 0; 1469 } 1470 1471 /** 1472 * hv_pci_write_config_compl() - Invoked when a response packet for a write 1473 * config block operation arrives. 
1474 * @context: Identifies the write config operation 1475 * @resp: The response packet itself 1476 * @resp_packet_size: Size in bytes of the response packet 1477 */ 1478 static void hv_pci_write_config_compl(void *context, struct pci_response *resp, 1479 int resp_packet_size) 1480 { 1481 struct hv_pci_compl *comp_pkt = context; 1482 1483 comp_pkt->completion_status = resp->status; 1484 complete(&comp_pkt->host_event); 1485 } 1486 1487 /** 1488 * hv_write_config_block() - Sends a write config block request to the 1489 * back-end driver running in the Hyper-V parent partition. 1490 * @pdev: The PCI driver's representation for this device. 1491 * @buf: Buffer from which the config block will be copied. 1492 * @len: Size in bytes of buf. 1493 * @block_id: Identifies the config block which is being written. 1494 * 1495 * Return: 0 on success, -errno on failure 1496 */ 1497 static int hv_write_config_block(struct pci_dev *pdev, void *buf, 1498 unsigned int len, unsigned int block_id) 1499 { 1500 struct hv_pcibus_device *hbus = 1501 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1502 sysdata); 1503 struct { 1504 struct pci_packet pkt; 1505 char buf[sizeof(struct pci_write_block)]; 1506 u32 reserved; 1507 } pkt; 1508 struct hv_pci_compl comp_pkt; 1509 struct pci_write_block *write_blk; 1510 u32 pkt_size; 1511 int ret; 1512 1513 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1514 return -EINVAL; 1515 1516 init_completion(&comp_pkt.host_event); 1517 1518 memset(&pkt, 0, sizeof(pkt)); 1519 pkt.pkt.completion_func = hv_pci_write_config_compl; 1520 pkt.pkt.compl_ctxt = &comp_pkt; 1521 write_blk = (struct pci_write_block *)&pkt.pkt.message; 1522 write_blk->message_type.type = PCI_WRITE_BLOCK; 1523 write_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1524 write_blk->block_id = block_id; 1525 write_blk->byte_count = len; 1526 memcpy(write_blk->bytes, buf, len); 1527 pkt_size = offsetof(struct pci_write_block, bytes) + len; 1528 /* 1529 * This quirk is required on 
some hosts shipped around 2018, because 1530 * these hosts don't check the pkt_size correctly (new hosts have been 1531 * fixed since early 2019). The quirk is also safe on very old hosts 1532 * and new hosts, because, on them, what really matters is the length 1533 * specified in write_blk->byte_count. 1534 */ 1535 pkt_size += sizeof(pkt.reserved); 1536 1537 ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size, 1538 (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND, 1539 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1540 if (ret) 1541 return ret; 1542 1543 ret = wait_for_response(hbus->hdev, &comp_pkt.host_event); 1544 if (ret) 1545 return ret; 1546 1547 if (comp_pkt.completion_status != 0) { 1548 dev_err(&hbus->hdev->device, 1549 "Write Config Block failed: 0x%x\n", 1550 comp_pkt.completion_status); 1551 return -EIO; 1552 } 1553 1554 return 0; 1555 } 1556 1557 /** 1558 * hv_register_block_invalidate() - Invoked when a config block invalidation 1559 * arrives from the back-end driver. 1560 * @pdev: The PCI driver's representation for this device. 1561 * @context: Identifies the device. 1562 * @block_invalidate: Identifies all of the blocks being invalidated. 
1563 * 1564 * Return: 0 on success, -errno on failure 1565 */ 1566 static int hv_register_block_invalidate(struct pci_dev *pdev, void *context, 1567 void (*block_invalidate)(void *context, 1568 u64 block_mask)) 1569 { 1570 struct hv_pcibus_device *hbus = 1571 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1572 sysdata); 1573 struct hv_pci_dev *hpdev; 1574 1575 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1576 if (!hpdev) 1577 return -ENODEV; 1578 1579 hpdev->block_invalidate = block_invalidate; 1580 hpdev->invalidate_context = context; 1581 1582 put_pcichild(hpdev); 1583 return 0; 1584 1585 } 1586 1587 /* Interrupt management hooks */ 1588 static void hv_int_desc_free(struct hv_pci_dev *hpdev, 1589 struct tran_int_desc *int_desc) 1590 { 1591 struct pci_delete_interrupt *int_pkt; 1592 struct { 1593 struct pci_packet pkt; 1594 u8 buffer[sizeof(struct pci_delete_interrupt)]; 1595 } ctxt; 1596 1597 if (!int_desc->vector_count) { 1598 kfree(int_desc); 1599 return; 1600 } 1601 memset(&ctxt, 0, sizeof(ctxt)); 1602 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; 1603 int_pkt->message_type.type = 1604 PCI_DELETE_INTERRUPT_MESSAGE; 1605 int_pkt->wslot.slot = hpdev->desc.win_slot.slot; 1606 int_pkt->int_desc = *int_desc; 1607 vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt), 1608 0, VM_PKT_DATA_INBAND, 0); 1609 kfree(int_desc); 1610 } 1611 1612 /** 1613 * hv_msi_free() - Free the MSI. 1614 * @domain: The interrupt domain pointer 1615 * @info: Extra MSI-related context 1616 * @irq: Identifies the IRQ. 1617 * 1618 * The Hyper-V parent partition and hypervisor are tracking the 1619 * messages that are in use, keeping the interrupt redirection 1620 * table up to date. This callback sends a message that frees 1621 * the IRT entry and related tracking nonsense. 
1622 */ 1623 static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info, 1624 unsigned int irq) 1625 { 1626 struct hv_pcibus_device *hbus; 1627 struct hv_pci_dev *hpdev; 1628 struct pci_dev *pdev; 1629 struct tran_int_desc *int_desc; 1630 struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq); 1631 struct msi_desc *msi = irq_data_get_msi_desc(irq_data); 1632 1633 pdev = msi_desc_to_pci_dev(msi); 1634 hbus = info->data; 1635 int_desc = irq_data_get_irq_chip_data(irq_data); 1636 if (!int_desc) 1637 return; 1638 1639 irq_data->chip_data = NULL; 1640 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1641 if (!hpdev) { 1642 kfree(int_desc); 1643 return; 1644 } 1645 1646 hv_int_desc_free(hpdev, int_desc); 1647 put_pcichild(hpdev); 1648 } 1649 1650 static void hv_irq_mask(struct irq_data *data) 1651 { 1652 pci_msi_mask_irq(data); 1653 if (data->parent_data->chip->irq_mask) 1654 irq_chip_mask_parent(data); 1655 } 1656 1657 static void hv_irq_unmask(struct irq_data *data) 1658 { 1659 hv_arch_irq_unmask(data); 1660 1661 if (data->parent_data->chip->irq_unmask) 1662 irq_chip_unmask_parent(data); 1663 pci_msi_unmask_irq(data); 1664 } 1665 1666 struct compose_comp_ctxt { 1667 struct hv_pci_compl comp_pkt; 1668 struct tran_int_desc int_desc; 1669 }; 1670 1671 static void hv_pci_compose_compl(void *context, struct pci_response *resp, 1672 int resp_packet_size) 1673 { 1674 struct compose_comp_ctxt *comp_pkt = context; 1675 struct pci_create_int_response *int_resp = 1676 (struct pci_create_int_response *)resp; 1677 1678 if (resp_packet_size < sizeof(*int_resp)) { 1679 comp_pkt->comp_pkt.completion_status = -1; 1680 goto out; 1681 } 1682 comp_pkt->comp_pkt.completion_status = resp->status; 1683 comp_pkt->int_desc = int_resp->int_desc; 1684 out: 1685 complete(&comp_pkt->comp_pkt.host_event); 1686 } 1687 1688 static u32 hv_compose_msi_req_v1( 1689 struct pci_create_interrupt *int_pkt, 1690 u32 slot, u8 vector, u16 vector_count) 1691 { 1692 
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; 1693 int_pkt->wslot.slot = slot; 1694 int_pkt->int_desc.vector = vector; 1695 int_pkt->int_desc.vector_count = vector_count; 1696 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1697 1698 /* 1699 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in 1700 * hv_irq_unmask(). 1701 */ 1702 int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL; 1703 1704 return sizeof(*int_pkt); 1705 } 1706 1707 /* 1708 * The vCPU selected by hv_compose_multi_msi_req_get_cpu() and 1709 * hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be 1710 * interrupted is specified later in hv_irq_unmask() and communicated to Hyper-V 1711 * via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of dummy vCPU is 1712 * not irrelevant because Hyper-V chooses the physical CPU to handle the 1713 * interrupts based on the vCPU specified in message sent to the vPCI VSP in 1714 * hv_compose_msi_msg(). Hyper-V's choice of pCPU is not visible to the guest, 1715 * but assigning too many vPCI device interrupts to the same pCPU can cause a 1716 * performance bottleneck. So we spread out the dummy vCPUs to influence Hyper-V 1717 * to spread out the pCPUs that it selects. 1718 * 1719 * For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu() 1720 * to always return the same dummy vCPU, because a second call to 1721 * hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a 1722 * new pCPU for the interrupt. But for the multi-MSI case, the second call to 1723 * hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the 1724 * original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that 1725 * the pCPUs are spread out. All interrupts for a multi-MSI device end up using 1726 * the same pCPU, even though the vCPUs will be spread out by later calls 1727 * to hv_irq_unmask(), but that is the best we can do now. 
1728 * 1729 * With Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does *not* 1730 * cause Hyper-V to reselect the pCPU based on the specified vCPU. Such an 1731 * enhancement is planned for a future version. With that enhancement, the 1732 * dummy vCPU selection won't matter, and interrupts for the same multi-MSI 1733 * device will be spread across multiple pCPUs. 1734 */ 1735 1736 /* 1737 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten 1738 * by subsequent retarget in hv_irq_unmask(). 1739 */ 1740 static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity) 1741 { 1742 return cpumask_first_and(affinity, cpu_online_mask); 1743 } 1744 1745 /* 1746 * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0. 1747 */ 1748 static int hv_compose_multi_msi_req_get_cpu(void) 1749 { 1750 static DEFINE_SPINLOCK(multi_msi_cpu_lock); 1751 1752 /* -1 means starting with CPU 0 */ 1753 static int cpu_next = -1; 1754 1755 unsigned long flags; 1756 int cpu; 1757 1758 spin_lock_irqsave(&multi_msi_cpu_lock, flags); 1759 1760 cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids, 1761 false); 1762 cpu = cpu_next; 1763 1764 spin_unlock_irqrestore(&multi_msi_cpu_lock, flags); 1765 1766 return cpu; 1767 } 1768 1769 static u32 hv_compose_msi_req_v2( 1770 struct pci_create_interrupt2 *int_pkt, int cpu, 1771 u32 slot, u8 vector, u16 vector_count) 1772 { 1773 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; 1774 int_pkt->wslot.slot = slot; 1775 int_pkt->int_desc.vector = vector; 1776 int_pkt->int_desc.vector_count = vector_count; 1777 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1778 int_pkt->int_desc.processor_array[0] = 1779 hv_cpu_number_to_vp_number(cpu); 1780 int_pkt->int_desc.processor_count = 1; 1781 1782 return sizeof(*int_pkt); 1783 } 1784 1785 static u32 hv_compose_msi_req_v3( 1786 struct pci_create_interrupt3 *int_pkt, int cpu, 1787 u32 slot, u32 vector, u16 vector_count) 1788 { 1789 
int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3; 1790 int_pkt->wslot.slot = slot; 1791 int_pkt->int_desc.vector = vector; 1792 int_pkt->int_desc.reserved = 0; 1793 int_pkt->int_desc.vector_count = vector_count; 1794 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1795 int_pkt->int_desc.processor_array[0] = 1796 hv_cpu_number_to_vp_number(cpu); 1797 int_pkt->int_desc.processor_count = 1; 1798 1799 return sizeof(*int_pkt); 1800 } 1801 1802 /** 1803 * hv_compose_msi_msg() - Supplies a valid MSI address/data 1804 * @data: Everything about this MSI 1805 * @msg: Buffer that is filled in by this function 1806 * 1807 * This function unpacks the IRQ looking for target CPU set, IDT 1808 * vector and mode and sends a message to the parent partition 1809 * asking for a mapping for that tuple in this partition. The 1810 * response supplies a data value and address to which that data 1811 * should be written to trigger that interrupt. 1812 */ 1813 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) 1814 { 1815 struct hv_pcibus_device *hbus; 1816 struct vmbus_channel *channel; 1817 struct hv_pci_dev *hpdev; 1818 struct pci_bus *pbus; 1819 struct pci_dev *pdev; 1820 const struct cpumask *dest; 1821 struct compose_comp_ctxt comp; 1822 struct tran_int_desc *int_desc; 1823 struct msi_desc *msi_desc; 1824 /* 1825 * vector_count should be u16: see hv_msi_desc, hv_msi_desc2 1826 * and hv_msi_desc3. vector must be u32: see hv_msi_desc3. 
1827 */ 1828 u16 vector_count; 1829 u32 vector; 1830 struct { 1831 struct pci_packet pci_pkt; 1832 union { 1833 struct pci_create_interrupt v1; 1834 struct pci_create_interrupt2 v2; 1835 struct pci_create_interrupt3 v3; 1836 } int_pkts; 1837 } __packed ctxt; 1838 bool multi_msi; 1839 u64 trans_id; 1840 u32 size; 1841 int ret; 1842 int cpu; 1843 1844 msi_desc = irq_data_get_msi_desc(data); 1845 multi_msi = !msi_desc->pci.msi_attrib.is_msix && 1846 msi_desc->nvec_used > 1; 1847 1848 /* Reuse the previous allocation */ 1849 if (data->chip_data && multi_msi) { 1850 int_desc = data->chip_data; 1851 msg->address_hi = int_desc->address >> 32; 1852 msg->address_lo = int_desc->address & 0xffffffff; 1853 msg->data = int_desc->data; 1854 return; 1855 } 1856 1857 pdev = msi_desc_to_pci_dev(msi_desc); 1858 dest = irq_data_get_effective_affinity_mask(data); 1859 pbus = pdev->bus; 1860 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 1861 channel = hbus->hdev->channel; 1862 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1863 if (!hpdev) 1864 goto return_null_message; 1865 1866 /* Free any previous message that might have already been composed. */ 1867 if (data->chip_data && !multi_msi) { 1868 int_desc = data->chip_data; 1869 data->chip_data = NULL; 1870 hv_int_desc_free(hpdev, int_desc); 1871 } 1872 1873 int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC); 1874 if (!int_desc) 1875 goto drop_reference; 1876 1877 if (multi_msi) { 1878 /* 1879 * If this is not the first MSI of Multi MSI, we already have 1880 * a mapping. Can exit early. 
1881 */ 1882 if (msi_desc->irq != data->irq) { 1883 data->chip_data = int_desc; 1884 int_desc->address = msi_desc->msg.address_lo | 1885 (u64)msi_desc->msg.address_hi << 32; 1886 int_desc->data = msi_desc->msg.data + 1887 (data->irq - msi_desc->irq); 1888 msg->address_hi = msi_desc->msg.address_hi; 1889 msg->address_lo = msi_desc->msg.address_lo; 1890 msg->data = int_desc->data; 1891 put_pcichild(hpdev); 1892 return; 1893 } 1894 /* 1895 * The vector we select here is a dummy value. The correct 1896 * value gets sent to the hypervisor in unmask(). This needs 1897 * to be aligned with the count, and also not zero. Multi-msi 1898 * is powers of 2 up to 32, so 32 will always work here. 1899 */ 1900 vector = 32; 1901 vector_count = msi_desc->nvec_used; 1902 cpu = hv_compose_multi_msi_req_get_cpu(); 1903 } else { 1904 vector = hv_msi_get_int_vector(data); 1905 vector_count = 1; 1906 cpu = hv_compose_msi_req_get_cpu(dest); 1907 } 1908 1909 /* 1910 * hv_compose_msi_req_v1 and v2 are for x86 only, meaning 'vector' 1911 * can't exceed u8. Cast 'vector' down to u8 for v1/v2 explicitly 1912 * for better readability. 
1913 */ 1914 memset(&ctxt, 0, sizeof(ctxt)); 1915 init_completion(&comp.comp_pkt.host_event); 1916 ctxt.pci_pkt.completion_func = hv_pci_compose_compl; 1917 ctxt.pci_pkt.compl_ctxt = ∁ 1918 1919 switch (hbus->protocol_version) { 1920 case PCI_PROTOCOL_VERSION_1_1: 1921 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1, 1922 hpdev->desc.win_slot.slot, 1923 (u8)vector, 1924 vector_count); 1925 break; 1926 1927 case PCI_PROTOCOL_VERSION_1_2: 1928 case PCI_PROTOCOL_VERSION_1_3: 1929 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2, 1930 cpu, 1931 hpdev->desc.win_slot.slot, 1932 (u8)vector, 1933 vector_count); 1934 break; 1935 1936 case PCI_PROTOCOL_VERSION_1_4: 1937 size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3, 1938 cpu, 1939 hpdev->desc.win_slot.slot, 1940 vector, 1941 vector_count); 1942 break; 1943 1944 default: 1945 /* As we only negotiate protocol versions known to this driver, 1946 * this path should never hit. However, this is it not a hot 1947 * path so we print a message to aid future updates. 1948 */ 1949 dev_err(&hbus->hdev->device, 1950 "Unexpected vPCI protocol, update driver."); 1951 goto free_int_desc; 1952 } 1953 1954 ret = vmbus_sendpacket_getid(hpdev->hbus->hdev->channel, &ctxt.int_pkts, 1955 size, (unsigned long)&ctxt.pci_pkt, 1956 &trans_id, VM_PKT_DATA_INBAND, 1957 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1958 if (ret) { 1959 dev_err(&hbus->hdev->device, 1960 "Sending request for interrupt failed: 0x%x", 1961 comp.comp_pkt.completion_status); 1962 goto free_int_desc; 1963 } 1964 1965 /* 1966 * Prevents hv_pci_onchannelcallback() from running concurrently 1967 * in the tasklet. 1968 */ 1969 tasklet_disable_in_atomic(&channel->callback_event); 1970 1971 /* 1972 * Since this function is called with IRQ locks held, can't 1973 * do normal wait for completion; instead poll. 1974 */ 1975 while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { 1976 unsigned long flags; 1977 1978 /* 0xFFFF means an invalid PCI VENDOR ID. 
*/ 1979 if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { 1980 dev_err_once(&hbus->hdev->device, 1981 "the device has gone\n"); 1982 goto enable_tasklet; 1983 } 1984 1985 /* 1986 * Make sure that the ring buffer data structure doesn't get 1987 * freed while we dereference the ring buffer pointer. Test 1988 * for the channel's onchannel_callback being NULL within a 1989 * sched_lock critical section. See also the inline comments 1990 * in vmbus_reset_channel_cb(). 1991 */ 1992 spin_lock_irqsave(&channel->sched_lock, flags); 1993 if (unlikely(channel->onchannel_callback == NULL)) { 1994 spin_unlock_irqrestore(&channel->sched_lock, flags); 1995 goto enable_tasklet; 1996 } 1997 hv_pci_onchannelcallback(hbus); 1998 spin_unlock_irqrestore(&channel->sched_lock, flags); 1999 2000 udelay(100); 2001 } 2002 2003 tasklet_enable(&channel->callback_event); 2004 2005 if (comp.comp_pkt.completion_status < 0) { 2006 dev_err(&hbus->hdev->device, 2007 "Request for interrupt failed: 0x%x", 2008 comp.comp_pkt.completion_status); 2009 goto free_int_desc; 2010 } 2011 2012 /* 2013 * Record the assignment so that this can be unwound later. Using 2014 * irq_set_chip_data() here would be appropriate, but the lock it takes 2015 * is already held. 2016 */ 2017 *int_desc = comp.int_desc; 2018 data->chip_data = int_desc; 2019 2020 /* Pass up the result. */ 2021 msg->address_hi = comp.int_desc.address >> 32; 2022 msg->address_lo = comp.int_desc.address & 0xffffffff; 2023 msg->data = comp.int_desc.data; 2024 2025 put_pcichild(hpdev); 2026 return; 2027 2028 enable_tasklet: 2029 tasklet_enable(&channel->callback_event); 2030 /* 2031 * The completion packet on the stack becomes invalid after 'return'; 2032 * remove the ID from the VMbus requestor if the identifier is still 2033 * mapped to/associated with the packet. (The identifier could have 2034 * been 're-used', i.e., already removed and (re-)mapped.) 2035 * 2036 * Cf. hv_pci_onchannelcallback(). 
2037 */ 2038 vmbus_request_addr_match(channel, trans_id, (unsigned long)&ctxt.pci_pkt); 2039 free_int_desc: 2040 kfree(int_desc); 2041 drop_reference: 2042 put_pcichild(hpdev); 2043 return_null_message: 2044 msg->address_hi = 0; 2045 msg->address_lo = 0; 2046 msg->data = 0; 2047 } 2048 2049 /* HW Interrupt Chip Descriptor */ 2050 static struct irq_chip hv_msi_irq_chip = { 2051 .name = "Hyper-V PCIe MSI", 2052 .irq_compose_msi_msg = hv_compose_msi_msg, 2053 .irq_set_affinity = irq_chip_set_affinity_parent, 2054 #ifdef CONFIG_X86 2055 .irq_ack = irq_chip_ack_parent, 2056 #elif defined(CONFIG_ARM64) 2057 .irq_eoi = irq_chip_eoi_parent, 2058 #endif 2059 .irq_mask = hv_irq_mask, 2060 .irq_unmask = hv_irq_unmask, 2061 }; 2062 2063 static struct msi_domain_ops hv_msi_ops = { 2064 .msi_prepare = hv_msi_prepare, 2065 .msi_free = hv_msi_free, 2066 }; 2067 2068 /** 2069 * hv_pcie_init_irq_domain() - Initialize IRQ domain 2070 * @hbus: The root PCI bus 2071 * 2072 * This function creates an IRQ domain which will be used for 2073 * interrupts from devices that have been passed through. These 2074 * devices only support MSI and MSI-X, not line-based interrupts 2075 * or simulations of line-based interrupts through PCIe's 2076 * fabric-layer messages. Because interrupts are remapped, we 2077 * can support multi-message MSI here. 
2078 * 2079 * Return: '0' on success and error value on failure 2080 */ 2081 static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) 2082 { 2083 hbus->msi_info.chip = &hv_msi_irq_chip; 2084 hbus->msi_info.ops = &hv_msi_ops; 2085 hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS | 2086 MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI | 2087 MSI_FLAG_PCI_MSIX); 2088 hbus->msi_info.handler = FLOW_HANDLER; 2089 hbus->msi_info.handler_name = FLOW_NAME; 2090 hbus->msi_info.data = hbus; 2091 hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode, 2092 &hbus->msi_info, 2093 hv_pci_get_root_domain()); 2094 if (!hbus->irq_domain) { 2095 dev_err(&hbus->hdev->device, 2096 "Failed to build an MSI IRQ domain\n"); 2097 return -ENODEV; 2098 } 2099 2100 dev_set_msi_domain(&hbus->bridge->dev, hbus->irq_domain); 2101 2102 return 0; 2103 } 2104 2105 /** 2106 * get_bar_size() - Get the address space consumed by a BAR 2107 * @bar_val: Value that a BAR returned after -1 was written 2108 * to it. 2109 * 2110 * This function returns the size of the BAR, rounded up to 1 2111 * page. It has to be rounded up because the hypervisor's page 2112 * table entry that maps the BAR into the VM can't specify an 2113 * offset within a page. The invariant is that the hypervisor 2114 * must place any BARs of smaller than page length at the 2115 * beginning of a page. 2116 * 2117 * Return: Size in bytes of the consumed MMIO space. 
2118 */ 2119 static u64 get_bar_size(u64 bar_val) 2120 { 2121 return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)), 2122 PAGE_SIZE); 2123 } 2124 2125 /** 2126 * survey_child_resources() - Total all MMIO requirements 2127 * @hbus: Root PCI bus, as understood by this driver 2128 */ 2129 static void survey_child_resources(struct hv_pcibus_device *hbus) 2130 { 2131 struct hv_pci_dev *hpdev; 2132 resource_size_t bar_size = 0; 2133 unsigned long flags; 2134 struct completion *event; 2135 u64 bar_val; 2136 int i; 2137 2138 /* If nobody is waiting on the answer, don't compute it. */ 2139 event = xchg(&hbus->survey_event, NULL); 2140 if (!event) 2141 return; 2142 2143 /* If the answer has already been computed, go with it. */ 2144 if (hbus->low_mmio_space || hbus->high_mmio_space) { 2145 complete(event); 2146 return; 2147 } 2148 2149 spin_lock_irqsave(&hbus->device_list_lock, flags); 2150 2151 /* 2152 * Due to an interesting quirk of the PCI spec, all memory regions 2153 * for a child device are a power of 2 in size and aligned in memory, 2154 * so it's sufficient to just add them up without tracking alignment. 2155 */ 2156 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2157 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2158 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) 2159 dev_err(&hbus->hdev->device, 2160 "There's an I/O BAR in this list!\n"); 2161 2162 if (hpdev->probed_bar[i] != 0) { 2163 /* 2164 * A probed BAR has all the upper bits set that 2165 * can be changed. 
2166 */ 2167 2168 bar_val = hpdev->probed_bar[i]; 2169 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 2170 bar_val |= 2171 ((u64)hpdev->probed_bar[++i] << 32); 2172 else 2173 bar_val |= 0xffffffff00000000ULL; 2174 2175 bar_size = get_bar_size(bar_val); 2176 2177 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 2178 hbus->high_mmio_space += bar_size; 2179 else 2180 hbus->low_mmio_space += bar_size; 2181 } 2182 } 2183 } 2184 2185 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2186 complete(event); 2187 } 2188 2189 /** 2190 * prepopulate_bars() - Fill in BARs with defaults 2191 * @hbus: Root PCI bus, as understood by this driver 2192 * 2193 * The core PCI driver code seems much, much happier if the BARs 2194 * for a device have values upon first scan. So fill them in. 2195 * The algorithm below works down from large sizes to small, 2196 * attempting to pack the assignments optimally. The assumption, 2197 * enforced in other parts of the code, is that the beginning of 2198 * the memory-mapped I/O space will be aligned on the largest 2199 * BAR size. 2200 */ 2201 static void prepopulate_bars(struct hv_pcibus_device *hbus) 2202 { 2203 resource_size_t high_size = 0; 2204 resource_size_t low_size = 0; 2205 resource_size_t high_base = 0; 2206 resource_size_t low_base = 0; 2207 resource_size_t bar_size; 2208 struct hv_pci_dev *hpdev; 2209 unsigned long flags; 2210 u64 bar_val; 2211 u32 command; 2212 bool high; 2213 int i; 2214 2215 if (hbus->low_mmio_space) { 2216 low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 2217 low_base = hbus->low_mmio_res->start; 2218 } 2219 2220 if (hbus->high_mmio_space) { 2221 high_size = 1ULL << 2222 (63 - __builtin_clzll(hbus->high_mmio_space)); 2223 high_base = hbus->high_mmio_res->start; 2224 } 2225 2226 spin_lock_irqsave(&hbus->device_list_lock, flags); 2227 2228 /* 2229 * Clear the memory enable bit, in case it's already set. 
This occurs 2230 * in the suspend path of hibernation, where the device is suspended, 2231 * resumed and suspended again: see hibernation_snapshot() and 2232 * hibernation_platform_enter(). 2233 * 2234 * If the memory enable bit is already set, Hyper-V silently ignores 2235 * the below BAR updates, and the related PCI device driver can not 2236 * work, because reading from the device register(s) always returns 2237 * 0xFFFFFFFF (PCI_ERROR_RESPONSE). 2238 */ 2239 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2240 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command); 2241 command &= ~PCI_COMMAND_MEMORY; 2242 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command); 2243 } 2244 2245 /* Pick addresses for the BARs. */ 2246 do { 2247 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2248 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2249 bar_val = hpdev->probed_bar[i]; 2250 if (bar_val == 0) 2251 continue; 2252 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64; 2253 if (high) { 2254 bar_val |= 2255 ((u64)hpdev->probed_bar[i + 1] 2256 << 32); 2257 } else { 2258 bar_val |= 0xffffffffULL << 32; 2259 } 2260 bar_size = get_bar_size(bar_val); 2261 if (high) { 2262 if (high_size != bar_size) { 2263 i++; 2264 continue; 2265 } 2266 _hv_pcifront_write_config(hpdev, 2267 PCI_BASE_ADDRESS_0 + (4 * i), 2268 4, 2269 (u32)(high_base & 0xffffff00)); 2270 i++; 2271 _hv_pcifront_write_config(hpdev, 2272 PCI_BASE_ADDRESS_0 + (4 * i), 2273 4, (u32)(high_base >> 32)); 2274 high_base += bar_size; 2275 } else { 2276 if (low_size != bar_size) 2277 continue; 2278 _hv_pcifront_write_config(hpdev, 2279 PCI_BASE_ADDRESS_0 + (4 * i), 2280 4, 2281 (u32)(low_base & 0xffffff00)); 2282 low_base += bar_size; 2283 } 2284 } 2285 if (high_size <= 1 && low_size <= 1) { 2286 /* 2287 * No need to set the PCI_COMMAND_MEMORY bit as 2288 * the core PCI driver doesn't require the bit 2289 * to be pre-set. 
Actually here we intentionally 2290 * keep the bit off so that the PCI BAR probing 2291 * in the core PCI driver doesn't cause Hyper-V 2292 * to unnecessarily unmap/map the virtual BARs 2293 * from/to the physical BARs multiple times. 2294 * This reduces the VM boot time significantly 2295 * if the BAR sizes are huge. 2296 */ 2297 break; 2298 } 2299 } 2300 2301 high_size >>= 1; 2302 low_size >>= 1; 2303 } while (high_size || low_size); 2304 2305 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2306 } 2307 2308 /* 2309 * Assign entries in sysfs pci slot directory. 2310 * 2311 * Note that this function does not need to lock the children list 2312 * because it is called from pci_devices_present_work which 2313 * is serialized with hv_eject_device_work because they are on the 2314 * same ordered workqueue. Therefore hbus->children list will not change 2315 * even when pci_create_slot sleeps. 2316 */ 2317 static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) 2318 { 2319 struct hv_pci_dev *hpdev; 2320 char name[SLOT_NAME_SIZE]; 2321 int slot_nr; 2322 2323 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2324 if (hpdev->pci_slot) 2325 continue; 2326 2327 slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot)); 2328 snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser); 2329 hpdev->pci_slot = pci_create_slot(hbus->bridge->bus, slot_nr, 2330 name, NULL); 2331 if (IS_ERR(hpdev->pci_slot)) { 2332 pr_warn("pci_create slot %s failed\n", name); 2333 hpdev->pci_slot = NULL; 2334 } 2335 } 2336 } 2337 2338 /* 2339 * Remove entries in sysfs pci slot directory. 
2340 */ 2341 static void hv_pci_remove_slots(struct hv_pcibus_device *hbus) 2342 { 2343 struct hv_pci_dev *hpdev; 2344 2345 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2346 if (!hpdev->pci_slot) 2347 continue; 2348 pci_destroy_slot(hpdev->pci_slot); 2349 hpdev->pci_slot = NULL; 2350 } 2351 } 2352 2353 /* 2354 * Set NUMA node for the devices on the bus 2355 */ 2356 static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus) 2357 { 2358 struct pci_dev *dev; 2359 struct pci_bus *bus = hbus->bridge->bus; 2360 struct hv_pci_dev *hv_dev; 2361 2362 list_for_each_entry(dev, &bus->devices, bus_list) { 2363 hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn)); 2364 if (!hv_dev) 2365 continue; 2366 2367 if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY && 2368 hv_dev->desc.virtual_numa_node < num_possible_nodes()) 2369 /* 2370 * The kernel may boot with some NUMA nodes offline 2371 * (e.g. in a KDUMP kernel) or with NUMA disabled via 2372 * "numa=off". In those cases, adjust the host provided 2373 * NUMA node to a valid NUMA node used by the kernel. 
2374 */ 2375 set_dev_node(&dev->dev, 2376 numa_map_to_online_node( 2377 hv_dev->desc.virtual_numa_node)); 2378 2379 put_pcichild(hv_dev); 2380 } 2381 } 2382 2383 /** 2384 * create_root_hv_pci_bus() - Expose a new root PCI bus 2385 * @hbus: Root PCI bus, as understood by this driver 2386 * 2387 * Return: 0 on success, -errno on failure 2388 */ 2389 static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) 2390 { 2391 int error; 2392 struct pci_host_bridge *bridge = hbus->bridge; 2393 2394 bridge->dev.parent = &hbus->hdev->device; 2395 bridge->sysdata = &hbus->sysdata; 2396 bridge->ops = &hv_pcifront_ops; 2397 2398 error = pci_scan_root_bus_bridge(bridge); 2399 if (error) 2400 return error; 2401 2402 pci_lock_rescan_remove(); 2403 hv_pci_assign_numa_node(hbus); 2404 pci_bus_assign_resources(bridge->bus); 2405 hv_pci_assign_slots(hbus); 2406 pci_bus_add_devices(bridge->bus); 2407 pci_unlock_rescan_remove(); 2408 hbus->state = hv_pcibus_installed; 2409 return 0; 2410 } 2411 2412 struct q_res_req_compl { 2413 struct completion host_event; 2414 struct hv_pci_dev *hpdev; 2415 }; 2416 2417 /** 2418 * q_resource_requirements() - Query Resource Requirements 2419 * @context: The completion context. 2420 * @resp: The response that came from the host. 2421 * @resp_packet_size: The size in bytes of resp. 2422 * 2423 * This function is invoked on completion of a Query Resource 2424 * Requirements packet. 2425 */ 2426 static void q_resource_requirements(void *context, struct pci_response *resp, 2427 int resp_packet_size) 2428 { 2429 struct q_res_req_compl *completion = context; 2430 struct pci_q_res_req_response *q_res_req = 2431 (struct pci_q_res_req_response *)resp; 2432 s32 status; 2433 int i; 2434 2435 status = (resp_packet_size < sizeof(*q_res_req)) ? 
-1 : resp->status; 2436 if (status < 0) { 2437 dev_err(&completion->hpdev->hbus->hdev->device, 2438 "query resource requirements failed: %x\n", 2439 status); 2440 } else { 2441 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2442 completion->hpdev->probed_bar[i] = 2443 q_res_req->probed_bar[i]; 2444 } 2445 } 2446 2447 complete(&completion->host_event); 2448 } 2449 2450 /** 2451 * new_pcichild_device() - Create a new child device 2452 * @hbus: The internal struct tracking this root PCI bus. 2453 * @desc: The information supplied so far from the host 2454 * about the device. 2455 * 2456 * This function creates the tracking structure for a new child 2457 * device and kicks off the process of figuring out what it is. 2458 * 2459 * Return: Pointer to the new tracking struct 2460 */ 2461 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus, 2462 struct hv_pcidev_description *desc) 2463 { 2464 struct hv_pci_dev *hpdev; 2465 struct pci_child_message *res_req; 2466 struct q_res_req_compl comp_pkt; 2467 struct { 2468 struct pci_packet init_packet; 2469 u8 buffer[sizeof(struct pci_child_message)]; 2470 } pkt; 2471 unsigned long flags; 2472 int ret; 2473 2474 hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL); 2475 if (!hpdev) 2476 return NULL; 2477 2478 hpdev->hbus = hbus; 2479 2480 memset(&pkt, 0, sizeof(pkt)); 2481 init_completion(&comp_pkt.host_event); 2482 comp_pkt.hpdev = hpdev; 2483 pkt.init_packet.compl_ctxt = &comp_pkt; 2484 pkt.init_packet.completion_func = q_resource_requirements; 2485 res_req = (struct pci_child_message *)&pkt.init_packet.message; 2486 res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS; 2487 res_req->wslot.slot = desc->win_slot.slot; 2488 2489 ret = vmbus_sendpacket(hbus->hdev->channel, res_req, 2490 sizeof(struct pci_child_message), 2491 (unsigned long)&pkt.init_packet, 2492 VM_PKT_DATA_INBAND, 2493 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2494 if (ret) 2495 goto error; 2496 2497 if (wait_for_response(hbus->hdev, 
&comp_pkt.host_event)) 2498 goto error; 2499 2500 hpdev->desc = *desc; 2501 refcount_set(&hpdev->refs, 1); 2502 get_pcichild(hpdev); 2503 spin_lock_irqsave(&hbus->device_list_lock, flags); 2504 2505 list_add_tail(&hpdev->list_entry, &hbus->children); 2506 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2507 return hpdev; 2508 2509 error: 2510 kfree(hpdev); 2511 return NULL; 2512 } 2513 2514 /** 2515 * get_pcichild_wslot() - Find device from slot 2516 * @hbus: Root PCI bus, as understood by this driver 2517 * @wslot: Location on the bus 2518 * 2519 * This function looks up a PCI device and returns the internal 2520 * representation of it. It acquires a reference on it, so that 2521 * the device won't be deleted while somebody is using it. The 2522 * caller is responsible for calling put_pcichild() to release 2523 * this reference. 2524 * 2525 * Return: Internal representation of a PCI device 2526 */ 2527 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 2528 u32 wslot) 2529 { 2530 unsigned long flags; 2531 struct hv_pci_dev *iter, *hpdev = NULL; 2532 2533 spin_lock_irqsave(&hbus->device_list_lock, flags); 2534 list_for_each_entry(iter, &hbus->children, list_entry) { 2535 if (iter->desc.win_slot.slot == wslot) { 2536 hpdev = iter; 2537 get_pcichild(hpdev); 2538 break; 2539 } 2540 } 2541 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2542 2543 return hpdev; 2544 } 2545 2546 /** 2547 * pci_devices_present_work() - Handle new list of child devices 2548 * @work: Work struct embedded in struct hv_dr_work 2549 * 2550 * "Bus Relations" is the Windows term for "children of this 2551 * bus." The terminology is preserved here for people trying to 2552 * debug the interaction between Hyper-V and Linux. This 2553 * function is called when the parent partition reports a list 2554 * of functions that should be observed under this PCI Express 2555 * port (bus). 
 *
 * This function updates the list, and must tolerate being
 * called multiple times with the same information.  The typical
 * number of child devices is one, with very atypical cases
 * involving three or four, so the algorithms used here can be
 * simple and inefficient.
 *
 * It must also treat the omission of a previously observed device as
 * notification that the device no longer exists.
 *
 * Note that this function is serialized with hv_eject_device_work(),
 * because both are pushed to the ordered workqueue hbus->wq.
 */
static void pci_devices_present_work(struct work_struct *work)
{
	u32 child_no;
	bool found;
	struct hv_pcidev_description *new_desc;
	struct hv_pci_dev *hpdev;
	struct hv_pcibus_device *hbus;
	struct list_head removed;
	struct hv_dr_work *dr_wrk;
	struct hv_dr_state *dr = NULL;
	unsigned long flags;

	/* The work item only carries the bus pointer; free it right away. */
	dr_wrk = container_of(work, struct hv_dr_work, wrk);
	hbus = dr_wrk->bus;
	kfree(dr_wrk);

	INIT_LIST_HEAD(&removed);

	/* Pull this off the queue and process it if it was the last one. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	while (!list_empty(&hbus->dr_list)) {
		dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
				      list_entry);
		list_del(&dr->list_entry);

		/* Throw this away if the list still has stuff in it. */
		if (!list_empty(&hbus->dr_list)) {
			kfree(dr);
			continue;
		}
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* Nothing was queued: dr is still NULL. */
	if (!dr)
		return;

	mutex_lock(&hbus->state_lock);

	/* First, mark all existing children as reported missing. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	list_for_each_entry(hpdev, &hbus->children, list_entry) {
		hpdev->reported_missing = true;
	}
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* Next, add back any reported devices. */
	for (child_no = 0; child_no < dr->device_count; child_no++) {
		found = false;
		new_desc = &dr->func[child_no];

		/* A child matches on slot, vendor, device and serial number. */
		spin_lock_irqsave(&hbus->device_list_lock, flags);
		list_for_each_entry(hpdev, &hbus->children, list_entry) {
			if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) &&
			    (hpdev->desc.v_id == new_desc->v_id) &&
			    (hpdev->desc.d_id == new_desc->d_id) &&
			    (hpdev->desc.ser == new_desc->ser)) {
				hpdev->reported_missing = false;
				found = true;
			}
		}
		spin_unlock_irqrestore(&hbus->device_list_lock, flags);

		/*
		 * Not known yet: create it. This is done outside the
		 * spinlock; new_pcichild_device() allocates with GFP_KERNEL
		 * and waits for the host's reply.
		 */
		if (!found) {
			hpdev = new_pcichild_device(hbus, new_desc);
			if (!hpdev)
				dev_err(&hbus->hdev->device,
					"couldn't record a child device.\n");
		}
	}

	/* Move missing children to a list on the stack. */
	spin_lock_irqsave(&hbus->device_list_lock, flags);
	do {
		found = false;
		list_for_each_entry(hpdev, &hbus->children, list_entry) {
			if (hpdev->reported_missing) {
				found = true;
				/*
				 * Drops one of the two references taken in
				 * new_pcichild_device(); the other is dropped
				 * in the deletion loop below.
				 */
				put_pcichild(hpdev);
				list_move_tail(&hpdev->list_entry, &removed);
				/* The list was modified: restart the walk. */
				break;
			}
		}
	} while (found);
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* Delete everything that should no longer exist. */
	while (!list_empty(&removed)) {
		hpdev = list_first_entry(&removed, struct hv_pci_dev,
					 list_entry);
		list_del(&hpdev->list_entry);

		if (hpdev->pci_slot)
			pci_destroy_slot(hpdev->pci_slot);

		put_pcichild(hpdev);
	}

	switch (hbus->state) {
	case hv_pcibus_installed:
		/*
		 * Tell the core to rescan bus
		 * because there may have been changes.
		 */
		pci_lock_rescan_remove();
		pci_scan_child_bus(hbus->bridge->bus);
		hv_pci_assign_numa_node(hbus);
		hv_pci_assign_slots(hbus);
		pci_unlock_rescan_remove();
		break;

	case hv_pcibus_init:
	case hv_pcibus_probed:
		/* Bus not installed yet: only total up the MMIO needs. */
		survey_child_resources(hbus);
		break;

	default:
		break;
	}

	mutex_unlock(&hbus->state_lock);

	kfree(dr);
}

/**
 * hv_pci_start_relations_work() - Queue work to start device discovery
 * @hbus:	Root PCI bus, as understood by this driver
 * @dr:		The list of children returned from host
 *
 * Appends @dr to hbus->dr_list and, unless an earlier entry is still
 * pending there, queues pci_devices_present_work() on hbus->wq. On
 * failure @dr has not been added to the list and remains with the caller.
 *
 * Return:  0 on success, -errno on failure (-ENOENT while the bus is
 *	    being removed, -ENOMEM if the work item cannot be allocated)
 */
static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus,
				       struct hv_dr_state *dr)
{
	struct hv_dr_work *dr_wrk;
	unsigned long flags;
	bool pending_dr;

	if (hbus->state == hv_pcibus_removing) {
		dev_info(&hbus->hdev->device,
			 "PCI VMBus BUS_RELATIONS: ignored\n");
		return -ENOENT;
	}

	dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
	if (!dr_wrk)
		return -ENOMEM;

	INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
	dr_wrk->bus = hbus;

	spin_lock_irqsave(&hbus->device_list_lock, flags);
	/*
	 * If pending_dr is true, we have already queued a work,
	 * which will see the new dr. Otherwise, we need to
	 * queue a new work.
	 */
	pending_dr = !list_empty(&hbus->dr_list);
	list_add_tail(&dr->list_entry, &hbus->dr_list);
	spin_unlock_irqrestore(&hbus->device_list_lock, flags);

	/* The work item just allocated is not needed if one is pending. */
	if (pending_dr)
		kfree(dr_wrk);
	else
		queue_work(hbus->wq, &dr_wrk->wrk);

	return 0;
}

/**
 * hv_pci_devices_present() - Handle list of new children
 * @hbus:	Root PCI bus, as understood by this driver
 * @relations:	Packet from host listing children
 *
 * Process a new list of devices on the bus.
The list of devices is 2744 * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS, 2745 * whenever a new list of devices for this bus appears. 2746 */ 2747 static void hv_pci_devices_present(struct hv_pcibus_device *hbus, 2748 struct pci_bus_relations *relations) 2749 { 2750 struct hv_dr_state *dr; 2751 int i; 2752 2753 dr = kzalloc(struct_size(dr, func, relations->device_count), 2754 GFP_NOWAIT); 2755 if (!dr) 2756 return; 2757 2758 dr->device_count = relations->device_count; 2759 for (i = 0; i < dr->device_count; i++) { 2760 dr->func[i].v_id = relations->func[i].v_id; 2761 dr->func[i].d_id = relations->func[i].d_id; 2762 dr->func[i].rev = relations->func[i].rev; 2763 dr->func[i].prog_intf = relations->func[i].prog_intf; 2764 dr->func[i].subclass = relations->func[i].subclass; 2765 dr->func[i].base_class = relations->func[i].base_class; 2766 dr->func[i].subsystem_id = relations->func[i].subsystem_id; 2767 dr->func[i].win_slot = relations->func[i].win_slot; 2768 dr->func[i].ser = relations->func[i].ser; 2769 } 2770 2771 if (hv_pci_start_relations_work(hbus, dr)) 2772 kfree(dr); 2773 } 2774 2775 /** 2776 * hv_pci_devices_present2() - Handle list of new children 2777 * @hbus: Root PCI bus, as understood by this driver 2778 * @relations: Packet from host listing children 2779 * 2780 * This function is the v2 version of hv_pci_devices_present() 2781 */ 2782 static void hv_pci_devices_present2(struct hv_pcibus_device *hbus, 2783 struct pci_bus_relations2 *relations) 2784 { 2785 struct hv_dr_state *dr; 2786 int i; 2787 2788 dr = kzalloc(struct_size(dr, func, relations->device_count), 2789 GFP_NOWAIT); 2790 if (!dr) 2791 return; 2792 2793 dr->device_count = relations->device_count; 2794 for (i = 0; i < dr->device_count; i++) { 2795 dr->func[i].v_id = relations->func[i].v_id; 2796 dr->func[i].d_id = relations->func[i].d_id; 2797 dr->func[i].rev = relations->func[i].rev; 2798 dr->func[i].prog_intf = relations->func[i].prog_intf; 2799 dr->func[i].subclass = 
relations->func[i].subclass; 2800 dr->func[i].base_class = relations->func[i].base_class; 2801 dr->func[i].subsystem_id = relations->func[i].subsystem_id; 2802 dr->func[i].win_slot = relations->func[i].win_slot; 2803 dr->func[i].ser = relations->func[i].ser; 2804 dr->func[i].flags = relations->func[i].flags; 2805 dr->func[i].virtual_numa_node = 2806 relations->func[i].virtual_numa_node; 2807 } 2808 2809 if (hv_pci_start_relations_work(hbus, dr)) 2810 kfree(dr); 2811 } 2812 2813 /** 2814 * hv_eject_device_work() - Asynchronously handles ejection 2815 * @work: Work struct embedded in internal device struct 2816 * 2817 * This function handles ejecting a device. Windows will 2818 * attempt to gracefully eject a device, waiting 60 seconds to 2819 * hear back from the guest OS that this completed successfully. 2820 * If this timer expires, the device will be forcibly removed. 2821 */ 2822 static void hv_eject_device_work(struct work_struct *work) 2823 { 2824 struct pci_eject_response *ejct_pkt; 2825 struct hv_pcibus_device *hbus; 2826 struct hv_pci_dev *hpdev; 2827 struct pci_dev *pdev; 2828 unsigned long flags; 2829 int wslot; 2830 struct { 2831 struct pci_packet pkt; 2832 u8 buffer[sizeof(struct pci_eject_response)]; 2833 } ctxt; 2834 2835 hpdev = container_of(work, struct hv_pci_dev, wrk); 2836 hbus = hpdev->hbus; 2837 2838 mutex_lock(&hbus->state_lock); 2839 2840 /* 2841 * Ejection can come before or after the PCI bus has been set up, so 2842 * attempt to find it and tear down the bus state, if it exists. This 2843 * must be done without constructs like pci_domain_nr(hbus->bridge->bus) 2844 * because hbus->bridge->bus may not exist yet. 
2845 */ 2846 wslot = wslot_to_devfn(hpdev->desc.win_slot.slot); 2847 pdev = pci_get_domain_bus_and_slot(hbus->bridge->domain_nr, 0, wslot); 2848 if (pdev) { 2849 pci_lock_rescan_remove(); 2850 pci_stop_and_remove_bus_device(pdev); 2851 pci_dev_put(pdev); 2852 pci_unlock_rescan_remove(); 2853 } 2854 2855 spin_lock_irqsave(&hbus->device_list_lock, flags); 2856 list_del(&hpdev->list_entry); 2857 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2858 2859 if (hpdev->pci_slot) 2860 pci_destroy_slot(hpdev->pci_slot); 2861 2862 memset(&ctxt, 0, sizeof(ctxt)); 2863 ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message; 2864 ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE; 2865 ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot; 2866 vmbus_sendpacket(hbus->hdev->channel, ejct_pkt, 2867 sizeof(*ejct_pkt), 0, 2868 VM_PKT_DATA_INBAND, 0); 2869 2870 /* For the get_pcichild() in hv_pci_eject_device() */ 2871 put_pcichild(hpdev); 2872 /* For the two refs got in new_pcichild_device() */ 2873 put_pcichild(hpdev); 2874 put_pcichild(hpdev); 2875 /* hpdev has been freed. Do not use it any more. */ 2876 2877 mutex_unlock(&hbus->state_lock); 2878 } 2879 2880 /** 2881 * hv_pci_eject_device() - Handles device ejection 2882 * @hpdev: Internal device tracking struct 2883 * 2884 * This function is invoked when an ejection packet arrives. It 2885 * just schedules work so that we don't re-enter the packet 2886 * delivery code handling the ejection. 
2887 */ 2888 static void hv_pci_eject_device(struct hv_pci_dev *hpdev) 2889 { 2890 struct hv_pcibus_device *hbus = hpdev->hbus; 2891 struct hv_device *hdev = hbus->hdev; 2892 2893 if (hbus->state == hv_pcibus_removing) { 2894 dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n"); 2895 return; 2896 } 2897 2898 get_pcichild(hpdev); 2899 INIT_WORK(&hpdev->wrk, hv_eject_device_work); 2900 queue_work(hbus->wq, &hpdev->wrk); 2901 } 2902 2903 /** 2904 * hv_pci_onchannelcallback() - Handles incoming packets 2905 * @context: Internal bus tracking struct 2906 * 2907 * This function is invoked whenever the host sends a packet to 2908 * this channel (which is private to this root PCI bus). 2909 */ 2910 static void hv_pci_onchannelcallback(void *context) 2911 { 2912 const int packet_size = 0x100; 2913 int ret; 2914 struct hv_pcibus_device *hbus = context; 2915 struct vmbus_channel *chan = hbus->hdev->channel; 2916 u32 bytes_recvd; 2917 u64 req_id, req_addr; 2918 struct vmpacket_descriptor *desc; 2919 unsigned char *buffer; 2920 int bufferlen = packet_size; 2921 struct pci_packet *comp_packet; 2922 struct pci_response *response; 2923 struct pci_incoming_message *new_message; 2924 struct pci_bus_relations *bus_rel; 2925 struct pci_bus_relations2 *bus_rel2; 2926 struct pci_dev_inval_block *inval; 2927 struct pci_dev_incoming *dev_message; 2928 struct hv_pci_dev *hpdev; 2929 unsigned long flags; 2930 2931 buffer = kmalloc(bufferlen, GFP_ATOMIC); 2932 if (!buffer) 2933 return; 2934 2935 while (1) { 2936 ret = vmbus_recvpacket_raw(chan, buffer, bufferlen, 2937 &bytes_recvd, &req_id); 2938 2939 if (ret == -ENOBUFS) { 2940 kfree(buffer); 2941 /* Handle large packet */ 2942 bufferlen = bytes_recvd; 2943 buffer = kmalloc(bytes_recvd, GFP_ATOMIC); 2944 if (!buffer) 2945 return; 2946 continue; 2947 } 2948 2949 /* Zero length indicates there are no more packets. 
*/ 2950 if (ret || !bytes_recvd) 2951 break; 2952 2953 /* 2954 * All incoming packets must be at least as large as a 2955 * response. 2956 */ 2957 if (bytes_recvd <= sizeof(struct pci_response)) 2958 continue; 2959 desc = (struct vmpacket_descriptor *)buffer; 2960 2961 switch (desc->type) { 2962 case VM_PKT_COMP: 2963 2964 lock_requestor(chan, flags); 2965 req_addr = __vmbus_request_addr_match(chan, req_id, 2966 VMBUS_RQST_ADDR_ANY); 2967 if (req_addr == VMBUS_RQST_ERROR) { 2968 unlock_requestor(chan, flags); 2969 dev_err(&hbus->hdev->device, 2970 "Invalid transaction ID %llx\n", 2971 req_id); 2972 break; 2973 } 2974 comp_packet = (struct pci_packet *)req_addr; 2975 response = (struct pci_response *)buffer; 2976 /* 2977 * Call ->completion_func() within the critical section to make 2978 * sure that the packet pointer is still valid during the call: 2979 * here 'valid' means that there's a task still waiting for the 2980 * completion, and that the packet data is still on the waiting 2981 * task's stack. Cf. hv_compose_msi_msg(). 
2982 */ 2983 comp_packet->completion_func(comp_packet->compl_ctxt, 2984 response, 2985 bytes_recvd); 2986 unlock_requestor(chan, flags); 2987 break; 2988 2989 case VM_PKT_DATA_INBAND: 2990 2991 new_message = (struct pci_incoming_message *)buffer; 2992 switch (new_message->message_type.type) { 2993 case PCI_BUS_RELATIONS: 2994 2995 bus_rel = (struct pci_bus_relations *)buffer; 2996 if (bytes_recvd < sizeof(*bus_rel) || 2997 bytes_recvd < 2998 struct_size(bus_rel, func, 2999 bus_rel->device_count)) { 3000 dev_err(&hbus->hdev->device, 3001 "bus relations too small\n"); 3002 break; 3003 } 3004 3005 hv_pci_devices_present(hbus, bus_rel); 3006 break; 3007 3008 case PCI_BUS_RELATIONS2: 3009 3010 bus_rel2 = (struct pci_bus_relations2 *)buffer; 3011 if (bytes_recvd < sizeof(*bus_rel2) || 3012 bytes_recvd < 3013 struct_size(bus_rel2, func, 3014 bus_rel2->device_count)) { 3015 dev_err(&hbus->hdev->device, 3016 "bus relations v2 too small\n"); 3017 break; 3018 } 3019 3020 hv_pci_devices_present2(hbus, bus_rel2); 3021 break; 3022 3023 case PCI_EJECT: 3024 3025 dev_message = (struct pci_dev_incoming *)buffer; 3026 if (bytes_recvd < sizeof(*dev_message)) { 3027 dev_err(&hbus->hdev->device, 3028 "eject message too small\n"); 3029 break; 3030 } 3031 hpdev = get_pcichild_wslot(hbus, 3032 dev_message->wslot.slot); 3033 if (hpdev) { 3034 hv_pci_eject_device(hpdev); 3035 put_pcichild(hpdev); 3036 } 3037 break; 3038 3039 case PCI_INVALIDATE_BLOCK: 3040 3041 inval = (struct pci_dev_inval_block *)buffer; 3042 if (bytes_recvd < sizeof(*inval)) { 3043 dev_err(&hbus->hdev->device, 3044 "invalidate message too small\n"); 3045 break; 3046 } 3047 hpdev = get_pcichild_wslot(hbus, 3048 inval->wslot.slot); 3049 if (hpdev) { 3050 if (hpdev->block_invalidate) { 3051 hpdev->block_invalidate( 3052 hpdev->invalidate_context, 3053 inval->block_mask); 3054 } 3055 put_pcichild(hpdev); 3056 } 3057 break; 3058 3059 default: 3060 dev_warn(&hbus->hdev->device, 3061 "Unimplemented protocol message %x\n", 3062 
new_message->message_type.type); 3063 break; 3064 } 3065 break; 3066 3067 default: 3068 dev_err(&hbus->hdev->device, 3069 "unhandled packet type %d, tid %llx len %d\n", 3070 desc->type, req_id, bytes_recvd); 3071 break; 3072 } 3073 } 3074 3075 kfree(buffer); 3076 } 3077 3078 /** 3079 * hv_pci_protocol_negotiation() - Set up protocol 3080 * @hdev: VMBus's tracking struct for this root PCI bus. 3081 * @version: Array of supported channel protocol versions in 3082 * the order of probing - highest go first. 3083 * @num_version: Number of elements in the version array. 3084 * 3085 * This driver is intended to support running on Windows 10 3086 * (server) and later versions. It will not run on earlier 3087 * versions, as they assume that many of the operations which 3088 * Linux needs accomplished with a spinlock held were done via 3089 * asynchronous messaging via VMBus. Windows 10 increases the 3090 * surface area of PCI emulation so that these actions can take 3091 * place by suspending a virtual processor for their duration. 3092 * 3093 * This function negotiates the channel protocol version, 3094 * failing if the host doesn't support the necessary protocol 3095 * level. 3096 */ 3097 static int hv_pci_protocol_negotiation(struct hv_device *hdev, 3098 enum pci_protocol_version_t version[], 3099 int num_version) 3100 { 3101 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3102 struct pci_version_request *version_req; 3103 struct hv_pci_compl comp_pkt; 3104 struct pci_packet *pkt; 3105 int ret; 3106 int i; 3107 3108 /* 3109 * Initiate the handshake with the host and negotiate 3110 * a version that the host can support. We start with the 3111 * highest version number and go down if the host cannot 3112 * support it. 
3113 */ 3114 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL); 3115 if (!pkt) 3116 return -ENOMEM; 3117 3118 init_completion(&comp_pkt.host_event); 3119 pkt->completion_func = hv_pci_generic_compl; 3120 pkt->compl_ctxt = &comp_pkt; 3121 version_req = (struct pci_version_request *)&pkt->message; 3122 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; 3123 3124 for (i = 0; i < num_version; i++) { 3125 version_req->protocol_version = version[i]; 3126 ret = vmbus_sendpacket(hdev->channel, version_req, 3127 sizeof(struct pci_version_request), 3128 (unsigned long)pkt, VM_PKT_DATA_INBAND, 3129 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3130 if (!ret) 3131 ret = wait_for_response(hdev, &comp_pkt.host_event); 3132 3133 if (ret) { 3134 dev_err(&hdev->device, 3135 "PCI Pass-through VSP failed to request version: %d", 3136 ret); 3137 goto exit; 3138 } 3139 3140 if (comp_pkt.completion_status >= 0) { 3141 hbus->protocol_version = version[i]; 3142 dev_info(&hdev->device, 3143 "PCI VMBus probing: Using version %#x\n", 3144 hbus->protocol_version); 3145 goto exit; 3146 } 3147 3148 if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) { 3149 dev_err(&hdev->device, 3150 "PCI Pass-through VSP failed version request: %#x", 3151 comp_pkt.completion_status); 3152 ret = -EPROTO; 3153 goto exit; 3154 } 3155 3156 reinit_completion(&comp_pkt.host_event); 3157 } 3158 3159 dev_err(&hdev->device, 3160 "PCI pass-through VSP failed to find supported version"); 3161 ret = -EPROTO; 3162 3163 exit: 3164 kfree(pkt); 3165 return ret; 3166 } 3167 3168 /** 3169 * hv_pci_free_bridge_windows() - Release memory regions for the 3170 * bus 3171 * @hbus: Root PCI bus, as understood by this driver 3172 */ 3173 static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus) 3174 { 3175 /* 3176 * Set the resources back to the way they looked when they 3177 * were allocated by setting IORESOURCE_BUSY again. 
3178 */ 3179 3180 if (hbus->low_mmio_space && hbus->low_mmio_res) { 3181 hbus->low_mmio_res->flags |= IORESOURCE_BUSY; 3182 vmbus_free_mmio(hbus->low_mmio_res->start, 3183 resource_size(hbus->low_mmio_res)); 3184 } 3185 3186 if (hbus->high_mmio_space && hbus->high_mmio_res) { 3187 hbus->high_mmio_res->flags |= IORESOURCE_BUSY; 3188 vmbus_free_mmio(hbus->high_mmio_res->start, 3189 resource_size(hbus->high_mmio_res)); 3190 } 3191 } 3192 3193 /** 3194 * hv_pci_allocate_bridge_windows() - Allocate memory regions 3195 * for the bus 3196 * @hbus: Root PCI bus, as understood by this driver 3197 * 3198 * This function calls vmbus_allocate_mmio(), which is itself a 3199 * bit of a compromise. Ideally, we might change the pnp layer 3200 * in the kernel such that it comprehends either PCI devices 3201 * which are "grandchildren of ACPI," with some intermediate bus 3202 * node (in this case, VMBus) or change it such that it 3203 * understands VMBus. The pnp layer, however, has been declared 3204 * deprecated, and not subject to change. 3205 * 3206 * The workaround, implemented here, is to ask VMBus to allocate 3207 * MMIO space for this bus. VMBus itself knows which ranges are 3208 * appropriate by looking at its own ACPI objects. Then, after 3209 * these ranges are claimed, they're modified to look like they 3210 * would have looked if the ACPI and pnp code had allocated 3211 * bridge windows. These descriptors have to exist in this form 3212 * in order to satisfy the code which will get invoked when the 3213 * endpoint PCI function driver calls request_mem_region() or 3214 * request_mem_region_exclusive(). 
3215 * 3216 * Return: 0 on success, -errno on failure 3217 */ 3218 static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus) 3219 { 3220 resource_size_t align; 3221 int ret; 3222 3223 if (hbus->low_mmio_space) { 3224 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 3225 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0, 3226 (u64)(u32)0xffffffff, 3227 hbus->low_mmio_space, 3228 align, false); 3229 if (ret) { 3230 dev_err(&hbus->hdev->device, 3231 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n", 3232 hbus->low_mmio_space); 3233 return ret; 3234 } 3235 3236 /* Modify this resource to become a bridge window. */ 3237 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW; 3238 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY; 3239 pci_add_resource(&hbus->bridge->windows, hbus->low_mmio_res); 3240 } 3241 3242 if (hbus->high_mmio_space) { 3243 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space)); 3244 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev, 3245 0x100000000, -1, 3246 hbus->high_mmio_space, align, 3247 false); 3248 if (ret) { 3249 dev_err(&hbus->hdev->device, 3250 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n", 3251 hbus->high_mmio_space); 3252 goto release_low_mmio; 3253 } 3254 3255 /* Modify this resource to become a bridge window. */ 3256 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW; 3257 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY; 3258 pci_add_resource(&hbus->bridge->windows, hbus->high_mmio_res); 3259 } 3260 3261 return 0; 3262 3263 release_low_mmio: 3264 if (hbus->low_mmio_res) { 3265 vmbus_free_mmio(hbus->low_mmio_res->start, 3266 resource_size(hbus->low_mmio_res)); 3267 } 3268 3269 return ret; 3270 } 3271 3272 /** 3273 * hv_allocate_config_window() - Find MMIO space for PCI Config 3274 * @hbus: Root PCI bus, as understood by this driver 3275 * 3276 * This function claims memory-mapped I/O space for accessing 3277 * configuration space for the functions on this bus. 
3278 * 3279 * Return: 0 on success, -errno on failure 3280 */ 3281 static int hv_allocate_config_window(struct hv_pcibus_device *hbus) 3282 { 3283 int ret; 3284 3285 /* 3286 * Set up a region of MMIO space to use for accessing configuration 3287 * space. 3288 */ 3289 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1, 3290 PCI_CONFIG_MMIO_LENGTH, 0x1000, false); 3291 if (ret) 3292 return ret; 3293 3294 /* 3295 * vmbus_allocate_mmio() gets used for allocating both device endpoint 3296 * resource claims (those which cannot be overlapped) and the ranges 3297 * which are valid for the children of this bus, which are intended 3298 * to be overlapped by those children. Set the flag on this claim 3299 * meaning that this region can't be overlapped. 3300 */ 3301 3302 hbus->mem_config->flags |= IORESOURCE_BUSY; 3303 3304 return 0; 3305 } 3306 3307 static void hv_free_config_window(struct hv_pcibus_device *hbus) 3308 { 3309 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); 3310 } 3311 3312 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs); 3313 3314 /** 3315 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state 3316 * @hdev: VMBus's tracking struct for this root PCI bus 3317 * 3318 * Return: 0 on success, -errno on failure 3319 */ 3320 static int hv_pci_enter_d0(struct hv_device *hdev) 3321 { 3322 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3323 struct pci_bus_d0_entry *d0_entry; 3324 struct hv_pci_compl comp_pkt; 3325 struct pci_packet *pkt; 3326 bool retry = true; 3327 int ret; 3328 3329 enter_d0_retry: 3330 /* 3331 * Tell the host that the bus is ready to use, and moved into the 3332 * powered-on state. This includes telling the host which region 3333 * of memory-mapped I/O space has been chosen for configuration space 3334 * access. 
3335 */ 3336 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL); 3337 if (!pkt) 3338 return -ENOMEM; 3339 3340 init_completion(&comp_pkt.host_event); 3341 pkt->completion_func = hv_pci_generic_compl; 3342 pkt->compl_ctxt = &comp_pkt; 3343 d0_entry = (struct pci_bus_d0_entry *)&pkt->message; 3344 d0_entry->message_type.type = PCI_BUS_D0ENTRY; 3345 d0_entry->mmio_base = hbus->mem_config->start; 3346 3347 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry), 3348 (unsigned long)pkt, VM_PKT_DATA_INBAND, 3349 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3350 if (!ret) 3351 ret = wait_for_response(hdev, &comp_pkt.host_event); 3352 3353 if (ret) 3354 goto exit; 3355 3356 /* 3357 * In certain case (Kdump) the pci device of interest was 3358 * not cleanly shut down and resource is still held on host 3359 * side, the host could return invalid device status. 3360 * We need to explicitly request host to release the resource 3361 * and try to enter D0 again. 3362 */ 3363 if (comp_pkt.completion_status < 0 && retry) { 3364 retry = false; 3365 3366 dev_err(&hdev->device, "Retrying D0 Entry\n"); 3367 3368 /* 3369 * Hv_pci_bus_exit() calls hv_send_resource_released() 3370 * to free up resources of its child devices. 3371 * In the kdump kernel we need to set the 3372 * wslot_res_allocated to 255 so it scans all child 3373 * devices to release resources allocated in the 3374 * normal kernel before panic happened. 
3375 */ 3376 hbus->wslot_res_allocated = 255; 3377 3378 ret = hv_pci_bus_exit(hdev, true); 3379 3380 if (ret == 0) { 3381 kfree(pkt); 3382 goto enter_d0_retry; 3383 } 3384 dev_err(&hdev->device, 3385 "Retrying D0 failed with ret %d\n", ret); 3386 } 3387 3388 if (comp_pkt.completion_status < 0) { 3389 dev_err(&hdev->device, 3390 "PCI Pass-through VSP failed D0 Entry with status %x\n", 3391 comp_pkt.completion_status); 3392 ret = -EPROTO; 3393 goto exit; 3394 } 3395 3396 ret = 0; 3397 3398 exit: 3399 kfree(pkt); 3400 return ret; 3401 } 3402 3403 /** 3404 * hv_pci_query_relations() - Ask host to send list of child 3405 * devices 3406 * @hdev: VMBus's tracking struct for this root PCI bus 3407 * 3408 * Return: 0 on success, -errno on failure 3409 */ 3410 static int hv_pci_query_relations(struct hv_device *hdev) 3411 { 3412 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3413 struct pci_message message; 3414 struct completion comp; 3415 int ret; 3416 3417 /* Ask the host to send along the list of child devices */ 3418 init_completion(&comp); 3419 if (cmpxchg(&hbus->survey_event, NULL, &comp)) 3420 return -ENOTEMPTY; 3421 3422 memset(&message, 0, sizeof(message)); 3423 message.type = PCI_QUERY_BUS_RELATIONS; 3424 3425 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message), 3426 0, VM_PKT_DATA_INBAND, 0); 3427 if (!ret) 3428 ret = wait_for_response(hdev, &comp); 3429 3430 /* 3431 * In the case of fast device addition/removal, it's possible that 3432 * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we 3433 * already got a PCI_BUS_RELATIONS* message from the host and the 3434 * channel callback already scheduled a work to hbus->wq, which can be 3435 * running pci_devices_present_work() -> survey_child_resources() -> 3436 * complete(&hbus->survey_event), even after hv_pci_query_relations() 3437 * exits and the stack variable 'comp' is no longer valid; as a result, 3438 * a hang or a page fault may happen when the complete() calls 3439 * 
raw_spin_lock_irqsave(). Flush hbus->wq before we exit from 3440 * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is 3441 * -ENODEV, there can't be any more work item scheduled to hbus->wq 3442 * after the flush_workqueue(): see vmbus_onoffer_rescind() -> 3443 * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() -> 3444 * channel->rescind = true. 3445 */ 3446 flush_workqueue(hbus->wq); 3447 3448 return ret; 3449 } 3450 3451 /** 3452 * hv_send_resources_allocated() - Report local resource choices 3453 * @hdev: VMBus's tracking struct for this root PCI bus 3454 * 3455 * The host OS is expecting to be sent a request as a message 3456 * which contains all the resources that the device will use. 3457 * The response contains those same resources, "translated" 3458 * which is to say, the values which should be used by the 3459 * hardware, when it delivers an interrupt. (MMIO resources are 3460 * used in local terms.) This is nice for Windows, and lines up 3461 * with the FDO/PDO split, which doesn't exist in Linux. Linux 3462 * is deeply expecting to scan an emulated PCI configuration 3463 * space. So this message is sent here only to drive the state 3464 * machine on the host forward. 3465 * 3466 * Return: 0 on success, -errno on failure 3467 */ 3468 static int hv_send_resources_allocated(struct hv_device *hdev) 3469 { 3470 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3471 struct pci_resources_assigned *res_assigned; 3472 struct pci_resources_assigned2 *res_assigned2; 3473 struct hv_pci_compl comp_pkt; 3474 struct hv_pci_dev *hpdev; 3475 struct pci_packet *pkt; 3476 size_t size_res; 3477 int wslot; 3478 int ret; 3479 3480 size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) 3481 ? 
sizeof(*res_assigned) : sizeof(*res_assigned2); 3482 3483 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL); 3484 if (!pkt) 3485 return -ENOMEM; 3486 3487 ret = 0; 3488 3489 for (wslot = 0; wslot < 256; wslot++) { 3490 hpdev = get_pcichild_wslot(hbus, wslot); 3491 if (!hpdev) 3492 continue; 3493 3494 memset(pkt, 0, sizeof(*pkt) + size_res); 3495 init_completion(&comp_pkt.host_event); 3496 pkt->completion_func = hv_pci_generic_compl; 3497 pkt->compl_ctxt = &comp_pkt; 3498 3499 if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) { 3500 res_assigned = 3501 (struct pci_resources_assigned *)&pkt->message; 3502 res_assigned->message_type.type = 3503 PCI_RESOURCES_ASSIGNED; 3504 res_assigned->wslot.slot = hpdev->desc.win_slot.slot; 3505 } else { 3506 res_assigned2 = 3507 (struct pci_resources_assigned2 *)&pkt->message; 3508 res_assigned2->message_type.type = 3509 PCI_RESOURCES_ASSIGNED2; 3510 res_assigned2->wslot.slot = hpdev->desc.win_slot.slot; 3511 } 3512 put_pcichild(hpdev); 3513 3514 ret = vmbus_sendpacket(hdev->channel, &pkt->message, 3515 size_res, (unsigned long)pkt, 3516 VM_PKT_DATA_INBAND, 3517 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3518 if (!ret) 3519 ret = wait_for_response(hdev, &comp_pkt.host_event); 3520 if (ret) 3521 break; 3522 3523 if (comp_pkt.completion_status < 0) { 3524 ret = -EPROTO; 3525 dev_err(&hdev->device, 3526 "resource allocated returned 0x%x", 3527 comp_pkt.completion_status); 3528 break; 3529 } 3530 3531 hbus->wslot_res_allocated = wslot; 3532 } 3533 3534 kfree(pkt); 3535 return ret; 3536 } 3537 3538 /** 3539 * hv_send_resources_released() - Report local resources 3540 * released 3541 * @hdev: VMBus's tracking struct for this root PCI bus 3542 * 3543 * Return: 0 on success, -errno on failure 3544 */ 3545 static int hv_send_resources_released(struct hv_device *hdev) 3546 { 3547 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3548 struct pci_child_message pkt; 3549 struct hv_pci_dev *hpdev; 3550 int wslot; 3551 int ret; 3552 
3553 for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) { 3554 hpdev = get_pcichild_wslot(hbus, wslot); 3555 if (!hpdev) 3556 continue; 3557 3558 memset(&pkt, 0, sizeof(pkt)); 3559 pkt.message_type.type = PCI_RESOURCES_RELEASED; 3560 pkt.wslot.slot = hpdev->desc.win_slot.slot; 3561 3562 put_pcichild(hpdev); 3563 3564 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0, 3565 VM_PKT_DATA_INBAND, 0); 3566 if (ret) 3567 return ret; 3568 3569 hbus->wslot_res_allocated = wslot - 1; 3570 } 3571 3572 hbus->wslot_res_allocated = -1; 3573 3574 return 0; 3575 } 3576 3577 #define HVPCI_DOM_MAP_SIZE (64 * 1024) 3578 static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); 3579 3580 /* 3581 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 3582 * as invalid for passthrough PCI devices of this driver. 3583 */ 3584 #define HVPCI_DOM_INVALID 0 3585 3586 /** 3587 * hv_get_dom_num() - Get a valid PCI domain number 3588 * Check if the PCI domain number is in use, and return another number if 3589 * it is in use. 
3590 * 3591 * @dom: Requested domain number 3592 * 3593 * return: domain number on success, HVPCI_DOM_INVALID on failure 3594 */ 3595 static u16 hv_get_dom_num(u16 dom) 3596 { 3597 unsigned int i; 3598 3599 if (test_and_set_bit(dom, hvpci_dom_map) == 0) 3600 return dom; 3601 3602 for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { 3603 if (test_and_set_bit(i, hvpci_dom_map) == 0) 3604 return i; 3605 } 3606 3607 return HVPCI_DOM_INVALID; 3608 } 3609 3610 /** 3611 * hv_put_dom_num() - Mark the PCI domain number as free 3612 * @dom: Domain number to be freed 3613 */ 3614 static void hv_put_dom_num(u16 dom) 3615 { 3616 clear_bit(dom, hvpci_dom_map); 3617 } 3618 3619 /** 3620 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus 3621 * @hdev: VMBus's tracking struct for this root PCI bus 3622 * @dev_id: Identifies the device itself 3623 * 3624 * Return: 0 on success, -errno on failure 3625 */ 3626 static int hv_pci_probe(struct hv_device *hdev, 3627 const struct hv_vmbus_device_id *dev_id) 3628 { 3629 struct pci_host_bridge *bridge; 3630 struct hv_pcibus_device *hbus; 3631 u16 dom_req, dom; 3632 char *name; 3633 int ret; 3634 3635 bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); 3636 if (!bridge) 3637 return -ENOMEM; 3638 3639 hbus = kzalloc(sizeof(*hbus), GFP_KERNEL); 3640 if (!hbus) 3641 return -ENOMEM; 3642 3643 hbus->bridge = bridge; 3644 mutex_init(&hbus->state_lock); 3645 hbus->state = hv_pcibus_init; 3646 hbus->wslot_res_allocated = -1; 3647 3648 /* 3649 * The PCI bus "domain" is what is called "segment" in ACPI and other 3650 * specs. Pull it from the instance ID, to get something usually 3651 * unique. In rare cases of collision, we will find out another number 3652 * not in use. 
3653 * 3654 * Note that, since this code only runs in a Hyper-V VM, Hyper-V 3655 * together with this guest driver can guarantee that (1) The only 3656 * domain used by Gen1 VMs for something that looks like a physical 3657 * PCI bus (which is actually emulated by the hypervisor) is domain 0. 3658 * (2) There will be no overlap between domains (after fixing possible 3659 * collisions) in the same VM. 3660 */ 3661 dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; 3662 dom = hv_get_dom_num(dom_req); 3663 3664 if (dom == HVPCI_DOM_INVALID) { 3665 dev_err(&hdev->device, 3666 "Unable to use dom# 0x%x or other numbers", dom_req); 3667 ret = -EINVAL; 3668 goto free_bus; 3669 } 3670 3671 if (dom != dom_req) 3672 dev_info(&hdev->device, 3673 "PCI dom# 0x%x has collision, using 0x%x", 3674 dom_req, dom); 3675 3676 hbus->bridge->domain_nr = dom; 3677 #ifdef CONFIG_X86 3678 hbus->sysdata.domain = dom; 3679 hbus->use_calls = !!(ms_hyperv.hints & HV_X64_USE_MMIO_HYPERCALLS); 3680 #elif defined(CONFIG_ARM64) 3681 /* 3682 * Set the PCI bus parent to be the corresponding VMbus 3683 * device. Then the VMbus device will be assigned as the 3684 * ACPI companion in pcibios_root_bridge_prepare() and 3685 * pci_dma_configure() will propagate device coherence 3686 * information to devices created on the bus. 
3687 */ 3688 hbus->sysdata.parent = hdev->device.parent; 3689 hbus->use_calls = false; 3690 #endif 3691 3692 hbus->hdev = hdev; 3693 INIT_LIST_HEAD(&hbus->children); 3694 INIT_LIST_HEAD(&hbus->dr_list); 3695 spin_lock_init(&hbus->config_lock); 3696 spin_lock_init(&hbus->device_list_lock); 3697 hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0, 3698 hbus->bridge->domain_nr); 3699 if (!hbus->wq) { 3700 ret = -ENOMEM; 3701 goto free_dom; 3702 } 3703 3704 hdev->channel->next_request_id_callback = vmbus_next_request_id; 3705 hdev->channel->request_addr_callback = vmbus_request_addr; 3706 hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE; 3707 3708 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 3709 hv_pci_onchannelcallback, hbus); 3710 if (ret) 3711 goto destroy_wq; 3712 3713 hv_set_drvdata(hdev, hbus); 3714 3715 ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions, 3716 ARRAY_SIZE(pci_protocol_versions)); 3717 if (ret) 3718 goto close; 3719 3720 ret = hv_allocate_config_window(hbus); 3721 if (ret) 3722 goto close; 3723 3724 hbus->cfg_addr = ioremap(hbus->mem_config->start, 3725 PCI_CONFIG_MMIO_LENGTH); 3726 if (!hbus->cfg_addr) { 3727 dev_err(&hdev->device, 3728 "Unable to map a virtual address for config space\n"); 3729 ret = -ENOMEM; 3730 goto free_config; 3731 } 3732 3733 name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance); 3734 if (!name) { 3735 ret = -ENOMEM; 3736 goto unmap; 3737 } 3738 3739 hbus->fwnode = irq_domain_alloc_named_fwnode(name); 3740 kfree(name); 3741 if (!hbus->fwnode) { 3742 ret = -ENOMEM; 3743 goto unmap; 3744 } 3745 3746 ret = hv_pcie_init_irq_domain(hbus); 3747 if (ret) 3748 goto free_fwnode; 3749 3750 ret = hv_pci_query_relations(hdev); 3751 if (ret) 3752 goto free_irq_domain; 3753 3754 mutex_lock(&hbus->state_lock); 3755 3756 ret = hv_pci_enter_d0(hdev); 3757 if (ret) 3758 goto release_state_lock; 3759 3760 ret = hv_pci_allocate_bridge_windows(hbus); 3761 if (ret) 3762 goto exit_d0; 3763 3764 ret = 
hv_send_resources_allocated(hdev); 3765 if (ret) 3766 goto free_windows; 3767 3768 prepopulate_bars(hbus); 3769 3770 hbus->state = hv_pcibus_probed; 3771 3772 ret = create_root_hv_pci_bus(hbus); 3773 if (ret) 3774 goto free_windows; 3775 3776 mutex_unlock(&hbus->state_lock); 3777 return 0; 3778 3779 free_windows: 3780 hv_pci_free_bridge_windows(hbus); 3781 exit_d0: 3782 (void) hv_pci_bus_exit(hdev, true); 3783 release_state_lock: 3784 mutex_unlock(&hbus->state_lock); 3785 free_irq_domain: 3786 irq_domain_remove(hbus->irq_domain); 3787 free_fwnode: 3788 irq_domain_free_fwnode(hbus->fwnode); 3789 unmap: 3790 iounmap(hbus->cfg_addr); 3791 free_config: 3792 hv_free_config_window(hbus); 3793 close: 3794 vmbus_close(hdev->channel); 3795 destroy_wq: 3796 destroy_workqueue(hbus->wq); 3797 free_dom: 3798 hv_put_dom_num(hbus->bridge->domain_nr); 3799 free_bus: 3800 kfree(hbus); 3801 return ret; 3802 } 3803 3804 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) 3805 { 3806 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3807 struct vmbus_channel *chan = hdev->channel; 3808 struct { 3809 struct pci_packet teardown_packet; 3810 u8 buffer[sizeof(struct pci_message)]; 3811 } pkt; 3812 struct hv_pci_compl comp_pkt; 3813 struct hv_pci_dev *hpdev, *tmp; 3814 unsigned long flags; 3815 u64 trans_id; 3816 int ret; 3817 3818 /* 3819 * After the host sends the RESCIND_CHANNEL message, it doesn't 3820 * access the per-channel ringbuffer any longer. 
3821 */ 3822 if (chan->rescind) 3823 return 0; 3824 3825 if (!keep_devs) { 3826 struct list_head removed; 3827 3828 /* Move all present children to the list on stack */ 3829 INIT_LIST_HEAD(&removed); 3830 spin_lock_irqsave(&hbus->device_list_lock, flags); 3831 list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry) 3832 list_move_tail(&hpdev->list_entry, &removed); 3833 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 3834 3835 /* Remove all children in the list */ 3836 list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) { 3837 list_del(&hpdev->list_entry); 3838 if (hpdev->pci_slot) 3839 pci_destroy_slot(hpdev->pci_slot); 3840 /* For the two refs got in new_pcichild_device() */ 3841 put_pcichild(hpdev); 3842 put_pcichild(hpdev); 3843 } 3844 } 3845 3846 ret = hv_send_resources_released(hdev); 3847 if (ret) { 3848 dev_err(&hdev->device, 3849 "Couldn't send resources released packet(s)\n"); 3850 return ret; 3851 } 3852 3853 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet)); 3854 init_completion(&comp_pkt.host_event); 3855 pkt.teardown_packet.completion_func = hv_pci_generic_compl; 3856 pkt.teardown_packet.compl_ctxt = &comp_pkt; 3857 pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT; 3858 3859 ret = vmbus_sendpacket_getid(chan, &pkt.teardown_packet.message, 3860 sizeof(struct pci_message), 3861 (unsigned long)&pkt.teardown_packet, 3862 &trans_id, VM_PKT_DATA_INBAND, 3863 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3864 if (ret) 3865 return ret; 3866 3867 if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) { 3868 /* 3869 * The completion packet on the stack becomes invalid after 3870 * 'return'; remove the ID from the VMbus requestor if the 3871 * identifier is still mapped to/associated with the packet. 3872 * 3873 * Cf. hv_pci_onchannelcallback(). 
3874 */ 3875 vmbus_request_addr_match(chan, trans_id, 3876 (unsigned long)&pkt.teardown_packet); 3877 return -ETIMEDOUT; 3878 } 3879 3880 return 0; 3881 } 3882 3883 /** 3884 * hv_pci_remove() - Remove routine for this VMBus channel 3885 * @hdev: VMBus's tracking struct for this root PCI bus 3886 */ 3887 static void hv_pci_remove(struct hv_device *hdev) 3888 { 3889 struct hv_pcibus_device *hbus; 3890 3891 hbus = hv_get_drvdata(hdev); 3892 if (hbus->state == hv_pcibus_installed) { 3893 tasklet_disable(&hdev->channel->callback_event); 3894 hbus->state = hv_pcibus_removing; 3895 tasklet_enable(&hdev->channel->callback_event); 3896 destroy_workqueue(hbus->wq); 3897 hbus->wq = NULL; 3898 /* 3899 * At this point, no work is running or can be scheduled 3900 * on hbus-wq. We can't race with hv_pci_devices_present() 3901 * or hv_pci_eject_device(), it's safe to proceed. 3902 */ 3903 3904 /* Remove the bus from PCI's point of view. */ 3905 pci_lock_rescan_remove(); 3906 pci_stop_root_bus(hbus->bridge->bus); 3907 hv_pci_remove_slots(hbus); 3908 pci_remove_root_bus(hbus->bridge->bus); 3909 pci_unlock_rescan_remove(); 3910 } 3911 3912 hv_pci_bus_exit(hdev, false); 3913 3914 vmbus_close(hdev->channel); 3915 3916 iounmap(hbus->cfg_addr); 3917 hv_free_config_window(hbus); 3918 hv_pci_free_bridge_windows(hbus); 3919 irq_domain_remove(hbus->irq_domain); 3920 irq_domain_free_fwnode(hbus->fwnode); 3921 3922 hv_put_dom_num(hbus->bridge->domain_nr); 3923 3924 kfree(hbus); 3925 } 3926 3927 static int hv_pci_suspend(struct hv_device *hdev) 3928 { 3929 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3930 enum hv_pcibus_state old_state; 3931 int ret; 3932 3933 /* 3934 * hv_pci_suspend() must make sure there are no pending work items 3935 * before calling vmbus_close(), since it runs in a process context 3936 * as a callback in dpm_suspend(). 
When it starts to run, the channel 3937 * callback hv_pci_onchannelcallback(), which runs in a tasklet 3938 * context, can be still running concurrently and scheduling new work 3939 * items onto hbus->wq in hv_pci_devices_present() and 3940 * hv_pci_eject_device(), and the work item handlers can access the 3941 * vmbus channel, which can be being closed by hv_pci_suspend(), e.g. 3942 * the work item handler pci_devices_present_work() -> 3943 * new_pcichild_device() writes to the vmbus channel. 3944 * 3945 * To eliminate the race, hv_pci_suspend() disables the channel 3946 * callback tasklet, sets hbus->state to hv_pcibus_removing, and 3947 * re-enables the tasklet. This way, when hv_pci_suspend() proceeds, 3948 * it knows that no new work item can be scheduled, and then it flushes 3949 * hbus->wq and safely closes the vmbus channel. 3950 */ 3951 tasklet_disable(&hdev->channel->callback_event); 3952 3953 /* Change the hbus state to prevent new work items. */ 3954 old_state = hbus->state; 3955 if (hbus->state == hv_pcibus_installed) 3956 hbus->state = hv_pcibus_removing; 3957 3958 tasklet_enable(&hdev->channel->callback_event); 3959 3960 if (old_state != hv_pcibus_installed) 3961 return -EINVAL; 3962 3963 flush_workqueue(hbus->wq); 3964 3965 ret = hv_pci_bus_exit(hdev, true); 3966 if (ret) 3967 return ret; 3968 3969 vmbus_close(hdev->channel); 3970 3971 return 0; 3972 } 3973 3974 static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg) 3975 { 3976 struct irq_data *irq_data; 3977 struct msi_desc *entry; 3978 int ret = 0; 3979 3980 if (!pdev->msi_enabled && !pdev->msix_enabled) 3981 return 0; 3982 3983 msi_lock_descs(&pdev->dev); 3984 msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) { 3985 irq_data = irq_get_irq_data(entry->irq); 3986 if (WARN_ON_ONCE(!irq_data)) { 3987 ret = -EINVAL; 3988 break; 3989 } 3990 3991 hv_compose_msi_msg(irq_data, &entry->msg); 3992 } 3993 msi_unlock_descs(&pdev->dev); 3994 3995 return ret; 3996 } 3997 3998 /* 3999 * Upon 
resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg() 4000 * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V 4001 * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg() 4002 * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping 4003 * Table entries. 4004 */ 4005 static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus) 4006 { 4007 pci_walk_bus(hbus->bridge->bus, hv_pci_restore_msi_msg, NULL); 4008 } 4009 4010 static int hv_pci_resume(struct hv_device *hdev) 4011 { 4012 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 4013 enum pci_protocol_version_t version[1]; 4014 int ret; 4015 4016 hbus->state = hv_pcibus_init; 4017 4018 hdev->channel->next_request_id_callback = vmbus_next_request_id; 4019 hdev->channel->request_addr_callback = vmbus_request_addr; 4020 hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE; 4021 4022 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 4023 hv_pci_onchannelcallback, hbus); 4024 if (ret) 4025 return ret; 4026 4027 /* Only use the version that was in use before hibernation. 
*/ 4028 version[0] = hbus->protocol_version; 4029 ret = hv_pci_protocol_negotiation(hdev, version, 1); 4030 if (ret) 4031 goto out; 4032 4033 ret = hv_pci_query_relations(hdev); 4034 if (ret) 4035 goto out; 4036 4037 mutex_lock(&hbus->state_lock); 4038 4039 ret = hv_pci_enter_d0(hdev); 4040 if (ret) 4041 goto release_state_lock; 4042 4043 ret = hv_send_resources_allocated(hdev); 4044 if (ret) 4045 goto release_state_lock; 4046 4047 prepopulate_bars(hbus); 4048 4049 hv_pci_restore_msi_state(hbus); 4050 4051 hbus->state = hv_pcibus_installed; 4052 mutex_unlock(&hbus->state_lock); 4053 return 0; 4054 4055 release_state_lock: 4056 mutex_unlock(&hbus->state_lock); 4057 out: 4058 vmbus_close(hdev->channel); 4059 return ret; 4060 } 4061 4062 static const struct hv_vmbus_device_id hv_pci_id_table[] = { 4063 /* PCI Pass-through Class ID */ 4064 /* 44C4F61D-4444-4400-9D52-802E27EDE19F */ 4065 { HV_PCIE_GUID, }, 4066 { }, 4067 }; 4068 4069 MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table); 4070 4071 static struct hv_driver hv_pci_drv = { 4072 .name = "hv_pci", 4073 .id_table = hv_pci_id_table, 4074 .probe = hv_pci_probe, 4075 .remove = hv_pci_remove, 4076 .suspend = hv_pci_suspend, 4077 .resume = hv_pci_resume, 4078 }; 4079 4080 static void __exit exit_hv_pci_drv(void) 4081 { 4082 vmbus_driver_unregister(&hv_pci_drv); 4083 4084 hvpci_block_ops.read_block = NULL; 4085 hvpci_block_ops.write_block = NULL; 4086 hvpci_block_ops.reg_blk_invalidate = NULL; 4087 } 4088 4089 static int __init init_hv_pci_drv(void) 4090 { 4091 int ret; 4092 4093 if (!hv_is_hyperv_initialized()) 4094 return -ENODEV; 4095 4096 ret = hv_pci_irqchip_init(); 4097 if (ret) 4098 return ret; 4099 4100 /* Set the invalid domain number's bit, so it will not be used */ 4101 set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); 4102 4103 /* Initialize PCI block r/w interface */ 4104 hvpci_block_ops.read_block = hv_read_config_block; 4105 hvpci_block_ops.write_block = hv_write_config_block; 4106 hvpci_block_ops.reg_blk_invalidate = 
hv_register_block_invalidate; 4107 4108 return vmbus_driver_register(&hv_pci_drv); 4109 } 4110 4111 module_init(init_hv_pci_drv); 4112 module_exit(exit_hv_pci_drv); 4113 4114 MODULE_DESCRIPTION("Hyper-V PCI"); 4115 MODULE_LICENSE("GPL v2"); 4116