1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) Microsoft Corporation. 4 * 5 * Author: 6 * Jake Oshins <jakeo@microsoft.com> 7 * 8 * This driver acts as a paravirtual front-end for PCI Express root buses. 9 * When a PCI Express function (either an entire device or an SR-IOV 10 * Virtual Function) is being passed through to the VM, this driver exposes 11 * a new bus to the guest VM. This is modeled as a root PCI bus because 12 * no bridges are being exposed to the VM. In fact, with a "Generation 2" 13 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM 14 * until a device as been exposed using this driver. 15 * 16 * Each root PCI bus has its own PCI domain, which is called "Segment" in 17 * the PCI Firmware Specifications. Thus while each device passed through 18 * to the VM using this front-end will appear at "device 0", the domain will 19 * be unique. Typically, each bus will have one PCI function on it, though 20 * this driver does support more than one. 21 * 22 * In order to map the interrupts from the device through to the guest VM, 23 * this driver also implements an IRQ Domain, which handles interrupts (either 24 * MSI or MSI-X) associated with the functions on the bus. As interrupts are 25 * set up, torn down, or reaffined, this driver communicates with the 26 * underlying hypervisor to adjust the mappings in the I/O MMU so that each 27 * interrupt will be delivered to the correct virtual processor at the right 28 * vector. This driver does not support level-triggered (line-based) 29 * interrupts, and will report that the Interrupt Line register in the 30 * function's configuration space is zero. 31 * 32 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V 33 * facilities. For instance, the configuration space of a function exposed 34 * by Hyper-V is mapped into a single page of memory space, and the 35 * read and write handlers for config space must be aware of this mechanism. 36 * Similarly, device setup and teardown involves messages sent to and from 37 * the PCI back-end driver in Hyper-V. 38 */ 39 40 #include <linux/kernel.h> 41 #include <linux/module.h> 42 #include <linux/pci.h> 43 #include <linux/pci-ecam.h> 44 #include <linux/delay.h> 45 #include <linux/semaphore.h> 46 #include <linux/irq.h> 47 #include <linux/msi.h> 48 #include <linux/hyperv.h> 49 #include <linux/refcount.h> 50 #include <linux/irqdomain.h> 51 #include <linux/acpi.h> 52 #include <asm/mshyperv.h> 53 54 /* 55 * Protocol versions. The low word is the minor version, the high word the 56 * major version. 57 */ 58 59 #define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor))) 60 #define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16) 61 #define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff) 62 63 enum pci_protocol_version_t { 64 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */ 65 PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */ 66 PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3), /* Vibranium */ 67 PCI_PROTOCOL_VERSION_1_4 = PCI_MAKE_VERSION(1, 4), /* WS2022 */ 68 }; 69 70 #define CPU_AFFINITY_ALL -1ULL 71 72 /* 73 * Supported protocol versions in the order of probing - highest go 74 * first. 
75 */ 76 static enum pci_protocol_version_t pci_protocol_versions[] = { 77 PCI_PROTOCOL_VERSION_1_4, 78 PCI_PROTOCOL_VERSION_1_3, 79 PCI_PROTOCOL_VERSION_1_2, 80 PCI_PROTOCOL_VERSION_1_1, 81 }; 82 83 #define PCI_CONFIG_MMIO_LENGTH 0x2000 84 #define CFG_PAGE_OFFSET 0x1000 85 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) 86 87 #define MAX_SUPPORTED_MSI_MESSAGES 0x400 88 89 #define STATUS_REVISION_MISMATCH 0xC0000059 90 91 /* space for 32bit serial number as string */ 92 #define SLOT_NAME_SIZE 11 93 94 /* 95 * Size of requestor for VMbus; the value is based on the observation 96 * that having more than one request outstanding is 'rare', and so 64 97 * should be generous in ensuring that we don't ever run out. 98 */ 99 #define HV_PCI_RQSTOR_SIZE 64 100 101 /* 102 * Message Types 103 */ 104 105 enum pci_message_type { 106 /* 107 * Version 1.1 108 */ 109 PCI_MESSAGE_BASE = 0x42490000, 110 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, 111 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, 112 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, 113 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, 114 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, 115 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, 116 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, 117 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, 118 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, 119 PCI_EJECT = PCI_MESSAGE_BASE + 0xB, 120 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, 121 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, 122 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, 123 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, 124 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, 125 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, 126 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, 127 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, 128 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, 129 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, 130 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16, 131 PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17, 132 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */ 133 PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19, 134 PCI_RESOURCES_ASSIGNED3 = PCI_MESSAGE_BASE + 0x1A, 135 PCI_CREATE_INTERRUPT_MESSAGE3 = PCI_MESSAGE_BASE + 0x1B, 136 PCI_MESSAGE_MAXIMUM 137 }; 138 139 /* 140 * Structures defining the virtual PCI Express protocol. 141 */ 142 143 union pci_version { 144 struct { 145 u16 minor_version; 146 u16 major_version; 147 } parts; 148 u32 version; 149 } __packed; 150 151 /* 152 * Function numbers are 8-bits wide on Express, as interpreted through ARI, 153 * which is all this driver does. This representation is the one used in 154 * Windows, which is what is expected when sending this back and forth with 155 * the Hyper-V parent partition. 156 */ 157 union win_slot_encoding { 158 struct { 159 u32 dev:5; 160 u32 func:3; 161 u32 reserved:24; 162 } bits; 163 u32 slot; 164 } __packed; 165 166 /* 167 * Pretty much as defined in the PCI Specifications. 
168 */ 169 struct pci_function_description { 170 u16 v_id; /* vendor ID */ 171 u16 d_id; /* device ID */ 172 u8 rev; 173 u8 prog_intf; 174 u8 subclass; 175 u8 base_class; 176 u32 subsystem_id; 177 union win_slot_encoding win_slot; 178 u32 ser; /* serial number */ 179 } __packed; 180 181 enum pci_device_description_flags { 182 HV_PCI_DEVICE_FLAG_NONE = 0x0, 183 HV_PCI_DEVICE_FLAG_NUMA_AFFINITY = 0x1, 184 }; 185 186 struct pci_function_description2 { 187 u16 v_id; /* vendor ID */ 188 u16 d_id; /* device ID */ 189 u8 rev; 190 u8 prog_intf; 191 u8 subclass; 192 u8 base_class; 193 u32 subsystem_id; 194 union win_slot_encoding win_slot; 195 u32 ser; /* serial number */ 196 u32 flags; 197 u16 virtual_numa_node; 198 u16 reserved; 199 } __packed; 200 201 /** 202 * struct hv_msi_desc 203 * @vector: IDT entry 204 * @delivery_mode: As defined in Intel's Programmer's 205 * Reference Manual, Volume 3, Chapter 8. 206 * @vector_count: Number of contiguous entries in the 207 * Interrupt Descriptor Table that are 208 * occupied by this Message-Signaled 209 * Interrupt. For "MSI", as first defined 210 * in PCI 2.2, this can be between 1 and 211 * 32. For "MSI-X," as first defined in PCI 212 * 3.0, this must be 1, as each MSI-X table 213 * entry would have its own descriptor. 214 * @reserved: Empty space 215 * @cpu_mask: All the target virtual processors. 216 */ 217 struct hv_msi_desc { 218 u8 vector; 219 u8 delivery_mode; 220 u16 vector_count; 221 u32 reserved; 222 u64 cpu_mask; 223 } __packed; 224 225 /** 226 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc 227 * @vector: IDT entry 228 * @delivery_mode: As defined in Intel's Programmer's 229 * Reference Manual, Volume 3, Chapter 8. 230 * @vector_count: Number of contiguous entries in the 231 * Interrupt Descriptor Table that are 232 * occupied by this Message-Signaled 233 * Interrupt. For "MSI", as first defined 234 * in PCI 2.2, this can be between 1 and 235 * 32. For "MSI-X," as first defined in PCI 236 * 3.0, this must be 1, as each MSI-X table 237 * entry would have its own descriptor. 238 * @processor_count: number of bits enabled in array. 239 * @processor_array: All the target virtual processors. 240 */ 241 struct hv_msi_desc2 { 242 u8 vector; 243 u8 delivery_mode; 244 u16 vector_count; 245 u16 processor_count; 246 u16 processor_array[32]; 247 } __packed; 248 249 /* 250 * struct hv_msi_desc3 - 1.3 version of hv_msi_desc 251 * Everything is the same as in 'hv_msi_desc2' except that the size of the 252 * 'vector' field is larger to support bigger vector values. For ex: LPI 253 * vectors on ARM. 254 */ 255 struct hv_msi_desc3 { 256 u32 vector; 257 u8 delivery_mode; 258 u8 reserved; 259 u16 vector_count; 260 u16 processor_count; 261 u16 processor_array[32]; 262 } __packed; 263 264 /** 265 * struct tran_int_desc 266 * @reserved: unused, padding 267 * @vector_count: same as in hv_msi_desc 268 * @data: This is the "data payload" value that is 269 * written by the device when it generates 270 * a message-signaled interrupt, either MSI 271 * or MSI-X. 272 * @address: This is the address to which the data 273 * payload is written on interrupt 274 * generation. 275 */ 276 struct tran_int_desc { 277 u16 reserved; 278 u16 vector_count; 279 u32 data; 280 u64 address; 281 } __packed; 282 283 /* 284 * A generic message format for virtual PCI. 285 * Specific message formats are defined later in the file. 
286 */ 287 288 struct pci_message { 289 u32 type; 290 } __packed; 291 292 struct pci_child_message { 293 struct pci_message message_type; 294 union win_slot_encoding wslot; 295 } __packed; 296 297 struct pci_incoming_message { 298 struct vmpacket_descriptor hdr; 299 struct pci_message message_type; 300 } __packed; 301 302 struct pci_response { 303 struct vmpacket_descriptor hdr; 304 s32 status; /* negative values are failures */ 305 } __packed; 306 307 struct pci_packet { 308 void (*completion_func)(void *context, struct pci_response *resp, 309 int resp_packet_size); 310 void *compl_ctxt; 311 312 struct pci_message message[]; 313 }; 314 315 /* 316 * Specific message types supporting the PCI protocol. 317 */ 318 319 /* 320 * Version negotiation message. Sent from the guest to the host. 321 * The guest is free to try different versions until the host 322 * accepts the version. 323 * 324 * pci_version: The protocol version requested. 325 * is_last_attempt: If TRUE, this is the last version guest will request. 326 * reservedz: Reserved field, set to zero. 327 */ 328 329 struct pci_version_request { 330 struct pci_message message_type; 331 u32 protocol_version; 332 } __packed; 333 334 /* 335 * Bus D0 Entry. This is sent from the guest to the host when the virtual 336 * bus (PCI Express port) is ready for action. 337 */ 338 339 struct pci_bus_d0_entry { 340 struct pci_message message_type; 341 u32 reserved; 342 u64 mmio_base; 343 } __packed; 344 345 struct pci_bus_relations { 346 struct pci_incoming_message incoming; 347 u32 device_count; 348 struct pci_function_description func[]; 349 } __packed; 350 351 struct pci_bus_relations2 { 352 struct pci_incoming_message incoming; 353 u32 device_count; 354 struct pci_function_description2 func[]; 355 } __packed; 356 357 struct pci_q_res_req_response { 358 struct vmpacket_descriptor hdr; 359 s32 status; /* negative values are failures */ 360 u32 probed_bar[PCI_STD_NUM_BARS]; 361 } __packed; 362 363 struct pci_set_power { 364 struct pci_message message_type; 365 union win_slot_encoding wslot; 366 u32 power_state; /* In Windows terms */ 367 u32 reserved; 368 } __packed; 369 370 struct pci_set_power_response { 371 struct vmpacket_descriptor hdr; 372 s32 status; /* negative values are failures */ 373 union win_slot_encoding wslot; 374 u32 resultant_state; /* In Windows terms */ 375 u32 reserved; 376 } __packed; 377 378 struct pci_resources_assigned { 379 struct pci_message message_type; 380 union win_slot_encoding wslot; 381 u8 memory_range[0x14][6]; /* not used here */ 382 u32 msi_descriptors; 383 u32 reserved[4]; 384 } __packed; 385 386 struct pci_resources_assigned2 { 387 struct pci_message message_type; 388 union win_slot_encoding wslot; 389 u8 memory_range[0x14][6]; /* not used here */ 390 u32 msi_descriptor_count; 391 u8 reserved[70]; 392 } __packed; 393 394 struct pci_create_interrupt { 395 struct pci_message message_type; 396 union win_slot_encoding wslot; 397 struct hv_msi_desc int_desc; 398 } __packed; 399 400 struct pci_create_int_response { 401 struct pci_response response; 402 u32 reserved; 403 struct tran_int_desc int_desc; 404 } __packed; 405 406 struct pci_create_interrupt2 { 407 struct pci_message message_type; 408 union win_slot_encoding wslot; 409 struct hv_msi_desc2 int_desc; 410 } __packed; 411 412 struct pci_create_interrupt3 { 413 struct pci_message message_type; 414 union win_slot_encoding wslot; 415 struct hv_msi_desc3 int_desc; 416 } __packed; 417 418 struct pci_delete_interrupt { 419 struct pci_message message_type; 420 union 
win_slot_encoding wslot; 421 struct tran_int_desc int_desc; 422 } __packed; 423 424 /* 425 * Note: the VM must pass a valid block id, wslot and bytes_requested. 426 */ 427 struct pci_read_block { 428 struct pci_message message_type; 429 u32 block_id; 430 union win_slot_encoding wslot; 431 u32 bytes_requested; 432 } __packed; 433 434 struct pci_read_block_response { 435 struct vmpacket_descriptor hdr; 436 u32 status; 437 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 438 } __packed; 439 440 /* 441 * Note: the VM must pass a valid block id, wslot and byte_count. 442 */ 443 struct pci_write_block { 444 struct pci_message message_type; 445 u32 block_id; 446 union win_slot_encoding wslot; 447 u32 byte_count; 448 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 449 } __packed; 450 451 struct pci_dev_inval_block { 452 struct pci_incoming_message incoming; 453 union win_slot_encoding wslot; 454 u64 block_mask; 455 } __packed; 456 457 struct pci_dev_incoming { 458 struct pci_incoming_message incoming; 459 union win_slot_encoding wslot; 460 } __packed; 461 462 struct pci_eject_response { 463 struct pci_message message_type; 464 union win_slot_encoding wslot; 465 u32 status; 466 } __packed; 467 468 static int pci_ring_size = (4 * PAGE_SIZE); 469 470 /* 471 * Driver specific state. 472 */ 473 474 enum hv_pcibus_state { 475 hv_pcibus_init = 0, 476 hv_pcibus_probed, 477 hv_pcibus_installed, 478 hv_pcibus_removing, 479 hv_pcibus_maximum 480 }; 481 482 struct hv_pcibus_device { 483 #ifdef CONFIG_X86 484 struct pci_sysdata sysdata; 485 #elif defined(CONFIG_ARM64) 486 struct pci_config_window sysdata; 487 #endif 488 struct pci_host_bridge *bridge; 489 struct fwnode_handle *fwnode; 490 /* Protocol version negotiated with the host */ 491 enum pci_protocol_version_t protocol_version; 492 enum hv_pcibus_state state; 493 struct hv_device *hdev; 494 resource_size_t low_mmio_space; 495 resource_size_t high_mmio_space; 496 struct resource *mem_config; 497 struct resource *low_mmio_res; 498 struct resource *high_mmio_res; 499 struct completion *survey_event; 500 struct pci_bus *pci_bus; 501 spinlock_t config_lock; /* Avoid two threads writing index page */ 502 spinlock_t device_list_lock; /* Protect lists below */ 503 void __iomem *cfg_addr; 504 505 struct list_head children; 506 struct list_head dr_list; 507 508 struct msi_domain_info msi_info; 509 struct irq_domain *irq_domain; 510 511 spinlock_t retarget_msi_interrupt_lock; 512 513 struct workqueue_struct *wq; 514 515 /* Highest slot of child device with resources allocated */ 516 int wslot_res_allocated; 517 518 /* hypercall arg, must not cross page boundary */ 519 struct hv_retarget_device_interrupt retarget_msi_interrupt_params; 520 521 /* 522 * Don't put anything here: retarget_msi_interrupt_params must be last 523 */ 524 }; 525 526 /* 527 * Tracks "Device Relations" messages from the host, which must be both 528 * processed in order and deferred so that they don't run in the context 529 * of the incoming packet callback. 
530 */ 531 struct hv_dr_work { 532 struct work_struct wrk; 533 struct hv_pcibus_device *bus; 534 }; 535 536 struct hv_pcidev_description { 537 u16 v_id; /* vendor ID */ 538 u16 d_id; /* device ID */ 539 u8 rev; 540 u8 prog_intf; 541 u8 subclass; 542 u8 base_class; 543 u32 subsystem_id; 544 union win_slot_encoding win_slot; 545 u32 ser; /* serial number */ 546 u32 flags; 547 u16 virtual_numa_node; 548 }; 549 550 struct hv_dr_state { 551 struct list_head list_entry; 552 u32 device_count; 553 struct hv_pcidev_description func[]; 554 }; 555 556 enum hv_pcichild_state { 557 hv_pcichild_init = 0, 558 hv_pcichild_requirements, 559 hv_pcichild_resourced, 560 hv_pcichild_ejecting, 561 hv_pcichild_maximum 562 }; 563 564 struct hv_pci_dev { 565 /* List protected by pci_rescan_remove_lock */ 566 struct list_head list_entry; 567 refcount_t refs; 568 enum hv_pcichild_state state; 569 struct pci_slot *pci_slot; 570 struct hv_pcidev_description desc; 571 bool reported_missing; 572 struct hv_pcibus_device *hbus; 573 struct work_struct wrk; 574 575 void (*block_invalidate)(void *context, u64 block_mask); 576 void *invalidate_context; 577 578 /* 579 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then 580 * read it back, for each of the BAR offsets within config space. 581 */ 582 u32 probed_bar[PCI_STD_NUM_BARS]; 583 }; 584 585 struct hv_pci_compl { 586 struct completion host_event; 587 s32 completion_status; 588 }; 589 590 static void hv_pci_onchannelcallback(void *context); 591 592 #ifdef CONFIG_X86 593 #define DELIVERY_MODE APIC_DELIVERY_MODE_FIXED 594 #define FLOW_HANDLER handle_edge_irq 595 #define FLOW_NAME "edge" 596 597 static int hv_pci_irqchip_init(void) 598 { 599 return 0; 600 } 601 602 static struct irq_domain *hv_pci_get_root_domain(void) 603 { 604 return x86_vector_domain; 605 } 606 607 static unsigned int hv_msi_get_int_vector(struct irq_data *data) 608 { 609 struct irq_cfg *cfg = irqd_cfg(data); 610 611 return cfg->vector; 612 } 613 614 static int hv_msi_prepare(struct irq_domain *domain, struct device *dev, 615 int nvec, msi_alloc_info_t *info) 616 { 617 int ret = pci_msi_prepare(domain, dev, nvec, info); 618 619 /* 620 * By using the interrupt remapper in the hypervisor IOMMU, contiguous 621 * CPU vectors is not needed for multi-MSI 622 */ 623 if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) 624 info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; 625 626 return ret; 627 } 628 629 /** 630 * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current 631 * affinity. 632 * @data: Describes the IRQ 633 * 634 * Build new a destination for the MSI and make a hypercall to 635 * update the Interrupt Redirection Table. "Device Logical ID" 636 * is built out of this PCI bus's instance GUID and the function 637 * number of the device. 
638 */ 639 static void hv_arch_irq_unmask(struct irq_data *data) 640 { 641 struct msi_desc *msi_desc = irq_data_get_msi_desc(data); 642 struct hv_retarget_device_interrupt *params; 643 struct tran_int_desc *int_desc; 644 struct hv_pcibus_device *hbus; 645 const struct cpumask *dest; 646 cpumask_var_t tmp; 647 struct pci_bus *pbus; 648 struct pci_dev *pdev; 649 unsigned long flags; 650 u32 var_size = 0; 651 int cpu, nr_bank; 652 u64 res; 653 654 dest = irq_data_get_effective_affinity_mask(data); 655 pdev = msi_desc_to_pci_dev(msi_desc); 656 pbus = pdev->bus; 657 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 658 int_desc = data->chip_data; 659 660 spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); 661 662 params = &hbus->retarget_msi_interrupt_params; 663 memset(params, 0, sizeof(*params)); 664 params->partition_id = HV_PARTITION_ID_SELF; 665 params->int_entry.source = HV_INTERRUPT_SOURCE_MSI; 666 params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff; 667 params->int_entry.msi_entry.data.as_uint32 = int_desc->data; 668 params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | 669 (hbus->hdev->dev_instance.b[4] << 16) | 670 (hbus->hdev->dev_instance.b[7] << 8) | 671 (hbus->hdev->dev_instance.b[6] & 0xf8) | 672 PCI_FUNC(pdev->devfn); 673 params->int_target.vector = hv_msi_get_int_vector(data); 674 675 /* 676 * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by 677 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a 678 * spurious interrupt storm. Not doing so does not seem to have a 679 * negative effect (yet?). 680 */ 681 682 if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) { 683 /* 684 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the 685 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides 686 * with >64 VP support. 687 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED 688 * is not sufficient for this hypercall. 689 */ 690 params->int_target.flags |= 691 HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; 692 693 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) { 694 res = 1; 695 goto exit_unlock; 696 } 697 698 cpumask_and(tmp, dest, cpu_online_mask); 699 nr_bank = cpumask_to_vpset(¶ms->int_target.vp_set, tmp); 700 free_cpumask_var(tmp); 701 702 if (nr_bank <= 0) { 703 res = 1; 704 goto exit_unlock; 705 } 706 707 /* 708 * var-sized hypercall, var-size starts after vp_mask (thus 709 * vp_set.format does not count, but vp_set.valid_bank_mask 710 * does). 711 */ 712 var_size = 1 + nr_bank; 713 } else { 714 for_each_cpu_and(cpu, dest, cpu_online_mask) { 715 params->int_target.vp_mask |= 716 (1ULL << hv_cpu_number_to_vp_number(cpu)); 717 } 718 } 719 720 res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17), 721 params, NULL); 722 723 exit_unlock: 724 spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags); 725 726 /* 727 * During hibernation, when a CPU is offlined, the kernel tries 728 * to move the interrupt to the remaining CPUs that haven't 729 * been offlined yet. In this case, the below hv_do_hypercall() 730 * always fails since the vmbus channel has been closed: 731 * refer to cpu_disable_common() -> fixup_irqs() -> 732 * irq_migrate_all_off_this_cpu() -> migrate_one_irq(). 733 * 734 * Suppress the error message for hibernation because the failure 735 * during hibernation does not matter (at this time all the devices 736 * have been frozen). 
Note: the correct affinity info is still updated 737 * into the irqdata data structure in migrate_one_irq() -> 738 * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM 739 * resumes, hv_pci_restore_msi_state() is able to correctly restore 740 * the interrupt with the correct affinity. 741 */ 742 if (!hv_result_success(res) && hbus->state != hv_pcibus_removing) 743 dev_err(&hbus->hdev->device, 744 "%s() failed: %#llx", __func__, res); 745 } 746 #elif defined(CONFIG_ARM64) 747 /* 748 * SPI vectors to use for vPCI; arch SPIs range is [32, 1019], but leaving a bit 749 * of room at the start to allow for SPIs to be specified through ACPI and 750 * starting with a power of two to satisfy power of 2 multi-MSI requirement. 751 */ 752 #define HV_PCI_MSI_SPI_START 64 753 #define HV_PCI_MSI_SPI_NR (1020 - HV_PCI_MSI_SPI_START) 754 #define DELIVERY_MODE 0 755 #define FLOW_HANDLER NULL 756 #define FLOW_NAME NULL 757 #define hv_msi_prepare NULL 758 759 struct hv_pci_chip_data { 760 DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR); 761 struct mutex map_lock; 762 }; 763 764 /* Hyper-V vPCI MSI GIC IRQ domain */ 765 static struct irq_domain *hv_msi_gic_irq_domain; 766 767 /* Hyper-V PCI MSI IRQ chip */ 768 static struct irq_chip hv_arm64_msi_irq_chip = { 769 .name = "MSI", 770 .irq_set_affinity = irq_chip_set_affinity_parent, 771 .irq_eoi = irq_chip_eoi_parent, 772 .irq_mask = irq_chip_mask_parent, 773 .irq_unmask = irq_chip_unmask_parent 774 }; 775 776 static unsigned int hv_msi_get_int_vector(struct irq_data *irqd) 777 { 778 return irqd->parent_data->hwirq; 779 } 780 781 /* 782 * @nr_bm_irqs: Indicates the number of IRQs that were allocated from 783 * the bitmap. 784 * @nr_dom_irqs: Indicates the number of IRQs that were allocated from 785 * the parent domain. 
786 */ 787 static void hv_pci_vec_irq_free(struct irq_domain *domain, 788 unsigned int virq, 789 unsigned int nr_bm_irqs, 790 unsigned int nr_dom_irqs) 791 { 792 struct hv_pci_chip_data *chip_data = domain->host_data; 793 struct irq_data *d = irq_domain_get_irq_data(domain, virq); 794 int first = d->hwirq - HV_PCI_MSI_SPI_START; 795 int i; 796 797 mutex_lock(&chip_data->map_lock); 798 bitmap_release_region(chip_data->spi_map, 799 first, 800 get_count_order(nr_bm_irqs)); 801 mutex_unlock(&chip_data->map_lock); 802 for (i = 0; i < nr_dom_irqs; i++) { 803 if (i) 804 d = irq_domain_get_irq_data(domain, virq + i); 805 irq_domain_reset_irq_data(d); 806 } 807 808 irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs); 809 } 810 811 static void hv_pci_vec_irq_domain_free(struct irq_domain *domain, 812 unsigned int virq, 813 unsigned int nr_irqs) 814 { 815 hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs); 816 } 817 818 static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain, 819 unsigned int nr_irqs, 820 irq_hw_number_t *hwirq) 821 { 822 struct hv_pci_chip_data *chip_data = domain->host_data; 823 int index; 824 825 /* Find and allocate region from the SPI bitmap */ 826 mutex_lock(&chip_data->map_lock); 827 index = bitmap_find_free_region(chip_data->spi_map, 828 HV_PCI_MSI_SPI_NR, 829 get_count_order(nr_irqs)); 830 mutex_unlock(&chip_data->map_lock); 831 if (index < 0) 832 return -ENOSPC; 833 834 *hwirq = index + HV_PCI_MSI_SPI_START; 835 836 return 0; 837 } 838 839 static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain, 840 unsigned int virq, 841 irq_hw_number_t hwirq) 842 { 843 struct irq_fwspec fwspec; 844 struct irq_data *d; 845 int ret; 846 847 fwspec.fwnode = domain->parent->fwnode; 848 fwspec.param_count = 2; 849 fwspec.param[0] = hwirq; 850 fwspec.param[1] = IRQ_TYPE_EDGE_RISING; 851 852 ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); 853 if (ret) 854 return ret; 855 856 /* 857 * Since the interrupt specifier is not coming from ACPI or DT, the 858 * trigger type will need to be set explicitly. Otherwise, it will be 859 * set to whatever is in the GIC configuration. 860 */ 861 d = irq_domain_get_irq_data(domain->parent, virq); 862 863 return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); 864 } 865 866 static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain, 867 unsigned int virq, unsigned int nr_irqs, 868 void *args) 869 { 870 irq_hw_number_t hwirq; 871 unsigned int i; 872 int ret; 873 874 ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq); 875 if (ret) 876 return ret; 877 878 for (i = 0; i < nr_irqs; i++) { 879 ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i, 880 hwirq + i); 881 if (ret) { 882 hv_pci_vec_irq_free(domain, virq, nr_irqs, i); 883 return ret; 884 } 885 886 irq_domain_set_hwirq_and_chip(domain, virq + i, 887 hwirq + i, 888 &hv_arm64_msi_irq_chip, 889 domain->host_data); 890 pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i); 891 } 892 893 return 0; 894 } 895 896 /* 897 * Pick the first cpu as the irq affinity that can be temporarily used for 898 * composing MSI from the hypervisor. GIC will eventually set the right 899 * affinity for the irq and the 'unmask' will retarget the interrupt to that 900 * cpu. 
901 */ 902 static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain, 903 struct irq_data *irqd, bool reserve) 904 { 905 int cpu = cpumask_first(cpu_present_mask); 906 907 irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); 908 909 return 0; 910 } 911 912 static const struct irq_domain_ops hv_pci_domain_ops = { 913 .alloc = hv_pci_vec_irq_domain_alloc, 914 .free = hv_pci_vec_irq_domain_free, 915 .activate = hv_pci_vec_irq_domain_activate, 916 }; 917 918 static int hv_pci_irqchip_init(void) 919 { 920 static struct hv_pci_chip_data *chip_data; 921 struct fwnode_handle *fn = NULL; 922 int ret = -ENOMEM; 923 924 chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL); 925 if (!chip_data) 926 return ret; 927 928 mutex_init(&chip_data->map_lock); 929 fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64"); 930 if (!fn) 931 goto free_chip; 932 933 /* 934 * IRQ domain once enabled, should not be removed since there is no 935 * way to ensure that all the corresponding devices are also gone and 936 * no interrupts will be generated. 937 */ 938 hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR, 939 fn, &hv_pci_domain_ops, 940 chip_data); 941 942 if (!hv_msi_gic_irq_domain) { 943 pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n"); 944 goto free_chip; 945 } 946 947 return 0; 948 949 free_chip: 950 kfree(chip_data); 951 if (fn) 952 irq_domain_free_fwnode(fn); 953 954 return ret; 955 } 956 957 static struct irq_domain *hv_pci_get_root_domain(void) 958 { 959 return hv_msi_gic_irq_domain; 960 } 961 962 /* 963 * SPIs are used for interrupts of PCI devices and SPIs is managed via GICD 964 * registers which Hyper-V already supports, so no hypercall needed. 965 */ 966 static void hv_arch_irq_unmask(struct irq_data *data) { } 967 #endif /* CONFIG_ARM64 */ 968 969 /** 970 * hv_pci_generic_compl() - Invoked for a completion packet 971 * @context: Set up by the sender of the packet. 972 * @resp: The response packet 973 * @resp_packet_size: Size in bytes of the packet 974 * 975 * This function is used to trigger an event and report status 976 * for any message for which the completion packet contains a 977 * status and nothing else. 978 */ 979 static void hv_pci_generic_compl(void *context, struct pci_response *resp, 980 int resp_packet_size) 981 { 982 struct hv_pci_compl *comp_pkt = context; 983 984 comp_pkt->completion_status = resp->status; 985 complete(&comp_pkt->host_event); 986 } 987 988 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 989 u32 wslot); 990 991 static void get_pcichild(struct hv_pci_dev *hpdev) 992 { 993 refcount_inc(&hpdev->refs); 994 } 995 996 static void put_pcichild(struct hv_pci_dev *hpdev) 997 { 998 if (refcount_dec_and_test(&hpdev->refs)) 999 kfree(hpdev); 1000 } 1001 1002 /* 1003 * There is no good way to get notified from vmbus_onoffer_rescind(), 1004 * so let's use polling here, since this is not a hot path. 1005 */ 1006 static int wait_for_response(struct hv_device *hdev, 1007 struct completion *comp) 1008 { 1009 while (true) { 1010 if (hdev->channel->rescind) { 1011 dev_warn_once(&hdev->device, "The device is gone.\n"); 1012 return -ENODEV; 1013 } 1014 1015 if (wait_for_completion_timeout(comp, HZ / 10)) 1016 break; 1017 } 1018 1019 return 0; 1020 } 1021 1022 /** 1023 * devfn_to_wslot() - Convert from Linux PCI slot to Windows 1024 * @devfn: The Linux representation of PCI slot 1025 * 1026 * Windows uses a slightly different representation of PCI slot. 
1027 * 1028 * Return: The Windows representation 1029 */ 1030 static u32 devfn_to_wslot(int devfn) 1031 { 1032 union win_slot_encoding wslot; 1033 1034 wslot.slot = 0; 1035 wslot.bits.dev = PCI_SLOT(devfn); 1036 wslot.bits.func = PCI_FUNC(devfn); 1037 1038 return wslot.slot; 1039 } 1040 1041 /** 1042 * wslot_to_devfn() - Convert from Windows PCI slot to Linux 1043 * @wslot: The Windows representation of PCI slot 1044 * 1045 * Windows uses a slightly different representation of PCI slot. 1046 * 1047 * Return: The Linux representation 1048 */ 1049 static int wslot_to_devfn(u32 wslot) 1050 { 1051 union win_slot_encoding slot_no; 1052 1053 slot_no.slot = wslot; 1054 return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func); 1055 } 1056 1057 /* 1058 * PCI Configuration Space for these root PCI buses is implemented as a pair 1059 * of pages in memory-mapped I/O space. Writing to the first page chooses 1060 * the PCI function being written or read. Once the first page has been 1061 * written to, the following page maps in the entire configuration space of 1062 * the function. 1063 */ 1064 1065 /** 1066 * _hv_pcifront_read_config() - Internal PCI config read 1067 * @hpdev: The PCI driver's representation of the device 1068 * @where: Offset within config space 1069 * @size: Size of the transfer 1070 * @val: Pointer to the buffer receiving the data 1071 */ 1072 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, 1073 int size, u32 *val) 1074 { 1075 unsigned long flags; 1076 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 1077 1078 /* 1079 * If the attempt is to read the IDs or the ROM BAR, simulate that. 1080 */ 1081 if (where + size <= PCI_COMMAND) { 1082 memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size); 1083 } else if (where >= PCI_CLASS_REVISION && where + size <= 1084 PCI_CACHE_LINE_SIZE) { 1085 memcpy(val, ((u8 *)&hpdev->desc.rev) + where - 1086 PCI_CLASS_REVISION, size); 1087 } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <= 1088 PCI_ROM_ADDRESS) { 1089 memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where - 1090 PCI_SUBSYSTEM_VENDOR_ID, size); 1091 } else if (where >= PCI_ROM_ADDRESS && where + size <= 1092 PCI_CAPABILITY_LIST) { 1093 /* ROM BARs are unimplemented */ 1094 *val = 0; 1095 } else if (where >= PCI_INTERRUPT_LINE && where + size <= 1096 PCI_INTERRUPT_PIN) { 1097 /* 1098 * Interrupt Line and Interrupt PIN are hard-wired to zero 1099 * because this front-end only supports message-signaled 1100 * interrupts. 1101 */ 1102 *val = 0; 1103 } else if (where + size <= CFG_PAGE_SIZE) { 1104 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 1105 /* Choose the function to be read. (See comment above) */ 1106 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 1107 /* Make sure the function was chosen before we start reading. */ 1108 mb(); 1109 /* Read from that function's config space. */ 1110 switch (size) { 1111 case 1: 1112 *val = readb(addr); 1113 break; 1114 case 2: 1115 *val = readw(addr); 1116 break; 1117 default: 1118 *val = readl(addr); 1119 break; 1120 } 1121 /* 1122 * Make sure the read was done before we release the spinlock 1123 * allowing consecutive reads/writes. 
1124 */ 1125 mb(); 1126 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 1127 } else { 1128 dev_err(&hpdev->hbus->hdev->device, 1129 "Attempt to read beyond a function's config space.\n"); 1130 } 1131 } 1132 1133 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev) 1134 { 1135 u16 ret; 1136 unsigned long flags; 1137 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + 1138 PCI_VENDOR_ID; 1139 1140 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 1141 1142 /* Choose the function to be read. (See comment above) */ 1143 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 1144 /* Make sure the function was chosen before we start reading. */ 1145 mb(); 1146 /* Read from that function's config space. */ 1147 ret = readw(addr); 1148 /* 1149 * mb() is not required here, because the spin_unlock_irqrestore() 1150 * is a barrier. 1151 */ 1152 1153 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 1154 1155 return ret; 1156 } 1157 1158 /** 1159 * _hv_pcifront_write_config() - Internal PCI config write 1160 * @hpdev: The PCI driver's representation of the device 1161 * @where: Offset within config space 1162 * @size: Size of the transfer 1163 * @val: The data being transferred 1164 */ 1165 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, 1166 int size, u32 val) 1167 { 1168 unsigned long flags; 1169 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; 1170 1171 if (where >= PCI_SUBSYSTEM_VENDOR_ID && 1172 where + size <= PCI_CAPABILITY_LIST) { 1173 /* SSIDs and ROM BARs are read-only */ 1174 } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) { 1175 spin_lock_irqsave(&hpdev->hbus->config_lock, flags); 1176 /* Choose the function to be written. (See comment above) */ 1177 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); 1178 /* Make sure the function was chosen before we start writing. */ 1179 wmb(); 1180 /* Write to that function's config space. */ 1181 switch (size) { 1182 case 1: 1183 writeb(val, addr); 1184 break; 1185 case 2: 1186 writew(val, addr); 1187 break; 1188 default: 1189 writel(val, addr); 1190 break; 1191 } 1192 /* 1193 * Make sure the write was done before we release the spinlock 1194 * allowing consecutive reads/writes. 
1195 */ 1196 mb(); 1197 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); 1198 } else { 1199 dev_err(&hpdev->hbus->hdev->device, 1200 "Attempt to write beyond a function's config space.\n"); 1201 } 1202 } 1203 1204 /** 1205 * hv_pcifront_read_config() - Read configuration space 1206 * @bus: PCI Bus structure 1207 * @devfn: Device/function 1208 * @where: Offset from base 1209 * @size: Byte/word/dword 1210 * @val: Value to be read 1211 * 1212 * Return: PCIBIOS_SUCCESSFUL on success 1213 * PCIBIOS_DEVICE_NOT_FOUND on failure 1214 */ 1215 static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn, 1216 int where, int size, u32 *val) 1217 { 1218 struct hv_pcibus_device *hbus = 1219 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 1220 struct hv_pci_dev *hpdev; 1221 1222 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 1223 if (!hpdev) 1224 return PCIBIOS_DEVICE_NOT_FOUND; 1225 1226 _hv_pcifront_read_config(hpdev, where, size, val); 1227 1228 put_pcichild(hpdev); 1229 return PCIBIOS_SUCCESSFUL; 1230 } 1231 1232 /** 1233 * hv_pcifront_write_config() - Write configuration space 1234 * @bus: PCI Bus structure 1235 * @devfn: Device/function 1236 * @where: Offset from base 1237 * @size: Byte/word/dword 1238 * @val: Value to be written to device 1239 * 1240 * Return: PCIBIOS_SUCCESSFUL on success 1241 * PCIBIOS_DEVICE_NOT_FOUND on failure 1242 */ 1243 static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn, 1244 int where, int size, u32 val) 1245 { 1246 struct hv_pcibus_device *hbus = 1247 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 1248 struct hv_pci_dev *hpdev; 1249 1250 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 1251 if (!hpdev) 1252 return PCIBIOS_DEVICE_NOT_FOUND; 1253 1254 _hv_pcifront_write_config(hpdev, where, size, val); 1255 1256 put_pcichild(hpdev); 1257 return PCIBIOS_SUCCESSFUL; 1258 } 1259 1260 /* PCIe operations */ 1261 static struct pci_ops hv_pcifront_ops = { 1262 .read = hv_pcifront_read_config, 1263 .write = hv_pcifront_write_config, 1264 }; 1265 1266 /* 1267 * Paravirtual backchannel 1268 * 1269 * Hyper-V SR-IOV provides a backchannel mechanism in software for 1270 * communication between a VF driver and a PF driver. These 1271 * "configuration blocks" are similar in concept to PCI configuration space, 1272 * but instead of doing reads and writes in 32-bit chunks through a very slow 1273 * path, packets of up to 128 bytes can be sent or received asynchronously. 1274 * 1275 * Nearly every SR-IOV device contains just such a communications channel in 1276 * hardware, so using this one in software is usually optional. Using the 1277 * software channel, however, allows driver implementers to leverage software 1278 * tools that fuzz the communications channel looking for vulnerabilities. 1279 * 1280 * The usage model for these packets puts the responsibility for reading or 1281 * writing on the VF driver. The VF driver sends a read or a write packet, 1282 * indicating which "block" is being referred to by number. 1283 * 1284 * If the PF driver wishes to initiate communication, it can "invalidate" one or 1285 * more of the first 64 blocks. This invalidation is delivered via a callback 1286 * supplied by the VF driver by this driver. 1287 * 1288 * No protocol is implied, except that supplied by the PF and VF drivers. 
1289 */ 1290 1291 struct hv_read_config_compl { 1292 struct hv_pci_compl comp_pkt; 1293 void *buf; 1294 unsigned int len; 1295 unsigned int bytes_returned; 1296 }; 1297 1298 /** 1299 * hv_pci_read_config_compl() - Invoked when a response packet 1300 * for a read config block operation arrives. 1301 * @context: Identifies the read config operation 1302 * @resp: The response packet itself 1303 * @resp_packet_size: Size in bytes of the response packet 1304 */ 1305 static void hv_pci_read_config_compl(void *context, struct pci_response *resp, 1306 int resp_packet_size) 1307 { 1308 struct hv_read_config_compl *comp = context; 1309 struct pci_read_block_response *read_resp = 1310 (struct pci_read_block_response *)resp; 1311 unsigned int data_len, hdr_len; 1312 1313 hdr_len = offsetof(struct pci_read_block_response, bytes); 1314 if (resp_packet_size < hdr_len) { 1315 comp->comp_pkt.completion_status = -1; 1316 goto out; 1317 } 1318 1319 data_len = resp_packet_size - hdr_len; 1320 if (data_len > 0 && read_resp->status == 0) { 1321 comp->bytes_returned = min(comp->len, data_len); 1322 memcpy(comp->buf, read_resp->bytes, comp->bytes_returned); 1323 } else { 1324 comp->bytes_returned = 0; 1325 } 1326 1327 comp->comp_pkt.completion_status = read_resp->status; 1328 out: 1329 complete(&comp->comp_pkt.host_event); 1330 } 1331 1332 /** 1333 * hv_read_config_block() - Sends a read config block request to 1334 * the back-end driver running in the Hyper-V parent partition. 1335 * @pdev: The PCI driver's representation for this device. 1336 * @buf: Buffer into which the config block will be copied. 1337 * @len: Size in bytes of buf. 1338 * @block_id: Identifies the config block which has been requested. 1339 * @bytes_returned: Size which came back from the back-end driver. 
1340 * 1341 * Return: 0 on success, -errno on failure 1342 */ 1343 static int hv_read_config_block(struct pci_dev *pdev, void *buf, 1344 unsigned int len, unsigned int block_id, 1345 unsigned int *bytes_returned) 1346 { 1347 struct hv_pcibus_device *hbus = 1348 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1349 sysdata); 1350 struct { 1351 struct pci_packet pkt; 1352 char buf[sizeof(struct pci_read_block)]; 1353 } pkt; 1354 struct hv_read_config_compl comp_pkt; 1355 struct pci_read_block *read_blk; 1356 int ret; 1357 1358 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1359 return -EINVAL; 1360 1361 init_completion(&comp_pkt.comp_pkt.host_event); 1362 comp_pkt.buf = buf; 1363 comp_pkt.len = len; 1364 1365 memset(&pkt, 0, sizeof(pkt)); 1366 pkt.pkt.completion_func = hv_pci_read_config_compl; 1367 pkt.pkt.compl_ctxt = &comp_pkt; 1368 read_blk = (struct pci_read_block *)&pkt.pkt.message; 1369 read_blk->message_type.type = PCI_READ_BLOCK; 1370 read_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1371 read_blk->block_id = block_id; 1372 read_blk->bytes_requested = len; 1373 1374 ret = vmbus_sendpacket(hbus->hdev->channel, read_blk, 1375 sizeof(*read_blk), (unsigned long)&pkt.pkt, 1376 VM_PKT_DATA_INBAND, 1377 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1378 if (ret) 1379 return ret; 1380 1381 ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event); 1382 if (ret) 1383 return ret; 1384 1385 if (comp_pkt.comp_pkt.completion_status != 0 || 1386 comp_pkt.bytes_returned == 0) { 1387 dev_err(&hbus->hdev->device, 1388 "Read Config Block failed: 0x%x, bytes_returned=%d\n", 1389 comp_pkt.comp_pkt.completion_status, 1390 comp_pkt.bytes_returned); 1391 return -EIO; 1392 } 1393 1394 *bytes_returned = comp_pkt.bytes_returned; 1395 return 0; 1396 } 1397 1398 /** 1399 * hv_pci_write_config_compl() - Invoked when a response packet for a write 1400 * config block operation arrives. 1401 * @context: Identifies the write config operation 1402 * @resp: The response packet itself 1403 * @resp_packet_size: Size in bytes of the response packet 1404 */ 1405 static void hv_pci_write_config_compl(void *context, struct pci_response *resp, 1406 int resp_packet_size) 1407 { 1408 struct hv_pci_compl *comp_pkt = context; 1409 1410 comp_pkt->completion_status = resp->status; 1411 complete(&comp_pkt->host_event); 1412 } 1413 1414 /** 1415 * hv_write_config_block() - Sends a write config block request to the 1416 * back-end driver running in the Hyper-V parent partition. 1417 * @pdev: The PCI driver's representation for this device. 1418 * @buf: Buffer from which the config block will be copied. 1419 * @len: Size in bytes of buf. 1420 * @block_id: Identifies the config block which is being written. 
1421 * 1422 * Return: 0 on success, -errno on failure 1423 */ 1424 static int hv_write_config_block(struct pci_dev *pdev, void *buf, 1425 unsigned int len, unsigned int block_id) 1426 { 1427 struct hv_pcibus_device *hbus = 1428 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1429 sysdata); 1430 struct { 1431 struct pci_packet pkt; 1432 char buf[sizeof(struct pci_write_block)]; 1433 u32 reserved; 1434 } pkt; 1435 struct hv_pci_compl comp_pkt; 1436 struct pci_write_block *write_blk; 1437 u32 pkt_size; 1438 int ret; 1439 1440 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1441 return -EINVAL; 1442 1443 init_completion(&comp_pkt.host_event); 1444 1445 memset(&pkt, 0, sizeof(pkt)); 1446 pkt.pkt.completion_func = hv_pci_write_config_compl; 1447 pkt.pkt.compl_ctxt = &comp_pkt; 1448 write_blk = (struct pci_write_block *)&pkt.pkt.message; 1449 write_blk->message_type.type = PCI_WRITE_BLOCK; 1450 write_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1451 write_blk->block_id = block_id; 1452 write_blk->byte_count = len; 1453 memcpy(write_blk->bytes, buf, len); 1454 pkt_size = offsetof(struct pci_write_block, bytes) + len; 1455 /* 1456 * This quirk is required on some hosts shipped around 2018, because 1457 * these hosts don't check the pkt_size correctly (new hosts have been 1458 * fixed since early 2019). The quirk is also safe on very old hosts 1459 * and new hosts, because, on them, what really matters is the length 1460 * specified in write_blk->byte_count. 1461 */ 1462 pkt_size += sizeof(pkt.reserved); 1463 1464 ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size, 1465 (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND, 1466 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1467 if (ret) 1468 return ret; 1469 1470 ret = wait_for_response(hbus->hdev, &comp_pkt.host_event); 1471 if (ret) 1472 return ret; 1473 1474 if (comp_pkt.completion_status != 0) { 1475 dev_err(&hbus->hdev->device, 1476 "Write Config Block failed: 0x%x\n", 1477 comp_pkt.completion_status); 1478 return -EIO; 1479 } 1480 1481 return 0; 1482 } 1483 1484 /** 1485 * hv_register_block_invalidate() - Invoked when a config block invalidation 1486 * arrives from the back-end driver. 1487 * @pdev: The PCI driver's representation for this device. 1488 * @context: Identifies the device. 1489 * @block_invalidate: Identifies all of the blocks being invalidated. 
1490 * 1491 * Return: 0 on success, -errno on failure 1492 */ 1493 static int hv_register_block_invalidate(struct pci_dev *pdev, void *context, 1494 void (*block_invalidate)(void *context, 1495 u64 block_mask)) 1496 { 1497 struct hv_pcibus_device *hbus = 1498 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1499 sysdata); 1500 struct hv_pci_dev *hpdev; 1501 1502 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1503 if (!hpdev) 1504 return -ENODEV; 1505 1506 hpdev->block_invalidate = block_invalidate; 1507 hpdev->invalidate_context = context; 1508 1509 put_pcichild(hpdev); 1510 return 0; 1511 1512 } 1513 1514 /* Interrupt management hooks */ 1515 static void hv_int_desc_free(struct hv_pci_dev *hpdev, 1516 struct tran_int_desc *int_desc) 1517 { 1518 struct pci_delete_interrupt *int_pkt; 1519 struct { 1520 struct pci_packet pkt; 1521 u8 buffer[sizeof(struct pci_delete_interrupt)]; 1522 } ctxt; 1523 1524 if (!int_desc->vector_count) { 1525 kfree(int_desc); 1526 return; 1527 } 1528 memset(&ctxt, 0, sizeof(ctxt)); 1529 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; 1530 int_pkt->message_type.type = 1531 PCI_DELETE_INTERRUPT_MESSAGE; 1532 int_pkt->wslot.slot = hpdev->desc.win_slot.slot; 1533 int_pkt->int_desc = *int_desc; 1534 vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt), 1535 0, VM_PKT_DATA_INBAND, 0); 1536 kfree(int_desc); 1537 } 1538 1539 /** 1540 * hv_msi_free() - Free the MSI. 1541 * @domain: The interrupt domain pointer 1542 * @info: Extra MSI-related context 1543 * @irq: Identifies the IRQ. 1544 * 1545 * The Hyper-V parent partition and hypervisor are tracking the 1546 * messages that are in use, keeping the interrupt redirection 1547 * table up to date. This callback sends a message that frees 1548 * the IRT entry and related tracking nonsense. 
1549 */ 1550 static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info, 1551 unsigned int irq) 1552 { 1553 struct hv_pcibus_device *hbus; 1554 struct hv_pci_dev *hpdev; 1555 struct pci_dev *pdev; 1556 struct tran_int_desc *int_desc; 1557 struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq); 1558 struct msi_desc *msi = irq_data_get_msi_desc(irq_data); 1559 1560 pdev = msi_desc_to_pci_dev(msi); 1561 hbus = info->data; 1562 int_desc = irq_data_get_irq_chip_data(irq_data); 1563 if (!int_desc) 1564 return; 1565 1566 irq_data->chip_data = NULL; 1567 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1568 if (!hpdev) { 1569 kfree(int_desc); 1570 return; 1571 } 1572 1573 hv_int_desc_free(hpdev, int_desc); 1574 put_pcichild(hpdev); 1575 } 1576 1577 static void hv_irq_mask(struct irq_data *data) 1578 { 1579 pci_msi_mask_irq(data); 1580 if (data->parent_data->chip->irq_mask) 1581 irq_chip_mask_parent(data); 1582 } 1583 1584 static void hv_irq_unmask(struct irq_data *data) 1585 { 1586 hv_arch_irq_unmask(data); 1587 1588 if (data->parent_data->chip->irq_unmask) 1589 irq_chip_unmask_parent(data); 1590 pci_msi_unmask_irq(data); 1591 } 1592 1593 struct compose_comp_ctxt { 1594 struct hv_pci_compl comp_pkt; 1595 struct tran_int_desc int_desc; 1596 }; 1597 1598 static void hv_pci_compose_compl(void *context, struct pci_response *resp, 1599 int resp_packet_size) 1600 { 1601 struct compose_comp_ctxt *comp_pkt = context; 1602 struct pci_create_int_response *int_resp = 1603 (struct pci_create_int_response *)resp; 1604 1605 if (resp_packet_size < sizeof(*int_resp)) { 1606 comp_pkt->comp_pkt.completion_status = -1; 1607 goto out; 1608 } 1609 comp_pkt->comp_pkt.completion_status = resp->status; 1610 comp_pkt->int_desc = int_resp->int_desc; 1611 out: 1612 complete(&comp_pkt->comp_pkt.host_event); 1613 } 1614 1615 static u32 hv_compose_msi_req_v1( 1616 struct pci_create_interrupt *int_pkt, 1617 u32 slot, u8 vector, u16 vector_count) 1618 { 1619 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; 1620 int_pkt->wslot.slot = slot; 1621 int_pkt->int_desc.vector = vector; 1622 int_pkt->int_desc.vector_count = vector_count; 1623 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1624 1625 /* 1626 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in 1627 * hv_irq_unmask(). 1628 */ 1629 int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL; 1630 1631 return sizeof(*int_pkt); 1632 } 1633 1634 /* 1635 * The vCPU selected by hv_compose_multi_msi_req_get_cpu() and 1636 * hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be 1637 * interrupted is specified later in hv_irq_unmask() and communicated to Hyper-V 1638 * via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of dummy vCPU is 1639 * not irrelevant because Hyper-V chooses the physical CPU to handle the 1640 * interrupts based on the vCPU specified in message sent to the vPCI VSP in 1641 * hv_compose_msi_msg(). Hyper-V's choice of pCPU is not visible to the guest, 1642 * but assigning too many vPCI device interrupts to the same pCPU can cause a 1643 * performance bottleneck. So we spread out the dummy vCPUs to influence Hyper-V 1644 * to spread out the pCPUs that it selects. 1645 * 1646 * For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu() 1647 * to always return the same dummy vCPU, because a second call to 1648 * hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a 1649 * new pCPU for the interrupt. 
But for the multi-MSI case, the second call to 1650 * hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the 1651 * original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that 1652 * the pCPUs are spread out. All interrupts for a multi-MSI device end up using 1653 * the same pCPU, even though the vCPUs will be spread out by later calls 1654 * to hv_irq_unmask(), but that is the best we can do now. 1655 * 1656 * With Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does *not* 1657 * cause Hyper-V to reselect the pCPU based on the specified vCPU. Such an 1658 * enhancement is planned for a future version. With that enhancement, the 1659 * dummy vCPU selection won't matter, and interrupts for the same multi-MSI 1660 * device will be spread across multiple pCPUs. 1661 */ 1662 1663 /* 1664 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten 1665 * by subsequent retarget in hv_irq_unmask(). 1666 */ 1667 static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity) 1668 { 1669 return cpumask_first_and(affinity, cpu_online_mask); 1670 } 1671 1672 /* 1673 * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0. 1674 */ 1675 static int hv_compose_multi_msi_req_get_cpu(void) 1676 { 1677 static DEFINE_SPINLOCK(multi_msi_cpu_lock); 1678 1679 /* -1 means starting with CPU 0 */ 1680 static int cpu_next = -1; 1681 1682 unsigned long flags; 1683 int cpu; 1684 1685 spin_lock_irqsave(&multi_msi_cpu_lock, flags); 1686 1687 cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids, 1688 false); 1689 cpu = cpu_next; 1690 1691 spin_unlock_irqrestore(&multi_msi_cpu_lock, flags); 1692 1693 return cpu; 1694 } 1695 1696 static u32 hv_compose_msi_req_v2( 1697 struct pci_create_interrupt2 *int_pkt, int cpu, 1698 u32 slot, u8 vector, u16 vector_count) 1699 { 1700 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; 1701 int_pkt->wslot.slot = slot; 1702 int_pkt->int_desc.vector = vector; 1703 int_pkt->int_desc.vector_count = vector_count; 1704 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1705 int_pkt->int_desc.processor_array[0] = 1706 hv_cpu_number_to_vp_number(cpu); 1707 int_pkt->int_desc.processor_count = 1; 1708 1709 return sizeof(*int_pkt); 1710 } 1711 1712 static u32 hv_compose_msi_req_v3( 1713 struct pci_create_interrupt3 *int_pkt, int cpu, 1714 u32 slot, u32 vector, u16 vector_count) 1715 { 1716 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3; 1717 int_pkt->wslot.slot = slot; 1718 int_pkt->int_desc.vector = vector; 1719 int_pkt->int_desc.reserved = 0; 1720 int_pkt->int_desc.vector_count = vector_count; 1721 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1722 int_pkt->int_desc.processor_array[0] = 1723 hv_cpu_number_to_vp_number(cpu); 1724 int_pkt->int_desc.processor_count = 1; 1725 1726 return sizeof(*int_pkt); 1727 } 1728 1729 /** 1730 * hv_compose_msi_msg() - Supplies a valid MSI address/data 1731 * @data: Everything about this MSI 1732 * @msg: Buffer that is filled in by this function 1733 * 1734 * This function unpacks the IRQ looking for target CPU set, IDT 1735 * vector and mode and sends a message to the parent partition 1736 * asking for a mapping for that tuple in this partition. The 1737 * response supplies a data value and address to which that data 1738 * should be written to trigger that interrupt. 
1739 */ 1740 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) 1741 { 1742 struct hv_pcibus_device *hbus; 1743 struct vmbus_channel *channel; 1744 struct hv_pci_dev *hpdev; 1745 struct pci_bus *pbus; 1746 struct pci_dev *pdev; 1747 const struct cpumask *dest; 1748 struct compose_comp_ctxt comp; 1749 struct tran_int_desc *int_desc; 1750 struct msi_desc *msi_desc; 1751 /* 1752 * vector_count should be u16: see hv_msi_desc, hv_msi_desc2 1753 * and hv_msi_desc3. vector must be u32: see hv_msi_desc3. 1754 */ 1755 u16 vector_count; 1756 u32 vector; 1757 struct { 1758 struct pci_packet pci_pkt; 1759 union { 1760 struct pci_create_interrupt v1; 1761 struct pci_create_interrupt2 v2; 1762 struct pci_create_interrupt3 v3; 1763 } int_pkts; 1764 } __packed ctxt; 1765 bool multi_msi; 1766 u64 trans_id; 1767 u32 size; 1768 int ret; 1769 int cpu; 1770 1771 msi_desc = irq_data_get_msi_desc(data); 1772 multi_msi = !msi_desc->pci.msi_attrib.is_msix && 1773 msi_desc->nvec_used > 1; 1774 1775 /* Reuse the previous allocation */ 1776 if (data->chip_data && multi_msi) { 1777 int_desc = data->chip_data; 1778 msg->address_hi = int_desc->address >> 32; 1779 msg->address_lo = int_desc->address & 0xffffffff; 1780 msg->data = int_desc->data; 1781 return; 1782 } 1783 1784 pdev = msi_desc_to_pci_dev(msi_desc); 1785 dest = irq_data_get_effective_affinity_mask(data); 1786 pbus = pdev->bus; 1787 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 1788 channel = hbus->hdev->channel; 1789 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1790 if (!hpdev) 1791 goto return_null_message; 1792 1793 /* Free any previous message that might have already been composed. */ 1794 if (data->chip_data && !multi_msi) { 1795 int_desc = data->chip_data; 1796 data->chip_data = NULL; 1797 hv_int_desc_free(hpdev, int_desc); 1798 } 1799 1800 int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC); 1801 if (!int_desc) 1802 goto drop_reference; 1803 1804 if (multi_msi) { 1805 /* 1806 * If this is not the first MSI of Multi MSI, we already have 1807 * a mapping. Can exit early. 1808 */ 1809 if (msi_desc->irq != data->irq) { 1810 data->chip_data = int_desc; 1811 int_desc->address = msi_desc->msg.address_lo | 1812 (u64)msi_desc->msg.address_hi << 32; 1813 int_desc->data = msi_desc->msg.data + 1814 (data->irq - msi_desc->irq); 1815 msg->address_hi = msi_desc->msg.address_hi; 1816 msg->address_lo = msi_desc->msg.address_lo; 1817 msg->data = int_desc->data; 1818 put_pcichild(hpdev); 1819 return; 1820 } 1821 /* 1822 * The vector we select here is a dummy value. The correct 1823 * value gets sent to the hypervisor in unmask(). This needs 1824 * to be aligned with the count, and also not zero. Multi-msi 1825 * is powers of 2 up to 32, so 32 will always work here. 1826 */ 1827 vector = 32; 1828 vector_count = msi_desc->nvec_used; 1829 cpu = hv_compose_multi_msi_req_get_cpu(); 1830 } else { 1831 vector = hv_msi_get_int_vector(data); 1832 vector_count = 1; 1833 cpu = hv_compose_msi_req_get_cpu(dest); 1834 } 1835 1836 /* 1837 * hv_compose_msi_req_v1 and v2 are for x86 only, meaning 'vector' 1838 * can't exceed u8. Cast 'vector' down to u8 for v1/v2 explicitly 1839 * for better readability. 
1840 */
1841 memset(&ctxt, 0, sizeof(ctxt));
1842 init_completion(&comp.comp_pkt.host_event);
1843 ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
1844 ctxt.pci_pkt.compl_ctxt = &comp;
1845
1846 switch (hbus->protocol_version) {
1847 case PCI_PROTOCOL_VERSION_1_1:
1848 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
1849 hpdev->desc.win_slot.slot,
1850 (u8)vector,
1851 vector_count);
1852 break;
1853
1854 case PCI_PROTOCOL_VERSION_1_2:
1855 case PCI_PROTOCOL_VERSION_1_3:
1856 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
1857 cpu,
1858 hpdev->desc.win_slot.slot,
1859 (u8)vector,
1860 vector_count);
1861 break;
1862
1863 case PCI_PROTOCOL_VERSION_1_4:
1864 size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
1865 cpu,
1866 hpdev->desc.win_slot.slot,
1867 vector,
1868 vector_count);
1869 break;
1870
1871 default:
1872 /* As we only negotiate protocol versions known to this driver,
1873 * this path should never hit. However, this is not a hot
1874 * path so we print a message to aid future updates.
1875 */
1876 dev_err(&hbus->hdev->device,
1877 "Unexpected vPCI protocol, update driver.");
1878 goto free_int_desc;
1879 }
1880
1881 ret = vmbus_sendpacket_getid(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
1882 size, (unsigned long)&ctxt.pci_pkt,
1883 &trans_id, VM_PKT_DATA_INBAND,
1884 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1885 if (ret) {
1886 dev_err(&hbus->hdev->device,
1887 "Sending request for interrupt failed: 0x%x",
1888 comp.comp_pkt.completion_status);
1889 goto free_int_desc;
1890 }
1891
1892 /*
1893 * Prevents hv_pci_onchannelcallback() from running concurrently
1894 * in the tasklet.
1895 */
1896 tasklet_disable_in_atomic(&channel->callback_event);
1897
1898 /*
1899 * Since this function is called with IRQ locks held, can't
1900 * do normal wait for completion; instead poll.
1901 */
1902 while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
1903 unsigned long flags;
1904
1905 /* 0xFFFF means an invalid PCI VENDOR ID. */
1906 if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
1907 dev_err_once(&hbus->hdev->device,
1908 "the device has gone\n");
1909 goto enable_tasklet;
1910 }
1911
1912 /*
1913 * Make sure that the ring buffer data structure doesn't get
1914 * freed while we dereference the ring buffer pointer. Test
1915 * for the channel's onchannel_callback being NULL within a
1916 * sched_lock critical section. See also the inline comments
1917 * in vmbus_reset_channel_cb().
1918 */
1919 spin_lock_irqsave(&channel->sched_lock, flags);
1920 if (unlikely(channel->onchannel_callback == NULL)) {
1921 spin_unlock_irqrestore(&channel->sched_lock, flags);
1922 goto enable_tasklet;
1923 }
1924 hv_pci_onchannelcallback(hbus);
1925 spin_unlock_irqrestore(&channel->sched_lock, flags);
1926
1927 if (hpdev->state == hv_pcichild_ejecting) {
1928 dev_err_once(&hbus->hdev->device,
1929 "the device is being ejected\n");
1930 goto enable_tasklet;
1931 }
1932
1933 udelay(100);
1934 }
1935
1936 tasklet_enable(&channel->callback_event);
1937
1938 if (comp.comp_pkt.completion_status < 0) {
1939 dev_err(&hbus->hdev->device,
1940 "Request for interrupt failed: 0x%x",
1941 comp.comp_pkt.completion_status);
1942 goto free_int_desc;
1943 }
1944
1945 /*
1946 * Record the assignment so that this can be unwound later. Using
1947 * irq_set_chip_data() here would be appropriate, but the lock it takes
1948 * is already held.
1949 */
1950 *int_desc = comp.int_desc;
1951 data->chip_data = int_desc;
1952
1953 /* Pass up the result.
*/ 1954 msg->address_hi = comp.int_desc.address >> 32; 1955 msg->address_lo = comp.int_desc.address & 0xffffffff; 1956 msg->data = comp.int_desc.data; 1957 1958 put_pcichild(hpdev); 1959 return; 1960 1961 enable_tasklet: 1962 tasklet_enable(&channel->callback_event); 1963 /* 1964 * The completion packet on the stack becomes invalid after 'return'; 1965 * remove the ID from the VMbus requestor if the identifier is still 1966 * mapped to/associated with the packet. (The identifier could have 1967 * been 're-used', i.e., already removed and (re-)mapped.) 1968 * 1969 * Cf. hv_pci_onchannelcallback(). 1970 */ 1971 vmbus_request_addr_match(channel, trans_id, (unsigned long)&ctxt.pci_pkt); 1972 free_int_desc: 1973 kfree(int_desc); 1974 drop_reference: 1975 put_pcichild(hpdev); 1976 return_null_message: 1977 msg->address_hi = 0; 1978 msg->address_lo = 0; 1979 msg->data = 0; 1980 } 1981 1982 /* HW Interrupt Chip Descriptor */ 1983 static struct irq_chip hv_msi_irq_chip = { 1984 .name = "Hyper-V PCIe MSI", 1985 .irq_compose_msi_msg = hv_compose_msi_msg, 1986 .irq_set_affinity = irq_chip_set_affinity_parent, 1987 #ifdef CONFIG_X86 1988 .irq_ack = irq_chip_ack_parent, 1989 #elif defined(CONFIG_ARM64) 1990 .irq_eoi = irq_chip_eoi_parent, 1991 #endif 1992 .irq_mask = hv_irq_mask, 1993 .irq_unmask = hv_irq_unmask, 1994 }; 1995 1996 static struct msi_domain_ops hv_msi_ops = { 1997 .msi_prepare = hv_msi_prepare, 1998 .msi_free = hv_msi_free, 1999 }; 2000 2001 /** 2002 * hv_pcie_init_irq_domain() - Initialize IRQ domain 2003 * @hbus: The root PCI bus 2004 * 2005 * This function creates an IRQ domain which will be used for 2006 * interrupts from devices that have been passed through. These 2007 * devices only support MSI and MSI-X, not line-based interrupts 2008 * or simulations of line-based interrupts through PCIe's 2009 * fabric-layer messages. Because interrupts are remapped, we 2010 * can support multi-message MSI here. 2011 * 2012 * Return: '0' on success and error value on failure 2013 */ 2014 static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) 2015 { 2016 hbus->msi_info.chip = &hv_msi_irq_chip; 2017 hbus->msi_info.ops = &hv_msi_ops; 2018 hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS | 2019 MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI | 2020 MSI_FLAG_PCI_MSIX); 2021 hbus->msi_info.handler = FLOW_HANDLER; 2022 hbus->msi_info.handler_name = FLOW_NAME; 2023 hbus->msi_info.data = hbus; 2024 hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode, 2025 &hbus->msi_info, 2026 hv_pci_get_root_domain()); 2027 if (!hbus->irq_domain) { 2028 dev_err(&hbus->hdev->device, 2029 "Failed to build an MSI IRQ domain\n"); 2030 return -ENODEV; 2031 } 2032 2033 dev_set_msi_domain(&hbus->bridge->dev, hbus->irq_domain); 2034 2035 return 0; 2036 } 2037 2038 /** 2039 * get_bar_size() - Get the address space consumed by a BAR 2040 * @bar_val: Value that a BAR returned after -1 was written 2041 * to it. 2042 * 2043 * This function returns the size of the BAR, rounded up to 1 2044 * page. It has to be rounded up because the hypervisor's page 2045 * table entry that maps the BAR into the VM can't specify an 2046 * offset within a page. The invariant is that the hypervisor 2047 * must place any BARs of smaller than page length at the 2048 * beginning of a page. 2049 * 2050 * Return: Size in bytes of the consumed MMIO space. 
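 *
 * Illustrative example (made-up BAR value): a 64-bit memory BAR that
 * reads back 0xFFFFFFFF_FFFFE00C after all-ones was written decodes to
 * 0x2000 bytes, so this returns 8 KiB. A 256-byte BAR would be rounded
 * up to one full page (4 KiB with the usual page size).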
2051 */ 2052 static u64 get_bar_size(u64 bar_val) 2053 { 2054 return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)), 2055 PAGE_SIZE); 2056 } 2057 2058 /** 2059 * survey_child_resources() - Total all MMIO requirements 2060 * @hbus: Root PCI bus, as understood by this driver 2061 */ 2062 static void survey_child_resources(struct hv_pcibus_device *hbus) 2063 { 2064 struct hv_pci_dev *hpdev; 2065 resource_size_t bar_size = 0; 2066 unsigned long flags; 2067 struct completion *event; 2068 u64 bar_val; 2069 int i; 2070 2071 /* If nobody is waiting on the answer, don't compute it. */ 2072 event = xchg(&hbus->survey_event, NULL); 2073 if (!event) 2074 return; 2075 2076 /* If the answer has already been computed, go with it. */ 2077 if (hbus->low_mmio_space || hbus->high_mmio_space) { 2078 complete(event); 2079 return; 2080 } 2081 2082 spin_lock_irqsave(&hbus->device_list_lock, flags); 2083 2084 /* 2085 * Due to an interesting quirk of the PCI spec, all memory regions 2086 * for a child device are a power of 2 in size and aligned in memory, 2087 * so it's sufficient to just add them up without tracking alignment. 2088 */ 2089 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2090 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2091 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) 2092 dev_err(&hbus->hdev->device, 2093 "There's an I/O BAR in this list!\n"); 2094 2095 if (hpdev->probed_bar[i] != 0) { 2096 /* 2097 * A probed BAR has all the upper bits set that 2098 * can be changed. 2099 */ 2100 2101 bar_val = hpdev->probed_bar[i]; 2102 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 2103 bar_val |= 2104 ((u64)hpdev->probed_bar[++i] << 32); 2105 else 2106 bar_val |= 0xffffffff00000000ULL; 2107 2108 bar_size = get_bar_size(bar_val); 2109 2110 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 2111 hbus->high_mmio_space += bar_size; 2112 else 2113 hbus->low_mmio_space += bar_size; 2114 } 2115 } 2116 } 2117 2118 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2119 complete(event); 2120 } 2121 2122 /** 2123 * prepopulate_bars() - Fill in BARs with defaults 2124 * @hbus: Root PCI bus, as understood by this driver 2125 * 2126 * The core PCI driver code seems much, much happier if the BARs 2127 * for a device have values upon first scan. So fill them in. 2128 * The algorithm below works down from large sizes to small, 2129 * attempting to pack the assignments optimally. The assumption, 2130 * enforced in other parts of the code, is that the beginning of 2131 * the memory-mapped I/O space will be aligned on the largest 2132 * BAR size. 2133 */ 2134 static void prepopulate_bars(struct hv_pcibus_device *hbus) 2135 { 2136 resource_size_t high_size = 0; 2137 resource_size_t low_size = 0; 2138 resource_size_t high_base = 0; 2139 resource_size_t low_base = 0; 2140 resource_size_t bar_size; 2141 struct hv_pci_dev *hpdev; 2142 unsigned long flags; 2143 u64 bar_val; 2144 u32 command; 2145 bool high; 2146 int i; 2147 2148 if (hbus->low_mmio_space) { 2149 low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 2150 low_base = hbus->low_mmio_res->start; 2151 } 2152 2153 if (hbus->high_mmio_space) { 2154 high_size = 1ULL << 2155 (63 - __builtin_clzll(hbus->high_mmio_space)); 2156 high_base = hbus->high_mmio_res->start; 2157 } 2158 2159 spin_lock_irqsave(&hbus->device_list_lock, flags); 2160 2161 /* 2162 * Clear the memory enable bit, in case it's already set. 
This occurs 2163 * in the suspend path of hibernation, where the device is suspended, 2164 * resumed and suspended again: see hibernation_snapshot() and 2165 * hibernation_platform_enter(). 2166 * 2167 * If the memory enable bit is already set, Hyper-V silently ignores 2168 * the below BAR updates, and the related PCI device driver can not 2169 * work, because reading from the device register(s) always returns 2170 * 0xFFFFFFFF (PCI_ERROR_RESPONSE). 2171 */ 2172 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2173 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command); 2174 command &= ~PCI_COMMAND_MEMORY; 2175 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command); 2176 } 2177 2178 /* Pick addresses for the BARs. */ 2179 do { 2180 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2181 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2182 bar_val = hpdev->probed_bar[i]; 2183 if (bar_val == 0) 2184 continue; 2185 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64; 2186 if (high) { 2187 bar_val |= 2188 ((u64)hpdev->probed_bar[i + 1] 2189 << 32); 2190 } else { 2191 bar_val |= 0xffffffffULL << 32; 2192 } 2193 bar_size = get_bar_size(bar_val); 2194 if (high) { 2195 if (high_size != bar_size) { 2196 i++; 2197 continue; 2198 } 2199 _hv_pcifront_write_config(hpdev, 2200 PCI_BASE_ADDRESS_0 + (4 * i), 2201 4, 2202 (u32)(high_base & 0xffffff00)); 2203 i++; 2204 _hv_pcifront_write_config(hpdev, 2205 PCI_BASE_ADDRESS_0 + (4 * i), 2206 4, (u32)(high_base >> 32)); 2207 high_base += bar_size; 2208 } else { 2209 if (low_size != bar_size) 2210 continue; 2211 _hv_pcifront_write_config(hpdev, 2212 PCI_BASE_ADDRESS_0 + (4 * i), 2213 4, 2214 (u32)(low_base & 0xffffff00)); 2215 low_base += bar_size; 2216 } 2217 } 2218 if (high_size <= 1 && low_size <= 1) { 2219 /* 2220 * No need to set the PCI_COMMAND_MEMORY bit as 2221 * the core PCI driver doesn't require the bit 2222 * to be pre-set. Actually here we intentionally 2223 * keep the bit off so that the PCI BAR probing 2224 * in the core PCI driver doesn't cause Hyper-V 2225 * to unnecessarily unmap/map the virtual BARs 2226 * from/to the physical BARs multiple times. 2227 * This reduces the VM boot time significantly 2228 * if the BAR sizes are huge. 2229 */ 2230 break; 2231 } 2232 } 2233 2234 high_size >>= 1; 2235 low_size >>= 1; 2236 } while (high_size || low_size); 2237 2238 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2239 } 2240 2241 /* 2242 * Assign entries in sysfs pci slot directory. 2243 * 2244 * Note that this function does not need to lock the children list 2245 * because it is called from pci_devices_present_work which 2246 * is serialized with hv_eject_device_work because they are on the 2247 * same ordered workqueue. Therefore hbus->children list will not change 2248 * even when pci_create_slot sleeps. 2249 */ 2250 static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) 2251 { 2252 struct hv_pci_dev *hpdev; 2253 char name[SLOT_NAME_SIZE]; 2254 int slot_nr; 2255 2256 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2257 if (hpdev->pci_slot) 2258 continue; 2259 2260 slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot)); 2261 snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser); 2262 hpdev->pci_slot = pci_create_slot(hbus->bridge->bus, slot_nr, 2263 name, NULL); 2264 if (IS_ERR(hpdev->pci_slot)) { 2265 pr_warn("pci_create slot %s failed\n", name); 2266 hpdev->pci_slot = NULL; 2267 } 2268 } 2269 } 2270 2271 /* 2272 * Remove entries in sysfs pci slot directory. 
2273 */ 2274 static void hv_pci_remove_slots(struct hv_pcibus_device *hbus) 2275 { 2276 struct hv_pci_dev *hpdev; 2277 2278 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2279 if (!hpdev->pci_slot) 2280 continue; 2281 pci_destroy_slot(hpdev->pci_slot); 2282 hpdev->pci_slot = NULL; 2283 } 2284 } 2285 2286 /* 2287 * Set NUMA node for the devices on the bus 2288 */ 2289 static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus) 2290 { 2291 struct pci_dev *dev; 2292 struct pci_bus *bus = hbus->bridge->bus; 2293 struct hv_pci_dev *hv_dev; 2294 2295 list_for_each_entry(dev, &bus->devices, bus_list) { 2296 hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn)); 2297 if (!hv_dev) 2298 continue; 2299 2300 if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY && 2301 hv_dev->desc.virtual_numa_node < num_possible_nodes()) 2302 /* 2303 * The kernel may boot with some NUMA nodes offline 2304 * (e.g. in a KDUMP kernel) or with NUMA disabled via 2305 * "numa=off". In those cases, adjust the host provided 2306 * NUMA node to a valid NUMA node used by the kernel. 2307 */ 2308 set_dev_node(&dev->dev, 2309 numa_map_to_online_node( 2310 hv_dev->desc.virtual_numa_node)); 2311 2312 put_pcichild(hv_dev); 2313 } 2314 } 2315 2316 /** 2317 * create_root_hv_pci_bus() - Expose a new root PCI bus 2318 * @hbus: Root PCI bus, as understood by this driver 2319 * 2320 * Return: 0 on success, -errno on failure 2321 */ 2322 static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) 2323 { 2324 int error; 2325 struct pci_host_bridge *bridge = hbus->bridge; 2326 2327 bridge->dev.parent = &hbus->hdev->device; 2328 bridge->sysdata = &hbus->sysdata; 2329 bridge->ops = &hv_pcifront_ops; 2330 2331 error = pci_scan_root_bus_bridge(bridge); 2332 if (error) 2333 return error; 2334 2335 pci_lock_rescan_remove(); 2336 hv_pci_assign_numa_node(hbus); 2337 pci_bus_assign_resources(bridge->bus); 2338 hv_pci_assign_slots(hbus); 2339 pci_bus_add_devices(bridge->bus); 2340 pci_unlock_rescan_remove(); 2341 hbus->state = hv_pcibus_installed; 2342 return 0; 2343 } 2344 2345 struct q_res_req_compl { 2346 struct completion host_event; 2347 struct hv_pci_dev *hpdev; 2348 }; 2349 2350 /** 2351 * q_resource_requirements() - Query Resource Requirements 2352 * @context: The completion context. 2353 * @resp: The response that came from the host. 2354 * @resp_packet_size: The size in bytes of resp. 2355 * 2356 * This function is invoked on completion of a Query Resource 2357 * Requirements packet. 2358 */ 2359 static void q_resource_requirements(void *context, struct pci_response *resp, 2360 int resp_packet_size) 2361 { 2362 struct q_res_req_compl *completion = context; 2363 struct pci_q_res_req_response *q_res_req = 2364 (struct pci_q_res_req_response *)resp; 2365 s32 status; 2366 int i; 2367 2368 status = (resp_packet_size < sizeof(*q_res_req)) ? -1 : resp->status; 2369 if (status < 0) { 2370 dev_err(&completion->hpdev->hbus->hdev->device, 2371 "query resource requirements failed: %x\n", 2372 status); 2373 } else { 2374 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2375 completion->hpdev->probed_bar[i] = 2376 q_res_req->probed_bar[i]; 2377 } 2378 } 2379 2380 complete(&completion->host_event); 2381 } 2382 2383 /** 2384 * new_pcichild_device() - Create a new child device 2385 * @hbus: The internal struct tracking this root PCI bus. 2386 * @desc: The information supplied so far from the host 2387 * about the device. 
2388 * 2389 * This function creates the tracking structure for a new child 2390 * device and kicks off the process of figuring out what it is. 2391 * 2392 * Return: Pointer to the new tracking struct 2393 */ 2394 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus, 2395 struct hv_pcidev_description *desc) 2396 { 2397 struct hv_pci_dev *hpdev; 2398 struct pci_child_message *res_req; 2399 struct q_res_req_compl comp_pkt; 2400 struct { 2401 struct pci_packet init_packet; 2402 u8 buffer[sizeof(struct pci_child_message)]; 2403 } pkt; 2404 unsigned long flags; 2405 int ret; 2406 2407 hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL); 2408 if (!hpdev) 2409 return NULL; 2410 2411 hpdev->hbus = hbus; 2412 2413 memset(&pkt, 0, sizeof(pkt)); 2414 init_completion(&comp_pkt.host_event); 2415 comp_pkt.hpdev = hpdev; 2416 pkt.init_packet.compl_ctxt = &comp_pkt; 2417 pkt.init_packet.completion_func = q_resource_requirements; 2418 res_req = (struct pci_child_message *)&pkt.init_packet.message; 2419 res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS; 2420 res_req->wslot.slot = desc->win_slot.slot; 2421 2422 ret = vmbus_sendpacket(hbus->hdev->channel, res_req, 2423 sizeof(struct pci_child_message), 2424 (unsigned long)&pkt.init_packet, 2425 VM_PKT_DATA_INBAND, 2426 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2427 if (ret) 2428 goto error; 2429 2430 if (wait_for_response(hbus->hdev, &comp_pkt.host_event)) 2431 goto error; 2432 2433 hpdev->desc = *desc; 2434 refcount_set(&hpdev->refs, 1); 2435 get_pcichild(hpdev); 2436 spin_lock_irqsave(&hbus->device_list_lock, flags); 2437 2438 list_add_tail(&hpdev->list_entry, &hbus->children); 2439 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2440 return hpdev; 2441 2442 error: 2443 kfree(hpdev); 2444 return NULL; 2445 } 2446 2447 /** 2448 * get_pcichild_wslot() - Find device from slot 2449 * @hbus: Root PCI bus, as understood by this driver 2450 * @wslot: Location on the bus 2451 * 2452 * This function looks up a PCI device and returns the internal 2453 * representation of it. It acquires a reference on it, so that 2454 * the device won't be deleted while somebody is using it. The 2455 * caller is responsible for calling put_pcichild() to release 2456 * this reference. 2457 * 2458 * Return: Internal representation of a PCI device 2459 */ 2460 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 2461 u32 wslot) 2462 { 2463 unsigned long flags; 2464 struct hv_pci_dev *iter, *hpdev = NULL; 2465 2466 spin_lock_irqsave(&hbus->device_list_lock, flags); 2467 list_for_each_entry(iter, &hbus->children, list_entry) { 2468 if (iter->desc.win_slot.slot == wslot) { 2469 hpdev = iter; 2470 get_pcichild(hpdev); 2471 break; 2472 } 2473 } 2474 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2475 2476 return hpdev; 2477 } 2478 2479 /** 2480 * pci_devices_present_work() - Handle new list of child devices 2481 * @work: Work struct embedded in struct hv_dr_work 2482 * 2483 * "Bus Relations" is the Windows term for "children of this 2484 * bus." The terminology is preserved here for people trying to 2485 * debug the interaction between Hyper-V and Linux. This 2486 * function is called when the parent partition reports a list 2487 * of functions that should be observed under this PCI Express 2488 * port (bus). 2489 * 2490 * This function updates the list, and must tolerate being 2491 * called multiple times with the same information. 
The typical 2492 * number of child devices is one, with very atypical cases 2493 * involving three or four, so the algorithms used here can be 2494 * simple and inefficient. 2495 * 2496 * It must also treat the omission of a previously observed device as 2497 * notification that the device no longer exists. 2498 * 2499 * Note that this function is serialized with hv_eject_device_work(), 2500 * because both are pushed to the ordered workqueue hbus->wq. 2501 */ 2502 static void pci_devices_present_work(struct work_struct *work) 2503 { 2504 u32 child_no; 2505 bool found; 2506 struct hv_pcidev_description *new_desc; 2507 struct hv_pci_dev *hpdev; 2508 struct hv_pcibus_device *hbus; 2509 struct list_head removed; 2510 struct hv_dr_work *dr_wrk; 2511 struct hv_dr_state *dr = NULL; 2512 unsigned long flags; 2513 2514 dr_wrk = container_of(work, struct hv_dr_work, wrk); 2515 hbus = dr_wrk->bus; 2516 kfree(dr_wrk); 2517 2518 INIT_LIST_HEAD(&removed); 2519 2520 /* Pull this off the queue and process it if it was the last one. */ 2521 spin_lock_irqsave(&hbus->device_list_lock, flags); 2522 while (!list_empty(&hbus->dr_list)) { 2523 dr = list_first_entry(&hbus->dr_list, struct hv_dr_state, 2524 list_entry); 2525 list_del(&dr->list_entry); 2526 2527 /* Throw this away if the list still has stuff in it. */ 2528 if (!list_empty(&hbus->dr_list)) { 2529 kfree(dr); 2530 continue; 2531 } 2532 } 2533 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2534 2535 if (!dr) 2536 return; 2537 2538 /* First, mark all existing children as reported missing. */ 2539 spin_lock_irqsave(&hbus->device_list_lock, flags); 2540 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2541 hpdev->reported_missing = true; 2542 } 2543 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2544 2545 /* Next, add back any reported devices. */ 2546 for (child_no = 0; child_no < dr->device_count; child_no++) { 2547 found = false; 2548 new_desc = &dr->func[child_no]; 2549 2550 spin_lock_irqsave(&hbus->device_list_lock, flags); 2551 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2552 if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) && 2553 (hpdev->desc.v_id == new_desc->v_id) && 2554 (hpdev->desc.d_id == new_desc->d_id) && 2555 (hpdev->desc.ser == new_desc->ser)) { 2556 hpdev->reported_missing = false; 2557 found = true; 2558 } 2559 } 2560 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2561 2562 if (!found) { 2563 hpdev = new_pcichild_device(hbus, new_desc); 2564 if (!hpdev) 2565 dev_err(&hbus->hdev->device, 2566 "couldn't record a child device.\n"); 2567 } 2568 } 2569 2570 /* Move missing children to a list on the stack. */ 2571 spin_lock_irqsave(&hbus->device_list_lock, flags); 2572 do { 2573 found = false; 2574 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2575 if (hpdev->reported_missing) { 2576 found = true; 2577 put_pcichild(hpdev); 2578 list_move_tail(&hpdev->list_entry, &removed); 2579 break; 2580 } 2581 } 2582 } while (found); 2583 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2584 2585 /* Delete everything that should no longer exist. */ 2586 while (!list_empty(&removed)) { 2587 hpdev = list_first_entry(&removed, struct hv_pci_dev, 2588 list_entry); 2589 list_del(&hpdev->list_entry); 2590 2591 if (hpdev->pci_slot) 2592 pci_destroy_slot(hpdev->pci_slot); 2593 2594 put_pcichild(hpdev); 2595 } 2596 2597 switch (hbus->state) { 2598 case hv_pcibus_installed: 2599 /* 2600 * Tell the core to rescan bus 2601 * because there may have been changes. 
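 * The rescan, NUMA node assignment and slot creation below are done
 * under pci_lock_rescan_remove() so they cannot race with removal.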
2602 */ 2603 pci_lock_rescan_remove(); 2604 pci_scan_child_bus(hbus->bridge->bus); 2605 hv_pci_assign_numa_node(hbus); 2606 hv_pci_assign_slots(hbus); 2607 pci_unlock_rescan_remove(); 2608 break; 2609 2610 case hv_pcibus_init: 2611 case hv_pcibus_probed: 2612 survey_child_resources(hbus); 2613 break; 2614 2615 default: 2616 break; 2617 } 2618 2619 kfree(dr); 2620 } 2621 2622 /** 2623 * hv_pci_start_relations_work() - Queue work to start device discovery 2624 * @hbus: Root PCI bus, as understood by this driver 2625 * @dr: The list of children returned from host 2626 * 2627 * Return: 0 on success, -errno on failure 2628 */ 2629 static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus, 2630 struct hv_dr_state *dr) 2631 { 2632 struct hv_dr_work *dr_wrk; 2633 unsigned long flags; 2634 bool pending_dr; 2635 2636 if (hbus->state == hv_pcibus_removing) { 2637 dev_info(&hbus->hdev->device, 2638 "PCI VMBus BUS_RELATIONS: ignored\n"); 2639 return -ENOENT; 2640 } 2641 2642 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT); 2643 if (!dr_wrk) 2644 return -ENOMEM; 2645 2646 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work); 2647 dr_wrk->bus = hbus; 2648 2649 spin_lock_irqsave(&hbus->device_list_lock, flags); 2650 /* 2651 * If pending_dr is true, we have already queued a work, 2652 * which will see the new dr. Otherwise, we need to 2653 * queue a new work. 2654 */ 2655 pending_dr = !list_empty(&hbus->dr_list); 2656 list_add_tail(&dr->list_entry, &hbus->dr_list); 2657 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2658 2659 if (pending_dr) 2660 kfree(dr_wrk); 2661 else 2662 queue_work(hbus->wq, &dr_wrk->wrk); 2663 2664 return 0; 2665 } 2666 2667 /** 2668 * hv_pci_devices_present() - Handle list of new children 2669 * @hbus: Root PCI bus, as understood by this driver 2670 * @relations: Packet from host listing children 2671 * 2672 * Process a new list of devices on the bus. The list of devices is 2673 * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS, 2674 * whenever a new list of devices for this bus appears. 
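 *
 * The descriptions are copied out of the incoming packet into a newly
 * allocated struct hv_dr_state, so that the queued
 * pci_devices_present_work() can still consume them after the VMbus
 * receive buffer has been reused.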
2675 */ 2676 static void hv_pci_devices_present(struct hv_pcibus_device *hbus, 2677 struct pci_bus_relations *relations) 2678 { 2679 struct hv_dr_state *dr; 2680 int i; 2681 2682 dr = kzalloc(struct_size(dr, func, relations->device_count), 2683 GFP_NOWAIT); 2684 if (!dr) 2685 return; 2686 2687 dr->device_count = relations->device_count; 2688 for (i = 0; i < dr->device_count; i++) { 2689 dr->func[i].v_id = relations->func[i].v_id; 2690 dr->func[i].d_id = relations->func[i].d_id; 2691 dr->func[i].rev = relations->func[i].rev; 2692 dr->func[i].prog_intf = relations->func[i].prog_intf; 2693 dr->func[i].subclass = relations->func[i].subclass; 2694 dr->func[i].base_class = relations->func[i].base_class; 2695 dr->func[i].subsystem_id = relations->func[i].subsystem_id; 2696 dr->func[i].win_slot = relations->func[i].win_slot; 2697 dr->func[i].ser = relations->func[i].ser; 2698 } 2699 2700 if (hv_pci_start_relations_work(hbus, dr)) 2701 kfree(dr); 2702 } 2703 2704 /** 2705 * hv_pci_devices_present2() - Handle list of new children 2706 * @hbus: Root PCI bus, as understood by this driver 2707 * @relations: Packet from host listing children 2708 * 2709 * This function is the v2 version of hv_pci_devices_present() 2710 */ 2711 static void hv_pci_devices_present2(struct hv_pcibus_device *hbus, 2712 struct pci_bus_relations2 *relations) 2713 { 2714 struct hv_dr_state *dr; 2715 int i; 2716 2717 dr = kzalloc(struct_size(dr, func, relations->device_count), 2718 GFP_NOWAIT); 2719 if (!dr) 2720 return; 2721 2722 dr->device_count = relations->device_count; 2723 for (i = 0; i < dr->device_count; i++) { 2724 dr->func[i].v_id = relations->func[i].v_id; 2725 dr->func[i].d_id = relations->func[i].d_id; 2726 dr->func[i].rev = relations->func[i].rev; 2727 dr->func[i].prog_intf = relations->func[i].prog_intf; 2728 dr->func[i].subclass = relations->func[i].subclass; 2729 dr->func[i].base_class = relations->func[i].base_class; 2730 dr->func[i].subsystem_id = relations->func[i].subsystem_id; 2731 dr->func[i].win_slot = relations->func[i].win_slot; 2732 dr->func[i].ser = relations->func[i].ser; 2733 dr->func[i].flags = relations->func[i].flags; 2734 dr->func[i].virtual_numa_node = 2735 relations->func[i].virtual_numa_node; 2736 } 2737 2738 if (hv_pci_start_relations_work(hbus, dr)) 2739 kfree(dr); 2740 } 2741 2742 /** 2743 * hv_eject_device_work() - Asynchronously handles ejection 2744 * @work: Work struct embedded in internal device struct 2745 * 2746 * This function handles ejecting a device. Windows will 2747 * attempt to gracefully eject a device, waiting 60 seconds to 2748 * hear back from the guest OS that this completed successfully. 2749 * If this timer expires, the device will be forcibly removed. 2750 */ 2751 static void hv_eject_device_work(struct work_struct *work) 2752 { 2753 struct pci_eject_response *ejct_pkt; 2754 struct hv_pcibus_device *hbus; 2755 struct hv_pci_dev *hpdev; 2756 struct pci_dev *pdev; 2757 unsigned long flags; 2758 int wslot; 2759 struct { 2760 struct pci_packet pkt; 2761 u8 buffer[sizeof(struct pci_eject_response)]; 2762 } ctxt; 2763 2764 hpdev = container_of(work, struct hv_pci_dev, wrk); 2765 hbus = hpdev->hbus; 2766 2767 WARN_ON(hpdev->state != hv_pcichild_ejecting); 2768 2769 /* 2770 * Ejection can come before or after the PCI bus has been set up, so 2771 * attempt to find it and tear down the bus state, if it exists. This 2772 * must be done without constructs like pci_domain_nr(hbus->bridge->bus) 2773 * because hbus->bridge->bus may not exist yet. 
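 * If pci_get_domain_bus_and_slot() below finds no struct pci_dev, the
 * PCI core tear-down is simply skipped.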
2774 */ 2775 wslot = wslot_to_devfn(hpdev->desc.win_slot.slot); 2776 pdev = pci_get_domain_bus_and_slot(hbus->bridge->domain_nr, 0, wslot); 2777 if (pdev) { 2778 pci_lock_rescan_remove(); 2779 pci_stop_and_remove_bus_device(pdev); 2780 pci_dev_put(pdev); 2781 pci_unlock_rescan_remove(); 2782 } 2783 2784 spin_lock_irqsave(&hbus->device_list_lock, flags); 2785 list_del(&hpdev->list_entry); 2786 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2787 2788 if (hpdev->pci_slot) 2789 pci_destroy_slot(hpdev->pci_slot); 2790 2791 memset(&ctxt, 0, sizeof(ctxt)); 2792 ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message; 2793 ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE; 2794 ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot; 2795 vmbus_sendpacket(hbus->hdev->channel, ejct_pkt, 2796 sizeof(*ejct_pkt), 0, 2797 VM_PKT_DATA_INBAND, 0); 2798 2799 /* For the get_pcichild() in hv_pci_eject_device() */ 2800 put_pcichild(hpdev); 2801 /* For the two refs got in new_pcichild_device() */ 2802 put_pcichild(hpdev); 2803 put_pcichild(hpdev); 2804 /* hpdev has been freed. Do not use it any more. */ 2805 } 2806 2807 /** 2808 * hv_pci_eject_device() - Handles device ejection 2809 * @hpdev: Internal device tracking struct 2810 * 2811 * This function is invoked when an ejection packet arrives. It 2812 * just schedules work so that we don't re-enter the packet 2813 * delivery code handling the ejection. 2814 */ 2815 static void hv_pci_eject_device(struct hv_pci_dev *hpdev) 2816 { 2817 struct hv_pcibus_device *hbus = hpdev->hbus; 2818 struct hv_device *hdev = hbus->hdev; 2819 2820 if (hbus->state == hv_pcibus_removing) { 2821 dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n"); 2822 return; 2823 } 2824 2825 hpdev->state = hv_pcichild_ejecting; 2826 get_pcichild(hpdev); 2827 INIT_WORK(&hpdev->wrk, hv_eject_device_work); 2828 queue_work(hbus->wq, &hpdev->wrk); 2829 } 2830 2831 /** 2832 * hv_pci_onchannelcallback() - Handles incoming packets 2833 * @context: Internal bus tracking struct 2834 * 2835 * This function is invoked whenever the host sends a packet to 2836 * this channel (which is private to this root PCI bus). 2837 */ 2838 static void hv_pci_onchannelcallback(void *context) 2839 { 2840 const int packet_size = 0x100; 2841 int ret; 2842 struct hv_pcibus_device *hbus = context; 2843 struct vmbus_channel *chan = hbus->hdev->channel; 2844 u32 bytes_recvd; 2845 u64 req_id, req_addr; 2846 struct vmpacket_descriptor *desc; 2847 unsigned char *buffer; 2848 int bufferlen = packet_size; 2849 struct pci_packet *comp_packet; 2850 struct pci_response *response; 2851 struct pci_incoming_message *new_message; 2852 struct pci_bus_relations *bus_rel; 2853 struct pci_bus_relations2 *bus_rel2; 2854 struct pci_dev_inval_block *inval; 2855 struct pci_dev_incoming *dev_message; 2856 struct hv_pci_dev *hpdev; 2857 unsigned long flags; 2858 2859 buffer = kmalloc(bufferlen, GFP_ATOMIC); 2860 if (!buffer) 2861 return; 2862 2863 while (1) { 2864 ret = vmbus_recvpacket_raw(chan, buffer, bufferlen, 2865 &bytes_recvd, &req_id); 2866 2867 if (ret == -ENOBUFS) { 2868 kfree(buffer); 2869 /* Handle large packet */ 2870 bufferlen = bytes_recvd; 2871 buffer = kmalloc(bytes_recvd, GFP_ATOMIC); 2872 if (!buffer) 2873 return; 2874 continue; 2875 } 2876 2877 /* Zero length indicates there are no more packets. */ 2878 if (ret || !bytes_recvd) 2879 break; 2880 2881 /* 2882 * All incoming packets must be at least as large as a 2883 * response. 
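 * Packets that fail this check are skipped rather than treated as a
 * fatal error.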
2884 */ 2885 if (bytes_recvd <= sizeof(struct pci_response)) 2886 continue; 2887 desc = (struct vmpacket_descriptor *)buffer; 2888 2889 switch (desc->type) { 2890 case VM_PKT_COMP: 2891 2892 lock_requestor(chan, flags); 2893 req_addr = __vmbus_request_addr_match(chan, req_id, 2894 VMBUS_RQST_ADDR_ANY); 2895 if (req_addr == VMBUS_RQST_ERROR) { 2896 unlock_requestor(chan, flags); 2897 dev_err(&hbus->hdev->device, 2898 "Invalid transaction ID %llx\n", 2899 req_id); 2900 break; 2901 } 2902 comp_packet = (struct pci_packet *)req_addr; 2903 response = (struct pci_response *)buffer; 2904 /* 2905 * Call ->completion_func() within the critical section to make 2906 * sure that the packet pointer is still valid during the call: 2907 * here 'valid' means that there's a task still waiting for the 2908 * completion, and that the packet data is still on the waiting 2909 * task's stack. Cf. hv_compose_msi_msg(). 2910 */ 2911 comp_packet->completion_func(comp_packet->compl_ctxt, 2912 response, 2913 bytes_recvd); 2914 unlock_requestor(chan, flags); 2915 break; 2916 2917 case VM_PKT_DATA_INBAND: 2918 2919 new_message = (struct pci_incoming_message *)buffer; 2920 switch (new_message->message_type.type) { 2921 case PCI_BUS_RELATIONS: 2922 2923 bus_rel = (struct pci_bus_relations *)buffer; 2924 if (bytes_recvd < sizeof(*bus_rel) || 2925 bytes_recvd < 2926 struct_size(bus_rel, func, 2927 bus_rel->device_count)) { 2928 dev_err(&hbus->hdev->device, 2929 "bus relations too small\n"); 2930 break; 2931 } 2932 2933 hv_pci_devices_present(hbus, bus_rel); 2934 break; 2935 2936 case PCI_BUS_RELATIONS2: 2937 2938 bus_rel2 = (struct pci_bus_relations2 *)buffer; 2939 if (bytes_recvd < sizeof(*bus_rel2) || 2940 bytes_recvd < 2941 struct_size(bus_rel2, func, 2942 bus_rel2->device_count)) { 2943 dev_err(&hbus->hdev->device, 2944 "bus relations v2 too small\n"); 2945 break; 2946 } 2947 2948 hv_pci_devices_present2(hbus, bus_rel2); 2949 break; 2950 2951 case PCI_EJECT: 2952 2953 dev_message = (struct pci_dev_incoming *)buffer; 2954 if (bytes_recvd < sizeof(*dev_message)) { 2955 dev_err(&hbus->hdev->device, 2956 "eject message too small\n"); 2957 break; 2958 } 2959 hpdev = get_pcichild_wslot(hbus, 2960 dev_message->wslot.slot); 2961 if (hpdev) { 2962 hv_pci_eject_device(hpdev); 2963 put_pcichild(hpdev); 2964 } 2965 break; 2966 2967 case PCI_INVALIDATE_BLOCK: 2968 2969 inval = (struct pci_dev_inval_block *)buffer; 2970 if (bytes_recvd < sizeof(*inval)) { 2971 dev_err(&hbus->hdev->device, 2972 "invalidate message too small\n"); 2973 break; 2974 } 2975 hpdev = get_pcichild_wslot(hbus, 2976 inval->wslot.slot); 2977 if (hpdev) { 2978 if (hpdev->block_invalidate) { 2979 hpdev->block_invalidate( 2980 hpdev->invalidate_context, 2981 inval->block_mask); 2982 } 2983 put_pcichild(hpdev); 2984 } 2985 break; 2986 2987 default: 2988 dev_warn(&hbus->hdev->device, 2989 "Unimplemented protocol message %x\n", 2990 new_message->message_type.type); 2991 break; 2992 } 2993 break; 2994 2995 default: 2996 dev_err(&hbus->hdev->device, 2997 "unhandled packet type %d, tid %llx len %d\n", 2998 desc->type, req_id, bytes_recvd); 2999 break; 3000 } 3001 } 3002 3003 kfree(buffer); 3004 } 3005 3006 /** 3007 * hv_pci_protocol_negotiation() - Set up protocol 3008 * @hdev: VMBus's tracking struct for this root PCI bus. 3009 * @version: Array of supported channel protocol versions in 3010 * the order of probing - highest go first. 3011 * @num_version: Number of elements in the version array. 
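 *
 * Versions are offered to the host one at a time, highest first; a
 * host that rejects an offer with STATUS_REVISION_MISMATCH is simply
 * offered the next (lower) entry in @version.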
3012 * 3013 * This driver is intended to support running on Windows 10 3014 * (server) and later versions. It will not run on earlier 3015 * versions, as they assume that many of the operations which 3016 * Linux needs accomplished with a spinlock held were done via 3017 * asynchronous messaging via VMBus. Windows 10 increases the 3018 * surface area of PCI emulation so that these actions can take 3019 * place by suspending a virtual processor for their duration. 3020 * 3021 * This function negotiates the channel protocol version, 3022 * failing if the host doesn't support the necessary protocol 3023 * level. 3024 */ 3025 static int hv_pci_protocol_negotiation(struct hv_device *hdev, 3026 enum pci_protocol_version_t version[], 3027 int num_version) 3028 { 3029 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3030 struct pci_version_request *version_req; 3031 struct hv_pci_compl comp_pkt; 3032 struct pci_packet *pkt; 3033 int ret; 3034 int i; 3035 3036 /* 3037 * Initiate the handshake with the host and negotiate 3038 * a version that the host can support. We start with the 3039 * highest version number and go down if the host cannot 3040 * support it. 3041 */ 3042 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL); 3043 if (!pkt) 3044 return -ENOMEM; 3045 3046 init_completion(&comp_pkt.host_event); 3047 pkt->completion_func = hv_pci_generic_compl; 3048 pkt->compl_ctxt = &comp_pkt; 3049 version_req = (struct pci_version_request *)&pkt->message; 3050 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; 3051 3052 for (i = 0; i < num_version; i++) { 3053 version_req->protocol_version = version[i]; 3054 ret = vmbus_sendpacket(hdev->channel, version_req, 3055 sizeof(struct pci_version_request), 3056 (unsigned long)pkt, VM_PKT_DATA_INBAND, 3057 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3058 if (!ret) 3059 ret = wait_for_response(hdev, &comp_pkt.host_event); 3060 3061 if (ret) { 3062 dev_err(&hdev->device, 3063 "PCI Pass-through VSP failed to request version: %d", 3064 ret); 3065 goto exit; 3066 } 3067 3068 if (comp_pkt.completion_status >= 0) { 3069 hbus->protocol_version = version[i]; 3070 dev_info(&hdev->device, 3071 "PCI VMBus probing: Using version %#x\n", 3072 hbus->protocol_version); 3073 goto exit; 3074 } 3075 3076 if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) { 3077 dev_err(&hdev->device, 3078 "PCI Pass-through VSP failed version request: %#x", 3079 comp_pkt.completion_status); 3080 ret = -EPROTO; 3081 goto exit; 3082 } 3083 3084 reinit_completion(&comp_pkt.host_event); 3085 } 3086 3087 dev_err(&hdev->device, 3088 "PCI pass-through VSP failed to find supported version"); 3089 ret = -EPROTO; 3090 3091 exit: 3092 kfree(pkt); 3093 return ret; 3094 } 3095 3096 /** 3097 * hv_pci_free_bridge_windows() - Release memory regions for the 3098 * bus 3099 * @hbus: Root PCI bus, as understood by this driver 3100 */ 3101 static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus) 3102 { 3103 /* 3104 * Set the resources back to the way they looked when they 3105 * were allocated by setting IORESOURCE_BUSY again. 
3106 */ 3107 3108 if (hbus->low_mmio_space && hbus->low_mmio_res) { 3109 hbus->low_mmio_res->flags |= IORESOURCE_BUSY; 3110 vmbus_free_mmio(hbus->low_mmio_res->start, 3111 resource_size(hbus->low_mmio_res)); 3112 } 3113 3114 if (hbus->high_mmio_space && hbus->high_mmio_res) { 3115 hbus->high_mmio_res->flags |= IORESOURCE_BUSY; 3116 vmbus_free_mmio(hbus->high_mmio_res->start, 3117 resource_size(hbus->high_mmio_res)); 3118 } 3119 } 3120 3121 /** 3122 * hv_pci_allocate_bridge_windows() - Allocate memory regions 3123 * for the bus 3124 * @hbus: Root PCI bus, as understood by this driver 3125 * 3126 * This function calls vmbus_allocate_mmio(), which is itself a 3127 * bit of a compromise. Ideally, we might change the pnp layer 3128 * in the kernel such that it comprehends either PCI devices 3129 * which are "grandchildren of ACPI," with some intermediate bus 3130 * node (in this case, VMBus) or change it such that it 3131 * understands VMBus. The pnp layer, however, has been declared 3132 * deprecated, and not subject to change. 3133 * 3134 * The workaround, implemented here, is to ask VMBus to allocate 3135 * MMIO space for this bus. VMBus itself knows which ranges are 3136 * appropriate by looking at its own ACPI objects. Then, after 3137 * these ranges are claimed, they're modified to look like they 3138 * would have looked if the ACPI and pnp code had allocated 3139 * bridge windows. These descriptors have to exist in this form 3140 * in order to satisfy the code which will get invoked when the 3141 * endpoint PCI function driver calls request_mem_region() or 3142 * request_mem_region_exclusive(). 3143 * 3144 * Return: 0 on success, -errno on failure 3145 */ 3146 static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus) 3147 { 3148 resource_size_t align; 3149 int ret; 3150 3151 if (hbus->low_mmio_space) { 3152 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 3153 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0, 3154 (u64)(u32)0xffffffff, 3155 hbus->low_mmio_space, 3156 align, false); 3157 if (ret) { 3158 dev_err(&hbus->hdev->device, 3159 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n", 3160 hbus->low_mmio_space); 3161 return ret; 3162 } 3163 3164 /* Modify this resource to become a bridge window. */ 3165 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW; 3166 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY; 3167 pci_add_resource(&hbus->bridge->windows, hbus->low_mmio_res); 3168 } 3169 3170 if (hbus->high_mmio_space) { 3171 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space)); 3172 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev, 3173 0x100000000, -1, 3174 hbus->high_mmio_space, align, 3175 false); 3176 if (ret) { 3177 dev_err(&hbus->hdev->device, 3178 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n", 3179 hbus->high_mmio_space); 3180 goto release_low_mmio; 3181 } 3182 3183 /* Modify this resource to become a bridge window. 
*/ 3184 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW; 3185 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY; 3186 pci_add_resource(&hbus->bridge->windows, hbus->high_mmio_res); 3187 } 3188 3189 return 0; 3190 3191 release_low_mmio: 3192 if (hbus->low_mmio_res) { 3193 vmbus_free_mmio(hbus->low_mmio_res->start, 3194 resource_size(hbus->low_mmio_res)); 3195 } 3196 3197 return ret; 3198 } 3199 3200 /** 3201 * hv_allocate_config_window() - Find MMIO space for PCI Config 3202 * @hbus: Root PCI bus, as understood by this driver 3203 * 3204 * This function claims memory-mapped I/O space for accessing 3205 * configuration space for the functions on this bus. 3206 * 3207 * Return: 0 on success, -errno on failure 3208 */ 3209 static int hv_allocate_config_window(struct hv_pcibus_device *hbus) 3210 { 3211 int ret; 3212 3213 /* 3214 * Set up a region of MMIO space to use for accessing configuration 3215 * space. 3216 */ 3217 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1, 3218 PCI_CONFIG_MMIO_LENGTH, 0x1000, false); 3219 if (ret) 3220 return ret; 3221 3222 /* 3223 * vmbus_allocate_mmio() gets used for allocating both device endpoint 3224 * resource claims (those which cannot be overlapped) and the ranges 3225 * which are valid for the children of this bus, which are intended 3226 * to be overlapped by those children. Set the flag on this claim 3227 * meaning that this region can't be overlapped. 3228 */ 3229 3230 hbus->mem_config->flags |= IORESOURCE_BUSY; 3231 3232 return 0; 3233 } 3234 3235 static void hv_free_config_window(struct hv_pcibus_device *hbus) 3236 { 3237 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); 3238 } 3239 3240 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs); 3241 3242 /** 3243 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state 3244 * @hdev: VMBus's tracking struct for this root PCI bus 3245 * 3246 * Return: 0 on success, -errno on failure 3247 */ 3248 static int hv_pci_enter_d0(struct hv_device *hdev) 3249 { 3250 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3251 struct pci_bus_d0_entry *d0_entry; 3252 struct hv_pci_compl comp_pkt; 3253 struct pci_packet *pkt; 3254 int ret; 3255 3256 /* 3257 * Tell the host that the bus is ready to use, and moved into the 3258 * powered-on state. This includes telling the host which region 3259 * of memory-mapped I/O space has been chosen for configuration space 3260 * access. 
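 * The region reported in mmio_base below is the one claimed earlier by
 * hv_allocate_config_window().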
3261 */ 3262 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL); 3263 if (!pkt) 3264 return -ENOMEM; 3265 3266 init_completion(&comp_pkt.host_event); 3267 pkt->completion_func = hv_pci_generic_compl; 3268 pkt->compl_ctxt = &comp_pkt; 3269 d0_entry = (struct pci_bus_d0_entry *)&pkt->message; 3270 d0_entry->message_type.type = PCI_BUS_D0ENTRY; 3271 d0_entry->mmio_base = hbus->mem_config->start; 3272 3273 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry), 3274 (unsigned long)pkt, VM_PKT_DATA_INBAND, 3275 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3276 if (!ret) 3277 ret = wait_for_response(hdev, &comp_pkt.host_event); 3278 3279 if (ret) 3280 goto exit; 3281 3282 if (comp_pkt.completion_status < 0) { 3283 dev_err(&hdev->device, 3284 "PCI Pass-through VSP failed D0 Entry with status %x\n", 3285 comp_pkt.completion_status); 3286 ret = -EPROTO; 3287 goto exit; 3288 } 3289 3290 ret = 0; 3291 3292 exit: 3293 kfree(pkt); 3294 return ret; 3295 } 3296 3297 /** 3298 * hv_pci_query_relations() - Ask host to send list of child 3299 * devices 3300 * @hdev: VMBus's tracking struct for this root PCI bus 3301 * 3302 * Return: 0 on success, -errno on failure 3303 */ 3304 static int hv_pci_query_relations(struct hv_device *hdev) 3305 { 3306 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3307 struct pci_message message; 3308 struct completion comp; 3309 int ret; 3310 3311 /* Ask the host to send along the list of child devices */ 3312 init_completion(&comp); 3313 if (cmpxchg(&hbus->survey_event, NULL, &comp)) 3314 return -ENOTEMPTY; 3315 3316 memset(&message, 0, sizeof(message)); 3317 message.type = PCI_QUERY_BUS_RELATIONS; 3318 3319 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message), 3320 0, VM_PKT_DATA_INBAND, 0); 3321 if (!ret) 3322 ret = wait_for_response(hdev, &comp); 3323 3324 return ret; 3325 } 3326 3327 /** 3328 * hv_send_resources_allocated() - Report local resource choices 3329 * @hdev: VMBus's tracking struct for this root PCI bus 3330 * 3331 * The host OS is expecting to be sent a request as a message 3332 * which contains all the resources that the device will use. 3333 * The response contains those same resources, "translated" 3334 * which is to say, the values which should be used by the 3335 * hardware, when it delivers an interrupt. (MMIO resources are 3336 * used in local terms.) This is nice for Windows, and lines up 3337 * with the FDO/PDO split, which doesn't exist in Linux. Linux 3338 * is deeply expecting to scan an emulated PCI configuration 3339 * space. So this message is sent here only to drive the state 3340 * machine on the host forward. 3341 * 3342 * Return: 0 on success, -errno on failure 3343 */ 3344 static int hv_send_resources_allocated(struct hv_device *hdev) 3345 { 3346 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3347 struct pci_resources_assigned *res_assigned; 3348 struct pci_resources_assigned2 *res_assigned2; 3349 struct hv_pci_compl comp_pkt; 3350 struct hv_pci_dev *hpdev; 3351 struct pci_packet *pkt; 3352 size_t size_res; 3353 int wslot; 3354 int ret; 3355 3356 size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) 3357 ? 
sizeof(*res_assigned) : sizeof(*res_assigned2); 3358 3359 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL); 3360 if (!pkt) 3361 return -ENOMEM; 3362 3363 ret = 0; 3364 3365 for (wslot = 0; wslot < 256; wslot++) { 3366 hpdev = get_pcichild_wslot(hbus, wslot); 3367 if (!hpdev) 3368 continue; 3369 3370 memset(pkt, 0, sizeof(*pkt) + size_res); 3371 init_completion(&comp_pkt.host_event); 3372 pkt->completion_func = hv_pci_generic_compl; 3373 pkt->compl_ctxt = &comp_pkt; 3374 3375 if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) { 3376 res_assigned = 3377 (struct pci_resources_assigned *)&pkt->message; 3378 res_assigned->message_type.type = 3379 PCI_RESOURCES_ASSIGNED; 3380 res_assigned->wslot.slot = hpdev->desc.win_slot.slot; 3381 } else { 3382 res_assigned2 = 3383 (struct pci_resources_assigned2 *)&pkt->message; 3384 res_assigned2->message_type.type = 3385 PCI_RESOURCES_ASSIGNED2; 3386 res_assigned2->wslot.slot = hpdev->desc.win_slot.slot; 3387 } 3388 put_pcichild(hpdev); 3389 3390 ret = vmbus_sendpacket(hdev->channel, &pkt->message, 3391 size_res, (unsigned long)pkt, 3392 VM_PKT_DATA_INBAND, 3393 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3394 if (!ret) 3395 ret = wait_for_response(hdev, &comp_pkt.host_event); 3396 if (ret) 3397 break; 3398 3399 if (comp_pkt.completion_status < 0) { 3400 ret = -EPROTO; 3401 dev_err(&hdev->device, 3402 "resource allocated returned 0x%x", 3403 comp_pkt.completion_status); 3404 break; 3405 } 3406 3407 hbus->wslot_res_allocated = wslot; 3408 } 3409 3410 kfree(pkt); 3411 return ret; 3412 } 3413 3414 /** 3415 * hv_send_resources_released() - Report local resources 3416 * released 3417 * @hdev: VMBus's tracking struct for this root PCI bus 3418 * 3419 * Return: 0 on success, -errno on failure 3420 */ 3421 static int hv_send_resources_released(struct hv_device *hdev) 3422 { 3423 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3424 struct pci_child_message pkt; 3425 struct hv_pci_dev *hpdev; 3426 int wslot; 3427 int ret; 3428 3429 for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) { 3430 hpdev = get_pcichild_wslot(hbus, wslot); 3431 if (!hpdev) 3432 continue; 3433 3434 memset(&pkt, 0, sizeof(pkt)); 3435 pkt.message_type.type = PCI_RESOURCES_RELEASED; 3436 pkt.wslot.slot = hpdev->desc.win_slot.slot; 3437 3438 put_pcichild(hpdev); 3439 3440 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0, 3441 VM_PKT_DATA_INBAND, 0); 3442 if (ret) 3443 return ret; 3444 3445 hbus->wslot_res_allocated = wslot - 1; 3446 } 3447 3448 hbus->wslot_res_allocated = -1; 3449 3450 return 0; 3451 } 3452 3453 #define HVPCI_DOM_MAP_SIZE (64 * 1024) 3454 static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); 3455 3456 /* 3457 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 3458 * as invalid for passthrough PCI devices of this driver. 3459 */ 3460 #define HVPCI_DOM_INVALID 0 3461 3462 /** 3463 * hv_get_dom_num() - Get a valid PCI domain number 3464 * Check if the PCI domain number is in use, and return another number if 3465 * it is in use. 
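 * For example (hypothetical numbers): if domain 0x1234 is requested but
 * already marked in hvpci_dom_map, the first free bit in the map is
 * claimed and returned instead.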
3466 * 3467 * @dom: Requested domain number 3468 * 3469 * return: domain number on success, HVPCI_DOM_INVALID on failure 3470 */ 3471 static u16 hv_get_dom_num(u16 dom) 3472 { 3473 unsigned int i; 3474 3475 if (test_and_set_bit(dom, hvpci_dom_map) == 0) 3476 return dom; 3477 3478 for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { 3479 if (test_and_set_bit(i, hvpci_dom_map) == 0) 3480 return i; 3481 } 3482 3483 return HVPCI_DOM_INVALID; 3484 } 3485 3486 /** 3487 * hv_put_dom_num() - Mark the PCI domain number as free 3488 * @dom: Domain number to be freed 3489 */ 3490 static void hv_put_dom_num(u16 dom) 3491 { 3492 clear_bit(dom, hvpci_dom_map); 3493 } 3494 3495 /** 3496 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus 3497 * @hdev: VMBus's tracking struct for this root PCI bus 3498 * @dev_id: Identifies the device itself 3499 * 3500 * Return: 0 on success, -errno on failure 3501 */ 3502 static int hv_pci_probe(struct hv_device *hdev, 3503 const struct hv_vmbus_device_id *dev_id) 3504 { 3505 struct pci_host_bridge *bridge; 3506 struct hv_pcibus_device *hbus; 3507 u16 dom_req, dom; 3508 char *name; 3509 bool enter_d0_retry = true; 3510 int ret; 3511 3512 /* 3513 * hv_pcibus_device contains the hypercall arguments for retargeting in 3514 * hv_irq_unmask(). Those must not cross a page boundary. 3515 */ 3516 BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE); 3517 3518 bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); 3519 if (!bridge) 3520 return -ENOMEM; 3521 3522 /* 3523 * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural 3524 * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate 3525 * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and 3526 * alignment of hbus is important because hbus's field 3527 * retarget_msi_interrupt_params must not cross a 4KB page boundary. 3528 * 3529 * Here we prefer kzalloc to get_zeroed_page(), because a buffer 3530 * allocated by the latter is not tracked and scanned by kmemleak, and 3531 * hence kmemleak reports the pointer contained in the hbus buffer 3532 * (i.e. the hpdev struct, which is created in new_pcichild_device() and 3533 * is tracked by hbus->children) as memory leak (false positive). 3534 * 3535 * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be 3536 * used to allocate the hbus buffer and we can avoid the kmemleak false 3537 * positive by using kmemleak_alloc() and kmemleak_free() to ask 3538 * kmemleak to track and scan the hbus buffer. 3539 */ 3540 hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); 3541 if (!hbus) 3542 return -ENOMEM; 3543 3544 hbus->bridge = bridge; 3545 hbus->state = hv_pcibus_init; 3546 hbus->wslot_res_allocated = -1; 3547 3548 /* 3549 * The PCI bus "domain" is what is called "segment" in ACPI and other 3550 * specs. Pull it from the instance ID, to get something usually 3551 * unique. In rare cases of collision, we will find out another number 3552 * not in use. 3553 * 3554 * Note that, since this code only runs in a Hyper-V VM, Hyper-V 3555 * together with this guest driver can guarantee that (1) The only 3556 * domain used by Gen1 VMs for something that looks like a physical 3557 * PCI bus (which is actually emulated by the hypervisor) is domain 0. 3558 * (2) There will be no overlap between domains (after fixing possible 3559 * collisions) in the same VM. 
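 *
 * As an illustration (made-up instance ID bytes): with
 * dev_instance.b[4] == 0x34 and dev_instance.b[5] == 0x12, dom_req
 * below is 0x1234; if that number is already taken, hv_get_dom_num()
 * hands back a different free domain instead.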
3560 */ 3561 dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; 3562 dom = hv_get_dom_num(dom_req); 3563 3564 if (dom == HVPCI_DOM_INVALID) { 3565 dev_err(&hdev->device, 3566 "Unable to use dom# 0x%x or other numbers", dom_req); 3567 ret = -EINVAL; 3568 goto free_bus; 3569 } 3570 3571 if (dom != dom_req) 3572 dev_info(&hdev->device, 3573 "PCI dom# 0x%x has collision, using 0x%x", 3574 dom_req, dom); 3575 3576 hbus->bridge->domain_nr = dom; 3577 #ifdef CONFIG_X86 3578 hbus->sysdata.domain = dom; 3579 #elif defined(CONFIG_ARM64) 3580 /* 3581 * Set the PCI bus parent to be the corresponding VMbus 3582 * device. Then the VMbus device will be assigned as the 3583 * ACPI companion in pcibios_root_bridge_prepare() and 3584 * pci_dma_configure() will propagate device coherence 3585 * information to devices created on the bus. 3586 */ 3587 hbus->sysdata.parent = hdev->device.parent; 3588 #endif 3589 3590 hbus->hdev = hdev; 3591 INIT_LIST_HEAD(&hbus->children); 3592 INIT_LIST_HEAD(&hbus->dr_list); 3593 spin_lock_init(&hbus->config_lock); 3594 spin_lock_init(&hbus->device_list_lock); 3595 spin_lock_init(&hbus->retarget_msi_interrupt_lock); 3596 hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0, 3597 hbus->bridge->domain_nr); 3598 if (!hbus->wq) { 3599 ret = -ENOMEM; 3600 goto free_dom; 3601 } 3602 3603 hdev->channel->next_request_id_callback = vmbus_next_request_id; 3604 hdev->channel->request_addr_callback = vmbus_request_addr; 3605 hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE; 3606 3607 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 3608 hv_pci_onchannelcallback, hbus); 3609 if (ret) 3610 goto destroy_wq; 3611 3612 hv_set_drvdata(hdev, hbus); 3613 3614 ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions, 3615 ARRAY_SIZE(pci_protocol_versions)); 3616 if (ret) 3617 goto close; 3618 3619 ret = hv_allocate_config_window(hbus); 3620 if (ret) 3621 goto close; 3622 3623 hbus->cfg_addr = ioremap(hbus->mem_config->start, 3624 PCI_CONFIG_MMIO_LENGTH); 3625 if (!hbus->cfg_addr) { 3626 dev_err(&hdev->device, 3627 "Unable to map a virtual address for config space\n"); 3628 ret = -ENOMEM; 3629 goto free_config; 3630 } 3631 3632 name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance); 3633 if (!name) { 3634 ret = -ENOMEM; 3635 goto unmap; 3636 } 3637 3638 hbus->fwnode = irq_domain_alloc_named_fwnode(name); 3639 kfree(name); 3640 if (!hbus->fwnode) { 3641 ret = -ENOMEM; 3642 goto unmap; 3643 } 3644 3645 ret = hv_pcie_init_irq_domain(hbus); 3646 if (ret) 3647 goto free_fwnode; 3648 3649 retry: 3650 ret = hv_pci_query_relations(hdev); 3651 if (ret) 3652 goto free_irq_domain; 3653 3654 ret = hv_pci_enter_d0(hdev); 3655 /* 3656 * In certain case (Kdump) the pci device of interest was 3657 * not cleanly shut down and resource is still held on host 3658 * side, the host could return invalid device status. 3659 * We need to explicitly request host to release the resource 3660 * and try to enter D0 again. 3661 * Since the hv_pci_bus_exit() call releases structures 3662 * of all its child devices, we need to start the retry from 3663 * hv_pci_query_relations() call, requesting host to send 3664 * the synchronous child device relations message before this 3665 * information is needed in hv_send_resources_allocated() 3666 * call later. 
retry:
	ret = hv_pci_query_relations(hdev);
	if (ret)
		goto free_irq_domain;

	ret = hv_pci_enter_d0(hdev);
	/*
	 * In certain cases (kdump) the PCI device of interest was not
	 * cleanly shut down and its resources are still held on the host
	 * side, so the host may return an invalid device status.
	 * We need to explicitly request the host to release the resources
	 * and try to enter D0 again.
	 * Since the hv_pci_bus_exit() call releases the structures of all
	 * the child devices, we need to restart from the
	 * hv_pci_query_relations() call, requesting the host to send the
	 * synchronous child device relations message before this
	 * information is needed by the later hv_send_resources_allocated()
	 * call.
	 */
	if (ret == -EPROTO && enter_d0_retry) {
		enter_d0_retry = false;

		dev_err(&hdev->device, "Retrying D0 Entry\n");

		/*
		 * hv_pci_bus_exit() calls hv_send_resources_released()
		 * to free up the resources of its child devices.
		 * In the kdump kernel we need to set wslot_res_allocated
		 * to 255 so that it scans all child devices to release the
		 * resources allocated in the normal kernel before the panic
		 * happened.
		 */
		hbus->wslot_res_allocated = 255;
		ret = hv_pci_bus_exit(hdev, true);

		if (ret == 0)
			goto retry;

		dev_err(&hdev->device,
			"Retrying D0 failed with ret %d\n", ret);
	}
	if (ret)
		goto free_irq_domain;

	ret = hv_pci_allocate_bridge_windows(hbus);
	if (ret)
		goto exit_d0;

	ret = hv_send_resources_allocated(hdev);
	if (ret)
		goto free_windows;

	prepopulate_bars(hbus);

	hbus->state = hv_pcibus_probed;

	ret = create_root_hv_pci_bus(hbus);
	if (ret)
		goto free_windows;

	return 0;

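/*
 * Error unwind: each label below releases one of the resources acquired
 * above. A failure jumps to the label matching how far setup got, and
 * execution falls through the remaining cleanups in reverse order of
 * acquisition.
 */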
free_windows:
	hv_pci_free_bridge_windows(hbus);
exit_d0:
	(void) hv_pci_bus_exit(hdev, true);
free_irq_domain:
	irq_domain_remove(hbus->irq_domain);
free_fwnode:
	irq_domain_free_fwnode(hbus->fwnode);
unmap:
	iounmap(hbus->cfg_addr);
free_config:
	hv_free_config_window(hbus);
close:
	vmbus_close(hdev->channel);
destroy_wq:
	destroy_workqueue(hbus->wq);
free_dom:
	hv_put_dom_num(hbus->bridge->domain_nr);
free_bus:
	kfree(hbus);
	return ret;
}

static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	struct vmbus_channel *chan = hdev->channel;
	struct {
		struct pci_packet teardown_packet;
		u8 buffer[sizeof(struct pci_message)];
	} pkt;
	struct hv_pci_compl comp_pkt;
	struct hv_pci_dev *hpdev, *tmp;
	unsigned long flags;
	u64 trans_id;
	int ret;

	/*
	 * After the host sends the RESCIND_CHANNEL message, it doesn't
	 * access the per-channel ringbuffer any longer.
	 */
	if (chan->rescind)
		return 0;

	if (!keep_devs) {
		struct list_head removed;

		/* Move all present children to a list on the stack */
		INIT_LIST_HEAD(&removed);
		spin_lock_irqsave(&hbus->device_list_lock, flags);
		list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry)
			list_move_tail(&hpdev->list_entry, &removed);
		spin_unlock_irqrestore(&hbus->device_list_lock, flags);

		/* Remove all children in the list */
		list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) {
			list_del(&hpdev->list_entry);
			if (hpdev->pci_slot)
				pci_destroy_slot(hpdev->pci_slot);
			/* Drop the two references taken in new_pcichild_device() */
			put_pcichild(hpdev);
			put_pcichild(hpdev);
		}
	}

	ret = hv_send_resources_released(hdev);
	if (ret) {
		dev_err(&hdev->device,
			"Couldn't send resources released packet(s)\n");
		return ret;
	}

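	/*
	 * Ask the host to take the bus out of D0 (PCI_BUS_D0EXIT). The
	 * completion callback hv_pci_generic_compl() signals
	 * comp_pkt.host_event once the host's reply arrives.
	 */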
	memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
	init_completion(&comp_pkt.host_event);
	pkt.teardown_packet.completion_func = hv_pci_generic_compl;
	pkt.teardown_packet.compl_ctxt = &comp_pkt;
	pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;

	ret = vmbus_sendpacket_getid(chan, &pkt.teardown_packet.message,
				     sizeof(struct pci_message),
				     (unsigned long)&pkt.teardown_packet,
				     &trans_id, VM_PKT_DATA_INBAND,
				     VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret)
		return ret;

	if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) {
		/*
		 * The completion packet on the stack becomes invalid after
		 * 'return'; remove the ID from the VMbus requestor if the
		 * identifier is still mapped to/associated with the packet.
		 *
		 * Cf. hv_pci_onchannelcallback().
		 */
		vmbus_request_addr_match(chan, trans_id,
					 (unsigned long)&pkt.teardown_packet);
		return -ETIMEDOUT;
	}

	return 0;
}

/**
 * hv_pci_remove() - Remove routine for this VMBus channel
 * @hdev: VMBus's tracking struct for this root PCI bus
 *
 * Return: 0 on success, -errno on failure
 */
static int hv_pci_remove(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus;
	int ret;

	hbus = hv_get_drvdata(hdev);
	if (hbus->state == hv_pcibus_installed) {
		tasklet_disable(&hdev->channel->callback_event);
		hbus->state = hv_pcibus_removing;
		tasklet_enable(&hdev->channel->callback_event);
		destroy_workqueue(hbus->wq);
		hbus->wq = NULL;
		/*
		 * At this point, no work is running or can be scheduled
		 * on hbus->wq. We can't race with hv_pci_devices_present()
		 * or hv_pci_eject_device(), so it's safe to proceed.
		 */

		/* Remove the bus from PCI's point of view. */
		pci_lock_rescan_remove();
		pci_stop_root_bus(hbus->bridge->bus);
		hv_pci_remove_slots(hbus);
		pci_remove_root_bus(hbus->bridge->bus);
		pci_unlock_rescan_remove();
	}

	ret = hv_pci_bus_exit(hdev, false);

	vmbus_close(hdev->channel);

	iounmap(hbus->cfg_addr);
	hv_free_config_window(hbus);
	hv_pci_free_bridge_windows(hbus);
	irq_domain_remove(hbus->irq_domain);
	irq_domain_free_fwnode(hbus->fwnode);

	hv_put_dom_num(hbus->bridge->domain_nr);

	kfree(hbus);
	return ret;
}

static int hv_pci_suspend(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	enum hv_pcibus_state old_state;
	int ret;

	/*
	 * hv_pci_suspend() must make sure there are no pending work items
	 * before calling vmbus_close(), since it runs in a process context
	 * as a callback in dpm_suspend(). When it starts to run, the channel
	 * callback hv_pci_onchannelcallback(), which runs in a tasklet
	 * context, can still be running concurrently and scheduling new work
	 * items onto hbus->wq in hv_pci_devices_present() and
	 * hv_pci_eject_device(), and the work item handlers can access the
	 * vmbus channel, which may be in the process of being closed by
	 * hv_pci_suspend(), e.g. the work item handler
	 * pci_devices_present_work() -> new_pcichild_device() writes to the
	 * vmbus channel.
	 *
	 * To eliminate the race, hv_pci_suspend() disables the channel
	 * callback tasklet, sets hbus->state to hv_pcibus_removing, and
	 * re-enables the tasklet. This way, when hv_pci_suspend() proceeds,
	 * it knows that no new work item can be scheduled, and then it
	 * flushes hbus->wq and safely closes the vmbus channel.
	 */
	tasklet_disable(&hdev->channel->callback_event);

	/* Change the hbus state to prevent new work items. */
	old_state = hbus->state;
	if (hbus->state == hv_pcibus_installed)
		hbus->state = hv_pcibus_removing;

	tasklet_enable(&hdev->channel->callback_event);

	if (old_state != hv_pcibus_installed)
		return -EINVAL;

	flush_workqueue(hbus->wq);

	ret = hv_pci_bus_exit(hdev, true);
	if (ret)
		return ret;

	vmbus_close(hdev->channel);

	return 0;
}

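/*
 * pci_walk_bus() callback used on resume: re-compose the MSI/MSI-X message
 * for every interrupt associated with the device, so that the hypervisor's
 * interrupt-remapping state is rebuilt (see the comment before
 * hv_pci_restore_msi_state() below).
 */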
static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
{
	struct irq_data *irq_data;
	struct msi_desc *entry;
	int ret = 0;

	msi_lock_descs(&pdev->dev);
	msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) {
		irq_data = irq_get_irq_data(entry->irq);
		if (WARN_ON_ONCE(!irq_data)) {
			ret = -EINVAL;
			break;
		}

		hv_compose_msi_msg(irq_data, &entry->msg);
	}
	msi_unlock_descs(&pdev->dev);

	return ret;
}

/*
 * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg()
 * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V
 * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg()
 * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping
 * Table entries.
 */
static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus)
{
	pci_walk_bus(hbus->bridge->bus, hv_pci_restore_msi_msg, NULL);
}

static int hv_pci_resume(struct hv_device *hdev)
{
	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
	enum pci_protocol_version_t version[1];
	int ret;

	hbus->state = hv_pcibus_init;

	hdev->channel->next_request_id_callback = vmbus_next_request_id;
	hdev->channel->request_addr_callback = vmbus_request_addr;
	hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE;

	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
			 hv_pci_onchannelcallback, hbus);
	if (ret)
		return ret;

	/* Only use the version that was in use before hibernation. */
	version[0] = hbus->protocol_version;
	ret = hv_pci_protocol_negotiation(hdev, version, 1);
	if (ret)
		goto out;

	ret = hv_pci_query_relations(hdev);
	if (ret)
		goto out;

	ret = hv_pci_enter_d0(hdev);
	if (ret)
		goto out;

	ret = hv_send_resources_allocated(hdev);
	if (ret)
		goto out;

	prepopulate_bars(hbus);

	hv_pci_restore_msi_state(hbus);

	hbus->state = hv_pcibus_installed;
	return 0;
out:
	vmbus_close(hdev->channel);
	return ret;
}

static const struct hv_vmbus_device_id hv_pci_id_table[] = {
	/* PCI Pass-through Class ID */
	/* 44C4F61D-4444-4400-9D52-802E27EDE19F */
	{ HV_PCIE_GUID, },
	{ },
};

MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);

static struct hv_driver hv_pci_drv = {
	.name = "hv_pci",
	.id_table = hv_pci_id_table,
	.probe = hv_pci_probe,
	.remove = hv_pci_remove,
	.suspend = hv_pci_suspend,
	.resume = hv_pci_resume,
};

static void __exit exit_hv_pci_drv(void)
{
	vmbus_driver_unregister(&hv_pci_drv);

	hvpci_block_ops.read_block = NULL;
	hvpci_block_ops.write_block = NULL;
	hvpci_block_ops.reg_blk_invalidate = NULL;
}

static int __init init_hv_pci_drv(void)
{
	int ret;

	if (!hv_is_hyperv_initialized())
		return -ENODEV;

	ret = hv_pci_irqchip_init();
	if (ret)
		return ret;

	/* Set the invalid domain number's bit, so it will not be used */
	set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);

	/* Initialize PCI block r/w interface */
	hvpci_block_ops.read_block = hv_read_config_block;
	hvpci_block_ops.write_block = hv_write_config_block;
	hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;

	return vmbus_driver_register(&hv_pci_drv);
}

module_init(init_hv_pci_drv);
module_exit(exit_hv_pci_drv);

MODULE_DESCRIPTION("Hyper-V PCI");
MODULE_LICENSE("GPL v2");