1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) Microsoft Corporation. 4 * 5 * Author: 6 * Jake Oshins <jakeo@microsoft.com> 7 * 8 * This driver acts as a paravirtual front-end for PCI Express root buses. 9 * When a PCI Express function (either an entire device or an SR-IOV 10 * Virtual Function) is being passed through to the VM, this driver exposes 11 * a new bus to the guest VM. This is modeled as a root PCI bus because 12 * no bridges are being exposed to the VM. In fact, with a "Generation 2" 13 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM 14 * until a device as been exposed using this driver. 15 * 16 * Each root PCI bus has its own PCI domain, which is called "Segment" in 17 * the PCI Firmware Specifications. Thus while each device passed through 18 * to the VM using this front-end will appear at "device 0", the domain will 19 * be unique. Typically, each bus will have one PCI function on it, though 20 * this driver does support more than one. 21 * 22 * In order to map the interrupts from the device through to the guest VM, 23 * this driver also implements an IRQ Domain, which handles interrupts (either 24 * MSI or MSI-X) associated with the functions on the bus. As interrupts are 25 * set up, torn down, or reaffined, this driver communicates with the 26 * underlying hypervisor to adjust the mappings in the I/O MMU so that each 27 * interrupt will be delivered to the correct virtual processor at the right 28 * vector. This driver does not support level-triggered (line-based) 29 * interrupts, and will report that the Interrupt Line register in the 30 * function's configuration space is zero. 31 * 32 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V 33 * facilities. For instance, the configuration space of a function exposed 34 * by Hyper-V is mapped into a single page of memory space, and the 35 * read and write handlers for config space must be aware of this mechanism. 36 * Similarly, device setup and teardown involves messages sent to and from 37 * the PCI back-end driver in Hyper-V. 38 */ 39 40 #include <linux/kernel.h> 41 #include <linux/module.h> 42 #include <linux/pci.h> 43 #include <linux/pci-ecam.h> 44 #include <linux/delay.h> 45 #include <linux/semaphore.h> 46 #include <linux/irq.h> 47 #include <linux/msi.h> 48 #include <linux/hyperv.h> 49 #include <linux/refcount.h> 50 #include <linux/irqdomain.h> 51 #include <linux/acpi.h> 52 #include <linux/sizes.h> 53 #include <asm/mshyperv.h> 54 55 /* 56 * Protocol versions. The low word is the minor version, the high word the 57 * major version. 58 */ 59 60 #define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor))) 61 #define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16) 62 #define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff) 63 64 enum pci_protocol_version_t { 65 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */ 66 PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */ 67 PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3), /* Vibranium */ 68 PCI_PROTOCOL_VERSION_1_4 = PCI_MAKE_VERSION(1, 4), /* WS2022 */ 69 }; 70 71 #define CPU_AFFINITY_ALL -1ULL 72 73 /* 74 * Supported protocol versions in the order of probing - highest go 75 * first. 76 */ 77 static enum pci_protocol_version_t pci_protocol_versions[] = { 78 PCI_PROTOCOL_VERSION_1_4, 79 PCI_PROTOCOL_VERSION_1_3, 80 PCI_PROTOCOL_VERSION_1_2, 81 PCI_PROTOCOL_VERSION_1_1, 82 }; 83 84 #define PCI_CONFIG_MMIO_LENGTH 0x2000 85 #define CFG_PAGE_OFFSET 0x1000 86 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) 87 88 #define MAX_SUPPORTED_MSI_MESSAGES 0x400 89 90 #define STATUS_REVISION_MISMATCH 0xC0000059 91 92 /* space for 32bit serial number as string */ 93 #define SLOT_NAME_SIZE 11 94 95 /* 96 * Size of requestor for VMbus; the value is based on the observation 97 * that having more than one request outstanding is 'rare', and so 64 98 * should be generous in ensuring that we don't ever run out. 99 */ 100 #define HV_PCI_RQSTOR_SIZE 64 101 102 /* 103 * Message Types 104 */ 105 106 enum pci_message_type { 107 /* 108 * Version 1.1 109 */ 110 PCI_MESSAGE_BASE = 0x42490000, 111 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, 112 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, 113 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, 114 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, 115 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, 116 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, 117 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, 118 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, 119 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, 120 PCI_EJECT = PCI_MESSAGE_BASE + 0xB, 121 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, 122 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, 123 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, 124 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, 125 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, 126 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, 127 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, 128 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, 129 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, 130 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, 131 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16, 132 PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17, 133 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */ 134 PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19, 135 PCI_RESOURCES_ASSIGNED3 = PCI_MESSAGE_BASE + 0x1A, 136 PCI_CREATE_INTERRUPT_MESSAGE3 = PCI_MESSAGE_BASE + 0x1B, 137 PCI_MESSAGE_MAXIMUM 138 }; 139 140 /* 141 * Structures defining the virtual PCI Express protocol. 142 */ 143 144 union pci_version { 145 struct { 146 u16 minor_version; 147 u16 major_version; 148 } parts; 149 u32 version; 150 } __packed; 151 152 /* 153 * Function numbers are 8-bits wide on Express, as interpreted through ARI, 154 * which is all this driver does. This representation is the one used in 155 * Windows, which is what is expected when sending this back and forth with 156 * the Hyper-V parent partition. 157 */ 158 union win_slot_encoding { 159 struct { 160 u32 dev:5; 161 u32 func:3; 162 u32 reserved:24; 163 } bits; 164 u32 slot; 165 } __packed; 166 167 /* 168 * Pretty much as defined in the PCI Specifications. 169 */ 170 struct pci_function_description { 171 u16 v_id; /* vendor ID */ 172 u16 d_id; /* device ID */ 173 u8 rev; 174 u8 prog_intf; 175 u8 subclass; 176 u8 base_class; 177 u32 subsystem_id; 178 union win_slot_encoding win_slot; 179 u32 ser; /* serial number */ 180 } __packed; 181 182 enum pci_device_description_flags { 183 HV_PCI_DEVICE_FLAG_NONE = 0x0, 184 HV_PCI_DEVICE_FLAG_NUMA_AFFINITY = 0x1, 185 }; 186 187 struct pci_function_description2 { 188 u16 v_id; /* vendor ID */ 189 u16 d_id; /* device ID */ 190 u8 rev; 191 u8 prog_intf; 192 u8 subclass; 193 u8 base_class; 194 u32 subsystem_id; 195 union win_slot_encoding win_slot; 196 u32 ser; /* serial number */ 197 u32 flags; 198 u16 virtual_numa_node; 199 u16 reserved; 200 } __packed; 201 202 /** 203 * struct hv_msi_desc 204 * @vector: IDT entry 205 * @delivery_mode: As defined in Intel's Programmer's 206 * Reference Manual, Volume 3, Chapter 8. 207 * @vector_count: Number of contiguous entries in the 208 * Interrupt Descriptor Table that are 209 * occupied by this Message-Signaled 210 * Interrupt. For "MSI", as first defined 211 * in PCI 2.2, this can be between 1 and 212 * 32. For "MSI-X," as first defined in PCI 213 * 3.0, this must be 1, as each MSI-X table 214 * entry would have its own descriptor. 215 * @reserved: Empty space 216 * @cpu_mask: All the target virtual processors. 217 */ 218 struct hv_msi_desc { 219 u8 vector; 220 u8 delivery_mode; 221 u16 vector_count; 222 u32 reserved; 223 u64 cpu_mask; 224 } __packed; 225 226 /** 227 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc 228 * @vector: IDT entry 229 * @delivery_mode: As defined in Intel's Programmer's 230 * Reference Manual, Volume 3, Chapter 8. 231 * @vector_count: Number of contiguous entries in the 232 * Interrupt Descriptor Table that are 233 * occupied by this Message-Signaled 234 * Interrupt. For "MSI", as first defined 235 * in PCI 2.2, this can be between 1 and 236 * 32. For "MSI-X," as first defined in PCI 237 * 3.0, this must be 1, as each MSI-X table 238 * entry would have its own descriptor. 239 * @processor_count: number of bits enabled in array. 240 * @processor_array: All the target virtual processors. 241 */ 242 struct hv_msi_desc2 { 243 u8 vector; 244 u8 delivery_mode; 245 u16 vector_count; 246 u16 processor_count; 247 u16 processor_array[32]; 248 } __packed; 249 250 /* 251 * struct hv_msi_desc3 - 1.3 version of hv_msi_desc 252 * Everything is the same as in 'hv_msi_desc2' except that the size of the 253 * 'vector' field is larger to support bigger vector values. For ex: LPI 254 * vectors on ARM. 255 */ 256 struct hv_msi_desc3 { 257 u32 vector; 258 u8 delivery_mode; 259 u8 reserved; 260 u16 vector_count; 261 u16 processor_count; 262 u16 processor_array[32]; 263 } __packed; 264 265 /** 266 * struct tran_int_desc 267 * @reserved: unused, padding 268 * @vector_count: same as in hv_msi_desc 269 * @data: This is the "data payload" value that is 270 * written by the device when it generates 271 * a message-signaled interrupt, either MSI 272 * or MSI-X. 273 * @address: This is the address to which the data 274 * payload is written on interrupt 275 * generation. 276 */ 277 struct tran_int_desc { 278 u16 reserved; 279 u16 vector_count; 280 u32 data; 281 u64 address; 282 } __packed; 283 284 /* 285 * A generic message format for virtual PCI. 286 * Specific message formats are defined later in the file. 287 */ 288 289 struct pci_message { 290 u32 type; 291 } __packed; 292 293 struct pci_child_message { 294 struct pci_message message_type; 295 union win_slot_encoding wslot; 296 } __packed; 297 298 struct pci_incoming_message { 299 struct vmpacket_descriptor hdr; 300 struct pci_message message_type; 301 } __packed; 302 303 struct pci_response { 304 struct vmpacket_descriptor hdr; 305 s32 status; /* negative values are failures */ 306 } __packed; 307 308 struct pci_packet { 309 void (*completion_func)(void *context, struct pci_response *resp, 310 int resp_packet_size); 311 void *compl_ctxt; 312 313 struct pci_message message[]; 314 }; 315 316 /* 317 * Specific message types supporting the PCI protocol. 318 */ 319 320 /* 321 * Version negotiation message. Sent from the guest to the host. 322 * The guest is free to try different versions until the host 323 * accepts the version. 324 * 325 * pci_version: The protocol version requested. 326 * is_last_attempt: If TRUE, this is the last version guest will request. 327 * reservedz: Reserved field, set to zero. 328 */ 329 330 struct pci_version_request { 331 struct pci_message message_type; 332 u32 protocol_version; 333 } __packed; 334 335 /* 336 * Bus D0 Entry. This is sent from the guest to the host when the virtual 337 * bus (PCI Express port) is ready for action. 338 */ 339 340 struct pci_bus_d0_entry { 341 struct pci_message message_type; 342 u32 reserved; 343 u64 mmio_base; 344 } __packed; 345 346 struct pci_bus_relations { 347 struct pci_incoming_message incoming; 348 u32 device_count; 349 struct pci_function_description func[]; 350 } __packed; 351 352 struct pci_bus_relations2 { 353 struct pci_incoming_message incoming; 354 u32 device_count; 355 struct pci_function_description2 func[]; 356 } __packed; 357 358 struct pci_q_res_req_response { 359 struct vmpacket_descriptor hdr; 360 s32 status; /* negative values are failures */ 361 u32 probed_bar[PCI_STD_NUM_BARS]; 362 } __packed; 363 364 struct pci_set_power { 365 struct pci_message message_type; 366 union win_slot_encoding wslot; 367 u32 power_state; /* In Windows terms */ 368 u32 reserved; 369 } __packed; 370 371 struct pci_set_power_response { 372 struct vmpacket_descriptor hdr; 373 s32 status; /* negative values are failures */ 374 union win_slot_encoding wslot; 375 u32 resultant_state; /* In Windows terms */ 376 u32 reserved; 377 } __packed; 378 379 struct pci_resources_assigned { 380 struct pci_message message_type; 381 union win_slot_encoding wslot; 382 u8 memory_range[0x14][6]; /* not used here */ 383 u32 msi_descriptors; 384 u32 reserved[4]; 385 } __packed; 386 387 struct pci_resources_assigned2 { 388 struct pci_message message_type; 389 union win_slot_encoding wslot; 390 u8 memory_range[0x14][6]; /* not used here */ 391 u32 msi_descriptor_count; 392 u8 reserved[70]; 393 } __packed; 394 395 struct pci_create_interrupt { 396 struct pci_message message_type; 397 union win_slot_encoding wslot; 398 struct hv_msi_desc int_desc; 399 } __packed; 400 401 struct pci_create_int_response { 402 struct pci_response response; 403 u32 reserved; 404 struct tran_int_desc int_desc; 405 } __packed; 406 407 struct pci_create_interrupt2 { 408 struct pci_message message_type; 409 union win_slot_encoding wslot; 410 struct hv_msi_desc2 int_desc; 411 } __packed; 412 413 struct pci_create_interrupt3 { 414 struct pci_message message_type; 415 union win_slot_encoding wslot; 416 struct hv_msi_desc3 int_desc; 417 } __packed; 418 419 struct pci_delete_interrupt { 420 struct pci_message message_type; 421 union win_slot_encoding wslot; 422 struct tran_int_desc int_desc; 423 } __packed; 424 425 /* 426 * Note: the VM must pass a valid block id, wslot and bytes_requested. 427 */ 428 struct pci_read_block { 429 struct pci_message message_type; 430 u32 block_id; 431 union win_slot_encoding wslot; 432 u32 bytes_requested; 433 } __packed; 434 435 struct pci_read_block_response { 436 struct vmpacket_descriptor hdr; 437 u32 status; 438 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 439 } __packed; 440 441 /* 442 * Note: the VM must pass a valid block id, wslot and byte_count. 443 */ 444 struct pci_write_block { 445 struct pci_message message_type; 446 u32 block_id; 447 union win_slot_encoding wslot; 448 u32 byte_count; 449 u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX]; 450 } __packed; 451 452 struct pci_dev_inval_block { 453 struct pci_incoming_message incoming; 454 union win_slot_encoding wslot; 455 u64 block_mask; 456 } __packed; 457 458 struct pci_dev_incoming { 459 struct pci_incoming_message incoming; 460 union win_slot_encoding wslot; 461 } __packed; 462 463 struct pci_eject_response { 464 struct pci_message message_type; 465 union win_slot_encoding wslot; 466 u32 status; 467 } __packed; 468 469 static int pci_ring_size = VMBUS_RING_SIZE(SZ_16K); 470 471 /* 472 * Driver specific state. 473 */ 474 475 enum hv_pcibus_state { 476 hv_pcibus_init = 0, 477 hv_pcibus_probed, 478 hv_pcibus_installed, 479 hv_pcibus_removing, 480 hv_pcibus_maximum 481 }; 482 483 struct hv_pcibus_device { 484 #ifdef CONFIG_X86 485 struct pci_sysdata sysdata; 486 #elif defined(CONFIG_ARM64) 487 struct pci_config_window sysdata; 488 #endif 489 struct pci_host_bridge *bridge; 490 struct fwnode_handle *fwnode; 491 /* Protocol version negotiated with the host */ 492 enum pci_protocol_version_t protocol_version; 493 494 struct mutex state_lock; 495 enum hv_pcibus_state state; 496 497 struct hv_device *hdev; 498 resource_size_t low_mmio_space; 499 resource_size_t high_mmio_space; 500 struct resource *mem_config; 501 struct resource *low_mmio_res; 502 struct resource *high_mmio_res; 503 struct completion *survey_event; 504 struct pci_bus *pci_bus; 505 spinlock_t config_lock; /* Avoid two threads writing index page */ 506 spinlock_t device_list_lock; /* Protect lists below */ 507 void __iomem *cfg_addr; 508 509 struct list_head children; 510 struct list_head dr_list; 511 512 struct msi_domain_info msi_info; 513 struct irq_domain *irq_domain; 514 515 struct workqueue_struct *wq; 516 517 /* Highest slot of child device with resources allocated */ 518 int wslot_res_allocated; 519 bool use_calls; /* Use hypercalls to access mmio cfg space */ 520 }; 521 522 /* 523 * Tracks "Device Relations" messages from the host, which must be both 524 * processed in order and deferred so that they don't run in the context 525 * of the incoming packet callback. 526 */ 527 struct hv_dr_work { 528 struct work_struct wrk; 529 struct hv_pcibus_device *bus; 530 }; 531 532 struct hv_pcidev_description { 533 u16 v_id; /* vendor ID */ 534 u16 d_id; /* device ID */ 535 u8 rev; 536 u8 prog_intf; 537 u8 subclass; 538 u8 base_class; 539 u32 subsystem_id; 540 union win_slot_encoding win_slot; 541 u32 ser; /* serial number */ 542 u32 flags; 543 u16 virtual_numa_node; 544 }; 545 546 struct hv_dr_state { 547 struct list_head list_entry; 548 u32 device_count; 549 struct hv_pcidev_description func[] __counted_by(device_count); 550 }; 551 552 struct hv_pci_dev { 553 /* List protected by pci_rescan_remove_lock */ 554 struct list_head list_entry; 555 refcount_t refs; 556 struct pci_slot *pci_slot; 557 struct hv_pcidev_description desc; 558 bool reported_missing; 559 struct hv_pcibus_device *hbus; 560 struct work_struct wrk; 561 562 void (*block_invalidate)(void *context, u64 block_mask); 563 void *invalidate_context; 564 565 /* 566 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then 567 * read it back, for each of the BAR offsets within config space. 568 */ 569 u32 probed_bar[PCI_STD_NUM_BARS]; 570 }; 571 572 struct hv_pci_compl { 573 struct completion host_event; 574 s32 completion_status; 575 }; 576 577 static void hv_pci_onchannelcallback(void *context); 578 579 #ifdef CONFIG_X86 580 #define DELIVERY_MODE APIC_DELIVERY_MODE_FIXED 581 #define FLOW_HANDLER handle_edge_irq 582 #define FLOW_NAME "edge" 583 584 static int hv_pci_irqchip_init(void) 585 { 586 return 0; 587 } 588 589 static struct irq_domain *hv_pci_get_root_domain(void) 590 { 591 return x86_vector_domain; 592 } 593 594 static unsigned int hv_msi_get_int_vector(struct irq_data *data) 595 { 596 struct irq_cfg *cfg = irqd_cfg(data); 597 598 return cfg->vector; 599 } 600 601 #define hv_msi_prepare pci_msi_prepare 602 603 /** 604 * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current 605 * affinity. 606 * @data: Describes the IRQ 607 * 608 * Build new a destination for the MSI and make a hypercall to 609 * update the Interrupt Redirection Table. "Device Logical ID" 610 * is built out of this PCI bus's instance GUID and the function 611 * number of the device. 612 */ 613 static void hv_arch_irq_unmask(struct irq_data *data) 614 { 615 struct msi_desc *msi_desc = irq_data_get_msi_desc(data); 616 struct hv_retarget_device_interrupt *params; 617 struct tran_int_desc *int_desc; 618 struct hv_pcibus_device *hbus; 619 const struct cpumask *dest; 620 cpumask_var_t tmp; 621 struct pci_bus *pbus; 622 struct pci_dev *pdev; 623 unsigned long flags; 624 u32 var_size = 0; 625 int cpu, nr_bank; 626 u64 res; 627 628 dest = irq_data_get_effective_affinity_mask(data); 629 pdev = msi_desc_to_pci_dev(msi_desc); 630 pbus = pdev->bus; 631 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 632 int_desc = data->chip_data; 633 if (!int_desc) { 634 dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n", 635 __func__, data->irq); 636 return; 637 } 638 639 local_irq_save(flags); 640 641 params = *this_cpu_ptr(hyperv_pcpu_input_arg); 642 memset(params, 0, sizeof(*params)); 643 params->partition_id = HV_PARTITION_ID_SELF; 644 params->int_entry.source = HV_INTERRUPT_SOURCE_MSI; 645 params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff; 646 params->int_entry.msi_entry.data.as_uint32 = int_desc->data; 647 params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | 648 (hbus->hdev->dev_instance.b[4] << 16) | 649 (hbus->hdev->dev_instance.b[7] << 8) | 650 (hbus->hdev->dev_instance.b[6] & 0xf8) | 651 PCI_FUNC(pdev->devfn); 652 params->int_target.vector = hv_msi_get_int_vector(data); 653 654 if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) { 655 /* 656 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the 657 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides 658 * with >64 VP support. 659 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED 660 * is not sufficient for this hypercall. 661 */ 662 params->int_target.flags |= 663 HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; 664 665 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) { 666 res = 1; 667 goto out; 668 } 669 670 cpumask_and(tmp, dest, cpu_online_mask); 671 nr_bank = cpumask_to_vpset(¶ms->int_target.vp_set, tmp); 672 free_cpumask_var(tmp); 673 674 if (nr_bank <= 0) { 675 res = 1; 676 goto out; 677 } 678 679 /* 680 * var-sized hypercall, var-size starts after vp_mask (thus 681 * vp_set.format does not count, but vp_set.valid_bank_mask 682 * does). 683 */ 684 var_size = 1 + nr_bank; 685 } else { 686 for_each_cpu_and(cpu, dest, cpu_online_mask) { 687 params->int_target.vp_mask |= 688 (1ULL << hv_cpu_number_to_vp_number(cpu)); 689 } 690 } 691 692 res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17), 693 params, NULL); 694 695 out: 696 local_irq_restore(flags); 697 698 /* 699 * During hibernation, when a CPU is offlined, the kernel tries 700 * to move the interrupt to the remaining CPUs that haven't 701 * been offlined yet. In this case, the below hv_do_hypercall() 702 * always fails since the vmbus channel has been closed: 703 * refer to cpu_disable_common() -> fixup_irqs() -> 704 * irq_migrate_all_off_this_cpu() -> migrate_one_irq(). 705 * 706 * Suppress the error message for hibernation because the failure 707 * during hibernation does not matter (at this time all the devices 708 * have been frozen). Note: the correct affinity info is still updated 709 * into the irqdata data structure in migrate_one_irq() -> 710 * irq_do_set_affinity(), so later when the VM resumes, 711 * hv_pci_restore_msi_state() is able to correctly restore the 712 * interrupt with the correct affinity. 713 */ 714 if (!hv_result_success(res) && hbus->state != hv_pcibus_removing) 715 dev_err(&hbus->hdev->device, 716 "%s() failed: %#llx", __func__, res); 717 } 718 #elif defined(CONFIG_ARM64) 719 /* 720 * SPI vectors to use for vPCI; arch SPIs range is [32, 1019], but leaving a bit 721 * of room at the start to allow for SPIs to be specified through ACPI and 722 * starting with a power of two to satisfy power of 2 multi-MSI requirement. 723 */ 724 #define HV_PCI_MSI_SPI_START 64 725 #define HV_PCI_MSI_SPI_NR (1020 - HV_PCI_MSI_SPI_START) 726 #define DELIVERY_MODE 0 727 #define FLOW_HANDLER NULL 728 #define FLOW_NAME NULL 729 #define hv_msi_prepare NULL 730 731 struct hv_pci_chip_data { 732 DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR); 733 struct mutex map_lock; 734 }; 735 736 /* Hyper-V vPCI MSI GIC IRQ domain */ 737 static struct irq_domain *hv_msi_gic_irq_domain; 738 739 /* Hyper-V PCI MSI IRQ chip */ 740 static struct irq_chip hv_arm64_msi_irq_chip = { 741 .name = "MSI", 742 .irq_set_affinity = irq_chip_set_affinity_parent, 743 .irq_eoi = irq_chip_eoi_parent, 744 .irq_mask = irq_chip_mask_parent, 745 .irq_unmask = irq_chip_unmask_parent 746 }; 747 748 static unsigned int hv_msi_get_int_vector(struct irq_data *irqd) 749 { 750 return irqd->parent_data->hwirq; 751 } 752 753 /* 754 * @nr_bm_irqs: Indicates the number of IRQs that were allocated from 755 * the bitmap. 756 * @nr_dom_irqs: Indicates the number of IRQs that were allocated from 757 * the parent domain. 758 */ 759 static void hv_pci_vec_irq_free(struct irq_domain *domain, 760 unsigned int virq, 761 unsigned int nr_bm_irqs, 762 unsigned int nr_dom_irqs) 763 { 764 struct hv_pci_chip_data *chip_data = domain->host_data; 765 struct irq_data *d = irq_domain_get_irq_data(domain, virq); 766 int first = d->hwirq - HV_PCI_MSI_SPI_START; 767 int i; 768 769 mutex_lock(&chip_data->map_lock); 770 bitmap_release_region(chip_data->spi_map, 771 first, 772 get_count_order(nr_bm_irqs)); 773 mutex_unlock(&chip_data->map_lock); 774 for (i = 0; i < nr_dom_irqs; i++) { 775 if (i) 776 d = irq_domain_get_irq_data(domain, virq + i); 777 irq_domain_reset_irq_data(d); 778 } 779 780 irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs); 781 } 782 783 static void hv_pci_vec_irq_domain_free(struct irq_domain *domain, 784 unsigned int virq, 785 unsigned int nr_irqs) 786 { 787 hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs); 788 } 789 790 static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain, 791 unsigned int nr_irqs, 792 irq_hw_number_t *hwirq) 793 { 794 struct hv_pci_chip_data *chip_data = domain->host_data; 795 int index; 796 797 /* Find and allocate region from the SPI bitmap */ 798 mutex_lock(&chip_data->map_lock); 799 index = bitmap_find_free_region(chip_data->spi_map, 800 HV_PCI_MSI_SPI_NR, 801 get_count_order(nr_irqs)); 802 mutex_unlock(&chip_data->map_lock); 803 if (index < 0) 804 return -ENOSPC; 805 806 *hwirq = index + HV_PCI_MSI_SPI_START; 807 808 return 0; 809 } 810 811 static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain, 812 unsigned int virq, 813 irq_hw_number_t hwirq) 814 { 815 struct irq_fwspec fwspec; 816 struct irq_data *d; 817 int ret; 818 819 fwspec.fwnode = domain->parent->fwnode; 820 fwspec.param_count = 2; 821 fwspec.param[0] = hwirq; 822 fwspec.param[1] = IRQ_TYPE_EDGE_RISING; 823 824 ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); 825 if (ret) 826 return ret; 827 828 /* 829 * Since the interrupt specifier is not coming from ACPI or DT, the 830 * trigger type will need to be set explicitly. Otherwise, it will be 831 * set to whatever is in the GIC configuration. 832 */ 833 d = irq_domain_get_irq_data(domain->parent, virq); 834 835 return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); 836 } 837 838 static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain, 839 unsigned int virq, unsigned int nr_irqs, 840 void *args) 841 { 842 irq_hw_number_t hwirq; 843 unsigned int i; 844 int ret; 845 846 ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq); 847 if (ret) 848 return ret; 849 850 for (i = 0; i < nr_irqs; i++) { 851 ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i, 852 hwirq + i); 853 if (ret) { 854 hv_pci_vec_irq_free(domain, virq, nr_irqs, i); 855 return ret; 856 } 857 858 irq_domain_set_hwirq_and_chip(domain, virq + i, 859 hwirq + i, 860 &hv_arm64_msi_irq_chip, 861 domain->host_data); 862 pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i); 863 } 864 865 return 0; 866 } 867 868 /* 869 * Pick the first cpu as the irq affinity that can be temporarily used for 870 * composing MSI from the hypervisor. GIC will eventually set the right 871 * affinity for the irq and the 'unmask' will retarget the interrupt to that 872 * cpu. 873 */ 874 static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain, 875 struct irq_data *irqd, bool reserve) 876 { 877 int cpu = cpumask_first(cpu_present_mask); 878 879 irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); 880 881 return 0; 882 } 883 884 static const struct irq_domain_ops hv_pci_domain_ops = { 885 .alloc = hv_pci_vec_irq_domain_alloc, 886 .free = hv_pci_vec_irq_domain_free, 887 .activate = hv_pci_vec_irq_domain_activate, 888 }; 889 890 static int hv_pci_irqchip_init(void) 891 { 892 static struct hv_pci_chip_data *chip_data; 893 struct fwnode_handle *fn = NULL; 894 int ret = -ENOMEM; 895 896 chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL); 897 if (!chip_data) 898 return ret; 899 900 mutex_init(&chip_data->map_lock); 901 fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64"); 902 if (!fn) 903 goto free_chip; 904 905 /* 906 * IRQ domain once enabled, should not be removed since there is no 907 * way to ensure that all the corresponding devices are also gone and 908 * no interrupts will be generated. 909 */ 910 hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR, 911 fn, &hv_pci_domain_ops, 912 chip_data); 913 914 if (!hv_msi_gic_irq_domain) { 915 pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n"); 916 goto free_chip; 917 } 918 919 return 0; 920 921 free_chip: 922 kfree(chip_data); 923 if (fn) 924 irq_domain_free_fwnode(fn); 925 926 return ret; 927 } 928 929 static struct irq_domain *hv_pci_get_root_domain(void) 930 { 931 return hv_msi_gic_irq_domain; 932 } 933 934 /* 935 * SPIs are used for interrupts of PCI devices and SPIs is managed via GICD 936 * registers which Hyper-V already supports, so no hypercall needed. 937 */ 938 static void hv_arch_irq_unmask(struct irq_data *data) { } 939 #endif /* CONFIG_ARM64 */ 940 941 /** 942 * hv_pci_generic_compl() - Invoked for a completion packet 943 * @context: Set up by the sender of the packet. 944 * @resp: The response packet 945 * @resp_packet_size: Size in bytes of the packet 946 * 947 * This function is used to trigger an event and report status 948 * for any message for which the completion packet contains a 949 * status and nothing else. 950 */ 951 static void hv_pci_generic_compl(void *context, struct pci_response *resp, 952 int resp_packet_size) 953 { 954 struct hv_pci_compl *comp_pkt = context; 955 956 comp_pkt->completion_status = resp->status; 957 complete(&comp_pkt->host_event); 958 } 959 960 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 961 u32 wslot); 962 963 static void get_pcichild(struct hv_pci_dev *hpdev) 964 { 965 refcount_inc(&hpdev->refs); 966 } 967 968 static void put_pcichild(struct hv_pci_dev *hpdev) 969 { 970 if (refcount_dec_and_test(&hpdev->refs)) 971 kfree(hpdev); 972 } 973 974 /* 975 * There is no good way to get notified from vmbus_onoffer_rescind(), 976 * so let's use polling here, since this is not a hot path. 977 */ 978 static int wait_for_response(struct hv_device *hdev, 979 struct completion *comp) 980 { 981 while (true) { 982 if (hdev->channel->rescind) { 983 dev_warn_once(&hdev->device, "The device is gone.\n"); 984 return -ENODEV; 985 } 986 987 if (wait_for_completion_timeout(comp, HZ / 10)) 988 break; 989 } 990 991 return 0; 992 } 993 994 /** 995 * devfn_to_wslot() - Convert from Linux PCI slot to Windows 996 * @devfn: The Linux representation of PCI slot 997 * 998 * Windows uses a slightly different representation of PCI slot. 999 * 1000 * Return: The Windows representation 1001 */ 1002 static u32 devfn_to_wslot(int devfn) 1003 { 1004 union win_slot_encoding wslot; 1005 1006 wslot.slot = 0; 1007 wslot.bits.dev = PCI_SLOT(devfn); 1008 wslot.bits.func = PCI_FUNC(devfn); 1009 1010 return wslot.slot; 1011 } 1012 1013 /** 1014 * wslot_to_devfn() - Convert from Windows PCI slot to Linux 1015 * @wslot: The Windows representation of PCI slot 1016 * 1017 * Windows uses a slightly different representation of PCI slot. 1018 * 1019 * Return: The Linux representation 1020 */ 1021 static int wslot_to_devfn(u32 wslot) 1022 { 1023 union win_slot_encoding slot_no; 1024 1025 slot_no.slot = wslot; 1026 return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func); 1027 } 1028 1029 static void hv_pci_read_mmio(struct device *dev, phys_addr_t gpa, int size, u32 *val) 1030 { 1031 struct hv_mmio_read_input *in; 1032 struct hv_mmio_read_output *out; 1033 u64 ret; 1034 1035 /* 1036 * Must be called with interrupts disabled so it is safe 1037 * to use the per-cpu input argument page. Use it for 1038 * both input and output. 1039 */ 1040 in = *this_cpu_ptr(hyperv_pcpu_input_arg); 1041 out = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*in); 1042 in->gpa = gpa; 1043 in->size = size; 1044 1045 ret = hv_do_hypercall(HVCALL_MMIO_READ, in, out); 1046 if (hv_result_success(ret)) { 1047 switch (size) { 1048 case 1: 1049 *val = *(u8 *)(out->data); 1050 break; 1051 case 2: 1052 *val = *(u16 *)(out->data); 1053 break; 1054 default: 1055 *val = *(u32 *)(out->data); 1056 break; 1057 } 1058 } else 1059 dev_err(dev, "MMIO read hypercall error %llx addr %llx size %d\n", 1060 ret, gpa, size); 1061 } 1062 1063 static void hv_pci_write_mmio(struct device *dev, phys_addr_t gpa, int size, u32 val) 1064 { 1065 struct hv_mmio_write_input *in; 1066 u64 ret; 1067 1068 /* 1069 * Must be called with interrupts disabled so it is safe 1070 * to use the per-cpu input argument memory. 1071 */ 1072 in = *this_cpu_ptr(hyperv_pcpu_input_arg); 1073 in->gpa = gpa; 1074 in->size = size; 1075 switch (size) { 1076 case 1: 1077 *(u8 *)(in->data) = val; 1078 break; 1079 case 2: 1080 *(u16 *)(in->data) = val; 1081 break; 1082 default: 1083 *(u32 *)(in->data) = val; 1084 break; 1085 } 1086 1087 ret = hv_do_hypercall(HVCALL_MMIO_WRITE, in, NULL); 1088 if (!hv_result_success(ret)) 1089 dev_err(dev, "MMIO write hypercall error %llx addr %llx size %d\n", 1090 ret, gpa, size); 1091 } 1092 1093 /* 1094 * PCI Configuration Space for these root PCI buses is implemented as a pair 1095 * of pages in memory-mapped I/O space. Writing to the first page chooses 1096 * the PCI function being written or read. Once the first page has been 1097 * written to, the following page maps in the entire configuration space of 1098 * the function. 1099 */ 1100 1101 /** 1102 * _hv_pcifront_read_config() - Internal PCI config read 1103 * @hpdev: The PCI driver's representation of the device 1104 * @where: Offset within config space 1105 * @size: Size of the transfer 1106 * @val: Pointer to the buffer receiving the data 1107 */ 1108 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, 1109 int size, u32 *val) 1110 { 1111 struct hv_pcibus_device *hbus = hpdev->hbus; 1112 struct device *dev = &hbus->hdev->device; 1113 int offset = where + CFG_PAGE_OFFSET; 1114 unsigned long flags; 1115 1116 /* 1117 * If the attempt is to read the IDs or the ROM BAR, simulate that. 1118 */ 1119 if (where + size <= PCI_COMMAND) { 1120 memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size); 1121 } else if (where >= PCI_CLASS_REVISION && where + size <= 1122 PCI_CACHE_LINE_SIZE) { 1123 memcpy(val, ((u8 *)&hpdev->desc.rev) + where - 1124 PCI_CLASS_REVISION, size); 1125 } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <= 1126 PCI_ROM_ADDRESS) { 1127 memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where - 1128 PCI_SUBSYSTEM_VENDOR_ID, size); 1129 } else if (where >= PCI_ROM_ADDRESS && where + size <= 1130 PCI_CAPABILITY_LIST) { 1131 /* ROM BARs are unimplemented */ 1132 *val = 0; 1133 } else if ((where >= PCI_INTERRUPT_LINE && where + size <= PCI_INTERRUPT_PIN) || 1134 (where >= PCI_INTERRUPT_PIN && where + size <= PCI_MIN_GNT)) { 1135 /* 1136 * Interrupt Line and Interrupt PIN are hard-wired to zero 1137 * because this front-end only supports message-signaled 1138 * interrupts. 1139 */ 1140 *val = 0; 1141 } else if (where + size <= CFG_PAGE_SIZE) { 1142 1143 spin_lock_irqsave(&hbus->config_lock, flags); 1144 if (hbus->use_calls) { 1145 phys_addr_t addr = hbus->mem_config->start + offset; 1146 1147 hv_pci_write_mmio(dev, hbus->mem_config->start, 4, 1148 hpdev->desc.win_slot.slot); 1149 hv_pci_read_mmio(dev, addr, size, val); 1150 } else { 1151 void __iomem *addr = hbus->cfg_addr + offset; 1152 1153 /* Choose the function to be read. (See comment above) */ 1154 writel(hpdev->desc.win_slot.slot, hbus->cfg_addr); 1155 /* Make sure the function was chosen before reading. */ 1156 mb(); 1157 /* Read from that function's config space. */ 1158 switch (size) { 1159 case 1: 1160 *val = readb(addr); 1161 break; 1162 case 2: 1163 *val = readw(addr); 1164 break; 1165 default: 1166 *val = readl(addr); 1167 break; 1168 } 1169 /* 1170 * Make sure the read was done before we release the 1171 * spinlock allowing consecutive reads/writes. 1172 */ 1173 mb(); 1174 } 1175 spin_unlock_irqrestore(&hbus->config_lock, flags); 1176 } else { 1177 dev_err(dev, "Attempt to read beyond a function's config space.\n"); 1178 } 1179 } 1180 1181 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev) 1182 { 1183 struct hv_pcibus_device *hbus = hpdev->hbus; 1184 struct device *dev = &hbus->hdev->device; 1185 u32 val; 1186 u16 ret; 1187 unsigned long flags; 1188 1189 spin_lock_irqsave(&hbus->config_lock, flags); 1190 1191 if (hbus->use_calls) { 1192 phys_addr_t addr = hbus->mem_config->start + 1193 CFG_PAGE_OFFSET + PCI_VENDOR_ID; 1194 1195 hv_pci_write_mmio(dev, hbus->mem_config->start, 4, 1196 hpdev->desc.win_slot.slot); 1197 hv_pci_read_mmio(dev, addr, 2, &val); 1198 ret = val; /* Truncates to 16 bits */ 1199 } else { 1200 void __iomem *addr = hbus->cfg_addr + CFG_PAGE_OFFSET + 1201 PCI_VENDOR_ID; 1202 /* Choose the function to be read. (See comment above) */ 1203 writel(hpdev->desc.win_slot.slot, hbus->cfg_addr); 1204 /* Make sure the function was chosen before we start reading. */ 1205 mb(); 1206 /* Read from that function's config space. */ 1207 ret = readw(addr); 1208 /* 1209 * mb() is not required here, because the 1210 * spin_unlock_irqrestore() is a barrier. 1211 */ 1212 } 1213 1214 spin_unlock_irqrestore(&hbus->config_lock, flags); 1215 1216 return ret; 1217 } 1218 1219 /** 1220 * _hv_pcifront_write_config() - Internal PCI config write 1221 * @hpdev: The PCI driver's representation of the device 1222 * @where: Offset within config space 1223 * @size: Size of the transfer 1224 * @val: The data being transferred 1225 */ 1226 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, 1227 int size, u32 val) 1228 { 1229 struct hv_pcibus_device *hbus = hpdev->hbus; 1230 struct device *dev = &hbus->hdev->device; 1231 int offset = where + CFG_PAGE_OFFSET; 1232 unsigned long flags; 1233 1234 if (where >= PCI_SUBSYSTEM_VENDOR_ID && 1235 where + size <= PCI_CAPABILITY_LIST) { 1236 /* SSIDs and ROM BARs are read-only */ 1237 } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) { 1238 spin_lock_irqsave(&hbus->config_lock, flags); 1239 1240 if (hbus->use_calls) { 1241 phys_addr_t addr = hbus->mem_config->start + offset; 1242 1243 hv_pci_write_mmio(dev, hbus->mem_config->start, 4, 1244 hpdev->desc.win_slot.slot); 1245 hv_pci_write_mmio(dev, addr, size, val); 1246 } else { 1247 void __iomem *addr = hbus->cfg_addr + offset; 1248 1249 /* Choose the function to write. (See comment above) */ 1250 writel(hpdev->desc.win_slot.slot, hbus->cfg_addr); 1251 /* Make sure the function was chosen before writing. */ 1252 wmb(); 1253 /* Write to that function's config space. */ 1254 switch (size) { 1255 case 1: 1256 writeb(val, addr); 1257 break; 1258 case 2: 1259 writew(val, addr); 1260 break; 1261 default: 1262 writel(val, addr); 1263 break; 1264 } 1265 /* 1266 * Make sure the write was done before we release the 1267 * spinlock allowing consecutive reads/writes. 1268 */ 1269 mb(); 1270 } 1271 spin_unlock_irqrestore(&hbus->config_lock, flags); 1272 } else { 1273 dev_err(dev, "Attempt to write beyond a function's config space.\n"); 1274 } 1275 } 1276 1277 /** 1278 * hv_pcifront_read_config() - Read configuration space 1279 * @bus: PCI Bus structure 1280 * @devfn: Device/function 1281 * @where: Offset from base 1282 * @size: Byte/word/dword 1283 * @val: Value to be read 1284 * 1285 * Return: PCIBIOS_SUCCESSFUL on success 1286 * PCIBIOS_DEVICE_NOT_FOUND on failure 1287 */ 1288 static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn, 1289 int where, int size, u32 *val) 1290 { 1291 struct hv_pcibus_device *hbus = 1292 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 1293 struct hv_pci_dev *hpdev; 1294 1295 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 1296 if (!hpdev) 1297 return PCIBIOS_DEVICE_NOT_FOUND; 1298 1299 _hv_pcifront_read_config(hpdev, where, size, val); 1300 1301 put_pcichild(hpdev); 1302 return PCIBIOS_SUCCESSFUL; 1303 } 1304 1305 /** 1306 * hv_pcifront_write_config() - Write configuration space 1307 * @bus: PCI Bus structure 1308 * @devfn: Device/function 1309 * @where: Offset from base 1310 * @size: Byte/word/dword 1311 * @val: Value to be written to device 1312 * 1313 * Return: PCIBIOS_SUCCESSFUL on success 1314 * PCIBIOS_DEVICE_NOT_FOUND on failure 1315 */ 1316 static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn, 1317 int where, int size, u32 val) 1318 { 1319 struct hv_pcibus_device *hbus = 1320 container_of(bus->sysdata, struct hv_pcibus_device, sysdata); 1321 struct hv_pci_dev *hpdev; 1322 1323 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn)); 1324 if (!hpdev) 1325 return PCIBIOS_DEVICE_NOT_FOUND; 1326 1327 _hv_pcifront_write_config(hpdev, where, size, val); 1328 1329 put_pcichild(hpdev); 1330 return PCIBIOS_SUCCESSFUL; 1331 } 1332 1333 /* PCIe operations */ 1334 static struct pci_ops hv_pcifront_ops = { 1335 .read = hv_pcifront_read_config, 1336 .write = hv_pcifront_write_config, 1337 }; 1338 1339 /* 1340 * Paravirtual backchannel 1341 * 1342 * Hyper-V SR-IOV provides a backchannel mechanism in software for 1343 * communication between a VF driver and a PF driver. These 1344 * "configuration blocks" are similar in concept to PCI configuration space, 1345 * but instead of doing reads and writes in 32-bit chunks through a very slow 1346 * path, packets of up to 128 bytes can be sent or received asynchronously. 1347 * 1348 * Nearly every SR-IOV device contains just such a communications channel in 1349 * hardware, so using this one in software is usually optional. Using the 1350 * software channel, however, allows driver implementers to leverage software 1351 * tools that fuzz the communications channel looking for vulnerabilities. 1352 * 1353 * The usage model for these packets puts the responsibility for reading or 1354 * writing on the VF driver. The VF driver sends a read or a write packet, 1355 * indicating which "block" is being referred to by number. 1356 * 1357 * If the PF driver wishes to initiate communication, it can "invalidate" one or 1358 * more of the first 64 blocks. This invalidation is delivered via a callback 1359 * supplied by the VF driver by this driver. 1360 * 1361 * No protocol is implied, except that supplied by the PF and VF drivers. 1362 */ 1363 1364 struct hv_read_config_compl { 1365 struct hv_pci_compl comp_pkt; 1366 void *buf; 1367 unsigned int len; 1368 unsigned int bytes_returned; 1369 }; 1370 1371 /** 1372 * hv_pci_read_config_compl() - Invoked when a response packet 1373 * for a read config block operation arrives. 1374 * @context: Identifies the read config operation 1375 * @resp: The response packet itself 1376 * @resp_packet_size: Size in bytes of the response packet 1377 */ 1378 static void hv_pci_read_config_compl(void *context, struct pci_response *resp, 1379 int resp_packet_size) 1380 { 1381 struct hv_read_config_compl *comp = context; 1382 struct pci_read_block_response *read_resp = 1383 (struct pci_read_block_response *)resp; 1384 unsigned int data_len, hdr_len; 1385 1386 hdr_len = offsetof(struct pci_read_block_response, bytes); 1387 if (resp_packet_size < hdr_len) { 1388 comp->comp_pkt.completion_status = -1; 1389 goto out; 1390 } 1391 1392 data_len = resp_packet_size - hdr_len; 1393 if (data_len > 0 && read_resp->status == 0) { 1394 comp->bytes_returned = min(comp->len, data_len); 1395 memcpy(comp->buf, read_resp->bytes, comp->bytes_returned); 1396 } else { 1397 comp->bytes_returned = 0; 1398 } 1399 1400 comp->comp_pkt.completion_status = read_resp->status; 1401 out: 1402 complete(&comp->comp_pkt.host_event); 1403 } 1404 1405 /** 1406 * hv_read_config_block() - Sends a read config block request to 1407 * the back-end driver running in the Hyper-V parent partition. 1408 * @pdev: The PCI driver's representation for this device. 1409 * @buf: Buffer into which the config block will be copied. 1410 * @len: Size in bytes of buf. 1411 * @block_id: Identifies the config block which has been requested. 1412 * @bytes_returned: Size which came back from the back-end driver. 1413 * 1414 * Return: 0 on success, -errno on failure 1415 */ 1416 static int hv_read_config_block(struct pci_dev *pdev, void *buf, 1417 unsigned int len, unsigned int block_id, 1418 unsigned int *bytes_returned) 1419 { 1420 struct hv_pcibus_device *hbus = 1421 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1422 sysdata); 1423 struct { 1424 struct pci_packet pkt; 1425 char buf[sizeof(struct pci_read_block)]; 1426 } pkt; 1427 struct hv_read_config_compl comp_pkt; 1428 struct pci_read_block *read_blk; 1429 int ret; 1430 1431 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1432 return -EINVAL; 1433 1434 init_completion(&comp_pkt.comp_pkt.host_event); 1435 comp_pkt.buf = buf; 1436 comp_pkt.len = len; 1437 1438 memset(&pkt, 0, sizeof(pkt)); 1439 pkt.pkt.completion_func = hv_pci_read_config_compl; 1440 pkt.pkt.compl_ctxt = &comp_pkt; 1441 read_blk = (struct pci_read_block *)&pkt.pkt.message; 1442 read_blk->message_type.type = PCI_READ_BLOCK; 1443 read_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1444 read_blk->block_id = block_id; 1445 read_blk->bytes_requested = len; 1446 1447 ret = vmbus_sendpacket(hbus->hdev->channel, read_blk, 1448 sizeof(*read_blk), (unsigned long)&pkt.pkt, 1449 VM_PKT_DATA_INBAND, 1450 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1451 if (ret) 1452 return ret; 1453 1454 ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event); 1455 if (ret) 1456 return ret; 1457 1458 if (comp_pkt.comp_pkt.completion_status != 0 || 1459 comp_pkt.bytes_returned == 0) { 1460 dev_err(&hbus->hdev->device, 1461 "Read Config Block failed: 0x%x, bytes_returned=%d\n", 1462 comp_pkt.comp_pkt.completion_status, 1463 comp_pkt.bytes_returned); 1464 return -EIO; 1465 } 1466 1467 *bytes_returned = comp_pkt.bytes_returned; 1468 return 0; 1469 } 1470 1471 /** 1472 * hv_pci_write_config_compl() - Invoked when a response packet for a write 1473 * config block operation arrives. 1474 * @context: Identifies the write config operation 1475 * @resp: The response packet itself 1476 * @resp_packet_size: Size in bytes of the response packet 1477 */ 1478 static void hv_pci_write_config_compl(void *context, struct pci_response *resp, 1479 int resp_packet_size) 1480 { 1481 struct hv_pci_compl *comp_pkt = context; 1482 1483 comp_pkt->completion_status = resp->status; 1484 complete(&comp_pkt->host_event); 1485 } 1486 1487 /** 1488 * hv_write_config_block() - Sends a write config block request to the 1489 * back-end driver running in the Hyper-V parent partition. 1490 * @pdev: The PCI driver's representation for this device. 1491 * @buf: Buffer from which the config block will be copied. 1492 * @len: Size in bytes of buf. 1493 * @block_id: Identifies the config block which is being written. 1494 * 1495 * Return: 0 on success, -errno on failure 1496 */ 1497 static int hv_write_config_block(struct pci_dev *pdev, void *buf, 1498 unsigned int len, unsigned int block_id) 1499 { 1500 struct hv_pcibus_device *hbus = 1501 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1502 sysdata); 1503 struct { 1504 struct pci_packet pkt; 1505 char buf[sizeof(struct pci_write_block)]; 1506 u32 reserved; 1507 } pkt; 1508 struct hv_pci_compl comp_pkt; 1509 struct pci_write_block *write_blk; 1510 u32 pkt_size; 1511 int ret; 1512 1513 if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX) 1514 return -EINVAL; 1515 1516 init_completion(&comp_pkt.host_event); 1517 1518 memset(&pkt, 0, sizeof(pkt)); 1519 pkt.pkt.completion_func = hv_pci_write_config_compl; 1520 pkt.pkt.compl_ctxt = &comp_pkt; 1521 write_blk = (struct pci_write_block *)&pkt.pkt.message; 1522 write_blk->message_type.type = PCI_WRITE_BLOCK; 1523 write_blk->wslot.slot = devfn_to_wslot(pdev->devfn); 1524 write_blk->block_id = block_id; 1525 write_blk->byte_count = len; 1526 memcpy(write_blk->bytes, buf, len); 1527 pkt_size = offsetof(struct pci_write_block, bytes) + len; 1528 /* 1529 * This quirk is required on some hosts shipped around 2018, because 1530 * these hosts don't check the pkt_size correctly (new hosts have been 1531 * fixed since early 2019). The quirk is also safe on very old hosts 1532 * and new hosts, because, on them, what really matters is the length 1533 * specified in write_blk->byte_count. 1534 */ 1535 pkt_size += sizeof(pkt.reserved); 1536 1537 ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size, 1538 (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND, 1539 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1540 if (ret) 1541 return ret; 1542 1543 ret = wait_for_response(hbus->hdev, &comp_pkt.host_event); 1544 if (ret) 1545 return ret; 1546 1547 if (comp_pkt.completion_status != 0) { 1548 dev_err(&hbus->hdev->device, 1549 "Write Config Block failed: 0x%x\n", 1550 comp_pkt.completion_status); 1551 return -EIO; 1552 } 1553 1554 return 0; 1555 } 1556 1557 /** 1558 * hv_register_block_invalidate() - Invoked when a config block invalidation 1559 * arrives from the back-end driver. 1560 * @pdev: The PCI driver's representation for this device. 1561 * @context: Identifies the device. 1562 * @block_invalidate: Identifies all of the blocks being invalidated. 1563 * 1564 * Return: 0 on success, -errno on failure 1565 */ 1566 static int hv_register_block_invalidate(struct pci_dev *pdev, void *context, 1567 void (*block_invalidate)(void *context, 1568 u64 block_mask)) 1569 { 1570 struct hv_pcibus_device *hbus = 1571 container_of(pdev->bus->sysdata, struct hv_pcibus_device, 1572 sysdata); 1573 struct hv_pci_dev *hpdev; 1574 1575 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1576 if (!hpdev) 1577 return -ENODEV; 1578 1579 hpdev->block_invalidate = block_invalidate; 1580 hpdev->invalidate_context = context; 1581 1582 put_pcichild(hpdev); 1583 return 0; 1584 1585 } 1586 1587 /* Interrupt management hooks */ 1588 static void hv_int_desc_free(struct hv_pci_dev *hpdev, 1589 struct tran_int_desc *int_desc) 1590 { 1591 struct pci_delete_interrupt *int_pkt; 1592 struct { 1593 struct pci_packet pkt; 1594 u8 buffer[sizeof(struct pci_delete_interrupt)]; 1595 } ctxt; 1596 1597 if (!int_desc->vector_count) { 1598 kfree(int_desc); 1599 return; 1600 } 1601 memset(&ctxt, 0, sizeof(ctxt)); 1602 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; 1603 int_pkt->message_type.type = 1604 PCI_DELETE_INTERRUPT_MESSAGE; 1605 int_pkt->wslot.slot = hpdev->desc.win_slot.slot; 1606 int_pkt->int_desc = *int_desc; 1607 vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt), 1608 0, VM_PKT_DATA_INBAND, 0); 1609 kfree(int_desc); 1610 } 1611 1612 /** 1613 * hv_msi_free() - Free the MSI. 1614 * @domain: The interrupt domain pointer 1615 * @info: Extra MSI-related context 1616 * @irq: Identifies the IRQ. 1617 * 1618 * The Hyper-V parent partition and hypervisor are tracking the 1619 * messages that are in use, keeping the interrupt redirection 1620 * table up to date. This callback sends a message that frees 1621 * the IRT entry and related tracking nonsense. 1622 */ 1623 static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info, 1624 unsigned int irq) 1625 { 1626 struct hv_pcibus_device *hbus; 1627 struct hv_pci_dev *hpdev; 1628 struct pci_dev *pdev; 1629 struct tran_int_desc *int_desc; 1630 struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq); 1631 struct msi_desc *msi = irq_data_get_msi_desc(irq_data); 1632 1633 pdev = msi_desc_to_pci_dev(msi); 1634 hbus = info->data; 1635 int_desc = irq_data_get_irq_chip_data(irq_data); 1636 if (!int_desc) 1637 return; 1638 1639 irq_data->chip_data = NULL; 1640 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1641 if (!hpdev) { 1642 kfree(int_desc); 1643 return; 1644 } 1645 1646 hv_int_desc_free(hpdev, int_desc); 1647 put_pcichild(hpdev); 1648 } 1649 1650 static void hv_irq_mask(struct irq_data *data) 1651 { 1652 pci_msi_mask_irq(data); 1653 if (data->parent_data->chip->irq_mask) 1654 irq_chip_mask_parent(data); 1655 } 1656 1657 static void hv_irq_unmask(struct irq_data *data) 1658 { 1659 hv_arch_irq_unmask(data); 1660 1661 if (data->parent_data->chip->irq_unmask) 1662 irq_chip_unmask_parent(data); 1663 pci_msi_unmask_irq(data); 1664 } 1665 1666 struct compose_comp_ctxt { 1667 struct hv_pci_compl comp_pkt; 1668 struct tran_int_desc int_desc; 1669 }; 1670 1671 static void hv_pci_compose_compl(void *context, struct pci_response *resp, 1672 int resp_packet_size) 1673 { 1674 struct compose_comp_ctxt *comp_pkt = context; 1675 struct pci_create_int_response *int_resp = 1676 (struct pci_create_int_response *)resp; 1677 1678 if (resp_packet_size < sizeof(*int_resp)) { 1679 comp_pkt->comp_pkt.completion_status = -1; 1680 goto out; 1681 } 1682 comp_pkt->comp_pkt.completion_status = resp->status; 1683 comp_pkt->int_desc = int_resp->int_desc; 1684 out: 1685 complete(&comp_pkt->comp_pkt.host_event); 1686 } 1687 1688 static u32 hv_compose_msi_req_v1( 1689 struct pci_create_interrupt *int_pkt, 1690 u32 slot, u8 vector, u16 vector_count) 1691 { 1692 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; 1693 int_pkt->wslot.slot = slot; 1694 int_pkt->int_desc.vector = vector; 1695 int_pkt->int_desc.vector_count = vector_count; 1696 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1697 1698 /* 1699 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in 1700 * hv_irq_unmask(). 1701 */ 1702 int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL; 1703 1704 return sizeof(*int_pkt); 1705 } 1706 1707 /* 1708 * The vCPU selected by hv_compose_multi_msi_req_get_cpu() and 1709 * hv_compose_msi_req_get_cpu() is a "dummy" vCPU because the final vCPU to be 1710 * interrupted is specified later in hv_irq_unmask() and communicated to Hyper-V 1711 * via the HVCALL_RETARGET_INTERRUPT hypercall. But the choice of dummy vCPU is 1712 * not irrelevant because Hyper-V chooses the physical CPU to handle the 1713 * interrupts based on the vCPU specified in message sent to the vPCI VSP in 1714 * hv_compose_msi_msg(). Hyper-V's choice of pCPU is not visible to the guest, 1715 * but assigning too many vPCI device interrupts to the same pCPU can cause a 1716 * performance bottleneck. So we spread out the dummy vCPUs to influence Hyper-V 1717 * to spread out the pCPUs that it selects. 1718 * 1719 * For the single-MSI and MSI-X cases, it's OK for hv_compose_msi_req_get_cpu() 1720 * to always return the same dummy vCPU, because a second call to 1721 * hv_compose_msi_msg() contains the "real" vCPU, causing Hyper-V to choose a 1722 * new pCPU for the interrupt. But for the multi-MSI case, the second call to 1723 * hv_compose_msi_msg() exits without sending a message to the vPCI VSP, so the 1724 * original dummy vCPU is used. This dummy vCPU must be round-robin'ed so that 1725 * the pCPUs are spread out. All interrupts for a multi-MSI device end up using 1726 * the same pCPU, even though the vCPUs will be spread out by later calls 1727 * to hv_irq_unmask(), but that is the best we can do now. 1728 * 1729 * With Hyper-V in Nov 2022, the HVCALL_RETARGET_INTERRUPT hypercall does *not* 1730 * cause Hyper-V to reselect the pCPU based on the specified vCPU. Such an 1731 * enhancement is planned for a future version. With that enhancement, the 1732 * dummy vCPU selection won't matter, and interrupts for the same multi-MSI 1733 * device will be spread across multiple pCPUs. 1734 */ 1735 1736 /* 1737 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten 1738 * by subsequent retarget in hv_irq_unmask(). 1739 */ 1740 static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity) 1741 { 1742 return cpumask_first_and(affinity, cpu_online_mask); 1743 } 1744 1745 /* 1746 * Make sure the dummy vCPU values for multi-MSI don't all point to vCPU0. 1747 */ 1748 static int hv_compose_multi_msi_req_get_cpu(void) 1749 { 1750 static DEFINE_SPINLOCK(multi_msi_cpu_lock); 1751 1752 /* -1 means starting with CPU 0 */ 1753 static int cpu_next = -1; 1754 1755 unsigned long flags; 1756 int cpu; 1757 1758 spin_lock_irqsave(&multi_msi_cpu_lock, flags); 1759 1760 cpu_next = cpumask_next_wrap(cpu_next, cpu_online_mask, nr_cpu_ids, 1761 false); 1762 cpu = cpu_next; 1763 1764 spin_unlock_irqrestore(&multi_msi_cpu_lock, flags); 1765 1766 return cpu; 1767 } 1768 1769 static u32 hv_compose_msi_req_v2( 1770 struct pci_create_interrupt2 *int_pkt, int cpu, 1771 u32 slot, u8 vector, u16 vector_count) 1772 { 1773 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; 1774 int_pkt->wslot.slot = slot; 1775 int_pkt->int_desc.vector = vector; 1776 int_pkt->int_desc.vector_count = vector_count; 1777 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1778 int_pkt->int_desc.processor_array[0] = 1779 hv_cpu_number_to_vp_number(cpu); 1780 int_pkt->int_desc.processor_count = 1; 1781 1782 return sizeof(*int_pkt); 1783 } 1784 1785 static u32 hv_compose_msi_req_v3( 1786 struct pci_create_interrupt3 *int_pkt, int cpu, 1787 u32 slot, u32 vector, u16 vector_count) 1788 { 1789 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3; 1790 int_pkt->wslot.slot = slot; 1791 int_pkt->int_desc.vector = vector; 1792 int_pkt->int_desc.reserved = 0; 1793 int_pkt->int_desc.vector_count = vector_count; 1794 int_pkt->int_desc.delivery_mode = DELIVERY_MODE; 1795 int_pkt->int_desc.processor_array[0] = 1796 hv_cpu_number_to_vp_number(cpu); 1797 int_pkt->int_desc.processor_count = 1; 1798 1799 return sizeof(*int_pkt); 1800 } 1801 1802 /** 1803 * hv_compose_msi_msg() - Supplies a valid MSI address/data 1804 * @data: Everything about this MSI 1805 * @msg: Buffer that is filled in by this function 1806 * 1807 * This function unpacks the IRQ looking for target CPU set, IDT 1808 * vector and mode and sends a message to the parent partition 1809 * asking for a mapping for that tuple in this partition. The 1810 * response supplies a data value and address to which that data 1811 * should be written to trigger that interrupt. 1812 */ 1813 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) 1814 { 1815 struct hv_pcibus_device *hbus; 1816 struct vmbus_channel *channel; 1817 struct hv_pci_dev *hpdev; 1818 struct pci_bus *pbus; 1819 struct pci_dev *pdev; 1820 const struct cpumask *dest; 1821 struct compose_comp_ctxt comp; 1822 struct tran_int_desc *int_desc; 1823 struct msi_desc *msi_desc; 1824 /* 1825 * vector_count should be u16: see hv_msi_desc, hv_msi_desc2 1826 * and hv_msi_desc3. vector must be u32: see hv_msi_desc3. 1827 */ 1828 u16 vector_count; 1829 u32 vector; 1830 struct { 1831 struct pci_packet pci_pkt; 1832 union { 1833 struct pci_create_interrupt v1; 1834 struct pci_create_interrupt2 v2; 1835 struct pci_create_interrupt3 v3; 1836 } int_pkts; 1837 } __packed ctxt; 1838 bool multi_msi; 1839 u64 trans_id; 1840 u32 size; 1841 int ret; 1842 int cpu; 1843 1844 msi_desc = irq_data_get_msi_desc(data); 1845 multi_msi = !msi_desc->pci.msi_attrib.is_msix && 1846 msi_desc->nvec_used > 1; 1847 1848 /* Reuse the previous allocation */ 1849 if (data->chip_data && multi_msi) { 1850 int_desc = data->chip_data; 1851 msg->address_hi = int_desc->address >> 32; 1852 msg->address_lo = int_desc->address & 0xffffffff; 1853 msg->data = int_desc->data; 1854 return; 1855 } 1856 1857 pdev = msi_desc_to_pci_dev(msi_desc); 1858 dest = irq_data_get_effective_affinity_mask(data); 1859 pbus = pdev->bus; 1860 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 1861 channel = hbus->hdev->channel; 1862 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1863 if (!hpdev) 1864 goto return_null_message; 1865 1866 /* Free any previous message that might have already been composed. */ 1867 if (data->chip_data && !multi_msi) { 1868 int_desc = data->chip_data; 1869 data->chip_data = NULL; 1870 hv_int_desc_free(hpdev, int_desc); 1871 } 1872 1873 int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC); 1874 if (!int_desc) 1875 goto drop_reference; 1876 1877 if (multi_msi) { 1878 /* 1879 * If this is not the first MSI of Multi MSI, we already have 1880 * a mapping. Can exit early. 1881 */ 1882 if (msi_desc->irq != data->irq) { 1883 data->chip_data = int_desc; 1884 int_desc->address = msi_desc->msg.address_lo | 1885 (u64)msi_desc->msg.address_hi << 32; 1886 int_desc->data = msi_desc->msg.data + 1887 (data->irq - msi_desc->irq); 1888 msg->address_hi = msi_desc->msg.address_hi; 1889 msg->address_lo = msi_desc->msg.address_lo; 1890 msg->data = int_desc->data; 1891 put_pcichild(hpdev); 1892 return; 1893 } 1894 /* 1895 * The vector we select here is a dummy value. The correct 1896 * value gets sent to the hypervisor in unmask(). This needs 1897 * to be aligned with the count, and also not zero. Multi-msi 1898 * is powers of 2 up to 32, so 32 will always work here. 1899 */ 1900 vector = 32; 1901 vector_count = msi_desc->nvec_used; 1902 cpu = hv_compose_multi_msi_req_get_cpu(); 1903 } else { 1904 vector = hv_msi_get_int_vector(data); 1905 vector_count = 1; 1906 cpu = hv_compose_msi_req_get_cpu(dest); 1907 } 1908 1909 /* 1910 * hv_compose_msi_req_v1 and v2 are for x86 only, meaning 'vector' 1911 * can't exceed u8. Cast 'vector' down to u8 for v1/v2 explicitly 1912 * for better readability. 1913 */ 1914 memset(&ctxt, 0, sizeof(ctxt)); 1915 init_completion(&comp.comp_pkt.host_event); 1916 ctxt.pci_pkt.completion_func = hv_pci_compose_compl; 1917 ctxt.pci_pkt.compl_ctxt = ∁ 1918 1919 switch (hbus->protocol_version) { 1920 case PCI_PROTOCOL_VERSION_1_1: 1921 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1, 1922 hpdev->desc.win_slot.slot, 1923 (u8)vector, 1924 vector_count); 1925 break; 1926 1927 case PCI_PROTOCOL_VERSION_1_2: 1928 case PCI_PROTOCOL_VERSION_1_3: 1929 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2, 1930 cpu, 1931 hpdev->desc.win_slot.slot, 1932 (u8)vector, 1933 vector_count); 1934 break; 1935 1936 case PCI_PROTOCOL_VERSION_1_4: 1937 size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3, 1938 cpu, 1939 hpdev->desc.win_slot.slot, 1940 vector, 1941 vector_count); 1942 break; 1943 1944 default: 1945 /* As we only negotiate protocol versions known to this driver, 1946 * this path should never hit. However, this is it not a hot 1947 * path so we print a message to aid future updates. 1948 */ 1949 dev_err(&hbus->hdev->device, 1950 "Unexpected vPCI protocol, update driver."); 1951 goto free_int_desc; 1952 } 1953 1954 ret = vmbus_sendpacket_getid(hpdev->hbus->hdev->channel, &ctxt.int_pkts, 1955 size, (unsigned long)&ctxt.pci_pkt, 1956 &trans_id, VM_PKT_DATA_INBAND, 1957 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 1958 if (ret) { 1959 dev_err(&hbus->hdev->device, 1960 "Sending request for interrupt failed: 0x%x", 1961 comp.comp_pkt.completion_status); 1962 goto free_int_desc; 1963 } 1964 1965 /* 1966 * Prevents hv_pci_onchannelcallback() from running concurrently 1967 * in the tasklet. 1968 */ 1969 tasklet_disable_in_atomic(&channel->callback_event); 1970 1971 /* 1972 * Since this function is called with IRQ locks held, can't 1973 * do normal wait for completion; instead poll. 1974 */ 1975 while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { 1976 unsigned long flags; 1977 1978 /* 0xFFFF means an invalid PCI VENDOR ID. */ 1979 if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { 1980 dev_err_once(&hbus->hdev->device, 1981 "the device has gone\n"); 1982 goto enable_tasklet; 1983 } 1984 1985 /* 1986 * Make sure that the ring buffer data structure doesn't get 1987 * freed while we dereference the ring buffer pointer. Test 1988 * for the channel's onchannel_callback being NULL within a 1989 * sched_lock critical section. See also the inline comments 1990 * in vmbus_reset_channel_cb(). 1991 */ 1992 spin_lock_irqsave(&channel->sched_lock, flags); 1993 if (unlikely(channel->onchannel_callback == NULL)) { 1994 spin_unlock_irqrestore(&channel->sched_lock, flags); 1995 goto enable_tasklet; 1996 } 1997 hv_pci_onchannelcallback(hbus); 1998 spin_unlock_irqrestore(&channel->sched_lock, flags); 1999 2000 udelay(100); 2001 } 2002 2003 tasklet_enable(&channel->callback_event); 2004 2005 if (comp.comp_pkt.completion_status < 0) { 2006 dev_err(&hbus->hdev->device, 2007 "Request for interrupt failed: 0x%x", 2008 comp.comp_pkt.completion_status); 2009 goto free_int_desc; 2010 } 2011 2012 /* 2013 * Record the assignment so that this can be unwound later. Using 2014 * irq_set_chip_data() here would be appropriate, but the lock it takes 2015 * is already held. 2016 */ 2017 *int_desc = comp.int_desc; 2018 data->chip_data = int_desc; 2019 2020 /* Pass up the result. */ 2021 msg->address_hi = comp.int_desc.address >> 32; 2022 msg->address_lo = comp.int_desc.address & 0xffffffff; 2023 msg->data = comp.int_desc.data; 2024 2025 put_pcichild(hpdev); 2026 return; 2027 2028 enable_tasklet: 2029 tasklet_enable(&channel->callback_event); 2030 /* 2031 * The completion packet on the stack becomes invalid after 'return'; 2032 * remove the ID from the VMbus requestor if the identifier is still 2033 * mapped to/associated with the packet. (The identifier could have 2034 * been 're-used', i.e., already removed and (re-)mapped.) 2035 * 2036 * Cf. hv_pci_onchannelcallback(). 2037 */ 2038 vmbus_request_addr_match(channel, trans_id, (unsigned long)&ctxt.pci_pkt); 2039 free_int_desc: 2040 kfree(int_desc); 2041 drop_reference: 2042 put_pcichild(hpdev); 2043 return_null_message: 2044 msg->address_hi = 0; 2045 msg->address_lo = 0; 2046 msg->data = 0; 2047 } 2048 2049 /* HW Interrupt Chip Descriptor */ 2050 static struct irq_chip hv_msi_irq_chip = { 2051 .name = "Hyper-V PCIe MSI", 2052 .irq_compose_msi_msg = hv_compose_msi_msg, 2053 .irq_set_affinity = irq_chip_set_affinity_parent, 2054 #ifdef CONFIG_X86 2055 .irq_ack = irq_chip_ack_parent, 2056 .flags = IRQCHIP_MOVE_DEFERRED, 2057 #elif defined(CONFIG_ARM64) 2058 .irq_eoi = irq_chip_eoi_parent, 2059 #endif 2060 .irq_mask = hv_irq_mask, 2061 .irq_unmask = hv_irq_unmask, 2062 }; 2063 2064 static struct msi_domain_ops hv_msi_ops = { 2065 .msi_prepare = hv_msi_prepare, 2066 .msi_free = hv_msi_free, 2067 }; 2068 2069 /** 2070 * hv_pcie_init_irq_domain() - Initialize IRQ domain 2071 * @hbus: The root PCI bus 2072 * 2073 * This function creates an IRQ domain which will be used for 2074 * interrupts from devices that have been passed through. These 2075 * devices only support MSI and MSI-X, not line-based interrupts 2076 * or simulations of line-based interrupts through PCIe's 2077 * fabric-layer messages. Because interrupts are remapped, we 2078 * can support multi-message MSI here. 2079 * 2080 * Return: '0' on success and error value on failure 2081 */ 2082 static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus) 2083 { 2084 hbus->msi_info.chip = &hv_msi_irq_chip; 2085 hbus->msi_info.ops = &hv_msi_ops; 2086 hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS | 2087 MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI | 2088 MSI_FLAG_PCI_MSIX); 2089 hbus->msi_info.handler = FLOW_HANDLER; 2090 hbus->msi_info.handler_name = FLOW_NAME; 2091 hbus->msi_info.data = hbus; 2092 hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode, 2093 &hbus->msi_info, 2094 hv_pci_get_root_domain()); 2095 if (!hbus->irq_domain) { 2096 dev_err(&hbus->hdev->device, 2097 "Failed to build an MSI IRQ domain\n"); 2098 return -ENODEV; 2099 } 2100 2101 dev_set_msi_domain(&hbus->bridge->dev, hbus->irq_domain); 2102 2103 return 0; 2104 } 2105 2106 /** 2107 * get_bar_size() - Get the address space consumed by a BAR 2108 * @bar_val: Value that a BAR returned after -1 was written 2109 * to it. 2110 * 2111 * This function returns the size of the BAR, rounded up to 1 2112 * page. It has to be rounded up because the hypervisor's page 2113 * table entry that maps the BAR into the VM can't specify an 2114 * offset within a page. The invariant is that the hypervisor 2115 * must place any BARs of smaller than page length at the 2116 * beginning of a page. 2117 * 2118 * Return: Size in bytes of the consumed MMIO space. 2119 */ 2120 static u64 get_bar_size(u64 bar_val) 2121 { 2122 return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)), 2123 PAGE_SIZE); 2124 } 2125 2126 /** 2127 * survey_child_resources() - Total all MMIO requirements 2128 * @hbus: Root PCI bus, as understood by this driver 2129 */ 2130 static void survey_child_resources(struct hv_pcibus_device *hbus) 2131 { 2132 struct hv_pci_dev *hpdev; 2133 resource_size_t bar_size = 0; 2134 unsigned long flags; 2135 struct completion *event; 2136 u64 bar_val; 2137 int i; 2138 2139 /* If nobody is waiting on the answer, don't compute it. */ 2140 event = xchg(&hbus->survey_event, NULL); 2141 if (!event) 2142 return; 2143 2144 /* If the answer has already been computed, go with it. */ 2145 if (hbus->low_mmio_space || hbus->high_mmio_space) { 2146 complete(event); 2147 return; 2148 } 2149 2150 spin_lock_irqsave(&hbus->device_list_lock, flags); 2151 2152 /* 2153 * Due to an interesting quirk of the PCI spec, all memory regions 2154 * for a child device are a power of 2 in size and aligned in memory, 2155 * so it's sufficient to just add them up without tracking alignment. 2156 */ 2157 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2158 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2159 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO) 2160 dev_err(&hbus->hdev->device, 2161 "There's an I/O BAR in this list!\n"); 2162 2163 if (hpdev->probed_bar[i] != 0) { 2164 /* 2165 * A probed BAR has all the upper bits set that 2166 * can be changed. 2167 */ 2168 2169 bar_val = hpdev->probed_bar[i]; 2170 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 2171 bar_val |= 2172 ((u64)hpdev->probed_bar[++i] << 32); 2173 else 2174 bar_val |= 0xffffffff00000000ULL; 2175 2176 bar_size = get_bar_size(bar_val); 2177 2178 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64) 2179 hbus->high_mmio_space += bar_size; 2180 else 2181 hbus->low_mmio_space += bar_size; 2182 } 2183 } 2184 } 2185 2186 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2187 complete(event); 2188 } 2189 2190 /** 2191 * prepopulate_bars() - Fill in BARs with defaults 2192 * @hbus: Root PCI bus, as understood by this driver 2193 * 2194 * The core PCI driver code seems much, much happier if the BARs 2195 * for a device have values upon first scan. So fill them in. 2196 * The algorithm below works down from large sizes to small, 2197 * attempting to pack the assignments optimally. The assumption, 2198 * enforced in other parts of the code, is that the beginning of 2199 * the memory-mapped I/O space will be aligned on the largest 2200 * BAR size. 2201 */ 2202 static void prepopulate_bars(struct hv_pcibus_device *hbus) 2203 { 2204 resource_size_t high_size = 0; 2205 resource_size_t low_size = 0; 2206 resource_size_t high_base = 0; 2207 resource_size_t low_base = 0; 2208 resource_size_t bar_size; 2209 struct hv_pci_dev *hpdev; 2210 unsigned long flags; 2211 u64 bar_val; 2212 u32 command; 2213 bool high; 2214 int i; 2215 2216 if (hbus->low_mmio_space) { 2217 low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 2218 low_base = hbus->low_mmio_res->start; 2219 } 2220 2221 if (hbus->high_mmio_space) { 2222 high_size = 1ULL << 2223 (63 - __builtin_clzll(hbus->high_mmio_space)); 2224 high_base = hbus->high_mmio_res->start; 2225 } 2226 2227 spin_lock_irqsave(&hbus->device_list_lock, flags); 2228 2229 /* 2230 * Clear the memory enable bit, in case it's already set. This occurs 2231 * in the suspend path of hibernation, where the device is suspended, 2232 * resumed and suspended again: see hibernation_snapshot() and 2233 * hibernation_platform_enter(). 2234 * 2235 * If the memory enable bit is already set, Hyper-V silently ignores 2236 * the below BAR updates, and the related PCI device driver can not 2237 * work, because reading from the device register(s) always returns 2238 * 0xFFFFFFFF (PCI_ERROR_RESPONSE). 2239 */ 2240 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2241 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command); 2242 command &= ~PCI_COMMAND_MEMORY; 2243 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command); 2244 } 2245 2246 /* Pick addresses for the BARs. */ 2247 do { 2248 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2249 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2250 bar_val = hpdev->probed_bar[i]; 2251 if (bar_val == 0) 2252 continue; 2253 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64; 2254 if (high) { 2255 bar_val |= 2256 ((u64)hpdev->probed_bar[i + 1] 2257 << 32); 2258 } else { 2259 bar_val |= 0xffffffffULL << 32; 2260 } 2261 bar_size = get_bar_size(bar_val); 2262 if (high) { 2263 if (high_size != bar_size) { 2264 i++; 2265 continue; 2266 } 2267 _hv_pcifront_write_config(hpdev, 2268 PCI_BASE_ADDRESS_0 + (4 * i), 2269 4, 2270 (u32)(high_base & 0xffffff00)); 2271 i++; 2272 _hv_pcifront_write_config(hpdev, 2273 PCI_BASE_ADDRESS_0 + (4 * i), 2274 4, (u32)(high_base >> 32)); 2275 high_base += bar_size; 2276 } else { 2277 if (low_size != bar_size) 2278 continue; 2279 _hv_pcifront_write_config(hpdev, 2280 PCI_BASE_ADDRESS_0 + (4 * i), 2281 4, 2282 (u32)(low_base & 0xffffff00)); 2283 low_base += bar_size; 2284 } 2285 } 2286 if (high_size <= 1 && low_size <= 1) { 2287 /* 2288 * No need to set the PCI_COMMAND_MEMORY bit as 2289 * the core PCI driver doesn't require the bit 2290 * to be pre-set. Actually here we intentionally 2291 * keep the bit off so that the PCI BAR probing 2292 * in the core PCI driver doesn't cause Hyper-V 2293 * to unnecessarily unmap/map the virtual BARs 2294 * from/to the physical BARs multiple times. 2295 * This reduces the VM boot time significantly 2296 * if the BAR sizes are huge. 2297 */ 2298 break; 2299 } 2300 } 2301 2302 high_size >>= 1; 2303 low_size >>= 1; 2304 } while (high_size || low_size); 2305 2306 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2307 } 2308 2309 /* 2310 * Assign entries in sysfs pci slot directory. 2311 * 2312 * Note that this function does not need to lock the children list 2313 * because it is called from pci_devices_present_work which 2314 * is serialized with hv_eject_device_work because they are on the 2315 * same ordered workqueue. Therefore hbus->children list will not change 2316 * even when pci_create_slot sleeps. 2317 */ 2318 static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) 2319 { 2320 struct hv_pci_dev *hpdev; 2321 char name[SLOT_NAME_SIZE]; 2322 int slot_nr; 2323 2324 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2325 if (hpdev->pci_slot) 2326 continue; 2327 2328 slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot)); 2329 snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser); 2330 hpdev->pci_slot = pci_create_slot(hbus->bridge->bus, slot_nr, 2331 name, NULL); 2332 if (IS_ERR(hpdev->pci_slot)) { 2333 pr_warn("pci_create slot %s failed\n", name); 2334 hpdev->pci_slot = NULL; 2335 } 2336 } 2337 } 2338 2339 /* 2340 * Remove entries in sysfs pci slot directory. 2341 */ 2342 static void hv_pci_remove_slots(struct hv_pcibus_device *hbus) 2343 { 2344 struct hv_pci_dev *hpdev; 2345 2346 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2347 if (!hpdev->pci_slot) 2348 continue; 2349 pci_destroy_slot(hpdev->pci_slot); 2350 hpdev->pci_slot = NULL; 2351 } 2352 } 2353 2354 /* 2355 * Set NUMA node for the devices on the bus 2356 */ 2357 static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus) 2358 { 2359 struct pci_dev *dev; 2360 struct pci_bus *bus = hbus->bridge->bus; 2361 struct hv_pci_dev *hv_dev; 2362 2363 list_for_each_entry(dev, &bus->devices, bus_list) { 2364 hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn)); 2365 if (!hv_dev) 2366 continue; 2367 2368 if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY && 2369 hv_dev->desc.virtual_numa_node < num_possible_nodes()) 2370 /* 2371 * The kernel may boot with some NUMA nodes offline 2372 * (e.g. in a KDUMP kernel) or with NUMA disabled via 2373 * "numa=off". In those cases, adjust the host provided 2374 * NUMA node to a valid NUMA node used by the kernel. 2375 */ 2376 set_dev_node(&dev->dev, 2377 numa_map_to_online_node( 2378 hv_dev->desc.virtual_numa_node)); 2379 2380 put_pcichild(hv_dev); 2381 } 2382 } 2383 2384 /** 2385 * create_root_hv_pci_bus() - Expose a new root PCI bus 2386 * @hbus: Root PCI bus, as understood by this driver 2387 * 2388 * Return: 0 on success, -errno on failure 2389 */ 2390 static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) 2391 { 2392 int error; 2393 struct pci_host_bridge *bridge = hbus->bridge; 2394 2395 bridge->dev.parent = &hbus->hdev->device; 2396 bridge->sysdata = &hbus->sysdata; 2397 bridge->ops = &hv_pcifront_ops; 2398 2399 error = pci_scan_root_bus_bridge(bridge); 2400 if (error) 2401 return error; 2402 2403 pci_lock_rescan_remove(); 2404 hv_pci_assign_numa_node(hbus); 2405 pci_bus_assign_resources(bridge->bus); 2406 hv_pci_assign_slots(hbus); 2407 pci_bus_add_devices(bridge->bus); 2408 pci_unlock_rescan_remove(); 2409 hbus->state = hv_pcibus_installed; 2410 return 0; 2411 } 2412 2413 struct q_res_req_compl { 2414 struct completion host_event; 2415 struct hv_pci_dev *hpdev; 2416 }; 2417 2418 /** 2419 * q_resource_requirements() - Query Resource Requirements 2420 * @context: The completion context. 2421 * @resp: The response that came from the host. 2422 * @resp_packet_size: The size in bytes of resp. 2423 * 2424 * This function is invoked on completion of a Query Resource 2425 * Requirements packet. 2426 */ 2427 static void q_resource_requirements(void *context, struct pci_response *resp, 2428 int resp_packet_size) 2429 { 2430 struct q_res_req_compl *completion = context; 2431 struct pci_q_res_req_response *q_res_req = 2432 (struct pci_q_res_req_response *)resp; 2433 s32 status; 2434 int i; 2435 2436 status = (resp_packet_size < sizeof(*q_res_req)) ? -1 : resp->status; 2437 if (status < 0) { 2438 dev_err(&completion->hpdev->hbus->hdev->device, 2439 "query resource requirements failed: %x\n", 2440 status); 2441 } else { 2442 for (i = 0; i < PCI_STD_NUM_BARS; i++) { 2443 completion->hpdev->probed_bar[i] = 2444 q_res_req->probed_bar[i]; 2445 } 2446 } 2447 2448 complete(&completion->host_event); 2449 } 2450 2451 /** 2452 * new_pcichild_device() - Create a new child device 2453 * @hbus: The internal struct tracking this root PCI bus. 2454 * @desc: The information supplied so far from the host 2455 * about the device. 2456 * 2457 * This function creates the tracking structure for a new child 2458 * device and kicks off the process of figuring out what it is. 2459 * 2460 * Return: Pointer to the new tracking struct 2461 */ 2462 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus, 2463 struct hv_pcidev_description *desc) 2464 { 2465 struct hv_pci_dev *hpdev; 2466 struct pci_child_message *res_req; 2467 struct q_res_req_compl comp_pkt; 2468 struct { 2469 struct pci_packet init_packet; 2470 u8 buffer[sizeof(struct pci_child_message)]; 2471 } pkt; 2472 unsigned long flags; 2473 int ret; 2474 2475 hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL); 2476 if (!hpdev) 2477 return NULL; 2478 2479 hpdev->hbus = hbus; 2480 2481 memset(&pkt, 0, sizeof(pkt)); 2482 init_completion(&comp_pkt.host_event); 2483 comp_pkt.hpdev = hpdev; 2484 pkt.init_packet.compl_ctxt = &comp_pkt; 2485 pkt.init_packet.completion_func = q_resource_requirements; 2486 res_req = (struct pci_child_message *)&pkt.init_packet.message; 2487 res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS; 2488 res_req->wslot.slot = desc->win_slot.slot; 2489 2490 ret = vmbus_sendpacket(hbus->hdev->channel, res_req, 2491 sizeof(struct pci_child_message), 2492 (unsigned long)&pkt.init_packet, 2493 VM_PKT_DATA_INBAND, 2494 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 2495 if (ret) 2496 goto error; 2497 2498 if (wait_for_response(hbus->hdev, &comp_pkt.host_event)) 2499 goto error; 2500 2501 hpdev->desc = *desc; 2502 refcount_set(&hpdev->refs, 1); 2503 get_pcichild(hpdev); 2504 spin_lock_irqsave(&hbus->device_list_lock, flags); 2505 2506 list_add_tail(&hpdev->list_entry, &hbus->children); 2507 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2508 return hpdev; 2509 2510 error: 2511 kfree(hpdev); 2512 return NULL; 2513 } 2514 2515 /** 2516 * get_pcichild_wslot() - Find device from slot 2517 * @hbus: Root PCI bus, as understood by this driver 2518 * @wslot: Location on the bus 2519 * 2520 * This function looks up a PCI device and returns the internal 2521 * representation of it. It acquires a reference on it, so that 2522 * the device won't be deleted while somebody is using it. The 2523 * caller is responsible for calling put_pcichild() to release 2524 * this reference. 2525 * 2526 * Return: Internal representation of a PCI device 2527 */ 2528 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus, 2529 u32 wslot) 2530 { 2531 unsigned long flags; 2532 struct hv_pci_dev *iter, *hpdev = NULL; 2533 2534 spin_lock_irqsave(&hbus->device_list_lock, flags); 2535 list_for_each_entry(iter, &hbus->children, list_entry) { 2536 if (iter->desc.win_slot.slot == wslot) { 2537 hpdev = iter; 2538 get_pcichild(hpdev); 2539 break; 2540 } 2541 } 2542 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2543 2544 return hpdev; 2545 } 2546 2547 /** 2548 * pci_devices_present_work() - Handle new list of child devices 2549 * @work: Work struct embedded in struct hv_dr_work 2550 * 2551 * "Bus Relations" is the Windows term for "children of this 2552 * bus." The terminology is preserved here for people trying to 2553 * debug the interaction between Hyper-V and Linux. This 2554 * function is called when the parent partition reports a list 2555 * of functions that should be observed under this PCI Express 2556 * port (bus). 2557 * 2558 * This function updates the list, and must tolerate being 2559 * called multiple times with the same information. The typical 2560 * number of child devices is one, with very atypical cases 2561 * involving three or four, so the algorithms used here can be 2562 * simple and inefficient. 2563 * 2564 * It must also treat the omission of a previously observed device as 2565 * notification that the device no longer exists. 2566 * 2567 * Note that this function is serialized with hv_eject_device_work(), 2568 * because both are pushed to the ordered workqueue hbus->wq. 2569 */ 2570 static void pci_devices_present_work(struct work_struct *work) 2571 { 2572 u32 child_no; 2573 bool found; 2574 struct hv_pcidev_description *new_desc; 2575 struct hv_pci_dev *hpdev; 2576 struct hv_pcibus_device *hbus; 2577 struct list_head removed; 2578 struct hv_dr_work *dr_wrk; 2579 struct hv_dr_state *dr = NULL; 2580 unsigned long flags; 2581 2582 dr_wrk = container_of(work, struct hv_dr_work, wrk); 2583 hbus = dr_wrk->bus; 2584 kfree(dr_wrk); 2585 2586 INIT_LIST_HEAD(&removed); 2587 2588 /* Pull this off the queue and process it if it was the last one. */ 2589 spin_lock_irqsave(&hbus->device_list_lock, flags); 2590 while (!list_empty(&hbus->dr_list)) { 2591 dr = list_first_entry(&hbus->dr_list, struct hv_dr_state, 2592 list_entry); 2593 list_del(&dr->list_entry); 2594 2595 /* Throw this away if the list still has stuff in it. */ 2596 if (!list_empty(&hbus->dr_list)) { 2597 kfree(dr); 2598 continue; 2599 } 2600 } 2601 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2602 2603 if (!dr) 2604 return; 2605 2606 mutex_lock(&hbus->state_lock); 2607 2608 /* First, mark all existing children as reported missing. */ 2609 spin_lock_irqsave(&hbus->device_list_lock, flags); 2610 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2611 hpdev->reported_missing = true; 2612 } 2613 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2614 2615 /* Next, add back any reported devices. */ 2616 for (child_no = 0; child_no < dr->device_count; child_no++) { 2617 found = false; 2618 new_desc = &dr->func[child_no]; 2619 2620 spin_lock_irqsave(&hbus->device_list_lock, flags); 2621 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2622 if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) && 2623 (hpdev->desc.v_id == new_desc->v_id) && 2624 (hpdev->desc.d_id == new_desc->d_id) && 2625 (hpdev->desc.ser == new_desc->ser)) { 2626 hpdev->reported_missing = false; 2627 found = true; 2628 } 2629 } 2630 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2631 2632 if (!found) { 2633 hpdev = new_pcichild_device(hbus, new_desc); 2634 if (!hpdev) 2635 dev_err(&hbus->hdev->device, 2636 "couldn't record a child device.\n"); 2637 } 2638 } 2639 2640 /* Move missing children to a list on the stack. */ 2641 spin_lock_irqsave(&hbus->device_list_lock, flags); 2642 do { 2643 found = false; 2644 list_for_each_entry(hpdev, &hbus->children, list_entry) { 2645 if (hpdev->reported_missing) { 2646 found = true; 2647 put_pcichild(hpdev); 2648 list_move_tail(&hpdev->list_entry, &removed); 2649 break; 2650 } 2651 } 2652 } while (found); 2653 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2654 2655 /* Delete everything that should no longer exist. */ 2656 while (!list_empty(&removed)) { 2657 hpdev = list_first_entry(&removed, struct hv_pci_dev, 2658 list_entry); 2659 list_del(&hpdev->list_entry); 2660 2661 if (hpdev->pci_slot) 2662 pci_destroy_slot(hpdev->pci_slot); 2663 2664 put_pcichild(hpdev); 2665 } 2666 2667 switch (hbus->state) { 2668 case hv_pcibus_installed: 2669 /* 2670 * Tell the core to rescan bus 2671 * because there may have been changes. 2672 */ 2673 pci_lock_rescan_remove(); 2674 pci_scan_child_bus(hbus->bridge->bus); 2675 hv_pci_assign_numa_node(hbus); 2676 hv_pci_assign_slots(hbus); 2677 pci_unlock_rescan_remove(); 2678 break; 2679 2680 case hv_pcibus_init: 2681 case hv_pcibus_probed: 2682 survey_child_resources(hbus); 2683 break; 2684 2685 default: 2686 break; 2687 } 2688 2689 mutex_unlock(&hbus->state_lock); 2690 2691 kfree(dr); 2692 } 2693 2694 /** 2695 * hv_pci_start_relations_work() - Queue work to start device discovery 2696 * @hbus: Root PCI bus, as understood by this driver 2697 * @dr: The list of children returned from host 2698 * 2699 * Return: 0 on success, -errno on failure 2700 */ 2701 static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus, 2702 struct hv_dr_state *dr) 2703 { 2704 struct hv_dr_work *dr_wrk; 2705 unsigned long flags; 2706 bool pending_dr; 2707 2708 if (hbus->state == hv_pcibus_removing) { 2709 dev_info(&hbus->hdev->device, 2710 "PCI VMBus BUS_RELATIONS: ignored\n"); 2711 return -ENOENT; 2712 } 2713 2714 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT); 2715 if (!dr_wrk) 2716 return -ENOMEM; 2717 2718 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work); 2719 dr_wrk->bus = hbus; 2720 2721 spin_lock_irqsave(&hbus->device_list_lock, flags); 2722 /* 2723 * If pending_dr is true, we have already queued a work, 2724 * which will see the new dr. Otherwise, we need to 2725 * queue a new work. 2726 */ 2727 pending_dr = !list_empty(&hbus->dr_list); 2728 list_add_tail(&dr->list_entry, &hbus->dr_list); 2729 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2730 2731 if (pending_dr) 2732 kfree(dr_wrk); 2733 else 2734 queue_work(hbus->wq, &dr_wrk->wrk); 2735 2736 return 0; 2737 } 2738 2739 /** 2740 * hv_pci_devices_present() - Handle list of new children 2741 * @hbus: Root PCI bus, as understood by this driver 2742 * @relations: Packet from host listing children 2743 * 2744 * Process a new list of devices on the bus. The list of devices is 2745 * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS, 2746 * whenever a new list of devices for this bus appears. 2747 */ 2748 static void hv_pci_devices_present(struct hv_pcibus_device *hbus, 2749 struct pci_bus_relations *relations) 2750 { 2751 struct hv_dr_state *dr; 2752 int i; 2753 2754 dr = kzalloc(struct_size(dr, func, relations->device_count), 2755 GFP_NOWAIT); 2756 if (!dr) 2757 return; 2758 2759 dr->device_count = relations->device_count; 2760 for (i = 0; i < dr->device_count; i++) { 2761 dr->func[i].v_id = relations->func[i].v_id; 2762 dr->func[i].d_id = relations->func[i].d_id; 2763 dr->func[i].rev = relations->func[i].rev; 2764 dr->func[i].prog_intf = relations->func[i].prog_intf; 2765 dr->func[i].subclass = relations->func[i].subclass; 2766 dr->func[i].base_class = relations->func[i].base_class; 2767 dr->func[i].subsystem_id = relations->func[i].subsystem_id; 2768 dr->func[i].win_slot = relations->func[i].win_slot; 2769 dr->func[i].ser = relations->func[i].ser; 2770 } 2771 2772 if (hv_pci_start_relations_work(hbus, dr)) 2773 kfree(dr); 2774 } 2775 2776 /** 2777 * hv_pci_devices_present2() - Handle list of new children 2778 * @hbus: Root PCI bus, as understood by this driver 2779 * @relations: Packet from host listing children 2780 * 2781 * This function is the v2 version of hv_pci_devices_present() 2782 */ 2783 static void hv_pci_devices_present2(struct hv_pcibus_device *hbus, 2784 struct pci_bus_relations2 *relations) 2785 { 2786 struct hv_dr_state *dr; 2787 int i; 2788 2789 dr = kzalloc(struct_size(dr, func, relations->device_count), 2790 GFP_NOWAIT); 2791 if (!dr) 2792 return; 2793 2794 dr->device_count = relations->device_count; 2795 for (i = 0; i < dr->device_count; i++) { 2796 dr->func[i].v_id = relations->func[i].v_id; 2797 dr->func[i].d_id = relations->func[i].d_id; 2798 dr->func[i].rev = relations->func[i].rev; 2799 dr->func[i].prog_intf = relations->func[i].prog_intf; 2800 dr->func[i].subclass = relations->func[i].subclass; 2801 dr->func[i].base_class = relations->func[i].base_class; 2802 dr->func[i].subsystem_id = relations->func[i].subsystem_id; 2803 dr->func[i].win_slot = relations->func[i].win_slot; 2804 dr->func[i].ser = relations->func[i].ser; 2805 dr->func[i].flags = relations->func[i].flags; 2806 dr->func[i].virtual_numa_node = 2807 relations->func[i].virtual_numa_node; 2808 } 2809 2810 if (hv_pci_start_relations_work(hbus, dr)) 2811 kfree(dr); 2812 } 2813 2814 /** 2815 * hv_eject_device_work() - Asynchronously handles ejection 2816 * @work: Work struct embedded in internal device struct 2817 * 2818 * This function handles ejecting a device. Windows will 2819 * attempt to gracefully eject a device, waiting 60 seconds to 2820 * hear back from the guest OS that this completed successfully. 2821 * If this timer expires, the device will be forcibly removed. 2822 */ 2823 static void hv_eject_device_work(struct work_struct *work) 2824 { 2825 struct pci_eject_response *ejct_pkt; 2826 struct hv_pcibus_device *hbus; 2827 struct hv_pci_dev *hpdev; 2828 struct pci_dev *pdev; 2829 unsigned long flags; 2830 int wslot; 2831 struct { 2832 struct pci_packet pkt; 2833 u8 buffer[sizeof(struct pci_eject_response)]; 2834 } ctxt; 2835 2836 hpdev = container_of(work, struct hv_pci_dev, wrk); 2837 hbus = hpdev->hbus; 2838 2839 mutex_lock(&hbus->state_lock); 2840 2841 /* 2842 * Ejection can come before or after the PCI bus has been set up, so 2843 * attempt to find it and tear down the bus state, if it exists. This 2844 * must be done without constructs like pci_domain_nr(hbus->bridge->bus) 2845 * because hbus->bridge->bus may not exist yet. 2846 */ 2847 wslot = wslot_to_devfn(hpdev->desc.win_slot.slot); 2848 pdev = pci_get_domain_bus_and_slot(hbus->bridge->domain_nr, 0, wslot); 2849 if (pdev) { 2850 pci_lock_rescan_remove(); 2851 pci_stop_and_remove_bus_device(pdev); 2852 pci_dev_put(pdev); 2853 pci_unlock_rescan_remove(); 2854 } 2855 2856 spin_lock_irqsave(&hbus->device_list_lock, flags); 2857 list_del(&hpdev->list_entry); 2858 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 2859 2860 if (hpdev->pci_slot) 2861 pci_destroy_slot(hpdev->pci_slot); 2862 2863 memset(&ctxt, 0, sizeof(ctxt)); 2864 ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message; 2865 ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE; 2866 ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot; 2867 vmbus_sendpacket(hbus->hdev->channel, ejct_pkt, 2868 sizeof(*ejct_pkt), 0, 2869 VM_PKT_DATA_INBAND, 0); 2870 2871 /* For the get_pcichild() in hv_pci_eject_device() */ 2872 put_pcichild(hpdev); 2873 /* For the two refs got in new_pcichild_device() */ 2874 put_pcichild(hpdev); 2875 put_pcichild(hpdev); 2876 /* hpdev has been freed. Do not use it any more. */ 2877 2878 mutex_unlock(&hbus->state_lock); 2879 } 2880 2881 /** 2882 * hv_pci_eject_device() - Handles device ejection 2883 * @hpdev: Internal device tracking struct 2884 * 2885 * This function is invoked when an ejection packet arrives. It 2886 * just schedules work so that we don't re-enter the packet 2887 * delivery code handling the ejection. 2888 */ 2889 static void hv_pci_eject_device(struct hv_pci_dev *hpdev) 2890 { 2891 struct hv_pcibus_device *hbus = hpdev->hbus; 2892 struct hv_device *hdev = hbus->hdev; 2893 2894 if (hbus->state == hv_pcibus_removing) { 2895 dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n"); 2896 return; 2897 } 2898 2899 get_pcichild(hpdev); 2900 INIT_WORK(&hpdev->wrk, hv_eject_device_work); 2901 queue_work(hbus->wq, &hpdev->wrk); 2902 } 2903 2904 /** 2905 * hv_pci_onchannelcallback() - Handles incoming packets 2906 * @context: Internal bus tracking struct 2907 * 2908 * This function is invoked whenever the host sends a packet to 2909 * this channel (which is private to this root PCI bus). 2910 */ 2911 static void hv_pci_onchannelcallback(void *context) 2912 { 2913 const int packet_size = 0x100; 2914 int ret; 2915 struct hv_pcibus_device *hbus = context; 2916 struct vmbus_channel *chan = hbus->hdev->channel; 2917 u32 bytes_recvd; 2918 u64 req_id, req_addr; 2919 struct vmpacket_descriptor *desc; 2920 unsigned char *buffer; 2921 int bufferlen = packet_size; 2922 struct pci_packet *comp_packet; 2923 struct pci_response *response; 2924 struct pci_incoming_message *new_message; 2925 struct pci_bus_relations *bus_rel; 2926 struct pci_bus_relations2 *bus_rel2; 2927 struct pci_dev_inval_block *inval; 2928 struct pci_dev_incoming *dev_message; 2929 struct hv_pci_dev *hpdev; 2930 unsigned long flags; 2931 2932 buffer = kmalloc(bufferlen, GFP_ATOMIC); 2933 if (!buffer) 2934 return; 2935 2936 while (1) { 2937 ret = vmbus_recvpacket_raw(chan, buffer, bufferlen, 2938 &bytes_recvd, &req_id); 2939 2940 if (ret == -ENOBUFS) { 2941 kfree(buffer); 2942 /* Handle large packet */ 2943 bufferlen = bytes_recvd; 2944 buffer = kmalloc(bytes_recvd, GFP_ATOMIC); 2945 if (!buffer) 2946 return; 2947 continue; 2948 } 2949 2950 /* Zero length indicates there are no more packets. */ 2951 if (ret || !bytes_recvd) 2952 break; 2953 2954 /* 2955 * All incoming packets must be at least as large as a 2956 * response. 2957 */ 2958 if (bytes_recvd <= sizeof(struct pci_response)) 2959 continue; 2960 desc = (struct vmpacket_descriptor *)buffer; 2961 2962 switch (desc->type) { 2963 case VM_PKT_COMP: 2964 2965 lock_requestor(chan, flags); 2966 req_addr = __vmbus_request_addr_match(chan, req_id, 2967 VMBUS_RQST_ADDR_ANY); 2968 if (req_addr == VMBUS_RQST_ERROR) { 2969 unlock_requestor(chan, flags); 2970 dev_err(&hbus->hdev->device, 2971 "Invalid transaction ID %llx\n", 2972 req_id); 2973 break; 2974 } 2975 comp_packet = (struct pci_packet *)req_addr; 2976 response = (struct pci_response *)buffer; 2977 /* 2978 * Call ->completion_func() within the critical section to make 2979 * sure that the packet pointer is still valid during the call: 2980 * here 'valid' means that there's a task still waiting for the 2981 * completion, and that the packet data is still on the waiting 2982 * task's stack. Cf. hv_compose_msi_msg(). 2983 */ 2984 comp_packet->completion_func(comp_packet->compl_ctxt, 2985 response, 2986 bytes_recvd); 2987 unlock_requestor(chan, flags); 2988 break; 2989 2990 case VM_PKT_DATA_INBAND: 2991 2992 new_message = (struct pci_incoming_message *)buffer; 2993 switch (new_message->message_type.type) { 2994 case PCI_BUS_RELATIONS: 2995 2996 bus_rel = (struct pci_bus_relations *)buffer; 2997 if (bytes_recvd < sizeof(*bus_rel) || 2998 bytes_recvd < 2999 struct_size(bus_rel, func, 3000 bus_rel->device_count)) { 3001 dev_err(&hbus->hdev->device, 3002 "bus relations too small\n"); 3003 break; 3004 } 3005 3006 hv_pci_devices_present(hbus, bus_rel); 3007 break; 3008 3009 case PCI_BUS_RELATIONS2: 3010 3011 bus_rel2 = (struct pci_bus_relations2 *)buffer; 3012 if (bytes_recvd < sizeof(*bus_rel2) || 3013 bytes_recvd < 3014 struct_size(bus_rel2, func, 3015 bus_rel2->device_count)) { 3016 dev_err(&hbus->hdev->device, 3017 "bus relations v2 too small\n"); 3018 break; 3019 } 3020 3021 hv_pci_devices_present2(hbus, bus_rel2); 3022 break; 3023 3024 case PCI_EJECT: 3025 3026 dev_message = (struct pci_dev_incoming *)buffer; 3027 if (bytes_recvd < sizeof(*dev_message)) { 3028 dev_err(&hbus->hdev->device, 3029 "eject message too small\n"); 3030 break; 3031 } 3032 hpdev = get_pcichild_wslot(hbus, 3033 dev_message->wslot.slot); 3034 if (hpdev) { 3035 hv_pci_eject_device(hpdev); 3036 put_pcichild(hpdev); 3037 } 3038 break; 3039 3040 case PCI_INVALIDATE_BLOCK: 3041 3042 inval = (struct pci_dev_inval_block *)buffer; 3043 if (bytes_recvd < sizeof(*inval)) { 3044 dev_err(&hbus->hdev->device, 3045 "invalidate message too small\n"); 3046 break; 3047 } 3048 hpdev = get_pcichild_wslot(hbus, 3049 inval->wslot.slot); 3050 if (hpdev) { 3051 if (hpdev->block_invalidate) { 3052 hpdev->block_invalidate( 3053 hpdev->invalidate_context, 3054 inval->block_mask); 3055 } 3056 put_pcichild(hpdev); 3057 } 3058 break; 3059 3060 default: 3061 dev_warn(&hbus->hdev->device, 3062 "Unimplemented protocol message %x\n", 3063 new_message->message_type.type); 3064 break; 3065 } 3066 break; 3067 3068 default: 3069 dev_err(&hbus->hdev->device, 3070 "unhandled packet type %d, tid %llx len %d\n", 3071 desc->type, req_id, bytes_recvd); 3072 break; 3073 } 3074 } 3075 3076 kfree(buffer); 3077 } 3078 3079 /** 3080 * hv_pci_protocol_negotiation() - Set up protocol 3081 * @hdev: VMBus's tracking struct for this root PCI bus. 3082 * @version: Array of supported channel protocol versions in 3083 * the order of probing - highest go first. 3084 * @num_version: Number of elements in the version array. 3085 * 3086 * This driver is intended to support running on Windows 10 3087 * (server) and later versions. It will not run on earlier 3088 * versions, as they assume that many of the operations which 3089 * Linux needs accomplished with a spinlock held were done via 3090 * asynchronous messaging via VMBus. Windows 10 increases the 3091 * surface area of PCI emulation so that these actions can take 3092 * place by suspending a virtual processor for their duration. 3093 * 3094 * This function negotiates the channel protocol version, 3095 * failing if the host doesn't support the necessary protocol 3096 * level. 3097 */ 3098 static int hv_pci_protocol_negotiation(struct hv_device *hdev, 3099 enum pci_protocol_version_t version[], 3100 int num_version) 3101 { 3102 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3103 struct pci_version_request *version_req; 3104 struct hv_pci_compl comp_pkt; 3105 struct pci_packet *pkt; 3106 int ret; 3107 int i; 3108 3109 /* 3110 * Initiate the handshake with the host and negotiate 3111 * a version that the host can support. We start with the 3112 * highest version number and go down if the host cannot 3113 * support it. 3114 */ 3115 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL); 3116 if (!pkt) 3117 return -ENOMEM; 3118 3119 init_completion(&comp_pkt.host_event); 3120 pkt->completion_func = hv_pci_generic_compl; 3121 pkt->compl_ctxt = &comp_pkt; 3122 version_req = (struct pci_version_request *)&pkt->message; 3123 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; 3124 3125 for (i = 0; i < num_version; i++) { 3126 version_req->protocol_version = version[i]; 3127 ret = vmbus_sendpacket(hdev->channel, version_req, 3128 sizeof(struct pci_version_request), 3129 (unsigned long)pkt, VM_PKT_DATA_INBAND, 3130 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3131 if (!ret) 3132 ret = wait_for_response(hdev, &comp_pkt.host_event); 3133 3134 if (ret) { 3135 dev_err(&hdev->device, 3136 "PCI Pass-through VSP failed to request version: %d", 3137 ret); 3138 goto exit; 3139 } 3140 3141 if (comp_pkt.completion_status >= 0) { 3142 hbus->protocol_version = version[i]; 3143 dev_info(&hdev->device, 3144 "PCI VMBus probing: Using version %#x\n", 3145 hbus->protocol_version); 3146 goto exit; 3147 } 3148 3149 if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) { 3150 dev_err(&hdev->device, 3151 "PCI Pass-through VSP failed version request: %#x", 3152 comp_pkt.completion_status); 3153 ret = -EPROTO; 3154 goto exit; 3155 } 3156 3157 reinit_completion(&comp_pkt.host_event); 3158 } 3159 3160 dev_err(&hdev->device, 3161 "PCI pass-through VSP failed to find supported version"); 3162 ret = -EPROTO; 3163 3164 exit: 3165 kfree(pkt); 3166 return ret; 3167 } 3168 3169 /** 3170 * hv_pci_free_bridge_windows() - Release memory regions for the 3171 * bus 3172 * @hbus: Root PCI bus, as understood by this driver 3173 */ 3174 static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus) 3175 { 3176 /* 3177 * Set the resources back to the way they looked when they 3178 * were allocated by setting IORESOURCE_BUSY again. 3179 */ 3180 3181 if (hbus->low_mmio_space && hbus->low_mmio_res) { 3182 hbus->low_mmio_res->flags |= IORESOURCE_BUSY; 3183 vmbus_free_mmio(hbus->low_mmio_res->start, 3184 resource_size(hbus->low_mmio_res)); 3185 } 3186 3187 if (hbus->high_mmio_space && hbus->high_mmio_res) { 3188 hbus->high_mmio_res->flags |= IORESOURCE_BUSY; 3189 vmbus_free_mmio(hbus->high_mmio_res->start, 3190 resource_size(hbus->high_mmio_res)); 3191 } 3192 } 3193 3194 /** 3195 * hv_pci_allocate_bridge_windows() - Allocate memory regions 3196 * for the bus 3197 * @hbus: Root PCI bus, as understood by this driver 3198 * 3199 * This function calls vmbus_allocate_mmio(), which is itself a 3200 * bit of a compromise. Ideally, we might change the pnp layer 3201 * in the kernel such that it comprehends either PCI devices 3202 * which are "grandchildren of ACPI," with some intermediate bus 3203 * node (in this case, VMBus) or change it such that it 3204 * understands VMBus. The pnp layer, however, has been declared 3205 * deprecated, and not subject to change. 3206 * 3207 * The workaround, implemented here, is to ask VMBus to allocate 3208 * MMIO space for this bus. VMBus itself knows which ranges are 3209 * appropriate by looking at its own ACPI objects. Then, after 3210 * these ranges are claimed, they're modified to look like they 3211 * would have looked if the ACPI and pnp code had allocated 3212 * bridge windows. These descriptors have to exist in this form 3213 * in order to satisfy the code which will get invoked when the 3214 * endpoint PCI function driver calls request_mem_region() or 3215 * request_mem_region_exclusive(). 3216 * 3217 * Return: 0 on success, -errno on failure 3218 */ 3219 static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus) 3220 { 3221 resource_size_t align; 3222 int ret; 3223 3224 if (hbus->low_mmio_space) { 3225 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space)); 3226 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0, 3227 (u64)(u32)0xffffffff, 3228 hbus->low_mmio_space, 3229 align, false); 3230 if (ret) { 3231 dev_err(&hbus->hdev->device, 3232 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n", 3233 hbus->low_mmio_space); 3234 return ret; 3235 } 3236 3237 /* Modify this resource to become a bridge window. */ 3238 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW; 3239 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY; 3240 pci_add_resource(&hbus->bridge->windows, hbus->low_mmio_res); 3241 } 3242 3243 if (hbus->high_mmio_space) { 3244 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space)); 3245 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev, 3246 0x100000000, -1, 3247 hbus->high_mmio_space, align, 3248 false); 3249 if (ret) { 3250 dev_err(&hbus->hdev->device, 3251 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n", 3252 hbus->high_mmio_space); 3253 goto release_low_mmio; 3254 } 3255 3256 /* Modify this resource to become a bridge window. */ 3257 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW; 3258 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY; 3259 pci_add_resource(&hbus->bridge->windows, hbus->high_mmio_res); 3260 } 3261 3262 return 0; 3263 3264 release_low_mmio: 3265 if (hbus->low_mmio_res) { 3266 vmbus_free_mmio(hbus->low_mmio_res->start, 3267 resource_size(hbus->low_mmio_res)); 3268 } 3269 3270 return ret; 3271 } 3272 3273 /** 3274 * hv_allocate_config_window() - Find MMIO space for PCI Config 3275 * @hbus: Root PCI bus, as understood by this driver 3276 * 3277 * This function claims memory-mapped I/O space for accessing 3278 * configuration space for the functions on this bus. 3279 * 3280 * Return: 0 on success, -errno on failure 3281 */ 3282 static int hv_allocate_config_window(struct hv_pcibus_device *hbus) 3283 { 3284 int ret; 3285 3286 /* 3287 * Set up a region of MMIO space to use for accessing configuration 3288 * space. 3289 */ 3290 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1, 3291 PCI_CONFIG_MMIO_LENGTH, 0x1000, false); 3292 if (ret) 3293 return ret; 3294 3295 /* 3296 * vmbus_allocate_mmio() gets used for allocating both device endpoint 3297 * resource claims (those which cannot be overlapped) and the ranges 3298 * which are valid for the children of this bus, which are intended 3299 * to be overlapped by those children. Set the flag on this claim 3300 * meaning that this region can't be overlapped. 3301 */ 3302 3303 hbus->mem_config->flags |= IORESOURCE_BUSY; 3304 3305 return 0; 3306 } 3307 3308 static void hv_free_config_window(struct hv_pcibus_device *hbus) 3309 { 3310 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH); 3311 } 3312 3313 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs); 3314 3315 /** 3316 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state 3317 * @hdev: VMBus's tracking struct for this root PCI bus 3318 * 3319 * Return: 0 on success, -errno on failure 3320 */ 3321 static int hv_pci_enter_d0(struct hv_device *hdev) 3322 { 3323 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3324 struct pci_bus_d0_entry *d0_entry; 3325 struct hv_pci_compl comp_pkt; 3326 struct pci_packet *pkt; 3327 bool retry = true; 3328 int ret; 3329 3330 enter_d0_retry: 3331 /* 3332 * Tell the host that the bus is ready to use, and moved into the 3333 * powered-on state. This includes telling the host which region 3334 * of memory-mapped I/O space has been chosen for configuration space 3335 * access. 3336 */ 3337 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL); 3338 if (!pkt) 3339 return -ENOMEM; 3340 3341 init_completion(&comp_pkt.host_event); 3342 pkt->completion_func = hv_pci_generic_compl; 3343 pkt->compl_ctxt = &comp_pkt; 3344 d0_entry = (struct pci_bus_d0_entry *)&pkt->message; 3345 d0_entry->message_type.type = PCI_BUS_D0ENTRY; 3346 d0_entry->mmio_base = hbus->mem_config->start; 3347 3348 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry), 3349 (unsigned long)pkt, VM_PKT_DATA_INBAND, 3350 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3351 if (!ret) 3352 ret = wait_for_response(hdev, &comp_pkt.host_event); 3353 3354 if (ret) 3355 goto exit; 3356 3357 /* 3358 * In certain case (Kdump) the pci device of interest was 3359 * not cleanly shut down and resource is still held on host 3360 * side, the host could return invalid device status. 3361 * We need to explicitly request host to release the resource 3362 * and try to enter D0 again. 3363 */ 3364 if (comp_pkt.completion_status < 0 && retry) { 3365 retry = false; 3366 3367 dev_err(&hdev->device, "Retrying D0 Entry\n"); 3368 3369 /* 3370 * Hv_pci_bus_exit() calls hv_send_resource_released() 3371 * to free up resources of its child devices. 3372 * In the kdump kernel we need to set the 3373 * wslot_res_allocated to 255 so it scans all child 3374 * devices to release resources allocated in the 3375 * normal kernel before panic happened. 3376 */ 3377 hbus->wslot_res_allocated = 255; 3378 3379 ret = hv_pci_bus_exit(hdev, true); 3380 3381 if (ret == 0) { 3382 kfree(pkt); 3383 goto enter_d0_retry; 3384 } 3385 dev_err(&hdev->device, 3386 "Retrying D0 failed with ret %d\n", ret); 3387 } 3388 3389 if (comp_pkt.completion_status < 0) { 3390 dev_err(&hdev->device, 3391 "PCI Pass-through VSP failed D0 Entry with status %x\n", 3392 comp_pkt.completion_status); 3393 ret = -EPROTO; 3394 goto exit; 3395 } 3396 3397 ret = 0; 3398 3399 exit: 3400 kfree(pkt); 3401 return ret; 3402 } 3403 3404 /** 3405 * hv_pci_query_relations() - Ask host to send list of child 3406 * devices 3407 * @hdev: VMBus's tracking struct for this root PCI bus 3408 * 3409 * Return: 0 on success, -errno on failure 3410 */ 3411 static int hv_pci_query_relations(struct hv_device *hdev) 3412 { 3413 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3414 struct pci_message message; 3415 struct completion comp; 3416 int ret; 3417 3418 /* Ask the host to send along the list of child devices */ 3419 init_completion(&comp); 3420 if (cmpxchg(&hbus->survey_event, NULL, &comp)) 3421 return -ENOTEMPTY; 3422 3423 memset(&message, 0, sizeof(message)); 3424 message.type = PCI_QUERY_BUS_RELATIONS; 3425 3426 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message), 3427 0, VM_PKT_DATA_INBAND, 0); 3428 if (!ret) 3429 ret = wait_for_response(hdev, &comp); 3430 3431 /* 3432 * In the case of fast device addition/removal, it's possible that 3433 * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we 3434 * already got a PCI_BUS_RELATIONS* message from the host and the 3435 * channel callback already scheduled a work to hbus->wq, which can be 3436 * running pci_devices_present_work() -> survey_child_resources() -> 3437 * complete(&hbus->survey_event), even after hv_pci_query_relations() 3438 * exits and the stack variable 'comp' is no longer valid; as a result, 3439 * a hang or a page fault may happen when the complete() calls 3440 * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from 3441 * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is 3442 * -ENODEV, there can't be any more work item scheduled to hbus->wq 3443 * after the flush_workqueue(): see vmbus_onoffer_rescind() -> 3444 * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() -> 3445 * channel->rescind = true. 3446 */ 3447 flush_workqueue(hbus->wq); 3448 3449 return ret; 3450 } 3451 3452 /** 3453 * hv_send_resources_allocated() - Report local resource choices 3454 * @hdev: VMBus's tracking struct for this root PCI bus 3455 * 3456 * The host OS is expecting to be sent a request as a message 3457 * which contains all the resources that the device will use. 3458 * The response contains those same resources, "translated" 3459 * which is to say, the values which should be used by the 3460 * hardware, when it delivers an interrupt. (MMIO resources are 3461 * used in local terms.) This is nice for Windows, and lines up 3462 * with the FDO/PDO split, which doesn't exist in Linux. Linux 3463 * is deeply expecting to scan an emulated PCI configuration 3464 * space. So this message is sent here only to drive the state 3465 * machine on the host forward. 3466 * 3467 * Return: 0 on success, -errno on failure 3468 */ 3469 static int hv_send_resources_allocated(struct hv_device *hdev) 3470 { 3471 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3472 struct pci_resources_assigned *res_assigned; 3473 struct pci_resources_assigned2 *res_assigned2; 3474 struct hv_pci_compl comp_pkt; 3475 struct hv_pci_dev *hpdev; 3476 struct pci_packet *pkt; 3477 size_t size_res; 3478 int wslot; 3479 int ret; 3480 3481 size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) 3482 ? sizeof(*res_assigned) : sizeof(*res_assigned2); 3483 3484 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL); 3485 if (!pkt) 3486 return -ENOMEM; 3487 3488 ret = 0; 3489 3490 for (wslot = 0; wslot < 256; wslot++) { 3491 hpdev = get_pcichild_wslot(hbus, wslot); 3492 if (!hpdev) 3493 continue; 3494 3495 memset(pkt, 0, sizeof(*pkt) + size_res); 3496 init_completion(&comp_pkt.host_event); 3497 pkt->completion_func = hv_pci_generic_compl; 3498 pkt->compl_ctxt = &comp_pkt; 3499 3500 if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) { 3501 res_assigned = 3502 (struct pci_resources_assigned *)&pkt->message; 3503 res_assigned->message_type.type = 3504 PCI_RESOURCES_ASSIGNED; 3505 res_assigned->wslot.slot = hpdev->desc.win_slot.slot; 3506 } else { 3507 res_assigned2 = 3508 (struct pci_resources_assigned2 *)&pkt->message; 3509 res_assigned2->message_type.type = 3510 PCI_RESOURCES_ASSIGNED2; 3511 res_assigned2->wslot.slot = hpdev->desc.win_slot.slot; 3512 } 3513 put_pcichild(hpdev); 3514 3515 ret = vmbus_sendpacket(hdev->channel, &pkt->message, 3516 size_res, (unsigned long)pkt, 3517 VM_PKT_DATA_INBAND, 3518 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3519 if (!ret) 3520 ret = wait_for_response(hdev, &comp_pkt.host_event); 3521 if (ret) 3522 break; 3523 3524 if (comp_pkt.completion_status < 0) { 3525 ret = -EPROTO; 3526 dev_err(&hdev->device, 3527 "resource allocated returned 0x%x", 3528 comp_pkt.completion_status); 3529 break; 3530 } 3531 3532 hbus->wslot_res_allocated = wslot; 3533 } 3534 3535 kfree(pkt); 3536 return ret; 3537 } 3538 3539 /** 3540 * hv_send_resources_released() - Report local resources 3541 * released 3542 * @hdev: VMBus's tracking struct for this root PCI bus 3543 * 3544 * Return: 0 on success, -errno on failure 3545 */ 3546 static int hv_send_resources_released(struct hv_device *hdev) 3547 { 3548 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3549 struct pci_child_message pkt; 3550 struct hv_pci_dev *hpdev; 3551 int wslot; 3552 int ret; 3553 3554 for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) { 3555 hpdev = get_pcichild_wslot(hbus, wslot); 3556 if (!hpdev) 3557 continue; 3558 3559 memset(&pkt, 0, sizeof(pkt)); 3560 pkt.message_type.type = PCI_RESOURCES_RELEASED; 3561 pkt.wslot.slot = hpdev->desc.win_slot.slot; 3562 3563 put_pcichild(hpdev); 3564 3565 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0, 3566 VM_PKT_DATA_INBAND, 0); 3567 if (ret) 3568 return ret; 3569 3570 hbus->wslot_res_allocated = wslot - 1; 3571 } 3572 3573 hbus->wslot_res_allocated = -1; 3574 3575 return 0; 3576 } 3577 3578 #define HVPCI_DOM_MAP_SIZE (64 * 1024) 3579 static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); 3580 3581 /* 3582 * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 3583 * as invalid for passthrough PCI devices of this driver. 3584 */ 3585 #define HVPCI_DOM_INVALID 0 3586 3587 /** 3588 * hv_get_dom_num() - Get a valid PCI domain number 3589 * Check if the PCI domain number is in use, and return another number if 3590 * it is in use. 3591 * 3592 * @dom: Requested domain number 3593 * 3594 * return: domain number on success, HVPCI_DOM_INVALID on failure 3595 */ 3596 static u16 hv_get_dom_num(u16 dom) 3597 { 3598 unsigned int i; 3599 3600 if (test_and_set_bit(dom, hvpci_dom_map) == 0) 3601 return dom; 3602 3603 for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { 3604 if (test_and_set_bit(i, hvpci_dom_map) == 0) 3605 return i; 3606 } 3607 3608 return HVPCI_DOM_INVALID; 3609 } 3610 3611 /** 3612 * hv_put_dom_num() - Mark the PCI domain number as free 3613 * @dom: Domain number to be freed 3614 */ 3615 static void hv_put_dom_num(u16 dom) 3616 { 3617 clear_bit(dom, hvpci_dom_map); 3618 } 3619 3620 /** 3621 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus 3622 * @hdev: VMBus's tracking struct for this root PCI bus 3623 * @dev_id: Identifies the device itself 3624 * 3625 * Return: 0 on success, -errno on failure 3626 */ 3627 static int hv_pci_probe(struct hv_device *hdev, 3628 const struct hv_vmbus_device_id *dev_id) 3629 { 3630 struct pci_host_bridge *bridge; 3631 struct hv_pcibus_device *hbus; 3632 u16 dom_req, dom; 3633 char *name; 3634 int ret; 3635 3636 bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); 3637 if (!bridge) 3638 return -ENOMEM; 3639 3640 hbus = kzalloc(sizeof(*hbus), GFP_KERNEL); 3641 if (!hbus) 3642 return -ENOMEM; 3643 3644 hbus->bridge = bridge; 3645 mutex_init(&hbus->state_lock); 3646 hbus->state = hv_pcibus_init; 3647 hbus->wslot_res_allocated = -1; 3648 3649 /* 3650 * The PCI bus "domain" is what is called "segment" in ACPI and other 3651 * specs. Pull it from the instance ID, to get something usually 3652 * unique. In rare cases of collision, we will find out another number 3653 * not in use. 3654 * 3655 * Note that, since this code only runs in a Hyper-V VM, Hyper-V 3656 * together with this guest driver can guarantee that (1) The only 3657 * domain used by Gen1 VMs for something that looks like a physical 3658 * PCI bus (which is actually emulated by the hypervisor) is domain 0. 3659 * (2) There will be no overlap between domains (after fixing possible 3660 * collisions) in the same VM. 3661 */ 3662 dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; 3663 dom = hv_get_dom_num(dom_req); 3664 3665 if (dom == HVPCI_DOM_INVALID) { 3666 dev_err(&hdev->device, 3667 "Unable to use dom# 0x%x or other numbers", dom_req); 3668 ret = -EINVAL; 3669 goto free_bus; 3670 } 3671 3672 if (dom != dom_req) 3673 dev_info(&hdev->device, 3674 "PCI dom# 0x%x has collision, using 0x%x", 3675 dom_req, dom); 3676 3677 hbus->bridge->domain_nr = dom; 3678 #ifdef CONFIG_X86 3679 hbus->sysdata.domain = dom; 3680 hbus->use_calls = !!(ms_hyperv.hints & HV_X64_USE_MMIO_HYPERCALLS); 3681 #elif defined(CONFIG_ARM64) 3682 /* 3683 * Set the PCI bus parent to be the corresponding VMbus 3684 * device. Then the VMbus device will be assigned as the 3685 * ACPI companion in pcibios_root_bridge_prepare() and 3686 * pci_dma_configure() will propagate device coherence 3687 * information to devices created on the bus. 3688 */ 3689 hbus->sysdata.parent = hdev->device.parent; 3690 hbus->use_calls = false; 3691 #endif 3692 3693 hbus->hdev = hdev; 3694 INIT_LIST_HEAD(&hbus->children); 3695 INIT_LIST_HEAD(&hbus->dr_list); 3696 spin_lock_init(&hbus->config_lock); 3697 spin_lock_init(&hbus->device_list_lock); 3698 hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0, 3699 hbus->bridge->domain_nr); 3700 if (!hbus->wq) { 3701 ret = -ENOMEM; 3702 goto free_dom; 3703 } 3704 3705 hdev->channel->next_request_id_callback = vmbus_next_request_id; 3706 hdev->channel->request_addr_callback = vmbus_request_addr; 3707 hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE; 3708 3709 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 3710 hv_pci_onchannelcallback, hbus); 3711 if (ret) 3712 goto destroy_wq; 3713 3714 hv_set_drvdata(hdev, hbus); 3715 3716 ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions, 3717 ARRAY_SIZE(pci_protocol_versions)); 3718 if (ret) 3719 goto close; 3720 3721 ret = hv_allocate_config_window(hbus); 3722 if (ret) 3723 goto close; 3724 3725 hbus->cfg_addr = ioremap(hbus->mem_config->start, 3726 PCI_CONFIG_MMIO_LENGTH); 3727 if (!hbus->cfg_addr) { 3728 dev_err(&hdev->device, 3729 "Unable to map a virtual address for config space\n"); 3730 ret = -ENOMEM; 3731 goto free_config; 3732 } 3733 3734 name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance); 3735 if (!name) { 3736 ret = -ENOMEM; 3737 goto unmap; 3738 } 3739 3740 hbus->fwnode = irq_domain_alloc_named_fwnode(name); 3741 kfree(name); 3742 if (!hbus->fwnode) { 3743 ret = -ENOMEM; 3744 goto unmap; 3745 } 3746 3747 ret = hv_pcie_init_irq_domain(hbus); 3748 if (ret) 3749 goto free_fwnode; 3750 3751 ret = hv_pci_query_relations(hdev); 3752 if (ret) 3753 goto free_irq_domain; 3754 3755 mutex_lock(&hbus->state_lock); 3756 3757 ret = hv_pci_enter_d0(hdev); 3758 if (ret) 3759 goto release_state_lock; 3760 3761 ret = hv_pci_allocate_bridge_windows(hbus); 3762 if (ret) 3763 goto exit_d0; 3764 3765 ret = hv_send_resources_allocated(hdev); 3766 if (ret) 3767 goto free_windows; 3768 3769 prepopulate_bars(hbus); 3770 3771 hbus->state = hv_pcibus_probed; 3772 3773 ret = create_root_hv_pci_bus(hbus); 3774 if (ret) 3775 goto free_windows; 3776 3777 mutex_unlock(&hbus->state_lock); 3778 return 0; 3779 3780 free_windows: 3781 hv_pci_free_bridge_windows(hbus); 3782 exit_d0: 3783 (void) hv_pci_bus_exit(hdev, true); 3784 release_state_lock: 3785 mutex_unlock(&hbus->state_lock); 3786 free_irq_domain: 3787 irq_domain_remove(hbus->irq_domain); 3788 free_fwnode: 3789 irq_domain_free_fwnode(hbus->fwnode); 3790 unmap: 3791 iounmap(hbus->cfg_addr); 3792 free_config: 3793 hv_free_config_window(hbus); 3794 close: 3795 vmbus_close(hdev->channel); 3796 destroy_wq: 3797 destroy_workqueue(hbus->wq); 3798 free_dom: 3799 hv_put_dom_num(hbus->bridge->domain_nr); 3800 free_bus: 3801 kfree(hbus); 3802 return ret; 3803 } 3804 3805 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) 3806 { 3807 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3808 struct vmbus_channel *chan = hdev->channel; 3809 struct { 3810 struct pci_packet teardown_packet; 3811 u8 buffer[sizeof(struct pci_message)]; 3812 } pkt; 3813 struct hv_pci_compl comp_pkt; 3814 struct hv_pci_dev *hpdev, *tmp; 3815 unsigned long flags; 3816 u64 trans_id; 3817 int ret; 3818 3819 /* 3820 * After the host sends the RESCIND_CHANNEL message, it doesn't 3821 * access the per-channel ringbuffer any longer. 3822 */ 3823 if (chan->rescind) 3824 return 0; 3825 3826 if (!keep_devs) { 3827 struct list_head removed; 3828 3829 /* Move all present children to the list on stack */ 3830 INIT_LIST_HEAD(&removed); 3831 spin_lock_irqsave(&hbus->device_list_lock, flags); 3832 list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry) 3833 list_move_tail(&hpdev->list_entry, &removed); 3834 spin_unlock_irqrestore(&hbus->device_list_lock, flags); 3835 3836 /* Remove all children in the list */ 3837 list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) { 3838 list_del(&hpdev->list_entry); 3839 if (hpdev->pci_slot) 3840 pci_destroy_slot(hpdev->pci_slot); 3841 /* For the two refs got in new_pcichild_device() */ 3842 put_pcichild(hpdev); 3843 put_pcichild(hpdev); 3844 } 3845 } 3846 3847 ret = hv_send_resources_released(hdev); 3848 if (ret) { 3849 dev_err(&hdev->device, 3850 "Couldn't send resources released packet(s)\n"); 3851 return ret; 3852 } 3853 3854 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet)); 3855 init_completion(&comp_pkt.host_event); 3856 pkt.teardown_packet.completion_func = hv_pci_generic_compl; 3857 pkt.teardown_packet.compl_ctxt = &comp_pkt; 3858 pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT; 3859 3860 ret = vmbus_sendpacket_getid(chan, &pkt.teardown_packet.message, 3861 sizeof(struct pci_message), 3862 (unsigned long)&pkt.teardown_packet, 3863 &trans_id, VM_PKT_DATA_INBAND, 3864 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); 3865 if (ret) 3866 return ret; 3867 3868 if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) { 3869 /* 3870 * The completion packet on the stack becomes invalid after 3871 * 'return'; remove the ID from the VMbus requestor if the 3872 * identifier is still mapped to/associated with the packet. 3873 * 3874 * Cf. hv_pci_onchannelcallback(). 3875 */ 3876 vmbus_request_addr_match(chan, trans_id, 3877 (unsigned long)&pkt.teardown_packet); 3878 return -ETIMEDOUT; 3879 } 3880 3881 return 0; 3882 } 3883 3884 /** 3885 * hv_pci_remove() - Remove routine for this VMBus channel 3886 * @hdev: VMBus's tracking struct for this root PCI bus 3887 */ 3888 static void hv_pci_remove(struct hv_device *hdev) 3889 { 3890 struct hv_pcibus_device *hbus; 3891 3892 hbus = hv_get_drvdata(hdev); 3893 if (hbus->state == hv_pcibus_installed) { 3894 tasklet_disable(&hdev->channel->callback_event); 3895 hbus->state = hv_pcibus_removing; 3896 tasklet_enable(&hdev->channel->callback_event); 3897 destroy_workqueue(hbus->wq); 3898 hbus->wq = NULL; 3899 /* 3900 * At this point, no work is running or can be scheduled 3901 * on hbus-wq. We can't race with hv_pci_devices_present() 3902 * or hv_pci_eject_device(), it's safe to proceed. 3903 */ 3904 3905 /* Remove the bus from PCI's point of view. */ 3906 pci_lock_rescan_remove(); 3907 pci_stop_root_bus(hbus->bridge->bus); 3908 hv_pci_remove_slots(hbus); 3909 pci_remove_root_bus(hbus->bridge->bus); 3910 pci_unlock_rescan_remove(); 3911 } 3912 3913 hv_pci_bus_exit(hdev, false); 3914 3915 vmbus_close(hdev->channel); 3916 3917 iounmap(hbus->cfg_addr); 3918 hv_free_config_window(hbus); 3919 hv_pci_free_bridge_windows(hbus); 3920 irq_domain_remove(hbus->irq_domain); 3921 irq_domain_free_fwnode(hbus->fwnode); 3922 3923 hv_put_dom_num(hbus->bridge->domain_nr); 3924 3925 kfree(hbus); 3926 } 3927 3928 static int hv_pci_suspend(struct hv_device *hdev) 3929 { 3930 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 3931 enum hv_pcibus_state old_state; 3932 int ret; 3933 3934 /* 3935 * hv_pci_suspend() must make sure there are no pending work items 3936 * before calling vmbus_close(), since it runs in a process context 3937 * as a callback in dpm_suspend(). When it starts to run, the channel 3938 * callback hv_pci_onchannelcallback(), which runs in a tasklet 3939 * context, can be still running concurrently and scheduling new work 3940 * items onto hbus->wq in hv_pci_devices_present() and 3941 * hv_pci_eject_device(), and the work item handlers can access the 3942 * vmbus channel, which can be being closed by hv_pci_suspend(), e.g. 3943 * the work item handler pci_devices_present_work() -> 3944 * new_pcichild_device() writes to the vmbus channel. 3945 * 3946 * To eliminate the race, hv_pci_suspend() disables the channel 3947 * callback tasklet, sets hbus->state to hv_pcibus_removing, and 3948 * re-enables the tasklet. This way, when hv_pci_suspend() proceeds, 3949 * it knows that no new work item can be scheduled, and then it flushes 3950 * hbus->wq and safely closes the vmbus channel. 3951 */ 3952 tasklet_disable(&hdev->channel->callback_event); 3953 3954 /* Change the hbus state to prevent new work items. */ 3955 old_state = hbus->state; 3956 if (hbus->state == hv_pcibus_installed) 3957 hbus->state = hv_pcibus_removing; 3958 3959 tasklet_enable(&hdev->channel->callback_event); 3960 3961 if (old_state != hv_pcibus_installed) 3962 return -EINVAL; 3963 3964 flush_workqueue(hbus->wq); 3965 3966 ret = hv_pci_bus_exit(hdev, true); 3967 if (ret) 3968 return ret; 3969 3970 vmbus_close(hdev->channel); 3971 3972 return 0; 3973 } 3974 3975 static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg) 3976 { 3977 struct irq_data *irq_data; 3978 struct msi_desc *entry; 3979 int ret = 0; 3980 3981 if (!pdev->msi_enabled && !pdev->msix_enabled) 3982 return 0; 3983 3984 msi_lock_descs(&pdev->dev); 3985 msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) { 3986 irq_data = irq_get_irq_data(entry->irq); 3987 if (WARN_ON_ONCE(!irq_data)) { 3988 ret = -EINVAL; 3989 break; 3990 } 3991 3992 hv_compose_msi_msg(irq_data, &entry->msg); 3993 } 3994 msi_unlock_descs(&pdev->dev); 3995 3996 return ret; 3997 } 3998 3999 /* 4000 * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg() 4001 * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V 4002 * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg() 4003 * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping 4004 * Table entries. 4005 */ 4006 static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus) 4007 { 4008 pci_walk_bus(hbus->bridge->bus, hv_pci_restore_msi_msg, NULL); 4009 } 4010 4011 static int hv_pci_resume(struct hv_device *hdev) 4012 { 4013 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); 4014 enum pci_protocol_version_t version[1]; 4015 int ret; 4016 4017 hbus->state = hv_pcibus_init; 4018 4019 hdev->channel->next_request_id_callback = vmbus_next_request_id; 4020 hdev->channel->request_addr_callback = vmbus_request_addr; 4021 hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE; 4022 4023 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, 4024 hv_pci_onchannelcallback, hbus); 4025 if (ret) 4026 return ret; 4027 4028 /* Only use the version that was in use before hibernation. */ 4029 version[0] = hbus->protocol_version; 4030 ret = hv_pci_protocol_negotiation(hdev, version, 1); 4031 if (ret) 4032 goto out; 4033 4034 ret = hv_pci_query_relations(hdev); 4035 if (ret) 4036 goto out; 4037 4038 mutex_lock(&hbus->state_lock); 4039 4040 ret = hv_pci_enter_d0(hdev); 4041 if (ret) 4042 goto release_state_lock; 4043 4044 ret = hv_send_resources_allocated(hdev); 4045 if (ret) 4046 goto release_state_lock; 4047 4048 prepopulate_bars(hbus); 4049 4050 hv_pci_restore_msi_state(hbus); 4051 4052 hbus->state = hv_pcibus_installed; 4053 mutex_unlock(&hbus->state_lock); 4054 return 0; 4055 4056 release_state_lock: 4057 mutex_unlock(&hbus->state_lock); 4058 out: 4059 vmbus_close(hdev->channel); 4060 return ret; 4061 } 4062 4063 static const struct hv_vmbus_device_id hv_pci_id_table[] = { 4064 /* PCI Pass-through Class ID */ 4065 /* 44C4F61D-4444-4400-9D52-802E27EDE19F */ 4066 { HV_PCIE_GUID, }, 4067 { }, 4068 }; 4069 4070 MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table); 4071 4072 static struct hv_driver hv_pci_drv = { 4073 .name = "hv_pci", 4074 .id_table = hv_pci_id_table, 4075 .probe = hv_pci_probe, 4076 .remove = hv_pci_remove, 4077 .suspend = hv_pci_suspend, 4078 .resume = hv_pci_resume, 4079 }; 4080 4081 static void __exit exit_hv_pci_drv(void) 4082 { 4083 vmbus_driver_unregister(&hv_pci_drv); 4084 4085 hvpci_block_ops.read_block = NULL; 4086 hvpci_block_ops.write_block = NULL; 4087 hvpci_block_ops.reg_blk_invalidate = NULL; 4088 } 4089 4090 static int __init init_hv_pci_drv(void) 4091 { 4092 int ret; 4093 4094 if (!hv_is_hyperv_initialized()) 4095 return -ENODEV; 4096 4097 ret = hv_pci_irqchip_init(); 4098 if (ret) 4099 return ret; 4100 4101 /* Set the invalid domain number's bit, so it will not be used */ 4102 set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); 4103 4104 /* Initialize PCI block r/w interface */ 4105 hvpci_block_ops.read_block = hv_read_config_block; 4106 hvpci_block_ops.write_block = hv_write_config_block; 4107 hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate; 4108 4109 return vmbus_driver_register(&hv_pci_drv); 4110 } 4111 4112 module_init(init_hv_pci_drv); 4113 module_exit(exit_hv_pci_drv); 4114 4115 MODULE_DESCRIPTION("Hyper-V PCI"); 4116 MODULE_LICENSE("GPL v2"); 4117