1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor. 4 * 5 * Authors: 6 * Sunil Muthuswamy <sunilmut@microsoft.com> 7 * Wei Liu <wei.liu@kernel.org> 8 */ 9 10 #include <linux/pci.h> 11 #include <linux/irq.h> 12 #include <linux/export.h> 13 #include <linux/irqchip/irq-msi-lib.h> 14 #include <asm/mshyperv.h> 15 16 static int hv_map_interrupt(union hv_device_id hv_devid, bool level, 17 int cpu, int vector, struct hv_interrupt_entry *ret_entry) 18 { 19 struct hv_input_map_device_interrupt *input; 20 struct hv_output_map_device_interrupt *output; 21 struct hv_device_interrupt_descriptor *intr_desc; 22 unsigned long flags; 23 u64 status; 24 int nr_bank, var_size; 25 26 local_irq_save(flags); 27 28 input = *this_cpu_ptr(hyperv_pcpu_input_arg); 29 output = *this_cpu_ptr(hyperv_pcpu_output_arg); 30 31 intr_desc = &input->interrupt_descriptor; 32 memset(input, 0, sizeof(*input)); 33 input->partition_id = hv_current_partition_id; 34 input->device_id = hv_devid.as_uint64; 35 intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED; 36 intr_desc->vector_count = 1; 37 intr_desc->target.vector = vector; 38 39 if (level) 40 intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL; 41 else 42 intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE; 43 44 intr_desc->target.vp_set.valid_bank_mask = 0; 45 intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K; 46 nr_bank = cpumask_to_vpset(&intr_desc->target.vp_set, cpumask_of(cpu)); 47 if (nr_bank < 0) { 48 local_irq_restore(flags); 49 pr_err("%s: unable to generate VP set\n", __func__); 50 return -EINVAL; 51 } 52 intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; 53 54 /* 55 * var-sized hypercall, var-size starts after vp_mask (thus 56 * vp_set.format does not count, but vp_set.valid_bank_mask 57 * does). 58 */ 59 var_size = nr_bank + 1; 60 61 status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size, 62 input, output); 63 *ret_entry = output->interrupt_entry; 64 65 local_irq_restore(flags); 66 67 if (!hv_result_success(status)) 68 hv_status_err(status, "\n"); 69 70 return hv_result_to_errno(status); 71 } 72 73 static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *irq_entry) 74 { 75 unsigned long flags; 76 struct hv_input_unmap_device_interrupt *input; 77 u64 status; 78 79 local_irq_save(flags); 80 input = *this_cpu_ptr(hyperv_pcpu_input_arg); 81 82 memset(input, 0, sizeof(*input)); 83 input->partition_id = hv_current_partition_id; 84 input->device_id = id; 85 input->interrupt_entry = *irq_entry; 86 87 status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL); 88 local_irq_restore(flags); 89 90 if (!hv_result_success(status)) 91 hv_status_err(status, "\n"); 92 93 return hv_result_to_errno(status); 94 } 95 96 #ifdef CONFIG_PCI_MSI 97 struct rid_data { 98 struct pci_dev *bridge; 99 u32 rid; 100 }; 101 102 static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data) 103 { 104 struct rid_data *rd = data; 105 u8 bus = PCI_BUS_NUM(rd->rid); 106 107 if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) { 108 rd->bridge = pdev; 109 rd->rid = alias; 110 } 111 112 return 0; 113 } 114 115 static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev) 116 { 117 int pos; 118 union hv_device_id hv_devid; 119 struct rid_data data = { 120 .bridge = NULL, 121 .rid = PCI_DEVID(pdev->bus->number, pdev->devfn) 122 }; 123 124 pci_for_each_dma_alias(pdev, get_rid_cb, &data); 125 126 hv_devid.as_uint64 = 0; 127 hv_devid.device_type = HV_DEVICE_TYPE_PCI; 128 hv_devid.pci.segment = pci_domain_nr(pdev->bus); 129 130 hv_devid.pci.bdf.bus = PCI_BUS_NUM(data.rid); 131 hv_devid.pci.bdf.device = PCI_SLOT(data.rid); 132 hv_devid.pci.bdf.function = PCI_FUNC(data.rid); 133 hv_devid.pci.source_shadow = HV_SOURCE_SHADOW_NONE; 134 135 if (data.bridge == NULL) 136 goto out; 137 138 /* 139 * Microsoft Hypervisor requires a bus range when the bridge is 140 * running in PCI-X mode. 141 * 142 * To distinguish conventional vs PCI-X bridge, we can check 143 * the bridge's PCI-X Secondary Status Register, Secondary Bus 144 * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge 145 * Specification Revision 1.0 5.2.2.1.3. 146 * 147 * Value zero means it is in conventional mode, otherwise it is 148 * in PCI-X mode. 149 */ 150 151 pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX); 152 if (pos) { 153 u16 status; 154 155 pci_read_config_word(data.bridge, pos + PCI_X_BRIDGE_SSTATUS, 156 &status); 157 158 if (status & PCI_X_SSTATUS_FREQ) { 159 /* Non-zero, PCI-X mode */ 160 u8 sec_bus, sub_bus; 161 162 hv_devid.pci.source_shadow = 163 HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE; 164 165 pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, 166 &sec_bus); 167 hv_devid.pci.shadow_bus_range.secondary_bus = sec_bus; 168 pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, 169 &sub_bus); 170 hv_devid.pci.shadow_bus_range.subordinate_bus = sub_bus; 171 } 172 } 173 174 out: 175 return hv_devid; 176 } 177 178 /* 179 * hv_map_msi_interrupt() - Map the MSI IRQ in the hypervisor. 180 * @data: Describes the IRQ 181 * @out_entry: Hypervisor (MSI) interrupt entry (can be NULL) 182 * 183 * Map the IRQ in the hypervisor by issuing a MAP_DEVICE_INTERRUPT hypercall. 184 * 185 * Return: 0 on success, -errno on failure 186 */ 187 int hv_map_msi_interrupt(struct irq_data *data, 188 struct hv_interrupt_entry *out_entry) 189 { 190 struct irq_cfg *cfg = irqd_cfg(data); 191 struct hv_interrupt_entry dummy; 192 union hv_device_id hv_devid; 193 struct msi_desc *msidesc; 194 struct pci_dev *pdev; 195 int cpu; 196 197 msidesc = irq_data_get_msi_desc(data); 198 pdev = msi_desc_to_pci_dev(msidesc); 199 hv_devid = hv_build_devid_type_pci(pdev); 200 cpu = cpumask_first(irq_data_get_effective_affinity_mask(data)); 201 202 return hv_map_interrupt(hv_devid, false, cpu, cfg->vector, 203 out_entry ? out_entry : &dummy); 204 } 205 EXPORT_SYMBOL_GPL(hv_map_msi_interrupt); 206 207 static void entry_to_msi_msg(struct hv_interrupt_entry *entry, 208 struct msi_msg *msg) 209 { 210 /* High address is always 0 */ 211 msg->address_hi = 0; 212 msg->address_lo = entry->msi_entry.address.as_uint32; 213 msg->data = entry->msi_entry.data.as_uint32; 214 } 215 216 static int hv_unmap_msi_interrupt(struct pci_dev *pdev, 217 struct hv_interrupt_entry *irq_entry); 218 219 static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) 220 { 221 struct hv_interrupt_entry *stored_entry; 222 struct irq_cfg *cfg = irqd_cfg(data); 223 struct msi_desc *msidesc; 224 struct pci_dev *pdev; 225 int ret; 226 227 msidesc = irq_data_get_msi_desc(data); 228 pdev = msi_desc_to_pci_dev(msidesc); 229 230 if (!cfg) { 231 pr_debug("%s: cfg is NULL", __func__); 232 return; 233 } 234 235 if (data->chip_data) { 236 /* 237 * This interrupt is already mapped. Let's unmap first. 238 * 239 * We don't use retarget interrupt hypercalls here because 240 * Microsoft Hypervisor doesn't allow root to change the vector 241 * or specify VPs outside of the set that is initially used 242 * during mapping. 243 */ 244 stored_entry = data->chip_data; 245 data->chip_data = NULL; 246 247 ret = hv_unmap_msi_interrupt(pdev, stored_entry); 248 249 kfree(stored_entry); 250 251 if (ret) 252 return; 253 } 254 255 stored_entry = kzalloc_obj(*stored_entry, GFP_ATOMIC); 256 if (!stored_entry) 257 return; 258 259 ret = hv_map_msi_interrupt(data, stored_entry); 260 if (ret) { 261 kfree(stored_entry); 262 return; 263 } 264 265 data->chip_data = stored_entry; 266 entry_to_msi_msg(data->chip_data, msg); 267 } 268 269 static int hv_unmap_msi_interrupt(struct pci_dev *pdev, 270 struct hv_interrupt_entry *irq_entry) 271 { 272 union hv_device_id hv_devid; 273 274 hv_devid = hv_build_devid_type_pci(pdev); 275 return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry); 276 } 277 278 /* NB: during map, hv_interrupt_entry is saved via data->chip_data */ 279 static void hv_teardown_msi_irq(struct pci_dev *pdev, struct irq_data *irqd) 280 { 281 struct hv_interrupt_entry irq_entry; 282 struct msi_msg msg; 283 284 if (!irqd->chip_data) { 285 pr_debug("%s: no chip data\n!", __func__); 286 return; 287 } 288 289 irq_entry = *(struct hv_interrupt_entry *)irqd->chip_data; 290 entry_to_msi_msg(&irq_entry, &msg); 291 292 kfree(irqd->chip_data); 293 irqd->chip_data = NULL; 294 295 (void)hv_unmap_msi_interrupt(pdev, &irq_entry); 296 } 297 298 /* 299 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, 300 * which implement the MSI or MSI-X Capability Structure. 301 */ 302 static struct irq_chip hv_pci_msi_controller = { 303 .name = "HV-PCI-MSI", 304 .irq_ack = irq_chip_ack_parent, 305 .irq_compose_msi_msg = hv_irq_compose_msi_msg, 306 .irq_set_affinity = irq_chip_set_affinity_parent, 307 }; 308 309 static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain, 310 struct irq_domain *real_parent, 311 struct msi_domain_info *info) 312 { 313 struct irq_chip *chip = info->chip; 314 315 if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) 316 return false; 317 318 chip->flags |= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED; 319 320 info->ops->msi_prepare = pci_msi_prepare; 321 322 return true; 323 } 324 325 #define HV_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX) 326 #define HV_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ 327 MSI_FLAG_USE_DEF_CHIP_OPS) 328 329 static struct msi_parent_ops hv_msi_parent_ops = { 330 .supported_flags = HV_MSI_FLAGS_SUPPORTED, 331 .required_flags = HV_MSI_FLAGS_REQUIRED, 332 .bus_select_token = DOMAIN_BUS_NEXUS, 333 .bus_select_mask = MATCH_PCI_MSI, 334 .chip_flags = MSI_CHIP_FLAG_SET_ACK, 335 .prefix = "HV-", 336 .init_dev_msi_info = hv_init_dev_msi_info, 337 }; 338 339 /* Allocate nr_irqs IRQs for the given irq domain */ 340 static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, 341 unsigned int nr_irqs, void *arg) 342 { 343 /* 344 * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e. 345 * everything except entry_to_msi_msg() should be in here. 346 */ 347 int ret; 348 349 ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg); 350 if (ret) 351 return ret; 352 353 for (int i = 0; i < nr_irqs; ++i) { 354 irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller, 355 NULL, handle_edge_irq, NULL, "edge"); 356 } 357 358 return 0; 359 } 360 361 static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, 362 unsigned int nr_irqs) 363 { 364 for (int i = 0; i < nr_irqs; ++i) { 365 struct irq_data *irqd = irq_domain_get_irq_data(d, virq); 366 struct msi_desc *desc; 367 368 desc = irq_data_get_msi_desc(irqd); 369 if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev))) 370 continue; 371 372 hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd); 373 } 374 375 irq_domain_free_irqs_top(d, virq, nr_irqs); 376 } 377 378 static const struct irq_domain_ops hv_msi_domain_ops = { 379 .select = msi_lib_irq_domain_select, 380 .alloc = hv_msi_domain_alloc, 381 .free = hv_msi_domain_free, 382 }; 383 384 struct irq_domain * __init hv_create_pci_msi_domain(void) 385 { 386 struct irq_domain *d = NULL; 387 388 struct irq_domain_info info = { 389 .fwnode = irq_domain_alloc_named_fwnode("HV-PCI-MSI"), 390 .ops = &hv_msi_domain_ops, 391 .parent = x86_vector_domain, 392 }; 393 394 if (info.fwnode) 395 d = msi_create_parent_irq_domain(&info, &hv_msi_parent_ops); 396 397 /* No point in going further if we can't get an irq domain */ 398 BUG_ON(!d); 399 400 return d; 401 } 402 403 #endif /* CONFIG_PCI_MSI */ 404 405 int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry) 406 { 407 union hv_device_id hv_devid; 408 409 hv_devid.as_uint64 = 0; 410 hv_devid.device_type = HV_DEVICE_TYPE_IOAPIC; 411 hv_devid.ioapic.ioapic_id = (u8)ioapic_id; 412 413 return hv_unmap_interrupt(hv_devid.as_uint64, entry); 414 } 415 EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt); 416 417 int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector, 418 struct hv_interrupt_entry *entry) 419 { 420 union hv_device_id hv_devid; 421 422 hv_devid.as_uint64 = 0; 423 hv_devid.device_type = HV_DEVICE_TYPE_IOAPIC; 424 hv_devid.ioapic.ioapic_id = (u8)ioapic_id; 425 426 return hv_map_interrupt(hv_devid, level, cpu, vector, entry); 427 } 428 EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt); 429