1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor. 5 * 6 * Authors: 7 * Sunil Muthuswamy <sunilmut@microsoft.com> 8 * Wei Liu <wei.liu@kernel.org> 9 */ 10 11 #include <linux/pci.h> 12 #include <linux/irq.h> 13 #include <linux/export.h> 14 #include <linux/irqchip/irq-msi-lib.h> 15 #include <asm/mshyperv.h> 16 17 static int hv_map_interrupt(union hv_device_id device_id, bool level, 18 int cpu, int vector, struct hv_interrupt_entry *entry) 19 { 20 struct hv_input_map_device_interrupt *input; 21 struct hv_output_map_device_interrupt *output; 22 struct hv_device_interrupt_descriptor *intr_desc; 23 unsigned long flags; 24 u64 status; 25 int nr_bank, var_size; 26 27 local_irq_save(flags); 28 29 input = *this_cpu_ptr(hyperv_pcpu_input_arg); 30 output = *this_cpu_ptr(hyperv_pcpu_output_arg); 31 32 intr_desc = &input->interrupt_descriptor; 33 memset(input, 0, sizeof(*input)); 34 input->partition_id = hv_current_partition_id; 35 input->device_id = device_id.as_uint64; 36 intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED; 37 intr_desc->vector_count = 1; 38 intr_desc->target.vector = vector; 39 40 if (level) 41 intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL; 42 else 43 intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE; 44 45 intr_desc->target.vp_set.valid_bank_mask = 0; 46 intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K; 47 nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu)); 48 if (nr_bank < 0) { 49 local_irq_restore(flags); 50 pr_err("%s: unable to generate VP set\n", __func__); 51 return -EINVAL; 52 } 53 intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; 54 55 /* 56 * var-sized hypercall, var-size starts after vp_mask (thus 57 * vp_set.format does not count, but vp_set.valid_bank_mask 58 * does). 59 */ 60 var_size = nr_bank + 1; 61 62 status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size, 63 input, output); 64 *entry = output->interrupt_entry; 65 66 local_irq_restore(flags); 67 68 if (!hv_result_success(status)) 69 hv_status_err(status, "\n"); 70 71 return hv_result_to_errno(status); 72 } 73 74 static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) 75 { 76 unsigned long flags; 77 struct hv_input_unmap_device_interrupt *input; 78 struct hv_interrupt_entry *intr_entry; 79 u64 status; 80 81 local_irq_save(flags); 82 input = *this_cpu_ptr(hyperv_pcpu_input_arg); 83 84 memset(input, 0, sizeof(*input)); 85 intr_entry = &input->interrupt_entry; 86 input->partition_id = hv_current_partition_id; 87 input->device_id = id; 88 *intr_entry = *old_entry; 89 90 status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL); 91 local_irq_restore(flags); 92 93 if (!hv_result_success(status)) 94 hv_status_err(status, "\n"); 95 96 return hv_result_to_errno(status); 97 } 98 99 #ifdef CONFIG_PCI_MSI 100 struct rid_data { 101 struct pci_dev *bridge; 102 u32 rid; 103 }; 104 105 static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data) 106 { 107 struct rid_data *rd = data; 108 u8 bus = PCI_BUS_NUM(rd->rid); 109 110 if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) { 111 rd->bridge = pdev; 112 rd->rid = alias; 113 } 114 115 return 0; 116 } 117 118 static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev) 119 { 120 union hv_device_id dev_id; 121 struct rid_data data = { 122 .bridge = NULL, 123 .rid = PCI_DEVID(dev->bus->number, dev->devfn) 124 }; 125 126 pci_for_each_dma_alias(dev, get_rid_cb, &data); 127 128 dev_id.as_uint64 = 0; 129 dev_id.device_type = HV_DEVICE_TYPE_PCI; 130 dev_id.pci.segment = pci_domain_nr(dev->bus); 131 132 dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid); 133 dev_id.pci.bdf.device = PCI_SLOT(data.rid); 134 dev_id.pci.bdf.function = PCI_FUNC(data.rid); 135 dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE; 136 137 if (data.bridge) { 138 int pos; 139 140 /* 141 * Microsoft Hypervisor requires a bus range when the bridge is 142 * running in PCI-X mode. 143 * 144 * To distinguish conventional vs PCI-X bridge, we can check 145 * the bridge's PCI-X Secondary Status Register, Secondary Bus 146 * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge 147 * Specification Revision 1.0 5.2.2.1.3. 148 * 149 * Value zero means it is in conventional mode, otherwise it is 150 * in PCI-X mode. 151 */ 152 153 pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX); 154 if (pos) { 155 u16 status; 156 157 pci_read_config_word(data.bridge, pos + 158 PCI_X_BRIDGE_SSTATUS, &status); 159 160 if (status & PCI_X_SSTATUS_FREQ) { 161 /* Non-zero, PCI-X mode */ 162 u8 sec_bus, sub_bus; 163 164 dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE; 165 166 pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus); 167 dev_id.pci.shadow_bus_range.secondary_bus = sec_bus; 168 pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus); 169 dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus; 170 } 171 } 172 } 173 174 return dev_id; 175 } 176 177 /** 178 * hv_map_msi_interrupt() - "Map" the MSI IRQ in the hypervisor. 179 * @data: Describes the IRQ 180 * @out_entry: Hypervisor (MSI) interrupt entry (can be NULL) 181 * 182 * Map the IRQ in the hypervisor by issuing a MAP_DEVICE_INTERRUPT hypercall. 183 * 184 * Return: 0 on success, -errno on failure 185 */ 186 int hv_map_msi_interrupt(struct irq_data *data, 187 struct hv_interrupt_entry *out_entry) 188 { 189 struct irq_cfg *cfg = irqd_cfg(data); 190 struct hv_interrupt_entry dummy; 191 union hv_device_id device_id; 192 struct msi_desc *msidesc; 193 struct pci_dev *dev; 194 int cpu; 195 196 msidesc = irq_data_get_msi_desc(data); 197 dev = msi_desc_to_pci_dev(msidesc); 198 device_id = hv_build_pci_dev_id(dev); 199 cpu = cpumask_first(irq_data_get_effective_affinity_mask(data)); 200 201 return hv_map_interrupt(device_id, false, cpu, cfg->vector, 202 out_entry ? out_entry : &dummy); 203 } 204 EXPORT_SYMBOL_GPL(hv_map_msi_interrupt); 205 206 static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg) 207 { 208 /* High address is always 0 */ 209 msg->address_hi = 0; 210 msg->address_lo = entry->msi_entry.address.as_uint32; 211 msg->data = entry->msi_entry.data.as_uint32; 212 } 213 214 static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry); 215 static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) 216 { 217 struct hv_interrupt_entry *stored_entry; 218 struct irq_cfg *cfg = irqd_cfg(data); 219 struct msi_desc *msidesc; 220 struct pci_dev *dev; 221 int ret; 222 223 msidesc = irq_data_get_msi_desc(data); 224 dev = msi_desc_to_pci_dev(msidesc); 225 226 if (!cfg) { 227 pr_debug("%s: cfg is NULL", __func__); 228 return; 229 } 230 231 if (data->chip_data) { 232 /* 233 * This interrupt is already mapped. Let's unmap first. 234 * 235 * We don't use retarget interrupt hypercalls here because 236 * Microsoft Hypervisor doesn't allow root to change the vector 237 * or specify VPs outside of the set that is initially used 238 * during mapping. 239 */ 240 stored_entry = data->chip_data; 241 data->chip_data = NULL; 242 243 ret = hv_unmap_msi_interrupt(dev, stored_entry); 244 245 kfree(stored_entry); 246 247 if (ret) 248 return; 249 } 250 251 stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC); 252 if (!stored_entry) { 253 pr_debug("%s: failed to allocate chip data\n", __func__); 254 return; 255 } 256 257 ret = hv_map_msi_interrupt(data, stored_entry); 258 if (ret) { 259 kfree(stored_entry); 260 return; 261 } 262 263 data->chip_data = stored_entry; 264 entry_to_msi_msg(data->chip_data, msg); 265 266 return; 267 } 268 269 static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry) 270 { 271 return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry); 272 } 273 274 static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd) 275 { 276 struct hv_interrupt_entry old_entry; 277 struct msi_msg msg; 278 279 if (!irqd->chip_data) { 280 pr_debug("%s: no chip data\n!", __func__); 281 return; 282 } 283 284 old_entry = *(struct hv_interrupt_entry *)irqd->chip_data; 285 entry_to_msi_msg(&old_entry, &msg); 286 287 kfree(irqd->chip_data); 288 irqd->chip_data = NULL; 289 290 (void)hv_unmap_msi_interrupt(dev, &old_entry); 291 } 292 293 /* 294 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, 295 * which implement the MSI or MSI-X Capability Structure. 296 */ 297 static struct irq_chip hv_pci_msi_controller = { 298 .name = "HV-PCI-MSI", 299 .irq_ack = irq_chip_ack_parent, 300 .irq_compose_msi_msg = hv_irq_compose_msi_msg, 301 .irq_set_affinity = irq_chip_set_affinity_parent, 302 }; 303 304 static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain, 305 struct irq_domain *real_parent, struct msi_domain_info *info) 306 { 307 struct irq_chip *chip = info->chip; 308 309 if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) 310 return false; 311 312 chip->flags |= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED; 313 314 info->ops->msi_prepare = pci_msi_prepare; 315 316 return true; 317 } 318 319 #define HV_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX) 320 #define HV_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS) 321 322 static struct msi_parent_ops hv_msi_parent_ops = { 323 .supported_flags = HV_MSI_FLAGS_SUPPORTED, 324 .required_flags = HV_MSI_FLAGS_REQUIRED, 325 .bus_select_token = DOMAIN_BUS_NEXUS, 326 .bus_select_mask = MATCH_PCI_MSI, 327 .chip_flags = MSI_CHIP_FLAG_SET_ACK, 328 .prefix = "HV-", 329 .init_dev_msi_info = hv_init_dev_msi_info, 330 }; 331 332 static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs, 333 void *arg) 334 { 335 /* 336 * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e. everything except 337 * entry_to_msi_msg() should be in here. 338 */ 339 340 int ret; 341 342 ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg); 343 if (ret) 344 return ret; 345 346 for (int i = 0; i < nr_irqs; ++i) { 347 irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller, NULL, 348 handle_edge_irq, NULL, "edge"); 349 } 350 return 0; 351 } 352 353 static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs) 354 { 355 for (int i = 0; i < nr_irqs; ++i) { 356 struct irq_data *irqd = irq_domain_get_irq_data(d, virq); 357 struct msi_desc *desc; 358 359 desc = irq_data_get_msi_desc(irqd); 360 if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev))) 361 continue; 362 363 hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd); 364 } 365 irq_domain_free_irqs_top(d, virq, nr_irqs); 366 } 367 368 static const struct irq_domain_ops hv_msi_domain_ops = { 369 .select = msi_lib_irq_domain_select, 370 .alloc = hv_msi_domain_alloc, 371 .free = hv_msi_domain_free, 372 }; 373 374 struct irq_domain * __init hv_create_pci_msi_domain(void) 375 { 376 struct irq_domain *d = NULL; 377 378 struct irq_domain_info info = { 379 .fwnode = irq_domain_alloc_named_fwnode("HV-PCI-MSI"), 380 .ops = &hv_msi_domain_ops, 381 .parent = x86_vector_domain, 382 }; 383 384 if (info.fwnode) 385 d = msi_create_parent_irq_domain(&info, &hv_msi_parent_ops); 386 387 /* No point in going further if we can't get an irq domain */ 388 BUG_ON(!d); 389 390 return d; 391 } 392 393 #endif /* CONFIG_PCI_MSI */ 394 395 int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry) 396 { 397 union hv_device_id device_id; 398 399 device_id.as_uint64 = 0; 400 device_id.device_type = HV_DEVICE_TYPE_IOAPIC; 401 device_id.ioapic.ioapic_id = (u8)ioapic_id; 402 403 return hv_unmap_interrupt(device_id.as_uint64, entry); 404 } 405 EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt); 406 407 int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector, 408 struct hv_interrupt_entry *entry) 409 { 410 union hv_device_id device_id; 411 412 device_id.as_uint64 = 0; 413 device_id.device_type = HV_DEVICE_TYPE_IOAPIC; 414 device_id.ioapic.ioapic_id = (u8)ioapic_id; 415 416 return hv_map_interrupt(device_id, level, cpu, vector, entry); 417 } 418 EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt); 419