1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2017-2019, IBM Corporation. 4 */ 5 6 #define pr_fmt(fmt) "xive-kvm: " fmt 7 8 #include <linux/kernel.h> 9 #include <linux/kvm_host.h> 10 #include <linux/err.h> 11 #include <linux/gfp.h> 12 #include <linux/spinlock.h> 13 #include <linux/delay.h> 14 #include <linux/file.h> 15 #include <linux/irqdomain.h> 16 #include <asm/uaccess.h> 17 #include <asm/kvm_book3s.h> 18 #include <asm/kvm_ppc.h> 19 #include <asm/hvcall.h> 20 #include <asm/xive.h> 21 #include <asm/xive-regs.h> 22 #include <asm/debug.h> 23 #include <asm/opal.h> 24 25 #include <linux/debugfs.h> 26 #include <linux/seq_file.h> 27 28 #include "book3s_xive.h" 29 30 static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset) 31 { 32 u64 val; 33 34 /* 35 * The KVM XIVE native device does not use the XIVE_ESB_SET_PQ_10 36 * load operation, so there is no need to enforce load-after-store 37 * ordering. 38 */ 39 40 val = in_be64(xd->eoi_mmio + offset); 41 return (u8)val; 42 } 43 44 static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio) 45 { 46 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 47 struct xive_q *q = &xc->queues[prio]; 48 49 xive_native_disable_queue(xc->vp_id, q, prio); 50 if (q->qpage) { 51 put_page(virt_to_page(q->qpage)); 52 q->qpage = NULL; 53 } 54 } 55 56 static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q, 57 u8 prio, __be32 *qpage, 58 u32 order, bool can_escalate) 59 { 60 int rc; 61 __be32 *qpage_prev = q->qpage; 62 63 rc = xive_native_configure_queue(vp_id, q, prio, qpage, order, 64 can_escalate); 65 if (rc) 66 return rc; 67 68 if (qpage_prev) 69 put_page(virt_to_page(qpage_prev)); 70 71 return rc; 72 } 73 74 void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) 75 { 76 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 77 int i; 78 79 if (!kvmppc_xive_enabled(vcpu)) 80 return; 81 82 if (!xc) 83 return; 84 85 pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num); 86 87 /* Ensure no interrupt is still routed to that VP */ 88 xc->valid = false; 89 kvmppc_xive_disable_vcpu_interrupts(vcpu); 90 91 /* Free escalations */ 92 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { 93 /* Free the escalation irq */ 94 if (xc->esc_virq[i]) { 95 if (kvmppc_xive_has_single_escalation(xc->xive)) 96 xive_cleanup_single_escalation(vcpu, xc->esc_virq[i]); 97 free_irq(xc->esc_virq[i], vcpu); 98 irq_dispose_mapping(xc->esc_virq[i]); 99 kfree(xc->esc_virq_names[i]); 100 xc->esc_virq[i] = 0; 101 } 102 } 103 104 /* Disable the VP */ 105 xive_native_disable_vp(xc->vp_id); 106 107 /* Clear the cam word so guest entry won't try to push context */ 108 vcpu->arch.xive_cam_word = 0; 109 110 /* Free the queues */ 111 for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { 112 kvmppc_xive_native_cleanup_queue(vcpu, i); 113 } 114 115 /* Free the VP */ 116 kfree(xc); 117 118 /* Cleanup the vcpu */ 119 vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; 120 vcpu->arch.xive_vcpu = NULL; 121 } 122 123 int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, 124 struct kvm_vcpu *vcpu, u32 server_num) 125 { 126 struct kvmppc_xive *xive = dev->private; 127 struct kvmppc_xive_vcpu *xc = NULL; 128 int rc; 129 u32 vp_id; 130 131 pr_devel("native_connect_vcpu(server=%d)\n", server_num); 132 133 if (dev->ops != &kvm_xive_native_ops) { 134 pr_devel("Wrong ops !\n"); 135 return -EPERM; 136 } 137 if (xive->kvm != vcpu->kvm) 138 return -EPERM; 139 if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) 140 return -EBUSY; 141 142 mutex_lock(&xive->lock); 143 144 rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id); 145 if (rc) 146 goto bail; 147 148 xc = kzalloc(sizeof(*xc), GFP_KERNEL); 149 if (!xc) { 150 rc = -ENOMEM; 151 goto bail; 152 } 153 154 vcpu->arch.xive_vcpu = xc; 155 xc->xive = xive; 156 xc->vcpu = vcpu; 157 xc->server_num = server_num; 158 159 xc->vp_id = vp_id; 160 xc->valid = true; 161 vcpu->arch.irq_type = KVMPPC_IRQ_XIVE; 162 163 rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id); 164 if (rc) { 165 pr_err("Failed to get VP info from OPAL: %d\n", rc); 166 goto bail; 167 } 168 169 if (!kvmppc_xive_check_save_restore(vcpu)) { 170 pr_err("inconsistent save-restore setup for VCPU %d\n", server_num); 171 rc = -EIO; 172 goto bail; 173 } 174 175 /* 176 * Enable the VP first as the single escalation mode will 177 * affect escalation interrupts numbering 178 */ 179 rc = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive)); 180 if (rc) { 181 pr_err("Failed to enable VP in OPAL: %d\n", rc); 182 goto bail; 183 } 184 185 /* Configure VCPU fields for use by assembly push/pull */ 186 vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000); 187 vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO); 188 189 /* TODO: reset all queues to a clean state ? */ 190 bail: 191 mutex_unlock(&xive->lock); 192 if (rc) 193 kvmppc_xive_native_cleanup_vcpu(vcpu); 194 195 return rc; 196 } 197 198 /* 199 * Device passthrough support 200 */ 201 static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq) 202 { 203 struct kvmppc_xive *xive = kvm->arch.xive; 204 pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2; 205 206 if (irq >= KVMPPC_XIVE_NR_IRQS) 207 return -EINVAL; 208 209 /* 210 * Clear the ESB pages of the IRQ number being mapped (or 211 * unmapped) into the guest and let the VM fault handler 212 * repopulate with the appropriate ESB pages (device or IC) 213 */ 214 pr_debug("clearing esb pages for girq 0x%lx\n", irq); 215 mutex_lock(&xive->mapping_lock); 216 if (xive->mapping) 217 unmap_mapping_range(xive->mapping, 218 esb_pgoff << PAGE_SHIFT, 219 2ull << PAGE_SHIFT, 1); 220 mutex_unlock(&xive->mapping_lock); 221 return 0; 222 } 223 224 static struct kvmppc_xive_ops kvmppc_xive_native_ops = { 225 .reset_mapped = kvmppc_xive_native_reset_mapped, 226 }; 227 228 static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf) 229 { 230 struct vm_area_struct *vma = vmf->vma; 231 struct kvm_device *dev = vma->vm_file->private_data; 232 struct kvmppc_xive *xive = dev->private; 233 struct kvmppc_xive_src_block *sb; 234 struct kvmppc_xive_irq_state *state; 235 struct xive_irq_data *xd; 236 u32 hw_num; 237 u16 src; 238 u64 page; 239 unsigned long irq; 240 u64 page_offset; 241 242 /* 243 * Linux/KVM uses a two pages ESB setting, one for trigger and 244 * one for EOI 245 */ 246 page_offset = vmf->pgoff - vma->vm_pgoff; 247 irq = page_offset / 2; 248 249 sb = kvmppc_xive_find_source(xive, irq, &src); 250 if (!sb) { 251 pr_devel("%s: source %lx not found !\n", __func__, irq); 252 return VM_FAULT_SIGBUS; 253 } 254 255 state = &sb->irq_state[src]; 256 257 /* Some sanity checking */ 258 if (!state->valid) { 259 pr_devel("%s: source %lx invalid !\n", __func__, irq); 260 return VM_FAULT_SIGBUS; 261 } 262 263 kvmppc_xive_select_irq(state, &hw_num, &xd); 264 265 arch_spin_lock(&sb->lock); 266 267 /* 268 * first/even page is for trigger 269 * second/odd page is for EOI and management. 270 */ 271 page = page_offset % 2 ? xd->eoi_page : xd->trig_page; 272 arch_spin_unlock(&sb->lock); 273 274 if (WARN_ON(!page)) { 275 pr_err("%s: accessing invalid ESB page for source %lx !\n", 276 __func__, irq); 277 return VM_FAULT_SIGBUS; 278 } 279 280 vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT); 281 return VM_FAULT_NOPAGE; 282 } 283 284 static const struct vm_operations_struct xive_native_esb_vmops = { 285 .fault = xive_native_esb_fault, 286 }; 287 288 static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf) 289 { 290 struct vm_area_struct *vma = vmf->vma; 291 292 switch (vmf->pgoff - vma->vm_pgoff) { 293 case 0: /* HW - forbid access */ 294 case 1: /* HV - forbid access */ 295 return VM_FAULT_SIGBUS; 296 case 2: /* OS */ 297 vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT); 298 return VM_FAULT_NOPAGE; 299 case 3: /* USER - TODO */ 300 default: 301 return VM_FAULT_SIGBUS; 302 } 303 } 304 305 static const struct vm_operations_struct xive_native_tima_vmops = { 306 .fault = xive_native_tima_fault, 307 }; 308 309 static int kvmppc_xive_native_mmap(struct kvm_device *dev, 310 struct vm_area_struct *vma) 311 { 312 struct kvmppc_xive *xive = dev->private; 313 314 /* We only allow mappings at fixed offset for now */ 315 if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) { 316 if (vma_pages(vma) > 4) 317 return -EINVAL; 318 vma->vm_ops = &xive_native_tima_vmops; 319 } else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) { 320 if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2) 321 return -EINVAL; 322 vma->vm_ops = &xive_native_esb_vmops; 323 } else { 324 return -EINVAL; 325 } 326 327 vm_flags_set(vma, VM_IO | VM_PFNMAP); 328 vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); 329 330 /* 331 * Grab the KVM device file address_space to be able to clear 332 * the ESB pages mapping when a device is passed-through into 333 * the guest. 334 */ 335 xive->mapping = vma->vm_file->f_mapping; 336 return 0; 337 } 338 339 static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq, 340 u64 addr) 341 { 342 struct kvmppc_xive_src_block *sb; 343 struct kvmppc_xive_irq_state *state; 344 u64 __user *ubufp = (u64 __user *) addr; 345 u64 val; 346 u16 idx; 347 int rc; 348 349 pr_devel("%s irq=0x%lx\n", __func__, irq); 350 351 if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS) 352 return -E2BIG; 353 354 sb = kvmppc_xive_find_source(xive, irq, &idx); 355 if (!sb) { 356 pr_debug("No source, creating source block...\n"); 357 sb = kvmppc_xive_create_src_block(xive, irq); 358 if (!sb) { 359 pr_err("Failed to create block...\n"); 360 return -ENOMEM; 361 } 362 } 363 state = &sb->irq_state[idx]; 364 365 if (get_user(val, ubufp)) { 366 pr_err("fault getting user info !\n"); 367 return -EFAULT; 368 } 369 370 arch_spin_lock(&sb->lock); 371 372 /* 373 * If the source doesn't already have an IPI, allocate 374 * one and get the corresponding data 375 */ 376 if (!state->ipi_number) { 377 state->ipi_number = xive_native_alloc_irq(); 378 if (state->ipi_number == 0) { 379 pr_err("Failed to allocate IRQ !\n"); 380 rc = -ENXIO; 381 goto unlock; 382 } 383 xive_native_populate_irq_data(state->ipi_number, 384 &state->ipi_data); 385 pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__, 386 state->ipi_number, irq); 387 } 388 389 /* Restore LSI state */ 390 if (val & KVM_XIVE_LEVEL_SENSITIVE) { 391 state->lsi = true; 392 if (val & KVM_XIVE_LEVEL_ASSERTED) 393 state->asserted = true; 394 pr_devel(" LSI ! Asserted=%d\n", state->asserted); 395 } 396 397 /* Mask IRQ to start with */ 398 state->act_server = 0; 399 state->act_priority = MASKED; 400 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); 401 xive_native_configure_irq(state->ipi_number, 0, MASKED, 0); 402 403 /* Increment the number of valid sources and mark this one valid */ 404 if (!state->valid) 405 xive->src_count++; 406 state->valid = true; 407 408 rc = 0; 409 410 unlock: 411 arch_spin_unlock(&sb->lock); 412 413 return rc; 414 } 415 416 static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive, 417 struct kvmppc_xive_src_block *sb, 418 struct kvmppc_xive_irq_state *state, 419 u32 server, u8 priority, bool masked, 420 u32 eisn) 421 { 422 struct kvm *kvm = xive->kvm; 423 u32 hw_num; 424 int rc = 0; 425 426 arch_spin_lock(&sb->lock); 427 428 if (state->act_server == server && state->act_priority == priority && 429 state->eisn == eisn) 430 goto unlock; 431 432 pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n", 433 priority, server, masked, state->act_server, 434 state->act_priority); 435 436 kvmppc_xive_select_irq(state, &hw_num, NULL); 437 438 if (priority != MASKED && !masked) { 439 rc = kvmppc_xive_select_target(kvm, &server, priority); 440 if (rc) 441 goto unlock; 442 443 state->act_priority = priority; 444 state->act_server = server; 445 state->eisn = eisn; 446 447 rc = xive_native_configure_irq(hw_num, 448 kvmppc_xive_vp(xive, server), 449 priority, eisn); 450 } else { 451 state->act_priority = MASKED; 452 state->act_server = 0; 453 state->eisn = 0; 454 455 rc = xive_native_configure_irq(hw_num, 0, MASKED, 0); 456 } 457 458 unlock: 459 arch_spin_unlock(&sb->lock); 460 return rc; 461 } 462 463 static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive, 464 long irq, u64 addr) 465 { 466 struct kvmppc_xive_src_block *sb; 467 struct kvmppc_xive_irq_state *state; 468 u64 __user *ubufp = (u64 __user *) addr; 469 u16 src; 470 u64 kvm_cfg; 471 u32 server; 472 u8 priority; 473 bool masked; 474 u32 eisn; 475 476 sb = kvmppc_xive_find_source(xive, irq, &src); 477 if (!sb) 478 return -ENOENT; 479 480 state = &sb->irq_state[src]; 481 482 if (!state->valid) 483 return -EINVAL; 484 485 if (get_user(kvm_cfg, ubufp)) 486 return -EFAULT; 487 488 pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg); 489 490 priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >> 491 KVM_XIVE_SOURCE_PRIORITY_SHIFT; 492 server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >> 493 KVM_XIVE_SOURCE_SERVER_SHIFT; 494 masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >> 495 KVM_XIVE_SOURCE_MASKED_SHIFT; 496 eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >> 497 KVM_XIVE_SOURCE_EISN_SHIFT; 498 499 if (priority != xive_prio_from_guest(priority)) { 500 pr_err("invalid priority for queue %d for VCPU %d\n", 501 priority, server); 502 return -EINVAL; 503 } 504 505 return kvmppc_xive_native_update_source_config(xive, sb, state, server, 506 priority, masked, eisn); 507 } 508 509 static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive, 510 long irq, u64 addr) 511 { 512 struct kvmppc_xive_src_block *sb; 513 struct kvmppc_xive_irq_state *state; 514 struct xive_irq_data *xd; 515 u32 hw_num; 516 u16 src; 517 int rc = 0; 518 519 pr_devel("%s irq=0x%lx", __func__, irq); 520 521 sb = kvmppc_xive_find_source(xive, irq, &src); 522 if (!sb) 523 return -ENOENT; 524 525 state = &sb->irq_state[src]; 526 527 rc = -EINVAL; 528 529 arch_spin_lock(&sb->lock); 530 531 if (state->valid) { 532 kvmppc_xive_select_irq(state, &hw_num, &xd); 533 xive_native_sync_source(hw_num); 534 rc = 0; 535 } 536 537 arch_spin_unlock(&sb->lock); 538 return rc; 539 } 540 541 static int xive_native_validate_queue_size(u32 qshift) 542 { 543 /* 544 * We only support 64K pages for the moment. This is also 545 * advertised in the DT property "ibm,xive-eq-sizes" 546 */ 547 switch (qshift) { 548 case 0: /* EQ reset */ 549 case 16: 550 return 0; 551 case 12: 552 case 21: 553 case 24: 554 default: 555 return -EINVAL; 556 } 557 } 558 559 static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, 560 long eq_idx, u64 addr) 561 { 562 struct kvm *kvm = xive->kvm; 563 struct kvm_vcpu *vcpu; 564 struct kvmppc_xive_vcpu *xc; 565 void __user *ubufp = (void __user *) addr; 566 u32 server; 567 u8 priority; 568 struct kvm_ppc_xive_eq kvm_eq; 569 int rc; 570 __be32 *qaddr = NULL; 571 struct page *page; 572 struct xive_q *q; 573 gfn_t gfn; 574 unsigned long page_size; 575 int srcu_idx; 576 577 /* 578 * Demangle priority/server tuple from the EQ identifier 579 */ 580 priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >> 581 KVM_XIVE_EQ_PRIORITY_SHIFT; 582 server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >> 583 KVM_XIVE_EQ_SERVER_SHIFT; 584 585 if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq))) 586 return -EFAULT; 587 588 vcpu = kvmppc_xive_find_server(kvm, server); 589 if (!vcpu) { 590 pr_err("Can't find server %d\n", server); 591 return -ENOENT; 592 } 593 xc = vcpu->arch.xive_vcpu; 594 595 if (priority != xive_prio_from_guest(priority)) { 596 pr_err("Trying to restore invalid queue %d for VCPU %d\n", 597 priority, server); 598 return -EINVAL; 599 } 600 q = &xc->queues[priority]; 601 602 pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n", 603 __func__, server, priority, kvm_eq.flags, 604 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex); 605 606 /* reset queue and disable queueing */ 607 if (!kvm_eq.qshift) { 608 q->guest_qaddr = 0; 609 q->guest_qshift = 0; 610 611 rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority, 612 NULL, 0, true); 613 if (rc) { 614 pr_err("Failed to reset queue %d for VCPU %d: %d\n", 615 priority, xc->server_num, rc); 616 return rc; 617 } 618 619 return 0; 620 } 621 622 /* 623 * sPAPR specifies a "Unconditional Notify (n) flag" for the 624 * H_INT_SET_QUEUE_CONFIG hcall which forces notification 625 * without using the coalescing mechanisms provided by the 626 * XIVE END ESBs. This is required on KVM as notification 627 * using the END ESBs is not supported. 628 */ 629 if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) { 630 pr_err("invalid flags %d\n", kvm_eq.flags); 631 return -EINVAL; 632 } 633 634 rc = xive_native_validate_queue_size(kvm_eq.qshift); 635 if (rc) { 636 pr_err("invalid queue size %d\n", kvm_eq.qshift); 637 return rc; 638 } 639 640 if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) { 641 pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr, 642 1ull << kvm_eq.qshift); 643 return -EINVAL; 644 } 645 646 srcu_idx = srcu_read_lock(&kvm->srcu); 647 gfn = gpa_to_gfn(kvm_eq.qaddr); 648 649 page_size = kvm_host_page_size(vcpu, gfn); 650 if (1ull << kvm_eq.qshift > page_size) { 651 srcu_read_unlock(&kvm->srcu, srcu_idx); 652 pr_warn("Incompatible host page size %lx!\n", page_size); 653 return -EINVAL; 654 } 655 656 page = gfn_to_page(kvm, gfn); 657 if (is_error_page(page)) { 658 srcu_read_unlock(&kvm->srcu, srcu_idx); 659 pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr); 660 return -EINVAL; 661 } 662 663 qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK); 664 srcu_read_unlock(&kvm->srcu, srcu_idx); 665 666 /* 667 * Backup the queue page guest address to the mark EQ page 668 * dirty for migration. 669 */ 670 q->guest_qaddr = kvm_eq.qaddr; 671 q->guest_qshift = kvm_eq.qshift; 672 673 /* 674 * Unconditional Notification is forced by default at the 675 * OPAL level because the use of END ESBs is not supported by 676 * Linux. 677 */ 678 rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority, 679 (__be32 *) qaddr, kvm_eq.qshift, true); 680 if (rc) { 681 pr_err("Failed to configure queue %d for VCPU %d: %d\n", 682 priority, xc->server_num, rc); 683 put_page(page); 684 return rc; 685 } 686 687 /* 688 * Only restore the queue state when needed. When doing the 689 * H_INT_SET_SOURCE_CONFIG hcall, it should not. 690 */ 691 if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) { 692 rc = xive_native_set_queue_state(xc->vp_id, priority, 693 kvm_eq.qtoggle, 694 kvm_eq.qindex); 695 if (rc) 696 goto error; 697 } 698 699 rc = kvmppc_xive_attach_escalation(vcpu, priority, 700 kvmppc_xive_has_single_escalation(xive)); 701 error: 702 if (rc) 703 kvmppc_xive_native_cleanup_queue(vcpu, priority); 704 return rc; 705 } 706 707 static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive, 708 long eq_idx, u64 addr) 709 { 710 struct kvm *kvm = xive->kvm; 711 struct kvm_vcpu *vcpu; 712 struct kvmppc_xive_vcpu *xc; 713 struct xive_q *q; 714 void __user *ubufp = (u64 __user *) addr; 715 u32 server; 716 u8 priority; 717 struct kvm_ppc_xive_eq kvm_eq; 718 u64 qaddr; 719 u64 qshift; 720 u64 qeoi_page; 721 u32 escalate_irq; 722 u64 qflags; 723 int rc; 724 725 /* 726 * Demangle priority/server tuple from the EQ identifier 727 */ 728 priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >> 729 KVM_XIVE_EQ_PRIORITY_SHIFT; 730 server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >> 731 KVM_XIVE_EQ_SERVER_SHIFT; 732 733 vcpu = kvmppc_xive_find_server(kvm, server); 734 if (!vcpu) { 735 pr_err("Can't find server %d\n", server); 736 return -ENOENT; 737 } 738 xc = vcpu->arch.xive_vcpu; 739 740 if (priority != xive_prio_from_guest(priority)) { 741 pr_err("invalid priority for queue %d for VCPU %d\n", 742 priority, server); 743 return -EINVAL; 744 } 745 q = &xc->queues[priority]; 746 747 memset(&kvm_eq, 0, sizeof(kvm_eq)); 748 749 if (!q->qpage) 750 return 0; 751 752 rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift, 753 &qeoi_page, &escalate_irq, &qflags); 754 if (rc) 755 return rc; 756 757 kvm_eq.flags = 0; 758 if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY) 759 kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY; 760 761 kvm_eq.qshift = q->guest_qshift; 762 kvm_eq.qaddr = q->guest_qaddr; 763 764 rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle, 765 &kvm_eq.qindex); 766 if (rc) 767 return rc; 768 769 pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n", 770 __func__, server, priority, kvm_eq.flags, 771 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex); 772 773 if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq))) 774 return -EFAULT; 775 776 return 0; 777 } 778 779 static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb) 780 { 781 int i; 782 783 for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { 784 struct kvmppc_xive_irq_state *state = &sb->irq_state[i]; 785 786 if (!state->valid) 787 continue; 788 789 if (state->act_priority == MASKED) 790 continue; 791 792 state->eisn = 0; 793 state->act_server = 0; 794 state->act_priority = MASKED; 795 xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); 796 xive_native_configure_irq(state->ipi_number, 0, MASKED, 0); 797 if (state->pt_number) { 798 xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01); 799 xive_native_configure_irq(state->pt_number, 800 0, MASKED, 0); 801 } 802 } 803 } 804 805 static int kvmppc_xive_reset(struct kvmppc_xive *xive) 806 { 807 struct kvm *kvm = xive->kvm; 808 struct kvm_vcpu *vcpu; 809 unsigned long i; 810 811 pr_devel("%s\n", __func__); 812 813 mutex_lock(&xive->lock); 814 815 kvm_for_each_vcpu(i, vcpu, kvm) { 816 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 817 unsigned int prio; 818 819 if (!xc) 820 continue; 821 822 kvmppc_xive_disable_vcpu_interrupts(vcpu); 823 824 for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) { 825 826 /* Single escalation, no queue 7 */ 827 if (prio == 7 && kvmppc_xive_has_single_escalation(xive)) 828 break; 829 830 if (xc->esc_virq[prio]) { 831 free_irq(xc->esc_virq[prio], vcpu); 832 irq_dispose_mapping(xc->esc_virq[prio]); 833 kfree(xc->esc_virq_names[prio]); 834 xc->esc_virq[prio] = 0; 835 } 836 837 kvmppc_xive_native_cleanup_queue(vcpu, prio); 838 } 839 } 840 841 for (i = 0; i <= xive->max_sbid; i++) { 842 struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; 843 844 if (sb) { 845 arch_spin_lock(&sb->lock); 846 kvmppc_xive_reset_sources(sb); 847 arch_spin_unlock(&sb->lock); 848 } 849 } 850 851 mutex_unlock(&xive->lock); 852 853 return 0; 854 } 855 856 static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb) 857 { 858 int j; 859 860 for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) { 861 struct kvmppc_xive_irq_state *state = &sb->irq_state[j]; 862 struct xive_irq_data *xd; 863 u32 hw_num; 864 865 if (!state->valid) 866 continue; 867 868 /* 869 * The struct kvmppc_xive_irq_state reflects the state 870 * of the EAS configuration and not the state of the 871 * source. The source is masked setting the PQ bits to 872 * '-Q', which is what is being done before calling 873 * the KVM_DEV_XIVE_EQ_SYNC control. 874 * 875 * If a source EAS is configured, OPAL syncs the XIVE 876 * IC of the source and the XIVE IC of the previous 877 * target if any. 878 * 879 * So it should be fine ignoring MASKED sources as 880 * they have been synced already. 881 */ 882 if (state->act_priority == MASKED) 883 continue; 884 885 kvmppc_xive_select_irq(state, &hw_num, &xd); 886 xive_native_sync_source(hw_num); 887 xive_native_sync_queue(hw_num); 888 } 889 } 890 891 static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu) 892 { 893 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 894 unsigned int prio; 895 int srcu_idx; 896 897 if (!xc) 898 return -ENOENT; 899 900 for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) { 901 struct xive_q *q = &xc->queues[prio]; 902 903 if (!q->qpage) 904 continue; 905 906 /* Mark EQ page dirty for migration */ 907 srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 908 mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr)); 909 srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); 910 } 911 return 0; 912 } 913 914 static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive) 915 { 916 struct kvm *kvm = xive->kvm; 917 struct kvm_vcpu *vcpu; 918 unsigned long i; 919 920 pr_devel("%s\n", __func__); 921 922 mutex_lock(&xive->lock); 923 for (i = 0; i <= xive->max_sbid; i++) { 924 struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; 925 926 if (sb) { 927 arch_spin_lock(&sb->lock); 928 kvmppc_xive_native_sync_sources(sb); 929 arch_spin_unlock(&sb->lock); 930 } 931 } 932 933 kvm_for_each_vcpu(i, vcpu, kvm) { 934 kvmppc_xive_native_vcpu_eq_sync(vcpu); 935 } 936 mutex_unlock(&xive->lock); 937 938 return 0; 939 } 940 941 static int kvmppc_xive_native_set_attr(struct kvm_device *dev, 942 struct kvm_device_attr *attr) 943 { 944 struct kvmppc_xive *xive = dev->private; 945 946 switch (attr->group) { 947 case KVM_DEV_XIVE_GRP_CTRL: 948 switch (attr->attr) { 949 case KVM_DEV_XIVE_RESET: 950 return kvmppc_xive_reset(xive); 951 case KVM_DEV_XIVE_EQ_SYNC: 952 return kvmppc_xive_native_eq_sync(xive); 953 case KVM_DEV_XIVE_NR_SERVERS: 954 return kvmppc_xive_set_nr_servers(xive, attr->addr); 955 } 956 break; 957 case KVM_DEV_XIVE_GRP_SOURCE: 958 return kvmppc_xive_native_set_source(xive, attr->attr, 959 attr->addr); 960 case KVM_DEV_XIVE_GRP_SOURCE_CONFIG: 961 return kvmppc_xive_native_set_source_config(xive, attr->attr, 962 attr->addr); 963 case KVM_DEV_XIVE_GRP_EQ_CONFIG: 964 return kvmppc_xive_native_set_queue_config(xive, attr->attr, 965 attr->addr); 966 case KVM_DEV_XIVE_GRP_SOURCE_SYNC: 967 return kvmppc_xive_native_sync_source(xive, attr->attr, 968 attr->addr); 969 } 970 return -ENXIO; 971 } 972 973 static int kvmppc_xive_native_get_attr(struct kvm_device *dev, 974 struct kvm_device_attr *attr) 975 { 976 struct kvmppc_xive *xive = dev->private; 977 978 switch (attr->group) { 979 case KVM_DEV_XIVE_GRP_EQ_CONFIG: 980 return kvmppc_xive_native_get_queue_config(xive, attr->attr, 981 attr->addr); 982 } 983 return -ENXIO; 984 } 985 986 static int kvmppc_xive_native_has_attr(struct kvm_device *dev, 987 struct kvm_device_attr *attr) 988 { 989 switch (attr->group) { 990 case KVM_DEV_XIVE_GRP_CTRL: 991 switch (attr->attr) { 992 case KVM_DEV_XIVE_RESET: 993 case KVM_DEV_XIVE_EQ_SYNC: 994 case KVM_DEV_XIVE_NR_SERVERS: 995 return 0; 996 } 997 break; 998 case KVM_DEV_XIVE_GRP_SOURCE: 999 case KVM_DEV_XIVE_GRP_SOURCE_CONFIG: 1000 case KVM_DEV_XIVE_GRP_SOURCE_SYNC: 1001 if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ && 1002 attr->attr < KVMPPC_XIVE_NR_IRQS) 1003 return 0; 1004 break; 1005 case KVM_DEV_XIVE_GRP_EQ_CONFIG: 1006 return 0; 1007 } 1008 return -ENXIO; 1009 } 1010 1011 /* 1012 * Called when device fd is closed. kvm->lock is held. 1013 */ 1014 static void kvmppc_xive_native_release(struct kvm_device *dev) 1015 { 1016 struct kvmppc_xive *xive = dev->private; 1017 struct kvm *kvm = xive->kvm; 1018 struct kvm_vcpu *vcpu; 1019 unsigned long i; 1020 1021 pr_devel("Releasing xive native device\n"); 1022 1023 /* 1024 * Clear the KVM device file address_space which is used to 1025 * unmap the ESB pages when a device is passed-through. 1026 */ 1027 mutex_lock(&xive->mapping_lock); 1028 xive->mapping = NULL; 1029 mutex_unlock(&xive->mapping_lock); 1030 1031 /* 1032 * Since this is the device release function, we know that 1033 * userspace does not have any open fd or mmap referring to 1034 * the device. Therefore there can not be any of the 1035 * device attribute set/get, mmap, or page fault functions 1036 * being executed concurrently, and similarly, the 1037 * connect_vcpu and set/clr_mapped functions also cannot 1038 * be being executed. 1039 */ 1040 1041 debugfs_remove(xive->dentry); 1042 1043 /* 1044 * We should clean up the vCPU interrupt presenters first. 1045 */ 1046 kvm_for_each_vcpu(i, vcpu, kvm) { 1047 /* 1048 * Take vcpu->mutex to ensure that no one_reg get/set ioctl 1049 * (i.e. kvmppc_xive_native_[gs]et_vp) can be being done. 1050 * Holding the vcpu->mutex also means that the vcpu cannot 1051 * be executing the KVM_RUN ioctl, and therefore it cannot 1052 * be executing the XIVE push or pull code or accessing 1053 * the XIVE MMIO regions. 1054 */ 1055 mutex_lock(&vcpu->mutex); 1056 kvmppc_xive_native_cleanup_vcpu(vcpu); 1057 mutex_unlock(&vcpu->mutex); 1058 } 1059 1060 /* 1061 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type 1062 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe 1063 * against xive code getting called during vcpu execution or 1064 * set/get one_reg operations. 1065 */ 1066 kvm->arch.xive = NULL; 1067 1068 for (i = 0; i <= xive->max_sbid; i++) { 1069 if (xive->src_blocks[i]) 1070 kvmppc_xive_free_sources(xive->src_blocks[i]); 1071 kfree(xive->src_blocks[i]); 1072 xive->src_blocks[i] = NULL; 1073 } 1074 1075 if (xive->vp_base != XIVE_INVALID_VP) 1076 xive_native_free_vp_block(xive->vp_base); 1077 1078 /* 1079 * A reference of the kvmppc_xive pointer is now kept under 1080 * the xive_devices struct of the machine for reuse. It is 1081 * freed when the VM is destroyed for now until we fix all the 1082 * execution paths. 1083 */ 1084 1085 kfree(dev); 1086 } 1087 1088 /* 1089 * Create a XIVE device. kvm->lock is held. 1090 */ 1091 static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) 1092 { 1093 struct kvmppc_xive *xive; 1094 struct kvm *kvm = dev->kvm; 1095 1096 pr_devel("Creating xive native device\n"); 1097 1098 if (kvm->arch.xive) 1099 return -EEXIST; 1100 1101 xive = kvmppc_xive_get_device(kvm, type); 1102 if (!xive) 1103 return -ENOMEM; 1104 1105 dev->private = xive; 1106 xive->dev = dev; 1107 xive->kvm = kvm; 1108 mutex_init(&xive->mapping_lock); 1109 mutex_init(&xive->lock); 1110 1111 /* VP allocation is delayed to the first call to connect_vcpu */ 1112 xive->vp_base = XIVE_INVALID_VP; 1113 /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets 1114 * on a POWER9 system. 1115 */ 1116 xive->nr_servers = KVM_MAX_VCPUS; 1117 1118 if (xive_native_has_single_escalation()) 1119 xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; 1120 1121 if (xive_native_has_save_restore()) 1122 xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE; 1123 1124 xive->ops = &kvmppc_xive_native_ops; 1125 1126 kvm->arch.xive = xive; 1127 return 0; 1128 } 1129 1130 /* 1131 * Interrupt Pending Buffer (IPB) offset 1132 */ 1133 #define TM_IPB_SHIFT 40 1134 #define TM_IPB_MASK (((u64) 0xFF) << TM_IPB_SHIFT) 1135 1136 int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) 1137 { 1138 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1139 u64 opal_state; 1140 int rc; 1141 1142 if (!kvmppc_xive_enabled(vcpu)) 1143 return -EPERM; 1144 1145 if (!xc) 1146 return -ENOENT; 1147 1148 /* Thread context registers. We only care about IPB and CPPR */ 1149 val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01; 1150 1151 /* Get the VP state from OPAL */ 1152 rc = xive_native_get_vp_state(xc->vp_id, &opal_state); 1153 if (rc) 1154 return rc; 1155 1156 /* 1157 * Capture the backup of IPB register in the NVT structure and 1158 * merge it in our KVM VP state. 1159 */ 1160 val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK); 1161 1162 pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n", 1163 __func__, 1164 vcpu->arch.xive_saved_state.nsr, 1165 vcpu->arch.xive_saved_state.cppr, 1166 vcpu->arch.xive_saved_state.ipb, 1167 vcpu->arch.xive_saved_state.pipr, 1168 vcpu->arch.xive_saved_state.w01, 1169 (u32) vcpu->arch.xive_cam_word, opal_state); 1170 1171 return 0; 1172 } 1173 1174 int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) 1175 { 1176 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1177 struct kvmppc_xive *xive = vcpu->kvm->arch.xive; 1178 1179 pr_devel("%s w01=%016llx vp=%016llx\n", __func__, 1180 val->xive_timaval[0], val->xive_timaval[1]); 1181 1182 if (!kvmppc_xive_enabled(vcpu)) 1183 return -EPERM; 1184 1185 if (!xc || !xive) 1186 return -ENOENT; 1187 1188 /* We can't update the state of a "pushed" VCPU */ 1189 if (WARN_ON(vcpu->arch.xive_pushed)) 1190 return -EBUSY; 1191 1192 /* 1193 * Restore the thread context registers. IPB and CPPR should 1194 * be the only ones that matter. 1195 */ 1196 vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0]; 1197 1198 /* 1199 * There is no need to restore the XIVE internal state (IPB 1200 * stored in the NVT) as the IPB register was merged in KVM VP 1201 * state when captured. 1202 */ 1203 return 0; 1204 } 1205 1206 bool kvmppc_xive_native_supported(void) 1207 { 1208 return xive_native_has_queue_state_support(); 1209 } 1210 1211 static int xive_native_debug_show(struct seq_file *m, void *private) 1212 { 1213 struct kvmppc_xive *xive = m->private; 1214 struct kvm *kvm = xive->kvm; 1215 struct kvm_vcpu *vcpu; 1216 unsigned long i; 1217 1218 if (!kvm) 1219 return 0; 1220 1221 seq_puts(m, "=========\nVCPU state\n=========\n"); 1222 1223 kvm_for_each_vcpu(i, vcpu, kvm) { 1224 struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; 1225 1226 if (!xc) 1227 continue; 1228 1229 seq_printf(m, "VCPU %d: VP=%#x/%02x\n" 1230 " NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n", 1231 xc->server_num, xc->vp_id, xc->vp_chip_id, 1232 vcpu->arch.xive_saved_state.nsr, 1233 vcpu->arch.xive_saved_state.cppr, 1234 vcpu->arch.xive_saved_state.ipb, 1235 vcpu->arch.xive_saved_state.pipr, 1236 be64_to_cpu(vcpu->arch.xive_saved_state.w01), 1237 be32_to_cpu(vcpu->arch.xive_cam_word)); 1238 1239 kvmppc_xive_debug_show_queues(m, vcpu); 1240 } 1241 1242 seq_puts(m, "=========\nSources\n=========\n"); 1243 1244 for (i = 0; i <= xive->max_sbid; i++) { 1245 struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; 1246 1247 if (sb) { 1248 arch_spin_lock(&sb->lock); 1249 kvmppc_xive_debug_show_sources(m, sb); 1250 arch_spin_unlock(&sb->lock); 1251 } 1252 } 1253 1254 return 0; 1255 } 1256 1257 DEFINE_SHOW_ATTRIBUTE(xive_native_debug); 1258 1259 static void xive_native_debugfs_init(struct kvmppc_xive *xive) 1260 { 1261 xive->dentry = debugfs_create_file("xive", 0444, xive->kvm->debugfs_dentry, 1262 xive, &xive_native_debug_fops); 1263 1264 pr_debug("%s: created\n", __func__); 1265 } 1266 1267 static void kvmppc_xive_native_init(struct kvm_device *dev) 1268 { 1269 struct kvmppc_xive *xive = dev->private; 1270 1271 /* Register some debug interfaces */ 1272 xive_native_debugfs_init(xive); 1273 } 1274 1275 struct kvm_device_ops kvm_xive_native_ops = { 1276 .name = "kvm-xive-native", 1277 .create = kvmppc_xive_native_create, 1278 .init = kvmppc_xive_native_init, 1279 .release = kvmppc_xive_native_release, 1280 .set_attr = kvmppc_xive_native_set_attr, 1281 .get_attr = kvmppc_xive_native_get_attr, 1282 .has_attr = kvmppc_xive_native_has_attr, 1283 .mmap = kvmppc_xive_native_mmap, 1284 }; 1285