xref: /linux/arch/powerpc/kvm/book3s_xive_native.c (revision a44e4f3ab16bc808590763a543a93b6fbf3abcc4)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2017-2019, IBM Corporation.
4  */
5 
6 #define pr_fmt(fmt) "xive-kvm: " fmt
7 
8 #include <linux/kernel.h>
9 #include <linux/kvm_host.h>
10 #include <linux/err.h>
11 #include <linux/gfp.h>
12 #include <linux/spinlock.h>
13 #include <linux/delay.h>
14 #include <linux/file.h>
15 #include <linux/uaccess.h>
16 #include <asm/kvm_book3s.h>
17 #include <asm/kvm_ppc.h>
18 #include <asm/hvcall.h>
19 #include <asm/xive.h>
20 #include <asm/xive-regs.h>
21 #include <asm/debug.h>
22 #include <asm/debugfs.h>
23 #include <asm/opal.h>
24 
25 #include <linux/debugfs.h>
26 #include <linux/seq_file.h>
27 
28 #include "book3s_xive.h"
29 
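/*
 * Load from the ESB management page of a source at the given offset
 * and return the byte value, which encodes the PQ state of the
 * source. The XIVE_IRQ_FLAG_SHIFT_BUG flag marks hardware on which
 * the offset must also be replicated four bits higher.
 */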
30 static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
31 {
32 	u64 val;
33 
34 	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
35 		offset |= offset << 4;
36 
37 	val = in_be64(xd->eoi_mmio + offset);
38 	return (u8)val;
39 }
40 
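/*
 * Disable the EQ of a vcpu for the given priority and release the
 * reference taken on the guest page backing the queue, if any.
 */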
41 static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
42 {
43 	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
44 	struct xive_q *q = &xc->queues[prio];
45 
46 	xive_native_disable_queue(xc->vp_id, q, prio);
47 	if (q->qpage) {
48 		put_page(virt_to_page(q->qpage));
49 		q->qpage = NULL;
50 	}
51 }
52 
53 void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
54 {
55 	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
56 	int i;
57 
58 	if (!kvmppc_xive_enabled(vcpu))
59 		return;
60 
61 	if (!xc)
62 		return;
63 
64 	pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);
65 
66 	/* Ensure no interrupt is still routed to that VP */
67 	xc->valid = false;
68 	kvmppc_xive_disable_vcpu_interrupts(vcpu);
69 
70 	/* Free escalations */
71 	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
72 		/* Free the escalation irq */
73 		if (xc->esc_virq[i]) {
74 			if (xc->xive->single_escalation)
75 				xive_cleanup_single_escalation(vcpu, xc,
76 							xc->esc_virq[i]);
77 			free_irq(xc->esc_virq[i], vcpu);
78 			irq_dispose_mapping(xc->esc_virq[i]);
79 			kfree(xc->esc_virq_names[i]);
80 			xc->esc_virq[i] = 0;
81 		}
82 	}
83 
84 	/* Disable the VP */
85 	xive_native_disable_vp(xc->vp_id);
86 
87 	/* Clear the cam word so guest entry won't try to push context */
88 	vcpu->arch.xive_cam_word = 0;
89 
90 	/* Free the queues */
91 	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
92 		kvmppc_xive_native_cleanup_queue(vcpu, i);
93 	}
94 
95 	/* Free the VP */
96 	kfree(xc);
97 
98 	/* Cleanup the vcpu */
99 	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
100 	vcpu->arch.xive_vcpu = NULL;
101 }
102 
103 int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
104 				    struct kvm_vcpu *vcpu, u32 server_num)
105 {
106 	struct kvmppc_xive *xive = dev->private;
107 	struct kvmppc_xive_vcpu *xc = NULL;
108 	int rc;
109 	u32 vp_id;
110 
111 	pr_devel("native_connect_vcpu(server=%d)\n", server_num);
112 
113 	if (dev->ops != &kvm_xive_native_ops) {
114 		pr_devel("Wrong ops !\n");
115 		return -EPERM;
116 	}
117 	if (xive->kvm != vcpu->kvm)
118 		return -EPERM;
119 	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
120 		return -EBUSY;
121 	if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
122 		pr_devel("Out of bounds !\n");
123 		return -EINVAL;
124 	}
125 
126 	mutex_lock(&xive->lock);
127 
128 	vp_id = kvmppc_xive_vp(xive, server_num);
129 	if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
130 		pr_devel("Duplicate !\n");
131 		rc = -EEXIST;
132 		goto bail;
133 	}
134 
135 	xc = kzalloc(sizeof(*xc), GFP_KERNEL);
136 	if (!xc) {
137 		rc = -ENOMEM;
138 		goto bail;
139 	}
140 
141 	vcpu->arch.xive_vcpu = xc;
142 	xc->xive = xive;
143 	xc->vcpu = vcpu;
144 	xc->server_num = server_num;
145 
146 	xc->vp_id = vp_id;
147 	xc->valid = true;
148 	vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;
149 
150 	rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
151 	if (rc) {
152 		pr_err("Failed to get VP info from OPAL: %d\n", rc);
153 		goto bail;
154 	}
155 
156 	/*
157 	 * Enable the VP first as the single escalation mode will
158 	 * affect the numbering of the escalation interrupts
159 	 */
160 	rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
161 	if (rc) {
162 		pr_err("Failed to enable VP in OPAL: %d\n", rc);
163 		goto bail;
164 	}
165 
166 	/* Configure VCPU fields for use by assembly push/pull */
167 	vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
168 	vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
169 
170 	/* TODO: reset all queues to a clean state ? */
171 bail:
172 	mutex_unlock(&xive->lock);
173 	if (rc)
174 		kvmppc_xive_native_cleanup_vcpu(vcpu);
175 
176 	return rc;
177 }
178 
179 /*
180  * Device passthrough support
181  */
182 static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
183 {
184 	struct kvmppc_xive *xive = kvm->arch.xive;
185 	pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;
186 
187 	if (irq >= KVMPPC_XIVE_NR_IRQS)
188 		return -EINVAL;
189 
190 	/*
191 	 * Clear the ESB pages of the IRQ number being mapped (or
192 	 * unmapped) into the guest and let the VM fault handler
193 	 * repopulate with the appropriate ESB pages (device or IC)
194 	 */
195 	pr_debug("clearing esb pages for girq 0x%lx\n", irq);
196 	mutex_lock(&xive->mapping_lock);
197 	if (xive->mapping)
198 		unmap_mapping_range(xive->mapping,
199 				    esb_pgoff << PAGE_SHIFT,
200 				    2ull << PAGE_SHIFT, 1);
201 	mutex_unlock(&xive->mapping_lock);
202 	return 0;
203 }
204 
205 static struct kvmppc_xive_ops kvmppc_xive_native_ops = {
206 	.reset_mapped = kvmppc_xive_native_reset_mapped,
207 };
208 
209 static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
210 {
211 	struct vm_area_struct *vma = vmf->vma;
212 	struct kvm_device *dev = vma->vm_file->private_data;
213 	struct kvmppc_xive *xive = dev->private;
214 	struct kvmppc_xive_src_block *sb;
215 	struct kvmppc_xive_irq_state *state;
216 	struct xive_irq_data *xd;
217 	u32 hw_num;
218 	u16 src;
219 	u64 page;
220 	unsigned long irq;
221 	u64 page_offset;
222 
223 	/*
224 	 * Linux/KVM uses a two-page ESB setting, one page for trigger and
225 	 * one for EOI
226 	 */
227 	page_offset = vmf->pgoff - vma->vm_pgoff;
228 	irq = page_offset / 2;
229 
230 	sb = kvmppc_xive_find_source(xive, irq, &src);
231 	if (!sb) {
232 		pr_devel("%s: source %lx not found !\n", __func__, irq);
233 		return VM_FAULT_SIGBUS;
234 	}
235 
236 	state = &sb->irq_state[src];
237 	kvmppc_xive_select_irq(state, &hw_num, &xd);
238 
239 	arch_spin_lock(&sb->lock);
240 
241 	/*
242 	 * first/even page is for trigger
243 	 * second/odd page is for EOI and management.
244 	 */
245 	page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
246 	arch_spin_unlock(&sb->lock);
247 
248 	if (WARN_ON(!page)) {
249 		pr_err("%s: accessing invalid ESB page for source %lx !\n",
250 		       __func__, irq);
251 		return VM_FAULT_SIGBUS;
252 	}
253 
254 	vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
255 	return VM_FAULT_NOPAGE;
256 }
257 
258 static const struct vm_operations_struct xive_native_esb_vmops = {
259 	.fault = xive_native_esb_fault,
260 };
261 
262 static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
263 {
264 	struct vm_area_struct *vma = vmf->vma;
265 
266 	switch (vmf->pgoff - vma->vm_pgoff) {
267 	case 0: /* HW - forbid access */
268 	case 1: /* HV - forbid access */
269 		return VM_FAULT_SIGBUS;
270 	case 2: /* OS */
271 		vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
272 		return VM_FAULT_NOPAGE;
273 	case 3: /* USER - TODO */
274 	default:
275 		return VM_FAULT_SIGBUS;
276 	}
277 }
278 
279 static const struct vm_operations_struct xive_native_tima_vmops = {
280 	.fault = xive_native_tima_fault,
281 };
282 
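/*
 * The mmap handler of the KVM device exposes two regions at fixed
 * page offsets: the 4 TIMA pages (only the OS page is accessible)
 * and the ESB pages, two per interrupt number. The pages are
 * populated on demand by the fault handlers above.
 */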
283 static int kvmppc_xive_native_mmap(struct kvm_device *dev,
284 				   struct vm_area_struct *vma)
285 {
286 	struct kvmppc_xive *xive = dev->private;
287 
288 	/* We only allow mappings at fixed offset for now */
289 	if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
290 		if (vma_pages(vma) > 4)
291 			return -EINVAL;
292 		vma->vm_ops = &xive_native_tima_vmops;
293 	} else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
294 		if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
295 			return -EINVAL;
296 		vma->vm_ops = &xive_native_esb_vmops;
297 	} else {
298 		return -EINVAL;
299 	}
300 
301 	vma->vm_flags |= VM_IO | VM_PFNMAP;
302 	vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
303 
304 	/*
305 	 * Grab the KVM device file address_space to be able to clear
306 	 * the ESB page mappings when a device is passed through into
307 	 * the guest.
308 	 */
309 	xive->mapping = vma->vm_file->f_mapping;
310 	return 0;
311 }
312 
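/*
 * KVM_DEV_XIVE_GRP_SOURCE (write): create or restore an interrupt
 * source. The source starts off masked, with its PQ bits set to '01'.
 */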
313 static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
314 					 u64 addr)
315 {
316 	struct kvmppc_xive_src_block *sb;
317 	struct kvmppc_xive_irq_state *state;
318 	u64 __user *ubufp = (u64 __user *) addr;
319 	u64 val;
320 	u16 idx;
321 	int rc;
322 
323 	pr_devel("%s irq=0x%lx\n", __func__, irq);
324 
325 	if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
326 		return -E2BIG;
327 
328 	sb = kvmppc_xive_find_source(xive, irq, &idx);
329 	if (!sb) {
330 		pr_debug("No source, creating source block...\n");
331 		sb = kvmppc_xive_create_src_block(xive, irq);
332 		if (!sb) {
333 			pr_err("Failed to create block...\n");
334 			return -ENOMEM;
335 		}
336 	}
337 	state = &sb->irq_state[idx];
338 
339 	if (get_user(val, ubufp)) {
340 		pr_err("fault getting user info !\n");
341 		return -EFAULT;
342 	}
343 
344 	arch_spin_lock(&sb->lock);
345 
346 	/*
347 	 * If the source doesn't already have an IPI, allocate
348 	 * one and get the corresponding data
349 	 */
350 	if (!state->ipi_number) {
351 		state->ipi_number = xive_native_alloc_irq();
352 		if (state->ipi_number == 0) {
353 			pr_err("Failed to allocate IRQ !\n");
354 			rc = -ENXIO;
355 			goto unlock;
356 		}
357 		xive_native_populate_irq_data(state->ipi_number,
358 					      &state->ipi_data);
359 		pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
360 			 state->ipi_number, irq);
361 	}
362 
363 	/* Restore LSI state */
364 	if (val & KVM_XIVE_LEVEL_SENSITIVE) {
365 		state->lsi = true;
366 		if (val & KVM_XIVE_LEVEL_ASSERTED)
367 			state->asserted = true;
368 		pr_devel("  LSI ! Asserted=%d\n", state->asserted);
369 	}
370 
371 	/* Mask IRQ to start with */
372 	state->act_server = 0;
373 	state->act_priority = MASKED;
374 	xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
375 	xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
376 
377 	/* Increment the number of valid sources and mark this one valid */
378 	if (!state->valid)
379 		xive->src_count++;
380 	state->valid = true;
381 
382 	rc = 0;
383 
384 unlock:
385 	arch_spin_unlock(&sb->lock);
386 
387 	return rc;
388 }
389 
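/*
 * Route a source to a server/priority target, or mask it. The EISN
 * is the interrupt number pushed in the target EQ when the source
 * fires.
 */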
390 static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
391 					struct kvmppc_xive_src_block *sb,
392 					struct kvmppc_xive_irq_state *state,
393 					u32 server, u8 priority, bool masked,
394 					u32 eisn)
395 {
396 	struct kvm *kvm = xive->kvm;
397 	u32 hw_num;
398 	int rc = 0;
399 
400 	arch_spin_lock(&sb->lock);
401 
402 	if (state->act_server == server && state->act_priority == priority &&
403 	    state->eisn == eisn)
404 		goto unlock;
405 
406 	pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
407 		 priority, server, masked, state->act_server,
408 		 state->act_priority);
409 
410 	kvmppc_xive_select_irq(state, &hw_num, NULL);
411 
412 	if (priority != MASKED && !masked) {
413 		rc = kvmppc_xive_select_target(kvm, &server, priority);
414 		if (rc)
415 			goto unlock;
416 
417 		state->act_priority = priority;
418 		state->act_server = server;
419 		state->eisn = eisn;
420 
421 		rc = xive_native_configure_irq(hw_num,
422 					       kvmppc_xive_vp(xive, server),
423 					       priority, eisn);
424 	} else {
425 		state->act_priority = MASKED;
426 		state->act_server = 0;
427 		state->eisn = 0;
428 
429 		rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
430 	}
431 
432 unlock:
433 	arch_spin_unlock(&sb->lock);
434 	return rc;
435 }
436 
437 static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
438 						long irq, u64 addr)
439 {
440 	struct kvmppc_xive_src_block *sb;
441 	struct kvmppc_xive_irq_state *state;
442 	u64 __user *ubufp = (u64 __user *) addr;
443 	u16 src;
444 	u64 kvm_cfg;
445 	u32 server;
446 	u8 priority;
447 	bool masked;
448 	u32 eisn;
449 
450 	sb = kvmppc_xive_find_source(xive, irq, &src);
451 	if (!sb)
452 		return -ENOENT;
453 
454 	state = &sb->irq_state[src];
455 
456 	if (!state->valid)
457 		return -EINVAL;
458 
459 	if (get_user(kvm_cfg, ubufp))
460 		return -EFAULT;
461 
462 	pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);
463 
464 	priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
465 		KVM_XIVE_SOURCE_PRIORITY_SHIFT;
466 	server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
467 		KVM_XIVE_SOURCE_SERVER_SHIFT;
468 	masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
469 		KVM_XIVE_SOURCE_MASKED_SHIFT;
470 	eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
471 		KVM_XIVE_SOURCE_EISN_SHIFT;
472 
473 	if (priority != xive_prio_from_guest(priority)) {
474 		pr_err("invalid priority for queue %d for VCPU %d\n",
475 		       priority, server);
476 		return -EINVAL;
477 	}
478 
479 	return kvmppc_xive_native_update_source_config(xive, sb, state, server,
480 						       priority, masked, eisn);
481 }
482 
483 static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
484 					  long irq, u64 addr)
485 {
486 	struct kvmppc_xive_src_block *sb;
487 	struct kvmppc_xive_irq_state *state;
488 	struct xive_irq_data *xd;
489 	u32 hw_num;
490 	u16 src;
491 	int rc = 0;
492 
493 	pr_devel("%s irq=0x%lx", __func__, irq);
494 
495 	sb = kvmppc_xive_find_source(xive, irq, &src);
496 	if (!sb)
497 		return -ENOENT;
498 
499 	state = &sb->irq_state[src];
500 
501 	rc = -EINVAL;
502 
503 	arch_spin_lock(&sb->lock);
504 
505 	if (state->valid) {
506 		kvmppc_xive_select_irq(state, &hw_num, &xd);
507 		xive_native_sync_source(hw_num);
508 		rc = 0;
509 	}
510 
511 	arch_spin_unlock(&sb->lock);
512 	return rc;
513 }
514 
515 static int xive_native_validate_queue_size(u32 qshift)
516 {
517 	/*
518 	 * We only support 64K pages for the moment. This is also
519 	 * advertised in the DT property "ibm,xive-eq-sizes"
520 	 */
521 	switch (qshift) {
522 	case 0: /* EQ reset */
523 	case 16:
524 		return 0;
525 	case 12:
526 	case 21:
527 	case 24:
528 	default:
529 		return -EINVAL;
530 	}
531 }
532 
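/*
 * KVM_DEV_XIVE_GRP_EQ_CONFIG (write): configure the event queue of a
 * (server, priority) pair. A zero qshift resets the queue; otherwise
 * the guest page backing the EQ is pinned and handed over to OPAL,
 * and the queue state (toggle bit, index) is optionally restored.
 */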
533 static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
534 					       long eq_idx, u64 addr)
535 {
536 	struct kvm *kvm = xive->kvm;
537 	struct kvm_vcpu *vcpu;
538 	struct kvmppc_xive_vcpu *xc;
539 	void __user *ubufp = (void __user *) addr;
540 	u32 server;
541 	u8 priority;
542 	struct kvm_ppc_xive_eq kvm_eq;
543 	int rc;
544 	__be32 *qaddr = NULL;
545 	struct page *page;
546 	struct xive_q *q;
547 	gfn_t gfn;
548 	unsigned long page_size;
549 	int srcu_idx;
550 
551 	/*
552 	 * Demangle priority/server tuple from the EQ identifier
553 	 */
554 	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
555 		KVM_XIVE_EQ_PRIORITY_SHIFT;
556 	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
557 		KVM_XIVE_EQ_SERVER_SHIFT;
558 
559 	if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
560 		return -EFAULT;
561 
562 	vcpu = kvmppc_xive_find_server(kvm, server);
563 	if (!vcpu) {
564 		pr_err("Can't find server %d\n", server);
565 		return -ENOENT;
566 	}
567 	xc = vcpu->arch.xive_vcpu;
568 
569 	if (priority != xive_prio_from_guest(priority)) {
570 		pr_err("Trying to restore invalid queue %d for VCPU %d\n",
571 		       priority, server);
572 		return -EINVAL;
573 	}
574 	q = &xc->queues[priority];
575 
576 	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
577 		 __func__, server, priority, kvm_eq.flags,
578 		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
579 
580 	/* reset queue and disable queueing */
581 	if (!kvm_eq.qshift) {
582 		q->guest_qaddr  = 0;
583 		q->guest_qshift = 0;
584 
585 		rc = xive_native_configure_queue(xc->vp_id, q, priority,
586 						 NULL, 0, true);
587 		if (rc) {
588 			pr_err("Failed to reset queue %d for VCPU %d: %d\n",
589 			       priority, xc->server_num, rc);
590 			return rc;
591 		}
592 
593 		if (q->qpage) {
594 			put_page(virt_to_page(q->qpage));
595 			q->qpage = NULL;
596 		}
597 
598 		return 0;
599 	}
600 
601 	/*
602 	 * sPAPR specifies an "Unconditional Notify (n) flag" for the
603 	 * H_INT_SET_QUEUE_CONFIG hcall which forces notification
604 	 * without using the coalescing mechanisms provided by the
605 	 * XIVE END ESBs. This is required on KVM as notification
606 	 * using the END ESBs is not supported.
607 	 */
608 	if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
609 		pr_err("invalid flags %d\n", kvm_eq.flags);
610 		return -EINVAL;
611 	}
612 
613 	rc = xive_native_validate_queue_size(kvm_eq.qshift);
614 	if (rc) {
615 		pr_err("invalid queue size %d\n", kvm_eq.qshift);
616 		return rc;
617 	}
618 
619 	if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
620 		pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
621 		       1ull << kvm_eq.qshift);
622 		return -EINVAL;
623 	}
624 
625 	srcu_idx = srcu_read_lock(&kvm->srcu);
626 	gfn = gpa_to_gfn(kvm_eq.qaddr);
627 	page = gfn_to_page(kvm, gfn);
628 	if (is_error_page(page)) {
629 		srcu_read_unlock(&kvm->srcu, srcu_idx);
630 		pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
631 		return -EINVAL;
632 	}
633 
634 	page_size = kvm_host_page_size(kvm, gfn);
635 	if (1ull << kvm_eq.qshift > page_size) {
636 		srcu_read_unlock(&kvm->srcu, srcu_idx);
637 		pr_warn("Incompatible host page size %lx!\n", page_size);
638 		return -EINVAL;
639 	}
640 
641 	qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
642 	srcu_read_unlock(&kvm->srcu, srcu_idx);
643 
644 	/*
645 	 * Back up the queue page guest address so that the EQ page
646 	 * can be marked dirty for migration.
647 	 */
648 	q->guest_qaddr  = kvm_eq.qaddr;
649 	q->guest_qshift = kvm_eq.qshift;
650 
651 	/*
652 	 * Unconditional Notification is forced by default at the
653 	 * OPAL level because the use of END ESBs is not supported by
654 	 * Linux.
655 	 */
656 	rc = xive_native_configure_queue(xc->vp_id, q, priority,
657 					 (__be32 *) qaddr, kvm_eq.qshift, true);
658 	if (rc) {
659 		pr_err("Failed to configure queue %d for VCPU %d: %d\n",
660 		       priority, xc->server_num, rc);
661 		put_page(page);
662 		return rc;
663 	}
664 
665 	/*
666 	 * Only restore the queue state when needed. When doing the
667 	 * H_INT_SET_SOURCE_CONFIG hcall, it should not.
668 	 */
669 	if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
670 		rc = xive_native_set_queue_state(xc->vp_id, priority,
671 						 kvm_eq.qtoggle,
672 						 kvm_eq.qindex);
673 		if (rc)
674 			goto error;
675 	}
676 
677 	rc = kvmppc_xive_attach_escalation(vcpu, priority,
678 					   xive->single_escalation);
679 error:
680 	if (rc)
681 		kvmppc_xive_native_cleanup_queue(vcpu, priority);
682 	return rc;
683 }
684 
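/*
 * KVM_DEV_XIVE_GRP_EQ_CONFIG (read): return the current configuration
 * and state of an event queue, mostly for migration purposes.
 */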
685 static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
686 					       long eq_idx, u64 addr)
687 {
688 	struct kvm *kvm = xive->kvm;
689 	struct kvm_vcpu *vcpu;
690 	struct kvmppc_xive_vcpu *xc;
691 	struct xive_q *q;
692 	void __user *ubufp = (void __user *) addr;
693 	u32 server;
694 	u8 priority;
695 	struct kvm_ppc_xive_eq kvm_eq;
696 	u64 qaddr;
697 	u64 qshift;
698 	u64 qeoi_page;
699 	u32 escalate_irq;
700 	u64 qflags;
701 	int rc;
702 
703 	/*
704 	 * Demangle priority/server tuple from the EQ identifier
705 	 */
706 	priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
707 		KVM_XIVE_EQ_PRIORITY_SHIFT;
708 	server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
709 		KVM_XIVE_EQ_SERVER_SHIFT;
710 
711 	vcpu = kvmppc_xive_find_server(kvm, server);
712 	if (!vcpu) {
713 		pr_err("Can't find server %d\n", server);
714 		return -ENOENT;
715 	}
716 	xc = vcpu->arch.xive_vcpu;
717 
718 	if (priority != xive_prio_from_guest(priority)) {
719 		pr_err("invalid priority for queue %d for VCPU %d\n",
720 		       priority, server);
721 		return -EINVAL;
722 	}
723 	q = &xc->queues[priority];
724 
725 	memset(&kvm_eq, 0, sizeof(kvm_eq));
726 
727 	if (!q->qpage)
728 		return 0;
729 
730 	rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
731 					&qeoi_page, &escalate_irq, &qflags);
732 	if (rc)
733 		return rc;
734 
735 	kvm_eq.flags = 0;
736 	if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
737 		kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;
738 
739 	kvm_eq.qshift = q->guest_qshift;
740 	kvm_eq.qaddr  = q->guest_qaddr;
741 
742 	rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
743 					 &kvm_eq.qindex);
744 	if (rc)
745 		return rc;
746 
747 	pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
748 		 __func__, server, priority, kvm_eq.flags,
749 		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
750 
751 	if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
752 		return -EFAULT;
753 
754 	return 0;
755 }
756 
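/*
 * Mask all valid sources of a source block and reset their routing
 * to the default (masked) configuration in the XIVE IC.
 */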
757 static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
758 {
759 	int i;
760 
761 	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
762 		struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
763 
764 		if (!state->valid)
765 			continue;
766 
767 		if (state->act_priority == MASKED)
768 			continue;
769 
770 		state->eisn = 0;
771 		state->act_server = 0;
772 		state->act_priority = MASKED;
773 		xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
774 		xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
775 		if (state->pt_number) {
776 			xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
777 			xive_native_configure_irq(state->pt_number,
778 						  0, MASKED, 0);
779 		}
780 	}
781 }
782 
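/*
 * KVM_DEV_XIVE_RESET control: mask all sources, free the escalation
 * interrupts and reset all the queues of every vcpu, bringing the
 * device back to its initial state.
 */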
783 static int kvmppc_xive_reset(struct kvmppc_xive *xive)
784 {
785 	struct kvm *kvm = xive->kvm;
786 	struct kvm_vcpu *vcpu;
787 	unsigned int i;
788 
789 	pr_devel("%s\n", __func__);
790 
791 	mutex_lock(&xive->lock);
792 
793 	kvm_for_each_vcpu(i, vcpu, kvm) {
794 		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
795 		unsigned int prio;
796 
797 		if (!xc)
798 			continue;
799 
800 		kvmppc_xive_disable_vcpu_interrupts(vcpu);
801 
802 		for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
803 
804 			/* Single escalation, no queue 7 */
805 			if (prio == 7 && xive->single_escalation)
806 				break;
807 
808 			if (xc->esc_virq[prio]) {
809 				free_irq(xc->esc_virq[prio], vcpu);
810 				irq_dispose_mapping(xc->esc_virq[prio]);
811 				kfree(xc->esc_virq_names[prio]);
812 				xc->esc_virq[prio] = 0;
813 			}
814 
815 			kvmppc_xive_native_cleanup_queue(vcpu, prio);
816 		}
817 	}
818 
819 	for (i = 0; i <= xive->max_sbid; i++) {
820 		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
821 
822 		if (sb) {
823 			arch_spin_lock(&sb->lock);
824 			kvmppc_xive_reset_sources(sb);
825 			arch_spin_unlock(&sb->lock);
826 		}
827 	}
828 
829 	mutex_unlock(&xive->lock);
830 
831 	return 0;
832 }
833 
834 static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
835 {
836 	int j;
837 
838 	for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
839 		struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
840 		struct xive_irq_data *xd;
841 		u32 hw_num;
842 
843 		if (!state->valid)
844 			continue;
845 
846 		/*
847 		 * The struct kvmppc_xive_irq_state reflects the state
848 		 * of the EAS configuration and not the state of the
849 		 * source. The source is masked by setting the PQ bits to
850 		 * '-Q', which is what is done before calling
851 		 * the KVM_DEV_XIVE_EQ_SYNC control.
852 		 *
853 		 * If a source EAS is configured, OPAL syncs the XIVE
854 		 * IC of the source and the XIVE IC of the previous
855 		 * target if any.
856 		 *
857 		 * So it should be fine ignoring MASKED sources as
858 		 * they have been synced already.
859 		 */
860 		if (state->act_priority == MASKED)
861 			continue;
862 
863 		kvmppc_xive_select_irq(state, &hw_num, &xd);
864 		xive_native_sync_source(hw_num);
865 		xive_native_sync_queue(hw_num);
866 	}
867 }
868 
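/*
 * Mark the guest pages backing the vcpu EQs dirty so that they are
 * transferred by a migration in progress.
 */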
869 static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
870 {
871 	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
872 	unsigned int prio;
873 	int srcu_idx;
874 
875 	if (!xc)
876 		return -ENOENT;
877 
878 	for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
879 		struct xive_q *q = &xc->queues[prio];
880 
881 		if (!q->qpage)
882 			continue;
883 
884 		/* Mark EQ page dirty for migration */
885 		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
886 		mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
887 		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
888 	}
889 	return 0;
890 }
891 
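/*
 * KVM_DEV_XIVE_EQ_SYNC control: sync all sources and queues with the
 * XIVE IC and mark the EQ pages of every vcpu dirty for migration.
 */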
892 static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
893 {
894 	struct kvm *kvm = xive->kvm;
895 	struct kvm_vcpu *vcpu;
896 	unsigned int i;
897 
898 	pr_devel("%s\n", __func__);
899 
900 	mutex_lock(&xive->lock);
901 	for (i = 0; i <= xive->max_sbid; i++) {
902 		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
903 
904 		if (sb) {
905 			arch_spin_lock(&sb->lock);
906 			kvmppc_xive_native_sync_sources(sb);
907 			arch_spin_unlock(&sb->lock);
908 		}
909 	}
910 
911 	kvm_for_each_vcpu(i, vcpu, kvm) {
912 		kvmppc_xive_native_vcpu_eq_sync(vcpu);
913 	}
914 	mutex_unlock(&xive->lock);
915 
916 	return 0;
917 }
918 
919 static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
920 				       struct kvm_device_attr *attr)
921 {
922 	struct kvmppc_xive *xive = dev->private;
923 
924 	switch (attr->group) {
925 	case KVM_DEV_XIVE_GRP_CTRL:
926 		switch (attr->attr) {
927 		case KVM_DEV_XIVE_RESET:
928 			return kvmppc_xive_reset(xive);
929 		case KVM_DEV_XIVE_EQ_SYNC:
930 			return kvmppc_xive_native_eq_sync(xive);
931 		}
932 		break;
933 	case KVM_DEV_XIVE_GRP_SOURCE:
934 		return kvmppc_xive_native_set_source(xive, attr->attr,
935 						     attr->addr);
936 	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
937 		return kvmppc_xive_native_set_source_config(xive, attr->attr,
938 							    attr->addr);
939 	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
940 		return kvmppc_xive_native_set_queue_config(xive, attr->attr,
941 							   attr->addr);
942 	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
943 		return kvmppc_xive_native_sync_source(xive, attr->attr,
944 						      attr->addr);
945 	}
946 	return -ENXIO;
947 }
948 
949 static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
950 				       struct kvm_device_attr *attr)
951 {
952 	struct kvmppc_xive *xive = dev->private;
953 
954 	switch (attr->group) {
955 	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
956 		return kvmppc_xive_native_get_queue_config(xive, attr->attr,
957 							   attr->addr);
958 	}
959 	return -ENXIO;
960 }
961 
962 static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
963 				       struct kvm_device_attr *attr)
964 {
965 	switch (attr->group) {
966 	case KVM_DEV_XIVE_GRP_CTRL:
967 		switch (attr->attr) {
968 		case KVM_DEV_XIVE_RESET:
969 		case KVM_DEV_XIVE_EQ_SYNC:
970 			return 0;
971 		}
972 		break;
973 	case KVM_DEV_XIVE_GRP_SOURCE:
974 	case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
975 	case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
976 		if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
977 		    attr->attr < KVMPPC_XIVE_NR_IRQS)
978 			return 0;
979 		break;
980 	case KVM_DEV_XIVE_GRP_EQ_CONFIG:
981 		return 0;
982 	}
983 	return -ENXIO;
984 }
985 
986 /*
987  * Called when device fd is closed.  kvm->lock is held.
988  */
989 static void kvmppc_xive_native_release(struct kvm_device *dev)
990 {
991 	struct kvmppc_xive *xive = dev->private;
992 	struct kvm *kvm = xive->kvm;
993 	struct kvm_vcpu *vcpu;
994 	int i;
995 
996 	pr_devel("Releasing xive native device\n");
997 
998 	/*
999 	 * Clear the KVM device file address_space which is used to
1000 	 * unmap the ESB pages when a device is passed through.
1001 	 */
1002 	mutex_lock(&xive->mapping_lock);
1003 	xive->mapping = NULL;
1004 	mutex_unlock(&xive->mapping_lock);
1005 
1006 	/*
1007 	 * Since this is the device release function, we know that
1008 	 * userspace does not have any open fd or mmap referring to
1009 	 * the device.  Therefore none of the device attribute
1010 	 * set/get, mmap, or page fault functions can be
1011 	 * executing concurrently, and similarly, the
1012 	 * connect_vcpu and set/clr_mapped functions cannot
1013 	 * be running either.
1014 	 */
1015 
1016 	debugfs_remove(xive->dentry);
1017 
1018 	/*
1019 	 * We should clean up the vCPU interrupt presenters first.
1020 	 */
1021 	kvm_for_each_vcpu(i, vcpu, kvm) {
1022 		/*
1023 		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
1024 		 * (i.e. kvmppc_xive_native_[gs]et_vp) can be in progress.
1025 		 * Holding the vcpu->mutex also means that the vcpu cannot
1026 		 * be executing the KVM_RUN ioctl, and therefore it cannot
1027 		 * be executing the XIVE push or pull code or accessing
1028 		 * the XIVE MMIO regions.
1029 		 */
1030 		mutex_lock(&vcpu->mutex);
1031 		kvmppc_xive_native_cleanup_vcpu(vcpu);
1032 		mutex_unlock(&vcpu->mutex);
1033 	}
1034 
1035 	/*
1036 	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
1037 	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
1038 	 * against xive code getting called during vcpu execution or
1039 	 * set/get one_reg operations.
1040 	 */
1041 	kvm->arch.xive = NULL;
1042 
1043 	for (i = 0; i <= xive->max_sbid; i++) {
1044 		if (xive->src_blocks[i])
1045 			kvmppc_xive_free_sources(xive->src_blocks[i]);
1046 		kfree(xive->src_blocks[i]);
1047 		xive->src_blocks[i] = NULL;
1048 	}
1049 
1050 	if (xive->vp_base != XIVE_INVALID_VP)
1051 		xive_native_free_vp_block(xive->vp_base);
1052 
1053 	/*
1054 	 * A reference to the kvmppc_xive pointer is now kept under
1055 	 * the xive_devices struct of the machine for reuse. For now,
1056 	 * it is freed when the VM is destroyed, until all the
1057 	 * execution paths are fixed.
1058 	 */
1059 
1060 	kfree(dev);
1061 }
1062 
1063 /*
1064  * Create a XIVE device.  kvm->lock is held.
1065  */
1066 static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
1067 {
1068 	struct kvmppc_xive *xive;
1069 	struct kvm *kvm = dev->kvm;
1070 	int ret = 0;
1071 
1072 	pr_devel("Creating xive native device\n");
1073 
1074 	if (kvm->arch.xive)
1075 		return -EEXIST;
1076 
1077 	xive = kvmppc_xive_get_device(kvm, type);
1078 	if (!xive)
1079 		return -ENOMEM;
1080 
1081 	dev->private = xive;
1082 	xive->dev = dev;
1083 	xive->kvm = kvm;
1084 	kvm->arch.xive = xive;
1085 	mutex_init(&xive->mapping_lock);
1086 	mutex_init(&xive->lock);
1087 
1088 	/*
1089 	 * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for
1090 	 * a default. Getting the max number of CPUs the VM was
1091 	 * configured with would improve our usage of the XIVE VP space.
1092 	 */
1093 	xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
1094 	pr_devel("VP_Base=%x\n", xive->vp_base);
1095 
1096 	if (xive->vp_base == XIVE_INVALID_VP)
1097 		ret = -ENXIO;
1098 
1099 	xive->single_escalation = xive_native_has_single_escalation();
1100 	xive->ops = &kvmppc_xive_native_ops;
1101 
1102 	if (ret)
1103 		return ret;
1104 
1105 	return 0;
1106 }
1107 
1108 /*
1109  * Interrupt Pending Buffer (IPB) offset
1110  */
1111 #define TM_IPB_SHIFT 40
1112 #define TM_IPB_MASK  (((u64) 0xFF) << TM_IPB_SHIFT)
1113 
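/*
 * Capture the thread interrupt context of a vcpu (the w01 word plus
 * the IPB backed up in the XIVE NVT) for the VP state one_reg
 * interface.
 */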
1114 int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1115 {
1116 	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1117 	u64 opal_state;
1118 	int rc;
1119 
1120 	if (!kvmppc_xive_enabled(vcpu))
1121 		return -EPERM;
1122 
1123 	if (!xc)
1124 		return -ENOENT;
1125 
1126 	/* Thread context registers. We only care about IPB and CPPR */
1127 	val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
1128 
1129 	/* Get the VP state from OPAL */
1130 	rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
1131 	if (rc)
1132 		return rc;
1133 
1134 	/*
1135 	 * Capture the backup of the IPB register from the NVT structure
1136 	 * and merge it into our KVM VP state.
1137 	 */
1138 	val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
1139 
1140 	pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
1141 		 __func__,
1142 		 vcpu->arch.xive_saved_state.nsr,
1143 		 vcpu->arch.xive_saved_state.cppr,
1144 		 vcpu->arch.xive_saved_state.ipb,
1145 		 vcpu->arch.xive_saved_state.pipr,
1146 		 vcpu->arch.xive_saved_state.w01,
1147 		 (u32) vcpu->arch.xive_cam_word, opal_state);
1148 
1149 	return 0;
1150 }
1151 
1152 int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
1153 {
1154 	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1155 	struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
1156 
1157 	pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
1158 		 val->xive_timaval[0], val->xive_timaval[1]);
1159 
1160 	if (!kvmppc_xive_enabled(vcpu))
1161 		return -EPERM;
1162 
1163 	if (!xc || !xive)
1164 		return -ENOENT;
1165 
1166 	/* We can't update the state of a "pushed" VCPU */
1167 	if (WARN_ON(vcpu->arch.xive_pushed))
1168 		return -EBUSY;
1169 
1170 	/*
1171 	 * Restore the thread context registers. IPB and CPPR should
1172 	 * be the only ones that matter.
1173 	 */
1174 	vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
1175 
1176 	/*
1177 	 * There is no need to restore the XIVE internal state (IPB
1178 	 * stored in the NVT) as the IPB register was merged in KVM VP
1179 	 * state when captured.
1180 	 */
1181 	return 0;
1182 }
1183 
1184 bool kvmppc_xive_native_supported(void)
1185 {
1186 	return xive_native_has_queue_state_support();
1187 }
1188 
1189 static int xive_native_debug_show(struct seq_file *m, void *private)
1190 {
1191 	struct kvmppc_xive *xive = m->private;
1192 	struct kvm *kvm = xive->kvm;
1193 	struct kvm_vcpu *vcpu;
1194 	unsigned int i;
1195 
1196 	if (!kvm)
1197 		return 0;
1198 
1199 	seq_puts(m, "=========\nVCPU state\n=========\n");
1200 
1201 	kvm_for_each_vcpu(i, vcpu, kvm) {
1202 		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
1203 
1204 		if (!xc)
1205 			continue;
1206 
1207 		seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n",
1208 			   xc->server_num,
1209 			   vcpu->arch.xive_saved_state.nsr,
1210 			   vcpu->arch.xive_saved_state.cppr,
1211 			   vcpu->arch.xive_saved_state.ipb,
1212 			   vcpu->arch.xive_saved_state.pipr,
1213 			   vcpu->arch.xive_saved_state.w01,
1214 			   (u32) vcpu->arch.xive_cam_word);
1215 
1216 		kvmppc_xive_debug_show_queues(m, vcpu);
1217 	}
1218 
1219 	return 0;
1220 }
1221 
1222 static int xive_native_debug_open(struct inode *inode, struct file *file)
1223 {
1224 	return single_open(file, xive_native_debug_show, inode->i_private);
1225 }
1226 
1227 static const struct file_operations xive_native_debug_fops = {
1228 	.open = xive_native_debug_open,
1229 	.read = seq_read,
1230 	.llseek = seq_lseek,
1231 	.release = single_release,
1232 };
1233 
1234 static void xive_native_debugfs_init(struct kvmppc_xive *xive)
1235 {
1236 	char *name;
1237 
1238 	name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
1239 	if (!name) {
1240 		pr_err("%s: no memory for name\n", __func__);
1241 		return;
1242 	}
1243 
1244 	xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
1245 					   xive, &xive_native_debug_fops);
1246 
1247 	pr_debug("%s: created %s\n", __func__, name);
1248 	kfree(name);
1249 }
1250 
1251 static void kvmppc_xive_native_init(struct kvm_device *dev)
1252 {
1253 	struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;
1254 
1255 	/* Register some debug interfaces */
1256 	xive_native_debugfs_init(xive);
1257 }
1258 
1259 struct kvm_device_ops kvm_xive_native_ops = {
1260 	.name = "kvm-xive-native",
1261 	.create = kvmppc_xive_native_create,
1262 	.init = kvmppc_xive_native_init,
1263 	.release = kvmppc_xive_native_release,
1264 	.set_attr = kvmppc_xive_native_set_attr,
1265 	.get_attr = kvmppc_xive_native_get_attr,
1266 	.has_attr = kvmppc_xive_native_has_attr,
1267 	.mmap = kvmppc_xive_native_mmap,
1268 };
1269 
1270 void kvmppc_xive_native_init_module(void)
1271 {
1272 	;
1273 }
1274 
1275 void kvmppc_xive_native_exit_module(void)
1276 {
1277 	;
1278 }
1279