/*
 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
 *
 * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Kevin Tian <kevin.tian@intel.com>
 *    Jike Song <jike.song@intel.com>
 *    Xiaoguang Chen <xiaoguang.chen@intel.com>
 */

#include <linux/init.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/uuid.h>
#include <linux/hashtable.h>
#include <linux/kvm_host.h>
#include <linux/vfio.h>

#include "i915_drv.h"
#include "gvt.h"

/*
 * Temporary stubs: real page (un)pinning through VFIO is not wired up yet,
 * so these always report zero pages handled.
 */
static inline long kvmgt_pin_pages(struct device *dev, unsigned long *user_pfn,
			long npage, int prot, unsigned long *phys_pfn)
{
	return 0;
}

static inline long kvmgt_unpin_pages(struct device *dev, unsigned long *pfn,
			long npage)
{
	return 0;
}
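
/*
 * Illustrative sketch, not part of the original code: once the VFIO
 * mediated-device infrastructure is in place, the stubs above would
 * presumably just forward to the VFIO page-pinning helpers. The
 * vfio_pin_pages()/vfio_unpin_pages() prototypes assumed here come from
 * the VFIO mdev series and may differ from what finally lands, so the
 * sketch is kept compiled out.
 */
#if 0
static inline long kvmgt_pin_pages(struct device *dev, unsigned long *user_pfn,
			long npage, int prot, unsigned long *phys_pfn)
{
	/* returns the number of pages pinned, or a negative errno */
	return vfio_pin_pages(dev, user_pfn, npage, prot, phys_pfn);
}

static inline long kvmgt_unpin_pages(struct device *dev, unsigned long *pfn,
			long npage)
{
	/* returns the number of pages unpinned, or a negative errno */
	return vfio_unpin_pages(dev, pfn, npage);
}
#endif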

static const struct intel_gvt_ops *intel_gvt_ops;

/* helper macros copied from vfio-pci */
#define VFIO_PCI_OFFSET_SHIFT   40
#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
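
/*
 * Worked example (illustrative only, using the vfio-pci uAPI region
 * numbering): with VFIO_PCI_OFFSET_SHIFT == 40, an access at file offset
 * 0x20000000040 decodes as
 *   VFIO_PCI_OFFSET_TO_INDEX(0x20000000040) == 2   (VFIO_PCI_BAR2_REGION_INDEX)
 *   0x20000000040 & VFIO_PCI_OFFSET_MASK    == 0x40 (offset within that region)
 * and VFIO_PCI_INDEX_TO_OFFSET(2) == 0x20000000000 maps back again.
 */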

struct vfio_region {
	u32 type;
	u32 subtype;
	size_t size;
	u32 flags;
};

struct kvmgt_pgfn {
	gfn_t gfn;
	struct hlist_node hnode;
};

struct kvmgt_guest_info {
	struct kvm *kvm;
	struct intel_vgpu *vgpu;
	struct kvm_page_track_notifier_node track_node;
#define NR_BKT (1 << 18)
	struct hlist_head ptable[NR_BKT];
#undef NR_BKT
};

struct gvt_dma {
	struct rb_node node;
	gfn_t gfn;
	kvm_pfn_t pfn;
};

static struct gvt_dma *__gvt_cache_find(struct intel_vgpu *vgpu, gfn_t gfn)
{
	struct rb_node *node = vgpu->vdev.cache.rb_node;
	struct gvt_dma *ret = NULL;

	while (node) {
		struct gvt_dma *itr = rb_entry(node, struct gvt_dma, node);

		if (gfn < itr->gfn)
			node = node->rb_left;
		else if (gfn > itr->gfn)
			node = node->rb_right;
		else {
			ret = itr;
			goto out;
		}
	}

out:
	return ret;
}

static kvm_pfn_t gvt_cache_find(struct intel_vgpu *vgpu, gfn_t gfn)
{
	struct gvt_dma *entry;

	mutex_lock(&vgpu->vdev.cache_lock);
	entry = __gvt_cache_find(vgpu, gfn);
	mutex_unlock(&vgpu->vdev.cache_lock);

	return entry == NULL ? 0 : entry->pfn;
}

static void gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn, kvm_pfn_t pfn)
{
	struct gvt_dma *new, *itr;
	struct rb_node **link = &vgpu->vdev.cache.rb_node, *parent = NULL;

	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
	if (!new)
		return;

	new->gfn = gfn;
	new->pfn = pfn;

	mutex_lock(&vgpu->vdev.cache_lock);
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, node);

		if (gfn == itr->gfn)
			goto out;
		else if (gfn < itr->gfn)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &vgpu->vdev.cache);
	mutex_unlock(&vgpu->vdev.cache_lock);
	return;

out:
	mutex_unlock(&vgpu->vdev.cache_lock);
	kfree(new);
}

static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
				struct gvt_dma *entry)
{
	rb_erase(&entry->node, &vgpu->vdev.cache);
	kfree(entry);
}

static void gvt_cache_remove(struct intel_vgpu *vgpu, gfn_t gfn)
{
	struct device *dev = vgpu->vdev.mdev;
	struct gvt_dma *this;
	unsigned long pfn;

	mutex_lock(&vgpu->vdev.cache_lock);
	this = __gvt_cache_find(vgpu, gfn);
	if (!this) {
		mutex_unlock(&vgpu->vdev.cache_lock);
		return;
	}

	pfn = this->pfn;
	WARN_ON((kvmgt_unpin_pages(dev, &pfn, 1) != 1));
	__gvt_cache_remove_entry(vgpu, this);
	mutex_unlock(&vgpu->vdev.cache_lock);
}

static void gvt_cache_init(struct intel_vgpu *vgpu)
{
	vgpu->vdev.cache = RB_ROOT;
	mutex_init(&vgpu->vdev.cache_lock);
}

static void gvt_cache_destroy(struct intel_vgpu *vgpu)
{
	struct gvt_dma *dma;
	struct rb_node *node = NULL;
	struct device *dev = vgpu->vdev.mdev;
	unsigned long pfn;

	mutex_lock(&vgpu->vdev.cache_lock);
	while ((node = rb_first(&vgpu->vdev.cache))) {
		dma = rb_entry(node, struct gvt_dma, node);
		pfn = dma->pfn;

		kvmgt_unpin_pages(dev, &pfn, 1);
		__gvt_cache_remove_entry(vgpu, dma);
	}
	mutex_unlock(&vgpu->vdev.cache_lock);
}

static struct intel_vgpu_type *intel_gvt_find_vgpu_type(struct intel_gvt *gvt,
		const char *name)
{
	int i;
	struct intel_vgpu_type *t;
	const char *driver_name = dev_driver_string(
			&gvt->dev_priv->drm.pdev->dev);

	for (i = 0; i < gvt->num_types; i++) {
		t = &gvt->types[i];
		if (!strncmp(t->name, name + strlen(driver_name) + 1,
			sizeof(t->name)))
			return t;
	}

	return NULL;
}

static struct attribute *type_attrs[] = {
	NULL,
};

static struct attribute_group *intel_vgpu_type_groups[] = {
	[0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL,
};

static bool intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt)
{
	int i, j;
	struct intel_vgpu_type *type;
	struct attribute_group *group;

	for (i = 0; i < gvt->num_types; i++) {
		type = &gvt->types[i];

		group = kzalloc(sizeof(struct attribute_group), GFP_KERNEL);
		if (WARN_ON(!group))
			goto unwind;

		group->name = type->name;
		group->attrs = type_attrs;
		intel_vgpu_type_groups[i] = group;
	}

	return true;

unwind:
	for (j = 0; j < i; j++) {
		group = intel_vgpu_type_groups[j];
		kfree(group);
	}

	return false;
}

static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt)
{
	int i;
	struct attribute_group *group;

	for (i = 0; i < gvt->num_types; i++) {
		group = intel_vgpu_type_groups[i];
		kfree(group);
	}
}

static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
{
	hash_init(info->ptable);
}

static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
{
	struct kvmgt_pgfn *p;
	struct hlist_node *tmp;
	int i;

	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static struct kvmgt_pgfn *
__kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p, *res = NULL;

	hash_for_each_possible(info->ptable, p, hnode, gfn) {
		if (gfn == p->gfn) {
			res = p;
			break;
		}
	}

	return res;
}

static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
				gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	return !!p;
}

static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	if (kvmgt_gfn_is_write_protected(info, gfn))
		return;

	p = kmalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
	if (WARN(!p, "gfn: 0x%llx\n", gfn))
		return;

	p->gfn = gfn;
	hash_add(info->ptable, &p->hnode, gfn);
}

static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
				gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	if (p) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
{
	if (!intel_gvt_init_vgpu_type_groups(gvt))
		return -EFAULT;

	intel_gvt_ops = ops;

	/* MDEV is not yet available */
	return -ENODEV;
}

static void kvmgt_host_exit(struct device *dev, void *gvt)
{
	intel_gvt_cleanup_vgpu_type_groups(gvt);
}

static int kvmgt_write_protect_add(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle;
	struct kvm *kvm = info->kvm;
	struct kvm_memory_slot *slot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);

	spin_lock(&kvm->mmu_lock);

	if (kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_add(info, gfn);

out:
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}

static int kvmgt_write_protect_remove(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle;
	struct kvm *kvm = info->kvm;
	struct kvm_memory_slot *slot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);

	spin_lock(&kvm->mmu_lock);

	if (!kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_del(info, gfn);

out:
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}

static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		const u8 *val, int len,
		struct kvm_page_track_notifier_node *node)
{
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
		intel_gvt_ops->emulate_mmio_write(info->vgpu, gpa,
					(void *)val, len);
}

static void kvmgt_page_track_flush_slot(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		struct kvm_page_track_notifier_node *node)
{
	int i;
	gfn_t gfn;
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	spin_lock(&kvm->mmu_lock);
	for (i = 0; i < slot->npages; i++) {
		gfn = slot->base_gfn + i;
		if (kvmgt_gfn_is_write_protected(info, gfn)) {
			kvm_slot_page_track_remove_page(kvm, slot, gfn,
						KVM_PAGE_TRACK_WRITE);
			kvmgt_protect_table_del(info, gfn);
		}
	}
	spin_unlock(&kvm->mmu_lock);
}

static bool kvmgt_check_guest(void)
{
	unsigned int eax, ebx, ecx, edx;
	char s[12];
	unsigned int *i;

	eax = KVM_CPUID_SIGNATURE;
	ebx = ecx = edx = 0;

	asm volatile ("cpuid"
		: "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
		:
		: "cc", "memory");
	i = (unsigned int *)s;
	i[0] = ebx;
	i[1] = ecx;
	i[2] = edx;

	return !strncmp(s, "KVMKVMKVM", strlen("KVMKVMKVM"));
}

/*
 * NOTE:
 * It's actually impossible to check if we are running in a KVM host,
 * since the "KVM host" is simply native. So we only detect the guest here.
 */
static int kvmgt_detect_host(void)
{
#ifdef CONFIG_INTEL_IOMMU
	if (intel_iommu_gfx_mapped) {
		gvt_err("Hardware IOMMU compatibility not yet supported, try to boot with intel_iommu=igfx_off\n");
		return -ENODEV;
	}
#endif
	return kvmgt_check_guest() ? -ENODEV : 0;
}

static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
{
	/* nothing to do here */
	return 0;
}

static void kvmgt_detach_vgpu(unsigned long handle)
{
	/* nothing to do here */
}

static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
{
	struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle;
	struct intel_vgpu *vgpu = info->vgpu;

	if (vgpu->vdev.msi_trigger)
		return eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1;

	return false;
}

static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
{
	unsigned long pfn;
	struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle;
	int rc;

	pfn = gvt_cache_find(info->vgpu, gfn);
	if (pfn != 0)
		return pfn;

	rc = kvmgt_pin_pages(info->vgpu->vdev.mdev, &gfn, 1,
				IOMMU_READ | IOMMU_WRITE, &pfn);
	if (rc != 1) {
		gvt_err("vfio_pin_pages failed for gfn: 0x%lx\n", gfn);
		return 0;
	}

	gvt_cache_add(info->vgpu, gfn, pfn);
	return pfn;
}

static void *kvmgt_gpa_to_hva(unsigned long handle, unsigned long gpa)
{
	unsigned long pfn;
	gfn_t gfn = gpa_to_gfn(gpa);

	pfn = kvmgt_gfn_to_pfn(handle, gfn);
	if (!pfn)
		return NULL;

	return (char *)pfn_to_kaddr(pfn) + offset_in_page(gpa);
}

static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len, bool write)
{
	void *hva = NULL;

	hva = kvmgt_gpa_to_hva(handle, gpa);
	if (!hva)
		return -EFAULT;

	if (write)
		memcpy(hva, buf, len);
	else
		memcpy(buf, hva, len);

	return 0;
}

static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, false);
}

static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, true);
}

static unsigned long kvmgt_virt_to_pfn(void *addr)
{
	return PFN_DOWN(__pa(addr));
}

struct intel_gvt_mpt kvmgt_mpt = {
	.detect_host = kvmgt_detect_host,
	.host_init = kvmgt_host_init,
	.host_exit = kvmgt_host_exit,
	.attach_vgpu = kvmgt_attach_vgpu,
	.detach_vgpu = kvmgt_detach_vgpu,
	.inject_msi = kvmgt_inject_msi,
	.from_virt_to_mfn = kvmgt_virt_to_pfn,
	.set_wp_page = kvmgt_write_protect_add,
	.unset_wp_page = kvmgt_write_protect_remove,
	.read_gpa = kvmgt_read_gpa,
	.write_gpa = kvmgt_write_gpa,
	.gfn_to_mfn = kvmgt_gfn_to_pfn,
};
EXPORT_SYMBOL_GPL(kvmgt_mpt);

static int __init kvmgt_init(void)
{
	return 0;
}

static void __exit kvmgt_exit(void)
{
}

module_init(kvmgt_init);
module_exit(kvmgt_exit);

MODULE_LICENSE("GPL and additional rights");
MODULE_AUTHOR("Intel Corporation");