/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Portions Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2009, Intel Corporation.
 * All rights reserved.
 */

/*
 * DVMA code
 * This file contains Intel IOMMU code that deals with DVMA
 * i.e. DMA remapping.
 */

#include <sys/sysmacros.h>
#include <sys/pcie.h>
#include <sys/pci_cfgspace.h>
#include <vm/hat_i86.h>
#include <sys/memlist.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/modhash.h>
#include <sys/immu.h>

#undef	TEST

/*
 * Macros based on PCI spec
 */
#define	IMMU_PCI_REV2CLASS(r)   ((r) >> 8)  /* classcode from revid */
#define	IMMU_PCI_CLASS2BASE(c)  ((c) >> 16) /* baseclass from classcode */
#define	IMMU_PCI_CLASS2SUB(c)   (((c) >> 8) & 0xff); /* classcode */

#define	IMMU_CONTIG_PADDR(d, p) \
	((d).dck_paddr && ((d).dck_paddr + IMMU_PAGESIZE) == (p))

typedef struct dvma_arg {
	immu_t *dva_immu;
	dev_info_t *dva_rdip;
	dev_info_t *dva_ddip;
	domain_t *dva_domain;
	int dva_level;
	immu_flags_t dva_flags;
	list_t *dva_list;
	int dva_error;
} dvma_arg_t;

static domain_t *domain_create(immu_t *immu, dev_info_t *ddip,
    dev_info_t *rdip, immu_flags_t immu_flags);
static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus,
    int dev, int func, immu_flags_t immu_flags);
static void destroy_immu_devi(immu_devi_t *immu_devi);
static boolean_t dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma,
    uint64_t nvpages, dcookie_t *dcookies, int dcount, dev_info_t *rdip,
    immu_flags_t immu_flags);

/* Extern globals */
extern struct memlist  *phys_install;


/* static Globals */

/*
 * Used to setup DMA objects (memory regions)
 * for DMA reads by IOMMU units
 */
static ddi_dma_attr_t immu_dma_attr = {
	DMA_ATTR_V0,
	0U,
	0xffffffffU,
	0xffffffffU,
	MMU_PAGESIZE, /* MMU page aligned */
	0x1,
	0x1,
	0xffffffffU,
	0xffffffffU,
	1,
	4,
	0
};

static ddi_device_acc_attr_t immu_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};


/* globals private to this file */
static kmutex_t immu_domain_lock;
static list_t immu_unity_domain_list;
static list_t immu_xlate_domain_list;

/* structure used to store idx into each level of the page tables */
typedef struct xlate {
	int xlt_level;
	uint_t xlt_idx;
	pgtable_t *xlt_pgtable;
} xlate_t;

/* 0 is reserved by Vt-d spec. Solaris reserves 1 */
#define	IMMU_UNITY_DID   1

static mod_hash_t *bdf_domain_hash;

static domain_t *
bdf_domain_lookup(immu_devi_t *immu_devi)
{
	domain_t *domain;
	int16_t seg = immu_devi->imd_seg;
	int16_t bus = immu_devi->imd_bus;
	int16_t devfunc = immu_devi->imd_devfunc;
	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);

	if (seg < 0 || bus < 0 || devfunc < 0) {
		return (NULL);
	}

	domain = NULL;
	if (mod_hash_find(bdf_domain_hash,
	    (void *)bdf, (void *)&domain) == 0) {
		ASSERT(domain);
		ASSERT(domain->dom_did > 0);
		return (domain);
	} else {
		return (NULL);
	}
}

static void
bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain)
{
	int16_t seg = immu_devi->imd_seg;
	int16_t bus = immu_devi->imd_bus;
	int16_t devfunc = immu_devi->imd_devfunc;
	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
	int r;

	if (seg < 0 || bus < 0 || devfunc < 0) {
		return;
	}

	r = mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain);
	ASSERT(r != MH_ERR_DUPLICATE);
	ASSERT(r == 0);
}

static int
match_lpc(dev_info_t *pdip, void *arg)
{
	immu_devi_t *immu_devi;
	dvma_arg_t *dvap = (dvma_arg_t *)arg;

	ASSERT(dvap->dva_error == DDI_FAILURE);
	ASSERT(dvap->dva_ddip == NULL);
	ASSERT(dvap->dva_list);

	if (list_is_empty(dvap->dva_list)) {
		return (DDI_WALK_TERMINATE);
	}

	immu_devi = list_head(dvap->dva_list);
	for (; immu_devi; immu_devi = list_next(dvap->dva_list,
	    immu_devi)) {
		ASSERT(immu_devi->imd_dip);
		if (immu_devi->imd_dip == pdip) {
			dvap->dva_ddip = pdip;
			dvap->dva_error = DDI_SUCCESS;
			return (DDI_WALK_TERMINATE);
		}
	}

	return (DDI_WALK_CONTINUE);
}

static void
immu_devi_set_spclist(dev_info_t *dip, immu_t *immu)
{
	list_t *spclist = NULL;
	immu_devi_t *immu_devi;

	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_lock)));

	immu_devi = IMMU_DEVI(dip);
	if (immu_devi->imd_display == B_TRUE) {
		spclist = &(immu->immu_dvma_gfx_list);
	} else if (immu_devi->imd_lpc == B_TRUE) {
		spclist = &(immu->immu_dvma_lpc_list);
	}

	if (spclist) {
		mutex_enter(&(immu->immu_lock));
		list_insert_head(spclist, immu_devi);
		mutex_exit(&(immu->immu_lock));
	}
}

/*
 * Set the immu_devi struct in the immu_devi field of a devinfo node
 */
int
immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags)
{
	int bus, dev, func;
	immu_devi_t *new_imd;
	immu_devi_t *immu_devi;

	ASSERT(root_devinfo);
	ASSERT(dip);
	ASSERT(dip != root_devinfo);

	immu_devi = immu_devi_get(dip);
	if (immu_devi != NULL) {
		return (DDI_SUCCESS);
	}

	bus = dev = func = -1;

	/*
	 * Assume a new immu_devi struct is needed
	 */
	if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) {
		/*
		 * No BDF. Set bus = -1 to indicate this.
		 * We still need to create a immu_devi struct
		 * though
		 */
		bus = -1;
		dev = 0;
		func = 0;
	}

	new_imd = create_immu_devi(dip, bus, dev, func, immu_flags);
	if (new_imd  == NULL) {
		ddi_err(DER_WARN, dip, "Failed to create immu_devi "
		    "structure");
		return (DDI_FAILURE);
	}

	/*
	 * Check if some other thread allocated a immu_devi while we
	 * didn't own the lock.
	 */
	mutex_enter(&(DEVI(dip)->devi_lock));
	if (IMMU_DEVI(dip) == NULL) {
		IMMU_DEVI_SET(dip, new_imd);
	} else {
		destroy_immu_devi(new_imd);
	}
	mutex_exit(&(DEVI(dip)->devi_lock));

	return (DDI_SUCCESS);
}

static dev_info_t *
get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags)
{
	dvma_arg_t dvarg = {0};
	dvarg.dva_list = &(immu->immu_dvma_lpc_list);
	dvarg.dva_rdip = rdip;
	dvarg.dva_error = DDI_FAILURE;

	if (immu_walk_ancestor(rdip, NULL, match_lpc,
	    &dvarg, NULL, immu_flags) != DDI_SUCCESS) {
		ddi_err(DER_MODE, rdip, "Could not walk ancestors to "
		    "find lpc_devinfo for ISA device");
		return (NULL);
	}

	if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) {
		ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for "
		    "ISA device");
		return (NULL);
	}

	return (dvarg.dva_ddip);
}

static dev_info_t *
get_gfx_devinfo(dev_info_t *rdip)
{
	immu_t *immu;
	immu_devi_t *immu_devi;
	list_t *list_gfx;

	/*
	 * The GFX device may not be on the same IMMU unit as "agpgart"
	 * so search globally
	 */
	immu_devi = NULL;
	immu = list_head(&immu_list);
	for (; immu; immu = list_next(&immu_list, immu)) {
		list_gfx = &(immu->immu_dvma_gfx_list);
		if (!list_is_empty(list_gfx)) {
			immu_devi = list_head(list_gfx);
			break;
		}
	}

	if (immu_devi == NULL) {
		ddi_err(DER_WARN, rdip, "IMMU: No GFX device. "
		    "Cannot redirect agpgart");
		return (NULL);
	}

	/* list is not empty we checked above */
	ASSERT(immu_devi);
	ASSERT(immu_devi->imd_dip);

	ddi_err(DER_LOG, rdip, "IMMU: GFX redirect to %s",
	    ddi_node_name(immu_devi->imd_dip));

	return (immu_devi->imd_dip);
}

static immu_flags_t
dma_to_immu_flags(struct ddi_dma_req *dmareq)
{
	immu_flags_t flags = 0;

	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
		flags |= IMMU_FLAGS_SLEEP;
	} else {
		flags |= IMMU_FLAGS_NOSLEEP;
	}

#ifdef BUGGY_DRIVERS

	flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);

#else
	/*
	 * Read and write flags need to be reversed.
	 * DMA_READ means read from device and write
	 * to memory. So DMA read means DVMA write.
	 */
	if (dmareq->dmar_flags & DDI_DMA_READ)
		flags |= IMMU_FLAGS_WRITE;

	if (dmareq->dmar_flags & DDI_DMA_WRITE)
		flags |= IMMU_FLAGS_READ;

	/*
	 * Some buggy drivers specify neither READ or WRITE
	 * For such drivers set both read and write permissions
	 */
	if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) {
		flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
	}
#endif

	return (flags);
}

int
pgtable_ctor(void *buf, void *arg, int kmflag)
{
	size_t actual_size = 0;
	pgtable_t *pgtable;
	int (*dmafp)(caddr_t);
	caddr_t vaddr;
	void *next;

	ASSERT(buf);
	ASSERT(arg == NULL);

	pgtable = (pgtable_t *)buf;

	dmafp = (kmflag & KM_NOSLEEP) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;

	next = kmem_zalloc(IMMU_PAGESIZE, kmflag);
	if (next == NULL) {
		return (-1);
	}

	ASSERT(root_devinfo);
	if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr,
	    dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) {
		kmem_free(next, IMMU_PAGESIZE);
		return (-1);
	}

	if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE,
	    &immu_acc_attr, DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
	    dmafp, NULL, &vaddr, &actual_size,
	    &pgtable->hwpg_memhdl) != DDI_SUCCESS) {
		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
		kmem_free(next, IMMU_PAGESIZE);
		return (-1);
	}

	/*
	 * Memory allocation failure. Maybe a temporary condition
	 * so return error rather than panic, so we can try again
	 */
	if (actual_size < IMMU_PAGESIZE) {
		ddi_dma_mem_free(&pgtable->hwpg_memhdl);
		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
		kmem_free(next, IMMU_PAGESIZE);
		return (-1);
	}

	pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
	pgtable->hwpg_vaddr = vaddr;
	pgtable->swpg_next_array = next;

	rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL);

	return (0);
}

void
pgtable_dtor(void *buf, void *arg)
{
	pgtable_t *pgtable;

	ASSERT(buf);
	ASSERT(arg == NULL);

	pgtable = (pgtable_t *)buf;
	ASSERT(pgtable->swpg_next_array);

	/* destroy will panic if lock is held. */
	rw_destroy(&(pgtable->swpg_rwlock));

	ddi_dma_mem_free(&pgtable->hwpg_memhdl);
	ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
	kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE);

	/* don't zero out hwpg_vaddr and swpg_next_array for debugging */
}

/*
 * pgtable_alloc()
 *	alloc a IOMMU pgtable structure.
 *	This same struct is used for root and context tables as well.
 *	This routine allocs the f/ollowing:
 *	- a pgtable_t struct
 *	- a HW page which holds PTEs/entries which is accesssed by HW
 *        so we set up DMA for this page
 *	- a SW page which is only for our bookeeping
 *        (for example to  hold pointers to the next level pgtable).
 *        So a simple kmem_alloc suffices
 */
static pgtable_t *
pgtable_alloc(immu_t *immu, immu_flags_t immu_flags)
{
	pgtable_t *pgtable;
	int kmflags;

	ASSERT(immu);

	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

	pgtable = kmem_cache_alloc(immu_pgtable_cache, kmflags);
	if (pgtable == NULL) {
		return (NULL);
	}
	return (pgtable);
}

static void
pgtable_zero(immu_t *immu, pgtable_t *pgtable)
{
	bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE);
	bzero(pgtable->swpg_next_array, IMMU_PAGESIZE);

	/* Dont need to flush the write we will flush when we use the entry */
	immu_regs_cpu_flush(immu, pgtable->hwpg_vaddr, IMMU_PAGESIZE);
}

static void
pgtable_free(immu_t *immu, pgtable_t *pgtable)
{
	ASSERT(immu);
	ASSERT(pgtable);

	kmem_cache_free(immu_pgtable_cache, pgtable);
}

/*
 * Function to identify a display device from the PCI class code
 */
static boolean_t
device_is_display(uint_t classcode)
{
	static uint_t disp_classes[] = {
		0x000100,
		0x030000,
		0x030001
	};
	int i, nclasses = sizeof (disp_classes) / sizeof (uint_t);

	for (i = 0; i < nclasses; i++) {
		if (classcode == disp_classes[i])
			return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Function that determines if device is PCIEX and/or PCIEX bridge
 */
static boolean_t
device_is_pciex(
	uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib)
{
	ushort_t cap;
	ushort_t capsp;
	ushort_t cap_count = PCI_CAP_MAX_PTR;
	ushort_t status;
	boolean_t is_pciex = B_FALSE;

	*is_pcib = B_FALSE;

	status = pci_getw_func(bus, dev, func, PCI_CONF_STAT);
	if (!(status & PCI_STAT_CAP))
		return (B_FALSE);

	capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR);
	while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) {
		capsp &= PCI_CAP_PTR_MASK;
		cap = pci_getb_func(bus, dev, func, capsp);

		if (cap == PCI_CAP_ID_PCI_E) {
			status = pci_getw_func(bus, dev, func, capsp + 2);
			/*
			 * See section 7.8.2 of PCI-Express Base Spec v1.0a
			 * for Device/Port Type.
			 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the
			 * device is a PCIE2PCI bridge
			 */
			*is_pcib =
			    ((status & PCIE_PCIECAP_DEV_TYPE_MASK) ==
			    PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE;
			is_pciex = B_TRUE;
		}

		capsp = (*pci_getb_func)(bus, dev, func,
		    capsp + PCI_CAP_NEXT_PTR);
	}

	return (is_pciex);
}


/*
 * immu_dvma_get_immu()
 *   get the immu unit structure for a dev_info node
 */
immu_t *
immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags)
{
	immu_devi_t *immu_devi;
	immu_t *immu;

	/*
	 * check if immu unit was already found earlier.
	 * If yes, then it will be stashed in immu_devi struct.
	 */
	immu_devi = immu_devi_get(dip);
	if (immu_devi == NULL) {
		if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) {
			/*
			 * May fail because of low memory. Return error rather
			 * than panic as we want driver to rey again later
			 */
			ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
			    "No immu_devi structure");
			/*NOTREACHED*/
		}
		immu_devi = immu_devi_get(dip);
		ASSERT(immu_devi);
	}

	mutex_enter(&(DEVI(dip)->devi_lock));
	if (immu_devi->imd_immu) {
		immu = immu_devi->imd_immu;
		mutex_exit(&(DEVI(dip)->devi_lock));
		return (immu);
	}
	mutex_exit(&(DEVI(dip)->devi_lock));

	immu = immu_dmar_get_immu(dip);
	if (immu == NULL) {
		ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
		    "Cannot find immu_t for device");
		/*NOTREACHED*/
	}

	/*
	 * Check if some other thread found immu
	 * while lock was not held
	 */
	immu_devi = immu_devi_get(dip);
	/* immu_devi should be present as we found it earlier */
	if (immu_devi == NULL) {
		ddi_err(DER_PANIC, dip,
		    "immu_dvma_get_immu: No immu_devi structure");
		/*NOTREACHED*/
	}

	mutex_enter(&(DEVI(dip)->devi_lock));
	if (immu_devi->imd_immu == NULL) {
		/* nobody else set it, so we should do it */
		immu_devi->imd_immu = immu;
		immu_devi_set_spclist(dip, immu);
	} else {
		/*
		 * if some other thread got immu before
		 * us, it should get the same results
		 */
		if (immu_devi->imd_immu != immu) {
			ddi_err(DER_PANIC, dip, "Multiple "
			    "immu units found for device. Expected (%p), "
			    "actual (%p)", (void *)immu,
			    (void *)immu_devi->imd_immu);
			mutex_exit(&(DEVI(dip)->devi_lock));
			/*NOTREACHED*/
		}
	}
	mutex_exit(&(DEVI(dip)->devi_lock));

	return (immu);
}


/* ############################# IMMU_DEVI code ############################ */

/*
 * Allocate a immu_devi structure and initialize it
 */
static immu_devi_t *
create_immu_devi(dev_info_t *rdip, int bus, int dev, int func,
    immu_flags_t immu_flags)
{
	uchar_t baseclass, subclass;
	uint_t classcode, revclass;
	immu_devi_t *immu_devi;
	boolean_t pciex = B_FALSE;
	int kmflags;
	boolean_t is_pcib = B_FALSE;

	/* bus ==  -1 indicate non-PCI device (no BDF) */
	ASSERT(bus == -1 || bus >= 0);
	ASSERT(dev >= 0);
	ASSERT(func >= 0);

	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
	immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags);
	if (immu_devi == NULL) {
		ddi_err(DER_WARN, rdip, "Failed to allocate memory for "
		    "Intel IOMMU immu_devi structure");
		return (NULL);
	}
	immu_devi->imd_dip = rdip;
	immu_devi->imd_seg = 0; /* Currently seg can only be 0 */
	immu_devi->imd_bus = bus;
	immu_devi->imd_pcib_type = IMMU_PCIB_BAD;

	if (bus == -1) {
		immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF;
		return (immu_devi);
	}

	immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func);
	immu_devi->imd_sec = 0;
	immu_devi->imd_sub = 0;

	revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID);

	classcode = IMMU_PCI_REV2CLASS(revclass);
	baseclass = IMMU_PCI_CLASS2BASE(classcode);
	subclass = IMMU_PCI_CLASS2SUB(classcode);

	if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) {

		immu_devi->imd_sec = pci_getb_func(bus, dev, func,
		    PCI_BCNF_SECBUS);
		immu_devi->imd_sub = pci_getb_func(bus, dev, func,
		    PCI_BCNF_SUBBUS);

		pciex = device_is_pciex(bus, dev, func, &is_pcib);
		if (pciex  == B_TRUE && is_pcib == B_TRUE) {
			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI;
		} else if (pciex == B_TRUE) {
			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE;
		} else {
			immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI;
		}
	} else {
		immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT;
	}

	/* check for certain special devices */
	immu_devi->imd_display = device_is_display(classcode);

	immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) &&
	    (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE;

	immu_devi->imd_domain = NULL;

	return (immu_devi);
}

static void
destroy_immu_devi(immu_devi_t *immu_devi)
{
	kmem_free(immu_devi, sizeof (immu_devi_t));
}

static domain_t *
immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp)
{
	immu_devi_t *immu_devi;
	domain_t *domain;
	dev_info_t *ddip;

	ASSERT(rdip);
	ASSERT(ddipp);

	*ddipp = NULL;

	immu_devi = immu_devi_get(rdip);
	if (immu_devi == NULL) {
		return (NULL);
	}

	mutex_enter(&(DEVI(rdip)->devi_lock));
	domain = immu_devi->imd_domain;
	ddip = immu_devi->imd_ddip;
	mutex_exit(&(DEVI(rdip)->devi_lock));

	if (domain) {
		ASSERT(domain->dom_did > 0);
		ASSERT(ddip);
		*ddipp = ddip;
	}

	return (domain);

}

/* ############################# END IMMU_DEVI code ######################## */
/* ############################# DOMAIN code ############################### */

/*
 * This routine always succeeds
 */
static int
did_alloc(immu_t *immu, dev_info_t *rdip,
    dev_info_t *ddip, immu_flags_t immu_flags)
{
	int did;

	ASSERT(immu);
	ASSERT(rdip);
	ASSERT(rdip != root_devinfo);

	did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1,
	    (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP);

	if (did == 0) {
		ASSERT(immu->immu_unity_domain);
		ASSERT(immu->immu_unity_domain->dom_did > 0);
		ddi_err(DER_WARN, rdip, "device domain-id alloc error"
		    " domain-device: %s%d. immu unit is %s. Using "
		    "unity domain with domain-id (%d)",
		    ddi_driver_name(ddip), ddi_get_instance(ddip),
		    immu->immu_name, immu->immu_unity_domain->dom_did);
		did = immu->immu_unity_domain->dom_did;
	}

	return (did);
}

static int
get_branch_domain(dev_info_t *pdip, void *arg)
{
	immu_devi_t *immu_devi;
	domain_t *domain;
	dev_info_t *ddip;
	immu_t *immu;
	dvma_arg_t *dvp = (dvma_arg_t *)arg;

	ASSERT(pdip);
	ASSERT(dvp);
	ASSERT(dvp->dva_rdip);

	/*
	 * The field dvp->dva_rdip is a work-in-progress
	 * and gets updated as we walk up the ancestor
	 * tree. The final ddip is set only when we reach
	 * the top of the tree. So the dvp->dva_ddip field cannot
	 * be relied on until we reach the top of the field.
	 */

	/* immu_devi may not be set. */
	immu_devi = immu_devi_get(pdip);
	if (immu_devi == NULL) {
		if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) {
			dvp->dva_error = DDI_FAILURE;
			return (DDI_WALK_TERMINATE);
		}
	}

	immu_devi = immu_devi_get(pdip);
	ASSERT(immu_devi);
	immu = immu_devi->imd_immu;
	if (immu == NULL) {
		immu = immu_dvma_get_immu(pdip, dvp->dva_flags);
		ASSERT(immu);
	}

	/*
	 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to
	 * terminate the walk (since the device under the PCIE bridge
	 * is a PCIE device and has an independent entry in the
	 * root/context table)
	 */
	if (dvp->dva_rdip != pdip &&
	    immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) {
		return (DDI_WALK_TERMINATE);
	}

	/*
	 * In order to be a domain-dim, it must be a PCI device i.e.
	 * must have valid BDF. This also eliminates the root complex.
	 */
	if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD &&
	    immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) {
		ASSERT(immu_devi->imd_bus >= 0);
		ASSERT(immu_devi->imd_devfunc >= 0);
		dvp->dva_ddip = pdip;
	}

	if (immu_devi->imd_display == B_TRUE ||
	    (dvp->dva_flags & IMMU_FLAGS_UNITY)) {
		dvp->dva_domain = immu->immu_unity_domain;
		/* continue walking to find ddip */
		return (DDI_WALK_CONTINUE);
	}

	mutex_enter(&(DEVI(pdip)->devi_lock));
	domain = immu_devi->imd_domain;
	ddip = immu_devi->imd_ddip;
	mutex_exit(&(DEVI(pdip)->devi_lock));

	if (domain && ddip) {
		/* if domain is set, it must be the same */
		if (dvp->dva_domain) {
			ASSERT(domain == dvp->dva_domain);
		}
		dvp->dva_domain = domain;
		dvp->dva_ddip = ddip;
		return (DDI_WALK_TERMINATE);
	}

	/* immu_devi either has both set or both clear */
	ASSERT(domain == NULL);
	ASSERT(ddip == NULL);

	/* Domain may already be set, continue walking so that ddip gets set */
	if (dvp->dva_domain) {
		return (DDI_WALK_CONTINUE);
	}

	/* domain is not set in either immu_devi or dvp */
	domain = bdf_domain_lookup(immu_devi);
	if (domain == NULL) {
		return (DDI_WALK_CONTINUE);
	}

	/* ok, the BDF hash had a domain for this BDF. */

	/* Grab lock again to check if something else set immu_devi fields */
	mutex_enter(&(DEVI(pdip)->devi_lock));
	if (immu_devi->imd_domain != NULL) {
		ASSERT(immu_devi->imd_domain == domain);
		dvp->dva_domain = domain;
	} else {
		dvp->dva_domain = domain;
	}
	mutex_exit(&(DEVI(pdip)->devi_lock));

	/*
	 * walk upwards until the topmost PCI bridge is found
	 */
	return (DDI_WALK_CONTINUE);

}

static void
map_unity_domain(domain_t *domain)
{
	struct memlist *mp;
	uint64_t start;
	uint64_t npages;
	dcookie_t dcookies[1] = {0};
	int dcount = 0;

	ASSERT(domain);
	ASSERT(domain->dom_did == IMMU_UNITY_DID);

	/*
	 * We call into routines that grab the lock so we should
	 * not be called with the lock held. This does not matter
	 * much since, no else has a reference to this domain
	 */
	ASSERT(!rw_lock_held(&(domain->dom_pgtable_rwlock)));

	/*
	 * UNITY arenas are a mirror of the physical memory
	 * installed on the system.
	 */

#ifdef BUGGY_DRIVERS
	/*
	 * Dont skip page0. Some broken HW/FW access it.
	 */
	dcookies[0].dck_paddr = 0;
	dcookies[0].dck_npages = 1;
	dcount = 1;
	(void) dvma_map(domain->dom_immu, domain, 0, 1, dcookies, dcount, NULL,
	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
#endif

	memlist_read_lock();

	mp = phys_install;

	if (mp->ml_address == 0) {
		/* since we already mapped page1 above */
		start = IMMU_PAGESIZE;
	} else {
		start = mp->ml_address;
	}
	npages = mp->ml_size/IMMU_PAGESIZE + 1;

	dcookies[0].dck_paddr = start;
	dcookies[0].dck_npages = npages;
	dcount = 1;
	(void) dvma_map(domain->dom_immu, domain, start, npages, dcookies,
	    dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);

	ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
	    " - 0x%" PRIx64 "]", start, start + mp->ml_size);

	mp = mp->ml_next;
	while (mp) {
		ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
		    " - 0x%" PRIx64 "]", mp->ml_address,
		    mp->ml_address + mp->ml_size);

		start = mp->ml_address;
		npages = mp->ml_size/IMMU_PAGESIZE + 1;

		dcookies[0].dck_paddr = start;
		dcookies[0].dck_npages = npages;
		dcount = 1;
		(void) dvma_map(domain->dom_immu, domain, start, npages,
		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
		mp = mp->ml_next;
	}

	mp = bios_rsvd;
	while (mp) {
		ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
		    " - 0x%" PRIx64 "]", mp->ml_address,
		    mp->ml_address + mp->ml_size);

		start = mp->ml_address;
		npages = mp->ml_size/IMMU_PAGESIZE + 1;

		dcookies[0].dck_paddr = start;
		dcookies[0].dck_npages = npages;
		dcount = 1;
		(void) dvma_map(domain->dom_immu, domain, start, npages,
		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);

		mp = mp->ml_next;
	}

	memlist_read_unlock();
}

/*
 * create_xlate_arena()
 * 	Create the dvma arena for a domain with translation
 *	mapping
 */
static void
create_xlate_arena(immu_t *immu, domain_t *domain,
    dev_info_t *rdip, immu_flags_t immu_flags)
{
	char *arena_name;
	struct memlist *mp;
	int vmem_flags;
	uint64_t start;
	uint_t mgaw;
	uint64_t size;
	uint64_t maxaddr;
	void *vmem_ret;

	arena_name = domain->dom_dvma_arena_name;

	/* Note, don't do sizeof (arena_name) - it is just a pointer */
	(void) snprintf(arena_name,
	    sizeof (domain->dom_dvma_arena_name),
	    "%s-domain-%d-xlate-DVMA-arena", immu->immu_name,
	    domain->dom_did);

	vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP;

	/*
	 * No one else has access to this domain.
	 * So no domain locks needed
	 */
	ASSERT(!rw_lock_held(&(domain->dom_pgtable_rwlock)));

	/* Restrict mgaddr (max guest addr) to MGAW */
	mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap);

	/*
	 * To ensure we avoid ioapic and PCI MMIO ranges we just
	 * use the physical memory address range of the system as the
	 * range
	 */
	maxaddr = ((uint64_t)1 << mgaw);

	memlist_read_lock();

	mp = phys_install;

	if (mp->ml_address == 0)
		start = MMU_PAGESIZE;
	else
		start = mp->ml_address;

	if (start + mp->ml_size > maxaddr)
		size = maxaddr - start;
	else
		size = mp->ml_size;

	ddi_err(DER_VERB, rdip,
	    "%s: Creating dvma vmem arena [0x%" PRIx64
	    " - 0x%" PRIx64 "]", arena_name, start, start + size);

	ASSERT(domain->dom_dvma_arena == NULL);

	/*
	 * We always allocate in quanta of IMMU_PAGESIZE
	 */
	domain->dom_dvma_arena = vmem_create(arena_name,
	    (void *)(uintptr_t)start,	/* start addr */
	    size,			/* size */
	    IMMU_PAGESIZE,		/* quantum */
	    NULL,			/* afunc */
	    NULL,			/* ffunc */
	    NULL,			/* source */
	    0,				/* qcache_max */
	    vmem_flags);

	if (domain->dom_dvma_arena == NULL) {
		ddi_err(DER_PANIC, rdip,
		    "Failed to allocate DVMA arena(%s) "
		    "for domain ID (%d)", arena_name, domain->dom_did);
		/*NOTREACHED*/
	}

	mp = mp->ml_next;
	while (mp) {

		if (mp->ml_address == 0)
			start = MMU_PAGESIZE;
		else
			start = mp->ml_address;

		if (start + mp->ml_size > maxaddr)
			size = maxaddr - start;
		else
			size = mp->ml_size;

		ddi_err(DER_VERB, rdip,
		    "%s: Adding dvma vmem span [0x%" PRIx64
		    " - 0x%" PRIx64 "]", arena_name, start,
		    start + size);

		vmem_ret = vmem_add(domain->dom_dvma_arena,
		    (void *)(uintptr_t)start, size,  vmem_flags);

		if (vmem_ret == NULL) {
			ddi_err(DER_PANIC, rdip,
			    "Failed to allocate DVMA arena(%s) "
			    "for domain ID (%d)",
			    arena_name, domain->dom_did);
			/*NOTREACHED*/
		}
		mp = mp->ml_next;
	}
	memlist_read_unlock();
}

/* ################################### DOMAIN CODE ######################### */

/*
 * Set the domain and domain-dip for a dip
 */
static void
set_domain(
	dev_info_t *dip,
	dev_info_t *ddip,
	domain_t *domain)
{
	immu_devi_t *immu_devi;
	domain_t *fdomain;
	dev_info_t *fddip;

	ASSERT(dip);
	ASSERT(ddip);
	ASSERT(domain);
	ASSERT(domain->dom_did > 0); /* must be an initialized domain */

	immu_devi = immu_devi_get(dip);
	ASSERT(immu_devi);

	mutex_enter(&(DEVI(dip)->devi_lock));
	fddip = immu_devi->imd_ddip;
	fdomain = immu_devi->imd_domain;

	if (fddip) {
		ASSERT(fddip == ddip);
	} else {
		immu_devi->imd_ddip = ddip;
	}

	if (fdomain) {
		ASSERT(fdomain == domain);
	} else {
		immu_devi->imd_domain = domain;
	}
	mutex_exit(&(DEVI(dip)->devi_lock));
}

/*
 * device_domain()
 * 	Get domain for a device. The domain may be global in which case it
 *	is shared between all IOMMU units. Due to potential AGAW differences
 *      between IOMMU units, such global domains *have to be* UNITY mapping
 *      domains. Alternatively, the domain may be local to a IOMMU unit.
 *	Local domains may be shared or immu_devi, although the
 *      scope of sharing
 *	is restricted to devices controlled by the IOMMU unit to
 *      which the domain
 *	belongs. If shared, they (currently) have to be UNITY domains. If
 *      immu_devi a domain may be either UNITY or translation (XLATE) domain.
 */
static domain_t *
device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags)
{
	dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */
	immu_t *immu;
	domain_t *domain;
	dvma_arg_t dvarg = {0};
	int level;

	ASSERT(rdip);

	*ddipp = NULL;

	/*
	 * Check if the domain is already set. This is usually true
	 * if this is not the first DVMA transaction.
	 */
	ddip = NULL;
	domain = immu_devi_domain(rdip, &ddip);
	if (domain) {
		ASSERT(domain->dom_did > 0);
		ASSERT(ddip);
		*ddipp = ddip;
		return (domain);
	}

	immu = immu_dvma_get_immu(rdip, immu_flags);
	if (immu == NULL) {
		/*
		 * possible that there is no IOMMU unit for this device
		 * - BIOS bugs are one example.
		 */
		ddi_err(DER_WARN, rdip, "No IMMU unit found for device");
		return (NULL);
	}

	dvarg.dva_rdip = rdip;
	dvarg.dva_ddip = NULL;
	dvarg.dva_domain = NULL;
	dvarg.dva_flags = immu_flags;
	level = 0;
	if (immu_walk_ancestor(rdip, NULL, get_branch_domain,
	    &dvarg, &level, immu_flags) != DDI_SUCCESS) {
		/*
		 * maybe low memory. return error,
		 * so driver tries again later
		 */
		return (NULL);
	}

	/* should have walked at least 1 dip (i.e. edip) */
	ASSERT(level > 0);

	ddip = dvarg.dva_ddip;	/* must be present */
	domain = dvarg.dva_domain;	/* may be NULL */

	/*
	 * We may find the domain during our ancestor walk on any one of our
	 * ancestor dips, If the domain is found then the domain-dip
	 * (i.e. ddip) will also be found in the same immu_devi struct.
	 * The domain-dip is the highest ancestor dip which shares the
	 * same domain with edip.
	 * The domain may or may not be found, but the domain dip must
	 * be found.
	 */
	if (ddip == NULL) {
		ddi_err(DER_MODE, rdip, "Cannot find domain dip for device.");
		return (NULL);
	}

	/*
	 * Did we find a domain ?
	 */
	if (domain) {
		goto found;
	}

	/* nope, so allocate */
	domain = domain_create(immu, ddip, rdip, immu_flags);
	if (domain == NULL) {
		return (NULL);
	}
	ASSERT(domain->dom_did > 0);

	/*FALLTHROUGH*/
found:
	/*
	 * We know *domain *is* the right domain, so panic if
	 * another domain is set for either the request-dip or
	 * effective dip.
	 */
	set_domain(ddip, ddip, domain);
	set_domain(rdip, ddip, domain);

	*ddipp = ddip;
	return (domain);
}

static void
create_unity_domain(immu_t *immu)
{
	domain_t *domain;

	/* 0 is reserved by Vt-d */
	/*LINTED*/
	ASSERT(IMMU_UNITY_DID > 0);

	/* domain created during boot and always use sleep flag */
	domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP);

	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);

	domain->dom_did = IMMU_UNITY_DID;
	domain->dom_maptype = IMMU_MAPTYPE_UNITY;

	domain->dom_immu = immu;
	immu->immu_unity_domain = domain;

	/*
	 * Setup the domain's initial page table
	 * should never fail.
	 */
	domain->dom_pgtable_root = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
	ASSERT(domain->dom_pgtable_root);
	pgtable_zero(immu, domain->dom_pgtable_root);

	map_unity_domain(domain);

	/*
	 * put it on the system-wide UNITY domain list
	 */
	mutex_enter(&(immu_domain_lock));
	list_insert_tail(&immu_unity_domain_list, domain);
	mutex_exit(&(immu_domain_lock));
}

/*
 * ddip is the domain-dip - the topmost dip in a domain
 * rdip is the requesting-dip - the device which is
 * requesting DVMA setup
 * if domain is a non-shared domain rdip == ddip
 */
static domain_t *
domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip,
    immu_flags_t immu_flags)
{
	int kmflags;
	domain_t *domain;
	char mod_hash_name[128];
	immu_devi_t *immu_devi;
	int did;
	dcookie_t dcookies[1] = {0};
	int dcount = 0;

	ASSERT(immu);
	ASSERT(ddip);

	immu_devi = immu_devi_get(rdip);

	ASSERT(immu_devi);

	/*
	 * First allocate a domainid.
	 * This routine will never fail, since if we run out
	 * of domains the unity domain will be allocated.
	 */
	did = did_alloc(immu, rdip, ddip, immu_flags);
	ASSERT(did > 0);
	if (did == IMMU_UNITY_DID) {
		/* domain overflow */
		ASSERT(immu->immu_unity_domain);
		return (immu->immu_unity_domain);
	}

	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
	domain = kmem_zalloc(sizeof (domain_t), kmflags);
	if (domain == NULL) {
		ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain "
		    "structure for device. IOMMU unit: %s", immu->immu_name);
		/*NOTREACHED*/
	}

	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);

	(void) snprintf(mod_hash_name, sizeof (mod_hash_name),
	    "immu%s-domain%d-pava-hash", immu->immu_name, did);

	domain->dom_did = did;
	domain->dom_immu = immu;
	domain->dom_maptype = IMMU_MAPTYPE_XLATE;

	/*
	 * Create xlate DVMA arena for this domain.
	 */
	create_xlate_arena(immu, domain, rdip, immu_flags);

	/*
	 * Setup the domain's initial page table
	 */
	domain->dom_pgtable_root = pgtable_alloc(immu, immu_flags);
	if (domain->dom_pgtable_root == NULL) {
		ddi_err(DER_PANIC, rdip, "Failed to alloc root "
		    "pgtable for domain (%d). IOMMU unit: %s",
		    domain->dom_did, immu->immu_name);
		/*NOTREACHED*/
	}
	pgtable_zero(immu, domain->dom_pgtable_root);

	/*
	 * Since this is a immu unit-specific domain, put it on
	 * the per-immu domain list.
	 */
	mutex_enter(&(immu->immu_lock));
	list_insert_head(&immu->immu_domain_list, domain);
	mutex_exit(&(immu->immu_lock));

	/*
	 * Also put it on the system-wide xlate domain list
	 */
	mutex_enter(&(immu_domain_lock));
	list_insert_head(&immu_xlate_domain_list, domain);
	mutex_exit(&(immu_domain_lock));

	bdf_domain_insert(immu_devi, domain);

#ifdef BUGGY_DRIVERS
	/*
	 * Map page0. Some broken HW/FW access it.
	 */
	dcookies[0].dck_paddr = 0;
	dcookies[0].dck_npages = 1;
	dcount = 1;
	(void) dvma_map(domain->dom_immu, domain, 0, 1, dcookies, dcount, NULL,
	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
#endif
	return (domain);
}

/*
 * Create domainid arena.
 * Domainid 0 is reserved by Vt-d spec and cannot be used by
 * system software.
 * Domainid 1 is reserved by solaris and used for *all* of the following:
 *	as the "uninitialized" domain - For devices not yet controlled
 *	by Solaris
 *	as the "unity" domain - For devices that will always belong
 *	to the unity domain
 *	as the "overflow" domain - Used for any new device after we
 *	run out of domains
 * All of the above domains map into a single domain with
 * domainid 1 and UNITY DVMA mapping
 * Each IMMU unity has its own unity/uninit/overflow domain
 */
static void
did_init(immu_t *immu)
{
	(void) snprintf(immu->immu_did_arena_name,
	    sizeof (immu->immu_did_arena_name),
	    "%s_domainid_arena", immu->immu_name);

	ddi_err(DER_VERB, NULL, "%s: Creating domainid arena %s",
	    immu->immu_name, immu->immu_did_arena_name);

	immu->immu_did_arena = vmem_create(
	    immu->immu_did_arena_name,
	    (void *)(uintptr_t)(IMMU_UNITY_DID + 1),   /* start addr */
	    immu->immu_max_domains - IMMU_UNITY_DID,
	    1,				/* quantum */
	    NULL,			/* afunc */
	    NULL,			/* ffunc */
	    NULL,			/* source */
	    0,				/* qcache_max */
	    VM_SLEEP);

	/* Even with SLEEP flag, vmem_create() can fail */
	if (immu->immu_did_arena == NULL) {
		ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel "
		    "IOMMU domainid allocator: %s", immu->immu_name,
		    immu->immu_did_arena_name);
	}
}

/* #########################  CONTEXT CODE ################################# */

static void
context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table,
    int bus, int devfunc)
{
	pgtable_t *context;
	pgtable_t *pgtable_root;
	pgtable_t *unity_pgtable_root;
	hw_rce_t *hw_rent;
	hw_rce_t *hw_cent;
	hw_rce_t *ctxp;
	int sid;
	krw_t rwtype;
	boolean_t fill_root;
	boolean_t fill_ctx;

	ASSERT(immu);
	ASSERT(domain);
	ASSERT(root_table);
	ASSERT(bus >= 0);
	ASSERT(devfunc >= 0);
	ASSERT(domain->dom_pgtable_root);

	pgtable_root = domain->dom_pgtable_root;

	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
	context = *(pgtable_t **)(ctxp + bus);
	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr) + bus;

	fill_root = B_FALSE;
	fill_ctx = B_FALSE;

	/* Check the most common case first with reader lock */
	rw_enter(&(immu->immu_ctx_rwlock), RW_READER);
	rwtype = RW_READER;
again:
	if (ROOT_GET_P(hw_rent)) {
		ASSERT(ROOT_GET_CONT(hw_rent) == context->hwpg_paddr);
		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
		if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED) {
			ASSERT(CONT_GET_P(hw_cent));
			ASSERT(CONT_GET_DID(hw_cent) == domain->dom_did);
			ASSERT(CONT_GET_AW(hw_cent) == immu->immu_dvma_agaw);
			ASSERT(CONT_GET_TTYPE(hw_cent) == TTYPE_XLATE_ONLY);
			ASSERT(CONT_GET_ASR(hw_cent) ==
			    pgtable_root->hwpg_paddr);
			rw_exit(&(immu->immu_ctx_rwlock));
			return;
		} else {
			fill_ctx = B_TRUE;
		}
	} else {
		fill_root = B_TRUE;
		fill_ctx = B_TRUE;
	}

	if (rwtype == RW_READER &&
	    rw_tryupgrade(&(immu->immu_ctx_rwlock)) == 0) {
		rw_exit(&(immu->immu_ctx_rwlock));
		rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
		rwtype = RW_WRITER;
		goto again;
	}
	rwtype = RW_WRITER;

	if (fill_root == B_TRUE) {
		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
		ROOT_SET_P(hw_rent);
		immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t));
	}

	if (fill_ctx == B_TRUE) {
		hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
		unity_pgtable_root = immu->immu_unity_domain->dom_pgtable_root;
		ASSERT(CONT_GET_AVAIL(hw_cent) == IMMU_CONT_UNINITED);
		ASSERT(CONT_GET_P(hw_cent));
		ASSERT(CONT_GET_DID(hw_cent) ==
		    immu->immu_unity_domain->dom_did);
		ASSERT(CONT_GET_AW(hw_cent) == immu->immu_dvma_agaw);
		ASSERT(CONT_GET_TTYPE(hw_cent) == TTYPE_XLATE_ONLY);
		ASSERT(CONT_GET_ASR(hw_cent) ==
		    unity_pgtable_root->hwpg_paddr);

		/* need to disable context entry before reprogramming it */
		bzero(hw_cent, sizeof (hw_rce_t));

		/* flush caches */
		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
		ASSERT(rw_write_held(&(immu->immu_ctx_rwlock)));

		sid = ((bus << 8) | devfunc);
		immu_regs_context_flush(immu, 0, sid, domain->dom_did,
		    CONTEXT_FSI);

		immu_regs_wbf_flush(immu);

		CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED);
		CONT_SET_DID(hw_cent, domain->dom_did);
		CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
		CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
		/*LINTED*/
		CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
		CONT_SET_P(hw_cent);
		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
	}
	rw_exit(&(immu->immu_ctx_rwlock));
}

static pgtable_t *
context_create(immu_t *immu)
{
	int	bus;
	int	devfunc;
	pgtable_t *root_table;
	pgtable_t *context;
	pgtable_t *pgtable_root;
	hw_rce_t *ctxp;
	hw_rce_t *hw_rent;
	hw_rce_t *hw_cent;

	/* Allocate a zeroed root table (4K 256b entries) */
	root_table = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
	pgtable_zero(immu, root_table);

	/*
	 * Setup context tables for all possible root table entries.
	 * Start out with unity domains for all entries.
	 */
	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr);
	for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) {
		context = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
		pgtable_zero(immu, context);
		ASSERT(ROOT_GET_P(hw_rent) == 0);
		ROOT_SET_P(hw_rent);
		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
		hw_cent = (hw_rce_t *)(context->hwpg_vaddr);
		for (devfunc = 0; devfunc < IMMU_CONT_NUM;
		    devfunc++, hw_cent++) {
			ASSERT(CONT_GET_P(hw_cent) == 0);
			pgtable_root =
			    immu->immu_unity_domain->dom_pgtable_root;
			CONT_SET_DID(hw_cent,
			    immu->immu_unity_domain->dom_did);
			CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
			CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
			/*LINTED*/
			CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
			CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED);
			CONT_SET_P(hw_cent);
		}
		immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE);
		*((pgtable_t **)ctxp) = context;
	}
	immu_regs_cpu_flush(immu, root_table->hwpg_vaddr, IMMU_PAGESIZE);

	return (root_table);
}

/*
 * Called during rootnex attach, so no locks needed
 */
static void
context_init(immu_t *immu)
{
	ASSERT(immu);
	ASSERT(immu->immu_ctx_root == NULL);

	rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL);

	immu_regs_wbf_flush(immu);

	immu->immu_ctx_root = context_create(immu);

	immu_regs_set_root_table(immu);

	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
	immu_regs_context_flush(immu, 0, 0, 0, CONTEXT_GLOBAL);
	rw_exit(&(immu->immu_ctx_rwlock));
	immu_regs_iotlb_flush(immu, 0, 0, 0, 0, IOTLB_GLOBAL);
	immu_regs_wbf_flush(immu);
}


/*
 * Find top pcib
 */
static int
find_top_pcib(dev_info_t *dip, void *arg)
{
	immu_devi_t *immu_devi;
	dev_info_t **pcibdipp = (dev_info_t **)arg;

	ASSERT(dip);

	immu_devi = immu_devi_get(dip);
	ASSERT(immu_devi);

	if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) {
		*pcibdipp = dip;
	}

	return (DDI_WALK_CONTINUE);
}

static int
immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip,
    dev_info_t *rdip, immu_flags_t immu_flags)
{
	immu_devi_t *r_immu_devi;
	immu_devi_t *d_immu_devi;
	int r_bus;
	int d_bus;
	int r_devfunc;
	int d_devfunc;
	immu_pcib_t d_pcib_type;
	immu_pcib_t r_pcib_type;
	dev_info_t *pcibdip;

	if (ddip == NULL || rdip == NULL ||
	    ddip == root_devinfo || rdip == root_devinfo) {
		ddi_err(DER_MODE, rdip, "immu_contexts_update: domain-dip or "
		    "request-dip are NULL or are root devinfo");
		return (DDI_FAILURE);
	}

	/*
	 * We need to set the context fields
	 * based on what type of device rdip and ddip are.
	 * To do that we need the immu_devi field.
	 * Set the immu_devi field (if not already set)
	 */
	if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) {
		ddi_err(DER_MODE, rdip,
		    "immu_context_update: failed to set immu_devi for ddip");
		return (DDI_FAILURE);
	}

	if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) {
		ddi_err(DER_MODE, rdip,
		    "immu_context_update: failed to set immu_devi for rdip");
		return (DDI_FAILURE);
	}

	d_immu_devi = immu_devi_get(ddip);
	r_immu_devi = immu_devi_get(rdip);
	ASSERT(r_immu_devi);
	ASSERT(d_immu_devi);

	d_bus = d_immu_devi->imd_bus;
	d_devfunc = d_immu_devi->imd_devfunc;
	d_pcib_type = d_immu_devi->imd_pcib_type;
	r_bus = r_immu_devi->imd_bus;
	r_devfunc = r_immu_devi->imd_devfunc;
	r_pcib_type = r_immu_devi->imd_pcib_type;

	ASSERT(d_bus >= 0);

	if (rdip == ddip) {
		ASSERT(d_pcib_type == IMMU_PCIB_ENDPOINT ||
		    d_pcib_type == IMMU_PCIB_PCIE_PCIE);
		ASSERT(r_bus >= 0);
		ASSERT(r_devfunc >= 0);
		/* rdip is a PCIE device. set context for it only */
		context_set(immu, domain, immu->immu_ctx_root, r_bus,
		    r_devfunc);
#ifdef BUGGY_DRIVERS
	} else if (r_immu_devi == d_immu_devi) {
#ifdef TEST
		ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and "
		    "0x%lx are identical", rdip, ddip);
#endif
		ASSERT(d_pcib_type == IMMU_PCIB_ENDPOINT);
		ASSERT(r_bus >= 0);
		ASSERT(r_devfunc >= 0);
		/* rdip is a PCIE device. set context for it only */
		context_set(immu, domain, immu->immu_ctx_root, r_bus,
		    r_devfunc);
#endif
	} else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) {
		/*
		 * ddip is a PCIE_PCI bridge. Set context for ddip's
		 * secondary bus. If rdip is on ddip's secondary
		 * bus, set context for rdip. Else, set context
		 * for rdip's PCI bridge on ddip's secondary bus.
		 */
		context_set(immu, domain, immu->immu_ctx_root,
		    d_immu_devi->imd_sec, 0);
		if (d_immu_devi->imd_sec == r_bus) {
			context_set(immu, domain, immu->immu_ctx_root,
			    r_bus, r_devfunc);
		} else {
			pcibdip = NULL;
			if (immu_walk_ancestor(rdip, ddip, find_top_pcib,
			    &pcibdip, NULL, immu_flags) == DDI_SUCCESS &&
			    pcibdip != NULL) {
				ASSERT(pcibdip);
				r_immu_devi = immu_devi_get(pcibdip);
				ASSERT(d_immu_devi);
				ASSERT(d_immu_devi->imd_pcib_type ==
				    IMMU_PCIB_PCI_PCI);
				r_bus = r_immu_devi->imd_bus;
				r_devfunc = r_immu_devi->imd_devfunc;
				context_set(immu, domain, immu->immu_ctx_root,
				    r_bus, r_devfunc);
			} else {
				ddi_err(DER_PANIC, rdip, "Failed to find PCI "
				    " bridge for PCI device");
				/*NOTREACHED*/
			}
		}
	} else if (d_pcib_type == IMMU_PCIB_PCI_PCI) {
		context_set(immu, domain, immu->immu_ctx_root, d_bus,
		    d_devfunc);
	} else if (d_pcib_type == IMMU_PCIB_ENDPOINT) {
		ASSERT(r_pcib_type == IMMU_PCIB_NOBDF);
		/*
		 * ddip is a PCIE device which has a non-PCI device under it
		 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata
		 */
		context_set(immu, domain, immu->immu_ctx_root, d_bus,
		    d_devfunc);
	} else {
		ddi_err(DER_PANIC, rdip, "unknown device type. Cannot "
		    "set IMMU context.");
		/*NOTREACHED*/
	}

	/* XXX do we need a membar_producer() here */
	return (DDI_SUCCESS);
}

/* ##################### END CONTEXT CODE ################################## */
/* ##################### MAPPING CODE ################################## */


static boolean_t
PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr,
    dev_info_t *rdip, immu_flags_t immu_flags)
{
	if (immu_flags & IMMU_FLAGS_PAGE1) {
		ASSERT(paddr == 0);
	} else {
		ASSERT((next == NULL) ^ (paddr == 0));
	}

	/* The PDTE must be set i.e. present bit is set */
	if (!PDTE_P(pdte)) {
		ddi_err(DER_MODE, rdip, "No present flag");
		return (B_FALSE);
	}

	/*
	 * Just assert to check most significant system software field
	 * (PDTE_SW4) as it is same as present bit and we
	 * checked that above
	 */
	ASSERT(PDTE_SW4(pdte));

	/*
	 * TM field should be clear if not reserved.
	 * non-leaf is always reserved
	 */
	if (next == NULL && immu->immu_TM_reserved == B_FALSE) {
		if (PDTE_TM(pdte)) {
			ddi_err(DER_MODE, rdip, "TM flag set");
			return (B_FALSE);
		}
	}

	/*
	 * The SW3 field is not used and must be clear
	 */
	if (PDTE_SW3(pdte)) {
		ddi_err(DER_MODE, rdip, "SW3 set");
		return (B_FALSE);
	}

	/*
	 * PFN (for PTE) or next level pgtable-paddr (for PDE) must be set
	 */
	if (next == NULL) {
		ASSERT(paddr % IMMU_PAGESIZE == 0);
		if (PDTE_PADDR(pdte) != paddr) {
			ddi_err(DER_MODE, rdip,
			    "PTE paddr mismatch: %lx != %lx",
			    PDTE_PADDR(pdte), paddr);
			return (B_FALSE);
		}
	} else {
		if (PDTE_PADDR(pdte) != next->hwpg_paddr) {
			ddi_err(DER_MODE, rdip,
			    "PDE paddr mismatch: %lx != %lx",
			    PDTE_PADDR(pdte), next->hwpg_paddr);
			return (B_FALSE);
		}
	}

	/*
	 * SNP field should be clear if not reserved.
	 * non-leaf is always reserved
	 */
	if (next == NULL && immu->immu_SNP_reserved == B_FALSE) {
		if (PDTE_SNP(pdte)) {
			ddi_err(DER_MODE, rdip, "SNP set");
			return (B_FALSE);
		}
	}

	/* second field available for system software should be clear */
	if (PDTE_SW2(pdte)) {
		ddi_err(DER_MODE, rdip, "SW2 set");
		return (B_FALSE);
	}

	/* Super pages field should be clear */
	if (PDTE_SP(pdte)) {
		ddi_err(DER_MODE, rdip, "SP set");
		return (B_FALSE);
	}

	/*
	 * least significant field available for
	 * system software should be clear
	 */
	if (PDTE_SW1(pdte)) {
		ddi_err(DER_MODE, rdip, "SW1 set");
		return (B_FALSE);
	}

	if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) {
		ddi_err(DER_MODE, rdip, "READ not set");
		return (B_FALSE);
	}

	if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) {
		ddi_err(DER_MODE, rdip, "WRITE not set");
		return (B_FALSE);
	}

	return (B_TRUE);
}
/*ARGSUSED*/
static void
PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
    uint64_t *dvma_ptr, uint64_t *npages_ptr, dev_info_t *rdip)
{
	uint64_t npages;
	uint64_t dvma;
	pgtable_t *pgtable;
	hw_pdte_t *hwp;
	hw_pdte_t *shwp;
	int idx;
	hw_pdte_t pte;

	ASSERT(xlate->xlt_level == 1);

	pgtable = xlate->xlt_pgtable;
	idx = xlate->xlt_idx;

	ASSERT(pgtable);
	ASSERT(idx <= IMMU_PGTABLE_MAXIDX);

	dvma = *dvma_ptr;
	npages = *npages_ptr;

	ASSERT(dvma);
	ASSERT(dvma % IMMU_PAGESIZE == 0);
	ASSERT(npages);

	/*
	 * since a caller gets a unique dvma for a physical address,
	 * no other concurrent thread will be writing to the same
	 * PTE even if it has the same paddr. So no locks needed.
	 */
	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;

	hwp = shwp;
	for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {

		pte = *hwp;

		/* Cannot clear a HW PTE that is aleady clear */
		ASSERT(PDTE_P(pte));
		PDTE_CLEAR_P(pte);
		*hwp = pte;

		dvma += IMMU_PAGESIZE;
		npages--;
	}


#ifdef TEST
	/* dont need to flush write during unmap */
	immu_regs_cpu_flush(immu, (caddr_t)shwp,
	    (hwp - shwp) * sizeof (hw_pdte_t));
#endif

	*dvma_ptr = dvma;
	*npages_ptr = npages;

	xlate->xlt_idx = idx;
}

/*ARGSUSED*/
static void
xlate_setup(immu_t *immu, uint64_t dvma, xlate_t *xlate,
    int nlevels, dev_info_t *rdip)
{
	int level;
	uint64_t offbits;

	/* level 0 is never used. Sanity check */
	ASSERT(xlate->xlt_level == 0);
	ASSERT(xlate->xlt_idx == 0);
	ASSERT(xlate->xlt_pgtable == NULL);
	ASSERT(dvma % IMMU_PAGESIZE == 0);

	/*
	 * Skip the first 12 bits which is the offset into
	 * 4K PFN (phys page frame based on IMMU_PAGESIZE)
	 */
	offbits = dvma >> IMMU_PAGESHIFT;

	/* skip to level 1 i.e. leaf PTE */
	for (level = 1, xlate++; level <= nlevels; level++, xlate++) {
		xlate->xlt_level = level;
		xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK);
		ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX);
		xlate->xlt_pgtable = NULL;
		offbits >>= IMMU_PGTABLE_LEVEL_STRIDE;
	}
}

/*
 * Read the pgtables
 */
static void
PDE_lookup(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
    dev_info_t *rdip)
{
	pgtable_t *pgtable;
	pgtable_t *next;
	hw_pdte_t pde;
	uint_t idx;

	/* xlate should be at level 0 */
	ASSERT(xlate->xlt_level == 0);
	ASSERT(xlate->xlt_idx == 0);

	/* start with highest level pgtable i.e. root */
	xlate += nlevels;
	ASSERT(xlate->xlt_level == nlevels);

	if (xlate->xlt_pgtable == NULL) {
		xlate->xlt_pgtable = domain->dom_pgtable_root;
	}

	for (; xlate->xlt_level > 1; xlate--) {

		idx = xlate->xlt_idx;
		pgtable = xlate->xlt_pgtable;

		ASSERT(pgtable);
		ASSERT(idx <= IMMU_PGTABLE_MAXIDX);

		if ((xlate - 1)->xlt_pgtable) {
			continue;
		}

		/* xlate's leafier level is not set, set it now */

		/* Lock the pgtable in read mode */
		rw_enter(&(pgtable->swpg_rwlock), RW_READER);

		/*
		 * since we are unmapping, the pgtable should
		 * already point to a leafier pgtable.
		 */
		next = *(pgtable->swpg_next_array + idx);
		ASSERT(next);

		pde = *((hw_pdte_t *)(pgtable->hwpg_vaddr) + idx);

		ASSERT(PDTE_check(immu, pde, next, 0, rdip, 0) == B_TRUE);

		(xlate - 1)->xlt_pgtable = next;

		rw_exit(&(pgtable->swpg_rwlock));
	}
}

/*ARGSUSED*/
static void
PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr,
    dev_info_t *rdip, immu_flags_t immu_flags)
{
	hw_pdte_t pte;

	pte = *hwp;

#ifndef DEBUG
	/* Set paddr */
	ASSERT(paddr % IMMU_PAGESIZE == 0);
	pte = 0;
	PDTE_SET_PADDR(pte, paddr);
	PDTE_SET_READ(pte);
	PDTE_SET_WRITE(pte);
	*hwp = pte;
#else

	if (PDTE_P(pte)) {
		if (PDTE_PADDR(pte) != paddr) {
			ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx",
			    PDTE_PADDR(pte), paddr);
		}
#ifdef BUGGY_DRIVERS
		return;
#else
		goto out;
#endif
	}

	/* Don't touch SW4. It is the present field */

	/* clear TM field if not reserved */
	if (immu->immu_TM_reserved == B_FALSE) {
		PDTE_CLEAR_TM(pte);
	}

#ifdef DEBUG
	/* Clear 3rd field for system software  - not used */
	PDTE_CLEAR_SW3(pte);
#endif

	/* Set paddr */
	ASSERT(paddr % IMMU_PAGESIZE == 0);
	PDTE_CLEAR_PADDR(pte);
	PDTE_SET_PADDR(pte, paddr);

	/*  clear SNP field if not reserved. */
	if (immu->immu_SNP_reserved == B_FALSE) {
		PDTE_CLEAR_SNP(pte);
	}

#ifdef DEBUG
	/* Clear SW2 field available for software */
	PDTE_CLEAR_SW2(pte);
#endif


#ifdef DEBUG
	/* SP is don't care for PTEs. Clear it for cleanliness */
	PDTE_CLEAR_SP(pte);
#endif

#ifdef DEBUG
	/* Clear SW1 field available for software */
	PDTE_CLEAR_SW1(pte);
#endif

	/*
	 * Now that we are done writing the PTE
	 * set the "present" flag. Note this present
	 * flag is a bit in the PDE/PTE that the
	 * spec says is available for system software.
	 * This is an implementation detail of Solaris
	 * bare-metal Intel IOMMU.
	 * The present field in a PDE/PTE is not defined
	 * by the Vt-d spec
	 */

	PDTE_SET_P(pte);

out:
#ifdef BUGGY_DRIVERS
	PDTE_SET_READ(pte);
	PDTE_SET_WRITE(pte);
#else
	if (immu_flags & IMMU_FLAGS_READ)
		PDTE_SET_READ(pte);
	if (immu_flags & IMMU_FLAGS_WRITE)
		PDTE_SET_WRITE(pte);
#endif

	*hwp = pte;
#endif
}

/*ARGSUSED*/
static void
PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
    uint64_t *dvma_ptr, uint64_t *nvpages_ptr, dcookie_t *dcookies,
    int dcount, dev_info_t *rdip, immu_flags_t immu_flags)
{
	paddr_t paddr;
	uint64_t nvpages;
	uint64_t nppages;
	uint64_t dvma;
	pgtable_t *pgtable;
	hw_pdte_t *hwp;
	hw_pdte_t *shwp;
	int idx;
	int j;

	ASSERT(xlate->xlt_level == 1);

	pgtable = xlate->xlt_pgtable;
	idx = xlate->xlt_idx;

	ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
	ASSERT(pgtable);

	dvma = *dvma_ptr;
	nvpages = *nvpages_ptr;

	ASSERT(dvma || (immu_flags & IMMU_FLAGS_PAGE1));
	ASSERT(nvpages);

	/*
	 * since a caller gets a unique dvma for a physical address,
	 * no other concurrent thread will be writing to the same
	 * PTE even if it has the same paddr. So no locks needed.
	 */
	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;

	hwp = shwp;
	for (j = dcount - 1; j >= 0; j--) {
		if (nvpages <= dcookies[j].dck_npages)
			break;
		nvpages -= dcookies[j].dck_npages;
	}

	ASSERT(j >= 0);
	ASSERT(nvpages);
	ASSERT(nvpages <= dcookies[j].dck_npages);
	nppages = nvpages;
	paddr = dcookies[j].dck_paddr +
	    (dcookies[j].dck_npages - nppages) * IMMU_PAGESIZE;

	nvpages = *nvpages_ptr;
	for (; nvpages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {

		ASSERT(paddr || (immu_flags & IMMU_FLAGS_PAGE1));

		PTE_set_one(immu, hwp, paddr, rdip, immu_flags);

		ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags)
		    == B_TRUE);
		nppages--;
		nvpages--;
		paddr += IMMU_PAGESIZE;
		dvma += IMMU_PAGESIZE;

		if (nppages == 0) {
			j++;
		}

		if (j == dcount) {
			ASSERT(nvpages == 0);
			break;
		}

		ASSERT(nvpages);
		if (nppages == 0) {
			nppages = dcookies[j].dck_npages;
			paddr = dcookies[j].dck_paddr;
		}
	}

	/* flush writes to HW PTE table */
	immu_regs_cpu_flush(immu, (caddr_t)shwp, (hwp - shwp) *
	    sizeof (hw_pdte_t));

	if (nvpages) {
		*dvma_ptr = dvma;
		*nvpages_ptr = nvpages;
	} else {
		*dvma_ptr = 0;
		*nvpages_ptr = 0;
	}

	xlate->xlt_idx = idx;
}

/*ARGSUSED*/
static void
PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next,
    dev_info_t *rdip, immu_flags_t immu_flags)
{
	hw_pdte_t pde;

	pde = *hwp;

	/* if PDE is already set, make sure it is correct */
	if (PDTE_P(pde)) {
		ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr);
#ifdef BUGGY_DRIVERS
		return;
#else
		goto out;
#endif
	}

	/* Dont touch SW4, it is the present bit */

	/* don't touch TM field it is reserved for PDEs */

	/* 3rd field available for system software is not used */
	PDTE_CLEAR_SW3(pde);

	/* Set next level pgtable-paddr for PDE */
	ASSERT(next->hwpg_paddr % IMMU_PAGESIZE == 0);
	PDTE_CLEAR_PADDR(pde);
	PDTE_SET_PADDR(pde, next->hwpg_paddr);

	/* don't touch SNP field it is reserved for PDEs */

	/* Clear second field available for system software */
	PDTE_CLEAR_SW2(pde);

	/* No super pages for PDEs */
	PDTE_CLEAR_SP(pde);

	/* Clear SW1 for software */
	PDTE_CLEAR_SW1(pde);

	/*
	 * Now that we are done writing the PDE
	 * set the "present" flag. Note this present
	 * flag is a bit in the PDE/PTE that the
	 * spec says is available for system software.
	 * This is an implementation detail of Solaris
	 * base-metal Intel IOMMU.
	 * The present field in a PDE/PTE is not defined
	 * by the Vt-d spec
	 */

out:
#ifdef  BUGGY_DRIVERS
	PDTE_SET_READ(pde);
	PDTE_SET_WRITE(pde);
#else
	if (immu_flags & IMMU_FLAGS_READ)
		PDTE_SET_READ(pde);
	if (immu_flags & IMMU_FLAGS_WRITE)
		PDTE_SET_WRITE(pde);
#endif

	PDTE_SET_P(pde);

	*hwp = pde;

	immu_regs_cpu_flush(immu, (caddr_t)hwp, sizeof (hw_pdte_t));
}

/*
 * Used to set PDEs
 */
static boolean_t
PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
    dev_info_t *rdip, immu_flags_t immu_flags)
{
	pgtable_t *pgtable;
	pgtable_t *new;
	pgtable_t *next;
	hw_pdte_t *hwp;
	int level;
	uint_t idx;
	krw_t rwtype;
	boolean_t set = B_FALSE;

	/* xlate should be at level 0 */
	ASSERT(xlate->xlt_level == 0);
	ASSERT(xlate->xlt_idx == 0);

	/* start with highest level pgtable i.e. root */
	xlate += nlevels;
	ASSERT(xlate->xlt_level == nlevels);

	new = NULL;
	xlate->xlt_pgtable = domain->dom_pgtable_root;
	for (level = nlevels; level > 1; level--, xlate--) {

		ASSERT(xlate->xlt_level == level);

		idx = xlate->xlt_idx;
		pgtable = xlate->xlt_pgtable;

		ASSERT(pgtable);
		ASSERT(idx <= IMMU_PGTABLE_MAXIDX);

		/* speculative alloc */
		if (new == NULL) {
			new = pgtable_alloc(immu, immu_flags);
			if (new == NULL) {
				ddi_err(DER_PANIC, rdip, "pgtable alloc err");
			}
		}

		/* Lock the pgtable in READ mode first */
		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
		rwtype = RW_READER;
again:
		hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;

		ASSERT(pgtable->swpg_next_array);

		next = (pgtable->swpg_next_array)[idx];

		/*
		 * check if leafier level already has a pgtable
		 * if yes, verify
		 */
		if (next == NULL) {
			/* Change to a write lock */
			if (rwtype == RW_READER &&
			    rw_tryupgrade(&(pgtable->swpg_rwlock)) == 0) {
				rw_exit(&(pgtable->swpg_rwlock));
				rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
				rwtype = RW_WRITER;
				goto again;
			}
			rwtype = RW_WRITER;
			pgtable_zero(immu, new);
			next = new;
			new = NULL;
			(pgtable->swpg_next_array)[idx] = next;
			PDE_set_one(immu, hwp, next, rdip, immu_flags);
			set = B_TRUE;
			rw_downgrade(&(pgtable->swpg_rwlock));
			rwtype = RW_READER;
		} else {
			hw_pdte_t pde = *hwp;

#ifndef  BUGGY_DRIVERS
			/*
			 * If buggy driver we already set permission
			 * READ+WRITE so nothing to do for that case
			 * XXX Check that read writer perms change before
			 * actually setting perms. Also need to hold lock
			 */
			if (immu_flags & IMMU_FLAGS_READ)
				PDTE_SET_READ(pde);
			if (immu_flags & IMMU_FLAGS_WRITE)
				PDTE_SET_WRITE(pde);

#endif

			*hwp = pde;
		}

		ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags)
		    == B_TRUE);

		(xlate - 1)->xlt_pgtable = next;
		ASSERT(rwtype == RW_READER);
		rw_exit(&(pgtable->swpg_rwlock));
	}

	if (new) {
		pgtable_free(immu, new);
	}

	return (set);
}

/*
 * dvma_map()
 *     map a contiguous range of DVMA pages
 *
 *     immu: IOMMU unit for which we are generating DVMA cookies
 *   domain: domain
 *    sdvma: Starting dvma
 *   spaddr: Starting paddr
 *   npages: Number of pages
 *     rdip: requesting device
 *     immu_flags: flags
 */
static boolean_t
dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma, uint64_t snvpages,
    dcookie_t *dcookies, int dcount, dev_info_t *rdip, immu_flags_t immu_flags)
{
	uint64_t dvma;
	uint64_t n;
	int nlevels = immu->immu_dvma_nlevels;
	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
	boolean_t pde_set = B_FALSE;

	ASSERT(nlevels <= IMMU_PGTABLE_MAX_LEVELS);
	ASSERT(sdvma % IMMU_PAGESIZE == 0);
	ASSERT(snvpages);

	n = snvpages;
	dvma = sdvma;

	while (n > 0) {
		xlate_setup(immu, dvma, xlate, nlevels, rdip);

		/* Lookup or allocate PGDIRs and PGTABLEs if necessary */
		if (PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags)
		    == B_TRUE) {
			pde_set = B_TRUE;
		}

		/* set all matching ptes that fit into this leaf pgtable */
		PTE_set_all(immu, domain, &xlate[1], &dvma, &n, dcookies,
		    dcount, rdip, immu_flags);
	}

	return (pde_set);
}

/*
 * dvma_unmap()
 *   unmap a range of DVMAs
 *
 * immu: IOMMU unit state
 * domain: domain for requesting device
 * ddip: domain-dip
 * dvma: starting DVMA
 * npages: Number of IMMU pages to be unmapped
 * rdip: requesting device
 */
static void
dvma_unmap(immu_t *immu, domain_t *domain, uint64_t sdvma, uint64_t snpages,
    dev_info_t *rdip)
{
	int nlevels = immu->immu_dvma_nlevels;
	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
	uint64_t n;
	uint64_t dvma;

	ASSERT(nlevels <= IMMU_PGTABLE_MAX_LEVELS);
	ASSERT(sdvma != 0);
	ASSERT(sdvma % IMMU_PAGESIZE == 0);
	ASSERT(snpages);

	dvma = sdvma;
	n = snpages;

	while (n > 0) {
		/* setup the xlate array */
		xlate_setup(immu, dvma, xlate, nlevels, rdip);

		/* just lookup existing pgtables. Should never fail */
		PDE_lookup(immu, domain, xlate, nlevels, rdip);

		/* clear all matching ptes that fit into this leaf pgtable */
		PTE_clear_all(immu, domain, &xlate[1], &dvma, &n, rdip);
	}

	/* No need to flush IOTLB after unmap */
}

static uint64_t
dvma_alloc(ddi_dma_impl_t *hp, domain_t *domain, uint_t npages)
{
	ddi_dma_attr_t *dma_attr;
	uint64_t dvma;
	size_t xsize, align;
	uint64_t minaddr, maxaddr;

	ASSERT(domain->dom_maptype != IMMU_MAPTYPE_UNITY);

	/* shotcuts */
	dma_attr = &(hp->dmai_attr);

	/* parameters */
	xsize = npages * IMMU_PAGESIZE;
	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
	minaddr = dma_attr->dma_attr_addr_lo;
	maxaddr = dma_attr->dma_attr_addr_hi + 1;
	/* nocross is checked in cookie_update() */

	/* handle the rollover cases */
	if (maxaddr < dma_attr->dma_attr_addr_hi) {
		maxaddr = dma_attr->dma_attr_addr_hi;
	}

	/*
	 * allocate from vmem arena.
	 */
	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
	    xsize, align, 0, 0, (void *)(uintptr_t)minaddr,
	    (void *)(uintptr_t)maxaddr, VM_NOSLEEP);

	ASSERT(dvma);
	ASSERT(dvma >= minaddr);
	ASSERT(dvma + xsize - 1 < maxaddr);

	return (dvma);
}

static void
dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages)
{
	uint64_t size = npages * IMMU_PAGESIZE;

	ASSERT(domain);
	ASSERT(domain->dom_did > 0);
	ASSERT(dvma);
	ASSERT(npages);

	if (domain->dom_maptype != IMMU_MAPTYPE_XLATE) {
		ASSERT(domain->dom_maptype == IMMU_MAPTYPE_UNITY);
		return;
	}

	vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size);
}
/*ARGSUSED*/
static void
cookie_free(rootnex_dma_t *dma, immu_t *immu, domain_t *domain,
    dev_info_t *rdip)
{
	int i;
	uint64_t dvma;
	uint64_t npages;
	dvcookie_t  *dvcookies = dma->dp_dvcookies;

	ASSERT(dma->dp_max_cookies);
	ASSERT(dma->dp_max_dcookies);
	ASSERT(dma->dp_dvmax < dma->dp_max_cookies);
	ASSERT(dma->dp_dmax < dma->dp_max_dcookies);

	/*
	 * we allocated DVMA in a single chunk. Calculate total number
	 * of pages
	 */
	for (i = 0, npages = 0; i <= dma->dp_dvmax; i++) {
		npages += dvcookies[i].dvck_npages;
	}
	dvma = dvcookies[0].dvck_dvma;
#ifdef DEBUG
	/* Unmap only in DEBUG mode */
	dvma_unmap(immu, domain, dvma, npages, rdip);
#endif
	dvma_free(domain, dvma, npages);

	kmem_free(dma->dp_dvcookies, sizeof (dvcookie_t) * dma->dp_max_cookies);
	dma->dp_dvcookies = NULL;
	kmem_free(dma->dp_dcookies, sizeof (dcookie_t) * dma->dp_max_dcookies);
	dma->dp_dcookies = NULL;
	if (dma->dp_need_to_free_cookie == B_TRUE) {
		kmem_free(dma->dp_cookies, sizeof (ddi_dma_cookie_t) *
		    dma->dp_max_cookies);
		dma->dp_dcookies = NULL;
		dma->dp_need_to_free_cookie = B_FALSE;
	}

	dma->dp_max_cookies = 0;
	dma->dp_max_dcookies = 0;
	dma->dp_cookie_size = 0;
	dma->dp_dvmax = 0;
	dma->dp_dmax = 0;
}

/*
 * cookie_alloc()
 */
static int
cookie_alloc(rootnex_dma_t *dma, struct ddi_dma_req *dmareq,
    ddi_dma_attr_t *attr, uint_t prealloc)
{
	int kmflag;
	rootnex_sglinfo_t *sinfo = &(dma->dp_sglinfo);
	dvcookie_t *dvcookies = dma->dp_dvcookies;
	dcookie_t *dcookies = dma->dp_dcookies;
	ddi_dma_cookie_t *cookies = dma->dp_cookies;
	uint64_t max_cookies;
	uint64_t max_dcookies;
	uint64_t cookie_size;

	/* we need to allocate new array */
	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
		kmflag =  KM_SLEEP;
	} else {
		kmflag =  KM_NOSLEEP;
	}

	/*
	 * XXX make sure cookies size doen't exceed sinfo->si_max_cookie_size;
	 */

	/*
	 * figure out the rough estimate of array size
	 * At a minimum, each cookie must hold 1 page.
	 * At a maximum, it cannot exceed dma_attr_sgllen
	 */
	max_dcookies = dmareq->dmar_object.dmao_size + IMMU_PAGEOFFSET;
	max_dcookies /= IMMU_PAGESIZE;
	max_dcookies++;
	max_cookies = MIN(max_dcookies, attr->dma_attr_sgllen);

	/* allocate the dvma cookie array */
	dvcookies = kmem_zalloc(sizeof (dvcookie_t) * max_cookies, kmflag);
	if (dvcookies == NULL) {
		return (DDI_FAILURE);
	}

	/* allocate the "phys" cookie array */
	dcookies = kmem_zalloc(sizeof (dcookie_t) * max_dcookies, kmflag);
	if (dcookies == NULL) {
		kmem_free(dvcookies, sizeof (dvcookie_t) * max_cookies);
		dvcookies = NULL;
		return (DDI_FAILURE);
	}

	/* allocate the "real" cookie array  - the one given to users */
	cookie_size = sizeof (ddi_dma_cookie_t) * max_cookies;
	if (max_cookies > prealloc) {
		cookies = kmem_zalloc(cookie_size, kmflag);
		if (cookies == NULL) {
			kmem_free(dvcookies, sizeof (dvcookie_t) * max_cookies);
			kmem_free(dcookies, sizeof (dcookie_t) * max_dcookies);
			goto fail;
		}
		dma->dp_need_to_free_cookie = B_TRUE;
	} else {
		/* the preallocated buffer fits this size */
		cookies = (ddi_dma_cookie_t *)dma->dp_prealloc_buffer;
		bzero(cookies, sizeof (ddi_dma_cookie_t)* max_cookies);
		dma->dp_need_to_free_cookie = B_FALSE;
	}

	dma->dp_dvcookies = dvcookies;
	dma->dp_dcookies = dcookies;
	dma->dp_cookies = cookies;
	dma->dp_cookie_size = cookie_size;
	dma->dp_max_cookies = max_cookies;
	dma->dp_max_dcookies = max_dcookies;
	dma->dp_dvmax = 0;
	dma->dp_dmax = 0;
	sinfo->si_max_pages = dma->dp_max_cookies;

	return (DDI_SUCCESS);

fail:
	dma->dp_dvcookies = NULL;
	dma->dp_dcookies = NULL;
	dma->dp_cookies = NULL;
	dma->dp_cookie_size = 0;
	dma->dp_max_cookies = 0;
	dma->dp_max_dcookies = 0;
	dma->dp_dvmax = 0;
	dma->dp_dmax = 0;
	dma->dp_need_to_free_cookie = B_FALSE;
	sinfo->si_max_pages = 0;

	return (DDI_FAILURE);
}

/*ARGSUSED*/
static void
cookie_update(domain_t *domain, rootnex_dma_t *dma, paddr_t paddr,
    int64_t psize, uint64_t maxseg, size_t nocross)
{
	dvcookie_t *dvcookies = dma->dp_dvcookies;
	dcookie_t *dcookies = dma->dp_dcookies;
	ddi_dma_cookie_t *cookies = dma->dp_cookies;
	uint64_t dvmax = dma->dp_dvmax;
	uint64_t dmax = dma->dp_dmax;

	ASSERT(dvmax < dma->dp_max_cookies);
	ASSERT(dmax < dma->dp_max_dcookies);

	paddr &= IMMU_PAGEMASK;

	ASSERT(paddr);
	ASSERT(psize);
	ASSERT(maxseg);

	/*
	 * check to see if this page would put us
	 * over the max cookie size.
	 */
	if (cookies[dvmax].dmac_size + psize > maxseg) {
		dvmax++;    /* use the next dvcookie */
		dmax++;    /* also means we use the next dcookie */
		ASSERT(dvmax < dma->dp_max_cookies);
		ASSERT(dmax < dma->dp_max_dcookies);
	}

	/*
	 * check to see if this page would make us larger than
	 * the nocross boundary. If yes, create a new cookie
	 * otherwise we will fail later with vmem_xalloc()
	 * due to overconstrained alloc requests
	 * nocross == 0 implies no nocross constraint.
	 */
	if (nocross > 0) {
		ASSERT((dvcookies[dvmax].dvck_npages) * IMMU_PAGESIZE
		    <= nocross);
		if ((dvcookies[dvmax].dvck_npages + 1) * IMMU_PAGESIZE
		    > nocross) {
			dvmax++;    /* use the next dvcookie */
			dmax++;    /* also means we use the next dcookie */
			ASSERT(dvmax < dma->dp_max_cookies);
			ASSERT(dmax < dma->dp_max_dcookies);
		}
		ASSERT((dvcookies[dvmax].dvck_npages) * IMMU_PAGESIZE
		    <= nocross);
	}

	/*
	 * If the cookie is empty
	 */
	if (dvcookies[dvmax].dvck_npages == 0) {
		ASSERT(cookies[dvmax].dmac_size == 0);
		ASSERT(dvcookies[dvmax].dvck_dvma == 0);
		ASSERT(dvcookies[dvmax].dvck_npages
		    == 0);
		ASSERT(dcookies[dmax].dck_paddr == 0);
		ASSERT(dcookies[dmax].dck_npages == 0);

		dvcookies[dvmax].dvck_dvma = 0;
		dvcookies[dvmax].dvck_npages = 1;
		dcookies[dmax].dck_paddr = paddr;
		dcookies[dmax].dck_npages = 1;
		cookies[dvmax].dmac_size = psize;
	} else {
		/* Cookie not empty. Add to it */
		cookies[dma->dp_dvmax].dmac_size += psize;
		ASSERT(dvcookies[dma->dp_dvmax].dvck_dvma == 0);
		dvcookies[dma->dp_dvmax].dvck_npages++;
		ASSERT(dcookies[dmax].dck_paddr != 0);
		ASSERT(dcookies[dmax].dck_npages != 0);

		/* Check if this paddr is contiguous */
		if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) {
			dcookies[dmax].dck_npages++;
		} else {
			/* No, we need a new dcookie */
			dmax++;
			ASSERT(dcookies[dmax].dck_paddr == 0);
			ASSERT(dcookies[dmax].dck_npages == 0);
			dcookies[dmax].dck_paddr = paddr;
			dcookies[dmax].dck_npages = 1;
		}
	}

	dma->dp_dvmax = dvmax;
	dma->dp_dmax = dmax;
}

static void
cookie_finalize(ddi_dma_impl_t *hp, immu_t *immu, domain_t *domain,
    dev_info_t *rdip, immu_flags_t immu_flags)
{
	int i;
	rootnex_dma_t *dma = (rootnex_dma_t *)hp->dmai_private;
	dvcookie_t *dvcookies = dma->dp_dvcookies;
	dcookie_t *dcookies = dma->dp_dcookies;
	ddi_dma_cookie_t *cookies = dma->dp_cookies;
	uint64_t npages;
	uint64_t dvma;
	boolean_t pde_set;

	/* First calculate the total number of pages required */
	for (i = 0, npages = 0; i <= dma->dp_dvmax; i++) {
		npages += dvcookies[i].dvck_npages;
	}

	/* Now allocate dvma */
	dvma = dvma_alloc(hp, domain, npages);

	/* Now map the dvma */
	pde_set = dvma_map(immu, domain, dvma, npages, dcookies,
	    dma->dp_dmax + 1, rdip, immu_flags);

	/* Invalidate the IOTLB */
	immu_regs_iotlb_flush(immu, domain->dom_did, dvma, npages,
	    pde_set == B_TRUE ? TLB_IVA_WHOLE : TLB_IVA_LEAF, IOTLB_PSI);

	/* Now setup dvcookies and real cookie addresses */
	for (i = 0; i <= dma->dp_dvmax; i++) {
		dvcookies[i].dvck_dvma = dvma;
		cookies[i].dmac_laddress = dvma;
		ASSERT(cookies[i].dmac_size != 0);
		cookies[i].dmac_type = 0;
		dvma += (dvcookies[i].dvck_npages * IMMU_PAGESIZE);
	}

#ifdef TEST
	immu_regs_iotlb_flush(immu, domain->dom_did, 0, 0, 0, IOTLB_DSI);
#endif
}

/*
 * cookie_create()
 */
static int
cookie_create(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq,
    ddi_dma_attr_t *a, immu_t *immu, domain_t *domain, dev_info_t *rdip,
    uint_t prealloc_count, immu_flags_t immu_flags)
{
	ddi_dma_atyp_t buftype;
	uint64_t offset;
	page_t **pparray;
	uint64_t paddr;
	uint_t psize;
	uint_t size;
	uint64_t maxseg;
	caddr_t vaddr;
	uint_t pcnt;
	page_t *page;
	rootnex_sglinfo_t *sglinfo;
	ddi_dma_obj_t *dmar_object;
	rootnex_dma_t *dma;
	size_t nocross;

	dma = (rootnex_dma_t *)hp->dmai_private;
	sglinfo = &(dma->dp_sglinfo);
	dmar_object = &(dmareq->dmar_object);
	maxseg = sglinfo->si_max_cookie_size;
	pparray = dmar_object->dmao_obj.virt_obj.v_priv;
	vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
	buftype = dmar_object->dmao_type;
	size = dmar_object->dmao_size;
	nocross = (size_t)(a->dma_attr_seg + 1);

	/*
	 * Allocate cookie, dvcookie and dcookie
	 */
	if (cookie_alloc(dma, dmareq, a, prealloc_count) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}
	hp->dmai_cookie = dma->dp_cookies;

	pcnt = 0;

	/* retrieve paddr, psize, offset from dmareq */
	if (buftype == DMA_OTYP_PAGES) {
		page = dmar_object->dmao_obj.pp_obj.pp_pp;
		ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
		offset =  dmar_object->dmao_obj.pp_obj.pp_offset &
		    MMU_PAGEOFFSET;
		paddr = pfn_to_pa(page->p_pagenum) + offset;
		psize = MIN((MMU_PAGESIZE - offset), size);
		sglinfo->si_asp = NULL;
		page = page->p_next;
	} else {
		ASSERT((buftype == DMA_OTYP_VADDR) ||
		    (buftype == DMA_OTYP_BUFVADDR));
		sglinfo->si_asp = dmar_object->dmao_obj.virt_obj.v_as;
		if (sglinfo->si_asp == NULL) {
			sglinfo->si_asp = &kas;
		}
		offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
		if (pparray != NULL) {
			ASSERT(!PP_ISFREE(pparray[pcnt]));
			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
			psize = MIN((MMU_PAGESIZE - offset), size);
			pcnt++;
		} else {
			paddr = pfn_to_pa(hat_getpfnum(sglinfo->si_asp->a_hat,
			    vaddr)) + offset;
			psize = MIN(size, (MMU_PAGESIZE - offset));
			vaddr += psize;
		}
	}

	/* save the iommu page offset */
	sglinfo->si_buf_offset = offset & IMMU_PAGEOFFSET;

	/*
	 * setup dvcookie and dcookie for [paddr, paddr+psize)
	 */
	cookie_update(domain, dma, paddr, psize, maxseg, nocross);

	size -= psize;
	while (size > 0) {
		/* get the size for this page (i.e. partial or full page) */
		psize = MIN(size, MMU_PAGESIZE);
		if (buftype == DMA_OTYP_PAGES) {
			/* get the paddr from the page_t */
			ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
			paddr = pfn_to_pa(page->p_pagenum);
			page = page->p_next;
		} else if (pparray != NULL) {
			/* index into the array of page_t's to get the paddr */
			ASSERT(!PP_ISFREE(pparray[pcnt]));
			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
			pcnt++;
		} else {
			/* call into the VM to get the paddr */
			paddr = pfn_to_pa(hat_getpfnum
			    (sglinfo->si_asp->a_hat, vaddr));
			vaddr += psize;
		}
		/*
		 * set dvcookie and dcookie for [paddr, paddr+psize)
		 */
		cookie_update(domain, dma, paddr, psize, maxseg, nocross);
		size -= psize;
	}

	cookie_finalize(hp, immu, domain, rdip, immu_flags);

	/* take account in the offset into the first page */
	dma->dp_cookies[0].dmac_laddress += sglinfo->si_buf_offset;

	/* save away how many cookies we have */
	sglinfo->si_sgl_size = dma->dp_dvmax + 1;

	return (DDI_SUCCESS);
}

/* ############################# Functions exported ######################## */

/*
 * setup the DVMA subsystem
 * this code runs only for the first IOMMU unit
 */
void
immu_dvma_setup(list_t *listp)
{
	immu_t *immu;
	uint_t kval;
	size_t nchains;

	/* locks */
	mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Create lists */
	list_create(&immu_unity_domain_list, sizeof (domain_t),
	    offsetof(domain_t, dom_maptype_node));
	list_create(&immu_xlate_domain_list, sizeof (domain_t),
	    offsetof(domain_t, dom_maptype_node));

	/* Setup BDF domain hash */
	nchains = 0xff;
	kval = mod_hash_iddata_gen(nchains);

	bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH",
	    nchains, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp,
	    KM_NOSLEEP);
	ASSERT(bdf_domain_hash);

	immu = list_head(listp);
	for (; immu; immu = list_next(listp, immu)) {
		create_unity_domain(immu);
		did_init(immu);
		context_init(immu);
		immu->immu_dvma_setup = B_TRUE;
	}
}

/*
 * Startup up one DVMA unit
 */
void
immu_dvma_startup(immu_t *immu)
{
	ASSERT(immu);
	ASSERT(immu->immu_dvma_running == B_FALSE);

	if (immu_gfxdvma_enable == B_FALSE &&
	    immu->immu_dvma_gfx_only == B_TRUE) {
		return;
	}

	/*
	 * DVMA will start once IOMMU is "running"
	 */
	ASSERT(immu->immu_dvma_running == B_FALSE);
	immu->immu_dvma_running = B_TRUE;
}

/*
 * immu_dvma_physmem_update()
 *       called when the installed memory on a
 *       system increases, to expand domain DVMA
 *       for domains with UNITY mapping
 */
void
immu_dvma_physmem_update(uint64_t addr, uint64_t size)
{
	uint64_t start;
	uint64_t npages;
	int dcount;
	dcookie_t dcookies[1] = {0};
	domain_t *domain;

	/*
	 * Just walk the system-wide list of domains with
	 * UNITY mapping. Both the list of *all* domains
	 * and *UNITY* domains is protected by the same
	 * single lock
	 */
	mutex_enter(&immu_domain_lock);
	domain = list_head(&immu_unity_domain_list);
	for (; domain; domain = list_next(&immu_unity_domain_list, domain)) {

		/* There is no vmem_arena for unity domains. Just map it */
		ddi_err(DER_LOG, NULL, "IMMU: unity-domain: Adding map "
		    "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size);

		start = IMMU_ROUNDOWN(addr);
		npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1;

		dcookies[0].dck_paddr = start;
		dcookies[0].dck_npages = npages;
		dcount = 1;
		(void) dvma_map(domain->dom_immu, domain, start, npages,
		    dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);

	}
	mutex_exit(&immu_domain_lock);
}


int
immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng,
    uint_t prealloc_count, dev_info_t *rdip, immu_flags_t immu_flags)
{
	ddi_dma_attr_t *attr;
	dev_info_t *ddip;
	domain_t *domain;
	immu_t *immu;
	dcookie_t dcookies[1] = {0};
	int dcount = 0;
	boolean_t pde_set = B_TRUE;
	int r = DDI_FAILURE;

	ASSERT(immu_enable == B_TRUE);
	ASSERT(immu_running == B_TRUE || !(immu_flags & IMMU_FLAGS_DMAHDL));
	ASSERT(hp || !(immu_flags & IMMU_FLAGS_DMAHDL));

	/*
	 * Intel IOMMU will only be turned on if IOMMU
	 * page size is a multiple of IOMMU page size
	 */

	/*LINTED*/
	ASSERT(MMU_PAGESIZE % IMMU_PAGESIZE == 0);

	/* Can only do DVMA if dip is attached */
	if (rdip == NULL) {
		ddi_err(DER_PANIC, rdip, "DVMA map: No device specified");
		/*NOTREACHED*/
	}

	immu_flags |= dma_to_immu_flags(dmareq);


	immu = immu_dvma_get_immu(rdip, immu_flags);
	if (immu == NULL) {
		/*
		 * possible that there is no IOMMU unit for this device
		 * - BIOS bugs are one example.
		 */
		ddi_err(DER_WARN, rdip, "No IMMU unit found for device");
		return (DDI_DMA_NORESOURCES);
	}


	/*
	 * redirect isa devices attached under lpc to lpc dip
	 */
	if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
		rdip = get_lpc_devinfo(immu, rdip, immu_flags);
		if (rdip == NULL) {
			ddi_err(DER_PANIC, rdip, "IMMU redirect failed");
			/*NOTREACHED*/
		}
	}

	/* Reset immu, as redirection can change IMMU */
	immu = NULL;

	/*
	 * for gart, redirect to the real graphic devinfo
	 */
	if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
		rdip = get_gfx_devinfo(rdip);
		if (rdip == NULL) {
			ddi_err(DER_PANIC, rdip, "IMMU redirect failed");
			/*NOTREACHED*/
		}
	}

	/*
	 * Setup DVMA domain for the device. This does
	 * work only the first time we do DVMA for a
	 * device.
	 */
	ddip = NULL;
	domain = device_domain(rdip, &ddip, immu_flags);
	if (domain == NULL) {
		ASSERT(ddip == NULL);
		ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device");
		return (DDI_DMA_NORESOURCES);
	}

	/*
	 * If a domain is found, we must also have a domain dip
	 * which is the topmost ancestor dip of rdip that shares
	 * the same domain with rdip.
	 */
	if (domain->dom_did == 0 || ddip == NULL) {
		ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)",
		    domain->dom_did, ddip);
		return (DDI_DMA_NORESOURCES);
	}

	immu = domain->dom_immu;
	ASSERT(immu);
	if (domain->dom_did == IMMU_UNITY_DID) {
		ASSERT(domain == immu->immu_unity_domain);
		/* mapping already done. Let rootnex create cookies */
		r = DDI_DMA_USE_PHYSICAL;
	} else  if (immu_flags & IMMU_FLAGS_DMAHDL) {

		/* if we have a DMA handle, the IOMMUs must be running */
		ASSERT(immu->immu_regs_running == B_TRUE);
		ASSERT(immu->immu_dvma_running == B_TRUE);

		attr = &hp->dmai_attr;
		if (attr == NULL) {
			ddi_err(DER_PANIC, rdip,
			    "DMA handle (%p): NULL attr", hp);
			/*NOTREACHED*/
		}

		if (cookie_create(hp, dmareq, attr, immu, domain, rdip,
		    prealloc_count, immu_flags) != DDI_SUCCESS) {
			ddi_err(DER_MODE, rdip, "dvcookie_alloc: failed");
			return (DDI_DMA_NORESOURCES);
		}
		r = DDI_DMA_MAPPED;
	} else if (immu_flags & IMMU_FLAGS_MEMRNG) {
		dcookies[0].dck_paddr = mrng->mrng_start;
		dcookies[0].dck_npages = mrng->mrng_npages;
		dcount = 1;
		pde_set = dvma_map(immu, domain, mrng->mrng_start,
		    mrng->mrng_npages, dcookies, dcount, rdip, immu_flags);
		immu_regs_iotlb_flush(immu, domain->dom_did, mrng->mrng_start,
		    mrng->mrng_npages, pde_set == B_TRUE ?
		    TLB_IVA_WHOLE : TLB_IVA_LEAF, IOTLB_PSI);
		r = DDI_DMA_MAPPED;
	} else {
		ddi_err(DER_PANIC, rdip, "invalid flags for immu_dvma_map()");
		/*NOTREACHED*/
	}

	/*
	 * Update the root and context entries
	 */
	if (immu_context_update(immu, domain, ddip, rdip, immu_flags)
	    != DDI_SUCCESS) {
		ddi_err(DER_MODE, rdip, "DVMA map: context update failed");
		return (DDI_DMA_NORESOURCES);
	}

	immu_regs_wbf_flush(immu);

	return (r);
}

int
immu_dvma_unmap(ddi_dma_impl_t *hp, dev_info_t *rdip)
{
	ddi_dma_attr_t *attr;
	rootnex_dma_t *dma;
	domain_t *domain;
	immu_t *immu;
	dev_info_t *ddip;
	immu_flags_t immu_flags;

	ASSERT(immu_enable == B_TRUE);
	ASSERT(immu_running == B_TRUE);
	ASSERT(hp);

	/*
	 * Intel IOMMU will only be turned on if IOMMU
	 * page size is same as MMU page size
	 */
	/*LINTED*/
	ASSERT(MMU_PAGESIZE == IMMU_PAGESIZE);

	/* rdip need not be attached */
	if (rdip == NULL) {
		ddi_err(DER_PANIC, rdip, "DVMA unmap: No device specified");
		return (DDI_DMA_NORESOURCES);
	}

	/*
	 * Get the device domain, this should always
	 * succeed since there had to be a domain to
	 * setup DVMA.
	 */
	dma = (rootnex_dma_t *)hp->dmai_private;
	attr = &hp->dmai_attr;
	if (attr == NULL) {
		ddi_err(DER_PANIC, rdip, "DMA handle (%p) has NULL attr", hp);
		/*NOTREACHED*/
	}
	immu_flags = dma->dp_sleep_flags;

	immu = immu_dvma_get_immu(rdip, immu_flags);
	if (immu == NULL) {
		/*
		 * possible that there is no IOMMU unit for this device
		 * - BIOS bugs are one example.
		 */
		ddi_err(DER_WARN, rdip, "No IMMU unit found for device");
		return (DDI_DMA_NORESOURCES);
	}


	/*
	 * redirect isa devices attached under lpc to lpc dip
	 */
	if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
		rdip = get_lpc_devinfo(immu, rdip, immu_flags);
		if (rdip == NULL) {
			ddi_err(DER_PANIC, rdip, "IMMU redirect failed");
			/*NOTREACHED*/
		}
	}

	/* Reset immu, as redirection can change IMMU */
	immu = NULL;

	/*
	 * for gart, redirect to the real graphic devinfo
	 */
	if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
		rdip = get_gfx_devinfo(rdip);
		if (rdip == NULL) {
			ddi_err(DER_PANIC, rdip, "IMMU redirect failed");
			/*NOTREACHED*/
		}
	}

	ddip = NULL;
	domain = device_domain(rdip, &ddip, immu_flags);
	if (domain == NULL || domain->dom_did == 0 || ddip == NULL) {
		ddi_err(DER_MODE, rdip, "Attempt to unmap DVMA for "
		    "a device without domain or with an uninitialized "
		    "domain");
		return (DDI_DMA_NORESOURCES);
	}

	/*
	 * immu must be set in the domain.
	 */
	immu = domain->dom_immu;
	ASSERT(immu);
	if (domain->dom_did == IMMU_UNITY_DID) {
		ASSERT(domain == immu->immu_unity_domain);
		/*
		 * domain is unity, nothing to do here, let the rootnex
		 * code free the cookies.
		 */
		return (DDI_DMA_USE_PHYSICAL);
	}

	dma = hp->dmai_private;
	if (dma == NULL) {
		ddi_err(DER_PANIC, rdip, "DVMA unmap: DMA handle (%p) has "
		    "no private dma structure", hp);
		/*NOTREACHED*/
	}

	cookie_free(dma, immu, domain, rdip);

	/* No invalidation needed for unmap */
	immu_regs_wbf_flush(immu);

	return (DDI_SUCCESS);
}

immu_devi_t *
immu_devi_get(dev_info_t *rdip)
{
	immu_devi_t *immu_devi;
	volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu);

	/* Just want atomic reads. No need for lock */
	immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr,
	    0);
	return (immu_devi);
}