/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/file.h>
#include <sys/ddi.h>
#include <sys/nsctl/nsctl.h>
#include <sys/unistat/spcs_s.h>
#include <sys/unistat/spcs_errors.h>

#include <sys/unistat/spcs_s_k.h>
#include "dsw.h"
#include "dsw_dev.h"

#ifdef DS_DDICT
#include "../contract.h"
#endif

#include <sys/sdt.h>		/* dtrace is S10 or later */

/*
 * Instant Image.
 *
 * This file contains the chunk map lookup functions of II.
 *
 */
#define	CHUNK_FBA(chunk) DSW_CHK2FBA(chunk)

extern int ii_debug;	/* debug level switch */
int ii_map_debug = 0;

#ifdef II_MULTIMULTI_TERABYTE
typedef	int64_t	nodeid_t;
typedef	int32_t	nodeid32_t;
#else
typedef	int32_t	nodeid_t;
#endif

typedef struct	ii_node {
	chunkid_t	vchunk_id;		/* virtual chunk id */
} NODE;

typedef struct ii_nodelink_s {
	chunkid_t	next_chunk;
} ii_nodelink_t;

static	int	nodes_per_fba = FBA_SIZE(1) / sizeof (NODE);

ii_header_t *_ii_bm_header_get(_ii_info_t *ip, nsc_buf_t **tmp);
int _ii_bm_header_put(ii_header_t *hdr, _ii_info_t *ip,
    nsc_buf_t *tmp);
void _ii_rlse_devs(_ii_info_t *, int);
int _ii_rsrv_devs(_ii_info_t *, int, int);
void _ii_error(_ii_info_t *, int);
/*
 * Private functions for use in this file.
 */
static void free_node(_ii_info_t *ip, NODE *np, nodeid_t ni);
static chunkid_t ii_alloc_overflow(_ii_info_t *ip);
void ii_free_overflow(_ii_info_t *, chunkid_t);
extern int _ii_nsc_io(_ii_info_t *, int, nsc_fd_t *, int, nsc_off_t,
    unsigned char *, nsc_size_t);

static int
update_tree_header(_ii_info_t *ip)
{
	ii_header_t *header;
	nsc_buf_t	*tmp = NULL;

	mutex_enter(&ip->bi_mutex);
	header = _ii_bm_header_get(ip, &tmp);
	if (header == NULL) {
		/* bitmap is probably offline */
		mutex_exit(&ip->bi_mutex);
		DTRACE_PROBE(_iit_update_tree_header_end);
		return (1);
	}
	header->ii_mstchks = ip->bi_mstchks;
	header->ii_shdchks = ip->bi_shdchks;
	header->ii_shdchkused = ip->bi_shdchkused;
	header->ii_shdfchk = ip->bi_shdfchk;
	(void) _ii_bm_header_put(header, ip, tmp);
	mutex_exit(&ip->bi_mutex);

	return (0);
}

static int
update_overflow_header(_ii_info_t *ip, _ii_overflow_t *op)
{
	(void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF,
	    II_OHEADER_FBA, (unsigned char *)&(op->ii_do),
	    sizeof (_ii_doverflow_t));

	return (0);
}

static int
node_io(_ii_info_t *ip, NODE *np, nodeid_t node, int flag)
{
	int	rc;
	int	node_fba;
	int	tree_fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba);
	int	offset;
	nsc_buf_t *tmp = NULL;

	/*
	 * Don't use _ii_nsc_io() as _ii_nsc_io() requires io to start at
	 * an fba boundary.
	 */

	/* calculate location of node on bitmap file */
	offset = (node % nodes_per_fba) * sizeof (NODE);
	node_fba = tree_fba + node / nodes_per_fba;

	/* read disk block containing node */
	rc = nsc_alloc_buf(ip->bi_bmpfd, node_fba, 1, NSC_RDBUF|flag, &tmp);
	if (!II_SUCCESS(rc)) {
		_ii_error(ip, DSW_BMPOFFLINE);
		if (tmp)
			(void) nsc_free_buf(tmp);

		DTRACE_PROBE(_iit_node_io_end);
		return (1);
	}

	/* copy node and update bitmap file if needed */
	rc = 0;
	if (flag == NSC_RDBUF)
		bcopy(tmp->sb_vec->sv_addr+offset, np, sizeof (NODE));
	else {
		bcopy(np, tmp->sb_vec->sv_addr+offset, sizeof (NODE));
		II_NSC_WRITE(ip, bitmap, rc, tmp, node_fba, 1, 0);
		if (!II_SUCCESS(rc)) {
			_ii_error(ip, DSW_BMPOFFLINE);
			rc = EIO;
		}
	}
	if (tmp)
		(void) nsc_free_buf(tmp);

	return (0);
}

static int
node_fba_fill(_ii_info_t *ip, nsc_size_t nchunks, chunkid_t vchunk_id)
{
	int	rc;
	nsc_off_t	fba;
	nsc_size_t	fbas;
	nsc_size_t	maxfbas;
	nsc_buf_t *bp;
	nsc_vec_t *vp;

	/* Determine maximum number of FBAs to allocate */
	rc =  nsc_maxfbas(ip->bi_bmpfd, 0, &maxfbas);
	if (!II_SUCCESS(rc))
		maxfbas = DSW_CBLK_FBA;

	/* Write out blocks of initialied NODEs */
	fba = ip->bi_copyfba + (ip->bi_copyfba-ip->bi_shdfba);
	fbas = FBA_LEN(nchunks * sizeof (NODE));
	while (fbas > 0) {

		/* Determine number of FBA to allocate this time */
		if (fbas < maxfbas) maxfbas = fbas;

		/* Allocate buffer which map to FBAs containing NODEs */
		bp = NULL;
		rc = nsc_alloc_buf(ip->bi_bmpfd, fba, maxfbas, NSC_WRBUF, &bp);
		if (!II_SUCCESS(rc)) {
			_ii_error(ip, DSW_BMPOFFLINE);
			DTRACE_PROBE(alloc_buf_failed);
			return (EIO);
		}

		/* traverse vector list, filling wth initialized NODEs */
		for (vp = bp->sb_vec; vp->sv_addr && vp->sv_len; vp++) {
			NODE *pnode = (NODE *)vp->sv_addr;
			NODE *enode = (NODE *)(vp->sv_addr +  vp->sv_len);
			while (pnode < enode) {
				pnode->vchunk_id = vchunk_id;
				pnode++;
			}
		}

		/* write FBAs containing initialized NODEs */
		II_NSC_WRITE(ip, bitmap, rc, bp, fba, maxfbas, 0);
		if (!II_SUCCESS(rc)) {
			_ii_error(ip, DSW_BMPOFFLINE);
			(void) nsc_free_buf(bp);
			DTRACE_PROBE(write_failed);
			return (EIO);
		}

		/* free the buffer */
		(void) nsc_free_buf(bp);

		/* Adjust nsc buffer values */
		fba += maxfbas;
		fbas -= maxfbas;
	}

	return (0);
}

/*
 * Reads the node into core and returns a pointer to it.
 */

static NODE *
read_node(_ii_info_t *ip, nodeid_t node)
{
	NODE *new;

	new = (NODE *)kmem_alloc(sizeof (NODE), KM_SLEEP);

	if (node_io(ip, new, node, NSC_RDBUF)) {
		kmem_free(new, sizeof (NODE));
		new = NULL;
	}

	return (new);
}


static chunkid_t
alloc_chunk(_ii_info_t *ip)
{
	ii_nodelink_t nl;
	int fba;
	chunkid_t rc = II_NULLCHUNK;

	mutex_enter(&ip->bi_chksmutex);
	if (ip->bi_shdchkused < ip->bi_shdchks) {
		rc = ip->bi_shdchkused++;
	} else if (ip->bi_shdfchk != II_NULLCHUNK) {
		ASSERT(ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks);
		rc = ip->bi_shdfchk;
		fba = CHUNK_FBA(rc);
		(void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL);
		(void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_RDBUF, fba,
		    (unsigned char *)&nl, sizeof (nl));
		_ii_rlse_devs(ip, SHDR);
		ip->bi_shdfchk = nl.next_chunk;
		ASSERT(ip->bi_shdfchk == II_NULLCHUNK ||
		    (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks));
	} else {

		/* into overflow */
		rc = ii_alloc_overflow(ip);
	}
	mutex_exit(&ip->bi_chksmutex);
	(void) update_tree_header(ip);

	return (rc);
}

/*
 * releases memory for node
 */
static void	/*ARGSUSED*/
release_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
{
	kmem_free(np, sizeof (NODE));

}

static void
write_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
{
	(void) node_io(ip, np, ni, NSC_WRBUF);
	release_node(ip, np, ni);

}

static void
free_node(_ii_info_t *ip, NODE *np, nodeid_t ni)
{
	ii_nodelink_t nl;
	int	fba;

	if (np == NULL) {
		DTRACE_PROBE(_iit_free_node_end);
		return;
	}

	mutex_enter(&ip->bi_chksmutex);
	if (II_ISOVERFLOW(np->vchunk_id)) {
		/* link chunk onto overflow free list */
		ii_free_overflow(ip, np->vchunk_id);
	} else {
		/* write old free list head into chunk */
		nl.next_chunk = ip->bi_shdfchk;
		ip->bi_shdfchk = np->vchunk_id;
		ASSERT(ip->bi_shdfchk == II_NULLCHUNK ||
		    (ip->bi_shdfchk >= 0 && ip->bi_shdfchk < ip->bi_shdchks));
		fba = CHUNK_FBA(np->vchunk_id);
		(void) _ii_rsrv_devs(ip, SHDR, II_INTERNAL);
		(void) _ii_nsc_io(ip, KS_SHD, SHDFD(ip), NSC_WRBUF, fba,
		    (unsigned char *)&nl, sizeof (nl));
		_ii_rlse_devs(ip, SHDR);
		/* update free counts */
		/* ip->bi_unused++; */
	}
	np->vchunk_id = II_NULLCHUNK;
	(void) node_io(ip, np, ni, NSC_WRBUF);
	(void) update_tree_header(ip);
	mutex_exit(&ip->bi_chksmutex);

}

/*
 * Public functions for dsw_dev to use.
 */

/*
 * Overflow volume functions.
 */

/* put overflow chunk on the overflow volume free list */
void
ii_free_overflow(_ii_info_t *ip, chunkid_t chunk)
{
	ii_nodelink_t nl;
	_ii_overflow_t *op;
	int fba;

	if (!II_ISOVERFLOW(chunk)) {
		DTRACE_PROBE(_iit_free_overflow_end_1);
		return;
	}
	chunk = II_2OVERFLOW(chunk);

	op = ip->bi_overflow;
	if (op == NULL) {
#ifdef DEBUG
		cmn_err(CE_PANIC, "overflow used, but not attached ip %p",
		    (void *) ip);
#endif
		DTRACE_PROBE(_iit_free_overflow_end_2);
		return;
	}
	mutex_enter(&(op->ii_mutex));

	DTRACE_PROBE(_iit_free_overflow);

	/* write old free list head into chunk */
	nl.next_chunk = op->ii_freehead;
	fba = CHUNK_FBA(chunk);
	(void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI);
	(void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_WRBUF, fba,
	    (unsigned char *)&nl, sizeof (nl));
	/* update free counts */
	op->ii_unused++;
	ASSERT(op->ii_used > 0);		/* always use 1 for header */

	/* write chunk id into header freelist start */
	op->ii_freehead =  chunk;

	(void) update_overflow_header(ip, op);
	nsc_release(op->ii_dev->bi_fd);
	mutex_exit(&(op->ii_mutex));

}

/* reclaim any overflow storage used by the volume */
void
ii_reclaim_overflow(_ii_info_t *ip)
{
	NODE	*node;
	nodeid_t node_id;
	_ii_overflow_t *op;

	if ((ip->bi_flags & (DSW_VOVERFLOW | DSW_FRECLAIM)) == 0) {
		DTRACE_PROBE(_iit_reclaim_overflow_end);
		return;
	}

	/*
	 * Determine whether overflow should be reclaimed:
	 * 1/ If we're not doing a group volume update
	 * OR
	 * 2/ If the number of detaches != number of attached vols
	 */
	op = ip->bi_overflow;
	if (op && (((op->ii_flags & IIO_VOL_UPDATE) == 0) ||
	    (op->ii_detachcnt != op->ii_drefcnt))) {
#ifndef II_MULTIMULTI_TERABYTE
		/* assert volume size fits into node_id */
		ASSERT(ip->bi_mstchks <= INT32_MAX);
#endif
		for (node_id = 0; node_id < ip->bi_mstchks; node_id++) {
			if ((node = read_node(ip, node_id)) == NULL) {
				DTRACE_PROBE(_iit_reclaim_overflow_end);
				return;
			}
			ii_free_overflow(ip, node->vchunk_id);
			release_node(ip, node, node_id);
		}
	} else {
		/* need to reset the overflow volume header */
		op->ii_freehead = II_NULLNODE;
		op->ii_used = 1;		/* we have used the header */
		op->ii_unused = op->ii_nchunks - op->ii_used;
		(void) update_overflow_header(ip, op);
	}

	DTRACE_PROBE(_iit_reclaim_overflow);

	if ((ip->bi_flags & DSW_VOVERFLOW) == DSW_VOVERFLOW) {
		mutex_enter(&ip->bi_mutex);
		II_FLAG_CLR(DSW_VOVERFLOW, ip);
		mutex_exit(&ip->bi_mutex);
	}
	--iigkstat.spilled_over.value.ul;

}

static chunkid_t
ii_alloc_overflow(_ii_info_t *ip)
{
	chunkid_t chunk;
	ii_nodelink_t nl;
	_ii_overflow_t *op;
	int fba;

	if ((op = ip->bi_overflow) == NULL) {
		DTRACE_PROBE(_iit_alloc_overflow_end);
		return (II_NULLCHUNK);	/* no overflow volume attached */
	}

	mutex_enter(&(op->ii_mutex));

	DTRACE_PROBE(_iit_alloc_overflow);

	if (op->ii_unused < 1) {
		mutex_exit(&(op->ii_mutex));
		DTRACE_PROBE(_iit_alloc_overflow_end);
		return (II_NULLCHUNK);
	}
	(void) nsc_reserve(op->ii_dev->bi_fd, NSC_MULTI);
	if (op->ii_freehead != II_NULLCHUNK) {
		/* pick first from free list */
		chunk = op->ii_freehead;
		fba = CHUNK_FBA(chunk);
		(void) _ii_nsc_io(ip, KS_OVR, op->ii_dev->bi_fd, NSC_RDBUF, fba,
		    (unsigned char *)&nl, sizeof (nl));
		op->ii_freehead = nl.next_chunk;
		/* decrease unused count, fix bug 4419956 */
		op->ii_unused--;
	} else {
		/* otherwise pick first unused */
		if (op->ii_used > op->ii_nchunks)
			chunk = II_NULLCHUNK;
		else {
			chunk = op->ii_used++;
			op->ii_unused--;
		}
	}
	if (chunk != II_NULLCHUNK) {
		chunk = II_2OVERFLOW(chunk);
		if ((ip->bi_flags&DSW_VOVERFLOW) == 0) {
			mutex_enter(&ip->bi_mutex);
			II_FLAG_SET(DSW_VOVERFLOW, ip);
			mutex_exit(&ip->bi_mutex);
			++iigkstat.spilled_over.value.ul;
		}
	}
	(void) update_overflow_header(ip, op);
	nsc_release(op->ii_dev->bi_fd);
	mutex_exit(&(op->ii_mutex));

	return (chunk);
}
/*
 * Find or insert key into search tree.
 */

chunkid_t
ii_tsearch(_ii_info_t *ip, chunkid_t chunk_id)
			/* Address of the root of the tree */
{
	NODE	*rootp = NULL;
	chunkid_t n;	/* New node id if key not found */

	if ((rootp = read_node(ip, chunk_id)) == NULL) {
		DTRACE_PROBE(_iit_tsearch_end);
		return (II_NULLNODE);
	}
	n = rootp->vchunk_id;
	if (n != II_NULLCHUNK) { /* chunk allocated, return location */
		release_node(ip, rootp, 0);
		DTRACE_PROBE(_iit_tsearch_end);
		return (n);
	}
	n = alloc_chunk(ip);
	if (n != II_NULLCHUNK) {
		rootp->vchunk_id = n;
		write_node(ip, rootp, chunk_id);
	} else
		release_node(ip, rootp, 0);

	return (n);
}

/* Delete node with key chunkid */
void
ii_tdelete(_ii_info_t *ip,
	chunkid_t chunkid)	/* Key to be deleted */
{
	NODE *np = NULL;

	if ((np = read_node(ip, chunkid)) == NULL) {
		DTRACE_PROBE(_iit_tdelete_end);
		return;
	}

	ASSERT(np->vchunk_id != II_NULLCHUNK);
	free_node(ip, np, chunkid);
	np->vchunk_id = II_NULLCHUNK;
	write_node(ip, np, chunkid);

}

/*
 * initialise an empty map for ip
 */

int
ii_tinit(_ii_info_t *ip)
{
	int rc = 0;

	/* overflow can't be attached before first call to this function */
	if (ip->bi_overflow)
		ii_reclaim_overflow(ip);

	mutex_enter(&ip->bi_chksmutex);
	ip->bi_shdfchk = II_NULLCHUNK;	/* set freelist to empty chain */
	ip->bi_shdchkused = 0;

	/* fill index (bi_mstchks size) with II_NULLCHUNK */
	rc = node_fba_fill(ip, ip->bi_mstchks, II_NULLCHUNK);
	if (rc == 0)
		rc = update_tree_header(ip);
	mutex_exit(&ip->bi_chksmutex);

	return (rc);
}

/*
 * Calculate the size of map space provided by a bitmap volume with
 * tree_len fba's spare for the tree.
 */

nsc_size_t
ii_btsize(nsc_size_t tree_len)
{
	nsc_size_t nchunks;

	nchunks = tree_len * nodes_per_fba;

	if (ii_debug > 1)
		cmn_err(CE_NOTE,
		    "!ii_btsize: bitmap with %" NSC_SZFMT
		    " spare fba's will map %" NSC_SZFMT " chunks",
		    tree_len, nchunks);

	return (nchunks);
}