/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Functions to implement IP address -> link-layer address resolution
 * (PSARC 2006/482).
 */
#include <inet/ip2mac.h>
#include <inet/ip2mac_impl.h>
#include <sys/zone.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip6.h>

/*
 * Run every callback queued on the ncec that has not been dispatched yet,
 * passing each one the resolution result built from the ncec's current
 * state. Entries are only marked NCE_CB_DISPATCHED here; they are removed
 * and freed by ncec_cb_refrele() when the last walker leaves.
 */
void
ncec_cb_dispatch(ncec_t *ncec)
{
	ip2mac_t result;
	ncec_cb_t *cbp;

	mutex_enter(&ncec->ncec_lock);
	if (!list_is_empty(&ncec->ncec_cb)) {
		/* Build the answer once, while ncec_lock is still held. */
		ncec_ip2mac_response(&result, ncec);
		/* Count ourselves as a walker of the callback list. */
		ncec_cb_refhold_locked(ncec);
		/*
		 * IP does not hold internal locks like nce_lock across
		 * calls to other subsystems for fear of recursive lock
		 * entry and lock hierarchy violation. The caller may be
		 * holding locks across the call to IP. So ncec_lock is
		 * dropped around each callback invocation and retaken
		 * before the list is touched again.
		 */
		cbp = list_head(&ncec->ncec_cb);
		while (cbp != NULL) {
			if ((cbp->ncec_cb_flags & NCE_CB_DISPATCHED) == 0) {
				cbp->ncec_cb_flags |= NCE_CB_DISPATCHED;
				mutex_exit(&ncec->ncec_lock);
				(*cbp->ncec_cb_func)(&result,
				    cbp->ncec_cb_arg);
				mutex_enter(&ncec->ncec_lock);
			}
			cbp = list_next(&ncec->ncec_cb, cbp);
		}
		ncec_cb_refrele(ncec);
	}
	mutex_exit(&ncec->ncec_lock);
}

/*
 * Fill in the ip2mac response from the information held in the ncec.
 *
 * The response is zeroed first, then the protocol address is always set.
 * ip2mac_err is 0 and the link-layer address is filled in only when the
 * ncec is reachable and not condemned; otherwise ip2mac_err is ESRCH and
 * the link-layer address is left zeroed. The caller must hold ncec_lock.
 */
void
ncec_ip2mac_response(ip2mac_t *ip2m, ncec_t *ncec)
{
	sin_t	*v4;
	sin6_t	*v6;
	struct sockaddr_dl *hwaddr;

	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	bzero(ip2m, sizeof (*ip2m));

	if (ncec->ncec_ipversion == IPV6_VERSION) {
		v6 = (sin6_t *)&ip2m->ip2mac_pa;
		v6->sin6_family = AF_INET6;
		v6->sin6_addr = ncec->ncec_addr;
	} else {
		/* ncec_addr is copied out via the v4-mapped macro */
		v4 = (sin_t *)&ip2m->ip2mac_pa;
		v4->sin_family = AF_INET;
		IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &v4->sin_addr);
	}

	if (!NCE_ISREACHABLE(ncec) || NCE_ISCONDEMNED(ncec)) {
		ip2m->ip2mac_err = ESRCH;
		return;
	}

	ip2m->ip2mac_err = 0;
	hwaddr = &ip2m->ip2mac_ha;
	hwaddr->sdl_family = AF_LINK;
	hwaddr->sdl_type = ncec->ncec_ill->ill_type;
	/*
	 * should we put ncec_ill->ill_name in there? why?
	 * likewise for the sdl_index
	 */
	hwaddr->sdl_nlen = 0;
	hwaddr->sdl_alen = ncec->ncec_ill->ill_phys_addr_length;
	if (ncec->ncec_lladdr != NULL)
		bcopy(ncec->ncec_lladdr, LLADDR(hwaddr), hwaddr->sdl_alen);
}

/*
 * Register one more walker of the ncec's callback list. The caller must
 * already hold ncec_lock.
 */
void
ncec_cb_refhold_locked(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	ncec->ncec_cb_walker_cnt += 1;
}

/*
 * Drop one walker reference on the ncec's callback list. When the last
 * walker leaves, every entry marked NCE_CB_DISPATCHED is unlinked and
 * freed; entries not yet dispatched stay queued. The caller must hold
 * ncec_lock.
 */
void
ncec_cb_refrele(ncec_t *ncec)
{
	ncec_cb_t *cur, *following;

	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	ncec->ncec_cb_walker_cnt--;
	if (ncec->ncec_cb_walker_cnt != 0)
		return;

	cur = list_head(&ncec->ncec_cb);
	while (cur != NULL) {
		/* fetch the successor first: cur may be freed below */
		following = list_next(&ncec->ncec_cb, cur);
		if (cur->ncec_cb_flags & NCE_CB_DISPATCHED) {
			list_remove(&ncec->ncec_cb, cur);
			kmem_free(cur, sizeof (*cur));
		}
		cur = following;
	}
}

/*
 * add a callback to the nce, so that the callback can be invoked
 * after address resolution succeeds/fails.
 *
 * The caller must hold ncec_lock. Returns the address of the newly
 * queued ncec_cb_t, which is the id to be handed to ip2mac_cancel(),
 * or NULL if the KM_NOSLEEP allocation fails.
 */
static ip2mac_id_t
ncec_add_cb(ncec_t *ncec, ip2mac_callback_t *cb, void *cbarg)
{
	ncec_cb_t	*nce_cb;

	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	nce_cb = kmem_zalloc(sizeof (*nce_cb), KM_NOSLEEP);
	if (nce_cb == NULL)
		return (NULL);
	nce_cb->ncec_cb_func = cb;
	nce_cb->ncec_cb_arg = cbarg;
	/*
	 * We identify the ncec_cb_t during cancellation by the address
	 * of the nce_cb_t itself, and, as a short-cut for eliminating
	 * clear mismatches, only look in the callback list of ncec's
	 * whose address is equal to the nce_cb_id.
	 */
	nce_cb->ncec_cb_id = ncec; /* no refs! just an address */
	list_insert_tail(&ncec->ncec_cb, nce_cb);

	/* the ncec_cb_t address (not the ncec) is the cancellation id */
	return (nce_cb);
}

/*
 * Resolve an IP address to a link-layer address using the data-structures
 * defined in PSARC 2006/482. If the current link-layer address for the
 * IP address is not known, the state-machine for resolving the address
 * will be triggered, and the callback function (*cb) will be invoked after
 * the resolution completes.
 *
 * op		IP2MAC_LOOKUP only consults the existing cache entry; any
 *		other value looks up or creates the entry and may start
 *		resolution.
 * ip2m		in: ip2mac_pa (address to resolve) and ip2mac_ifindex;
 *		out: ip2mac_err, and on success the link-layer address.
 * cb, cbarg	callback and argument queued when resolution is pending.
 * zoneid	used to locate the netstack.
 *
 * The result is reported through ip2m->ip2mac_err:
 *	0		the answer has been filled into *ip2m
 *	EINPROGRESS	resolution is pending and (*cb) has been queued; the
 *			returned id can be passed to ip2mac_cancel()
 *	ENOMEM		resolution is pending but the callback could not be
 *			allocated, so no callback will be made
 *	ESRCH		no usable entry (missing, stale on lookup, condemned,
 *			or no retransmits left)
 *	EINVAL		no netstack for zoneid
 *	ENXIO		no interface for ip2mac_ifindex
 *	other		error returned by nce_lookup_then_add_v4/v6
 *
 * Returns the callback id when a callback was queued, NULL otherwise.
 */
ip2mac_id_t
ip2mac(uint_t op, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg,
    zoneid_t zoneid)
{
	ncec_t		*ncec;
	nce_t		*nce = NULL;
	boolean_t	isv6;
	ill_t		*ill;
	netstack_t	*ns;
	ip_stack_t	*ipst;
	ip2mac_id_t	ip2mid = NULL;
	sin_t		*sin;
	sin6_t		*sin6;
	int		err;
	uint64_t	delta;
	boolean_t	need_resolve = B_FALSE;

	isv6 = (ip2m->ip2mac_pa.ss_family == AF_INET6);

	ns = netstack_find_by_zoneid(zoneid);
	if (ns == NULL) {
		ip2m->ip2mac_err = EINVAL;
		return (NULL);
	}
	/*
	 * For exclusive stacks we reset the zoneid to zero
	 * since IP uses the global zoneid in the exclusive stacks.
	 */
	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
		zoneid = GLOBAL_ZONEID;
	ipst = ns->netstack_ip;
	/*
	 * find the ill from the ip2m->ip2mac_ifindex
	 */
	ill = ill_lookup_on_ifindex(ip2m->ip2mac_ifindex, isv6, ipst);
	if (ill == NULL) {
		ip2m->ip2mac_err = ENXIO;
		netstack_rele(ns);
		return (NULL);
	}
	/*
	 * A pure lookup only searches for an existing nce; any other op
	 * goes through lookup-then-add and sets err instead.
	 */
	if (isv6) {
		sin6 = (sin6_t *)&ip2m->ip2mac_pa;
		if (op == IP2MAC_LOOKUP) {
			nce = nce_lookup_v6(ill, &sin6->sin6_addr);
		} else {
			err = nce_lookup_then_add_v6(ill, NULL,
			    ill->ill_phys_addr_length,
			    &sin6->sin6_addr, 0, ND_UNCHANGED, &nce);
		}
	} else  {
		sin = (sin_t *)&ip2m->ip2mac_pa;
		if (op == IP2MAC_LOOKUP) {
			nce = nce_lookup_v4(ill, &sin->sin_addr.s_addr);
		} else {
			err = nce_lookup_then_add_v4(ill, NULL,
			    ill->ill_phys_addr_length,
			    &sin->sin_addr.s_addr, 0, ND_UNCHANGED, &nce);
		}
	}
	if (op == IP2MAC_LOOKUP) {
		if (nce == NULL) {
			ip2m->ip2mac_err = ESRCH;
			goto done;
		}
		ncec = nce->nce_common;
		delta = TICK_TO_MSEC(ddi_get_lbolt64()) - ncec->ncec_last;
		mutex_enter(&ncec->ncec_lock);
		/* only a reachable and sufficiently recent entry is returned */
		if (NCE_ISREACHABLE(ncec) &&
		    delta < (uint64_t)ill->ill_reachable_time) {
			ncec_ip2mac_response(ip2m, ncec);
			ip2m->ip2mac_err = 0;
		} else {
			ip2m->ip2mac_err = ESRCH;
		}
		mutex_exit(&ncec->ncec_lock);
		goto done;
	} else {
		/* EEXIST is tolerated here and not reported as an error */
		if (err != 0 && err != EEXIST) {
			ip2m->ip2mac_err = err;
			goto done;
		}
	}
	ncec = nce->nce_common;
	delta = TICK_TO_MSEC(ddi_get_lbolt64()) - ncec->ncec_last;
	mutex_enter(&ncec->ncec_lock);
	if (NCE_ISCONDEMNED(ncec)) {
		ip2m->ip2mac_err = ESRCH;
	} else {
		if (NCE_ISREACHABLE(ncec)) {
			if (NCE_MYADDR(ncec) ||
			    delta < (uint64_t)ill->ill_reachable_time) {
				ncec_ip2mac_response(ip2m, ncec);
				ip2m->ip2mac_err = 0;
				mutex_exit(&ncec->ncec_lock);
				goto done;
			}
			/*
			 * Since we do not control the packet output
			 * path for ip2mac() callers, we need to verify
			 * if the existing information in the nce is
			 * very old, and retrigger resolution if necessary.
			 * We will not return the existing stale
			 * information until it is verified through a
			 * resolver request/response exchange.
			 *
			 * In the future, we may want to support extensions
			 * that do additional callbacks on link-layer updates,
			 * so that we can return the stale information but
			 * also update the caller if the lladdr changes.
			 */
			ncec->ncec_rcnt = ill->ill_xmit_count;
			ncec->ncec_state = ND_PROBE;
			need_resolve = B_TRUE; /* reachable but very old nce */
		} else if (ncec->ncec_state == ND_INITIAL) {
			need_resolve = B_TRUE; /* ND_INITIAL nce */
			ncec->ncec_state = ND_INCOMPLETE;
		}
		/*
		 * NCE not known to be reachable in the recent past. We must
		 * reconfirm the information before returning it to the caller
		 */
		if (ncec->ncec_rcnt > 0) {
			/*
			 * Still resolving this ncec, so we can queue the
			 * callback information in ncec->ncec_cb
			 */
			ip2mid = ncec_add_cb(ncec, cb, cbarg);
			/*
			 * Only promise a callback if one was really queued;
			 * otherwise the caller would wait for a completion
			 * that can never be delivered.
			 */
			if (ip2mid != NULL)
				ip2m->ip2mac_err = EINPROGRESS;
			else
				ip2m->ip2mac_err = ENOMEM;
		} else {
			/*
			 * No more retransmits allowed -- resolution failed.
			 */
			ip2m->ip2mac_err = ESRCH;
		}
	}
	mutex_exit(&ncec->ncec_lock);
done:
	/*
	 * if NCE_ISREACHABLE(ncec) but very old, or if it is ND_INITIAL,
	 * trigger resolve.
	 */
	if (need_resolve)
		ip_ndp_resolve(ncec);
	if (nce != NULL)
		nce_refrele(nce);
	netstack_rele(ns);
	ill_refrele(ill);
	return (ip2mid);
}

/*
 * data passed to ncec_walk for canceling outstanding callbacks.
 */
typedef struct ip2mac_cancel_data_s {
	/* id of the callback to cancel, as supplied to ip2mac_cancel() */
	ip2mac_id_t ip2m_cancel_id;
	/* result of the cancel: 0, or EBUSY if a walker is active */
	int	ip2m_cancel_err;
} ip2mac_cancel_data_t;

/*
 * callback invoked for each active ncec. If the ip2mac_id_t corresponds
 * to an active nce_cb_t in the ncec's callback list, we want to remove
 * and free the callback (if there are no walkers) or return EBUSY to the
 * caller through ip2m_cancel_err.
 *
 * ncec		the ncec currently being visited by the walk
 * arg		pointer to the ip2mac_cancel_data_t for this cancel request
 */
static void
ip2mac_cancel_callback(ncec_t *ncec, void *arg)
{
	ip2mac_cancel_data_t *ip2m_wdata = arg;
	ncec_cb_t *ip2m_nce_cb = ip2m_wdata->ip2m_cancel_id;
	ncec_cb_t *ncec_cb;

	/*
	 * The id is cleared below once the entry has been freed. The walk
	 * may still call us for other ncecs afterwards, and we must not
	 * dereference the freed nce_cb_t then. This also rejects a NULL id.
	 */
	if (ip2m_nce_cb == NULL || ip2m_nce_cb->ncec_cb_id != ncec)
		return;

	mutex_enter(&ncec->ncec_lock);
	if (list_is_empty(&ncec->ncec_cb)) {
		mutex_exit(&ncec->ncec_lock);
		return;
	}
	/*
	 * Confirm that the id really names an entry on this ncec's
	 * callback list before touching the list.
	 */
	ncec_cb = list_head(&ncec->ncec_cb);
	for (; ncec_cb != NULL; ncec_cb = list_next(&ncec->ncec_cb, ncec_cb)) {
		if (ncec_cb != ip2m_nce_cb)
			continue;
		/*
		 * If there are no walkers we can remove the nce_cb and
		 * release its memory; nothing else refers to it once it is
		 * off the list. Otherwise the entry is left in place for
		 * the active walker and the caller is told EBUSY.
		 */
		if (ncec->ncec_cb_walker_cnt == 0) {
			list_remove(&ncec->ncec_cb, ncec_cb);
			kmem_free(ncec_cb, sizeof (*ncec_cb));
			ip2m_wdata->ip2m_cancel_id = NULL;
		} else {
			ip2m_wdata->ip2m_cancel_err = EBUSY;
		}
		break;
	}
	mutex_exit(&ncec->ncec_lock);
}

/*
 * Cancel an outstanding callback that was queued by ip2mac().
 *
 * ip2mid	the id returned by ip2mac()
 * zoneid	used to locate the netstack whose ncecs are walked
 *
 * Returns EINVAL if no netstack exists for zoneid, EBUSY if a walk to
 * dispatch callbacks is in progress on the matching ncec, and 0
 * otherwise.
 */
int
ip2mac_cancel(ip2mac_id_t ip2mid, zoneid_t zoneid)
{
	ip2mac_cancel_data_t walk_arg;
	netstack_t	*stack;

	stack = netstack_find_by_zoneid(zoneid);
	if (stack == NULL)
		return (EINVAL);
	/*
	 * For exclusive stacks we reset the zoneid to zero
	 * since IP uses the global zoneid in the exclusive stacks.
	 */
	if (stack->netstack_stackid != GLOBAL_NETSTACKID)
		zoneid = GLOBAL_ZONEID;

	walk_arg.ip2m_cancel_err = 0;
	walk_arg.ip2m_cancel_id = ip2mid;
	ncec_walk(NULL, ip2mac_cancel_callback, &walk_arg,
	    stack->netstack_ip);
	netstack_rele(stack);
	/*
	 * We may return EBUSY if a walk to dispatch callbacks is
	 * in progress, in which case the caller needs to synchronize
	 * with the registered callback function to make sure the
	 * module does not exit when there is a callback pending.
	 */
	return (walk_arg.ip2m_cancel_err);
}