/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include "lint.h"
#include "thr_uberdata.h"
#include <stddef.h>

/*
 * These symbols should not be exported from libc, but
 * /lib/libm.so.2 references them.  libm needs to be fixed.
 * Also, some older versions of the Studio compiler/debugger
 * components reference them.  These need to be fixed, too.
 */
#pragma weak _thr_getspecific = thr_getspecific
#pragma weak _thr_keycreate = thr_keycreate
#pragma weak _thr_setspecific = thr_setspecific

/*
 * 128 million (2^27) keys should be enough for anyone.
 * With 32-bit pointers this allocates half a gigabyte of memory for the
 * keys themselves (the per-key destructor array) and half a gigabyte for
 * each thread that uses the largest key; with 64-bit pointers both
 * figures double to a full gigabyte.
 */
#define	MAX_KEYS	0x08000000U

int
thr_keycreate(thread_key_t *pkey, void (*destructor)(void *))
{
	tsd_metadata_t *tsdm = &curthread->ul_uberdata->tsd_metadata;
	void (**old_data)(void *) = NULL;
	void (**new_data)(void *);
	uint_t old_nkeys;
	uint_t new_nkeys;

	lmutex_lock(&tsdm->tsdm_lock);

	/*
	 * Unfortunately, POSIX specifies that a pthread_getspecific()
	 * on an allocated key upon which the calling thread has not
	 * performed a pthread_setspecific() must return NULL.  Consider
	 * the following sequence:
	 *
	 *	pthread_key_create(&key);
	 *	pthread_setspecific(key, datum);
	 *	pthread_key_delete(key);
	 *	pthread_key_create(&key);
	 *	val = pthread_getspecific(key);
	 *
	 * According to POSIX, if the deleted key is reused for the new
	 * key returned by the second pthread_key_create(), then the
	 * pthread_getspecific() in the above example must return NULL
	 * (and not the stale datum).  The implementation is thus left
	 * with two alternatives:
	 *
	 *  (1)	Reuse deleted keys.  If this is to be implemented optimally,
	 *	it requires that pthread_key_create() somehow associate
	 *	the value NULL with the new (reused) key for each thread.
	 *	Keeping the hot path fast and lock-free induces substantial
	 *	complexity on the implementation.
	 *
	 *  (2)	Never reuse deleted keys. This allows the pthread_getspecific()
	 *	implementation to simply perform a check against the number
	 *	of keys set by the calling thread, returning NULL if the
	 *	specified key is larger than the highest set key.  This has
	 *	the disadvantage of wasting memory (a program which simply
	 *	loops calling pthread_key_create()/pthread_key_delete()
	 *	will ultimately run out of memory), but permits an optimal
	 *	pthread_getspecific() while allowing for simple key creation
	 *	and deletion.
	 *
	 * All Solaris implementations have opted for (2).  Given the
	 * ~10 years that this has been in the field, it is safe to assume
	 * that applications don't loop creating and destroying keys; we
	 * stick with (2).
	 */
	if (tsdm->tsdm_nused == (old_nkeys = tsdm->tsdm_nkeys)) {
		/*
		 * We need to allocate or double the number of keys.
		 * tsdm->tsdm_nkeys must always be a power of two.
		 */
		if ((new_nkeys = (old_nkeys << 1)) == 0)
			new_nkeys = 8;

		if (new_nkeys > MAX_KEYS) {
			lmutex_unlock(&tsdm->tsdm_lock);
			return (EAGAIN);
		}
		if ((new_data = lmalloc(new_nkeys * sizeof (void *))) == NULL) {
			lmutex_unlock(&tsdm->tsdm_lock);
			return (ENOMEM);
		}
		if ((old_data = tsdm->tsdm_destro) == NULL) {
			/* key == 0 is always invalid */
			new_data[0] = TSD_UNALLOCATED;
			tsdm->tsdm_nused = 1;
		} else {
			(void) memcpy(new_data, old_data,
			    old_nkeys * sizeof (void *));
		}
		tsdm->tsdm_destro = new_data;
		tsdm->tsdm_nkeys = new_nkeys;
	}

	*pkey = tsdm->tsdm_nused;
	tsdm->tsdm_destro[tsdm->tsdm_nused++] = destructor;
	lmutex_unlock(&tsdm->tsdm_lock);

	if (old_data != NULL)
		lfree(old_data, old_nkeys * sizeof (void *));

	return (0);
}

#pragma weak _pthread_key_create = pthread_key_create
int
pthread_key_create(pthread_key_t *pkey, void (*destructor)(void *))
{
	return (thr_keycreate(pkey, destructor));
}
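
/*
 * A minimal usage sketch (not part of libc; all names below are
 * hypothetical): an application creates a key once with a destructor,
 * stores a per-thread value, and relies on the destructor to reclaim
 * the value when each thread exits.
 *
 *	static pthread_key_t errbuf_key;
 *
 *	void
 *	init_once(void)
 *	{
 *		(void) pthread_key_create(&errbuf_key, free);
 *	}
 *
 *	char *
 *	get_errbuf(void)
 *	{
 *		char *buf = pthread_getspecific(errbuf_key);
 *
 *		if (buf == NULL) {
 *			buf = calloc(1, 256);
 *			(void) pthread_setspecific(errbuf_key, buf);
 *		}
 *		return (buf);
 *	}
 */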

/*
 * Same as thr_keycreate(), above, except that the key creation
 * is performed only once.  This relies upon the fact that a key
 * value of THR_ONCE_KEY is invalid, and requires that the key be
 * allocated with a value of THR_ONCE_KEY before calling here.
 * THR_ONCE_KEY and PTHREAD_ONCE_KEY_NP, defined in <thread.h>
 * and <pthread.h> respectively, must have the same value.
 * Example:
 *
 *	static pthread_key_t key = PTHREAD_ONCE_KEY_NP;
 *	...
 *	pthread_key_create_once_np(&key, destructor);
 */
#pragma weak pthread_key_create_once_np = thr_keycreate_once
int
thr_keycreate_once(thread_key_t *keyp, void (*destructor)(void *))
{
	static mutex_t key_lock = DEFAULTMUTEX;
	thread_key_t key;
	int error;

	if (*keyp == THR_ONCE_KEY) {
		lmutex_lock(&key_lock);
		if (*keyp == THR_ONCE_KEY) {
			error = thr_keycreate(&key, destructor);
			if (error) {
				lmutex_unlock(&key_lock);
				return (error);
			}
			membar_producer();
			*keyp = key;
		}
		lmutex_unlock(&key_lock);
	}
	membar_consumer();

	return (0);
}
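
/*
 * A fuller usage sketch of the once-key pattern (not part of libc; the
 * names below are hypothetical).  Because the key is created on first
 * use, a library needs no separate initialization entry point:
 *
 *	static pthread_key_t buf_key = PTHREAD_ONCE_KEY_NP;
 *
 *	void *
 *	get_thread_buf(void)
 *	{
 *		void *buf;
 *
 *		if (pthread_key_create_once_np(&buf_key, free) != 0)
 *			return (NULL);
 *		if ((buf = pthread_getspecific(buf_key)) == NULL) {
 *			buf = malloc(1024);
 *			(void) pthread_setspecific(buf_key, buf);
 *		}
 *		return (buf);
 *	}
 */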

int
pthread_key_delete(pthread_key_t key)
{
	tsd_metadata_t *tsdm = &curthread->ul_uberdata->tsd_metadata;

	lmutex_lock(&tsdm->tsdm_lock);

	if (key >= tsdm->tsdm_nused ||
	    tsdm->tsdm_destro[key] == TSD_UNALLOCATED) {
		lmutex_unlock(&tsdm->tsdm_lock);
		return (EINVAL);
	}

	tsdm->tsdm_destro[key] = TSD_UNALLOCATED;
	lmutex_unlock(&tsdm->tsdm_lock);

	return (0);
}

/*
 * Blessedly, the pthread_getspecific() interface is much better than the
 * thr_getspecific() interface in that it cannot return an error status.
 * Thus, if the key specified is bogus, pthread_getspecific()'s behavior
 * is undefined.  As an added bonus (and as an artifact of not returning
 * an error code), the requested datum is returned rather than stored
 * through a parameter -- thereby avoiding the unnecessary store/load pair
 * incurred by thr_getspecific().  Every once in a while, the Standards
 * get it right -- but usually by accident.
 */
void *
pthread_getspecific(pthread_key_t key)
{
	tsd_t *stsd;

	/*
	 * We are cycle-shaving in this function because some
	 * applications make heavy use of it and one machine cycle
	 * can make a measurable difference in performance.  This
	 * is why we waste a little memory and allocate a NULL value
	 * for the invalid key == 0 in curthread->ul_ftsd[0] rather
	 * than adjusting the key by subtracting one.
	 */
	if (key < TSD_NFAST)
		return (curthread->ul_ftsd[key]);

	if ((stsd = curthread->ul_stsd) != NULL && key < stsd->tsd_nalloc)
		return (stsd->tsd_data[key]);

	return (NULL);
}

int
thr_getspecific(thread_key_t key, void **valuep)
{
	tsd_t *stsd;

	/*
	 * Amazingly, some application code (and worse, some particularly
	 * fugly Solaris library code) _relies_ on the fact that 0 is always
	 * an invalid key.  To preserve this semantic, 0 is never returned
	 * as a key from thr_/pthread_key_create(); we explicitly check
	 * for it here and return EINVAL.
	 */
	if (key == 0)
		return (EINVAL);

	if (key < TSD_NFAST)
		*valuep = curthread->ul_ftsd[key];
	else if ((stsd = curthread->ul_stsd) != NULL && key < stsd->tsd_nalloc)
		*valuep = stsd->tsd_data[key];
	else
		*valuep = NULL;

	return (0);
}
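
/*
 * The two lookup styles side by side, in hypothetical application code.
 * pthread_getspecific() returns the datum directly and has no way to
 * report an error; thr_getspecific() stores the datum through a pointer
 * and returns 0 or EINVAL.
 *
 *	void *val;
 *	int err;
 *
 *	val = pthread_getspecific(key);
 *
 *	err = thr_getspecific(key, &val);
 *	if (err != 0)
 *		val = NULL;
 */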

/*
 * We call thr_setspecific_slow() when the key specified
 * is beyond the current thread's currently allocated range.
 * This case is in a separate function because we want
 * the compiler to optimize for the common case.
 */
static int
thr_setspecific_slow(thread_key_t key, void *value)
{
	ulwp_t *self = curthread;
	tsd_metadata_t *tsdm = &self->ul_uberdata->tsd_metadata;
	tsd_t *stsd;
	tsd_t *ntsd;
	uint_t nkeys;

	/*
	 * It isn't necessary to grab locks in this path;
	 * tsdm->tsdm_nused can only increase.
	 */
	if (key >= tsdm->tsdm_nused)
		return (EINVAL);

	/*
	 * We would like to test (tsdm->tsdm_destro[key] == TSD_UNALLOCATED)
	 * here but that would require acquiring tsdm->tsdm_lock and we
	 * want to avoid locks in this path.
	 *
	 * We have a key which is (or at least _was_) valid.  If this key
	 * is later deleted (or indeed, is deleted before we set the value),
	 * we don't care; such a condition would indicate an application
	 * race for which POSIX thankfully leaves the behavior unspecified.
	 *
	 * First, determine our new size.  To avoid allocating more than we
	 * have to, continue doubling our size only until the new key fits.
	 * stsd->tsd_nalloc must always be a power of two.
	 */
	nkeys = ((stsd = self->ul_stsd) != NULL) ? stsd->tsd_nalloc : 8;
	for (; key >= nkeys; nkeys <<= 1)
		continue;

	/*
	 * Allocate the new TSD.
	 */
	if ((ntsd = lmalloc(nkeys * sizeof (void *))) == NULL)
		return (ENOMEM);

	if (stsd != NULL) {
		/*
		 * Copy the old TSD across to the new.
		 */
		(void) memcpy(ntsd, stsd, stsd->tsd_nalloc * sizeof (void *));
		lfree(stsd, stsd->tsd_nalloc * sizeof (void *));
	}

	ntsd->tsd_nalloc = nkeys;
	ntsd->tsd_data[key] = value;
	self->ul_stsd = ntsd;

	return (0);
}

int
thr_setspecific(thread_key_t key, void *value)
{
	tsd_t *stsd;
	int ret;
	ulwp_t *self = curthread;

	/*
	 * See the comment in thr_getspecific(), above.
	 */
	if (key == 0)
		return (EINVAL);

	if (key < TSD_NFAST) {
		curthread->ul_ftsd[key] = value;
		return (0);
	}

	if ((stsd = curthread->ul_stsd) != NULL && key < stsd->tsd_nalloc) {
		stsd->tsd_data[key] = value;
		return (0);
	}

	/*
	 * This is a critical region because we are allocating and freeing
	 * memory.  Similar protection is required in tsd_free().
	 */
	enter_critical(self);
	ret = thr_setspecific_slow(key, value);
	exit_critical(self);
	return (ret);
}

int
pthread_setspecific(pthread_key_t key, const void *value)
{
	return (thr_setspecific(key, (void *)value));
}

/*
 * Contract-private interface for java.  See PSARC/2003/159
 *
 * If the key falls within the TSD_NFAST range, return a non-negative
 * offset that can be used by the caller to fetch the TSD data value
 * directly out of the thread structure using %g7 (sparc) or %gs (x86).
 * With the advent of TLS, %g7 and %gs are part of the ABI, even though
 * the definition of the thread structure itself (ulwp_t) is private.
 *
 * We guarantee that the offset returned on sparc will fit within
 * a SIMM13 field (that is, it is less than 2048).
 *
 * On failure (key is not in the TSD_NFAST range), return -1.
 */
ptrdiff_t
_thr_slot_offset(thread_key_t key)
{
	if (key != 0 && key < TSD_NFAST)
		return ((ptrdiff_t)offsetof(ulwp_t, ul_ftsd[key]));
	return (-1);
}
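
/*
 * A sketch of how a consumer (such as the JVM JIT) might use this,
 * restating the comment above; the pseudo-code is for illustration only.
 * The offset is obtained once at key-creation time, and subsequent
 * fetches load the slot directly from the thread pointer register
 * instead of calling pthread_getspecific():
 *
 *	off = _thr_slot_offset(key);
 *	if (off >= 0)
 *		emit a load of [%g7 + off] (sparc) or [%gs:off] (x86)
 *	else
 *		emit a call to pthread_getspecific(key)
 */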

/*
 * This is called by _thrp_exit() to apply destructors to the thread's tsd.
 */
void
tsd_exit()
{
	ulwp_t *self = curthread;
	tsd_metadata_t *tsdm = &self->ul_uberdata->tsd_metadata;
	thread_key_t key;
	int recheck;
	void *val;
	void (*func)(void *);

	lmutex_lock(&tsdm->tsdm_lock);

	do {
		recheck = 0;

		for (key = 1; key < TSD_NFAST &&
		    key < tsdm->tsdm_nused; key++) {
			if ((func = tsdm->tsdm_destro[key]) != NULL &&
			    func != TSD_UNALLOCATED &&
			    (val = self->ul_ftsd[key]) != NULL) {
				self->ul_ftsd[key] = NULL;
				lmutex_unlock(&tsdm->tsdm_lock);
				(*func)(val);
				lmutex_lock(&tsdm->tsdm_lock);
				recheck = 1;
			}
		}

		if (self->ul_stsd == NULL)
			continue;

		/*
		 * Any of these destructors could cause us to grow the number
		 * of TSD keys in the slow TSD; we cannot cache the slow TSD
		 * pointer through this loop.
		 */
		for (; key < self->ul_stsd->tsd_nalloc &&
		    key < tsdm->tsdm_nused; key++) {
			if ((func = tsdm->tsdm_destro[key]) != NULL &&
			    func != TSD_UNALLOCATED &&
			    (val = self->ul_stsd->tsd_data[key]) != NULL) {
				self->ul_stsd->tsd_data[key] = NULL;
				lmutex_unlock(&tsdm->tsdm_lock);
				(*func)(val);
				lmutex_lock(&tsdm->tsdm_lock);
				recheck = 1;
			}
		}
	} while (recheck);

	lmutex_unlock(&tsdm->tsdm_lock);

	/*
	 * We're done; if we have slow TSD, we need to free it.
	 */
	tsd_free(self);
}
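
/*
 * Why tsd_exit() rechecks: a destructor may itself store new TSD values
 * (under this key or any other), so a single pass is not enough.  A
 * sketch, with hypothetical application code:
 *
 *	static pthread_key_t log_key;	(created elsewhere, with log_destroy)
 *
 *	static void
 *	conn_destroy(void *conn)
 *	{
 *		(void) pthread_setspecific(log_key, make_log_entry(conn));
 *		close_conn(conn);
 *	}
 *
 * When the exiting thread's connection value is destroyed, a fresh value
 * appears under log_key; the recheck loop above notices it and applies
 * log_destroy() to it on a later pass.
 */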

void
tsd_free(ulwp_t *ulwp)
{
	tsd_t *stsd;
	ulwp_t *self = curthread;

	enter_critical(self);
	if ((stsd = ulwp->ul_stsd) != NULL)
		lfree(stsd, stsd->tsd_nalloc * sizeof (void *));
	ulwp->ul_stsd = NULL;
	exit_critical(self);
}