/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * sqlite is not compatible with _FILE_OFFSET_BITS=64, but we need to
 * be able to statvfs(2) possibly large systems.  This define gives us
 * access to the transitional interfaces.  See lfcompile64(5) for how
 * _LARGEFILE64_SOURCE works.
 */
#define	_LARGEFILE64_SOURCE

#include <assert.h>
#include <atomic.h>
#include <door.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <time.h>
#include <unistd.h>
#include <zone.h>
#include <libscf_priv.h>

#include "configd.h"
#include "repcache_protocol.h"

#include <sqlite.h>
#include <sqlite-misc.h>

/*
 * This file has two purposes:
 *
 * 1. It contains the database schema, and the code for setting up our backend
 *    databases, including installing said schema.
 *
 * 2. It provides a simplified interface to the SQL database library, and
 *    synchronizes MT access to the database.
 */

#define	IS_VOLATILE(be)		((be)->be_ppath != NULL)
#define	MAX_FLIGHT_RECORDER_EVENTS	100

typedef enum backend_switch_results {
	BACKEND_SWITCH_FATAL =	-1,
	BACKEND_SWITCH_OK =	0,
	BACKEND_SWITCH_RO
} backend_switch_results_t;

typedef struct backend_spent {
	uint64_t bs_count;
	hrtime_t bs_time;
	hrtime_t bs_vtime;
} backend_spent_t;

typedef struct backend_totals {
	backend_spent_t	bt_lock;	/* waiting for lock */
	backend_spent_t	bt_exec;	/* time spent executing SQL */
} backend_totals_t;

/*
 * There are times when svcadm asks configd to move the BACKEND_TYPE_NORMAL
 * repository to volatile storage.  See backend_switch().  When the
 * repository is on volatile storage, we save the location of the permanent
 * repository in be_ppath.  We use the saved path when the time comes to
 * move the repository back.  When the repository is on permanent storage,
 * be_ppath is set to NULL.  Also see the definition of IS_VOLATILE() above
 * for testing if the repository is on volatile storage.
 */
typedef struct sqlite_backend {
	pthread_mutex_t	be_lock;
	pthread_t	be_thread;	/* thread holding lock */
	struct sqlite	*be_db;
	const char	*be_path;	/* path to db */
	const char	*be_ppath;	/* saved path to persistent db when */
					/* backend is volatile */
	const char	*be_checkpoint;	/* path to repository checkpoint */
	int		be_readonly;	/* readonly at start, and still is */
	int		be_writing;	/* held for writing */
	backend_type_t	be_type;	/* type of db */
	hrtime_t	be_lastcheck;	/* time of last read-only check */
	backend_totals_t be_totals[2];	/* one for reading, one for writing */
} sqlite_backend_t;

struct backend_tx {
	sqlite_backend_t	*bt_be;
	int			bt_readonly;
	int			bt_type;
	int			bt_full;	/* SQLITE_FULL during tx */
};

#define	UPDATE_TOTALS_WR(sb, writing, field, ts, vts) { \
	backend_spent_t *__bsp = &(sb)->be_totals[!!(writing)].field; \
	__bsp->bs_count++;						\
	__bsp->bs_time += (gethrtime() - ts);				\
	__bsp->bs_vtime += (gethrvtime() - vts);			\
}

#define	UPDATE_TOTALS(sb, field, ts, vts) \
	UPDATE_TOTALS_WR(sb, (sb)->be_writing, field, ts, vts)

struct backend_query {
	char	*bq_buf;
	size_t	bq_size;
};

struct backend_tbl_info {
	const char *bti_name;
	const char *bti_cols;
};

struct backend_idx_info {
	const char *bxi_tbl;
	const char *bxi_idx;
	const char *bxi_cols;
};

/* Definitions for the flight recorder: */

typedef enum be_flight_type {
	BE_FLIGHT_EV_NOEVENT = 0,	/* No event yet recorded. */
	BE_FLIGHT_EV_BACKUP,		/* Information about repo. backup */
	BE_FLIGHT_EV_BACKUP_ENTER,	/* Enter */
					/* backend_create_backup_locked() */
	BE_FLIGHT_EV_CHECKPOINT,	/* Request to checkpoint repository */
					/* for boot time backup */
	BE_FLIGHT_EV_CHECKPOINT_EXISTS,	/* Existing checkpoint detected on */
					/* restart */
	BE_FLIGHT_EV_LINGERING_FAST,	/* Use lingering fast repository */
	BE_FLIGHT_EV_NO_BACKUP,		/* Requested backup not made */
	BE_FLIGHT_EV_REPO_CREATE,	/* Main repository created */
	BE_FLIGHT_EV_RESTART,		/* This is a restart of configd */
	BE_FLIGHT_EV_SWITCH,		/* Switch repositories */
	BE_FLIGHT_EV_TRANS_RW		/* Root transitioned to read/write */
} be_flight_type_t;

typedef enum be_flight_status {
	BE_FLIGHT_ST_INFO = 0,		/* No status.  Event is informative */
	BE_FLIGHT_ST_BOOT_BACKUP,	/* Boot time backup */
	BE_FLIGHT_ST_CHECKPOINT_BACKUP,	/* Backup from checkpoint */
	BE_FLIGHT_ST_CLIENT,		/* Request form client as opposed to */
					/* internal call */
	BE_FLIGHT_ST_DUPLICATE,		/* Backup duplicates existing one */
	BE_FLIGHT_ST_FAIL,		/* Operation failed. */
	BE_FLIGHT_ST_FAST,		/* Fast repository (tmpfs) */
	BE_FLIGHT_ST_MI_BACKUP,		/* Manifest-import backup */
	BE_FLIGHT_ST_NO_SWITCH,		/* Don't switch repositories */
	BE_FLIGHT_ST_OTHER_BACKUP,	/* Other type of backup */
	BE_FLIGHT_ST_PERMANENT,		/* Repository on permanet storage */
	BE_FLIGHT_ST_REPO_BACKUP,	/* Backup from repository */
	BE_FLIGHT_ST_RO,		/* Main repository is read-only */
	BE_FLIGHT_ST_RW,		/* Main repository is read/write */
	BE_FLIGHT_ST_SUCCESS,		/* Operation was successful */
	BE_FLIGHT_ST_SWITCH		/* Switch repository */
} be_flight_status_t;

typedef struct be_flight_event {
	be_flight_type_t	bfe_type;	/* Type of event. */
	be_flight_status_t	bfe_status;	/* Result of the event. */
	time_t			bfe_time;	/* Time of the event. */
	uint_t			bfe_sequence;	/* Sequence number. */
} be_flight_event_t;

static pthread_mutex_t backend_panic_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t backend_panic_cv = PTHREAD_COND_INITIALIZER;
pthread_t backend_panic_thread = 0;

int backend_do_trace = 0;		/* invoke tracing callback */
int backend_print_trace = 0;		/* tracing callback prints SQL */
int backend_panic_abort = 0;		/* abort when panicking */

/* Data for the flight_recorder. */

static pthread_mutex_t backend_flight_recorder_lock = PTHREAD_MUTEX_INITIALIZER;
static be_flight_event_t flight_recorder[MAX_FLIGHT_RECORDER_EVENTS];
static uint_t flight_recorder_next = 0;
static uint_t flight_recorder_missed = 0;
static uint_t flight_recorder_sequence = 0;

/* interval between read-only checks while starting up */
#define	BACKEND_READONLY_CHECK_INTERVAL	(2 * (hrtime_t)NANOSEC)

/*
 * Any incompatible change to the below schema should bump the version number.
 * The schema has been changed to support value ordering,  but this change
 * is backwards-compatible - i.e. a previous svc.configd can use a
 * repository database with the new schema perfectly well.  As a result,
 * the schema version has not been updated,  allowing downgrade of systems
 * without losing repository data.
 */
#define	BACKEND_SCHEMA_VERSION		5

static struct backend_tbl_info tbls_normal[] = { /* BACKEND_TYPE_NORMAL */
	/*
	 * service_tbl holds all services.  svc_id is the identifier of the
	 * service.
	 */
	{
		"service_tbl",
		"svc_id          INTEGER PRIMARY KEY,"
		"svc_name        CHAR(256) NOT NULL"
	},

	/*
	 * instance_tbl holds all of the instances.  The parent service id
	 * is instance_svc.
	 */
	{
		"instance_tbl",
		"instance_id     INTEGER PRIMARY KEY,"
		"instance_name   CHAR(256) NOT NULL,"
		"instance_svc    INTEGER NOT NULL"
	},

	/*
	 * snapshot_lnk_tbl links (instance, snapshot name) with snapshots.
	 */
	{
		"snapshot_lnk_tbl",
		"lnk_id          INTEGER PRIMARY KEY,"
		"lnk_inst_id     INTEGER NOT NULL,"
		"lnk_snap_name   CHAR(256) NOT NULL,"
		"lnk_snap_id     INTEGER NOT NULL"
	},

	/*
	 * snaplevel_tbl maps a snapshot id to a set of named, ordered
	 * snaplevels.
	 */
	{
		"snaplevel_tbl",
		"snap_id                 INTEGER NOT NULL,"
		"snap_level_num          INTEGER NOT NULL,"
		"snap_level_id           INTEGER NOT NULL,"
		"snap_level_service_id   INTEGER NOT NULL,"
		"snap_level_service      CHAR(256) NOT NULL,"
		"snap_level_instance_id  INTEGER NULL,"
		"snap_level_instance     CHAR(256) NULL"
	},

	/*
	 * snaplevel_lnk_tbl links snaplevels to property groups.
	 * snaplvl_pg_* is identical to the original property group,
	 * and snaplvl_gen_id overrides the generation number.
	 * The service/instance ids are as in the snaplevel.
	 */
	{
		"snaplevel_lnk_tbl",
		"snaplvl_level_id INTEGER NOT NULL,"
		"snaplvl_pg_id    INTEGER NOT NULL,"
		"snaplvl_pg_name  CHAR(256) NOT NULL,"
		"snaplvl_pg_type  CHAR(256) NOT NULL,"
		"snaplvl_pg_flags INTEGER NOT NULL,"
		"snaplvl_gen_id   INTEGER NOT NULL"
	},

	{ NULL, NULL }
};

static struct backend_idx_info idxs_normal[] = { /* BACKEND_TYPE_NORMAL */
	{ "service_tbl",	"name",	"svc_name" },
	{ "instance_tbl",	"name",	"instance_svc, instance_name" },
	{ "snapshot_lnk_tbl",	"name",	"lnk_inst_id, lnk_snap_name" },
	{ "snapshot_lnk_tbl",	"snapid", "lnk_snap_id" },
	{ "snaplevel_tbl",	"id",	"snap_id" },
	{ "snaplevel_lnk_tbl",	"id",	"snaplvl_pg_id" },
	{ "snaplevel_lnk_tbl",	"level", "snaplvl_level_id" },
	{ NULL, NULL, NULL }
};

static struct backend_tbl_info tbls_np[] = { /* BACKEND_TYPE_NONPERSIST */
	{ NULL, NULL }
};

static struct backend_idx_info idxs_np[] = {	/* BACKEND_TYPE_NONPERSIST */
	{ NULL, NULL, NULL }
};

static struct backend_tbl_info tbls_common[] = { /* all backend types */
	/*
	 * pg_tbl defines property groups.  They are associated with a single
	 * service or instance.  The pg_gen_id links them with the latest
	 * "edited" version of its properties.
	 */
	{
		"pg_tbl",
		"pg_id           INTEGER PRIMARY KEY,"
		"pg_parent_id    INTEGER NOT NULL,"
		"pg_name         CHAR(256) NOT NULL,"
		"pg_type         CHAR(256) NOT NULL,"
		"pg_flags        INTEGER NOT NULL,"
		"pg_gen_id       INTEGER NOT NULL"
	},

	/*
	 * prop_lnk_tbl links a particular pg_id and gen_id to a set of
	 * (prop_name, prop_type, val_id) trios.
	 */
	{
		"prop_lnk_tbl",
		"lnk_prop_id     INTEGER PRIMARY KEY,"
		"lnk_pg_id       INTEGER NOT NULL,"
		"lnk_gen_id      INTEGER NOT NULL,"
		"lnk_prop_name   CHAR(256) NOT NULL,"
		"lnk_prop_type   CHAR(2) NOT NULL,"
		"lnk_val_id      INTEGER"
	},

	/*
	 * value_tbl maps a value_id to a set of values.  For any given
	 * value_id, value_type is constant.  The table definition here
	 * is repeated in backend_check_upgrade(),  and must be kept in-sync.
	 */
	{
		"value_tbl",
		"value_id        INTEGER NOT NULL,"
		"value_type      CHAR(1) NOT NULL,"
		"value_value     VARCHAR NOT NULL,"
		"value_order     INTEGER DEFAULT 0"
	},

	/*
	 * id_tbl has one row per id space
	 */
	{
		"id_tbl",
		"id_name         STRING NOT NULL,"
		"id_next         INTEGER NOT NULL"
	},

	/*
	 * schema_version has a single row, which contains
	 * BACKEND_SCHEMA_VERSION at the time of creation.
	 */
	{
		"schema_version",
		"schema_version  INTEGER"
	},
	{ NULL, NULL }
};

/*
 * The indexing of value_tbl is repeated in backend_check_upgrade() and
 * must be kept in sync with the indexing specification here.
 */
static struct backend_idx_info idxs_common[] = { /* all backend types */
	{ "pg_tbl",		"parent", "pg_parent_id" },
	{ "pg_tbl",		"name",	"pg_parent_id, pg_name" },
	{ "pg_tbl",		"type",	"pg_parent_id, pg_type" },
	{ "prop_lnk_tbl",	"base",	"lnk_pg_id, lnk_gen_id" },
	{ "prop_lnk_tbl",	"val",	"lnk_val_id" },
	{ "value_tbl",		"id",	"value_id" },
	{ "id_tbl",		"id",	"id_name" },
	{ NULL, NULL, NULL }
};

struct run_single_int_info {
	uint32_t	*rs_out;
	int		rs_result;
};

static rep_protocol_responseid_t backend_copy_repository(const char *,
    const char *, int);
static rep_protocol_responseid_t backend_do_copy(const char *, int,
    const char *, int, size_t *);

/*
 * The flight recorder keeps track of events that happen primarily while
 * the system is booting.  Once the system is up an running, one can take a
 * gcore(1) of configd and examine the events with mdb.  Since we're most
 * interested in early boot events, we stop recording events when the
 * recorder is full.
 */
static void
flight_recorder_event(be_flight_type_t type, be_flight_status_t res)
{
	be_flight_event_t *data;
	uint_t item;
	uint_t sequence;

	if (pthread_mutex_lock(&backend_flight_recorder_lock) != 0) {
		atomic_inc_uint(&flight_recorder_missed);
		return;
	}
	if (flight_recorder_next >= MAX_FLIGHT_RECORDER_EVENTS) {
		/* Hit end of the array.  No more event recording. */
		item = flight_recorder_next;
	} else {
		item = flight_recorder_next++;
		sequence = flight_recorder_sequence++;
	}
	(void) pthread_mutex_unlock(&backend_flight_recorder_lock);

	if (item >= MAX_FLIGHT_RECORDER_EVENTS) {
		/* Array is filled.  Stop recording events */
		atomic_inc_uint(&flight_recorder_missed);
		return;
	}
	data = &flight_recorder[item];
	(void) memset(data, 0, sizeof (*data));
	data->bfe_type = type;
	data->bfe_status = res;
	data->bfe_sequence = sequence;
	data->bfe_time = time(NULL);
}

/*ARGSUSED*/
static int
run_single_int_callback(void *arg, int columns, char **vals, char **names)
{
	struct run_single_int_info *info = arg;
	uint32_t val;

	char *endptr = vals[0];

	assert(info->rs_result != REP_PROTOCOL_SUCCESS);
	assert(columns == 1);

	if (vals[0] == NULL)
		return (BACKEND_CALLBACK_CONTINUE);

	errno = 0;
	val = strtoul(vals[0], &endptr, 10);
	if ((val == 0 && endptr == vals[0]) || *endptr != 0 || errno != 0)
		backend_panic("malformed integer \"%20s\"", vals[0]);

	*info->rs_out = val;
	info->rs_result = REP_PROTOCOL_SUCCESS;
	return (BACKEND_CALLBACK_CONTINUE);
}

/*ARGSUSED*/
int
backend_fail_if_seen(void *arg, int columns, char **vals, char **names)
{
	return (BACKEND_CALLBACK_ABORT);
}

/*
 * check to see if we can successfully start a transaction;  if not, the
 * filesystem is mounted read-only.
 */
static int
backend_is_readonly(struct sqlite *db, const char *path)
{
	int r;
	statvfs64_t stat;

	if (statvfs64(path, &stat) == 0 && (stat.f_flag & ST_RDONLY))
		return (SQLITE_READONLY);

	r = sqlite_exec(db,
	    "BEGIN TRANSACTION; "
	    "UPDATE schema_version SET schema_version = schema_version; ",
	    NULL, NULL, NULL);
	(void) sqlite_exec(db, "ROLLBACK TRANSACTION", NULL, NULL, NULL);
	return (r);
}

static void
backend_trace_sql(void *arg, const char *sql)
{
	sqlite_backend_t *be = arg;

	if (backend_print_trace) {
		(void) fprintf(stderr, "%d: %s\n", be->be_type, sql);
	}
}

static sqlite_backend_t be_info[BACKEND_TYPE_TOTAL];
static sqlite_backend_t *bes[BACKEND_TYPE_TOTAL];

/*
 * For a native build,  repositories are created from scratch, so upgrade
 * is not an issue.  This variable is implicitly protected by
 * bes[BACKEND_TYPE_NORMAL]->be_lock.
 */
#ifdef NATIVE_BUILD
static boolean_t be_normal_upgraded = B_TRUE;
#else
static boolean_t be_normal_upgraded = B_FALSE;
#endif	/* NATIVE_BUILD */

/*
 * Has backend been upgraded? In nonpersistent case, answer is always
 * yes.
 */
boolean_t
backend_is_upgraded(backend_tx_t *bt)
{
	if (bt->bt_type == BACKEND_TYPE_NONPERSIST)
		return (B_TRUE);
	return (be_normal_upgraded);
}

#define	BACKEND_PANIC_TIMEOUT	(50 * MILLISEC)
/*
 * backend_panic() -- some kind of database problem or corruption has been hit.
 * We attempt to quiesce the other database users -- all of the backend sql
 * entry points will call backend_panic(NULL) if a panic is in progress, as
 * will any attempt to start a transaction.
 *
 * We give threads holding a backend lock 50ms (BACKEND_PANIC_TIMEOUT) to
 * either drop the lock or call backend_panic().  If they don't respond in
 * time, we'll just exit anyway.
 */
void
backend_panic(const char *format, ...)
{
	int i;
	va_list args;
	int failed = 0;

	(void) pthread_mutex_lock(&backend_panic_lock);
	if (backend_panic_thread != 0) {
		(void) pthread_mutex_unlock(&backend_panic_lock);
		/*
		 * first, drop any backend locks we're holding, then
		 * sleep forever on the panic_cv.
		 */
		for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
			if (bes[i] != NULL &&
			    bes[i]->be_thread == pthread_self())
				(void) pthread_mutex_unlock(&bes[i]->be_lock);
		}
		(void) pthread_mutex_lock(&backend_panic_lock);
		for (;;)
			(void) pthread_cond_wait(&backend_panic_cv,
			    &backend_panic_lock);
	}
	backend_panic_thread = pthread_self();
	(void) pthread_mutex_unlock(&backend_panic_lock);

	for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
		if (bes[i] != NULL && bes[i]->be_thread == pthread_self())
			(void) pthread_mutex_unlock(&bes[i]->be_lock);
	}

	va_start(args, format);
	configd_vcritical(format, args);
	va_end(args);

	for (i = 0; i < BACKEND_TYPE_TOTAL; i++) {
		timespec_t rel;

		rel.tv_sec = 0;
		rel.tv_nsec = BACKEND_PANIC_TIMEOUT;

		if (bes[i] != NULL && bes[i]->be_thread != pthread_self()) {
			if (pthread_mutex_reltimedlock_np(&bes[i]->be_lock,
			    &rel) != 0)
				failed++;
		}
	}
	if (failed) {
		configd_critical("unable to quiesce database\n");
	}

	if (backend_panic_abort)
		abort();

	exit(CONFIGD_EXIT_DATABASE_BAD);
}

/*
 * Returns
 *   _SUCCESS
 *   _DONE - callback aborted query
 *   _NO_RESOURCES - out of memory (_FULL & _TOOBIG?)
 */
static int
backend_error(sqlite_backend_t *be, int error, char *errmsg)
{
	if (error == SQLITE_OK)
		return (REP_PROTOCOL_SUCCESS);

	switch (error) {
	case SQLITE_ABORT:
		free(errmsg);
		return (REP_PROTOCOL_DONE);

	case SQLITE_NOMEM:
	case SQLITE_FULL:
	case SQLITE_TOOBIG:
		free(errmsg);
		return (REP_PROTOCOL_FAIL_NO_RESOURCES);

	default:
		backend_panic("%s: db error: %s", be->be_path, errmsg);
		/*NOTREACHED*/
	}
}

static void
backend_backup_cleanup(const char **out_arg, ssize_t out_sz)
{
	char **out = (char **)out_arg;

	while (out_sz-- > 0)
		free(*out++);
	free(out_arg);
}

/*
 * builds a inverse-time-sorted array of backup files.  The path is a
 * a single buffer, and the pointers look like:
 *
 *	/this/is/a/full/path/to/repository-name-YYYYMMDDHHMMSS
 *	^pathname		^	       ^(pathname+pathlen)
 *				basename
 *
 * dirname will either be pathname, or ".".
 *
 * Returns the number of elements in the array, 0 if there are no previous
 * backups, or -1 on error.
 */
static ssize_t
backend_backup_get_prev(char *pathname, size_t pathlen, const char ***out_arg)
{
	char b_start, b_end;
	DIR *dir;
	char **out = NULL;
	char *name, *p;
	char *dirname, *basename;
	char *pathend;
	struct dirent *ent;

	size_t count = 0;
	size_t baselen;

	/*
	 * year, month, day, hour, min, sec, plus an '_'.
	 */
	const size_t ndigits = 4 + 5*2 + 1;
	const size_t baroffset = 4 + 2*2;

	size_t idx;

	pathend = pathname + pathlen;
	b_end = *pathend;
	*pathend = '\0';

	basename = strrchr(pathname, '/');

	if (basename != NULL) {
		assert(pathend > pathname && basename < pathend);
		basename++;
		dirname = pathname;
	} else {
		basename = pathname;
		dirname = ".";
	}

	baselen = strlen(basename);

	/*
	 * munge the string temporarily for the opendir(), then restore it.
	 */
	b_start = basename[0];

	basename[0] = '\0';
	dir = opendir(dirname);
	basename[0] = b_start;		/* restore path */

	if (dir == NULL)
		goto fail;


	while ((ent = readdir(dir)) != NULL) {
		/*
		 * Must match:
		 *	basename-YYYYMMDD_HHMMSS
		 * or we ignore it.
		 */
		if (strncmp(ent->d_name, basename, baselen) != 0)
			continue;

		name = ent->d_name;
		if (name[baselen] != '-')
			continue;

		p = name + baselen + 1;

		for (idx = 0; idx < ndigits; idx++) {
			char c = p[idx];
			if (idx == baroffset && c != '_')
				break;
			if (idx != baroffset && (c < '0' || c > '9'))
				break;
		}
		if (idx != ndigits || p[idx] != '\0')
			continue;

		/*
		 * We have a match.  insertion-sort it into our list.
		 */
		name = strdup(name);
		if (name == NULL)
			goto fail_closedir;
		p = strrchr(name, '-');

		for (idx = 0; idx < count; idx++) {
			char *tmp = out[idx];
			char *tp = strrchr(tmp, '-');

			int cmp = strcmp(p, tp);
			if (cmp == 0)
				cmp = strcmp(name, tmp);

			if (cmp == 0) {
				free(name);
				name = NULL;
				break;
			} else if (cmp > 0) {
				out[idx] = name;
				name = tmp;
				p = tp;
			}
		}

		if (idx == count) {
			char **new_out = realloc(out,
			    (count + 1) * sizeof (*out));

			if (new_out == NULL) {
				free(name);
				goto fail_closedir;
			}

			out = new_out;
			out[count++] = name;
		} else {
			assert(name == NULL);
		}
	}
	(void) closedir(dir);

	basename[baselen] = b_end;

	*out_arg = (const char **)out;
	return (count);

fail_closedir:
	(void) closedir(dir);
fail:
	basename[0] = b_start;
	*pathend = b_end;

	backend_backup_cleanup((const char **)out, count);

	*out_arg = NULL;
	return (-1);
}

/*
 * Copies the repository path into out, a buffer of out_len bytes,
 * removes the ".db" (or whatever) extension, and, if name is non-NULL,
 * appends "-name" to it.  If name is non-NULL, it can fail with:
 *
 *	_TRUNCATED	will not fit in buffer.
 *	_BAD_REQUEST	name is not a valid identifier
 */
static rep_protocol_responseid_t
backend_backup_base(sqlite_backend_t *be, const char *name,
    char *out, size_t out_len)
{
	char *p, *q;
	size_t len;

	/*
	 * for paths of the form /path/to/foo.db, we truncate at the final
	 * '.'.
	 */
	(void) strlcpy(out, IS_VOLATILE(be) ? be->be_ppath : be->be_path,
	    out_len);

	p = strrchr(out, '/');
	q = strrchr(out, '.');

	if (p != NULL && q != NULL && q > p)
		*q = 0;

	if (name != NULL) {
		len = strlen(out);
		assert(len < out_len);

		out += len;
		out_len -= len;

		len = strlen(name);

		/*
		 * verify that the name tag is entirely alphabetic,
		 * non-empty, and not too long.
		 */
		if (len == 0 || len >= REP_PROTOCOL_NAME_LEN ||
		    uu_check_name(name, UU_NAME_DOMAIN) < 0)
			return (REP_PROTOCOL_FAIL_BAD_REQUEST);

		if (snprintf(out, out_len, "-%s", name) >= out_len)
			return (REP_PROTOCOL_FAIL_TRUNCATED);
	}

	return (REP_PROTOCOL_SUCCESS);
}

/*
 * Make a checkpoint of the repository, so that we can use it for a backup
 * when the root file system becomes read/write.  We'll first copy the
 * repository into a temporary file and then rename it to
 * REPOSITORY_CHECKPOINT.  This is protection against configd crashing in
 * the middle of the copy and leaving a partial copy at
 * REPOSITORY_CHECKPOINT.  Renames are atomic.
 */
static rep_protocol_responseid_t
backend_checkpoint_repository(sqlite_backend_t *be)
{
	rep_protocol_responseid_t r;

	assert(be->be_readonly);	/* Only need a checkpoint if / is ro */
	assert(be->be_type == BACKEND_TYPE_NORMAL);
	assert(be->be_checkpoint == NULL); /* Only 1 checkpoint */

	r = backend_copy_repository(be->be_path, REPOSITORY_CHECKPOINT, 0);
	if (r == REP_PROTOCOL_SUCCESS)
		be->be_checkpoint = REPOSITORY_CHECKPOINT;

	flight_recorder_event(BE_FLIGHT_EV_CHECKPOINT,
	    r == REP_PROTOCOL_SUCCESS ? BE_FLIGHT_ST_SUCCESS :
	    BE_FLIGHT_ST_FAIL);

	return (r);
}

/*
 * See if a backup is needed.  We do a backup unless both files are
 * byte-for-byte identical.
 */
static int
backend_check_backup_needed(const char *rep_name, const char *backup_name)
{
	int repfd = open(rep_name, O_RDONLY);
	int fd = open(backup_name, O_RDONLY);
	struct stat s_rep, s_backup;
	int c1, c2;

	FILE *f_rep = NULL;
	FILE *f_backup = NULL;

	if (repfd < 0 || fd < 0)
		goto fail;

	if (fstat(repfd, &s_rep) < 0 || fstat(fd, &s_backup) < 0)
		goto fail;

	/*
	 * if they are the same file, we need to do a backup to break the
	 * hard link or symlink involved.
	 */
	if (s_rep.st_ino == s_backup.st_ino && s_rep.st_dev == s_backup.st_dev)
		goto fail;

	if (s_rep.st_size != s_backup.st_size)
		goto fail;

	if ((f_rep = fdopen(repfd, "r")) == NULL ||
	    (f_backup = fdopen(fd, "r")) == NULL)
		goto fail;

	do {
		c1 = getc(f_rep);
		c2 = getc(f_backup);
		if (c1 != c2)
			goto fail;
	} while (c1 != EOF);

	if (!ferror(f_rep) && !ferror(f_backup)) {
		(void) fclose(f_rep);
		(void) fclose(f_backup);
		(void) close(repfd);
		(void) close(fd);
		return (0);
	}

fail:
	if (f_rep != NULL)
		(void) fclose(f_rep);
	if (f_backup != NULL)
		(void) fclose(f_backup);
	if (repfd >= 0)
		(void) close(repfd);
	if (fd >= 0)
		(void) close(fd);
	return (1);
}

/*
 * This interface is called to perform the actual copy
 *
 * Return:
 *	_FAIL_UNKNOWN		read/write fails
 *	_FAIL_NO_RESOURCES	out of memory
 *	_SUCCESS		copy succeeds
 */
static rep_protocol_responseid_t
backend_do_copy(const char *src, int srcfd, const char *dst,
    int dstfd, size_t *sz)
{
	char *buf;
	off_t nrd, nwr, n, r_off = 0, w_off = 0;

	if ((buf = malloc(8192)) == NULL)
		return (REP_PROTOCOL_FAIL_NO_RESOURCES);

	while ((nrd = read(srcfd, buf, 8192)) != 0) {
		if (nrd < 0) {
			if (errno == EINTR)
				continue;

			configd_critical(
			    "Backend copy failed: fails to read from %s "
			    "at offset %d: %s\n", src, r_off, strerror(errno));
			free(buf);
			return (REP_PROTOCOL_FAIL_UNKNOWN);
		}

		r_off += nrd;

		nwr = 0;
		do {
			if ((n = write(dstfd, &buf[nwr], nrd - nwr)) < 0) {
				if (errno == EINTR)
					continue;

				configd_critical(
				    "Backend copy failed: fails to write to %s "
				    "at offset %d: %s\n", dst, w_off,
				    strerror(errno));
				free(buf);
				return (REP_PROTOCOL_FAIL_UNKNOWN);
			}

			nwr += n;
			w_off += n;

		} while (nwr < nrd);
	}

	if (sz)
		*sz = w_off;

	free(buf);
	return (REP_PROTOCOL_SUCCESS);
}

/*
 * Can return:
 *	_BAD_REQUEST		name is not valid
 *	_TRUNCATED		name is too long for current repository path
 *	_UNKNOWN		failed for unknown reason (details written to
 *				console)
 *	_BACKEND_READONLY	backend is not writable
 *	_NO_RESOURCES		out of memory
 *	_SUCCESS		Backup completed successfully.
 */
static rep_protocol_responseid_t
backend_create_backup_locked(sqlite_backend_t *be, const char *name)
{
	const char **old_list;
	ssize_t old_sz;
	ssize_t old_max = max_repository_backups;
	ssize_t cur;
	char *finalname;
	char *finalpath;
	char *tmppath;
	int infd, outfd;
	size_t len;
	time_t now;
	struct tm now_tm;
	be_flight_status_t backup_type;
	rep_protocol_responseid_t result;
	const char *src;
	int use_checkpoint;

	if (strcmp(name, REPOSITORY_BOOT_BACKUP) == 0) {
		backup_type = BE_FLIGHT_ST_BOOT_BACKUP;
	} else if (strcmp(name, "manifest_import") ==  0) {
		backup_type = BE_FLIGHT_ST_MI_BACKUP;
	} else {
		backup_type = BE_FLIGHT_ST_OTHER_BACKUP;
	}
	flight_recorder_event(BE_FLIGHT_EV_BACKUP_ENTER, backup_type);

	if ((finalpath = malloc(PATH_MAX)) == NULL)
		return (REP_PROTOCOL_FAIL_NO_RESOURCES);

	if ((tmppath = malloc(PATH_MAX)) == NULL) {
		free(finalpath);
		return (REP_PROTOCOL_FAIL_NO_RESOURCES);
	}

	if (be->be_readonly) {
		flight_recorder_event(BE_FLIGHT_EV_NO_BACKUP, BE_FLIGHT_ST_RO);
		result = REP_PROTOCOL_FAIL_BACKEND_READONLY;
		goto out;
	}

	result = backend_backup_base(be, name, finalpath, PATH_MAX);
	if (result != REP_PROTOCOL_SUCCESS)
		goto out;

	/*
	 * If this is a boot backup and if we made a checkpoint before the
	 * root file system became read/write, then we should use the
	 * checkpoint as the source.  Otherwise, we'll use the actual
	 * repository as the source.
	 */
	if (be->be_checkpoint && name &&
	    strcmp(REPOSITORY_BOOT_BACKUP, name) == 0) {
		backup_type = BE_FLIGHT_ST_CHECKPOINT_BACKUP;
		use_checkpoint = 1;
		src = be->be_checkpoint;
	} else {
		backup_type = BE_FLIGHT_ST_REPO_BACKUP;
		use_checkpoint = 0;
		src = be->be_path;
	}
	flight_recorder_event(BE_FLIGHT_EV_BACKUP, backup_type);
	if (!backend_check_backup_needed(src, finalpath)) {
		/*
		 * No changes, so there is no need for a backup.
		 */
		flight_recorder_event(BE_FLIGHT_EV_NO_BACKUP,
		    BE_FLIGHT_ST_DUPLICATE);
		result = REP_PROTOCOL_SUCCESS;
		goto out;
	}

	/*
	 * remember the original length, and the basename location
	 */
	len = strlen(finalpath);
	finalname = strrchr(finalpath, '/');
	if (finalname != NULL)
		finalname++;
	else
		finalname = finalpath;

	(void) strlcpy(tmppath, finalpath, PATH_MAX);
	if (strlcat(tmppath, "-tmpXXXXXX", PATH_MAX) >= PATH_MAX) {
		result = REP_PROTOCOL_FAIL_TRUNCATED;
		goto out;
	}

	now = time(NULL);
	if (localtime_r(&now, &now_tm) == NULL) {
		configd_critical(
		    "\"%s\" backup failed: localtime(3C) failed: %s\n", name,
		    strerror(errno));
		result = REP_PROTOCOL_FAIL_UNKNOWN;
		goto out;
	}

	if (strftime(finalpath + len, PATH_MAX - len,
	    "-%Y""%m""%d""_""%H""%M""%S", &now_tm) >= PATH_MAX - len) {
		result = REP_PROTOCOL_FAIL_TRUNCATED;
		goto out;
	}

	infd = open(src, O_RDONLY);
	if (infd < 0) {
		configd_critical("\"%s\" backup failed: opening %s: %s\n", name,
		    src, strerror(errno));
		result = REP_PROTOCOL_FAIL_UNKNOWN;
		goto out;
	}

	outfd = mkstemp(tmppath);
	if (outfd < 0) {
		configd_critical("\"%s\" backup failed: mkstemp(%s): %s\n",
		    name, tmppath, strerror(errno));
		(void) close(infd);
		result = REP_PROTOCOL_FAIL_UNKNOWN;
		goto out;
	}

	if ((result = backend_do_copy(src, infd, (const char *)tmppath,
	    outfd, NULL)) != REP_PROTOCOL_SUCCESS)
		goto fail;

	/*
	 * grab the old list before doing our re-name.
	 */
	if (old_max > 0)
		old_sz = backend_backup_get_prev(finalpath, len, &old_list);

	if (rename(tmppath, finalpath) < 0) {
		configd_critical(
		    "\"%s\" backup failed: rename(%s, %s): %s\n",
		    name, tmppath, finalpath, strerror(errno));
		result = REP_PROTOCOL_FAIL_UNKNOWN;
		goto fail;
	}

	tmppath[len] = 0;	/* strip -XXXXXX, for reference symlink */

	(void) unlink(tmppath);
	if (symlink(finalname, tmppath) < 0) {
		configd_critical(
		    "\"%s\" backup completed, but updating "
		    "\"%s\" symlink to \"%s\" failed: %s\n",
		    name, tmppath, finalname, strerror(errno));
	}

	if (old_max > 0 && old_sz > 0) {
		/* unlink all but the first (old_max - 1) files */
		for (cur = old_max - 1; cur < old_sz; cur++) {
			(void) strlcpy(finalname, old_list[cur],
			    PATH_MAX - (finalname - finalpath));
			if (unlink(finalpath) < 0)
				configd_critical(
				    "\"%s\" backup completed, but removing old "
				    "file \"%s\" failed: %s\n",
				    name, finalpath, strerror(errno));
		}

		backend_backup_cleanup(old_list, old_sz);
	}

	result = REP_PROTOCOL_SUCCESS;
	flight_recorder_event(BE_FLIGHT_EV_BACKUP, BE_FLIGHT_ST_SUCCESS);

fail:
	(void) close(infd);
	(void) close(outfd);
	if (result != REP_PROTOCOL_SUCCESS) {
		flight_recorder_event(BE_FLIGHT_EV_BACKUP, BE_FLIGHT_ST_FAIL);
		(void) unlink(tmppath);
	}

out:
	/* Get rid of the checkpoint file now that we've used it. */
	if (use_checkpoint && (result == REP_PROTOCOL_SUCCESS)) {
		(void) unlink(be->be_checkpoint);
		be->be_checkpoint = NULL;
	}
	free(finalpath);
	free(tmppath);

	return (result);
}

/*
 * Check if value_tbl has been upgraded in the main database,  and
 * if not (if the value_order column is not present),  and do_upgrade is true,
 * upgrade value_tbl in repository to contain the additional value_order
 * column. The version of sqlite used means ALTER TABLE is not
 * available, so we cannot simply use "ALTER TABLE value_tbl ADD COLUMN".
 * Rather we need to create a temporary table with the additional column,
 * import the value_tbl, drop the original value_tbl, recreate the value_tbl
 * with the additional column, import the values from value_tbl_tmp,
 * reindex and finally drop value_tbl_tmp.  During boot, we wish to check
 * if the repository has been upgraded before it is writable,  so that
 * property value retrieval can use the appropriate form of the SELECT
 * statement that retrieves property values.  As a result, we need to check
 * if the repository has been upgraded prior to the point when we can
 * actually carry out the update.
 */
void
backend_check_upgrade(sqlite_backend_t *be, boolean_t do_upgrade)
{
	char *errp;
	int r;

	if (be_normal_upgraded)
		return;
	/*
	 * Test if upgrade is needed. If value_order column does not exist,
	 * we need to upgrade the schema.
	 */
	r = sqlite_exec(be->be_db, "SELECT value_order FROM value_tbl LIMIT 1;",
	    NULL, NULL, NULL);
	if (r == SQLITE_ERROR && do_upgrade) {
		/* No value_order column - needs upgrade */
		configd_info("Upgrading SMF repository format...");
		r = sqlite_exec(be->be_db,
		    "BEGIN TRANSACTION; "
		    "CREATE TABLE value_tbl_tmp ( "
		    "value_id   INTEGER NOT NULL, "
		    "value_type CHAR(1) NOT NULL, "
		    "value_value VARCHAR NOT NULL, "
		    "value_order INTEGER DEFAULT 0); "
		    "INSERT INTO value_tbl_tmp "
		    "(value_id, value_type, value_value) "
		    "SELECT value_id, value_type, value_value FROM value_tbl; "
		    "DROP TABLE value_tbl; "
		    "CREATE TABLE value_tbl( "
		    "value_id   INTEGER NOT NULL, "
		    "value_type CHAR(1) NOT NULL, "
		    "value_value VARCHAR NOT NULL, "
		    "value_order INTEGER DEFAULT 0); "
		    "INSERT INTO value_tbl SELECT * FROM value_tbl_tmp; "
		    "CREATE INDEX value_tbl_id ON value_tbl (value_id); "
		    "DROP TABLE value_tbl_tmp; "
		    "COMMIT TRANSACTION; "
		    "VACUUM; ",
		    NULL, NULL, &errp);
		if (r == SQLITE_OK) {
			configd_info("SMF repository upgrade is complete.");
		} else {
			backend_panic("%s: repository upgrade failed: %s",
			    be->be_path, errp);
			/* NOTREACHED */
		}
	}
	if (r == SQLITE_OK)
		be_normal_upgraded = B_TRUE;
	else
		be_normal_upgraded = B_FALSE;
}

static int
backend_check_readonly(sqlite_backend_t *be, int writing, hrtime_t t)
{
	const char *check_path;
	char *errp;
	struct sqlite *new;
	int r;

	assert(be->be_readonly);
	assert(be == bes[BACKEND_TYPE_NORMAL]);

	/*
	 * If we don't *need* to be writable, only check every once in a
	 * while.
	 */
	if (!writing) {
		if ((uint64_t)(t - be->be_lastcheck) <
		    BACKEND_READONLY_CHECK_INTERVAL)
			return (REP_PROTOCOL_SUCCESS);
		be->be_lastcheck = t;
	}

	/*
	 * It could be that the repository has been moved to non-persistent
	 * storage for performance reasons.  In this case we need to check
	 * the persistent path to see if it is writable.  The
	 * non-persistent path will always be writable.
	 */
	check_path = IS_VOLATILE(be) ? be->be_ppath : be->be_path;

	new = sqlite_open(check_path, 0600, &errp);
	if (new == NULL) {
		backend_panic("reopening %s: %s\n", check_path, errp);
		/*NOTREACHED*/
	}
	r = backend_is_readonly(new, check_path);

	if (r != SQLITE_OK) {
		/*
		 * The underlying storage for the permanent repository is
		 * still read-only, so we don't want to change the state or
		 * move the checkpointed backup if it exists.  On the other
		 * hand if the repository has been copied to volatile
		 * storage, we'll let our caller go ahead and write to the
		 * database.
		 */
		sqlite_close(new);
		if (writing && (IS_VOLATILE(be) == 0))
			return (REP_PROTOCOL_FAIL_BACKEND_READONLY);
		return (REP_PROTOCOL_SUCCESS);
	}

	/*
	 * We can write!  If the repository is not on volatile storage,
	 * swap the db handles.  Mark ourself as writable, upgrade the
	 * repository if necessary and make a backup.
	 */
	be->be_readonly = 0;
	flight_recorder_event(BE_FLIGHT_EV_TRANS_RW, BE_FLIGHT_ST_RW);
	if (IS_VOLATILE(be)) {
		/*
		 * If the repository is on volatile storage, don't switch
		 * the handles.  We'll continue to use the repository that
		 * is on tmpfs until we're told to move it back by one of
		 * our clients.  Clients, specifically manifest_import,
		 * move the repository to tmpfs for performance reasons,
		 * and that is the reason to not switch it back until we're
		 * told to do so.
		 */
		flight_recorder_event(BE_FLIGHT_EV_TRANS_RW,
		    BE_FLIGHT_ST_NO_SWITCH);
		sqlite_close(new);
	} else {
		flight_recorder_event(BE_FLIGHT_EV_TRANS_RW,
		    BE_FLIGHT_ST_SWITCH);
		sqlite_close(be->be_db);
		be->be_db = new;
	}

	if (be->be_type == BACKEND_TYPE_NORMAL)
		backend_check_upgrade(be, B_TRUE);

	if (backend_create_backup_locked(be, REPOSITORY_BOOT_BACKUP) !=
	    REP_PROTOCOL_SUCCESS) {
		configd_critical(
		    "unable to create \"%s\" backup of \"%s\"\n",
		    REPOSITORY_BOOT_BACKUP, be->be_path);
	}

	return (REP_PROTOCOL_SUCCESS);
}

/*
 * If t is not BACKEND_TYPE_NORMAL, can fail with
 *   _BACKEND_ACCESS - backend does not exist
 *
 * If writing is nonzero, can also fail with
 *   _BACKEND_READONLY - backend is read-only
 */
static int
backend_lock(backend_type_t t, int writing, sqlite_backend_t **bep)
{
	sqlite_backend_t *be = NULL;
	hrtime_t ts, vts;

	*bep = NULL;

	assert(t == BACKEND_TYPE_NORMAL ||
	    t == BACKEND_TYPE_NONPERSIST);

	be = bes[t];
	if (t == BACKEND_TYPE_NORMAL)
		assert(be != NULL);		/* should always be there */

	if (be == NULL)
		return (REP_PROTOCOL_FAIL_BACKEND_ACCESS);

	if (backend_panic_thread != 0)
		backend_panic(NULL);		/* don't proceed */

	ts = gethrtime();
	vts = gethrvtime();
	(void) pthread_mutex_lock(&be->be_lock);
	UPDATE_TOTALS_WR(be, writing, bt_lock, ts, vts);

	if (backend_panic_thread != 0) {
		(void) pthread_mutex_unlock(&be->be_lock);
		backend_panic(NULL);		/* don't proceed */
	}
	be->be_thread = pthread_self();

	if (be->be_readonly) {
		int r;
		assert(t == BACKEND_TYPE_NORMAL);

		r = backend_check_readonly(be, writing, ts);
		if (r != REP_PROTOCOL_SUCCESS) {
			be->be_thread = 0;
			(void) pthread_mutex_unlock(&be->be_lock);
			return (r);
		}
	}

	if (backend_do_trace)
		(void) sqlite_trace(be->be_db, backend_trace_sql, be);
	else
		(void) sqlite_trace(be->be_db, NULL, NULL);

	be->be_writing = writing;
	*bep = be;
	return (REP_PROTOCOL_SUCCESS);
}

static void
backend_unlock(sqlite_backend_t *be)
{
	be->be_writing = 0;
	be->be_thread = 0;
	(void) pthread_mutex_unlock(&be->be_lock);
}

static void
backend_destroy(sqlite_backend_t *be)
{
	if (be->be_db != NULL) {
		sqlite_close(be->be_db);
		be->be_db = NULL;
	}
	be->be_thread = 0;
	(void) pthread_mutex_unlock(&be->be_lock);
	(void) pthread_mutex_destroy(&be->be_lock);
}

static void
backend_create_finish(backend_type_t backend_id, sqlite_backend_t *be)
{
	assert(MUTEX_HELD(&be->be_lock));
	assert(be == &be_info[backend_id]);

	bes[backend_id] = be;
	(void) pthread_mutex_unlock(&be->be_lock);
}

static int
backend_fd_write(int fd, const char *mess)
{
	int len = strlen(mess);
	int written;

	while (len > 0) {
		if ((written = write(fd, mess, len)) < 0)
			return (-1);
		mess += written;
		len -= written;
	}
	return (0);
}

/*
 * Can return:
 *	_BAD_REQUEST		name is not valid
 *	_TRUNCATED		name is too long for current repository path
 *	_UNKNOWN		failed for unknown reason (details written to
 *				console)
 *	_BACKEND_READONLY	backend is not writable
 *	_NO_RESOURCES		out of memory
 *	_SUCCESS		Backup completed successfully.
 */
rep_protocol_responseid_t
backend_create_backup(const char *name)
{
	rep_protocol_responseid_t result;
	sqlite_backend_t *be;

	flight_recorder_event(BE_FLIGHT_EV_BACKUP, BE_FLIGHT_ST_CLIENT);
	result = backend_lock(BACKEND_TYPE_NORMAL, 0, &be);
	assert(result == REP_PROTOCOL_SUCCESS);

	result = backend_create_backup_locked(be, name);
	backend_unlock(be);

	return (result);
}

/*
 * This function makes a copy of the repository at src, placing the copy at
 * dst.  It is used to copy a repository on permanent storage to volatile
 * storage or vice versa.  If the source file is on volatile storage, it is
 * often times desirable to delete it after the copy has been made and
 * verified.  To remove the source repository, set remove_src to 1.
 *
 * Can return:
 *
 *	REP_PROTOCOL_SUCCESS		successful copy and rename
 *	REP_PROTOCOL_FAIL_UNKNOWN	file operation error
 *	REP_PROTOCOL_FAIL_NO_RESOURCES	out of memory
 */
static rep_protocol_responseid_t
backend_copy_repository(const char *src, const char *dst, int remove_src)
{
	int srcfd, dstfd;
	char *tmppath = malloc(PATH_MAX);
	rep_protocol_responseid_t res = REP_PROTOCOL_SUCCESS;
	struct stat s_buf;
	size_t cpsz, sz;

	if (tmppath == NULL) {
		res = REP_PROTOCOL_FAIL_NO_RESOURCES;
		goto out;
	}

	/*
	 * Create and open the related db files
	 */
	(void) strlcpy(tmppath, dst, PATH_MAX);
	sz = strlcat(tmppath, "-XXXXXX", PATH_MAX);
	assert(sz < PATH_MAX);
	if (sz >= PATH_MAX) {
		configd_critical(
		    "Backend copy failed: strlcat %s: overflow\n", tmppath);
		abort();
	}

	if ((dstfd = mkstemp(tmppath)) < 0) {
		configd_critical("Backend copy failed: mkstemp %s: %s\n",
		    tmppath, strerror(errno));
		res = REP_PROTOCOL_FAIL_UNKNOWN;
		goto out;
	}

	if ((srcfd = open(src, O_RDONLY)) < 0) {
		configd_critical("Backend copy failed: opening %s: %s\n",
		    src, strerror(errno));
		res = REP_PROTOCOL_FAIL_UNKNOWN;
		goto errexit;
	}

	/*
	 * fstat the backend before copy for sanity check.
	 */
	if (fstat(srcfd, &s_buf) < 0) {
		configd_critical("Backend copy failed: fstat %s: %s\n",
		    src, strerror(errno));
		res = REP_PROTOCOL_FAIL_UNKNOWN;
		goto errexit;
	}

	if ((res = backend_do_copy(src, srcfd, dst, dstfd, &cpsz)) !=
	    REP_PROTOCOL_SUCCESS)
		goto errexit;

	if (cpsz != s_buf.st_size) {
		configd_critical("Backend copy failed: incomplete copy\n");
		res = REP_PROTOCOL_FAIL_UNKNOWN;
		goto errexit;
	}

	/*
	 * Rename tmppath to dst
	 */
	if (rename(tmppath, dst) < 0) {
		configd_critical(
		    "Backend copy failed: rename %s to %s: %s\n",
		    tmppath, dst, strerror(errno));
		res = REP_PROTOCOL_FAIL_UNKNOWN;
	}

errexit:
	if (res != REP_PROTOCOL_SUCCESS && unlink(tmppath) < 0)
		configd_critical(
		    "Backend copy failed: remove %s: %s\n",
		    tmppath, strerror(errno));

	(void) close(srcfd);
	(void) close(dstfd);

out:
	free(tmppath);
	if (remove_src) {
		if (unlink(src) < 0)
			configd_critical(
			    "Backend copy failed: remove %s: %s\n",
			    src, strerror(errno));
	}

	return (res);
}

/*
 * Perform sanity check on the repository.
 * Return 0 if check succeeds or -1 if fails.
 */
static int
backend_switch_check(struct sqlite *be_db, char **errp)
{
	struct run_single_int_info info;
	uint32_t val = -1UL;
	int r;

	info.rs_out = &val;
	info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;

	r = sqlite_exec(be_db,
	    "SELECT schema_version FROM schema_version;",
	    run_single_int_callback, &info, errp);

	if (r == SQLITE_OK &&
	    info.rs_result != REP_PROTOCOL_FAIL_NOT_FOUND &&
	    val == BACKEND_SCHEMA_VERSION)
		return (0);
	else
		return (-1);
}

/*
 * backend_switch() implements the REP_PROTOCOL_SWITCH request from
 * clients.  First, it blocks all other clients from accessing the
 * repository by calling backend_lock to lock the repository.  It either
 * copies the repository from it's permanent storage location
 * (REPOSITORY_DB) to its fast volatile location (FAST_REPOSITORY_DB), or
 * vice versa.  dir determines the direction of the copy.
 *
 *	dir = 0	Copy from permanent location to volatile location.
 *	dir = 1	Copy from volatile location to permanent location.
 *
 * Can return:
 *	REP_PROTOCOL_SUCCESS			successful switch
 *	REP_PROTOCOL_FAIL_BACKEND_ACCESS	backen access fails
 *	REP_PROTOCOL_FAIL_BACKEND_READONLY	backend is not writable
 *	REP_PROTOCOL_FAIL_UNKNOWN		file operation error
 *	REP_PROTOCOL_FAIL_NO_RESOURCES		out of memory
 */
rep_protocol_responseid_t
backend_switch(int dir)
{
	rep_protocol_responseid_t result;
	sqlite_backend_t *be;
	struct sqlite *new;
	char *errp;
	const char *dst;

	flight_recorder_event(BE_FLIGHT_EV_SWITCH, BE_FLIGHT_ST_CLIENT);

	/*
	 * If switching back to the main repository, lock for writing.
	 * Otherwise, lock for reading.
	 */
	result = backend_lock(BACKEND_TYPE_NORMAL, dir ? 1 : 0,
	    &be);
	if (result != REP_PROTOCOL_SUCCESS)
		return (result);

	if (dir) {
		flight_recorder_event(BE_FLIGHT_EV_SWITCH,
		    BE_FLIGHT_ST_PERMANENT);
		dst = REPOSITORY_DB;
	} else {
		flight_recorder_event(BE_FLIGHT_EV_SWITCH,
		    BE_FLIGHT_ST_FAST);
		dst = FAST_REPOSITORY_DB;
	}

	/*
	 * Do the actual copy and rename
	 */
	if (strcmp(be->be_path, dst) == 0) {
		flight_recorder_event(BE_FLIGHT_EV_SWITCH,
		    BE_FLIGHT_ST_DUPLICATE);
		result = REP_PROTOCOL_SUCCESS;
		goto errout;
	}

	result = backend_copy_repository(be->be_path, dst, dir);
	if (result != REP_PROTOCOL_SUCCESS) {
		goto errout;
	}

	/*
	 * Do the backend sanity check and switch
	 */
	new = sqlite_open(dst, 0600, &errp);
	if (new != NULL) {
		/*
		 * Sanity check
		 */
		if (backend_switch_check(new, &errp) == 0) {
			free((char *)be->be_path);
			be->be_path = strdup(dst);
			if (be->be_path == NULL) {
				configd_critical(
				    "Backend switch failed: strdup %s: %s\n",
				    dst, strerror(errno));
				result = REP_PROTOCOL_FAIL_NO_RESOURCES;
				sqlite_close(new);
			} else {
				sqlite_close(be->be_db);
				be->be_db = new;
				if (dir) {
					/* We're back on permanent storage. */
					be->be_ppath = NULL;
				} else {
					/*
					 * Repository is now on volatile
					 * storage.  Save the location of
					 * the persistent repository.
					 */
					be->be_ppath = REPOSITORY_DB;
				}
			}
		} else {
			configd_critical(
			    "Backend switch failed: integrity check %s: %s\n",
			    dst, errp);
			result = REP_PROTOCOL_FAIL_BACKEND_ACCESS;
		}
	} else {
		configd_critical("Backend switch failed: sqlite_open %s: %s\n",
		    dst, errp);
		result = REP_PROTOCOL_FAIL_BACKEND_ACCESS;
	}

errout:
	if (result == REP_PROTOCOL_SUCCESS) {
		flight_recorder_event(BE_FLIGHT_EV_SWITCH,
		    BE_FLIGHT_ST_SUCCESS);
	} else {
		flight_recorder_event(BE_FLIGHT_EV_SWITCH, BE_FLIGHT_ST_FAIL);
	}
	backend_unlock(be);
	return (result);
}

/*
 * This routine is called to attempt the recovery of
 * the most recent valid repository if possible when configd
 * is restarted for some reasons or when system crashes
 * during the switch operation.  The repository databases
 * referenced here are indicators of successful switch
 * operations.
 */
static backend_switch_results_t
backend_switch_recovery(void)
{
	const char *fast_db = FAST_REPOSITORY_DB;
	char *errp = NULL;
	struct stat s_buf;
	struct sqlite *be_db;
	int r;
	backend_switch_results_t res = BACKEND_SWITCH_OK;

	/*
	 * A good transient db containing most recent data can
	 * exist if svc.configd crashes during the
	 * switch operation.  If that is the case, check its
	 * integrity and use it.
	 */
	if (stat(fast_db, &s_buf) < 0) {
		return (BACKEND_SWITCH_OK);
	}

	/* Determine if persistent repository is read-only */
	be_db = sqlite_open(REPOSITORY_DB, 0600, &errp);
	if (be_db == NULL) {
		configd_critical("Unable to open \"%s\".  %s\n",
		    REPOSITORY_DB, errp == NULL ? "" : errp);
		free(errp);
		return (BACKEND_SWITCH_FATAL);
	}
	r = backend_is_readonly(be_db, REPOSITORY_DB);
	sqlite_close(be_db);
	if (r != SQLITE_OK) {
		if (r == SQLITE_READONLY) {
			return (BACKEND_SWITCH_RO);
		}
		return (BACKEND_SWITCH_FATAL);
	}

	/*
	 * Do sanity check on the db
	 */
	be_db = sqlite_open(fast_db, 0600, &errp);

	if (be_db != NULL) {
		if (backend_switch_check(be_db, &errp) == 0) {
			if (backend_copy_repository(fast_db,
			    REPOSITORY_DB, 1) != REP_PROTOCOL_SUCCESS) {
				res = BACKEND_SWITCH_FATAL;
			}
		}
		sqlite_close(be_db);
	}
	free(errp);

	/*
	 * If we get to this point, the fast_db has either been copied or
	 * it is useless.  Either way, get rid of it.
	 */
	(void) unlink(fast_db);

	return (res);
}

/*ARGSUSED*/
static int
backend_integrity_callback(void *private, int narg, char **vals, char **cols)
{
	char **out = private;
	char *old = *out;
	char *new;
	const char *info;
	size_t len;
	int x;

	for (x = 0; x < narg; x++) {
		if ((info = vals[x]) != NULL &&
		    strcmp(info, "ok") != 0) {
			len = (old == NULL)? 0 : strlen(old);
			len += strlen(info) + 2;	/* '\n' + '\0' */

			new = realloc(old, len);
			if (new == NULL)
				return (BACKEND_CALLBACK_ABORT);
			if (old == NULL)
				new[0] = 0;
			old = *out = new;
			(void) strlcat(new, info, len);
			(void) strlcat(new, "\n", len);
		}
	}
	return (BACKEND_CALLBACK_CONTINUE);
}

#define	BACKEND_CREATE_LOCKED		-2
#define	BACKEND_CREATE_FAIL		-1
#define	BACKEND_CREATE_SUCCESS		0
#define	BACKEND_CREATE_READONLY		1
#define	BACKEND_CREATE_NEED_INIT	2
static int
backend_create(backend_type_t backend_id, const char *db_file,
    sqlite_backend_t **bep)
{
	char *errp;
	char *integrity_results = NULL;
	sqlite_backend_t *be;
	int r;
	uint32_t val = -1UL;
	struct run_single_int_info info;
	int fd;

	assert(backend_id >= 0 && backend_id < BACKEND_TYPE_TOTAL);

	be = &be_info[backend_id];

	assert(be->be_db == NULL);

	(void) pthread_mutex_init(&be->be_lock, NULL);
	(void) pthread_mutex_lock(&be->be_lock);

	be->be_type = backend_id;
	be->be_path = strdup(db_file);
	if (be->be_path == NULL) {
		perror("malloc");
		goto fail;
	}

	be->be_db = sqlite_open(be->be_path, 0600, &errp);

	if (be->be_db == NULL) {
		if (strstr(errp, "out of memory") != NULL) {
			configd_critical("%s: %s\n", db_file, errp);
			free(errp);

			goto fail;
		}

		/* report it as an integrity failure */
		integrity_results = errp;
		errp = NULL;
		goto integrity_fail;
	}

	/*
	 * check if we are inited and of the correct schema version
	 *
	 */
	info.rs_out = &val;
	info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;

	r = sqlite_exec(be->be_db, "SELECT schema_version FROM schema_version;",
	    run_single_int_callback, &info, &errp);
	if (r == SQLITE_ERROR &&
	    strcmp("no such table: schema_version", errp) == 0) {
		free(errp);
		/*
		 * Could be an empty repository, could be pre-schema_version
		 * schema.  Check for id_tbl, which has always been there.
		 */
		r = sqlite_exec(be->be_db, "SELECT count() FROM id_tbl;",
		    NULL, NULL, &errp);
		if (r == SQLITE_ERROR &&
		    strcmp("no such table: id_tbl", errp) == 0) {
			free(errp);
			*bep = be;
			return (BACKEND_CREATE_NEED_INIT);
		}

		configd_critical("%s: schema version mismatch\n", db_file);
		goto fail;
	}
	if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
		free(errp);
		*bep = NULL;
		backend_destroy(be);
		return (BACKEND_CREATE_LOCKED);
	}
	if (r == SQLITE_OK) {
		if (info.rs_result == REP_PROTOCOL_FAIL_NOT_FOUND ||
		    val != BACKEND_SCHEMA_VERSION) {
			configd_critical("%s: schema version mismatch\n",
			    db_file);
			goto fail;
		}
	}

	/*
	 * pull in the whole database sequentially.
	 */
	if ((fd = open(db_file, O_RDONLY)) >= 0) {
		size_t sz = 64 * 1024;
		char *buffer = malloc(sz);
		if (buffer != NULL) {
			while (read(fd, buffer, sz) > 0)
				;
			free(buffer);
		}
		(void) close(fd);
	}

	/*
	 * run an integrity check
	 */
	r = sqlite_exec(be->be_db, "PRAGMA integrity_check;",
	    backend_integrity_callback, &integrity_results, &errp);

	if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
		free(errp);
		*bep = NULL;
		backend_destroy(be);
		return (BACKEND_CREATE_LOCKED);
	}
	if (r == SQLITE_ABORT) {
		free(errp);
		errp = NULL;
		integrity_results = "out of memory running integrity check\n";
	} else if (r != SQLITE_OK && integrity_results == NULL) {
		integrity_results = errp;
		errp = NULL;
	}

integrity_fail:
	if (integrity_results != NULL) {
		const char *fname = "/etc/svc/volatile/db_errors";
		if ((fd = open(fname, O_CREAT|O_WRONLY|O_APPEND, 0600)) < 0) {
			fname = NULL;
		} else {
			if (backend_fd_write(fd, "\n\n") < 0 ||
			    backend_fd_write(fd, db_file) < 0 ||
			    backend_fd_write(fd,
			    ": PRAGMA integrity_check; failed.  Results:\n") <
			    0 || backend_fd_write(fd, integrity_results) < 0 ||
			    backend_fd_write(fd, "\n\n") < 0) {
				fname = NULL;
			}
			(void) close(fd);
		}

		if (!is_main_repository ||
		    backend_id == BACKEND_TYPE_NONPERSIST) {
			if (fname != NULL)
				configd_critical(
				    "%s: integrity check failed. Details in "
				    "%s\n", db_file, fname);
			else
				configd_critical(
				    "%s: integrity check failed.\n",
				    db_file);
		} else {
			(void) fprintf(stderr,
"\n"
"svc.configd: smf(5) database integrity check of:\n"
"\n"
"    %s\n"
"\n"
"  failed. The database might be damaged or a media error might have\n"
"  prevented it from being verified.  Additional information useful to\n"
"  your service provider%s%s\n"
"\n"
"  The system will not be able to boot until you have restored a working\n"
"  database.  svc.startd(1M) will provide a sulogin(1M) prompt for recovery\n"
"  purposes.  The command:\n"
"\n"
"    /lib/svc/bin/restore_repository\n"
"\n"
"  can be run to restore a backup version of your repository.  See\n"
"  http://illumos.org/msg/SMF-8000-MY for more information.\n"
"\n",
			    db_file,
			    (fname == NULL)? ":\n\n" : " is in:\n\n    ",
			    (fname == NULL)? integrity_results : fname);
		}
		free(errp);
		goto fail;
	}

	/*
	 * Simply do check if backend has been upgraded.  We do not wish
	 * to actually carry out upgrade here - the main repository may
	 * not be writable at this point.  Actual upgrade is carried out
	 * via backend_check_readonly().  This check is done so that
	 * we determine repository state - upgraded or not - and then
	 * the appropriate SELECT statement (value-ordered or not)
	 * can be used when retrieving property values early in boot.
	 */
	if (backend_id == BACKEND_TYPE_NORMAL)
		backend_check_upgrade(be, B_FALSE);
	/*
	 * check if we are writable
	 */
	r = backend_is_readonly(be->be_db, be->be_path);

	if (r == SQLITE_BUSY || r == SQLITE_LOCKED) {
		free(errp);
		*bep = NULL;
		backend_destroy(be);
		return (BACKEND_CREATE_LOCKED);
	}
	if (r != SQLITE_OK && r != SQLITE_FULL) {
		free(errp);
		be->be_readonly = 1;
		*bep = be;
		return (BACKEND_CREATE_READONLY);
	}

	*bep = be;
	return (BACKEND_CREATE_SUCCESS);

fail:
	*bep = NULL;
	backend_destroy(be);
	return (BACKEND_CREATE_FAIL);
}

/*
 * (arg & -arg) is, through the magic of twos-complement arithmetic, the
 * lowest set bit in arg.
 */
static size_t
round_up_to_p2(size_t arg)
{
	/*
	 * Don't allow a zero result.
	 */
	assert(arg > 0 && ((ssize_t)arg > 0));

	while ((arg & (arg - 1)) != 0)
		arg += (arg & -arg);

	return (arg);
}

/*
 * Returns
 *   _NO_RESOURCES - out of memory
 *   _BACKEND_ACCESS - backend type t (other than _NORMAL) doesn't exist
 *   _DONE - callback aborted query
 *   _SUCCESS
 */
int
backend_run(backend_type_t t, backend_query_t *q,
    backend_run_callback_f *cb, void *data)
{
	char *errmsg = NULL;
	int ret;
	sqlite_backend_t *be;
	hrtime_t ts, vts;

	if (q == NULL || q->bq_buf == NULL)
		return (REP_PROTOCOL_FAIL_NO_RESOURCES);

	if ((ret = backend_lock(t, 0, &be)) != REP_PROTOCOL_SUCCESS)
		return (ret);

	ts = gethrtime();
	vts = gethrvtime();
	ret = sqlite_exec(be->be_db, q->bq_buf, cb, data, &errmsg);
	UPDATE_TOTALS(be, bt_exec, ts, vts);
	ret = backend_error(be, ret, errmsg);
	backend_unlock(be);

	return (ret);
}

/*
 * Starts a "read-only" transaction -- i.e., locks out writers as long
 * as it is active.
 *
 * Fails with
 *   _NO_RESOURCES - out of memory
 *
 * If t is not _NORMAL, can also fail with
 *   _BACKEND_ACCESS - backend does not exist
 *
 * If writable is true, can also fail with
 *   _BACKEND_READONLY
 */
static int
backend_tx_begin_common(backend_type_t t, backend_tx_t **txp, int writable)
{
	backend_tx_t *ret;
	sqlite_backend_t *be;
	int r;

	*txp = NULL;

	ret = uu_zalloc(sizeof (*ret));
	if (ret == NULL)
		return (REP_PROTOCOL_FAIL_NO_RESOURCES);

	if ((r = backend_lock(t, writable, &be)) != REP_PROTOCOL_SUCCESS) {
		uu_free(ret);
		return (r);
	}

	ret->bt_be = be;
	ret->bt_readonly = !writable;
	ret->bt_type = t;
	ret->bt_full = 0;

	*txp = ret;
	return (REP_PROTOCOL_SUCCESS);
}

int
backend_tx_begin_ro(backend_type_t t, backend_tx_t **txp)
{
	return (backend_tx_begin_common(t, txp, 0));
}

static void
backend_tx_end(backend_tx_t *tx)
{
	sqlite_backend_t *be;

	be = tx->bt_be;

	if (tx->bt_full) {
		struct sqlite *new;

		/*
		 * sqlite tends to be sticky with SQLITE_FULL, so we try
		 * to get a fresh database handle if we got a FULL warning
		 * along the way.  If that fails, no harm done.
		 */
		new = sqlite_open(be->be_path, 0600, NULL);
		if (new != NULL) {
			sqlite_close(be->be_db);
			be->be_db = new;
		}
	}
	backend_unlock(be);
	tx->bt_be = NULL;
	uu_free(tx);
}

void
backend_tx_end_ro(backend_tx_t *tx)
{
	assert(tx->bt_readonly);
	backend_tx_end(tx);
}

/*
 * Fails with
 *   _NO_RESOURCES - out of memory
 *   _BACKEND_ACCESS
 *   _BACKEND_READONLY
 */
int
backend_tx_begin(backend_type_t t, backend_tx_t **txp)
{
	int r;
	char *errmsg;
	hrtime_t ts, vts;

	r = backend_tx_begin_common(t, txp, 1);
	if (r != REP_PROTOCOL_SUCCESS)
		return (r);

	ts = gethrtime();
	vts = gethrvtime();
	r = sqlite_exec((*txp)->bt_be->be_db, "BEGIN TRANSACTION", NULL, NULL,
	    &errmsg);
	UPDATE_TOTALS((*txp)->bt_be, bt_exec, ts, vts);
	if (r == SQLITE_FULL)
		(*txp)->bt_full = 1;
	r = backend_error((*txp)->bt_be, r, errmsg);

	if (r != REP_PROTOCOL_SUCCESS) {
		assert(r != REP_PROTOCOL_DONE);
		(void) sqlite_exec((*txp)->bt_be->be_db,
		    "ROLLBACK TRANSACTION", NULL, NULL, NULL);
		backend_tx_end(*txp);
		*txp = NULL;
		return (r);
	}

	(*txp)->bt_readonly = 0;

	return (REP_PROTOCOL_SUCCESS);
}

void
backend_tx_rollback(backend_tx_t *tx)
{
	int r;
	char *errmsg;
	sqlite_backend_t *be;
	hrtime_t ts, vts;

	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
	be = tx->bt_be;

	ts = gethrtime();
	vts = gethrvtime();
	r = sqlite_exec(be->be_db, "ROLLBACK TRANSACTION", NULL, NULL,
	    &errmsg);
	UPDATE_TOTALS(be, bt_exec, ts, vts);
	if (r == SQLITE_FULL)
		tx->bt_full = 1;
	(void) backend_error(be, r, errmsg);

	backend_tx_end(tx);
}

/*
 * Fails with
 *   _NO_RESOURCES - out of memory
 */
int
backend_tx_commit(backend_tx_t *tx)
{
	int r, r2;
	char *errmsg;
	sqlite_backend_t *be;
	hrtime_t ts, vts;

	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
	be = tx->bt_be;
	ts = gethrtime();
	vts = gethrvtime();
	r = sqlite_exec(be->be_db, "COMMIT TRANSACTION", NULL, NULL,
	    &errmsg);
	UPDATE_TOTALS(be, bt_exec, ts, vts);
	if (r == SQLITE_FULL)
		tx->bt_full = 1;

	r = backend_error(be, r, errmsg);
	assert(r != REP_PROTOCOL_DONE);

	if (r != REP_PROTOCOL_SUCCESS) {
		r2 = sqlite_exec(be->be_db, "ROLLBACK TRANSACTION", NULL, NULL,
		    &errmsg);
		r2 = backend_error(be, r2, errmsg);
		if (r2 != REP_PROTOCOL_SUCCESS)
			backend_panic("cannot rollback failed commit");

		backend_tx_end(tx);
		return (r);
	}
	backend_tx_end(tx);
	return (REP_PROTOCOL_SUCCESS);
}

static const char *
id_space_to_name(enum id_space id)
{
	switch (id) {
	case BACKEND_ID_SERVICE_INSTANCE:
		return ("SI");
	case BACKEND_ID_PROPERTYGRP:
		return ("PG");
	case BACKEND_ID_GENERATION:
		return ("GEN");
	case BACKEND_ID_PROPERTY:
		return ("PROP");
	case BACKEND_ID_VALUE:
		return ("VAL");
	case BACKEND_ID_SNAPNAME:
		return ("SNAME");
	case BACKEND_ID_SNAPSHOT:
		return ("SHOT");
	case BACKEND_ID_SNAPLEVEL:
		return ("SLVL");
	default:
		abort();
		/*NOTREACHED*/
	}
}

/*
 * Returns a new id or 0 if the id argument is invalid or the query fails.
 */
uint32_t
backend_new_id(backend_tx_t *tx, enum id_space id)
{
	struct run_single_int_info info;
	uint32_t new_id = 0;
	const char *name = id_space_to_name(id);
	char *errmsg;
	int ret;
	sqlite_backend_t *be;
	hrtime_t ts, vts;

	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
	be = tx->bt_be;

	info.rs_out = &new_id;
	info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;

	ts = gethrtime();
	vts = gethrvtime();
	ret = sqlite_exec_printf(be->be_db,
	    "SELECT id_next FROM id_tbl WHERE (id_name = '%q');"
	    "UPDATE id_tbl SET id_next = id_next + 1 WHERE (id_name = '%q');",
	    run_single_int_callback, &info, &errmsg, name, name);
	UPDATE_TOTALS(be, bt_exec, ts, vts);
	if (ret == SQLITE_FULL)
		tx->bt_full = 1;

	ret = backend_error(be, ret, errmsg);

	if (ret != REP_PROTOCOL_SUCCESS) {
		return (0);
	}

	return (new_id);
}

/*
 * Returns
 *   _NO_RESOURCES - out of memory
 *   _DONE - callback aborted query
 *   _SUCCESS
 */
int
backend_tx_run(backend_tx_t *tx, backend_query_t *q,
    backend_run_callback_f *cb, void *data)
{
	char *errmsg = NULL;
	int ret;
	sqlite_backend_t *be;
	hrtime_t ts, vts;

	assert(tx != NULL && tx->bt_be != NULL);
	be = tx->bt_be;

	if (q == NULL || q->bq_buf == NULL)
		return (REP_PROTOCOL_FAIL_NO_RESOURCES);

	ts = gethrtime();
	vts = gethrvtime();
	ret = sqlite_exec(be->be_db, q->bq_buf, cb, data, &errmsg);
	UPDATE_TOTALS(be, bt_exec, ts, vts);
	if (ret == SQLITE_FULL)
		tx->bt_full = 1;
	ret = backend_error(be, ret, errmsg);

	return (ret);
}

/*
 * Returns
 *   _NO_RESOURCES - out of memory
 *   _NOT_FOUND - the query returned no results
 *   _SUCCESS - the query returned a single integer
 */
int
backend_tx_run_single_int(backend_tx_t *tx, backend_query_t *q, uint32_t *buf)
{
	struct run_single_int_info info;
	int ret;

	info.rs_out = buf;
	info.rs_result = REP_PROTOCOL_FAIL_NOT_FOUND;

	ret = backend_tx_run(tx, q, run_single_int_callback, &info);
	assert(ret != REP_PROTOCOL_DONE);

	if (ret != REP_PROTOCOL_SUCCESS)
		return (ret);

	return (info.rs_result);
}

/*
 * Fails with
 *   _NO_RESOURCES - out of memory
 */
int
backend_tx_run_update(backend_tx_t *tx, const char *format, ...)
{
	va_list a;
	char *errmsg;
	int ret;
	sqlite_backend_t *be;
	hrtime_t ts, vts;

	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
	be = tx->bt_be;

	va_start(a, format);
	ts = gethrtime();
	vts = gethrvtime();
	ret = sqlite_exec_vprintf(be->be_db, format, NULL, NULL, &errmsg, a);
	UPDATE_TOTALS(be, bt_exec, ts, vts);
	if (ret == SQLITE_FULL)
		tx->bt_full = 1;
	va_end(a);
	ret = backend_error(be, ret, errmsg);
	assert(ret != REP_PROTOCOL_DONE);

	return (ret);
}

/*
 * returns REP_PROTOCOL_FAIL_NOT_FOUND if no changes occured
 */
int
backend_tx_run_update_changed(backend_tx_t *tx, const char *format, ...)
{
	va_list a;
	char *errmsg;
	int ret;
	sqlite_backend_t *be;
	hrtime_t ts, vts;

	assert(tx != NULL && tx->bt_be != NULL && !tx->bt_readonly);
	be = tx->bt_be;

	va_start(a, format);
	ts = gethrtime();
	vts = gethrvtime();
	ret = sqlite_exec_vprintf(be->be_db, format, NULL, NULL, &errmsg, a);
	UPDATE_TOTALS(be, bt_exec, ts, vts);
	if (ret == SQLITE_FULL)
		tx->bt_full = 1;
	va_end(a);

	ret = backend_error(be, ret, errmsg);

	return (ret);
}

#define	BACKEND_ADD_SCHEMA(be, file, tbls, idxs) \
	(backend_add_schema((be), (file), \
	    (tbls), sizeof (tbls) / sizeof (*(tbls)), \
	    (idxs), sizeof (idxs) / sizeof (*(idxs))))

static int
backend_add_schema(sqlite_backend_t *be, const char *file,
    struct backend_tbl_info *tbls, int tbl_count,
    struct backend_idx_info *idxs, int idx_count)
{
	int i;
	char *errmsg;
	int ret;

	/*
	 * Create the tables.
	 */
	for (i = 0; i < tbl_count; i++) {
		if (tbls[i].bti_name == NULL) {
			assert(i + 1 == tbl_count);
			break;
		}
		ret = sqlite_exec_printf(be->be_db,
		    "CREATE TABLE %s (%s);\n",
		    NULL, NULL, &errmsg, tbls[i].bti_name, tbls[i].bti_cols);

		if (ret != SQLITE_OK) {
			configd_critical(
			    "%s: %s table creation fails: %s\n", file,
			    tbls[i].bti_name, errmsg);
			free(errmsg);
			return (-1);
		}
	}

	/*
	 * Make indices on key tables and columns.
	 */
	for (i = 0; i < idx_count; i++) {
		if (idxs[i].bxi_tbl == NULL) {
			assert(i + 1 == idx_count);
			break;
		}

		ret = sqlite_exec_printf(be->be_db,
		    "CREATE INDEX %s_%s ON %s (%s);\n",
		    NULL, NULL, &errmsg, idxs[i].bxi_tbl, idxs[i].bxi_idx,
		    idxs[i].bxi_tbl, idxs[i].bxi_cols);

		if (ret != SQLITE_OK) {
			configd_critical(
			    "%s: %s_%s index creation fails: %s\n", file,
			    idxs[i].bxi_tbl, idxs[i].bxi_idx, errmsg);
			free(errmsg);
			return (-1);
		}
	}
	return (0);
}

static int
backend_init_schema(sqlite_backend_t *be, const char *db_file, backend_type_t t)
{
	int i;
	char *errmsg;
	int ret;

	assert(t == BACKEND_TYPE_NORMAL || t == BACKEND_TYPE_NONPERSIST);

	if (t == BACKEND_TYPE_NORMAL) {
		ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_normal, idxs_normal);
	} else if (t == BACKEND_TYPE_NONPERSIST) {
		ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_np, idxs_np);
	} else {
		abort();		/* can't happen */
	}

	if (ret < 0) {
		return (ret);
	}

	ret = BACKEND_ADD_SCHEMA(be, db_file, tbls_common, idxs_common);
	if (ret < 0) {
		return (ret);
	}

	/*
	 * Add the schema version to the table
	 */
	ret = sqlite_exec_printf(be->be_db,
	    "INSERT INTO schema_version (schema_version) VALUES (%d)",
	    NULL, NULL, &errmsg, BACKEND_SCHEMA_VERSION);
	if (ret != SQLITE_OK) {
		configd_critical(
		    "setting schema version fails: %s\n", errmsg);
		free(errmsg);
	}

	/*
	 * Populate id_tbl with initial IDs.
	 */
	for (i = 0; i < BACKEND_ID_INVALID; i++) {
		const char *name = id_space_to_name(i);

		ret = sqlite_exec_printf(be->be_db,
		    "INSERT INTO id_tbl (id_name, id_next) "
		    "VALUES ('%q', %d);", NULL, NULL, &errmsg, name, 1);
		if (ret != SQLITE_OK) {
			configd_critical(
			    "id insertion for %s fails: %s\n", name, errmsg);
			free(errmsg);
			return (-1);
		}
	}
	/*
	 * Set the persistance of the database.  The normal database is marked
	 * "synchronous", so that all writes are synchronized to stable storage
	 * before proceeding.
	 */
	ret = sqlite_exec_printf(be->be_db,
	    "PRAGMA default_synchronous = %s; PRAGMA synchronous = %s;",
	    NULL, NULL, &errmsg,
	    (t == BACKEND_TYPE_NORMAL)? "ON" : "OFF",
	    (t == BACKEND_TYPE_NORMAL)? "ON" : "OFF");
	if (ret != SQLITE_OK) {
		configd_critical("pragma setting fails: %s\n", errmsg);
		free(errmsg);
		return (-1);
	}

	return (0);
}

int
backend_init(const char *db_file, const char *npdb_file, int have_np)
{
	sqlite_backend_t *be;
	char *errp;
	struct sqlite *fast_db;
	int r;
	backend_switch_results_t switch_result = BACKEND_SWITCH_OK;
	int writable_persist = 1;

	/* set up our temporary directory */
	sqlite_temp_directory = "/etc/svc/volatile";

	if (strcmp(SQLITE_VERSION, sqlite_version) != 0) {
		configd_critical("Mismatched link!  (%s should be %s)\n",
		    sqlite_version, SQLITE_VERSION);
		return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
	}

	if (db_file == NULL)
		db_file = REPOSITORY_DB;
	if (strcmp(db_file, REPOSITORY_DB) != 0) {
		is_main_repository = 0;
	}

	/*
	 * If the svc.configd crashed, there might be a leftover transient
	 * database at FAST_REPOSITORY_DB,which contains useful
	 * information.  Both early manifest import and late manifest
	 * import use svcadm to copy the repository to FAST_REPOSITORY_DB.
	 * One reason for doing this is that it improves the performance of
	 * manifest import.  The other reason is that the repository may be
	 * on read-only root in the case of early manifest import.
	 *
	 * If FAST_REPOSITORY_DB exists, it is an indication that
	 * svc.configd has been restarted for some reason.  Since we have
	 * no way of knowing where we are in the boot process, the safe
	 * thing to do is to move the repository back to it's non-transient
	 * location, REPOSITORY_DB.  This may slow manifest import
	 * performance, but it avoids the problem of missing the command to
	 * move the repository to permanent storage.
	 *
	 * There is a caveat, though.  If root is read-only, we'll need to
	 * leave the repository at FAST_REPOSITORY_DB.  If root is
	 * read-only, late manifest import has not yet run, so it will move
	 * the repository back to permanent storage when it runs.
	 */
	if (is_main_repository)
		switch_result = backend_switch_recovery();

	r = backend_create(BACKEND_TYPE_NORMAL, db_file, &be);
	switch (r) {
	case BACKEND_CREATE_FAIL:
		return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
	case BACKEND_CREATE_LOCKED:
		return (CONFIGD_EXIT_DATABASE_LOCKED);
	case BACKEND_CREATE_SUCCESS:
		break;		/* success */
	case BACKEND_CREATE_READONLY:
		writable_persist = 0;
		break;
	case BACKEND_CREATE_NEED_INIT:
		if (backend_init_schema(be, db_file, BACKEND_TYPE_NORMAL)) {
			backend_destroy(be);
			return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
		}
		break;
	default:
		abort();
		/*NOTREACHED*/
	}
	backend_create_finish(BACKEND_TYPE_NORMAL, be);
	flight_recorder_event(BE_FLIGHT_EV_REPO_CREATE,
	    writable_persist == 1 ? BE_FLIGHT_ST_RW : BE_FLIGHT_ST_RO);
	/*
	 * If there was a transient repository that could not be copied
	 * back because the root file system was read-only, switch over to
	 * using the transient repository.
	 */
	if (switch_result == BACKEND_SWITCH_RO) {
		char *db_name_copy = NULL;

		fast_db = sqlite_open(FAST_REPOSITORY_DB, 0600, &errp);
		if (fast_db == NULL) {
			/* Can't open fast repository.  Stick with permanent. */
			configd_critical("Cannot open \"%s\".  %s\n",
			    FAST_REPOSITORY_DB, errp == NULL ? "" : errp);
			free(errp);
		} else {
			db_name_copy = strdup(FAST_REPOSITORY_DB);
			if (db_name_copy == NULL) {
				configd_critical("backend_init: out of "
				    "memory.\n");
				sqlite_close(fast_db);
				return (CONFIGD_EXIT_INIT_FAILED);
			} else {
				flight_recorder_event(
				    BE_FLIGHT_EV_LINGERING_FAST,
				    BE_FLIGHT_ST_RO);
				sqlite_close(be->be_db);
				be->be_db = fast_db;
				be->be_ppath = be->be_path;
				be->be_path = db_name_copy;
			}
		}
	}

	if (have_np) {
		if (npdb_file == NULL)
			npdb_file = NONPERSIST_DB;

		r = backend_create(BACKEND_TYPE_NONPERSIST, npdb_file, &be);
		switch (r) {
		case BACKEND_CREATE_SUCCESS:
			break;		/* success */
		case BACKEND_CREATE_FAIL:
			return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
		case BACKEND_CREATE_LOCKED:
			return (CONFIGD_EXIT_DATABASE_LOCKED);
		case BACKEND_CREATE_READONLY:
			configd_critical("%s: unable to write\n", npdb_file);
			return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
		case BACKEND_CREATE_NEED_INIT:
			if (backend_init_schema(be, db_file,
			    BACKEND_TYPE_NONPERSIST)) {
				backend_destroy(be);
				return (CONFIGD_EXIT_DATABASE_INIT_FAILED);
			}
			break;
		default:
			abort();
			/*NOTREACHED*/
		}
		backend_create_finish(BACKEND_TYPE_NONPERSIST, be);

		if (r != BACKEND_CREATE_NEED_INIT) {
			flight_recorder_event(BE_FLIGHT_EV_RESTART,
			    BE_FLIGHT_ST_INFO);
		}

		/*
		 * If we started up with a writable filesystem, but the
		 * non-persistent database needed initialization, we are
		 * booting a non-global zone or a system with a writable
		 * root (ZFS), so do a backup.  Checking to see if the
		 * non-persistent database needed initialization also keeps
		 * us from making additional backups if configd gets
		 * restarted.
		 */
		if (r == BACKEND_CREATE_NEED_INIT && writable_persist &&
		    backend_lock(BACKEND_TYPE_NORMAL, 0, &be) ==
		    REP_PROTOCOL_SUCCESS) {
			if (backend_create_backup_locked(be,
			    REPOSITORY_BOOT_BACKUP) != REP_PROTOCOL_SUCCESS) {
				configd_critical(
				    "unable to create \"%s\" backup of "
				    "\"%s\"\n", REPOSITORY_BOOT_BACKUP,
				    be->be_path);
			}
			backend_unlock(be);
		}

		/*
		 * On the other hand if we started with a read-only file
		 * system and the non-persistent database needed
		 * initialization, then we need to take a checkpoint of the
		 * repository.  We grab the checkpoint now before Early
		 * Manifest Import starts modifying the repository.  Then
		 * when the file system becomes writable, the checkpoint
		 * can be used to create the boot time backup of the
		 * repository.  Checking that the non-persistent database
		 * needed initialization, keeps us from making additional
		 * checkpoints if configd gets restarted.
		 */
		if (r == BACKEND_CREATE_NEED_INIT && writable_persist == 0 &&
		    backend_lock(BACKEND_TYPE_NORMAL, 0, &be) ==
		    REP_PROTOCOL_SUCCESS) {
			r = backend_checkpoint_repository(be);
			if (r != REP_PROTOCOL_SUCCESS) {
				configd_critical("unable to create checkpoint "
				    "of \"%s\"\n", be->be_path);
			}
			backend_unlock(be);
		}

		/*
		 * If the non-persistent database did not need
		 * initialization, svc.configd has been restarted.  See if
		 * the boot time checkpoint exists.  If it does, use it to
		 * make a backup if root is writable.
		 */
		if (r != BACKEND_CREATE_NEED_INIT &&
		    backend_lock(BACKEND_TYPE_NORMAL, 0, &be) ==
		    REP_PROTOCOL_SUCCESS) {
			struct stat sb;

			if ((stat(REPOSITORY_CHECKPOINT, &sb) == 0) &&
			    (sb.st_size > 0) && (sb.st_mode & S_IFREG)) {
				be->be_checkpoint = REPOSITORY_CHECKPOINT;
				flight_recorder_event(
				    BE_FLIGHT_EV_CHECKPOINT_EXISTS,
				    BE_FLIGHT_ST_INFO);
			}

			/*
			 * If we have a checkpoint and root is writable,
			 * make the backup now.
			 */
			if (be->be_checkpoint && writable_persist) {
				if (backend_create_backup_locked(be,
				    REPOSITORY_BOOT_BACKUP) !=
				    REP_PROTOCOL_SUCCESS) {
					configd_critical(
					    "unable to create \"%s\" backup of "
					    "\"%s\"\n", REPOSITORY_BOOT_BACKUP,
					    be->be_path);
				}
			}
			backend_unlock(be);
		}
	}

	/*
	 * If the persistent backend is writable at this point, upgrade it.
	 * This can occur in a few cases, most notably on UFS roots if
	 * we are operating on the backend from another root, as is the case
	 * during alternate-root BFU.
	 *
	 * Otherwise, upgrade will occur via backend_check_readonly() when
	 * the repository is re-opened read-write.
	 */
	if (writable_persist) {
		r = backend_lock(BACKEND_TYPE_NORMAL, 1, &be);
		assert(r == REP_PROTOCOL_SUCCESS);
		backend_check_upgrade(be, B_TRUE);
		backend_unlock(be);
	}

	return (CONFIGD_EXIT_OKAY);
}

/*
 * quiesce all database activity prior to exiting
 */
void
backend_fini(void)
{
	sqlite_backend_t *be_normal, *be_np;

	(void) backend_lock(BACKEND_TYPE_NORMAL, 1, &be_normal);
	(void) backend_lock(BACKEND_TYPE_NONPERSIST, 1, &be_np);
}

#define	QUERY_BASE	128
backend_query_t *
backend_query_alloc(void)
{
	backend_query_t *q;
	q = calloc(1, sizeof (backend_query_t));
	if (q != NULL) {
		q->bq_size = QUERY_BASE;
		q->bq_buf = calloc(1, q->bq_size);
		if (q->bq_buf == NULL) {
			q->bq_size = 0;
		}

	}
	return (q);
}

void
backend_query_append(backend_query_t *q, const char *value)
{
	char *alloc;
	int count;
	size_t size, old_len;

	if (q == NULL) {
		/* We'll discover the error when we try to run the query. */
		return;
	}

	while (q->bq_buf != NULL) {
		old_len = strlen(q->bq_buf);
		size = q->bq_size;
		count = strlcat(q->bq_buf, value, size);

		if (count < size)
			break;				/* success */

		q->bq_buf[old_len] = 0;
		size = round_up_to_p2(count + 1);

		assert(size > q->bq_size);
		alloc = realloc(q->bq_buf, size);
		if (alloc == NULL) {
			free(q->bq_buf);
			q->bq_buf = NULL;
			break;				/* can't grow */
		}

		q->bq_buf = alloc;
		q->bq_size = size;
	}
}

void
backend_query_add(backend_query_t *q, const char *format, ...)
{
	va_list args;
	char *new;

	if (q == NULL || q->bq_buf == NULL)
		return;

	va_start(args, format);
	new = sqlite_vmprintf(format, args);
	va_end(args);

	if (new == NULL) {
		free(q->bq_buf);
		q->bq_buf = NULL;
		return;
	}

	backend_query_append(q, new);

	free(new);
}

void
backend_query_free(backend_query_t *q)
{
	if (q != NULL) {
		if (q->bq_buf != NULL) {
			free(q->bq_buf);
		}
		free(q);
	}
}