/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (C) 2012-2016 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_nvme.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/sysctl.h>

#include "nvme_private.h"

#ifndef NVME_USE_NVD
#define NVME_USE_NVD 0
#endif

int nvme_use_nvd = NVME_USE_NVD;
bool nvme_verbose_cmd_dump = false;

SYSCTL_NODE(_hw, OID_AUTO, nvme, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NVMe sysctl tunables");
SYSCTL_INT(_hw_nvme, OID_AUTO, use_nvd, CTLFLAG_RDTUN,
    &nvme_use_nvd, 1, "1 = Create NVD devices, 0 = Create NDA devices");
SYSCTL_BOOL(_hw_nvme, OID_AUTO, verbose_cmd_dump, CTLFLAG_RWTUN,
    &nvme_verbose_cmd_dump, 0,
    "enable verbose command printing when a command fails");

static void
nvme_dump_queue(struct nvme_qpair *qpair)
{
	struct nvme_completion *cpl;
	struct nvme_command *cmd;
	int i;

	printf("id:%04Xh phase:%d\n", qpair->id, qpair->phase);

	printf("Completion queue:\n");
	for (i = 0; i < qpair->num_entries; i++) {
		cpl = &qpair->cpl[i];
		printf("%05d: ", i);
		nvme_qpair_print_completion(qpair, cpl);
	}

	printf("Submission queue:\n");
	for (i = 0; i < qpair->num_entries; i++) {
		cmd = &qpair->cmd[i];
		printf("%05d: ", i);
		nvme_qpair_print_command(qpair, cmd);
	}
}

static int
nvme_sysctl_dump_debug(SYSCTL_HANDLER_ARGS)
{
	struct nvme_qpair 	*qpair = arg1;
	uint32_t		val = 0;

	int error = sysctl_handle_int(oidp, &val, 0, req);

	if (error)
		return (error);

	if (val != 0)
		nvme_dump_queue(qpair);

	return (0);
}

static int
nvme_sysctl_int_coal_time(SYSCTL_HANDLER_ARGS)
{
	struct nvme_controller *ctrlr = arg1;
	uint32_t oldval = ctrlr->int_coal_time;
	int error = sysctl_handle_int(oidp, &ctrlr->int_coal_time, 0,
	    req);

	if (error)
		return (error);

	if (oldval != ctrlr->int_coal_time)
		nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr,
		    ctrlr->int_coal_time, ctrlr->int_coal_threshold, NULL,
		    NULL);

	return (0);
}

static int
nvme_sysctl_int_coal_threshold(SYSCTL_HANDLER_ARGS)
{
	struct nvme_controller *ctrlr = arg1;
	uint32_t oldval = ctrlr->int_coal_threshold;
	int error = sysctl_handle_int(oidp, &ctrlr->int_coal_threshold, 0,
	    req);

	if (error)
		return (error);

	if (oldval != ctrlr->int_coal_threshold)
		nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr,
		    ctrlr->int_coal_time, ctrlr->int_coal_threshold, NULL,
		    NULL);

	return (0);
}

static int
nvme_sysctl_timeout_period(SYSCTL_HANDLER_ARGS)
{
	uint32_t *ptr = arg1;
	uint32_t newval = *ptr;
	int error = sysctl_handle_int(oidp, &newval, 0, req);

	if (error || (req->newptr == NULL))
		return (error);

	if (newval > NVME_MAX_TIMEOUT_PERIOD ||
	    newval < NVME_MIN_TIMEOUT_PERIOD) {
		return (EINVAL);
	} else {
		*ptr = newval;
	}

	return (0);
}

static void
nvme_qpair_reset_stats(struct nvme_qpair *qpair)
{

	/*
	 * Reset the values. Due to sanity checks in
	 * nvme_qpair_process_completions, we reset the number of interrupt
	 * calls to 1.
	 */
	qpair->num_cmds = 0;
	qpair->num_intr_handler_calls = 1;
	qpair->num_retries = 0;
	qpair->num_failures = 0;
	qpair->num_ignored = 0;
	qpair->num_recovery_nolock = 0;
}

static int
nvme_sysctl_num_cmds(SYSCTL_HANDLER_ARGS)
{
	struct nvme_controller 	*ctrlr = arg1;
	int64_t			num_cmds = 0;
	int			i;

	num_cmds = ctrlr->adminq.num_cmds;

	if (ctrlr->ioq != NULL) {
		for (i = 0; i < ctrlr->num_io_queues; i++)
			num_cmds += ctrlr->ioq[i].num_cmds;
	}

	return (sysctl_handle_64(oidp, &num_cmds, 0, req));
}

static int
nvme_sysctl_num_intr_handler_calls(SYSCTL_HANDLER_ARGS)
{
	struct nvme_controller 	*ctrlr = arg1;
	int64_t			num_intr_handler_calls = 0;
	int			i;

	num_intr_handler_calls = ctrlr->adminq.num_intr_handler_calls;

	if (ctrlr->ioq != NULL) {
		for (i = 0; i < ctrlr->num_io_queues; i++)
			num_intr_handler_calls += ctrlr->ioq[i].num_intr_handler_calls;
	}

	return (sysctl_handle_64(oidp, &num_intr_handler_calls, 0, req));
}

static int
nvme_sysctl_num_retries(SYSCTL_HANDLER_ARGS)
{
	struct nvme_controller 	*ctrlr = arg1;
	int64_t			num_retries = 0;
	int			i;

	num_retries = ctrlr->adminq.num_retries;

	if (ctrlr->ioq != NULL) {
		for (i = 0; i < ctrlr->num_io_queues; i++)
			num_retries += ctrlr->ioq[i].num_retries;
	}

	return (sysctl_handle_64(oidp, &num_retries, 0, req));
}

static int
nvme_sysctl_num_failures(SYSCTL_HANDLER_ARGS)
{
	struct nvme_controller 	*ctrlr = arg1;
	int64_t			num_failures = 0;
	int			i;

	num_failures = ctrlr->adminq.num_failures;

	if (ctrlr->ioq != NULL) {
		for (i = 0; i < ctrlr->num_io_queues; i++)
			num_failures += ctrlr->ioq[i].num_failures;
	}

	return (sysctl_handle_64(oidp, &num_failures, 0, req));
}

static int
nvme_sysctl_num_ignored(SYSCTL_HANDLER_ARGS)
{
	struct nvme_controller 	*ctrlr = arg1;
	int64_t			num_ignored = 0;
	int			i;

	num_ignored = ctrlr->adminq.num_ignored;

	if (ctrlr->ioq != NULL) {
		for (i = 0; i < ctrlr->num_io_queues; i++)
			num_ignored += ctrlr->ioq[i].num_ignored;
	}

	return (sysctl_handle_64(oidp, &num_ignored, 0, req));
}

static int
nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
{
	struct nvme_controller 	*ctrlr = arg1;
	int64_t			num;
	int			i;

	num = ctrlr->adminq.num_recovery_nolock;

	if (ctrlr->ioq != NULL) {
		for (i = 0; i < ctrlr->num_io_queues; i++)
			num += ctrlr->ioq[i].num_recovery_nolock;
	}

	return (sysctl_handle_64(oidp, &num, 0, req));
}

static int
nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
{
	struct nvme_controller 	*ctrlr = arg1;
	uint32_t		i, val = 0;

	int error = sysctl_handle_int(oidp, &val, 0, req);

	if (error)
		return (error);

	if (val != 0) {
		nvme_qpair_reset_stats(&ctrlr->adminq);

		if (ctrlr->ioq != NULL) {
			for (i = 0; i < ctrlr->num_io_queues; i++)
				nvme_qpair_reset_stats(&ctrlr->ioq[i]);
		}
	}

	return (0);
}

static void
nvme_sysctl_initialize_queue(struct nvme_qpair *qpair,
    struct sysctl_ctx_list *ctrlr_ctx, struct sysctl_oid *que_tree)
{
	struct sysctl_oid_list	*que_list = SYSCTL_CHILDREN(que_tree);

	SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "num_entries",
	    CTLFLAG_RD, &qpair->num_entries, 0,
	    "Number of entries in hardware queue");
	SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "num_trackers",
	    CTLFLAG_RD, &qpair->num_trackers, 0,
	    "Number of trackers pre-allocated for this queue pair");
	SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "sq_head",
	    CTLFLAG_RD, &qpair->sq_head, 0,
	    "Current head of submission queue (as observed by driver)");
	SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "sq_tail",
	    CTLFLAG_RD, &qpair->sq_tail, 0,
	    "Current tail of submission queue (as observed by driver)");
	SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "cq_head",
	    CTLFLAG_RD, &qpair->cq_head, 0,
	    "Current head of completion queue (as observed by driver)");

	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_cmds",
	    CTLFLAG_RD, &qpair->num_cmds, "Number of commands submitted");
	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_intr_handler_calls",
	    CTLFLAG_RD, &qpair->num_intr_handler_calls,
	    "Number of times interrupt handler was invoked (will typically be "
	    "less than number of actual interrupts generated due to "
	    "coalescing)");
	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_retries",
	    CTLFLAG_RD, &qpair->num_retries, "Number of commands retried");
	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_failures",
	    CTLFLAG_RD, &qpair->num_failures,
	    "Number of commands ending in failure after all retries");
	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
	    CTLFLAG_RD, &qpair->num_ignored,
	    "Number of interrupts posted, but were administratively ignored");
	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_recovery_nolock",
	    CTLFLAG_RD, &qpair->num_recovery_nolock,
	    "Number of times that we failed to lock recovery in the ISR");

	SYSCTL_ADD_UINT(ctrlr_ctx, que_list, OID_AUTO, "recovery",
	    CTLFLAG_RW, &qpair->recovery_state, 0,
	    "Current recovery state of the queue");

	SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
	    "dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    qpair, 0, nvme_sysctl_dump_debug, "IU", "Dump debug data");
}

void
nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
{
	struct sysctl_ctx_list	*ctrlr_ctx;
	struct sysctl_oid	*ctrlr_tree, *que_tree, *ioq_tree;
	struct sysctl_oid_list	*ctrlr_list, *ioq_list;
#define QUEUE_NAME_LENGTH	16
	char			queue_name[QUEUE_NAME_LENGTH];
	int			i;

	ctrlr_ctx = device_get_sysctl_ctx(ctrlr->dev);
	ctrlr_tree = device_get_sysctl_tree(ctrlr->dev);
	ctrlr_list = SYSCTL_CHILDREN(ctrlr_tree);

	SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "num_io_queues",
	    CTLFLAG_RD, &ctrlr->num_io_queues, 0,
	    "Number of I/O queue pairs");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "int_coal_time", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    ctrlr, 0, nvme_sysctl_int_coal_time, "IU",
	    "Interrupt coalescing timeout (in microseconds)");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "int_coal_threshold",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr, 0,
	    nvme_sysctl_int_coal_threshold, "IU",
	    "Interrupt coalescing threshold");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "admin_timeout_period", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &ctrlr->admin_timeout_period, 0, nvme_sysctl_timeout_period, "IU",
	    "Timeout period for Admin queue (in seconds)");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "timeout_period", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &ctrlr->timeout_period, 0, nvme_sysctl_timeout_period, "IU",
	    "Timeout period for I/O queues (in seconds)");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "num_cmds", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    ctrlr, 0, nvme_sysctl_num_cmds, "IU",
	    "Number of commands submitted");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "num_intr_handler_calls",
	    CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, ctrlr, 0,
	    nvme_sysctl_num_intr_handler_calls, "IU",
	    "Number of times interrupt handler was invoked (will "
	    "typically be less than number of actual interrupts "
	    "generated due to coalescing)");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "num_retries", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    ctrlr, 0, nvme_sysctl_num_retries, "IU",
	    "Number of commands retried");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "num_failures", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    ctrlr, 0, nvme_sysctl_num_failures, "IU",
	    "Number of commands ending in failure after all retries");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "num_ignored", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    ctrlr, 0, nvme_sysctl_num_ignored, "IU",
	    "Number of interrupts ignored administratively");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "num_recovery_nolock", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
	    ctrlr, 0, nvme_sysctl_num_recovery_nolock, "IU",
	    "Number of times that we failed to lock recovery in the ISR");

	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
	    "reset_stats", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr,
	    0, nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");

	SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "cap_lo",
	    CTLFLAG_RD, &ctrlr->cap_lo, 0,
	    "Low 32-bits of capacities for the drive");

	SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "cap_hi",
	    CTLFLAG_RD, &ctrlr->cap_hi, 0,
	    "Hi 32-bits of capacities for the drive");

	SYSCTL_ADD_UINT(ctrlr_ctx, ctrlr_list, OID_AUTO, "fail_on_reset",
	    CTLFLAG_RD, &ctrlr->fail_on_reset, 0,
	    "Pretend the next reset fails and fail the controller");

	que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO, "adminq",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue");

	nvme_sysctl_initialize_queue(&ctrlr->adminq, ctrlr_ctx, que_tree);

	/*
	 * Make sure that we've constructed the I/O queues before setting up the
	 * sysctls. Failed controllers won't allocate it, but we want the rest
	 * of the sysctls to diagnose things.
	 */
	if (ctrlr->ioq != NULL) {
		ioq_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ctrlr_list, OID_AUTO,
		    "ioq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
		ioq_list = SYSCTL_CHILDREN(ioq_tree);

		for (i = 0; i < ctrlr->num_io_queues; i++) {
			snprintf(queue_name, QUEUE_NAME_LENGTH, "%d", i);
			que_tree = SYSCTL_ADD_NODE(ctrlr_ctx, ioq_list, OID_AUTO,
			    queue_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "IO Queue");
			nvme_sysctl_initialize_queue(&ctrlr->ioq[i], ctrlr_ctx,
			    que_tree);
		}
	}

	SYSCTL_ADD_COUNTER_U64(ctrlr_ctx, ctrlr_list, OID_AUTO, "alignment_splits",
	    CTLFLAG_RD, &ctrlr->alignment_splits,
	    "Number of times we split the I/O alignment for drives with preferred alignment");
}