// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/math64.h>
#include <linux/slab.h>
#include <linux/seq_file.h>

#include "subvolume_metrics.h"
#include "mds_client.h"
#include "super.h"

/**
 * struct ceph_subvol_metric_rb_entry - Per-subvolume I/O metrics node
 * @node: Red-black tree linkage for tracker->tree
 * @subvolume_id: Subvolume identifier (key for rb-tree lookup)
 * @read_ops: Accumulated read operation count since last snapshot
 * @write_ops: Accumulated write operation count since last snapshot
 * @read_bytes: Accumulated bytes read since last snapshot
 * @write_bytes: Accumulated bytes written since last snapshot
 * @read_latency_us: Sum of read latencies in microseconds
 * @write_latency_us: Sum of write latencies in microseconds
 */
struct ceph_subvol_metric_rb_entry {
	struct rb_node node;
	u64 subvolume_id;
	u64 read_ops;
	u64 write_ops;
	u64 read_bytes;
	u64 write_bytes;
	u64 read_latency_us;
	u64 write_latency_us;
};

static struct kmem_cache *ceph_subvol_metric_entry_cachep;

void ceph_subvolume_metrics_init(struct ceph_subvolume_metrics_tracker *tracker)
{
	spin_lock_init(&tracker->lock);
	tracker->tree = RB_ROOT_CACHED;
	tracker->nr_entries = 0;
	tracker->enabled = false;
	atomic64_set(&tracker->snapshot_attempts, 0);
	atomic64_set(&tracker->snapshot_empty, 0);
	atomic64_set(&tracker->snapshot_failures, 0);
	atomic64_set(&tracker->record_calls, 0);
	atomic64_set(&tracker->record_disabled, 0);
	atomic64_set(&tracker->record_no_subvol, 0);
	atomic64_set(&tracker->total_read_ops, 0);
	atomic64_set(&tracker->total_read_bytes, 0);
	atomic64_set(&tracker->total_write_ops, 0);
	atomic64_set(&tracker->total_write_bytes, 0);
}
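
/*
 * Example (illustrative lifecycle; the actual call sites live elsewhere
 * in fs/ceph): the tracker is embedded in struct ceph_mds_client as
 * mdsc->subvol_metrics, so setup and teardown pair up roughly as:
 *
 *	ceph_subvolume_metrics_init(&mdsc->subvol_metrics);
 *	ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, true);
 *	...
 *	ceph_subvolume_metrics_destroy(&mdsc->subvol_metrics);
 */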

static struct ceph_subvol_metric_rb_entry *
__lookup_entry(struct ceph_subvolume_metrics_tracker *tracker, u64 subvol_id)
{
	struct rb_node *node;

	node = tracker->tree.rb_root.rb_node;
	while (node) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);

		if (subvol_id < entry->subvolume_id)
			node = node->rb_left;
		else if (subvol_id > entry->subvolume_id)
			node = node->rb_right;
		else
			return entry;
	}

	return NULL;
}

static struct ceph_subvol_metric_rb_entry *
__insert_entry(struct ceph_subvolume_metrics_tracker *tracker,
	       struct ceph_subvol_metric_rb_entry *entry)
{
	struct rb_node **link = &tracker->tree.rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct ceph_subvol_metric_rb_entry *cur =
			rb_entry(*link, struct ceph_subvol_metric_rb_entry, node);

		parent = *link;
		if (entry->subvolume_id < cur->subvolume_id) {
			link = &(*link)->rb_left;
		} else if (entry->subvolume_id > cur->subvolume_id) {
			link = &(*link)->rb_right;
			leftmost = false;
		} else {
			return cur;
		}
	}

	rb_link_node(&entry->node, parent, link);
	rb_insert_color_cached(&entry->node, &tracker->tree, leftmost);
	tracker->nr_entries++;
	return entry;
}

static void ceph_subvolume_metrics_clear_locked(
		struct ceph_subvolume_metrics_tracker *tracker)
{
	struct ceph_subvol_metric_rb_entry *entry, *tmp;

	/*
	 * Postorder traversal frees every node without the per-node
	 * rebalancing cost of rb_erase_cached(); the root and counter
	 * are reset afterwards.
	 */
	rbtree_postorder_for_each_entry_safe(entry, tmp,
					     &tracker->tree.rb_root, node)
		kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);

	tracker->tree = RB_ROOT_CACHED;
	tracker->nr_entries = 0;
}

void ceph_subvolume_metrics_destroy(struct ceph_subvolume_metrics_tracker *tracker)
{
	spin_lock(&tracker->lock);
	ceph_subvolume_metrics_clear_locked(tracker);
	tracker->enabled = false;
	spin_unlock(&tracker->lock);
}

void ceph_subvolume_metrics_enable(struct ceph_subvolume_metrics_tracker *tracker,
				   bool enable)
{
	spin_lock(&tracker->lock);
	if (enable) {
		tracker->enabled = true;
	} else {
		tracker->enabled = false;
		ceph_subvolume_metrics_clear_locked(tracker);
	}
	spin_unlock(&tracker->lock);
}
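
/*
 * Example (hypothetical control knob; how the flag is actually toggled
 * is outside this file): a boolean debugfs or sysfs write handler could
 * simply do
 *
 *	ceph_subvolume_metrics_enable(&mdsc->subvol_metrics, val);
 *
 * Note that disabling frees all accumulated entries, so a subsequent
 * enable starts from an empty tree.
 */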

void ceph_subvolume_metrics_record(struct ceph_subvolume_metrics_tracker *tracker,
				   u64 subvol_id, bool is_write,
				   size_t size, u64 latency_us)
{
	struct ceph_subvol_metric_rb_entry *entry, *new_entry = NULL;
	bool retry = false;

	/* CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset subvolume */
	if (!READ_ONCE(tracker->enabled) ||
	    subvol_id == CEPH_SUBVOLUME_ID_NONE || !size || !latency_us)
		return;

	/*
	 * Allocate-outside-the-lock pattern:
	 * 1. First pass: look up under the lock; on a miss, drop the lock,
	 *    allocate an entry, and retry.
	 * 2. Second pass: look up again (another task may have inserted the
	 *    key meanwhile) and insert our entry if it is still missing.
	 * 3. If we lose the insert race, __insert_entry() returns the
	 *    existing entry: update that one and free our unused allocation
	 *    after dropping the lock.
	 * Every successful path exits via return, so the retry flag never
	 * needs resetting.
	 */
	do {
		spin_lock(&tracker->lock);
		if (!tracker->enabled) {
			spin_unlock(&tracker->lock);
			if (new_entry)
				kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
			return;
		}

		entry = __lookup_entry(tracker, subvol_id);
		if (!entry) {
			if (!new_entry) {
				spin_unlock(&tracker->lock);
				new_entry = kmem_cache_zalloc(ceph_subvol_metric_entry_cachep,
							      GFP_NOFS);
				if (!new_entry)
					return;
				new_entry->subvolume_id = subvol_id;
				retry = true;
				continue;
			}
			entry = __insert_entry(tracker, new_entry);
			if (entry == new_entry)
				new_entry = NULL;
			/*
			 * Otherwise we lost an insert race: fall through and
			 * update the existing entry; our unused allocation is
			 * freed after the unlock below.
			 */
		}

		if (is_write) {
			entry->write_ops++;
			entry->write_bytes += size;
			entry->write_latency_us += latency_us;
			atomic64_inc(&tracker->total_write_ops);
			atomic64_add(size, &tracker->total_write_bytes);
		} else {
			entry->read_ops++;
			entry->read_bytes += size;
			entry->read_latency_us += latency_us;
			atomic64_inc(&tracker->total_read_ops);
			atomic64_add(size, &tracker->total_read_bytes);
		}
		spin_unlock(&tracker->lock);
		if (new_entry)
			kmem_cache_free(ceph_subvol_metric_entry_cachep, new_entry);
		return;
	} while (retry);
}

int ceph_subvolume_metrics_snapshot(struct ceph_subvolume_metrics_tracker *tracker,
				    struct ceph_subvol_metric_snapshot **out,
				    u32 *nr, bool consume)
{
	struct ceph_subvol_metric_snapshot *snap = NULL;
	struct rb_node *node;
	u32 count = 0, idx = 0;

	*out = NULL;
	*nr = 0;

	if (!READ_ONCE(tracker->enabled))
		return 0;

	atomic64_inc(&tracker->snapshot_attempts);

	spin_lock(&tracker->lock);
	for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);

		/* Count entries with any read or write activity */
		if (entry->read_ops || entry->write_ops)
			count++;
	}
	spin_unlock(&tracker->lock);

	if (!count) {
		atomic64_inc(&tracker->snapshot_empty);
		return 0;
	}

	snap = kcalloc(count, sizeof(*snap), GFP_NOFS);
	if (!snap) {
		atomic64_inc(&tracker->snapshot_failures);
		return -ENOMEM;
	}

	spin_lock(&tracker->lock);
	node = rb_first_cached(&tracker->tree);
	while (node) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
		struct rb_node *next = rb_next(node);

		/* Prune (erase and free) entries with no I/O activity */
		if (!entry->read_ops && !entry->write_ops) {
			rb_erase_cached(&entry->node, &tracker->tree);
			tracker->nr_entries--;
			kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
			node = next;
			continue;
		}

		/*
		 * More active entries than we counted: new entries were
		 * added while the lock was dropped for the allocation.
		 * Emit what fits; the remainder is picked up by the next
		 * snapshot.
		 */
		if (idx >= count) {
			pr_warn("ceph: subvol metrics snapshot race (idx=%u count=%u)\n",
				idx, count);
			break;
		}

		snap[idx].subvolume_id = entry->subvolume_id;
		snap[idx].read_ops = entry->read_ops;
		snap[idx].write_ops = entry->write_ops;
		snap[idx].read_bytes = entry->read_bytes;
		snap[idx].write_bytes = entry->write_bytes;
		snap[idx].read_latency_us = entry->read_latency_us;
		snap[idx].write_latency_us = entry->write_latency_us;
		idx++;

		if (consume) {
			/*
			 * Consuming resets by removal: the entry is freed
			 * and the next I/O on this subvolume starts a fresh
			 * accumulation window.
			 */
			rb_erase_cached(&entry->node, &tracker->tree);
			tracker->nr_entries--;
			kmem_cache_free(ceph_subvol_metric_entry_cachep, entry);
		}
		node = next;
	}
	spin_unlock(&tracker->lock);

	if (!idx) {
		kfree(snap);
		return 0;
	}

	*nr = idx;
	*out = snap;
	return 0;
}
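
/*
 * Example (sketch of a hypothetical periodic consumer; names like
 * encode_subvol_metric() are placeholders, not APIs in this file):
 *
 *	struct ceph_subvol_metric_snapshot *snap;
 *	u32 i, nr;
 *
 *	if (!ceph_subvolume_metrics_snapshot(&mdsc->subvol_metrics,
 *					     &snap, &nr, true) && nr) {
 *		for (i = 0; i < nr; i++)
 *			encode_subvol_metric(&snap[i]);
 *		ceph_subvolume_metrics_free_snapshot(snap);
 *	}
 *
 * Passing consume=true removes the reported entries, so each snapshot
 * carries the deltas accumulated since the previous one.
 */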

void ceph_subvolume_metrics_free_snapshot(struct ceph_subvol_metric_snapshot *snapshot)
{
	kfree(snapshot);
}

/*
 * Dump subvolume metrics to a seq_file for debugfs.
 *
 * Iterates the rb-tree directly under the spinlock to avoid allocating a
 * snapshot.  seq_printf() only writes into the seq_file's preallocated
 * buffer and does not sleep, so holding the spinlock across it is safe;
 * hold time scales with the number of tracked subvolumes.
 */
void ceph_subvolume_metrics_dump(struct ceph_subvolume_metrics_tracker *tracker,
				 struct seq_file *s)
{
	struct rb_node *node;
	bool found = false;

	spin_lock(&tracker->lock);
	if (!tracker->enabled) {
		spin_unlock(&tracker->lock);
		seq_puts(s, "subvolume metrics disabled\n");
		return;
	}

	for (node = rb_first_cached(&tracker->tree); node; node = rb_next(node)) {
		struct ceph_subvol_metric_rb_entry *entry =
			rb_entry(node, struct ceph_subvol_metric_rb_entry, node);
		u64 avg_rd_lat, avg_wr_lat;

		if (!entry->read_ops && !entry->write_ops)
			continue;

		if (!found) {
			seq_puts(s, "subvol_id       rd_ops    rd_bytes    rd_avg_lat_us  wr_ops    wr_bytes    wr_avg_lat_us\n");
			seq_puts(s, "------------------------------------------------------------------------------------------------\n");
			found = true;
		}

		avg_rd_lat = entry->read_ops ?
			div64_u64(entry->read_latency_us, entry->read_ops) : 0;
		avg_wr_lat = entry->write_ops ?
			div64_u64(entry->write_latency_us, entry->write_ops) : 0;

		seq_printf(s, "%-15llu%-10llu%-12llu%-16llu%-10llu%-12llu%-16llu\n",
			   entry->subvolume_id,
			   entry->read_ops,
			   entry->read_bytes,
			   avg_rd_lat,
			   entry->write_ops,
			   entry->write_bytes,
			   avg_wr_lat);
	}
	spin_unlock(&tracker->lock);

	if (!found)
		seq_puts(s, "(no subvolume metrics collected)\n");
}
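
/*
 * Example (hypothetical debugfs wiring; the show function and file name
 * are illustrative):
 *
 *	static int subvol_metrics_show(struct seq_file *s, void *p)
 *	{
 *		struct ceph_mds_client *mdsc = s->private;
 *
 *		ceph_subvolume_metrics_dump(&mdsc->subvol_metrics, s);
 *		return 0;
 *	}
 *	DEFINE_SHOW_ATTRIBUTE(subvol_metrics);
 */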

void ceph_subvolume_metrics_record_io(struct ceph_mds_client *mdsc,
				      struct ceph_inode_info *ci,
				      bool is_write, size_t bytes,
				      ktime_t start, ktime_t end)
{
	struct ceph_subvolume_metrics_tracker *tracker;
	u64 subvol_id;
	s64 delta_us;

	if (!mdsc || !ci || !bytes)
		return;

	tracker = &mdsc->subvol_metrics;
	atomic64_inc(&tracker->record_calls);

	if (!ceph_subvolume_metrics_enabled(tracker)) {
		atomic64_inc(&tracker->record_disabled);
		return;
	}

	subvol_id = READ_ONCE(ci->i_subvolume_id);
	if (subvol_id == CEPH_SUBVOLUME_ID_NONE) {
		atomic64_inc(&tracker->record_no_subvol);
		return;
	}

	/*
	 * Clamp to 1us: record() filters out zero latencies, and coarse
	 * clocks can yield a zero or negative delta.
	 */
	delta_us = ktime_to_us(ktime_sub(end, start));
	if (delta_us <= 0)
		delta_us = 1;

	ceph_subvolume_metrics_record(tracker, subvol_id, is_write,
				      bytes, (u64)delta_us);
}
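
/*
 * Example (illustrative call site; exact placement in the read/write
 * completion paths is an assumption):
 *
 *	ktime_t start = ktime_get();
 *
 *	ret = ...perform the I/O...;
 *	if (ret > 0)
 *		ceph_subvolume_metrics_record_io(mdsc, ci, is_write, ret,
 *						 start, ktime_get());
 */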

int __init ceph_subvolume_metrics_cache_init(void)
{
	ceph_subvol_metric_entry_cachep = KMEM_CACHE(ceph_subvol_metric_rb_entry,
						    SLAB_RECLAIM_ACCOUNT);
	if (!ceph_subvol_metric_entry_cachep)
		return -ENOMEM;
	return 0;
}

void ceph_subvolume_metrics_cache_destroy(void)
{
	kmem_cache_destroy(ceph_subvol_metric_entry_cachep);
}
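
/*
 * Example (sketch; the real module init/exit lives in super.c and may
 * order things differently): the cache helpers pair up as
 *
 *	static int __init init_ceph(void)
 *	{
 *		int ret = ceph_subvolume_metrics_cache_init();
 *
 *		if (ret)
 *			return ret;
 *		...
 *	}
 *
 * with ceph_subvolume_metrics_cache_destroy() called from module exit
 * and from the init-failure unwind path.
 */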
