/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * sun4u Memory Scrubbing
 *
 * On detection of a correctable memory ECC error, the sun4u kernel
 * returns the corrected data to the requester and re-writes it
 * to memory (DRAM).  So if the correctable error was transient,
 * the error has effectively been cleaned (scrubbed) from memory.
 *
 * Scrubbing thus reduces the likelihood that multiple transient errors
 * will occur in the same memory word, making uncorrectable errors due
 * to transients less likely.
 *
 * Thus is born the desire that every memory location be periodically
 * accessed.
 *
 * This file implements a memory scrubbing thread.  This scrubber
 * guarantees that all of physical memory is accessed periodically
 * (memscrub_period_sec -- 12 hours).
 *
 * It attempts to do this as unobtrusively as possible.  The thread
 * schedules itself to wake up at an interval such that if it reads
 * memscrub_span_pages (32MB) on each wakeup, it will read all of physical
 * memory in memscrub_period_sec (12 hours).
 *
 * The scrubber uses the block load and prefetch hardware to read memory
 * @ 1300MB/s, so it reads spans of 32MB in 0.025 seconds.  Unlike the
 * original sun4d scrubber, the sun4u scrubber does not read ahead if the
 * system is idle, because we can read memory very efficiently.
 *
 * The scrubber maintains a private copy of the phys_install memory list
 * to keep track of what memory should be scrubbed.
 *
 * The global routines memscrub_add_span() and memscrub_delete_span() are
 * used to add and delete from this list.  If hotplug memory is later
 * supported these two routines can be used to notify the scrubber of
 * memory configuration changes.
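 *
 * For example, a hypothetical hotplug attach path (not present in this
 * file, which only seeds the list from phys_install) might do something
 * like the following with the new range's base_pfn and npages:
 *
 *	if (memscrub_add_span(base_pfn, npages) != 0)
 *		cmn_err(CE_WARN, "scrubber could not track new span");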
 *
 * The following parameters can be set via /etc/system
 *
 * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (32MB)
 * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
 * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI)
 * memscrub_delay_start_sec = (5 minutes)
 * memscrub_verbose = (0)
 * memscrub_override_ticks = (1 tick)
 * disable_memscrub = (0)
 * pause_memscrub = (0)
 * read_all_memscrub = (0)
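 *
 * For example, to request a six-hour scrub period with verbose logging,
 * lines such as the following (the values here are only illustrative)
 * could be added to /etc/system:
 *
 *	set memscrub_period_sec = 21600
 *	set memscrub_verbose = 1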
 *
 * The scrubber will print NOTICE messages of what it is doing if
 * "memscrub_verbose" is set.
 *
 * If the scrubber's sleep time calculation drops to zero ticks,
 * memscrub_override_ticks will be used as the sleep time instead. The
 * sleep time should only drop to zero on a system with over 131.84
 * terabytes of memory, or where the default scrubber parameters have
 * been adjusted. For example, reducing memscrub_span_pages or
 * memscrub_period_sec causes the sleep time to drop to zero with less
 * memory. Note that since the sleep time is calculated in clock ticks,
 * using hires clock ticks allows for more memory before the sleep time
 * becomes zero.
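 *
 * To sketch the arithmetic behind that figure (assuming the default
 * 100 Hz clock): the period is 12 * 60 * 60 * 100 = 4,320,000 ticks,
 * and the computed sleep time truncates to zero once physical memory
 * exceeds 4,320,000 spans of 32MB, or roughly 131.84 terabytes.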
 *
 * The scrubber will exit (or never be started) if it finds the variable
 * "disable_memscrub" set.
 *
 * The scrubber will pause (not read memory) when "pause_memscrub"
 * is set.  It will check the state of pause_memscrub at each wakeup
 * period.  The scrubber will not make up for lost time.  If you
 * pause the scrubber for a prolonged period of time you can use
 * the "read_all_memscrub" switch (see below) to catch up. In addition,
 * pause_memscrub is used internally by the post memory DR callbacks.
 * It is set for the small period of time during which the callbacks
 * are executing. This ensures "memscrub_lock" will be released,
 * allowing the callbacks to finish.
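 *
 * For example, the scrubber could be paused by hand on a live system
 * with something along the lines of (a sketch; needs kernel write
 * access via mdb):
 *
 *	echo 'pause_memscrub/W 1' | mdb -kw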
 *
 * The scrubber will read all memory if "read_all_memscrub" is set.
 * The normal span read will also occur during the wakeup.
 *
 * MEMSCRUB_MIN_PAGES (32MB) is the minimum amount of memory a system
 * must have before we'll start the scrubber.
 *
 * MEMSCRUB_DFL_SPAN_PAGES (32MB) is based on the guess that 0.025 sec
 * is a reasonable minimum amount of time for the thread to run per wakeup.
 *
 * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
 * twice the frequency the hardware folk estimated would be necessary.
 *
 * MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI) is based on the assumption
 * that the scrubber should get its fair share of time (since its runs
 * are short).  At a priority of 0 the scrubber will be starved.
 */

#include <sys/systm.h>		/* timeout, types, t_lock */
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>	/* MIN */
#include <sys/memlist.h>	/* memlist */
#include <sys/mem_config.h>	/* memory add/delete */
#include <sys/kmem.h>		/* KMEM_NOSLEEP */
#include <sys/cpuvar.h>		/* ncpus_online */
#include <sys/debug.h>		/* ASSERTs */
#include <sys/machsystm.h>	/* lddphys */
#include <sys/cpu_module.h>	/* vtag_flushpage */
#include <sys/kstat.h>
#include <sys/atomic.h>		/* atomic_add_32 */

#include <vm/hat.h>
#include <vm/seg_kmem.h>
#include <vm/hat_sfmmu.h>	/* XXX FIXME - delete */

#include <sys/time.h>
#include <sys/callb.h>		/* CPR callback */
#include <sys/ontrap.h>

/*
 * Should really have paddr_t defined, but it is broken.  Use
 * ms_paddr_t in the meantime to make the code cleaner.
 */
typedef uint64_t ms_paddr_t;

/*
 * Global Routines:
 */
int memscrub_add_span(pfn_t pfn, pgcnt_t pages);
int memscrub_delete_span(pfn_t pfn, pgcnt_t pages);
int memscrub_init(void);
void memscrub_induced_error(void);

/*
 * Global Data:
 */

/*
 * scrub if we have at least this many pages
 */
#define	MEMSCRUB_MIN_PAGES (32 * 1024 * 1024 / PAGESIZE)

/*
 * scan all of physical memory at least once every MEMSCRUB_PERIOD_SEC
 */
#define	MEMSCRUB_DFL_PERIOD_SEC	(12 * 60 * 60)	/* 12 hours */

/*
 * scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration
 */
#define	MEMSCRUB_DFL_SPAN_PAGES	((32 * 1024 * 1024) / PAGESIZE)

/*
 * almost anything is higher priority than scrubbing
 */
#define	MEMSCRUB_DFL_THREAD_PRI	MINCLSYSPRI

/*
 * size used when scanning memory
 */
#define	MEMSCRUB_BLOCK_SIZE		256
#define	MEMSCRUB_BLOCK_SIZE_SHIFT	8 	/* log2(MEMSCRUB_BLOCK_SIZE) */
#define	MEMSCRUB_BLOCKS_PER_PAGE	(PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)

#define	MEMSCRUB_BPP4M		(MMU_PAGESIZE4M >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP512K	(MMU_PAGESIZE512K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP64K		(MMU_PAGESIZE64K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP		(MMU_PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
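
/*
 * For example, with the 8K base page size used on sun4u (MMU_PAGESIZE
 * of 8192), MEMSCRUB_BPP works out to 32 blocks per page and
 * MEMSCRUB_BPP4M to 16384 blocks per 4M page.
 */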

/*
 * This message indicates that we have exceeded the limitations of
 * the memscrubber. See the comments above regarding what would
 * cause the sleep time to become zero. In DEBUG mode, this message
 * is logged on the console and in the messages file. In non-DEBUG
 * mode, it is only logged in the messages file.
 */
#ifdef DEBUG
#define	MEMSCRUB_OVERRIDE_MSG	"Memory scrubber sleep time is zero " \
	"seconds, consuming entire CPU."
#else
#define	MEMSCRUB_OVERRIDE_MSG	"!Memory scrubber sleep time is zero " \
	"seconds, consuming entire CPU."
#endif /* DEBUG */

/*
 * we can patch these defaults in /etc/system if necessary
 */
uint_t disable_memscrub = 0;
uint_t pause_memscrub = 0;
uint_t read_all_memscrub = 0;
uint_t memscrub_verbose = 0;
uint_t memscrub_all_idle = 0;
uint_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
uint_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
uint_t memscrub_delay_start_sec = 5 * 60;
uint_t memscrub_override_ticks = 1;

/*
 * Static Routines
 */
static void memscrubber(void);
static void memscrub_cleanup(void);
static int memscrub_add_span_gen(pfn_t, pgcnt_t, struct memlist **, uint_t *);
static int memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp);
static void memscrub_scan(uint_t blks, ms_paddr_t src);

/*
 * Static Data
 */

static struct memlist *memscrub_memlist;
static uint_t memscrub_phys_pages;

static kcondvar_t memscrub_cv;
static kmutex_t memscrub_lock;
/*
 * memscrub_lock protects memscrub_memlist, interval_ticks, cprinfo, ...
 */
static void memscrub_init_mem_config(void);
static void memscrub_uninit_mem_config(void);

/*
 * Linked list of memscrub aware spans having retired pages.
 * Currently enabled only on sun4u USIII-based platforms.
 */
typedef struct memscrub_page_retire_span {
	ms_paddr_t				address;
	struct memscrub_page_retire_span	*next;
} memscrub_page_retire_span_t;

static memscrub_page_retire_span_t *memscrub_page_retire_span_list = NULL;

static void memscrub_page_retire_span_add(ms_paddr_t);
static void memscrub_page_retire_span_delete(ms_paddr_t);
static int memscrub_page_retire_span_search(ms_paddr_t);
static void memscrub_page_retire_span_list_update(void);

/*
 * add_to_page_retire_list: Set by the cpu_async_log_err() routine,
 * via memscrub_induced_error(), when a CE/UE occurs on a retired page
 * because of a memscrub read.  Cleared by memscrub after it updates the
 * global page retire span list.  This piggybacks on the protection of
 * memscrub_lock, which is held across both the set and the clear.
 * Note: when cpu_async_log_err() calls memscrub_induced_error(), it runs
 * in softint context, fired on the cpu on which the memscrub thread is
 * currently running.  The memscrub thread has its affinity set during
 * memscrub_read(), so migration to a new cpu is not expected.
 */
static int add_to_page_retire_list = 0;

/*
 * Keep track of some interesting statistics
 */
static struct memscrub_kstats {
	kstat_named_t	done_early;	/* ahead of schedule */
	kstat_named_t	early_sec;	/* by cumulative num secs */
	kstat_named_t	done_late;	/* behind schedule */
	kstat_named_t	late_sec;	/* by cumulative num secs */
	kstat_named_t	interval_ticks;	/* num ticks between intervals */
	kstat_named_t	force_run;	/* forced to run, non-timeout */
	kstat_named_t	errors_found;	/* num errors found by memscrub */
} memscrub_counts = {
	{ "done_early",		KSTAT_DATA_UINT32 },
	{ "early_sec", 		KSTAT_DATA_UINT32 },
	{ "done_late", 		KSTAT_DATA_UINT32 },
	{ "late_sec",		KSTAT_DATA_UINT32 },
	{ "interval_ticks",	KSTAT_DATA_UINT32 },
	{ "force_run",		KSTAT_DATA_UINT32 },
	{ "errors_found",	KSTAT_DATA_UINT32 },
};
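
/*
 * These counters can be read from userland with kstat(1M), e.g.
 * "kstat -m unix -n memscrub_kstat" (a usage sketch).
 */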

#define	MEMSCRUB_STAT_INC(stat)	memscrub_counts.stat.value.ui32++
#define	MEMSCRUB_STAT_SET(stat, val) memscrub_counts.stat.value.ui32 = (val)
#define	MEMSCRUB_STAT_NINC(stat, val) memscrub_counts.stat.value.ui32 += (val)

static struct kstat *memscrub_ksp = (struct kstat *)NULL;

static timeout_id_t memscrub_tid = 0;	/* keep track of timeout id */

/*
 * create memscrub_memlist from phys_install list
 * initialize locks, set memscrub_phys_pages.
 */
int
memscrub_init(void)
{
	struct memlist *src;

	/*
	 * only startup the scrubber if we have a minimum
	 * number of pages
	 */
	if (physinstalled >= MEMSCRUB_MIN_PAGES) {

		/*
		 * initialize locks
		 */
		mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
		cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);

		/*
		 * copy phys_install to memscrub_memlist
		 */
		for (src = phys_install; src; src = src->ml_next) {
			if (memscrub_add_span(
			    (pfn_t)(src->ml_address >> PAGESHIFT),
			    (pgcnt_t)(src->ml_size >> PAGESHIFT))) {
				memscrub_cleanup();
				return (-1);
			}
		}

		/*
		 * initialize kstats
		 */
		memscrub_ksp = kstat_create("unix", 0, "memscrub_kstat",
		    "misc", KSTAT_TYPE_NAMED,
		    sizeof (memscrub_counts) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);

		if (memscrub_ksp) {
			memscrub_ksp->ks_data = (void *)&memscrub_counts;
			kstat_install(memscrub_ksp);
		} else {
			cmn_err(CE_NOTE, "Memscrubber cannot create kstats\n");
		}

		/*
		 * create memscrubber thread
		 */
		(void) thread_create(NULL, 0, (void (*)())memscrubber,
		    NULL, 0, &p0, TS_RUN, memscrub_thread_pri);

		/*
		 * We don't want callbacks changing the list
		 * if there is no thread running. We do not
		 * attempt to deal with stopping/starting scrubbing
		 * on memory size changes.
		 */
		memscrub_init_mem_config();
	}

	return (0);
}

static void
memscrub_cleanup(void)
{
	memscrub_uninit_mem_config();
	while (memscrub_memlist) {
		(void) memscrub_delete_span(
		    (pfn_t)(memscrub_memlist->ml_address >> PAGESHIFT),
		    (pgcnt_t)(memscrub_memlist->ml_size >> PAGESHIFT));
	}
	if (memscrub_ksp)
		kstat_delete(memscrub_ksp);
	cv_destroy(&memscrub_cv);
	mutex_destroy(&memscrub_lock);
}

#ifdef MEMSCRUB_DEBUG
static void
memscrub_printmemlist(char *title, struct memlist *listp)
{
	struct memlist *list;

	cmn_err(CE_CONT, "%s:\n", title);

	for (list = listp; list; list = list->ml_next) {
		cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
		    list->ml_address, list->ml_size);
	}
}
#endif /* MEMSCRUB_DEBUG */

/* ARGSUSED */
static void
memscrub_wakeup(void *c)
{
	/*
	 * grab mutex to guarantee that our wakeup call
	 * arrives after we go to sleep -- so we can't sleep forever.
	 */
	mutex_enter(&memscrub_lock);
	cv_signal(&memscrub_cv);
	mutex_exit(&memscrub_lock);
}

/*
 * provide an interface external to the memscrubber
 * which will force the memscrub thread to run vs.
 * waiting for the timeout, if one is set
 */
void
memscrub_run(void)
{
	MEMSCRUB_STAT_INC(force_run);
	if (memscrub_tid) {
		(void) untimeout(memscrub_tid);
		memscrub_wakeup((void *)NULL);
	}
}

/*
 * this calculation doesn't account for the time
 * that the actual scan consumes -- so we'd fall
 * slightly behind schedule with this interval.
 * The resulting drift is very small.
 */
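
/*
 * A worked example of the calculation below (a sketch, assuming the
 * default 100 Hz clock): with 16GB of memory and the default 32MB span,
 * memscrub_phys_pages / memscrub_span_pages is 512, so interval_ticks
 * is (12 * 60 * 60 * 100) / 512, about 8437 ticks, or roughly 84
 * seconds between spans.
 */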

static uint_t
compute_interval_ticks(void)
{
	/*
	 * We use msp_safe and mpp_safe below to ensure somebody
	 * doesn't set memscrub_span_pages or memscrub_phys_pages
	 * to 0 on us.
	 */
	static uint_t msp_safe, mpp_safe;
	static uint_t interval_ticks, period_ticks;
	msp_safe = memscrub_span_pages;
	mpp_safe = memscrub_phys_pages;

	period_ticks = memscrub_period_sec * hz;
	interval_ticks = period_ticks;

	ASSERT(mutex_owned(&memscrub_lock));

	if ((msp_safe != 0) && (mpp_safe != 0)) {
		if (memscrub_phys_pages <= msp_safe) {
			interval_ticks = period_ticks;
		} else {
			interval_ticks = (period_ticks /
			    (mpp_safe / msp_safe));
		}
	}
	return (interval_ticks);
}

void
memscrubber(void)
{
	ms_paddr_t address, addr;
	time_t deadline;
	pgcnt_t pages;
	uint_t reached_end = 1;
	uint_t paused_message = 0;
	uint_t interval_ticks = 0;
	uint_t sleep_warn_printed = 0;
	callb_cpr_t cprinfo;

	/*
	 * notify CPR of our existence
	 */
	CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");

	mutex_enter(&memscrub_lock);

	if (memscrub_memlist == NULL) {
		cmn_err(CE_WARN, "memscrub_memlist not initialized.");
		goto memscrub_exit;
	}

	address = memscrub_memlist->ml_address;

	deadline = gethrestime_sec() + memscrub_delay_start_sec;

	for (;;) {
		if (disable_memscrub)
			break;

		/*
		 * compute interval_ticks
		 */
		interval_ticks = compute_interval_ticks();

		/*
		 * If the calculated sleep time is zero, and pause_memscrub
		 * has been set, make sure we sleep so that another thread
		 * can acquire memscrub_lock.
		 */
		if (interval_ticks == 0 && pause_memscrub) {
			interval_ticks = hz;
		}

		/*
		 * And as a fail safe, under normal non-paused operation, do
		 * not allow the sleep time to be zero.
		 */
		if (interval_ticks == 0) {
			interval_ticks = memscrub_override_ticks;
			if (!sleep_warn_printed) {
				cmn_err(CE_NOTE, MEMSCRUB_OVERRIDE_MSG);
				sleep_warn_printed = 1;
			}
		}

		MEMSCRUB_STAT_SET(interval_ticks, interval_ticks);

		/*
		 * Did we just reach the end of memory? If we are at the
		 * end of memory, delay end of memory processing until
		 * pause_memscrub is not set.
		 */
		if (reached_end && !pause_memscrub) {
			time_t now = gethrestime_sec();

			if (now >= deadline) {
				MEMSCRUB_STAT_INC(done_late);
				MEMSCRUB_STAT_NINC(late_sec, now - deadline);
				/*
				 * past deadline, start right away
				 */
				interval_ticks = 0;

				deadline = now + memscrub_period_sec;
			} else {
				/*
				 * we finished ahead of schedule.
				 * wait till previous deadline before re-start.
				 */
				interval_ticks = (deadline - now) * hz;
				MEMSCRUB_STAT_INC(done_early);
				MEMSCRUB_STAT_NINC(early_sec, deadline - now);
				deadline += memscrub_period_sec;
			}
			reached_end = 0;
			sleep_warn_printed = 0;
		}

		if (interval_ticks != 0) {
			/*
			 * it is safe from our standpoint for CPR to
			 * suspend the system
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * hit the snooze bar
			 */
			memscrub_tid = timeout(memscrub_wakeup, NULL,
			    interval_ticks);

			/*
			 * go to sleep
			 */
			cv_wait(&memscrub_cv, &memscrub_lock);

			/*
			 * at this point, no timeout should be set
			 */
			memscrub_tid = 0;

			/*
			 * we need to go to work and will be modifying
			 * our internal state and mapping/unmapping
			 * TTEs
			 */
			CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);
		}


		if (memscrub_phys_pages == 0) {
			cmn_err(CE_WARN, "Memory scrubber has 0 pages to read");
			goto memscrub_exit;
		}

		if (!pause_memscrub) {
			if (paused_message) {
				paused_message = 0;
				if (memscrub_verbose)
					cmn_err(CE_NOTE, "Memory scrubber "
					    "resuming");
			}

			if (read_all_memscrub) {
				if (memscrub_verbose)
					cmn_err(CE_NOTE, "Memory scrubber "
					    "reading all memory per request");

				addr = memscrub_memlist->ml_address;
				reached_end = 0;
				while (!reached_end) {
					if (disable_memscrub)
						break;
					pages = memscrub_phys_pages;
					reached_end = memscrub_verify_span(
					    &addr, &pages);
					memscrub_scan(pages *
					    MEMSCRUB_BLOCKS_PER_PAGE, addr);
					addr += ((uint64_t)pages * PAGESIZE);
				}
				read_all_memscrub = 0;
			}

			/*
			 * read 1 span
			 */
			pages = memscrub_span_pages;

			if (disable_memscrub)
				break;

			/*
			 * determine physical address range
			 */
			reached_end = memscrub_verify_span(&address,
			    &pages);

			memscrub_scan(pages * MEMSCRUB_BLOCKS_PER_PAGE,
			    address);

			address += ((uint64_t)pages * PAGESIZE);
		}

		if (pause_memscrub && !paused_message) {
			paused_message = 1;
			if (memscrub_verbose)
				cmn_err(CE_NOTE, "Memory scrubber paused");
		}
	}

memscrub_exit:
	cmn_err(CE_NOTE, "Memory scrubber exiting");
	CALLB_CPR_EXIT(&cprinfo);
	memscrub_cleanup();
	thread_exit();
	/* NOTREACHED */
}

/*
 * condition address and size
 * such that they span legal physical addresses.
 *
 * when appropriate, address will be rounded up to start of next
 * struct memlist, and pages will be rounded down to the end of the
 * memlist size.
 *
 * returns 1 if reached end of list, else returns 0.
 */
static int
memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp)
{
	struct memlist *mlp;
	ms_paddr_t address = *addrp;
	uint64_t bytes = (uint64_t)*pagesp * PAGESIZE;
	uint64_t bytes_remaining;
	int reached_end = 0;

	ASSERT(mutex_owned(&memscrub_lock));

	/*
	 * find memlist struct that contains addrp
	 * assumes memlist is sorted by ascending address.
	 */
	for (mlp = memscrub_memlist; mlp != NULL; mlp = mlp->ml_next) {
		/*
		 * if before this chunk, round up to beginning
		 */
		if (address < mlp->ml_address) {
			address = mlp->ml_address;
			break;
		}
		/*
		 * if before end of chunk, then we found it
		 */
		if (address < (mlp->ml_address + mlp->ml_size))
			break;

		/* else go to next struct memlist */
	}
	/*
	 * if we hit end of list, start at beginning
	 */
	if (mlp == NULL) {
		mlp = memscrub_memlist;
		address = mlp->ml_address;
	}

	/*
	 * now we have legal address, and its mlp, condition bytes
	 */
	bytes_remaining = (mlp->ml_address + mlp->ml_size) - address;

	if (bytes > bytes_remaining)
		bytes = bytes_remaining;

	/*
	 * will this span take us to end of list?
	 */
	if ((mlp->ml_next == NULL) &&
	    ((mlp->ml_address + mlp->ml_size) == (address + bytes)))
		reached_end = 1;

	/* return values */
	*addrp = address;
	*pagesp = bytes / PAGESIZE;

	return (reached_end);
}

/*
 * add a span to the memscrub list
 * add to memscrub_phys_pages
 */
int
memscrub_add_span(pfn_t pfn, pgcnt_t pages)
{
#ifdef MEMSCRUB_DEBUG
	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
#endif /* MEMSCRUB_DEBUG */

	int retval;

	mutex_enter(&memscrub_lock);

#ifdef MEMSCRUB_DEBUG
	memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
	cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
	    " size: 0x%llx\n", address, bytes);
#endif /* MEMSCRUB_DEBUG */

	retval = memscrub_add_span_gen(pfn, pages, &memscrub_memlist,
	    &memscrub_phys_pages);

#ifdef MEMSCRUB_DEBUG
	memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
#endif /* MEMSCRUB_DEBUG */

	mutex_exit(&memscrub_lock);

	return (retval);
}

static int
memscrub_add_span_gen(
	pfn_t pfn,
	pgcnt_t pages,
	struct memlist **list,
	uint_t *npgs)
{
	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
	struct memlist *dst;
	struct memlist *prev, *next;
	int retval = 0;

	/*
	 * allocate a new struct memlist
	 */

	dst = (struct memlist *)
	    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);

	if (dst == NULL) {
		retval = -1;
		goto add_done;
	}

	dst->ml_address = address;
	dst->ml_size = bytes;

	/*
	 * first insert
	 */
	if (*list == NULL) {
		dst->ml_prev = NULL;
		dst->ml_next = NULL;
		*list = dst;

		goto add_done;
	}

	/*
	 * insert into sorted list
	 */
	for (prev = NULL, next = *list;
	    next != NULL;
	    prev = next, next = next->ml_next) {
		if (address > (next->ml_address + next->ml_size))
			continue;

		/*
		 * else insert here
		 */

		/*
		 * prepend to next
		 */
		if ((address + bytes) == next->ml_address) {
			kmem_free(dst, sizeof (struct memlist));

			next->ml_address = address;
			next->ml_size += bytes;

			goto add_done;
		}

		/*
		 * append to next
		 */
		if (address == (next->ml_address + next->ml_size)) {
			kmem_free(dst, sizeof (struct memlist));

			if (next->ml_next) {
				/*
				 * don't overlap with next->ml_next
				 */
				if ((address + bytes) >
				    next->ml_next->ml_address) {
					retval = -1;
					goto add_done;
				}
				/*
				 * concatenate next and next->ml_next
				 */
				if ((address + bytes) ==
				    next->ml_next->ml_address) {
					struct memlist *mlp = next->ml_next;

					if (next == *list)
						*list = next->ml_next;

					mlp->ml_address = next->ml_address;
					mlp->ml_size += next->ml_size;
					mlp->ml_size += bytes;

					if (next->ml_prev)
						next->ml_prev->ml_next = mlp;
					mlp->ml_prev = next->ml_prev;

					kmem_free(next,
					    sizeof (struct memlist));
					goto add_done;
				}
			}

			next->ml_size += bytes;

			goto add_done;
		}

		/* don't overlap with next */
		if ((address + bytes) > next->ml_address) {
			retval = -1;
			kmem_free(dst, sizeof (struct memlist));
			goto add_done;
		}

		/*
		 * insert before next
		 */
		dst->ml_prev = prev;
		dst->ml_next = next;
		next->ml_prev = dst;
		if (prev == NULL) {
			*list = dst;
		} else {
			prev->ml_next = dst;
		}
		goto add_done;
	}	/* end for */

	/*
	 * end of list, prev is valid and next is NULL
	 */
	prev->ml_next = dst;
	dst->ml_prev = prev;
	dst->ml_next = NULL;

add_done:

	if (retval != -1)
		*npgs += pages;

	return (retval);
}

/*
 * delete a span from the memscrub list
 * subtract from memscrub_phys_pages
 */
int
memscrub_delete_span(pfn_t pfn, pgcnt_t pages)
{
	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
	struct memlist *dst, *next;
	int retval = 0;

	mutex_enter(&memscrub_lock);

#ifdef MEMSCRUB_DEBUG
	memscrub_printmemlist("memscrub_memlist Before", memscrub_memlist);
	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
	cmn_err(CE_CONT, "memscrub_delete_span: 0x%llx 0x%llx\n",
	    address, bytes);
#endif /* MEMSCRUB_DEBUG */

	/*
	 * find struct memlist containing page
	 */
	for (next = memscrub_memlist; next != NULL; next = next->ml_next) {
		if ((address >= next->ml_address) &&
		    (address < next->ml_address + next->ml_size))
			break;
	}

	/*
	 * if start address not in list
	 */
	if (next == NULL) {
		retval = -1;
		goto delete_done;
	}

	/*
	 * error if size goes off end of this struct memlist
	 */
	if (address + bytes > next->ml_address + next->ml_size) {
		retval = -1;
		goto delete_done;
	}

	/*
	 * pages at beginning of struct memlist
	 */
	if (address == next->ml_address) {
		/*
		 * if start & size match, delete from list
		 */
		if (bytes == next->ml_size) {
			if (next == memscrub_memlist)
				memscrub_memlist = next->ml_next;
			if (next->ml_prev != NULL)
				next->ml_prev->ml_next = next->ml_next;
			if (next->ml_next != NULL)
				next->ml_next->ml_prev = next->ml_prev;

			kmem_free(next, sizeof (struct memlist));
		} else {
		/*
		 * increment start address by bytes
		 */
			next->ml_address += bytes;
			next->ml_size -= bytes;
		}
		goto delete_done;
	}

	/*
	 * pages at end of struct memlist
	 */
	if (address + bytes == next->ml_address + next->ml_size) {
		/*
		 * decrement size by bytes
		 */
		next->ml_size -= bytes;
		goto delete_done;
	}

	/*
	 * delete a span in the middle of the struct memlist
	 */
	{
		/*
		 * create a new struct memlist
		 */
		dst = (struct memlist *)
		    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);

		if (dst == NULL) {
			retval = -1;
			goto delete_done;
		}

		/*
		 * The existing struct memlist keeps the range up to the
		 * start of the deleted span; the new struct memlist gets
		 * the range starting just past the deleted span, up to
		 * the end of the original chunk.
		 */
		dst->ml_address = address + bytes;
		dst->ml_size =
		    (next->ml_address + next->ml_size) - dst->ml_address;
		next->ml_size = address - next->ml_address;

		/*
		 * link in new memlist after old
		 */
		dst->ml_next = next->ml_next;
		dst->ml_prev = next;

		if (next->ml_next != NULL)
			next->ml_next->ml_prev = dst;
		next->ml_next = dst;
	}

delete_done:
	if (retval != -1) {
		memscrub_phys_pages -= pages;
		if (memscrub_phys_pages == 0)
			disable_memscrub = 1;
	}

#ifdef MEMSCRUB_DEBUG
	memscrub_printmemlist("memscrub_memlist After", memscrub_memlist);
	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
#endif /* MEMSCRUB_DEBUG */

	mutex_exit(&memscrub_lock);
	return (retval);
}

static void
memscrub_scan(uint_t blks, ms_paddr_t src)
{
	uint_t 		psz, bpp, pgsread;
	pfn_t		pfn;
	ms_paddr_t	pa;
	caddr_t		va;
	on_trap_data_t	otd;
	int		scan_mmu_pagesize = 0;
	int		retired_pages = 0;

	extern void memscrub_read(caddr_t src, uint_t blks);

	ASSERT(mutex_owned(&memscrub_lock));

	pgsread = 0;
	pa = src;

	if (memscrub_page_retire_span_list != NULL) {
		if (memscrub_page_retire_span_search(src)) {
			/* retired pages in current span */
			scan_mmu_pagesize = 1;
		}
	}

#ifdef MEMSCRUB_DEBUG
	cmn_err(CE_NOTE, "scan_mmu_pagesize = %d\n", scan_mmu_pagesize);
#endif /* MEMSCRUB_DEBUG */

	while (blks != 0) {
		/* Pick the largest mapping the PA alignment and blks allow */
		if (((pa & MMU_PAGEMASK4M) == pa) &&
		    (blks >= MEMSCRUB_BPP4M)) {
			psz = MMU_PAGESIZE4M;
			bpp = MEMSCRUB_BPP4M;
		} else if (((pa & MMU_PAGEMASK512K) == pa) &&
		    (blks >= MEMSCRUB_BPP512K)) {
			psz = MMU_PAGESIZE512K;
			bpp = MEMSCRUB_BPP512K;
		} else if (((pa & MMU_PAGEMASK64K) == pa) &&
		    (blks >= MEMSCRUB_BPP64K)) {
			psz = MMU_PAGESIZE64K;
			bpp = MEMSCRUB_BPP64K;
		} else if ((pa & MMU_PAGEMASK) == pa) {
			psz = MMU_PAGESIZE;
			bpp = MEMSCRUB_BPP;
		} else {
			if (memscrub_verbose) {
				cmn_err(CE_NOTE, "Memory scrubber ignoring "
				    "non-page aligned block starting at 0x%"
				    PRIx64, src);
			}
			return;
		}
		if (blks < bpp)
			bpp = blks;

#ifdef MEMSCRUB_DEBUG
		cmn_err(CE_NOTE, "Going to run psz=%x, "
		    "bpp=%x pa=%llx\n", psz, bpp, pa);
#endif /* MEMSCRUB_DEBUG */

		/*
		 * MEMSCRUBBASE is a 4MB aligned page in the
		 * kernel so that we can quickly map the PA
		 * to a VA for the block loads performed in
		 * memscrub_read.
		 */
		pfn = mmu_btop(pa);
		va = (caddr_t)MEMSCRUBBASE;
		hat_devload(kas.a_hat, va, psz, pfn, PROT_READ,
		    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);

		/*
		 * Can't allow the memscrubber to migrate across CPUs as
		 * we need to know whether CEEN is enabled for the current
		 * CPU to enable us to scrub the memory. Don't use
		 * kpreempt_disable as the time we take to scan a span (even
		 * without cpu_check_ce having to manually cpu_check_block)
		 * is too long to hold a higher priority thread (eg, RT)
		 * off cpu.
		 */
		thread_affinity_set(curthread, CPU_CURRENT);

		/*
		 * Protect read scrub from async faults.  For now, we simply
		 * maintain a count of such faults caught.
		 */

		if (!on_trap(&otd, OT_DATA_EC) && !scan_mmu_pagesize) {
			memscrub_read(va, bpp);
			/*
			 * Check if CEs require logging
			 */
			cpu_check_ce(SCRUBBER_CEEN_CHECK,
			    (uint64_t)pa, va, psz);
			no_trap();
			thread_affinity_clear(curthread);
		} else {
			no_trap();
			thread_affinity_clear(curthread);

			/*
			 * Either we took an async error, or
			 * scan_mmu_pagesize is set.  Rescan the span
			 * at MMU_PAGESIZE granularity: if we were
			 * reading at a larger page size, this lets us
			 * continue scanning the rest of the span; if
			 * scan_mmu_pagesize is set, it lets us skip
			 * the retired pages instead of reading them.
			 */
			if (psz > MMU_PAGESIZE || scan_mmu_pagesize) {
				caddr_t vaddr = va;
				ms_paddr_t paddr = pa;
				int tmp = 0;
				for (; tmp < bpp; tmp += MEMSCRUB_BPP) {
					/* Don't scrub retired pages */
					if (page_retire_check(paddr, NULL)
					    == 0) {
						vaddr += MMU_PAGESIZE;
						paddr += MMU_PAGESIZE;
						retired_pages++;
						continue;
					}
					thread_affinity_set(curthread,
					    CPU_CURRENT);
					if (!on_trap(&otd, OT_DATA_EC)) {
						memscrub_read(vaddr,
						    MEMSCRUB_BPP);
						cpu_check_ce(
						    SCRUBBER_CEEN_CHECK,
						    (uint64_t)paddr, vaddr,
						    MMU_PAGESIZE);
						no_trap();
					} else {
						no_trap();
						MEMSCRUB_STAT_INC(errors_found);
					}
					thread_affinity_clear(curthread);
					vaddr += MMU_PAGESIZE;
					paddr += MMU_PAGESIZE;
				}
			}
		}
		hat_unload(kas.a_hat, va, psz, HAT_UNLOAD_UNLOCK);

		blks -= bpp;
		pa += psz;
		pgsread++;
	}

	/*
	 * If we just finished scrubbing MMU_PAGESIZE at a time and found no
	 * retired pages, delete the span from the global list.
	 */
	if (scan_mmu_pagesize && retired_pages == 0)
		memscrub_page_retire_span_delete(src);

	/*
	 * We encountered a CE/UE on a retired page while reading the current
	 * span.  Add the span to the global list so that future scans of it
	 * skip the retired pages.
	 */
	if (add_to_page_retire_list) {
		if (!memscrub_page_retire_span_search(src))
			memscrub_page_retire_span_add(src);
		add_to_page_retire_list = 0;
	}

	if (memscrub_verbose) {
		cmn_err(CE_NOTE, "Memory scrubber read 0x%x pages starting "
		    "at 0x%" PRIx64, pgsread, src);
	}
}

/*
 * Called by cpu_async_log_err() when memscrub read causes
 * CE/UE on a retired page.
 */
void
memscrub_induced_error(void)
{
	add_to_page_retire_list = 1;
}

/*
 * Called by page_retire() when toxic pages cannot be retired
 * immediately and are scheduled for retire.  Memscrubber stops
 * scrubbing them to avoid further CE/UEs.
 */
void
memscrub_notify(ms_paddr_t pa)
{
	mutex_enter(&memscrub_lock);
	if (!memscrub_page_retire_span_search(pa))
		memscrub_page_retire_span_add(pa);
	mutex_exit(&memscrub_lock);
}

/*
 * Called by memscrub_scan() and memscrub_notify().
 * pa: physical address of span with CE/UE, add to global list.
 */
static void
memscrub_page_retire_span_add(ms_paddr_t pa)
{
	memscrub_page_retire_span_t *new_span;

	new_span = (memscrub_page_retire_span_t *)
	    kmem_zalloc(sizeof (memscrub_page_retire_span_t), KM_NOSLEEP);

	if (new_span == NULL) {
#ifdef MEMSCRUB_DEBUG
		cmn_err(CE_NOTE, "failed to allocate new span - span with"
		    " retired page/s not tracked.\n");
#endif /* MEMSCRUB_DEBUG */
		return;
	}

	new_span->address = pa;
	new_span->next = memscrub_page_retire_span_list;
	memscrub_page_retire_span_list = new_span;
}

/*
 * Called by memscrub_scan().
 * pa: physical address of span to be removed from global list.
 */
static void
memscrub_page_retire_span_delete(ms_paddr_t pa)
{
	memscrub_page_retire_span_t *prev_span, *next_span;

	prev_span = memscrub_page_retire_span_list;
	next_span = memscrub_page_retire_span_list->next;

	if (pa == prev_span->address) {
		memscrub_page_retire_span_list = next_span;
		kmem_free(prev_span, sizeof (memscrub_page_retire_span_t));
		return;
	}

	while (next_span) {
		if (pa == next_span->address) {
			prev_span->next = next_span->next;
			kmem_free(next_span,
			    sizeof (memscrub_page_retire_span_t));
			return;
		}
		prev_span = next_span;
		next_span = next_span->next;
	}
}

/*
 * Called by memscrub_scan() and memscrub_notify().
 * pa: physical address of span to be searched in global list.
 */
static int
memscrub_page_retire_span_search(ms_paddr_t pa)
{
	memscrub_page_retire_span_t *next_span = memscrub_page_retire_span_list;

	while (next_span) {
		if (pa == next_span->address)
			return (1);
		next_span = next_span->next;
	}
	return (0);
}

/*
 * Called from new_memscrub() as a result of memory delete.
 * Using page_numtopp_nolock() to determine if we have a valid PA.
 */
static void
memscrub_page_retire_span_list_update(void)
{
	memscrub_page_retire_span_t *prev, *cur, *next;

	if (memscrub_page_retire_span_list == NULL)
		return;

	prev = cur = memscrub_page_retire_span_list;
	next = cur->next;

	while (cur) {
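		/*
		 * Drop any span whose base page no longer has a page
		 * structure (its memory has been deleted); otherwise
		 * just advance down the list.
		 */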
		if (page_numtopp_nolock(mmu_btop(cur->address)) == NULL) {
			if (cur == memscrub_page_retire_span_list) {
				memscrub_page_retire_span_list = next;
				kmem_free(cur,
				    sizeof (memscrub_page_retire_span_t));
				prev = cur = memscrub_page_retire_span_list;
			} else {
				prev->next = cur->next;
				kmem_free(cur,
				    sizeof (memscrub_page_retire_span_t));
				cur = next;
			}
		} else {
			prev = cur;
			cur = next;
		}
		if (cur != NULL)
			next = cur->next;
	}
}

/*
 * The memory add/delete callback mechanism does not pass in the
 * page ranges. The phys_install list has been updated though, so
 * create a new scrub list from it.
 */

static int
new_memscrub(int update_page_retire_list)
{
	struct memlist *src, *list, *old_list;
	uint_t npgs;

	/*
	 * copy phys_install to memscrub_memlist
	 */
	list = NULL;
	npgs = 0;
	memlist_read_lock();
	for (src = phys_install; src; src = src->ml_next) {
		if (memscrub_add_span_gen((pfn_t)(src->ml_address >> PAGESHIFT),
		    (pgcnt_t)(src->ml_size >> PAGESHIFT), &list, &npgs)) {
			memlist_read_unlock();
			while (list) {
				struct memlist *el;

				el = list;
				list = list->ml_next;
				kmem_free(el, sizeof (struct memlist));
			}
			return (-1);
		}
	}
	memlist_read_unlock();

	mutex_enter(&memscrub_lock);
	memscrub_phys_pages = npgs;
	old_list = memscrub_memlist;
	memscrub_memlist = list;

	if (update_page_retire_list)
		memscrub_page_retire_span_list_update();

	mutex_exit(&memscrub_lock);

	while (old_list) {
		struct memlist *el;

		el = old_list;
		old_list = old_list->ml_next;
		kmem_free(el, sizeof (struct memlist));
	}

	return (0);
}

/*ARGSUSED*/
static void
memscrub_mem_config_post_add(
	void *arg,
	pgcnt_t delta_pages)
{
	/*
	 * We increment pause_memscrub before entering new_memscrub(). This
	 * will force the memscrubber to sleep, allowing the DR callback
	 * thread to acquire memscrub_lock in new_memscrub(). The use of
	 * atomic_inc_32()/atomic_dec_32() allows concurrent memory DR
	 * operations to use the callbacks safely.
	 */
	atomic_inc_32(&pause_memscrub);
	ASSERT(pause_memscrub != 0);

	/*
	 * "Don't care" if we are not scrubbing new memory.
	 */
	(void) new_memscrub(0);		/* retain page retire list */

	/* Restore the pause setting. */
	atomic_dec_32(&pause_memscrub);
}

/*ARGSUSED*/
static int
memscrub_mem_config_pre_del(
	void *arg,
	pgcnt_t delta_pages)
{
	/* Nothing to do. */
	return (0);
}

/*ARGSUSED*/
static void
memscrub_mem_config_post_del(
	void *arg,
	pgcnt_t delta_pages,
	int cancelled)
{
	/*
	 * We increment pause_memscrub before entering new_memscrub(). This
	 * will force the memscrubber to sleep, allowing the DR callback
	 * thread to acquire memscrub_lock in new_memscrub(). The use of
	 * atomic_inc_32()/atomic_dec_32() allows concurrent memory DR
	 * operations to use the callbacks safely.
	 */
	atomic_inc_32(&pause_memscrub);
	ASSERT(pause_memscrub != 0);

	/*
	 * Must stop scrubbing deleted memory as it may be disconnected.
	 */
	if (new_memscrub(1)) {	/* update page retire list */
		disable_memscrub = 1;
	}

	/* Restore the pause setting. */
	atomic_dec_32(&pause_memscrub);
}

static kphysm_setup_vector_t memscrub_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	memscrub_mem_config_post_add,
	memscrub_mem_config_pre_del,
	memscrub_mem_config_post_del,
};

static void
memscrub_init_mem_config()
{
	int ret;

	ret = kphysm_setup_func_register(&memscrub_mem_config_vec,
	    (void *)NULL);
	ASSERT(ret == 0);
}

static void
memscrub_uninit_mem_config()
{
	/* This call is OK if the register call was not done. */
	kphysm_setup_func_unregister(&memscrub_mem_config_vec, (void *)NULL);
}