1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * sun4u Memory Scrubbing
28  *
29  * On detection of a correctable memory ECC error, the sun4u kernel
30  * returns the corrected data to the requester and re-writes it
31  * to memory (DRAM).  So if the correctable error was transient,
 * the error has effectively been cleaned (scrubbed) from memory.
33  *
 * Scrubbing thus reduces the likelihood that multiple transient errors
35  * will occur in the same memory word, making uncorrectable errors due
36  * to transients less likely.
37  *
38  * Thus is born the desire that every memory location be periodically
39  * accessed.
40  *
41  * This file implements a memory scrubbing thread.  This scrubber
42  * guarantees that all of physical memory is accessed periodically
43  * (memscrub_period_sec -- 12 hours).
44  *
45  * It attempts to do this as unobtrusively as possible.  The thread
46  * schedules itself to wake up at an interval such that if it reads
47  * memscrub_span_pages (32MB) on each wakeup, it will read all of physical
 * memory in memscrub_period_sec (12 hours).
49  *
50  * The scrubber uses the block load and prefetch hardware to read memory
51  * @ 1300MB/s, so it reads spans of 32MB in 0.025 seconds.  Unlike the
 * original sun4d scrubber, the sun4u scrubber does not read ahead if the
 * system is idle, because we can read memory very efficiently.
54  *
55  * The scrubber maintains a private copy of the phys_install memory list
56  * to keep track of what memory should be scrubbed.
57  *
58  * The global routines memscrub_add_span() and memscrub_delete_span() are
59  * used to add and delete from this list.  If hotplug memory is later
60  * supported these two routines can be used to notify the scrubber of
61  * memory configuration changes.
62  *
63  * The following parameters can be set via /etc/system
64  *
 * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (32MB)
66  * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
67  * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI)
68  * memscrub_delay_start_sec = (5 minutes)
69  * memscrub_verbose = (0)
70  * memscrub_override_ticks = (1 tick)
71  * disable_memscrub = (0)
72  * pause_memscrub = (0)
73  * read_all_memscrub = (0)
74  *
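 * For example, to scan 64MB per wakeup over a 24 hour period with verbose
 * logging (an illustrative sketch only -- the values are arbitrary, not
 * recommendations, and assume an 8KB PAGESIZE), /etc/system could contain:
 *
 *	set memscrub_span_pages = 8192
 *	set memscrub_period_sec = 86400
 *	set memscrub_verbose = 1
 *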
75  * The scrubber will print NOTICE messages of what it is doing if
76  * "memscrub_verbose" is set.
77  *
78  * If the scrubber's sleep time calculation drops to zero ticks,
79  * memscrub_override_ticks will be used as the sleep time instead. The
80  * sleep time should only drop to zero on a system with over 131.84
81  * terabytes of memory, or where the default scrubber parameters have
82  * been adjusted. For example, reducing memscrub_span_pages or
83  * memscrub_period_sec causes the sleep time to drop to zero with less
84  * memory. Note that since the sleep time is calculated in clock ticks,
85  * using hires clock ticks allows for more memory before the sleep time
86  * becomes zero.
87  *
88  * The scrubber will exit (or never be started) if it finds the variable
89  * "disable_memscrub" set.
90  *
91  * The scrubber will pause (not read memory) when "pause_memscrub"
92  * is set.  It will check the state of pause_memscrub at each wakeup
93  * period.  The scrubber will not make up for lost time.  If you
94  * pause the scrubber for a prolonged period of time you can use
95  * the "read_all_memscrub" switch (see below) to catch up. In addition,
96  * pause_memscrub is used internally by the post memory DR callbacks.
97  * It is set for the small period of time during which the callbacks
98  * are executing. This ensures "memscrub_lock" will be released,
99  * allowing the callbacks to finish.
100  *
101  * The scrubber will read all memory if "read_all_memscrub" is set.
102  * The normal span read will also occur during the wakeup.
103  *
104  * MEMSCRUB_MIN_PAGES (32MB) is the minimum amount of memory a system
105  * must have before we'll start the scrubber.
106  *
107  * MEMSCRUB_DFL_SPAN_PAGES (32MB) is based on the guess that 0.025 sec
 * is a "good" minimum amount of time for the thread to run on each wakeup.
109  *
110  * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
111  * twice the frequency the hardware folk estimated would be necessary.
112  *
113  * MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI) is based on the assumption
 * that the scrubber should get its fair share of time (since each
 * of its scans is short).  At a priority of 0 the scrubber will be starved.
116  */
117 
118 #include <sys/systm.h>		/* timeout, types, t_lock */
119 #include <sys/cmn_err.h>
120 #include <sys/sysmacros.h>	/* MIN */
121 #include <sys/memlist.h>	/* memlist */
122 #include <sys/mem_config.h>	/* memory add/delete */
123 #include <sys/kmem.h>		/* KMEM_NOSLEEP */
124 #include <sys/cpuvar.h>		/* ncpus_online */
125 #include <sys/debug.h>		/* ASSERTs */
126 #include <sys/machsystm.h>	/* lddphys */
127 #include <sys/cpu_module.h>	/* vtag_flushpage */
128 #include <sys/kstat.h>
129 #include <sys/atomic.h>		/* atomic_add_32 */
130 
131 #include <vm/hat.h>
132 #include <vm/seg_kmem.h>
133 #include <vm/hat_sfmmu.h>	/* XXX FIXME - delete */
134 
135 #include <sys/time.h>
136 #include <sys/callb.h>		/* CPR callback */
137 #include <sys/ontrap.h>
138 
139 /*
140  * Should really have paddr_t defined, but it is broken.  Use
141  * ms_paddr_t in the meantime to make the code cleaner
142  */
143 typedef uint64_t ms_paddr_t;
144 
145 /*
146  * Global Routines:
147  */
148 int memscrub_add_span(pfn_t pfn, pgcnt_t pages);
149 int memscrub_delete_span(pfn_t pfn, pgcnt_t pages);
150 int memscrub_init(void);
151 void memscrub_induced_error(void);
152 
153 /*
154  * Global Data:
155  */
156 
157 /*
158  * scrub if we have at least this many pages
159  */
160 #define	MEMSCRUB_MIN_PAGES (32 * 1024 * 1024 / PAGESIZE)
161 
162 /*
 * scan all of physical memory at least once every memscrub_period_sec seconds
164  */
165 #define	MEMSCRUB_DFL_PERIOD_SEC	(12 * 60 * 60)	/* 12 hours */
166 
167 /*
168  * scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration
169  */
170 #define	MEMSCRUB_DFL_SPAN_PAGES	((32 * 1024 * 1024) / PAGESIZE)
171 
172 /*
173  * almost anything is higher priority than scrubbing
174  */
175 #define	MEMSCRUB_DFL_THREAD_PRI	MINCLSYSPRI
176 
177 /*
178  * size used when scanning memory
179  */
180 #define	MEMSCRUB_BLOCK_SIZE		256
181 #define	MEMSCRUB_BLOCK_SIZE_SHIFT	8 	/* log2(MEMSCRUB_BLOCK_SIZE) */
182 #define	MEMSCRUB_BLOCKS_PER_PAGE	(PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
183 
#define	MEMSCRUB_BPP4M		(MMU_PAGESIZE4M >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP512K	(MMU_PAGESIZE512K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP64K		(MMU_PAGESIZE64K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP		(MMU_PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
188 
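/*
 * For reference (assuming the usual sun4u 8KB base page): an 8KB page is
 * 32 scrub blocks, a 64KB page is 256, a 512KB page is 2048 and a 4MB
 * page is 16384.
 */
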
189 /*
190  * This message indicates that we have exceeded the limitations of
191  * the memscrubber. See the comments above regarding what would
192  * cause the sleep time to become zero. In DEBUG mode, this message
193  * is logged on the console and in the messages file. In non-DEBUG
194  * mode, it is only logged in the messages file.
195  */
196 #ifdef DEBUG
197 #define	MEMSCRUB_OVERRIDE_MSG	"Memory scrubber sleep time is zero " \
198 	"seconds, consuming entire CPU."
199 #else
200 #define	MEMSCRUB_OVERRIDE_MSG	"!Memory scrubber sleep time is zero " \
201 	"seconds, consuming entire CPU."
202 #endif /* DEBUG */
203 
204 /*
205  * we can patch these defaults in /etc/system if necessary
206  */
207 uint_t disable_memscrub = 0;
208 uint_t pause_memscrub = 0;
209 uint_t read_all_memscrub = 0;
210 uint_t memscrub_verbose = 0;
211 uint_t memscrub_all_idle = 0;
212 uint_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
213 uint_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
214 uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
215 uint_t memscrub_delay_start_sec = 5 * 60;
216 uint_t memscrub_override_ticks = 1;
217 
218 /*
219  * Static Routines
220  */
221 static void memscrubber(void);
222 static void memscrub_cleanup(void);
223 static int memscrub_add_span_gen(pfn_t, pgcnt_t, struct memlist **, uint_t *);
224 static int memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp);
225 static void memscrub_scan(uint_t blks, ms_paddr_t src);
226 
227 /*
228  * Static Data
229  */
230 
231 static struct memlist *memscrub_memlist;
232 static uint_t memscrub_phys_pages;
233 
234 static kcondvar_t memscrub_cv;
235 static kmutex_t memscrub_lock;
236 /*
237  * memscrub_lock protects memscrub_memlist, interval_ticks, cprinfo, ...
238  */
239 static void memscrub_init_mem_config(void);
240 static void memscrub_uninit_mem_config(void);
241 
242 /*
243  * Linked list of memscrub aware spans having retired pages.
244  * Currently enabled only on sun4u USIII-based platforms.
245  */
246 typedef struct memscrub_page_retire_span {
247 	ms_paddr_t				address;
248 	struct memscrub_page_retire_span	*next;
249 } memscrub_page_retire_span_t;
250 
251 static memscrub_page_retire_span_t *memscrub_page_retire_span_list = NULL;
252 
253 static void memscrub_page_retire_span_add(ms_paddr_t);
254 static void memscrub_page_retire_span_delete(ms_paddr_t);
255 static int memscrub_page_retire_span_search(ms_paddr_t);
256 static void memscrub_page_retire_span_list_update(void);
257 
258 /*
259  * add_to_page_retire_list: Set by cpu_async_log_err() routine
260  * by calling memscrub_induced_error() when CE/UE occurs on a retired
261  * page due to memscrub reading.  Cleared by memscrub after updating
262  * global page retire span list.  Piggybacking on protection of
263  * memscrub_lock, which is held during set and clear.
 * Note: When cpu_async_log_err() calls memscrub_induced_error(), it is
 * running in softint context, fired on the CPU on which the memscrub
 * thread is currently running.  The memscrub thread has its affinity set
 * during memscrub_read(), so migration to a new CPU is not expected.
268  */
269 static int add_to_page_retire_list = 0;
270 
271 /*
272  * Keep track of some interesting statistics
273  */
274 static struct memscrub_kstats {
275 	kstat_named_t	done_early;	/* ahead of schedule */
276 	kstat_named_t	early_sec;	/* by cumulative num secs */
277 	kstat_named_t	done_late;	/* behind schedule */
278 	kstat_named_t	late_sec;	/* by cumulative num secs */
279 	kstat_named_t	interval_ticks;	/* num ticks between intervals */
280 	kstat_named_t	force_run;	/* forced to run, non-timeout */
281 	kstat_named_t	errors_found;	/* num errors found by memscrub */
282 } memscrub_counts = {
283 	{ "done_early",		KSTAT_DATA_UINT32 },
284 	{ "early_sec", 		KSTAT_DATA_UINT32 },
285 	{ "done_late", 		KSTAT_DATA_UINT32 },
286 	{ "late_sec",		KSTAT_DATA_UINT32 },
287 	{ "interval_ticks",	KSTAT_DATA_UINT32 },
288 	{ "force_run",		KSTAT_DATA_UINT32 },
289 	{ "errors_found",	KSTAT_DATA_UINT32 },
290 };
291 
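/*
 * These counters are exported through the "memscrub_kstat" kstat created
 * in memscrub_init(); from userland they can be viewed with, for example,
 * "kstat -n memscrub_kstat" (illustrative invocation).
 */
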
292 #define	MEMSCRUB_STAT_INC(stat)	memscrub_counts.stat.value.ui32++
293 #define	MEMSCRUB_STAT_SET(stat, val) memscrub_counts.stat.value.ui32 = (val)
294 #define	MEMSCRUB_STAT_NINC(stat, val) memscrub_counts.stat.value.ui32 += (val)
295 
296 static struct kstat *memscrub_ksp = (struct kstat *)NULL;
297 
298 static timeout_id_t memscrub_tid = 0;	/* keep track of timeout id */
299 
300 /*
301  * create memscrub_memlist from phys_install list
302  * initialize locks, set memscrub_phys_pages.
303  */
304 int
305 memscrub_init(void)
306 {
307 	struct memlist *src;
308 
309 	/*
	 * only start up the scrubber if we have a minimum
311 	 * number of pages
312 	 */
313 	if (physinstalled >= MEMSCRUB_MIN_PAGES) {
314 
315 		/*
316 		 * initialize locks
317 		 */
318 		mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
319 		cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);
320 
321 		/*
322 		 * copy phys_install to memscrub_memlist
323 		 */
324 		for (src = phys_install; src; src = src->ml_next) {
325 			if (memscrub_add_span(
326 			    (pfn_t)(src->ml_address >> PAGESHIFT),
327 			    (pgcnt_t)(src->ml_size >> PAGESHIFT))) {
328 				memscrub_cleanup();
329 				return (-1);
330 			}
331 		}
332 
333 		/*
334 		 * initialize kstats
335 		 */
336 		memscrub_ksp = kstat_create("unix", 0, "memscrub_kstat",
337 		    "misc", KSTAT_TYPE_NAMED,
338 		    sizeof (memscrub_counts) / sizeof (kstat_named_t),
339 		    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
340 
341 		if (memscrub_ksp) {
342 			memscrub_ksp->ks_data = (void *)&memscrub_counts;
343 			kstat_install(memscrub_ksp);
344 		} else {
345 			cmn_err(CE_NOTE, "Memscrubber cannot create kstats\n");
346 		}
347 
348 		/*
349 		 * create memscrubber thread
350 		 */
351 		(void) thread_create(NULL, 0, (void (*)())memscrubber,
352 		    NULL, 0, &p0, TS_RUN, memscrub_thread_pri);
353 
354 		/*
355 		 * We don't want call backs changing the list
356 		 * if there is no thread running. We do not
357 		 * attempt to deal with stopping/starting scrubbing
358 		 * on memory size changes.
359 		 */
360 		memscrub_init_mem_config();
361 	}
362 
363 	return (0);
364 }
365 
366 static void
367 memscrub_cleanup(void)
368 {
369 	memscrub_uninit_mem_config();
370 	while (memscrub_memlist) {
371 		(void) memscrub_delete_span(
372 		    (pfn_t)(memscrub_memlist->ml_address >> PAGESHIFT),
373 		    (pgcnt_t)(memscrub_memlist->ml_size >> PAGESHIFT));
374 	}
375 	if (memscrub_ksp)
376 		kstat_delete(memscrub_ksp);
377 	cv_destroy(&memscrub_cv);
378 	mutex_destroy(&memscrub_lock);
379 }
380 
381 #ifdef MEMSCRUB_DEBUG
382 static void
383 memscrub_printmemlist(char *title, struct memlist *listp)
384 {
385 	struct memlist *list;
386 
387 	cmn_err(CE_CONT, "%s:\n", title);
388 
389 	for (list = listp; list; list = list->ml_next) {
390 		cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
391 		    list->ml_address, list->ml_size);
392 	}
393 }
394 #endif /* MEMSCRUB_DEBUG */
395 
396 /* ARGSUSED */
397 static void
398 memscrub_wakeup(void *c)
399 {
400 	/*
401 	 * grab mutex to guarantee that our wakeup call
402 	 * arrives after we go to sleep -- so we can't sleep forever.
403 	 */
404 	mutex_enter(&memscrub_lock);
405 	cv_signal(&memscrub_cv);
406 	mutex_exit(&memscrub_lock);
407 }
408 
409 /*
410  * provide an interface external to the memscrubber
411  * which will force the memscrub thread to run vs.
412  * waiting for the timeout, if one is set
413  */
414 void
415 memscrub_run(void)
416 {
417 	MEMSCRUB_STAT_INC(force_run);
418 	if (memscrub_tid) {
419 		(void) untimeout(memscrub_tid);
420 		memscrub_wakeup((void *)NULL);
421 	}
422 }
423 
424 /*
425  * this calculation doesn't account for the time
426  * that the actual scan consumes -- so we'd fall
427  * slightly behind schedule with this interval.
 * The resulting drift is very small.
429  */
430 
431 static uint_t
432 compute_interval_ticks(void)
433 {
434 	/*
	 * We use msp_safe and mpp_safe below to ensure somebody
436 	 * doesn't set memscrub_span_pages or memscrub_phys_pages
437 	 * to 0 on us.
438 	 */
439 	static uint_t msp_safe, mpp_safe;
440 	static uint_t interval_ticks, period_ticks;
441 	msp_safe = memscrub_span_pages;
442 	mpp_safe = memscrub_phys_pages;
443 
444 	period_ticks = memscrub_period_sec * hz;
445 	interval_ticks = period_ticks;
446 
447 	ASSERT(mutex_owned(&memscrub_lock));
448 
449 	if ((msp_safe != 0) && (mpp_safe != 0)) {
450 		if (memscrub_phys_pages <= msp_safe) {
451 			interval_ticks = period_ticks;
452 		} else {
453 			interval_ticks = (period_ticks /
454 			    (mpp_safe / msp_safe));
455 		}
456 	}
457 	return (interval_ticks);
458 }
459 
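/*
 * Worked example (illustrative only; assumes the defaults above, an 8KB
 * PAGESIZE and hz of 100): on a 16GB system memscrub_phys_pages is
 * 16GB / 8KB = 2097152 and memscrub_span_pages is 32MB / 8KB = 4096, so
 * interval_ticks = (12 * 60 * 60 * 100) / (2097152 / 4096) = 8437 ticks,
 * i.e. the thread wakes roughly every 84 seconds to read one 32MB span.
 */
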
460 void
461 memscrubber(void)
462 {
463 	ms_paddr_t address, addr;
464 	time_t deadline;
465 	pgcnt_t pages;
466 	uint_t reached_end = 1;
467 	uint_t paused_message = 0;
468 	uint_t interval_ticks = 0;
469 	uint_t sleep_warn_printed = 0;
470 	callb_cpr_t cprinfo;
471 
472 	/*
473 	 * notify CPR of our existence
474 	 */
475 	CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");
476 
477 	mutex_enter(&memscrub_lock);
478 
479 	if (memscrub_memlist == NULL) {
480 		cmn_err(CE_WARN, "memscrub_memlist not initialized.");
481 		goto memscrub_exit;
482 	}
483 
484 	address = memscrub_memlist->ml_address;
485 
486 	deadline = gethrestime_sec() + memscrub_delay_start_sec;
487 
488 	for (;;) {
489 		if (disable_memscrub)
490 			break;
491 
492 		/*
493 		 * compute interval_ticks
494 		 */
495 		interval_ticks = compute_interval_ticks();
496 
497 		/*
498 		 * If the calculated sleep time is zero, and pause_memscrub
499 		 * has been set, make sure we sleep so that another thread
500 		 * can acquire memscrub_lock.
501 		 */
502 		if (interval_ticks == 0 && pause_memscrub) {
503 			interval_ticks = hz;
504 		}
505 
506 		/*
507 		 * And as a fail safe, under normal non-paused operation, do
508 		 * not allow the sleep time to be zero.
509 		 */
510 		if (interval_ticks == 0) {
511 			interval_ticks = memscrub_override_ticks;
512 			if (!sleep_warn_printed) {
513 				cmn_err(CE_NOTE, MEMSCRUB_OVERRIDE_MSG);
514 				sleep_warn_printed = 1;
515 			}
516 		}
517 
518 		MEMSCRUB_STAT_SET(interval_ticks, interval_ticks);
519 
520 		/*
521 		 * Did we just reach the end of memory? If we are at the
522 		 * end of memory, delay end of memory processing until
523 		 * pause_memscrub is not set.
524 		 */
525 		if (reached_end && !pause_memscrub) {
526 			time_t now = gethrestime_sec();
527 
528 			if (now >= deadline) {
529 				MEMSCRUB_STAT_INC(done_late);
530 				MEMSCRUB_STAT_NINC(late_sec, now - deadline);
531 				/*
532 				 * past deadline, start right away
533 				 */
534 				interval_ticks = 0;
535 
536 				deadline = now + memscrub_period_sec;
537 			} else {
538 				/*
539 				 * we finished ahead of schedule.
540 				 * wait till previous deadline before re-start.
541 				 */
542 				interval_ticks = (deadline - now) * hz;
543 				MEMSCRUB_STAT_INC(done_early);
544 				MEMSCRUB_STAT_NINC(early_sec, deadline - now);
545 				deadline += memscrub_period_sec;
546 			}
547 			reached_end = 0;
548 			sleep_warn_printed = 0;
549 		}
550 
551 		if (interval_ticks != 0) {
552 			/*
553 			 * it is safe from our standpoint for CPR to
554 			 * suspend the system
555 			 */
556 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
557 
558 			/*
559 			 * hit the snooze bar
560 			 */
561 			memscrub_tid = timeout(memscrub_wakeup, NULL,
562 			    interval_ticks);
563 
564 			/*
565 			 * go to sleep
566 			 */
567 			cv_wait(&memscrub_cv, &memscrub_lock);
568 
569 			/*
570 			 * at this point, no timeout should be set
571 			 */
572 			memscrub_tid = 0;
573 
574 			/*
575 			 * we need to goto work and will be modifying
576 			 * our internal state and mapping/unmapping
577 			 * TTEs
578 			 */
579 			CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);
580 		}
581 
582 
583 		if (memscrub_phys_pages == 0) {
584 			cmn_err(CE_WARN, "Memory scrubber has 0 pages to read");
585 			goto memscrub_exit;
586 		}
587 
588 		if (!pause_memscrub) {
589 			if (paused_message) {
590 				paused_message = 0;
591 				if (memscrub_verbose)
592 					cmn_err(CE_NOTE, "Memory scrubber "
593 					    "resuming");
594 			}
595 
596 			if (read_all_memscrub) {
597 				if (memscrub_verbose)
598 					cmn_err(CE_NOTE, "Memory scrubber "
599 					    "reading all memory per request");
600 
601 				addr = memscrub_memlist->ml_address;
602 				reached_end = 0;
603 				while (!reached_end) {
604 					if (disable_memscrub)
605 						break;
606 					pages = memscrub_phys_pages;
607 					reached_end = memscrub_verify_span(
608 					    &addr, &pages);
609 					memscrub_scan(pages *
610 					    MEMSCRUB_BLOCKS_PER_PAGE, addr);
611 					addr += ((uint64_t)pages * PAGESIZE);
612 				}
613 				read_all_memscrub = 0;
614 			}
615 
616 			/*
617 			 * read 1 span
618 			 */
619 			pages = memscrub_span_pages;
620 
621 			if (disable_memscrub)
622 				break;
623 
624 			/*
625 			 * determine physical address range
626 			 */
627 			reached_end = memscrub_verify_span(&address,
628 			    &pages);
629 
630 			memscrub_scan(pages * MEMSCRUB_BLOCKS_PER_PAGE,
631 			    address);
632 
633 			address += ((uint64_t)pages * PAGESIZE);
634 		}
635 
636 		if (pause_memscrub && !paused_message) {
637 			paused_message = 1;
638 			if (memscrub_verbose)
639 				cmn_err(CE_NOTE, "Memory scrubber paused");
640 		}
641 	}
642 
643 memscrub_exit:
644 	cmn_err(CE_NOTE, "Memory scrubber exiting");
645 	CALLB_CPR_EXIT(&cprinfo);
646 	memscrub_cleanup();
647 	thread_exit();
648 	/* NOTREACHED */
649 }
650 
651 /*
652  * condition address and size
653  * such that they span legal physical addresses.
654  *
655  * when appropriate, address will be rounded up to start of next
656  * struct memlist, and pages will be rounded down to the end of the
657  * memlist size.
658  *
659  * returns 1 if reached end of list, else returns 0.
660  */
661 static int
662 memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp)
663 {
664 	struct memlist *mlp;
665 	ms_paddr_t address = *addrp;
666 	uint64_t bytes = (uint64_t)*pagesp * PAGESIZE;
667 	uint64_t bytes_remaining;
668 	int reached_end = 0;
669 
670 	ASSERT(mutex_owned(&memscrub_lock));
671 
672 	/*
673 	 * find memlist struct that contains addrp
674 	 * assumes memlist is sorted by ascending address.
675 	 */
676 	for (mlp = memscrub_memlist; mlp != NULL; mlp = mlp->ml_next) {
677 		/*
678 		 * if before this chunk, round up to beginning
679 		 */
680 		if (address < mlp->ml_address) {
681 			address = mlp->ml_address;
682 			break;
683 		}
684 		/*
685 		 * if before end of chunk, then we found it
686 		 */
687 		if (address < (mlp->ml_address + mlp->ml_size))
688 			break;
689 
690 		/* else go to next struct memlist */
691 	}
692 	/*
693 	 * if we hit end of list, start at beginning
694 	 */
695 	if (mlp == NULL) {
696 		mlp = memscrub_memlist;
697 		address = mlp->ml_address;
698 	}
699 
700 	/*
701 	 * now we have legal address, and its mlp, condition bytes
702 	 */
703 	bytes_remaining = (mlp->ml_address + mlp->ml_size) - address;
704 
705 	if (bytes > bytes_remaining)
706 		bytes = bytes_remaining;
707 
708 	/*
709 	 * will this span take us to end of list?
710 	 */
711 	if ((mlp->ml_next == NULL) &&
712 	    ((mlp->ml_address + mlp->ml_size) == (address + bytes)))
713 		reached_end = 1;
714 
715 	/* return values */
716 	*addrp = address;
717 	*pagesp = bytes / PAGESIZE;
718 
719 	return (reached_end);
720 }
721 
722 /*
723  * add a span to the memscrub list
724  * add to memscrub_phys_pages
725  */
726 int
727 memscrub_add_span(pfn_t pfn, pgcnt_t pages)
728 {
729 #ifdef MEMSCRUB_DEBUG
730 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
731 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
732 #endif /* MEMSCRUB_DEBUG */
733 
734 	int retval;
735 
736 	mutex_enter(&memscrub_lock);
737 
738 #ifdef MEMSCRUB_DEBUG
739 	memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
740 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
741 	cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
742 	    " size: 0x%llx\n", address, bytes);
743 #endif /* MEMSCRUB_DEBUG */
744 
745 	retval = memscrub_add_span_gen(pfn, pages, &memscrub_memlist,
746 	    &memscrub_phys_pages);
747 
748 #ifdef MEMSCRUB_DEBUG
749 	memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
750 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
751 #endif /* MEMSCRUB_DEBUG */
752 
753 	mutex_exit(&memscrub_lock);
754 
755 	return (retval);
756 }
757 
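/*
 * Usage sketch (hypothetical caller, not part of this file): a memory
 * hotplug path that knows an added range as a base pfn and a page count
 * could notify the scrubber directly, e.g.
 *
 *	if (memscrub_add_span(base_pfn, npgs) != 0)
 *		cmn_err(CE_WARN, "memscrub: could not add span");
 *
 * In practice the DR callbacks below rebuild the whole list from
 * phys_install via new_memscrub() instead.
 */
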
758 static int
759 memscrub_add_span_gen(
760 	pfn_t pfn,
761 	pgcnt_t pages,
762 	struct memlist **list,
763 	uint_t *npgs)
764 {
765 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
766 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
767 	struct memlist *dst;
768 	struct memlist *prev, *next;
769 	int retval = 0;
770 
771 	/*
772 	 * allocate a new struct memlist
773 	 */
774 
775 	dst = (struct memlist *)
776 	    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
777 
778 	if (dst == NULL) {
779 		retval = -1;
780 		goto add_done;
781 	}
782 
783 	dst->ml_address = address;
784 	dst->ml_size = bytes;
785 
786 	/*
787 	 * first insert
788 	 */
789 	if (*list == NULL) {
790 		dst->ml_prev = NULL;
791 		dst->ml_next = NULL;
792 		*list = dst;
793 
794 		goto add_done;
795 	}
796 
797 	/*
798 	 * insert into sorted list
799 	 */
800 	for (prev = NULL, next = *list;
801 	    next != NULL;
802 	    prev = next, next = next->ml_next) {
803 		if (address > (next->ml_address + next->ml_size))
804 			continue;
805 
806 		/*
807 		 * else insert here
808 		 */
809 
810 		/*
811 		 * prepend to next
812 		 */
813 		if ((address + bytes) == next->ml_address) {
814 			kmem_free(dst, sizeof (struct memlist));
815 
816 			next->ml_address = address;
817 			next->ml_size += bytes;
818 
819 			goto add_done;
820 		}
821 
822 		/*
823 		 * append to next
824 		 */
825 		if (address == (next->ml_address + next->ml_size)) {
826 			kmem_free(dst, sizeof (struct memlist));
827 
828 			if (next->ml_next) {
829 				/*
830 				 * don't overlap with next->ml_next
831 				 */
832 				if ((address + bytes) >
833 				    next->ml_next->ml_address) {
834 					retval = -1;
835 					goto add_done;
836 				}
837 				/*
838 				 * concatenate next and next->ml_next
839 				 */
840 				if ((address + bytes) ==
841 				    next->ml_next->ml_address) {
842 					struct memlist *mlp = next->ml_next;
843 
844 					if (next == *list)
845 						*list = next->ml_next;
846 
847 					mlp->ml_address = next->ml_address;
848 					mlp->ml_size += next->ml_size;
849 					mlp->ml_size += bytes;
850 
851 					if (next->ml_prev)
852 						next->ml_prev->ml_next = mlp;
853 					mlp->ml_prev = next->ml_prev;
854 
855 					kmem_free(next,
856 					    sizeof (struct memlist));
857 					goto add_done;
858 				}
859 			}
860 
861 			next->ml_size += bytes;
862 
863 			goto add_done;
864 		}
865 
866 		/* don't overlap with next */
867 		if ((address + bytes) > next->ml_address) {
868 			retval = -1;
869 			kmem_free(dst, sizeof (struct memlist));
870 			goto add_done;
871 		}
872 
873 		/*
874 		 * insert before next
875 		 */
876 		dst->ml_prev = prev;
877 		dst->ml_next = next;
878 		next->ml_prev = dst;
879 		if (prev == NULL) {
880 			*list = dst;
881 		} else {
882 			prev->ml_next = dst;
883 		}
884 		goto add_done;
885 	}	/* end for */
886 
887 	/*
888 	 * end of list, prev is valid and next is NULL
889 	 */
890 	prev->ml_next = dst;
891 	dst->ml_prev = prev;
892 	dst->ml_next = NULL;
893 
894 add_done:
895 
896 	if (retval != -1)
897 		*npgs += pages;
898 
899 	return (retval);
900 }
901 
902 /*
903  * delete a span from the memscrub list
904  * subtract from memscrub_phys_pages
905  */
906 int
907 memscrub_delete_span(pfn_t pfn, pgcnt_t pages)
908 {
909 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
910 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
911 	struct memlist *dst, *next;
912 	int retval = 0;
913 
914 	mutex_enter(&memscrub_lock);
915 
916 #ifdef MEMSCRUB_DEBUG
917 	memscrub_printmemlist("memscrub_memlist Before", memscrub_memlist);
918 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
919 	cmn_err(CE_CONT, "memscrub_delete_span: 0x%llx 0x%llx\n",
920 	    address, bytes);
921 #endif /* MEMSCRUB_DEBUG */
922 
923 	/*
924 	 * find struct memlist containing page
925 	 */
926 	for (next = memscrub_memlist; next != NULL; next = next->ml_next) {
927 		if ((address >= next->ml_address) &&
928 		    (address < next->ml_address + next->ml_size))
929 			break;
930 	}
931 
932 	/*
933 	 * if start address not in list
934 	 */
935 	if (next == NULL) {
936 		retval = -1;
937 		goto delete_done;
938 	}
939 
940 	/*
941 	 * error if size goes off end of this struct memlist
942 	 */
943 	if (address + bytes > next->ml_address + next->ml_size) {
944 		retval = -1;
945 		goto delete_done;
946 	}
947 
948 	/*
949 	 * pages at beginning of struct memlist
950 	 */
951 	if (address == next->ml_address) {
952 		/*
953 		 * if start & size match, delete from list
954 		 */
955 		if (bytes == next->ml_size) {
956 			if (next == memscrub_memlist)
957 				memscrub_memlist = next->ml_next;
958 			if (next->ml_prev != NULL)
959 				next->ml_prev->ml_next = next->ml_next;
960 			if (next->ml_next != NULL)
961 				next->ml_next->ml_prev = next->ml_prev;
962 
963 			kmem_free(next, sizeof (struct memlist));
964 		} else {
965 		/*
966 		 * increment start address by bytes
967 		 */
968 			next->ml_address += bytes;
969 			next->ml_size -= bytes;
970 		}
971 		goto delete_done;
972 	}
973 
974 	/*
975 	 * pages at end of struct memlist
976 	 */
977 	if (address + bytes == next->ml_address + next->ml_size) {
978 		/*
979 		 * decrement size by bytes
980 		 */
981 		next->ml_size -= bytes;
982 		goto delete_done;
983 	}
984 
985 	/*
986 	 * delete a span in the middle of the struct memlist
987 	 */
988 	{
989 		/*
990 		 * create a new struct memlist
991 		 */
992 		dst = (struct memlist *)
993 		    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
994 
995 		if (dst == NULL) {
996 			retval = -1;
997 			goto delete_done;
998 		}
999 
		/*
		 * The new struct memlist gets the range starting just after
		 * the deleted span, up to the end of the existing chunk;
		 * the existing struct memlist keeps the range up to the
		 * start of the deleted span.
		 */
		dst->ml_address = address + bytes;
		dst->ml_size =
		    (next->ml_address + next->ml_size) - dst->ml_address;
		next->ml_size = address - next->ml_address;
1013 
1014 		/*
1015 		 * link in new memlist after old
1016 		 */
1017 		dst->ml_next = next->ml_next;
1018 		dst->ml_prev = next;
1019 
1020 		if (next->ml_next != NULL)
1021 			next->ml_next->ml_prev = dst;
1022 		next->ml_next = dst;
1023 	}
1024 
1025 delete_done:
1026 	if (retval != -1) {
1027 		memscrub_phys_pages -= pages;
1028 		if (memscrub_phys_pages == 0)
1029 			disable_memscrub = 1;
1030 	}
1031 
1032 #ifdef MEMSCRUB_DEBUG
1033 	memscrub_printmemlist("memscrub_memlist After", memscrub_memlist);
1034 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
1035 #endif /* MEMSCRUB_DEBUG */
1036 
1037 	mutex_exit(&memscrub_lock);
1038 	return (retval);
1039 }
1040 
1041 static void
1042 memscrub_scan(uint_t blks, ms_paddr_t src)
1043 {
1044 	uint_t 		psz, bpp, pgsread;
1045 	pfn_t		pfn;
1046 	ms_paddr_t	pa;
1047 	caddr_t		va;
1048 	on_trap_data_t	otd;
1049 	int		scan_mmu_pagesize = 0;
1050 	int		retired_pages = 0;
1051 
1052 	extern void memscrub_read(caddr_t src, uint_t blks);
1053 
1054 	ASSERT(mutex_owned(&memscrub_lock));
1055 
1056 	pgsread = 0;
1057 	pa = src;
1058 
1059 	if (memscrub_page_retire_span_list != NULL) {
1060 		if (memscrub_page_retire_span_search(src)) {
1061 			/* retired pages in current span */
1062 			scan_mmu_pagesize = 1;
1063 		}
1064 	}
1065 
1066 #ifdef MEMSCRUB_DEBUG
	cmn_err(CE_NOTE, "scan_mmu_pagesize = %d\n", scan_mmu_pagesize);
1068 #endif /* MEMSCRUB_DEBUG */
1069 
1070 	while (blks != 0) {
1071 		/* Ensure the PA is properly aligned */
1072 		if (((pa & MMU_PAGEMASK4M) == pa) &&
1073 		    (blks >= MEMSCRUB_BPP4M)) {
1074 			psz = MMU_PAGESIZE4M;
1075 			bpp = MEMSCRUB_BPP4M;
1076 		} else if (((pa & MMU_PAGEMASK512K) == pa) &&
1077 		    (blks >= MEMSCRUB_BPP512K)) {
1078 			psz = MMU_PAGESIZE512K;
1079 			bpp = MEMSCRUB_BPP512K;
1080 		} else if (((pa & MMU_PAGEMASK64K) == pa) &&
1081 		    (blks >= MEMSCRUB_BPP64K)) {
1082 			psz = MMU_PAGESIZE64K;
1083 			bpp = MEMSCRUB_BPP64K;
1084 		} else if ((pa & MMU_PAGEMASK) == pa) {
1085 			psz = MMU_PAGESIZE;
1086 			bpp = MEMSCRUB_BPP;
1087 		} else {
1088 			if (memscrub_verbose) {
1089 				cmn_err(CE_NOTE, "Memory scrubber ignoring "
1090 				    "non-page aligned block starting at 0x%"
1091 				    PRIx64, src);
1092 			}
1093 			return;
1094 		}
		if (blks < bpp)
			bpp = blks;
1096 
1097 #ifdef MEMSCRUB_DEBUG
1098 		cmn_err(CE_NOTE, "Going to run psz=%x, "
1099 		    "bpp=%x pa=%llx\n", psz, bpp, pa);
1100 #endif /* MEMSCRUB_DEBUG */
1101 
1102 		/*
1103 		 * MEMSCRUBBASE is a 4MB aligned page in the
1104 		 * kernel so that we can quickly map the PA
1105 		 * to a VA for the block loads performed in
1106 		 * memscrub_read.
1107 		 */
1108 		pfn = mmu_btop(pa);
1109 		va = (caddr_t)MEMSCRUBBASE;
1110 		hat_devload(kas.a_hat, va, psz, pfn, PROT_READ,
1111 		    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
1112 
1113 		/*
1114 		 * Can't allow the memscrubber to migrate across CPUs as
1115 		 * we need to know whether CEEN is enabled for the current
1116 		 * CPU to enable us to scrub the memory. Don't use
1117 		 * kpreempt_disable as the time we take to scan a span (even
1118 		 * without cpu_check_ce having to manually cpu_check_block)
1119 		 * is too long to hold a higher priority thread (eg, RT)
1120 		 * off cpu.
1121 		 */
1122 		thread_affinity_set(curthread, CPU_CURRENT);
1123 
1124 		/*
1125 		 * Protect read scrub from async faults.  For now, we simply
1126 		 * maintain a count of such faults caught.
1127 		 */
1128 
1129 		if (!on_trap(&otd, OT_DATA_EC) && !scan_mmu_pagesize) {
1130 			memscrub_read(va, bpp);
1131 			/*
1132 			 * Check if CEs require logging
1133 			 */
1134 			cpu_check_ce(SCRUBBER_CEEN_CHECK,
1135 			    (uint64_t)pa, va, psz);
1136 			no_trap();
1137 			thread_affinity_clear(curthread);
1138 		} else {
1139 			no_trap();
1140 			thread_affinity_clear(curthread);
1141 
1142 			/*
			 * Either we got an async error, or
			 * scan_mmu_pagesize is set.  Rescan the span at
			 * MMU_PAGESIZE granularity: if we were reading
			 * at a larger page size, this ensures we
			 * continue to scan the rest of the span; if
			 * scan_mmu_pagesize is set, it lets us skip the
			 * retired pages instead of reading them.
1152 			 */
1153 			if (psz > MMU_PAGESIZE || scan_mmu_pagesize) {
1154 				caddr_t vaddr = va;
1155 				ms_paddr_t paddr = pa;
1156 				int tmp = 0;
1157 				for (; tmp < bpp; tmp += MEMSCRUB_BPP) {
1158 					/* Don't scrub retired pages */
1159 					if (page_retire_check(paddr, NULL)
1160 					    == 0) {
1161 						vaddr += MMU_PAGESIZE;
1162 						paddr += MMU_PAGESIZE;
1163 						retired_pages++;
1164 						continue;
1165 					}
1166 					thread_affinity_set(curthread,
1167 					    CPU_CURRENT);
1168 					if (!on_trap(&otd, OT_DATA_EC)) {
1169 						memscrub_read(vaddr,
1170 						    MEMSCRUB_BPP);
1171 						cpu_check_ce(
1172 						    SCRUBBER_CEEN_CHECK,
1173 						    (uint64_t)paddr, vaddr,
1174 						    MMU_PAGESIZE);
1175 						no_trap();
1176 					} else {
1177 						no_trap();
1178 						MEMSCRUB_STAT_INC(errors_found);
1179 					}
1180 					thread_affinity_clear(curthread);
1181 					vaddr += MMU_PAGESIZE;
1182 					paddr += MMU_PAGESIZE;
1183 				}
1184 			}
1185 		}
1186 		hat_unload(kas.a_hat, va, psz, HAT_UNLOAD_UNLOCK);
1187 
1188 		blks -= bpp;
1189 		pa += psz;
1190 		pgsread++;
1191 	}
1192 
1193 	/*
	 * If we just finished scrubbing MMU_PAGESIZE at a time and found no
	 * retired pages, delete the span from the global list.
1196 	 */
1197 	if (scan_mmu_pagesize && retired_pages == 0)
1198 		memscrub_page_retire_span_delete(src);
1199 
1200 	/*
1201 	 * Encountered CE/UE on a retired page during memscrub read of current
	 * span.  Add the span to the global list so we avoid reading it again.
1203 	 */
1204 	if (add_to_page_retire_list) {
1205 		if (!memscrub_page_retire_span_search(src))
1206 			memscrub_page_retire_span_add(src);
1207 		add_to_page_retire_list = 0;
1208 	}
1209 
1210 	if (memscrub_verbose) {
1211 		cmn_err(CE_NOTE, "Memory scrubber read 0x%x pages starting "
1212 		    "at 0x%" PRIx64, pgsread, src);
1213 	}
1214 }
1215 
1216 /*
1217  * Called by cpu_async_log_err() when memscrub read causes
1218  * CE/UE on a retired page.
1219  */
1220 void
1221 memscrub_induced_error(void)
1222 {
1223 	add_to_page_retire_list = 1;
1224 }
1225 
1226 /*
1227  * Called by page_retire() when toxic pages cannot be retired
1228  * immediately and are scheduled for retire.  Memscrubber stops
1229  * scrubbing them to avoid further CE/UEs.
1230  */
1231 void
1232 memscrub_notify(ms_paddr_t pa)
1233 {
1234 	mutex_enter(&memscrub_lock);
1235 	if (!memscrub_page_retire_span_search(pa))
1236 		memscrub_page_retire_span_add(pa);
1237 	mutex_exit(&memscrub_lock);
1238 }
1239 
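/*
 * Illustrative call (hypothetical caller shown for clarity): a path that
 * has only the page's pfn would convert it the same way this file does,
 * e.g. memscrub_notify((ms_paddr_t)pfn << PAGESHIFT);
 */
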
1240 /*
1241  * Called by memscrub_scan() and memscrub_notify().
1242  * pa: physical address of span with CE/UE, add to global list.
1243  */
1244 static void
1245 memscrub_page_retire_span_add(ms_paddr_t pa)
1246 {
1247 	memscrub_page_retire_span_t *new_span;
1248 
1249 	new_span = (memscrub_page_retire_span_t *)
1250 	    kmem_zalloc(sizeof (memscrub_page_retire_span_t), KM_NOSLEEP);
1251 
1252 	if (new_span == NULL) {
1253 #ifdef MEMSCRUB_DEBUG
1254 		cmn_err(CE_NOTE, "failed to allocate new span - span with"
1255 		    " retired page/s not tracked.\n");
1256 #endif /* MEMSCRUB_DEBUG */
1257 		return;
1258 	}
1259 
1260 	new_span->address = pa;
1261 	new_span->next = memscrub_page_retire_span_list;
1262 	memscrub_page_retire_span_list = new_span;
1263 }
1264 
1265 /*
1266  * Called by memscrub_scan().
1267  * pa: physical address of span to be removed from global list.
1268  */
1269 static void
1270 memscrub_page_retire_span_delete(ms_paddr_t pa)
1271 {
1272 	memscrub_page_retire_span_t *prev_span, *next_span;
1273 
1274 	prev_span = memscrub_page_retire_span_list;
1275 	next_span = memscrub_page_retire_span_list->next;
1276 
1277 	if (pa == prev_span->address) {
1278 		memscrub_page_retire_span_list = next_span;
1279 		kmem_free(prev_span, sizeof (memscrub_page_retire_span_t));
1280 		return;
1281 	}
1282 
1283 	while (next_span) {
1284 		if (pa == next_span->address) {
1285 			prev_span->next = next_span->next;
1286 			kmem_free(next_span,
1287 			    sizeof (memscrub_page_retire_span_t));
1288 			return;
1289 		}
1290 		prev_span = next_span;
1291 		next_span = next_span->next;
1292 	}
1293 }
1294 
1295 /*
1296  * Called by memscrub_scan() and memscrub_notify().
1297  * pa: physical address of span to be searched in global list.
1298  */
1299 static int
1300 memscrub_page_retire_span_search(ms_paddr_t pa)
1301 {
1302 	memscrub_page_retire_span_t *next_span = memscrub_page_retire_span_list;
1303 
1304 	while (next_span) {
1305 		if (pa == next_span->address)
1306 			return (1);
1307 		next_span = next_span->next;
1308 	}
1309 	return (0);
1310 }
1311 
1312 /*
 * Called from new_memscrub() as a result of a memory delete.
 * Uses page_numtopp_nolock() to determine whether each span still
 * refers to a valid PA.
1315  */
1316 static void
1317 memscrub_page_retire_span_list_update(void)
1318 {
1319 	memscrub_page_retire_span_t *prev, *cur, *next;
1320 
1321 	if (memscrub_page_retire_span_list == NULL)
1322 		return;
1323 
1324 	prev = cur = memscrub_page_retire_span_list;
1325 	next = cur->next;
1326 
1327 	while (cur) {
1328 		if (page_numtopp_nolock(mmu_btop(cur->address)) == NULL) {
1329 			if (cur == memscrub_page_retire_span_list) {
1330 				memscrub_page_retire_span_list = next;
1331 				kmem_free(cur,
1332 				    sizeof (memscrub_page_retire_span_t));
1333 				prev = cur = memscrub_page_retire_span_list;
1334 			} else {
1335 				prev->next = cur->next;
1336 				kmem_free(cur,
1337 				    sizeof (memscrub_page_retire_span_t));
1338 				cur = next;
1339 			}
1340 		} else {
1341 			prev = cur;
1342 			cur = next;
1343 		}
1344 		if (cur != NULL)
1345 			next = cur->next;
1346 	}
1347 }
1348 
1349 /*
1350  * The memory add/delete callback mechanism does not pass in the
1351  * page ranges. The phys_install list has been updated though, so
1352  * create a new scrub list from it.
1353  */
1354 
1355 static int
1356 new_memscrub(int update_page_retire_list)
1357 {
1358 	struct memlist *src, *list, *old_list;
1359 	uint_t npgs;
1360 
1361 	/*
1362 	 * copy phys_install to memscrub_memlist
1363 	 */
1364 	list = NULL;
1365 	npgs = 0;
1366 	memlist_read_lock();
1367 	for (src = phys_install; src; src = src->ml_next) {
1368 		if (memscrub_add_span_gen((pfn_t)(src->ml_address >> PAGESHIFT),
1369 		    (pgcnt_t)(src->ml_size >> PAGESHIFT), &list, &npgs)) {
1370 			memlist_read_unlock();
1371 			while (list) {
1372 				struct memlist *el;
1373 
1374 				el = list;
1375 				list = list->ml_next;
1376 				kmem_free(el, sizeof (struct memlist));
1377 			}
1378 			return (-1);
1379 		}
1380 	}
1381 	memlist_read_unlock();
1382 
1383 	mutex_enter(&memscrub_lock);
1384 	memscrub_phys_pages = npgs;
1385 	old_list = memscrub_memlist;
1386 	memscrub_memlist = list;
1387 
1388 	if (update_page_retire_list)
1389 		memscrub_page_retire_span_list_update();
1390 
1391 	mutex_exit(&memscrub_lock);
1392 
1393 	while (old_list) {
1394 		struct memlist *el;
1395 
1396 		el = old_list;
1397 		old_list = old_list->ml_next;
1398 		kmem_free(el, sizeof (struct memlist));
1399 	}
1400 
1401 	return (0);
1402 }
1403 
1404 /*ARGSUSED*/
1405 static void
1406 memscrub_mem_config_post_add(
1407 	void *arg,
1408 	pgcnt_t delta_pages)
1409 {
1410 	/*
1411 	 * We increment pause_memscrub before entering new_memscrub(). This
1412 	 * will force the memscrubber to sleep, allowing the DR callback
1413 	 * thread to acquire memscrub_lock in new_memscrub(). The use of
	 * atomic_inc_32() and atomic_dec_32() allows concurrent memory DR
	 * operations to use the callbacks safely.
1416 	 */
1417 	atomic_inc_32(&pause_memscrub);
1418 	ASSERT(pause_memscrub != 0);
1419 
1420 	/*
1421 	 * "Don't care" if we are not scrubbing new memory.
1422 	 */
1423 	(void) new_memscrub(0);		/* retain page retire list */
1424 
1425 	/* Restore the pause setting. */
1426 	atomic_dec_32(&pause_memscrub);
1427 }
1428 
1429 /*ARGSUSED*/
1430 static int
1431 memscrub_mem_config_pre_del(
1432 	void *arg,
1433 	pgcnt_t delta_pages)
1434 {
1435 	/* Nothing to do. */
1436 	return (0);
1437 }
1438 
1439 /*ARGSUSED*/
1440 static void
1441 memscrub_mem_config_post_del(
1442 	void *arg,
1443 	pgcnt_t delta_pages,
1444 	int cancelled)
1445 {
1446 	/*
1447 	 * We increment pause_memscrub before entering new_memscrub(). This
1448 	 * will force the memscrubber to sleep, allowing the DR callback
1449 	 * thread to acquire memscrub_lock in new_memscrub(). The use of
	 * atomic_inc_32() and atomic_dec_32() allows concurrent memory DR
	 * operations to use the callbacks safely.
1452 	 */
1453 	atomic_inc_32(&pause_memscrub);
1454 	ASSERT(pause_memscrub != 0);
1455 
1456 	/*
1457 	 * Must stop scrubbing deleted memory as it may be disconnected.
1458 	 */
1459 	if (new_memscrub(1)) {	/* update page retire list */
1460 		disable_memscrub = 1;
1461 	}
1462 
1463 	/* Restore the pause setting. */
1464 	atomic_dec_32(&pause_memscrub);
1465 }
1466 
1467 static kphysm_setup_vector_t memscrub_mem_config_vec = {
1468 	KPHYSM_SETUP_VECTOR_VERSION,
1469 	memscrub_mem_config_post_add,
1470 	memscrub_mem_config_pre_del,
1471 	memscrub_mem_config_post_del,
1472 };
1473 
1474 static void
1475 memscrub_init_mem_config()
1476 {
1477 	int ret;
1478 
1479 	ret = kphysm_setup_func_register(&memscrub_mem_config_vec,
1480 	    (void *)NULL);
1481 	ASSERT(ret == 0);
1482 }
1483 
1484 static void
1485 memscrub_uninit_mem_config()
1486 {
1487 	/* This call is OK if the register call was not done. */
1488 	kphysm_setup_func_unregister(&memscrub_mem_config_vec, (void *)NULL);
1489 }
1490