xref: /titanic_52/usr/src/uts/sun4u/os/memscrub.c (revision fd0939ef389f48c901faf4bf0b60b82d4bc58b64)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * sun4u Memory Scrubbing
28  *
29  * On detection of a correctable memory ECC error, the sun4u kernel
30  * returns the corrected data to the requester and re-writes it
31  * to memory (DRAM).  So if the correctable error was transient,
32  * the read has effectively been cleaned (scrubbed) from memory.
33  *
 * Scrubbing thus reduces the likelihood that multiple transient errors
35  * will occur in the same memory word, making uncorrectable errors due
36  * to transients less likely.
37  *
38  * Thus is born the desire that every memory location be periodically
39  * accessed.
40  *
41  * This file implements a memory scrubbing thread.  This scrubber
42  * guarantees that all of physical memory is accessed periodically
43  * (memscrub_period_sec -- 12 hours).
44  *
45  * It attempts to do this as unobtrusively as possible.  The thread
46  * schedules itself to wake up at an interval such that if it reads
47  * memscrub_span_pages (32MB) on each wakeup, it will read all of physical
 * memory in memscrub_period_sec (12 hours).
49  *
50  * The scrubber uses the block load and prefetch hardware to read memory
51  * @ 1300MB/s, so it reads spans of 32MB in 0.025 seconds.  Unlike the
52  * original sun4d scrubber the sun4u scrubber does not read ahead if the
 * system is idle because we can read memory very efficiently.
54  *
55  * The scrubber maintains a private copy of the phys_install memory list
56  * to keep track of what memory should be scrubbed.
57  *
58  * The global routines memscrub_add_span() and memscrub_delete_span() are
59  * used to add and delete from this list.  If hotplug memory is later
60  * supported these two routines can be used to notify the scrubber of
61  * memory configuration changes.
62  *
63  * The following parameters can be set via /etc/system
64  *
 * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (32MB)
66  * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
67  * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI)
68  * memscrub_delay_start_sec = (5 minutes)
69  * memscrub_verbose = (0)
70  * memscrub_override_ticks = (1 tick)
71  * disable_memscrub = (0)
72  * pause_memscrub = (0)
73  * read_all_memscrub = (0)
74  *
75  * The scrubber will print NOTICE messages of what it is doing if
76  * "memscrub_verbose" is set.
77  *
78  * If the scrubber's sleep time calculation drops to zero ticks,
79  * memscrub_override_ticks will be used as the sleep time instead. The
80  * sleep time should only drop to zero on a system with over 131.84
81  * terabytes of memory, or where the default scrubber parameters have
82  * been adjusted. For example, reducing memscrub_span_pages or
83  * memscrub_period_sec causes the sleep time to drop to zero with less
84  * memory. Note that since the sleep time is calculated in clock ticks,
85  * using hires clock ticks allows for more memory before the sleep time
86  * becomes zero.
87  *
88  * The scrubber will exit (or never be started) if it finds the variable
89  * "disable_memscrub" set.
90  *
91  * The scrubber will pause (not read memory) when "pause_memscrub"
92  * is set.  It will check the state of pause_memscrub at each wakeup
93  * period.  The scrubber will not make up for lost time.  If you
94  * pause the scrubber for a prolonged period of time you can use
95  * the "read_all_memscrub" switch (see below) to catch up. In addition,
96  * pause_memscrub is used internally by the post memory DR callbacks.
97  * It is set for the small period of time during which the callbacks
98  * are executing. This ensures "memscrub_lock" will be released,
99  * allowing the callbacks to finish.
100  *
101  * The scrubber will read all memory if "read_all_memscrub" is set.
102  * The normal span read will also occur during the wakeup.
103  *
104  * MEMSCRUB_MIN_PAGES (32MB) is the minimum amount of memory a system
105  * must have before we'll start the scrubber.
106  *
107  * MEMSCRUB_DFL_SPAN_PAGES (32MB) is based on the guess that 0.025 sec
108  * is a "good" amount of minimum time for the thread to run at a time.
109  *
110  * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
111  * twice the frequency the hardware folk estimated would be necessary.
112  *
113  * MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI) is based on the assumption
 * that the scrubber should get its fair share of time (since it
115  * is short).  At a priority of 0 the scrubber will be starved.
116  */
117 
118 #include <sys/systm.h>		/* timeout, types, t_lock */
119 #include <sys/cmn_err.h>
120 #include <sys/sysmacros.h>	/* MIN */
121 #include <sys/memlist.h>	/* memlist */
122 #include <sys/mem_config.h>	/* memory add/delete */
123 #include <sys/kmem.h>		/* KMEM_NOSLEEP */
124 #include <sys/cpuvar.h>		/* ncpus_online */
125 #include <sys/debug.h>		/* ASSERTs */
126 #include <sys/machsystm.h>	/* lddphys */
127 #include <sys/cpu_module.h>	/* vtag_flushpage */
128 #include <sys/kstat.h>
129 #include <sys/atomic.h>		/* atomic_add_32 */
130 
131 #include <vm/hat.h>
132 #include <vm/seg_kmem.h>
133 #include <vm/hat_sfmmu.h>	/* XXX FIXME - delete */
134 
135 #include <sys/time.h>
136 #include <sys/callb.h>		/* CPR callback */
137 #include <sys/ontrap.h>
138 
/*
 * Should really have paddr_t defined, but it is broken.  Use
 * ms_paddr_t in the meantime to make the code cleaner.
 * An ms_paddr_t holds a 64-bit physical byte address.
 */
typedef uint64_t ms_paddr_t;

/*
 * Global Routines:
 *
 * memscrub_add_span()/memscrub_delete_span() add or remove a run of
 * `pages' pages starting at page frame `pfn' from the scrubber's private
 * memory list; both return 0 on success and -1 on failure.
 * memscrub_init() builds that list from phys_install and starts the
 * scrubber thread; it returns 0 on success and -1 if the list could not
 * be built.
 * memscrub_induced_error() is defined outside the portion of the file
 * covered here.
 */
int memscrub_add_span(pfn_t pfn, pgcnt_t pages);
int memscrub_delete_span(pfn_t pfn, pgcnt_t pages);
int memscrub_init(void);
void memscrub_induced_error(void);
152 
153 /*
154  * Global Data:
155  */
156 
/*
 * scrub if we have at least this many pages (32MB worth);
 * memscrub_init() does nothing on smaller systems
 */
#define	MEMSCRUB_MIN_PAGES (32 * 1024 * 1024 / PAGESIZE)

/*
 * scan all of physical memory at least once every
 * MEMSCRUB_DFL_PERIOD_SEC seconds (default for memscrub_period_sec)
 */
#define	MEMSCRUB_DFL_PERIOD_SEC	(12 * 60 * 60)	/* 12 hours */

/*
 * scan up to MEMSCRUB_DFL_SPAN_PAGES (32MB worth) each iteration
 * (default for memscrub_span_pages); a span is cut short when it
 * reaches the end of a memlist entry
 */
#define	MEMSCRUB_DFL_SPAN_PAGES	((32 * 1024 * 1024) / PAGESIZE)

/*
 * almost anything is higher priority than scrubbing
 * (default for memscrub_thread_pri)
 */
#define	MEMSCRUB_DFL_THREAD_PRI	MINCLSYSPRI
176 
/*
 * size used when scanning memory
 *
 * Memory is read in MEMSCRUB_BLOCK_SIZE byte blocks; block counts are
 * obtained from byte sizes by shifting right MEMSCRUB_BLOCK_SIZE_SHIFT.
 */
#define	MEMSCRUB_BLOCK_SIZE		256
#define	MEMSCRUB_BLOCK_SIZE_SHIFT	8	/* log2(MEMSCRUB_BLOCK_SIZE) */
#define	MEMSCRUB_BLOCKS_PER_PAGE	(PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)

/*
 * Blocks per page for each MMU page size the scanner maps.
 *
 * The expansions are fully parenthesized: `>>' binds more loosely than
 * the arithmetic operators, so an unparenthesized expansion used as
 * "MEMSCRUB_BPP4M + 1" would silently become a shift by
 * (MEMSCRUB_BLOCK_SIZE_SHIFT + 1).
 */
#define	MEMSCRUB_BPP4M		(MMU_PAGESIZE4M >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP512K	(MMU_PAGESIZE512K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP64K		(MMU_PAGESIZE64K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
#define	MEMSCRUB_BPP		(MMU_PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
188 
/*
 * Warning issued when the scrubber's computed sleep time is zero, i.e.
 * the limitations of the memscrubber have been exceeded (see the block
 * comment at the top of the file for what drives the sleep time to
 * zero).  The text is shared; only the prefix differs.  DEBUG kernels
 * log it on the console and in the messages file, while non-DEBUG
 * kernels prefix it with "!" so it goes to the messages file only.
 */
#define	MEMSCRUB_OVERRIDE_TEXT	"Memory scrubber sleep time is zero " \
	"seconds, consuming entire CPU."

#ifdef DEBUG
#define	MEMSCRUB_OVERRIDE_MSG	MEMSCRUB_OVERRIDE_TEXT
#else
#define	MEMSCRUB_OVERRIDE_MSG	"!" MEMSCRUB_OVERRIDE_TEXT
#endif /* DEBUG */
203 
/*
 * we can patch these defaults in /etc/system if necessary
 */
/* non-zero: scrubber thread exits; also set when the last span is deleted */
uint_t disable_memscrub = 0;
/* non-zero: scrubber keeps waking up but reads no memory */
uint_t pause_memscrub = 0;
/* non-zero: read all memory on the next wakeup; cleared by the scrubber */
uint_t read_all_memscrub = 0;
/* non-zero: print NOTICE messages describing what the scrubber is doing */
uint_t memscrub_verbose = 0;
/* NOTE(review): not referenced in the part of the file reviewed here */
uint_t memscrub_all_idle = 0;
/* pages read per wakeup */
uint_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
/* seconds in which all of memory should be read once */
uint_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
/* priority handed to thread_create() for the scrubber thread */
uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
/* seconds after thread start before the first pass is due */
uint_t memscrub_delay_start_sec = 5 * 60;
/* sleep time (ticks) used when the computed interval is zero */
uint_t memscrub_override_ticks = 1;
217 
/*
 * Static Routines
 */
static void memscrubber(void);		/* scrubber thread body */
static void memscrub_cleanup(void);	/* tear down list, kstat, cv, lock */
static int memscrub_add_span_gen(pfn_t, pgcnt_t, struct memlist **, uint_t *);
static int memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp);
static void memscrub_scan(uint_t blks, ms_paddr_t src);

/*
 * Static Data
 */

/* address-sorted private list of the physical memory to scrub */
static struct memlist *memscrub_memlist;
/* total number of pages currently on memscrub_memlist */
static uint_t memscrub_phys_pages;

/* the scrubber thread sleeps on memscrub_cv between spans */
static kcondvar_t memscrub_cv;
static kmutex_t memscrub_lock;
/*
 * memscrub_lock protects memscrub_memlist, interval_ticks, cprinfo, ...
 */
static void memscrub_init_mem_config(void);
static void memscrub_uninit_mem_config(void);
241 
/*
 * Linked list of memscrub aware spans having retired pages.
 * Currently enabled only on sun4u USIII-based platforms.
 * The list is singly linked through `next'.
 */
typedef struct memscrub_page_retire_span {
	ms_paddr_t				address;   /* span address */
	struct memscrub_page_retire_span	*next;	   /* next, or NULL */
} memscrub_page_retire_span_t;

/* head of the list; NULL means no span is known to hold retired pages */
static memscrub_page_retire_span_t *memscrub_page_retire_span_list = NULL;

/*
 * List maintenance helpers; their definitions lie outside the part of
 * the file covered here.
 */
static void memscrub_page_retire_span_add(ms_paddr_t);
static void memscrub_page_retire_span_delete(ms_paddr_t);
static int memscrub_page_retire_span_search(ms_paddr_t);
static void memscrub_page_retire_span_list_update(void);

/*
 * add_to_page_retire_list: Set by cpu_async_log_err() routine
 * by calling memscrub_induced_error() when CE/UE occurs on a retired
 * page due to memscrub reading.  Cleared by memscrub after updating
 * global page retire span list.  Piggybacking on protection of
 * memscrub_lock, which is held during set and clear.
 * Note: When cpu_async_log_err() calls memscrub_induced_error(), it is running
 * on softint context, which gets fired on a cpu memscrub thread currently
 * running.  Memscrub thread has affinity set during memscrub_read(), hence
 * migration to new cpu not expected.
 */
static int add_to_page_retire_list = 0;
270 
/*
 * Keep track of some interesting statistics
 *
 * memscrub_init() publishes this structure as the named kstat
 * "memscrub_kstat" (module "unix", instance 0, class "misc").  All
 * counters are 32-bit unsigned values.
 */
static struct memscrub_kstats {
	kstat_named_t	done_early;	/* ahead of schedule */
	kstat_named_t	early_sec;	/* by cumulative num secs */
	kstat_named_t	done_late;	/* behind schedule */
	kstat_named_t	late_sec;	/* by cumulative num secs */
	kstat_named_t	interval_ticks;	/* num ticks between intervals */
	kstat_named_t	force_run;	/* forced to run, non-timeout */
	kstat_named_t	errors_found;	/* num errors found by memscrub */
} memscrub_counts = {
	{ "done_early",		KSTAT_DATA_UINT32 },
	{ "early_sec", 		KSTAT_DATA_UINT32 },
	{ "done_late", 		KSTAT_DATA_UINT32 },
	{ "late_sec",		KSTAT_DATA_UINT32 },
	{ "interval_ticks",	KSTAT_DATA_UINT32 },
	{ "force_run",		KSTAT_DATA_UINT32 },
	{ "errors_found",	KSTAT_DATA_UINT32 },
};
/* kstat handle; stays NULL if kstat_create() failed in memscrub_init() */
static struct kstat *memscrub_ksp = (struct kstat *)NULL;

/*
 * Non-zero only while the scrubber thread is asleep with a wakeup
 * timeout armed; memscrub_run() uses it to cancel that timeout.
 */
static timeout_id_t memscrub_tid = 0;	/* keep track of timeout id */
294 
295 /*
296  * create memscrub_memlist from phys_install list
297  * initialize locks, set memscrub_phys_pages.
298  */
299 int
300 memscrub_init(void)
301 {
302 	struct memlist *src;
303 
304 	/*
305 	 * only startup the scrubber if we have a minimum
306 	 * number of pages
307 	 */
308 	if (physinstalled >= MEMSCRUB_MIN_PAGES) {
309 
310 		/*
311 		 * initialize locks
312 		 */
313 		mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
314 		cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);
315 
316 		/*
317 		 * copy phys_install to memscrub_memlist
318 		 */
319 		for (src = phys_install; src; src = src->ml_next) {
320 			if (memscrub_add_span(
321 			    (pfn_t)(src->ml_address >> PAGESHIFT),
322 			    (pgcnt_t)(src->ml_size >> PAGESHIFT))) {
323 				memscrub_cleanup();
324 				return (-1);
325 			}
326 		}
327 
328 		/*
329 		 * initialize kstats
330 		 */
331 		memscrub_ksp = kstat_create("unix", 0, "memscrub_kstat",
332 		    "misc", KSTAT_TYPE_NAMED,
333 		    sizeof (memscrub_counts) / sizeof (kstat_named_t),
334 		    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
335 
336 		if (memscrub_ksp) {
337 			memscrub_ksp->ks_data = (void *)&memscrub_counts;
338 			kstat_install(memscrub_ksp);
339 		} else {
340 			cmn_err(CE_NOTE, "Memscrubber cannot create kstats\n");
341 		}
342 
343 		/*
344 		 * create memscrubber thread
345 		 */
346 		(void) thread_create(NULL, 0, (void (*)())memscrubber,
347 		    NULL, 0, &p0, TS_RUN, memscrub_thread_pri);
348 
349 		/*
350 		 * We don't want call backs changing the list
351 		 * if there is no thread running. We do not
352 		 * attempt to deal with stopping/starting scrubbing
353 		 * on memory size changes.
354 		 */
355 		memscrub_init_mem_config();
356 	}
357 
358 	return (0);
359 }
360 
361 static void
362 memscrub_cleanup(void)
363 {
364 	memscrub_uninit_mem_config();
365 	while (memscrub_memlist) {
366 		(void) memscrub_delete_span(
367 		    (pfn_t)(memscrub_memlist->ml_address >> PAGESHIFT),
368 		    (pgcnt_t)(memscrub_memlist->ml_size >> PAGESHIFT));
369 	}
370 	if (memscrub_ksp)
371 		kstat_delete(memscrub_ksp);
372 	cv_destroy(&memscrub_cv);
373 	mutex_destroy(&memscrub_lock);
374 }
375 
#ifdef MEMSCRUB_DEBUG
/*
 * Debug aid: print `title' followed by the address and size of every
 * entry on the memlist `listp'.
 */
static void
memscrub_printmemlist(char *title, struct memlist *listp)
{
	struct memlist *ml = listp;

	cmn_err(CE_CONT, "%s:\n", title);

	while (ml != NULL) {
		cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
		    ml->ml_address, ml->ml_size);
		ml = ml->ml_next;
	}
}
#endif /* MEMSCRUB_DEBUG */
390 
/*
 * Wake the scrubber thread sleeping on memscrub_cv.  Used as the
 * timeout() handler armed by memscrubber() and called directly by
 * memscrub_run().  The argument is unused.
 */
/* ARGSUSED */
static void
memscrub_wakeup(void *c)
{
	/*
	 * grab mutex to guarantee that our wakeup call
	 * arrives after we go to sleep -- so we can't sleep forever.
	 */
	mutex_enter(&memscrub_lock);
	cv_signal(&memscrub_cv);
	mutex_exit(&memscrub_lock);
}
403 
404 /*
405  * provide an interface external to the memscrubber
406  * which will force the memscrub thread to run vs.
407  * waiting for the timeout, if one is set
408  */
409 void
410 memscrub_run(void)
411 {
412 	memscrub_counts.force_run.value.ui32++;
413 	if (memscrub_tid) {
414 		(void) untimeout(memscrub_tid);
415 		memscrub_wakeup((void *)NULL);
416 	}
417 }
418 
419 /*
420  * this calculation doesn't account for the time
421  * that the actual scan consumes -- so we'd fall
422  * slightly behind schedule with this interval.
423  * It's very small.
424  */
425 
426 static uint_t
427 compute_interval_ticks(void)
428 {
429 	/*
430 	 * We use msp_safe mpp_safe below to insure somebody
431 	 * doesn't set memscrub_span_pages or memscrub_phys_pages
432 	 * to 0 on us.
433 	 */
434 	static uint_t msp_safe, mpp_safe;
435 	static uint_t interval_ticks, period_ticks;
436 	msp_safe = memscrub_span_pages;
437 	mpp_safe = memscrub_phys_pages;
438 
439 	period_ticks = memscrub_period_sec * hz;
440 	interval_ticks = period_ticks;
441 
442 	ASSERT(mutex_owned(&memscrub_lock));
443 
444 	if ((msp_safe != 0) && (mpp_safe != 0)) {
445 		if (memscrub_phys_pages <= msp_safe) {
446 			interval_ticks = period_ticks;
447 		} else {
448 			interval_ticks = (period_ticks /
449 			    (mpp_safe / msp_safe));
450 		}
451 	}
452 	return (interval_ticks);
453 }
454 
/*
 * Body of the scrubber thread created by memscrub_init().
 *
 * Each pass of the main loop computes a sleep interval, sleeps on
 * memscrub_cv until the timeout (or memscrub_run()) wakes it, and then,
 * unless pause_memscrub is set, reads one span of memscrub_span_pages
 * pages starting at `address'.  When a span reaches the end of the
 * memory list, the next pass compares the current time with `deadline'
 * to account the pass as early or late and schedules the next one.
 *
 * memscrub_lock is acquired before the loop and passed to cv_wait()
 * for the sleep.  The loop ends when disable_memscrub is set or when
 * memscrub_phys_pages is found to be zero; the thread then cleans up
 * and exits.
 */
void
memscrubber(void)
{
	ms_paddr_t address, addr;	/* regular / read-all scan cursors */
	time_t deadline;		/* when the current pass is due */
	pgcnt_t pages;
	uint_t reached_end = 1;		/* 1: treat first loop as a new pass */
	uint_t paused_message = 0;	/* "paused" already reported */
	uint_t interval_ticks = 0;
	uint_t sleep_warn_printed = 0;	/* zero-sleep warning already issued */
	callb_cpr_t cprinfo;

	/*
	 * notify CPR of our existence
	 */
	CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");

	mutex_enter(&memscrub_lock);

	if (memscrub_memlist == NULL) {
		cmn_err(CE_WARN, "memscrub_memlist not initialized.");
		goto memscrub_exit;
	}

	address = memscrub_memlist->ml_address;

	/*
	 * The first pass is not due until memscrub_delay_start_sec from
	 * now; with reached_end preset to 1 the first loop iteration
	 * sleeps until then.
	 */
	deadline = gethrestime_sec() + memscrub_delay_start_sec;

	for (;;) {
		if (disable_memscrub)
			break;

		/*
		 * compute interval_ticks
		 */
		interval_ticks = compute_interval_ticks();

		/*
		 * If the calculated sleep time is zero, and pause_memscrub
		 * has been set, make sure we sleep so that another thread
		 * can acquire memscrub_lock.
		 */
		if (interval_ticks == 0 && pause_memscrub) {
			interval_ticks = hz;
		}

		/*
		 * And as a fail safe, under normal non-paused operation, do
		 * not allow the sleep time to be zero.
		 */
		if (interval_ticks == 0) {
			interval_ticks = memscrub_override_ticks;
			if (!sleep_warn_printed) {
				cmn_err(CE_NOTE, MEMSCRUB_OVERRIDE_MSG);
				sleep_warn_printed = 1;
			}
		}

		memscrub_counts.interval_ticks.value.ui32 = interval_ticks;

		/*
		 * Did we just reach the end of memory? If we are at the
		 * end of memory, delay end of memory processing until
		 * pause_memscrub is not set.
		 */
		if (reached_end && !pause_memscrub) {
			time_t now = gethrestime_sec();

			if (now >= deadline) {
				memscrub_counts.done_late.value.ui32++;
				memscrub_counts.late_sec.value.ui32 +=
				    (now - deadline);
				/*
				 * past deadline, start right away
				 */
				interval_ticks = 0;

				deadline = now + memscrub_period_sec;
			} else {
				/*
				 * we finished ahead of schedule.
				 * wait till previous deadline before re-start.
				 */
				interval_ticks = (deadline - now) * hz;
				memscrub_counts.done_early.value.ui32++;
				memscrub_counts.early_sec.value.ui32 +=
				    (deadline - now);
				deadline += memscrub_period_sec;
			}
			reached_end = 0;
			/* allow the zero-sleep warning once more per pass */
			sleep_warn_printed = 0;
		}

		/*
		 * interval_ticks is zero here only when the pass just
		 * finished late; in that case skip the sleep entirely.
		 */
		if (interval_ticks != 0) {
			/*
			 * it is safe from our standpoint for CPR to
			 * suspend the system
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * hit the snooze bar
			 */
			memscrub_tid = timeout(memscrub_wakeup, NULL,
			    interval_ticks);

			/*
			 * go to sleep
			 */
			cv_wait(&memscrub_cv, &memscrub_lock);

			/*
			 * at this point, no timeout should be set
			 */
			memscrub_tid = 0;

			/*
			 * we need to goto work and will be modifying
			 * our internal state and mapping/unmapping
			 * TTEs
			 */
			CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);
		}


		if (memscrub_phys_pages == 0) {
			cmn_err(CE_WARN, "Memory scrubber has 0 pages to read");
			goto memscrub_exit;
		}

		if (!pause_memscrub) {
			if (paused_message) {
				paused_message = 0;
				if (memscrub_verbose)
					cmn_err(CE_NOTE, "Memory scrubber "
					    "resuming");
			}

			if (read_all_memscrub) {
				if (memscrub_verbose)
					cmn_err(CE_NOTE, "Memory scrubber "
					    "reading all memory per request");

				/*
				 * Walk the whole list from its start with
				 * a separate cursor so the regular span
				 * position in `address' is not disturbed.
				 * Asking for memscrub_phys_pages each time
				 * lets memscrub_verify_span() trim the
				 * request to one memlist entry.
				 */
				addr = memscrub_memlist->ml_address;
				reached_end = 0;
				while (!reached_end) {
					if (disable_memscrub)
						break;
					pages = memscrub_phys_pages;
					reached_end = memscrub_verify_span(
					    &addr, &pages);
					memscrub_scan(pages *
					    MEMSCRUB_BLOCKS_PER_PAGE, addr);
					addr += ((uint64_t)pages * PAGESIZE);
				}
				/* one-shot request: clear it once served */
				read_all_memscrub = 0;
			}

			/*
			 * read 1 span
			 */
			pages = memscrub_span_pages;

			if (disable_memscrub)
				break;

			/*
			 * determine physical address range
			 */
			reached_end = memscrub_verify_span(&address,
			    &pages);

			memscrub_scan(pages * MEMSCRUB_BLOCKS_PER_PAGE,
			    address);

			/* next span starts right after the one just read */
			address += ((uint64_t)pages * PAGESIZE);
		}

		if (pause_memscrub && !paused_message) {
			paused_message = 1;
			if (memscrub_verbose)
				cmn_err(CE_NOTE, "Memory scrubber paused");
		}
	}

memscrub_exit:
	cmn_err(CE_NOTE, "Memory scrubber exiting");
	/*
	 * NOTE(review): memscrub_cleanup() re-enters memscrub_lock through
	 * memscrub_delete_span(), so this sequence relies on
	 * CALLB_CPR_EXIT() dropping the lock -- confirm against
	 * <sys/callb.h>.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	memscrub_cleanup();
	thread_exit();
	/* NOTREACHED */
}
647 
648 /*
649  * condition address and size
650  * such that they span legal physical addresses.
651  *
652  * when appropriate, address will be rounded up to start of next
653  * struct memlist, and pages will be rounded down to the end of the
654  * memlist size.
655  *
656  * returns 1 if reached end of list, else returns 0.
657  */
658 static int
659 memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp)
660 {
661 	struct memlist *mlp;
662 	ms_paddr_t address = *addrp;
663 	uint64_t bytes = (uint64_t)*pagesp * PAGESIZE;
664 	uint64_t bytes_remaining;
665 	int reached_end = 0;
666 
667 	ASSERT(mutex_owned(&memscrub_lock));
668 
669 	/*
670 	 * find memlist struct that contains addrp
671 	 * assumes memlist is sorted by ascending address.
672 	 */
673 	for (mlp = memscrub_memlist; mlp != NULL; mlp = mlp->ml_next) {
674 		/*
675 		 * if before this chunk, round up to beginning
676 		 */
677 		if (address < mlp->ml_address) {
678 			address = mlp->ml_address;
679 			break;
680 		}
681 		/*
682 		 * if before end of chunk, then we found it
683 		 */
684 		if (address < (mlp->ml_address + mlp->ml_size))
685 			break;
686 
687 		/* else go to next struct memlist */
688 	}
689 	/*
690 	 * if we hit end of list, start at beginning
691 	 */
692 	if (mlp == NULL) {
693 		mlp = memscrub_memlist;
694 		address = mlp->ml_address;
695 	}
696 
697 	/*
698 	 * now we have legal address, and its mlp, condition bytes
699 	 */
700 	bytes_remaining = (mlp->ml_address + mlp->ml_size) - address;
701 
702 	if (bytes > bytes_remaining)
703 		bytes = bytes_remaining;
704 
705 	/*
706 	 * will this span take us to end of list?
707 	 */
708 	if ((mlp->ml_next == NULL) &&
709 	    ((mlp->ml_address + mlp->ml_size) == (address + bytes)))
710 		reached_end = 1;
711 
712 	/* return values */
713 	*addrp = address;
714 	*pagesp = bytes / PAGESIZE;
715 
716 	return (reached_end);
717 }
718 
719 /*
720  * add a span to the memscrub list
721  * add to memscrub_phys_pages
722  */
723 int
724 memscrub_add_span(pfn_t pfn, pgcnt_t pages)
725 {
726 #ifdef MEMSCRUB_DEBUG
727 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
728 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
729 #endif /* MEMSCRUB_DEBUG */
730 
731 	int retval;
732 
733 	mutex_enter(&memscrub_lock);
734 
735 #ifdef MEMSCRUB_DEBUG
736 	memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
737 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
738 	cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
739 	    " size: 0x%llx\n", address, bytes);
740 #endif /* MEMSCRUB_DEBUG */
741 
742 	retval = memscrub_add_span_gen(pfn, pages, &memscrub_memlist,
743 	    &memscrub_phys_pages);
744 
745 #ifdef MEMSCRUB_DEBUG
746 	memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
747 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
748 #endif /* MEMSCRUB_DEBUG */
749 
750 	mutex_exit(&memscrub_lock);
751 
752 	return (retval);
753 }
754 
/*
 * Insert the span of `pages' pages starting at page frame `pfn' into
 * the address-sorted, doubly linked memlist *list, merging it with an
 * entry it directly abuts (and joining two entries when it exactly
 * fills the gap between them).
 *
 * On success returns 0 and adds `pages' to *npgs.  Returns -1, leaving
 * *npgs alone, if a memlist cannot be allocated (KM_NOSLEEP) or if the
 * span would overlap an entry already on the list.
 *
 * The function does no locking of its own; memscrub_add_span() calls
 * it with memscrub_lock held.
 */
static int
memscrub_add_span_gen(
	pfn_t pfn,
	pgcnt_t pages,
	struct memlist **list,
	uint_t *npgs)
{
	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
	struct memlist *dst;
	struct memlist *prev, *next;
	int retval = 0;

	/*
	 * allocate a new struct memlist
	 * (done up front; freed again on any path that merges into an
	 * existing entry or fails)
	 */

	dst = (struct memlist *)
	    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);

	if (dst == NULL) {
		retval = -1;
		goto add_done;
	}

	dst->ml_address = address;
	dst->ml_size = bytes;

	/*
	 * first insert
	 */
	if (*list == NULL) {
		dst->ml_prev = NULL;
		dst->ml_next = NULL;
		*list = dst;

		goto add_done;
	}

	/*
	 * insert into sorted list
	 */
	for (prev = NULL, next = *list;
	    next != NULL;
	    prev = next, next = next->ml_next) {
		/*
		 * Skip entries that end strictly below the new span.  A
		 * span starting exactly at an entry's end is not skipped,
		 * so it reaches the "append" case below.
		 */
		if (address > (next->ml_address + next->ml_size))
			continue;

		/*
		 * else insert here
		 */

		/*
		 * prepend to next
		 */
		if ((address + bytes) == next->ml_address) {
			kmem_free(dst, sizeof (struct memlist));

			next->ml_address = address;
			next->ml_size += bytes;

			goto add_done;
		}

		/*
		 * append to next
		 */
		if (address == (next->ml_address + next->ml_size)) {
			/* no new entry is needed on any path from here */
			kmem_free(dst, sizeof (struct memlist));

			if (next->ml_next) {
				/*
				 * don't overlap with next->ml_next
				 */
				if ((address + bytes) >
				    next->ml_next->ml_address) {
					retval = -1;
					goto add_done;
				}
				/*
				 * concatenate next and next->ml_next
				 * (the following entry absorbs both `next'
				 * and the new span; `next' is unlinked and
				 * freed)
				 */
				if ((address + bytes) ==
				    next->ml_next->ml_address) {
					struct memlist *mlp = next->ml_next;

					if (next == *list)
						*list = next->ml_next;

					mlp->ml_address = next->ml_address;
					mlp->ml_size += next->ml_size;
					mlp->ml_size += bytes;

					if (next->ml_prev)
						next->ml_prev->ml_next = mlp;
					mlp->ml_prev = next->ml_prev;

					kmem_free(next,
					    sizeof (struct memlist));
					goto add_done;
				}
			}

			next->ml_size += bytes;

			goto add_done;
		}

		/* don't overlap with next */
		if ((address + bytes) > next->ml_address) {
			retval = -1;
			kmem_free(dst, sizeof (struct memlist));
			goto add_done;
		}

		/*
		 * insert before next
		 */
		dst->ml_prev = prev;
		dst->ml_next = next;
		next->ml_prev = dst;
		if (prev == NULL) {
			*list = dst;
		} else {
			prev->ml_next = dst;
		}
		goto add_done;
	}	/* end for */

	/*
	 * end of list, prev is valid and next is NULL
	 */
	prev->ml_next = dst;
	dst->ml_prev = prev;
	dst->ml_next = NULL;

add_done:

	/* only account for the pages when the span really went in */
	if (retval != -1)
		*npgs += pages;

	return (retval);
}
898 
899 /*
900  * delete a span from the memscrub list
901  * subtract from memscrub_phys_pages
902  */
903 int
904 memscrub_delete_span(pfn_t pfn, pgcnt_t pages)
905 {
906 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
907 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
908 	struct memlist *dst, *next;
909 	int retval = 0;
910 
911 	mutex_enter(&memscrub_lock);
912 
913 #ifdef MEMSCRUB_DEBUG
914 	memscrub_printmemlist("memscrub_memlist Before", memscrub_memlist);
915 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
916 	cmn_err(CE_CONT, "memscrub_delete_span: 0x%llx 0x%llx\n",
917 	    address, bytes);
918 #endif /* MEMSCRUB_DEBUG */
919 
920 	/*
921 	 * find struct memlist containing page
922 	 */
923 	for (next = memscrub_memlist; next != NULL; next = next->ml_next) {
924 		if ((address >= next->ml_address) &&
925 		    (address < next->ml_address + next->ml_size))
926 			break;
927 	}
928 
929 	/*
930 	 * if start address not in list
931 	 */
932 	if (next == NULL) {
933 		retval = -1;
934 		goto delete_done;
935 	}
936 
937 	/*
938 	 * error if size goes off end of this struct memlist
939 	 */
940 	if (address + bytes > next->ml_address + next->ml_size) {
941 		retval = -1;
942 		goto delete_done;
943 	}
944 
945 	/*
946 	 * pages at beginning of struct memlist
947 	 */
948 	if (address == next->ml_address) {
949 		/*
950 		 * if start & size match, delete from list
951 		 */
952 		if (bytes == next->ml_size) {
953 			if (next == memscrub_memlist)
954 				memscrub_memlist = next->ml_next;
955 			if (next->ml_prev != NULL)
956 				next->ml_prev->ml_next = next->ml_next;
957 			if (next->ml_next != NULL)
958 				next->ml_next->ml_prev = next->ml_prev;
959 
960 			kmem_free(next, sizeof (struct memlist));
961 		} else {
962 		/*
963 		 * increment start address by bytes
964 		 */
965 			next->ml_address += bytes;
966 			next->ml_size -= bytes;
967 		}
968 		goto delete_done;
969 	}
970 
971 	/*
972 	 * pages at end of struct memlist
973 	 */
974 	if (address + bytes == next->ml_address + next->ml_size) {
975 		/*
976 		 * decrement size by bytes
977 		 */
978 		next->ml_size -= bytes;
979 		goto delete_done;
980 	}
981 
982 	/*
983 	 * delete a span in the middle of the struct memlist
984 	 */
985 	{
986 		/*
987 		 * create a new struct memlist
988 		 */
989 		dst = (struct memlist *)
990 		    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
991 
992 		if (dst == NULL) {
993 			retval = -1;
994 			goto delete_done;
995 		}
996 
997 		/*
998 		 * existing struct memlist gets address
999 		 * and size up to pfn
1000 		 */
1001 		dst->ml_address = address + bytes;
1002 		dst->ml_size =
1003 		    (next->ml_address + next->ml_size) - dst->ml_address;
1004 		next->ml_size = address - next->ml_address;
1005 
1006 		/*
1007 		 * new struct memlist gets address starting
1008 		 * after pfn, until end
1009 		 */
1010 
1011 		/*
1012 		 * link in new memlist after old
1013 		 */
1014 		dst->ml_next = next->ml_next;
1015 		dst->ml_prev = next;
1016 
1017 		if (next->ml_next != NULL)
1018 			next->ml_next->ml_prev = dst;
1019 		next->ml_next = dst;
1020 	}
1021 
1022 delete_done:
1023 	if (retval != -1) {
1024 		memscrub_phys_pages -= pages;
1025 		if (memscrub_phys_pages == 0)
1026 			disable_memscrub = 1;
1027 	}
1028 
1029 #ifdef MEMSCRUB_DEBUG
1030 	memscrub_printmemlist("memscrub_memlist After", memscrub_memlist);
1031 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
1032 #endif /* MEMSCRUB_DEBUG */
1033 
1034 	mutex_exit(&memscrub_lock);
1035 	return (retval);
1036 }
1037 
1038 static void
1039 memscrub_scan(uint_t blks, ms_paddr_t src)
1040 {
1041 	uint_t 		psz, bpp, pgsread;
1042 	pfn_t		pfn;
1043 	ms_paddr_t	pa;
1044 	caddr_t		va;
1045 	on_trap_data_t	otd;
1046 	int		scan_mmu_pagesize = 0;
1047 	int		retired_pages = 0;
1048 
1049 	extern void memscrub_read(caddr_t src, uint_t blks);
1050 
1051 	ASSERT(mutex_owned(&memscrub_lock));
1052 
1053 	pgsread = 0;
1054 	pa = src;
1055 
1056 	if (memscrub_page_retire_span_list != NULL) {
1057 		if (memscrub_page_retire_span_search(src)) {
1058 			/* retired pages in current span */
1059 			scan_mmu_pagesize = 1;
1060 		}
1061 	}
1062 
1063 #ifdef MEMSCRUB_DEBUG
1064 	cmn_err(CE_NOTE, "scan_mmu_pagesize = %d\n" scan_mmu_pagesize);
1065 #endif /* MEMSCRUB_DEBUG */
1066 
1067 	while (blks != 0) {
1068 		/* Ensure the PA is properly aligned */
1069 		if (((pa & MMU_PAGEMASK4M) == pa) &&
1070 		    (blks >= MEMSCRUB_BPP4M)) {
1071 			psz = MMU_PAGESIZE4M;
1072 			bpp = MEMSCRUB_BPP4M;
1073 		} else if (((pa & MMU_PAGEMASK512K) == pa) &&
1074 		    (blks >= MEMSCRUB_BPP512K)) {
1075 			psz = MMU_PAGESIZE512K;
1076 			bpp = MEMSCRUB_BPP512K;
1077 		} else if (((pa & MMU_PAGEMASK64K) == pa) &&
1078 		    (blks >= MEMSCRUB_BPP64K)) {
1079 			psz = MMU_PAGESIZE64K;
1080 			bpp = MEMSCRUB_BPP64K;
1081 		} else if ((pa & MMU_PAGEMASK) == pa) {
1082 			psz = MMU_PAGESIZE;
1083 			bpp = MEMSCRUB_BPP;
1084 		} else {
1085 			if (memscrub_verbose) {
1086 				cmn_err(CE_NOTE, "Memory scrubber ignoring "
1087 				    "non-page aligned block starting at 0x%"
1088 				    PRIx64, src);
1089 			}
1090 			return;
1091 		}
1092 		if (blks < bpp) bpp = blks;
1093 
1094 #ifdef MEMSCRUB_DEBUG
1095 		cmn_err(CE_NOTE, "Going to run psz=%x, "
1096 		    "bpp=%x pa=%llx\n", psz, bpp, pa);
1097 #endif /* MEMSCRUB_DEBUG */
1098 
1099 		/*
1100 		 * MEMSCRUBBASE is a 4MB aligned page in the
1101 		 * kernel so that we can quickly map the PA
1102 		 * to a VA for the block loads performed in
1103 		 * memscrub_read.
1104 		 */
1105 		pfn = mmu_btop(pa);
1106 		va = (caddr_t)MEMSCRUBBASE;
1107 		hat_devload(kas.a_hat, va, psz, pfn, PROT_READ,
1108 		    HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
1109 
1110 		/*
1111 		 * Can't allow the memscrubber to migrate across CPUs as
1112 		 * we need to know whether CEEN is enabled for the current
1113 		 * CPU to enable us to scrub the memory. Don't use
1114 		 * kpreempt_disable as the time we take to scan a span (even
1115 		 * without cpu_check_ce having to manually cpu_check_block)
1116 		 * is too long to hold a higher priority thread (eg, RT)
1117 		 * off cpu.
1118 		 */
1119 		thread_affinity_set(curthread, CPU_CURRENT);
1120 
1121 		/*
1122 		 * Protect read scrub from async faults.  For now, we simply
1123 		 * maintain a count of such faults caught.
1124 		 */
1125 
1126 		if (!scan_mmu_pagesize && !on_trap(&otd, OT_DATA_EC)) {
1127 			memscrub_read(va, bpp);
1128 			/*
1129 			 * Check if CEs require logging
1130 			 */
1131 			cpu_check_ce(SCRUBBER_CEEN_CHECK,
1132 			    (uint64_t)pa, va, psz);
1133 			no_trap();
1134 			thread_affinity_clear(curthread);
1135 		} else {
1136 			no_trap();
1137 			thread_affinity_clear(curthread);
1138 
1139 			/*
1140 			 * Got an async error..
1141 			 * Try rescanning it at MMU_PAGESIZE
1142 			 * granularity if we were trying to
1143 			 * read at a larger page size.
1144 			 * This is to ensure we continue to
1145 			 * scan the rest of the span.
1146 			 * OR scanning MMU_PAGESIZE granularity to avoid
1147 			 * reading retired pages memory when scan_mmu_pagesize
1148 			 * is set.
1149 			 */
1150 			if (psz > MMU_PAGESIZE || scan_mmu_pagesize) {
1151 			    caddr_t vaddr = va;
1152 			    ms_paddr_t paddr = pa;
1153 			    int tmp = 0;
1154 			    for (; tmp < bpp; tmp += MEMSCRUB_BPP) {
1155 				/* Don't scrub retired pages */
1156 				if (page_retire_check(paddr, NULL) == 0) {
1157 					vaddr += MMU_PAGESIZE;
1158 					paddr += MMU_PAGESIZE;
1159 					retired_pages++;
1160 					continue;
1161 				}
1162 				thread_affinity_set(curthread, CPU_CURRENT);
1163 				if (!on_trap(&otd, OT_DATA_EC)) {
1164 				    memscrub_read(vaddr, MEMSCRUB_BPP);
1165 				    cpu_check_ce(SCRUBBER_CEEN_CHECK,
1166 					(uint64_t)paddr, vaddr, MMU_PAGESIZE);
1167 				    no_trap();
1168 				} else {
1169 				    no_trap();
1170 				    memscrub_counts.errors_found.value.ui32++;
1171 				}
1172 				thread_affinity_clear(curthread);
1173 				vaddr += MMU_PAGESIZE;
1174 				paddr += MMU_PAGESIZE;
1175 			    }
1176 			}
1177 		}
1178 		hat_unload(kas.a_hat, va, psz, HAT_UNLOAD_UNLOCK);
1179 
1180 		blks -= bpp;
1181 		pa += psz;
1182 		pgsread++;
1183 	}
1184 
1185 	/*
1186 	 * If just finished scrubbing MMU_PAGESIZE at a time, but no retired
1187 	 * pages found so delete span from global list.
1188 	 */
1189 	if (scan_mmu_pagesize && retired_pages == 0)
1190 		memscrub_page_retire_span_delete(src);
1191 
1192 	/*
1193 	 * Encountered CE/UE on a retired page during memscrub read of current
1194 	 * span.  Adding span to global list to enable avoid reading further.
1195 	 */
1196 	if (add_to_page_retire_list) {
1197 		if (!memscrub_page_retire_span_search(src))
1198 			memscrub_page_retire_span_add(src);
1199 		add_to_page_retire_list = 0;
1200 	}
1201 
1202 	if (memscrub_verbose) {
1203 		cmn_err(CE_NOTE, "Memory scrubber read 0x%x pages starting "
1204 		    "at 0x%" PRIx64, pgsread, src);
1205 	}
1206 }
1207 
1208 /*
1209  * Called by cpu_async_log_err() when memscrub read causes
1210  * CE/UE on a retired page.
1211  */
1212 void
1213 memscrub_induced_error(void)
1214 {
1215 	add_to_page_retire_list = 1;
1216 }
1217 
1218 
1219 /*
1220  * Called by memscrub_scan().
1221  * pa: physical address of span with CE/UE, add to global list.
1222  */
1223 static void
1224 memscrub_page_retire_span_add(ms_paddr_t pa)
1225 {
1226 	memscrub_page_retire_span_t *new_span;
1227 
1228 	new_span = (memscrub_page_retire_span_t *)
1229 	    kmem_zalloc(sizeof (memscrub_page_retire_span_t), KM_NOSLEEP);
1230 
1231 	if (new_span == NULL) {
1232 #ifdef MEMSCRUB_DEBUG
1233 		cmn_err(CE_NOTE, "failed to allocate new span - span with"
1234 		    " retired page/s not tracked.\n");
1235 #endif /* MEMSCRUB_DEBUG */
1236 		return;
1237 	}
1238 
1239 	new_span->address = pa;
1240 	new_span->next = memscrub_page_retire_span_list;
1241 	memscrub_page_retire_span_list = new_span;
1242 }
1243 
1244 /*
1245  * Called by memscrub_scan().
1246  * pa: physical address of span to be removed from global list.
1247  */
1248 static void
1249 memscrub_page_retire_span_delete(ms_paddr_t pa)
1250 {
1251 	memscrub_page_retire_span_t *prev_span, *next_span;
1252 
1253 	prev_span = memscrub_page_retire_span_list;
1254 	next_span = memscrub_page_retire_span_list->next;
1255 
1256 	if (pa == prev_span->address) {
1257 		memscrub_page_retire_span_list = next_span;
1258 		kmem_free(prev_span, sizeof (memscrub_page_retire_span_t));
1259 		return;
1260 	}
1261 
1262 	while (next_span) {
1263 		if (pa == next_span->address) {
1264 			prev_span->next = next_span->next;
1265 			kmem_free(next_span,
1266 			    sizeof (memscrub_page_retire_span_t));
1267 			return;
1268 		}
1269 		prev_span = next_span;
1270 		next_span = next_span->next;
1271 	}
1272 }
1273 
1274 /*
1275  * Called by memscrub_scan().
1276  * pa: physical address of span to be searched in global list.
1277  */
1278 static int
1279 memscrub_page_retire_span_search(ms_paddr_t pa)
1280 {
1281 	memscrub_page_retire_span_t *next_span = memscrub_page_retire_span_list;
1282 
1283 	while (next_span) {
1284 		if (pa == next_span->address)
1285 			return (1);
1286 		next_span = next_span->next;
1287 	}
1288 	return (0);
1289 }
1290 
1291 /*
1292  * Called from new_memscrub() as a result of memory delete.
1293  * Using page_numtopp_nolock() to determine if we have valid PA.
1294  */
1295 static void
1296 memscrub_page_retire_span_list_update(void)
1297 {
1298 	memscrub_page_retire_span_t *prev, *cur, *next;
1299 
1300 	if (memscrub_page_retire_span_list == NULL)
1301 		return;
1302 
1303 	prev = cur = memscrub_page_retire_span_list;
1304 	next = cur->next;
1305 
1306 	while (cur) {
1307 		if (page_numtopp_nolock(mmu_btop(cur->address)) == NULL) {
1308 			if (cur == memscrub_page_retire_span_list) {
1309 				memscrub_page_retire_span_list = next;
1310 				kmem_free(cur,
1311 				    sizeof (memscrub_page_retire_span_t));
1312 				prev = cur = memscrub_page_retire_span_list;
1313 			} else {
1314 				prev->next = cur->next;
1315 				kmem_free(cur,
1316 				    sizeof (memscrub_page_retire_span_t));
1317 				cur = next;
1318 			}
1319 		} else {
1320 			prev = cur;
1321 			cur = next;
1322 		}
1323 		if (cur != NULL)
1324 			next = cur->next;
1325 	}
1326 }
1327 
1328 /*
1329  * The memory add/delete callback mechanism does not pass in the
1330  * page ranges. The phys_install list has been updated though, so
1331  * create a new scrub list from it.
1332  */
1333 
1334 static int
1335 new_memscrub(int update_page_retire_list)
1336 {
1337 	struct memlist *src, *list, *old_list;
1338 	uint_t npgs;
1339 
1340 	/*
1341 	 * copy phys_install to memscrub_memlist
1342 	 */
1343 	list = NULL;
1344 	npgs = 0;
1345 	memlist_read_lock();
1346 	for (src = phys_install; src; src = src->ml_next) {
1347 		if (memscrub_add_span_gen((pfn_t)(src->ml_address >> PAGESHIFT),
1348 		    (pgcnt_t)(src->ml_size >> PAGESHIFT), &list, &npgs)) {
1349 			memlist_read_unlock();
1350 			while (list) {
1351 				struct memlist *el;
1352 
1353 				el = list;
1354 				list = list->ml_next;
1355 				kmem_free(el, sizeof (struct memlist));
1356 			}
1357 			return (-1);
1358 		}
1359 	}
1360 	memlist_read_unlock();
1361 
1362 	mutex_enter(&memscrub_lock);
1363 	memscrub_phys_pages = npgs;
1364 	old_list = memscrub_memlist;
1365 	memscrub_memlist = list;
1366 
1367 	if (update_page_retire_list)
1368 		memscrub_page_retire_span_list_update();
1369 
1370 	mutex_exit(&memscrub_lock);
1371 
1372 	while (old_list) {
1373 		struct memlist *el;
1374 
1375 		el = old_list;
1376 		old_list = old_list->ml_next;
1377 		kmem_free(el, sizeof (struct memlist));
1378 	}
1379 
1380 	return (0);
1381 }
1382 
1383 /*ARGSUSED*/
1384 static void
1385 memscrub_mem_config_post_add(
1386 	void *arg,
1387 	pgcnt_t delta_pages)
1388 {
1389 	/*
1390 	 * We increment pause_memscrub before entering new_memscrub(). This
1391 	 * will force the memscrubber to sleep, allowing the DR callback
1392 	 * thread to acquire memscrub_lock in new_memscrub(). The use of
1393 	 * atomic_add_32() allows concurrent memory DR operations to use the
1394 	 * callbacks safely.
1395 	 */
1396 	atomic_add_32(&pause_memscrub, 1);
1397 	ASSERT(pause_memscrub != 0);
1398 
1399 	/*
1400 	 * "Don't care" if we are not scrubbing new memory.
1401 	 */
1402 	(void) new_memscrub(0);		/* retain page retire list */
1403 
1404 	/* Restore the pause setting. */
1405 	atomic_add_32(&pause_memscrub, -1);
1406 }
1407 
1408 /*ARGSUSED*/
1409 static int
1410 memscrub_mem_config_pre_del(
1411 	void *arg,
1412 	pgcnt_t delta_pages)
1413 {
1414 	/* Nothing to do. */
1415 	return (0);
1416 }
1417 
1418 /*ARGSUSED*/
1419 static void
1420 memscrub_mem_config_post_del(
1421 	void *arg,
1422 	pgcnt_t delta_pages,
1423 	int cancelled)
1424 {
1425 	/*
1426 	 * We increment pause_memscrub before entering new_memscrub(). This
1427 	 * will force the memscrubber to sleep, allowing the DR callback
1428 	 * thread to acquire memscrub_lock in new_memscrub(). The use of
1429 	 * atomic_add_32() allows concurrent memory DR operations to use the
1430 	 * callbacks safely.
1431 	 */
1432 	atomic_add_32(&pause_memscrub, 1);
1433 	ASSERT(pause_memscrub != 0);
1434 
1435 	/*
1436 	 * Must stop scrubbing deleted memory as it may be disconnected.
1437 	 */
1438 	if (new_memscrub(1)) {	/* update page retire list */
1439 		disable_memscrub = 1;
1440 	}
1441 
1442 	/* Restore the pause setting. */
1443 	atomic_add_32(&pause_memscrub, -1);
1444 }
1445 
1446 static kphysm_setup_vector_t memscrub_mem_config_vec = {
1447 	KPHYSM_SETUP_VECTOR_VERSION,
1448 	memscrub_mem_config_post_add,
1449 	memscrub_mem_config_pre_del,
1450 	memscrub_mem_config_post_del,
1451 };
1452 
1453 static void
1454 memscrub_init_mem_config()
1455 {
1456 	int ret;
1457 
1458 	ret = kphysm_setup_func_register(&memscrub_mem_config_vec,
1459 	    (void *)NULL);
1460 	ASSERT(ret == 0);
1461 }
1462 
1463 static void
1464 memscrub_uninit_mem_config()
1465 {
1466 	/* This call is OK if the register call was not done. */
1467 	kphysm_setup_func_unregister(&memscrub_mem_config_vec, (void *)NULL);
1468 }
1469