xref: /illumos-gate/usr/src/uts/sun4u/os/memscrub.c (revision 814a60b13c0ad90e5d2edfd29a7a84bbf416cc1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * sun4u Memory Scrubbing
31  *
32  * On detection of a correctable memory ECC error, the sun4u kernel
33  * returns the corrected data to the requester and re-writes it
34  * to memory (DRAM).  So if the correctable error was transient,
35  * the error has effectively been cleaned (scrubbed) from memory.
36  *
37  * Scrubbing thus reduces the likelihood that multiple transient errors
38  * will occur in the same memory word, making uncorrectable errors due
39  * to transients less likely.
40  *
41  * Thus is born the desire that every memory location be periodically
42  * accessed.
43  *
44  * This file implements a memory scrubbing thread.  This scrubber
45  * guarantees that all of physical memory is accessed periodically
46  * (memscrub_period_sec -- 12 hours).
47  *
48  * It attempts to do this as unobtrusively as possible.  The thread
49  * schedules itself to wake up at an interval such that if it reads
50  * memscrub_span_pages (8MB) on each wakeup, it will read all of physical
51  * memory in memscrub_period_sec (12 hours).
52  *
53  * The scrubber uses the block load hardware to read memory @ 268MB/s,
54  * so it reads spans of 8MB in 0.03 seconds.  Unlike the original sun4d
55  * scrubber, the sun4u scrubber does not read ahead if the system is idle
56  * because we can read memory very efficiently.
57  *
58  * The scrubber maintains a private copy of the phys_install memory list
59  * to keep track of what memory should be scrubbed.
60  *
61  * The global routines memscrub_add_span() and memscrub_delete_span() are
62  * used to add and delete from this list.  If hotplug memory is later
63  * supported these two routines can be used to notify the scrubber of
64  * memory configuration changes.
65  *
66  * The following parameters can be set via /etc/system
67  *
68  * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (8MB)
69  * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
70  * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI)
71  * memscrub_delay_start_sec = (5 minutes)
72  * memscrub_verbose = (0)
73  * memscrub_override_ticks = (1 tick)
74  * disable_memscrub = (0)
75  * pause_memscrub = (0)
76  * read_all_memscrub = (0)
77  *
78  * The scrubber will print NOTICE messages of what it is doing if
79  * "memscrub_verbose" is set.
80  *
81  * If the scrubber's sleep time calculation drops to zero ticks,
82  * memscrub_override_ticks will be used as the sleep time instead. The
83  * sleep time should only drop to zero on a system with over 32.95
84  * terabytes of memory, or where the default scrubber parameters have
85  * been adjusted. For example, reducing memscrub_span_pages or
86  * memscrub_period_sec causes the sleep time to drop to zero with less
87  * memory. Note that since the sleep time is calculated in clock ticks,
88  * using hires clock ticks allows for more memory before the sleep time
89  * becomes zero.
90  *
91  * The scrubber will exit (or never be started) if it finds the variable
92  * "disable_memscrub" set.
93  *
94  * The scrubber will pause (not read memory) when "pause_memscrub"
95  * is set.  It will check the state of pause_memscrub at each wakeup
96  * period.  The scrubber will not make up for lost time.  If you
97  * pause the scrubber for a prolonged period of time, you can use
98  * the "read_all_memscrub" switch (see below) to catch up. In addition,
99  * pause_memscrub is used internally by the post memory DR callbacks.
100  * It is set for the small period of time during which the callbacks
101  * are executing. This ensures "memscrub_lock" will be released,
102  * allowing the callbacks to finish.
103  *
104  * The scrubber will read all memory if "read_all_memscrub" is set.
105  * The normal span read will also occur during the wakeup.
106  *
107  * MEMSCRUB_MIN_PAGES (32MB) is the minimum amount of memory a system
108  * must have before we'll start the scrubber.
109  *
110  * MEMSCRUB_DFL_SPAN_PAGES (8MB) is based on the guess that 0.03 sec
111  * is a "good" minimum amount of time for the thread to run per wakeup.
112  *
113  * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
114  * twice the frequency the hardware folk estimated would be necessary.
115  *
116  * MEMSCRUB_DFL_THREAD_PRI (MINCLSYSPRI) is based on the assumption
117  * that the scrubber should get its fair share of time (since its
118  * runs are short).  At a priority of 0 the scrubber will be starved.
119  */
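
/*
 * As a purely illustrative example of the tunables above (the values are
 * examples, not recommendations), an /etc/system fragment that halves the
 * scan period and enables progress messages would look like:
 *
 *	set memscrub_period_sec = 21600
 *	set memscrub_verbose = 1
 *
 * Shortening the period (or shrinking memscrub_span_pages) also lowers
 * the amount of memory at which the computed sleep time reaches zero,
 * as described above.
 */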
120 
121 #include <sys/systm.h>		/* timeout, types, t_lock */
122 #include <sys/cmn_err.h>
123 #include <sys/sysmacros.h>	/* MIN */
124 #include <sys/memlist.h>	/* memlist */
125 #include <sys/mem_config.h>	/* memory add/delete */
126 #include <sys/kmem.h>		/* KMEM_NOSLEEP */
127 #include <sys/cpuvar.h>		/* ncpus_online */
128 #include <sys/debug.h>		/* ASSERTs */
129 #include <sys/machsystm.h>	/* lddphys */
130 #include <sys/cpu_module.h>	/* vtag_flushpage */
131 #include <sys/kstat.h>
132 #include <sys/atomic.h>		/* atomic_add_32 */
133 
134 #include <vm/hat.h>
135 #include <vm/seg_kmem.h>
136 #include <vm/hat_sfmmu.h>	/* XXX FIXME - delete */
137 
138 #include <sys/time.h>
139 #include <sys/callb.h>		/* CPR callback */
140 #include <sys/ontrap.h>
141 
142 /*
143  * Should really have paddr_t defined, but it is broken.  Use
144  * ms_paddr_t in the meantime to make the code cleaner.
145  */
146 typedef uint64_t ms_paddr_t;
147 
148 /*
149  * Global Routines:
150  */
151 int memscrub_add_span(pfn_t pfn, pgcnt_t pages);
152 int memscrub_delete_span(pfn_t pfn, pgcnt_t pages);
153 int memscrub_init(void);
154 
155 /*
156  * Global Data:
157  */
158 
159 /*
160  * scrub if we have at least this many pages
161  */
162 #define	MEMSCRUB_MIN_PAGES (32 * 1024 * 1024 / PAGESIZE)
163 
164 /*
165  * scan all of physical memory at least once every MEMSCRUB_DFL_PERIOD_SEC
166  */
167 #define	MEMSCRUB_DFL_PERIOD_SEC	(12 * 60 * 60)	/* 12 hours */
168 
169 /*
170  * scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration
171  */
172 #define	MEMSCRUB_DFL_SPAN_PAGES	((8 * 1024 * 1024) / PAGESIZE)
173 
174 /*
175  * almost anything is higher priority than scrubbing
176  */
177 #define	MEMSCRUB_DFL_THREAD_PRI	MINCLSYSPRI
178 
179 /*
180  * size used when scanning memory
181  */
182 #define	MEMSCRUB_BLOCK_SIZE		256
183 #define	MEMSCRUB_BLOCK_SIZE_SHIFT	8 	/* log2(MEMSCRUB_BLOCK_SIZE) */
184 #define	MEMSCRUB_BLOCKS_PER_PAGE	(PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
185 
186 #define	MEMSCRUB_BPP4M		(MMU_PAGESIZE4M >> MEMSCRUB_BLOCK_SIZE_SHIFT)
187 #define	MEMSCRUB_BPP512K	(MMU_PAGESIZE512K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
188 #define	MEMSCRUB_BPP64K		(MMU_PAGESIZE64K >> MEMSCRUB_BLOCK_SIZE_SHIFT)
189 #define	MEMSCRUB_BPP		(MMU_PAGESIZE >> MEMSCRUB_BLOCK_SIZE_SHIFT)
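
/*
 * For a rough sense of scale, assuming the usual sun4u 8K base page and
 * the 256-byte scrub block above: MEMSCRUB_BPP works out to 32 blocks per
 * page, MEMSCRUB_BPP64K to 256, MEMSCRUB_BPP512K to 2048 and MEMSCRUB_BPP4M
 * to 16384, so one default 8MB span is roughly 32K block reads.
 */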
190 
191 /*
192  * This message indicates that we have exceeded the limitations of
193  * the memscrubber. See the comments above regarding what would
194  * cause the sleep time to become zero. In DEBUG mode, this message
195  * is logged on the console and in the messages file. In non-DEBUG
196  * mode, it is only logged in the messages file.
197  */
198 #ifdef DEBUG
199 #define	MEMSCRUB_OVERRIDE_MSG	"Memory scrubber sleep time is zero " \
200 	"seconds, consuming entire CPU."
201 #else
202 #define	MEMSCRUB_OVERRIDE_MSG	"!Memory scrubber sleep time is zero " \
203 	"seconds, consuming entire CPU."
204 #endif /* DEBUG */
205 
206 /*
207  * we can patch these defaults in /etc/system if necessary
208  */
209 uint_t disable_memscrub = 0;
210 uint_t pause_memscrub = 0;
211 uint_t read_all_memscrub = 0;
212 uint_t memscrub_verbose = 0;
213 uint_t memscrub_all_idle = 0;
214 uint_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
215 uint_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
216 uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
217 uint_t memscrub_delay_start_sec = 5 * 60;
218 uint_t memscrub_override_ticks = 1;
219 
220 /*
221  * Static Routines
222  */
223 static void memscrubber(void);
224 static void memscrub_cleanup(void);
225 static int memscrub_add_span_gen(pfn_t, pgcnt_t, struct memlist **, uint_t *);
226 static int memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp);
227 static void memscrub_scan(uint_t blks, ms_paddr_t src);
228 
229 /*
230  * Static Data
231  */
232 
233 static struct memlist *memscrub_memlist;
234 static uint_t memscrub_phys_pages;
235 
236 static kcondvar_t memscrub_cv;
237 static kmutex_t memscrub_lock;
238 /*
239  * memscrub_lock protects memscrub_memlist, interval_ticks, cprinfo, ...
240  */
241 static void memscrub_init_mem_config(void);
242 static void memscrub_uninit_mem_config(void);
243 
244 /*
245  * Keep track of some interesting statistics
246  */
247 static struct memscrub_kstats {
248 	kstat_named_t	done_early;	/* ahead of schedule */
249 	kstat_named_t	early_sec;	/* by cumulative num secs */
250 	kstat_named_t	done_late;	/* behind schedule */
251 	kstat_named_t	late_sec;	/* by cumulative num secs */
252 	kstat_named_t	interval_ticks;	/* num ticks between intervals */
253 	kstat_named_t	force_run;	/* forced to run, non-timeout */
254 	kstat_named_t	errors_found;	/* num errors found by memscrub */
255 } memscrub_counts = {
256 	{ "done_early",		KSTAT_DATA_UINT32 },
257 	{ "early_sec", 		KSTAT_DATA_UINT32 },
258 	{ "done_late", 		KSTAT_DATA_UINT32 },
259 	{ "late_sec",		KSTAT_DATA_UINT32 },
260 	{ "interval_ticks",	KSTAT_DATA_UINT32 },
261 	{ "force_run",		KSTAT_DATA_UINT32 },
262 	{ "errors_found",	KSTAT_DATA_UINT32 },
263 };
264 static struct kstat *memscrub_ksp = (struct kstat *)NULL;
265 
266 static timeout_id_t memscrub_tid = 0;	/* keep track of timeout id */
267 
268 /*
269  * create memscrub_memlist from phys_install list
270  * initialize locks, set memscrub_phys_pages.
271  */
272 int
273 memscrub_init(void)
274 {
275 	struct memlist *src;
276 
277 	/*
278 	 * only startup the scrubber if we have a minimum
279 	 * number of pages
280 	 */
281 	if (physinstalled >= MEMSCRUB_MIN_PAGES) {
282 
283 		/*
284 		 * initialize locks
285 		 */
286 		mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
287 		cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);
288 
289 		/*
290 		 * copy phys_install to memscrub_memlist
291 		 */
292 		for (src = phys_install; src; src = src->next) {
293 			if (memscrub_add_span(
294 			    (pfn_t)(src->address >> PAGESHIFT),
295 			    (pgcnt_t)(src->size >> PAGESHIFT))) {
296 				memscrub_cleanup();
297 				return (-1);
298 			}
299 		}
300 
301 		/*
302 		 * initialize kstats
303 		 */
304 		memscrub_ksp = kstat_create("unix", 0, "memscrub_kstat",
305 			"misc", KSTAT_TYPE_NAMED,
306 			sizeof (memscrub_counts) / sizeof (kstat_named_t),
307 			KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
308 
309 		if (memscrub_ksp) {
310 			memscrub_ksp->ks_data = (void *)&memscrub_counts;
311 			kstat_install(memscrub_ksp);
312 		} else {
313 			cmn_err(CE_NOTE, "Memscrubber cannot create kstats\n");
314 		}
315 
316 		/*
317 		 * create memscrubber thread
318 		 */
319 		(void) thread_create(NULL, 0, (void (*)())memscrubber,
320 		    NULL, 0, &p0, TS_RUN, memscrub_thread_pri);
321 
322 		/*
323 		 * We don't want call backs changing the list
324 		 * if there is no thread running. We do not
325 		 * attempt to deal with stopping/starting scrubbing
326 		 * on memory size changes.
327 		 */
328 		memscrub_init_mem_config();
329 	}
330 
331 	return (0);
332 }
333 
334 static void
335 memscrub_cleanup(void)
336 {
337 	memscrub_uninit_mem_config();
338 	while (memscrub_memlist) {
339 		(void) memscrub_delete_span(
340 			(pfn_t)(memscrub_memlist->address >> PAGESHIFT),
341 			(pgcnt_t)(memscrub_memlist->size >> PAGESHIFT));
342 	}
343 	if (memscrub_ksp)
344 		kstat_delete(memscrub_ksp);
345 	cv_destroy(&memscrub_cv);
346 	mutex_destroy(&memscrub_lock);
347 }
348 
349 #ifdef MEMSCRUB_DEBUG
350 static void
351 memscrub_printmemlist(char *title, struct memlist *listp)
352 {
353 	struct memlist *list;
354 
355 	cmn_err(CE_CONT, "%s:\n", title);
356 
357 	for (list = listp; list; list = list->next) {
358 		cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
359 		    list->address, list->size);
360 	}
361 }
362 #endif /* MEMSCRUB_DEBUG */
363 
364 /* ARGSUSED */
365 static void
366 memscrub_wakeup(void *c)
367 {
368 	/*
369 	 * grab mutex to guarantee that our wakeup call
370 	 * arrives after we go to sleep -- so we can't sleep forever.
371 	 */
372 	mutex_enter(&memscrub_lock);
373 	cv_signal(&memscrub_cv);
374 	mutex_exit(&memscrub_lock);
375 }
376 
377 /*
378  * provide an interface external to the memscrubber
379  * which will force the memscrub thread to run vs.
380  * waiting for the timeout, if one is set
381  */
382 void
383 memscrub_run(void)
384 {
385 	memscrub_counts.force_run.value.ui32++;
386 	if (memscrub_tid) {
387 		(void) untimeout(memscrub_tid);
388 		memscrub_wakeup((void *)NULL);
389 	}
390 }
391 
392 /*
393  * this calculation doesn't account for the time
394  * that the actual scan consumes -- so we'd fall
395  * slightly behind schedule with this interval.
396  * That lag is very small.
397  */
398 
399 static uint_t
400 compute_interval_ticks(void)
401 {
402 	/*
403 	 * We use msp_safe and mpp_safe below to ensure that nobody
404 	 * sets memscrub_span_pages or memscrub_phys_pages
405 	 * to 0 out from under us.
406 	 */
407 	static uint_t msp_safe, mpp_safe;
408 	static uint_t interval_ticks, period_ticks;
409 	msp_safe = memscrub_span_pages;
410 	mpp_safe = memscrub_phys_pages;
411 
412 	period_ticks = memscrub_period_sec * hz;
413 	interval_ticks = period_ticks;
414 
415 	ASSERT(mutex_owned(&memscrub_lock));
416 
417 	if ((msp_safe != 0) && (mpp_safe != 0)) {
418 		if (mpp_safe <= msp_safe) {
419 			interval_ticks = period_ticks;
420 		} else {
421 			interval_ticks = (period_ticks /
422 			    (mpp_safe / msp_safe));
423 		}
424 	}
425 	return (interval_ticks);
426 }
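
/*
 * A worked example of the calculation above, assuming the defaults, hz of
 * 100 and a hypothetical 8GB domain with 8K pages: memscrub_phys_pages is
 * 8GB / 8K = 1,048,576 pages, memscrub_span_pages is 8MB / 8K = 1024 pages
 * and period_ticks is 12 * 60 * 60 * 100 = 4,320,000.  The interval is then
 * 4,320,000 / (1,048,576 / 1024) = 4218 ticks, i.e. the scrubber wakes up
 * roughly every 42 seconds to read one 8MB span.
 */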
427 
428 void
429 memscrubber(void)
430 {
431 	ms_paddr_t address, addr;
432 	time_t deadline;
433 	pgcnt_t pages;
434 	uint_t reached_end = 1;
435 	uint_t paused_message = 0;
436 	uint_t interval_ticks = 0;
437 	uint_t sleep_warn_printed = 0;
438 	callb_cpr_t cprinfo;
439 
440 	/*
441 	 * notify CPR of our existence
442 	 */
443 	CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");
444 
445 	mutex_enter(&memscrub_lock);
446 
447 	if (memscrub_memlist == NULL) {
448 		cmn_err(CE_WARN, "memscrub_memlist not initialized.");
449 		goto memscrub_exit;
450 	}
451 
452 	address = memscrub_memlist->address;
453 
454 	deadline = gethrestime_sec() + memscrub_delay_start_sec;
455 
456 	for (;;) {
457 		if (disable_memscrub)
458 			break;
459 
460 		/*
461 		 * compute interval_ticks
462 		 */
463 		interval_ticks = compute_interval_ticks();
464 
465 		/*
466 		 * If the calculated sleep time is zero, and pause_memscrub
467 		 * has been set, make sure we sleep so that another thread
468 		 * can acquire memscrub_lock.
469 		 */
470 		if (interval_ticks == 0 && pause_memscrub) {
471 			interval_ticks = hz;
472 		}
473 
474 		/*
475 		 * And as a fail safe, under normal non-paused operation, do
476 		 * not allow the sleep time to be zero.
477 		 */
478 		if (interval_ticks == 0) {
479 			interval_ticks = memscrub_override_ticks;
480 			if (!sleep_warn_printed) {
481 				cmn_err(CE_NOTE, MEMSCRUB_OVERRIDE_MSG);
482 				sleep_warn_printed = 1;
483 			}
484 		}
485 
486 		memscrub_counts.interval_ticks.value.ui32 = interval_ticks;
487 
488 		/*
489 		 * Did we just reach the end of memory? If we are at the
490 		 * end of memory, delay end of memory processing until
491 		 * pause_memscrub is not set.
492 		 */
493 		if (reached_end && !pause_memscrub) {
494 			time_t now = gethrestime_sec();
495 
496 			if (now >= deadline) {
497 				memscrub_counts.done_late.value.ui32++;
498 				memscrub_counts.late_sec.value.ui32 +=
499 					(now - deadline);
500 				/*
501 				 * past deadline, start right away
502 				 */
503 				interval_ticks = 0;
504 
505 				deadline = now + memscrub_period_sec;
506 			} else {
507 				/*
508 				 * we finished ahead of schedule.
509 				 * wait till previous deadline before re-start.
510 				 */
511 				interval_ticks = (deadline - now) * hz;
512 				memscrub_counts.done_early.value.ui32++;
513 				memscrub_counts.early_sec.value.ui32 +=
514 					(deadline - now);
515 				deadline += memscrub_period_sec;
516 			}
517 			reached_end = 0;
518 			sleep_warn_printed = 0;
519 		}
520 
521 		if (interval_ticks != 0) {
522 			/*
523 			 * it is safe from our standpoint for CPR to
524 			 * suspend the system
525 			 */
526 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
527 
528 			/*
529 			 * hit the snooze bar
530 			 */
531 			memscrub_tid = timeout(memscrub_wakeup, NULL,
532 			    interval_ticks);
533 
534 			/*
535 			 * go to sleep
536 			 */
537 			cv_wait(&memscrub_cv, &memscrub_lock);
538 
539 			/*
540 			 * at this point, no timeout should be set
541 			 */
542 			memscrub_tid = 0;
543 
544 			/*
545 			 * we need to go to work and will be modifying
546 			 * our internal state and mapping/unmapping
547 			 * TTEs
548 			 */
549 			CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);
550 		}
551 
552 
553 		if (memscrub_phys_pages == 0) {
554 			cmn_err(CE_WARN, "Memory scrubber has 0 pages to read");
555 			goto memscrub_exit;
556 		}
557 
558 		if (!pause_memscrub) {
559 			if (paused_message) {
560 				paused_message = 0;
561 				if (memscrub_verbose)
562 					cmn_err(CE_NOTE, "Memory scrubber "
563 					    "resuming");
564 			}
565 
566 			if (read_all_memscrub) {
567 				if (memscrub_verbose)
568 					cmn_err(CE_NOTE, "Memory scrubber "
569 					    "reading all memory per request");
570 
571 				addr = memscrub_memlist->address;
572 				reached_end = 0;
573 				while (!reached_end) {
574 					if (disable_memscrub)
575 						break;
576 					pages = memscrub_phys_pages;
577 					reached_end = memscrub_verify_span(
578 					    &addr, &pages);
579 					memscrub_scan(pages *
580 					    MEMSCRUB_BLOCKS_PER_PAGE, addr);
581 					addr += ((uint64_t)pages * PAGESIZE);
582 				}
583 				read_all_memscrub = 0;
584 			}
585 
586 			/*
587 			 * read 1 span
588 			 */
589 			pages = memscrub_span_pages;
590 
591 			if (disable_memscrub)
592 				break;
593 
594 			/*
595 			 * determine physical address range
596 			 */
597 			reached_end = memscrub_verify_span(&address,
598 			    &pages);
599 
600 			memscrub_scan(pages * MEMSCRUB_BLOCKS_PER_PAGE,
601 			    address);
602 
603 			address += ((uint64_t)pages * PAGESIZE);
604 		}
605 
606 		if (pause_memscrub && !paused_message) {
607 			paused_message = 1;
608 			if (memscrub_verbose)
609 				cmn_err(CE_NOTE, "Memory scrubber paused");
610 		}
611 	}
612 
613 memscrub_exit:
614 	cmn_err(CE_NOTE, "Memory scrubber exiting");
615 	CALLB_CPR_EXIT(&cprinfo);
616 	memscrub_cleanup();
617 	thread_exit();
618 	/* NOTREACHED */
619 }
620 
621 /*
622  * condition address and size
623  * such that they span legal physical addresses.
624  *
625  * when appropriate, address will be rounded up to the start of the next
626  * struct memlist, and pages will be trimmed so that the span does not
627  * extend past the end of that memlist.
628  *
629  * returns 1 if reached end of list, else returns 0.
630  */
631 static int
632 memscrub_verify_span(ms_paddr_t *addrp, pgcnt_t *pagesp)
633 {
634 	struct memlist *mlp;
635 	ms_paddr_t address = *addrp;
636 	uint64_t bytes = (uint64_t)*pagesp * PAGESIZE;
637 	uint64_t bytes_remaining;
638 	int reached_end = 0;
639 
640 	ASSERT(mutex_owned(&memscrub_lock));
641 
642 	/*
643 	 * find the memlist struct that contains *addrp
644 	 * assumes memlist is sorted by ascending address.
645 	 */
646 	for (mlp = memscrub_memlist; mlp != NULL; mlp = mlp->next) {
647 		/*
648 		 * if before this chunk, round up to beginning
649 		 */
650 		if (address < mlp->address) {
651 			address = mlp->address;
652 			break;
653 		}
654 		/*
655 		 * if before end of chunk, then we found it
656 		 */
657 		if (address < (mlp->address + mlp->size))
658 			break;
659 
660 		/* else go to next struct memlist */
661 	}
662 	/*
663 	 * if we hit end of list, start at beginning
664 	 */
665 	if (mlp == NULL) {
666 		mlp = memscrub_memlist;
667 		address = mlp->address;
668 	}
669 
670 	/*
671 	 * now we have a legal address and its mlp; condition bytes
672 	 */
673 	bytes_remaining = (mlp->address + mlp->size) - address;
674 
675 	if (bytes > bytes_remaining)
676 		bytes = bytes_remaining;
677 
678 	/*
679 	 * will this span take us to end of list?
680 	 */
681 	if ((mlp->next == NULL) &&
682 	    ((mlp->address + mlp->size) == (address + bytes)))
683 		reached_end = 1;
684 
685 	/* return values */
686 	*addrp = address;
687 	*pagesp = bytes / PAGESIZE;
688 
689 	return (reached_end);
690 }
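
/*
 * A small made-up illustration of the conditioning above: with a
 * memscrub_memlist of [0, 64MB) followed by [1GB, 1GB + 64MB), a request
 * for 8MB starting at 62MB is trimmed to 2MB and returns 0; a request
 * starting at 80MB is rounded up to 1GB; and a request whose trimmed span
 * ends exactly at 1GB + 64MB returns 1, after which the caller's next
 * request wraps back around to address 0.
 */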
691 
692 /*
693  * add a span to the memscrub list
694  * add to memscrub_phys_pages
695  */
696 int
697 memscrub_add_span(pfn_t pfn, pgcnt_t pages)
698 {
699 #ifdef MEMSCRUB_DEBUG
700 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
701 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
702 #endif /* MEMSCRUB_DEBUG */
703 
704 	int retval;
705 
706 	mutex_enter(&memscrub_lock);
707 
708 #ifdef MEMSCRUB_DEBUG
709 	memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
710 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
711 	cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
712 	    " size: 0x%llx\n", address, bytes);
713 #endif /* MEMSCRUB_DEBUG */
714 
715 	retval = memscrub_add_span_gen(pfn, pages, &memscrub_memlist,
716 	    &memscrub_phys_pages);
717 
718 #ifdef MEMSCRUB_DEBUG
719 	memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
720 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
721 #endif /* MEMSCRUB_DEBUG */
722 
723 	mutex_exit(&memscrub_lock);
724 
725 	return (retval);
726 }
727 
728 static int
729 memscrub_add_span_gen(
730 	pfn_t pfn,
731 	pgcnt_t pages,
732 	struct memlist **list,
733 	uint_t *npgs)
734 {
735 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
736 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
737 	struct memlist *dst;
738 	struct memlist *prev, *next;
739 	int retval = 0;
740 
741 	/*
742 	 * allocate a new struct memlist
743 	 */
744 
745 	dst = (struct memlist *)
746 	    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
747 
748 	if (dst == NULL) {
749 		retval = -1;
750 		goto add_done;
751 	}
752 
753 	dst->address = address;
754 	dst->size = bytes;
755 
756 	/*
757 	 * first insert
758 	 */
759 	if (*list == NULL) {
760 		dst->prev = NULL;
761 		dst->next = NULL;
762 		*list = dst;
763 
764 		goto add_done;
765 	}
766 
767 	/*
768 	 * insert into sorted list
769 	 */
770 	for (prev = NULL, next = *list;
771 	    next != NULL;
772 	    prev = next, next = next->next) {
773 		if (address > (next->address + next->size))
774 			continue;
775 
776 		/*
777 		 * else insert here
778 		 */
779 
780 		/*
781 		 * prepend to next
782 		 */
783 		if ((address + bytes) == next->address) {
784 			kmem_free(dst, sizeof (struct memlist));
785 
786 			next->address = address;
787 			next->size += bytes;
788 
789 			goto add_done;
790 		}
791 
792 		/*
793 		 * append to next
794 		 */
795 		if (address == (next->address + next->size)) {
796 			kmem_free(dst, sizeof (struct memlist));
797 
798 			if (next->next) {
799 				/*
800 				 * don't overlap with next->next
801 				 */
802 				if ((address + bytes) > next->next->address) {
803 					retval = -1;
804 					goto add_done;
805 				}
806 				/*
807 				 * concatenate next and next->next
808 				 */
809 				if ((address + bytes) == next->next->address) {
810 					struct memlist *mlp = next->next;
811 
812 					if (next == *list)
813 						*list = next->next;
814 
815 					mlp->address = next->address;
816 					mlp->size += next->size;
817 					mlp->size += bytes;
818 
819 					if (next->prev)
820 						next->prev->next = mlp;
821 					mlp->prev = next->prev;
822 
823 					kmem_free(next,
824 						sizeof (struct memlist));
825 					goto add_done;
826 				}
827 			}
828 
829 			next->size += bytes;
830 
831 			goto add_done;
832 		}
833 
834 		/* don't overlap with next */
835 		if ((address + bytes) > next->address) {
836 			retval = -1;
837 			kmem_free(dst, sizeof (struct memlist));
838 			goto add_done;
839 		}
840 
841 		/*
842 		 * insert before next
843 		 */
844 		dst->prev = prev;
845 		dst->next = next;
846 		next->prev = dst;
847 		if (prev == NULL) {
848 			*list = dst;
849 		} else {
850 			prev->next = dst;
851 		}
852 		goto add_done;
853 	}	/* end for */
854 
855 	/*
856 	 * end of list, prev is valid and next is NULL
857 	 */
858 	prev->next = dst;
859 	dst->prev = prev;
860 	dst->next = NULL;
861 
862 add_done:
863 
864 	if (retval != -1)
865 		*npgs += pages;
866 
867 	return (retval);
868 }
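
/*
 * To illustrate the coalescing above with made-up spans: adding [8MB, 16MB)
 * to a list holding [0, 8MB) and [16MB, 32MB) appends to the first entry
 * and then concatenates it with the second, leaving the single entry
 * [0, 32MB).  Adding [4MB, 12MB) to the same list would overlap the first
 * entry and return -1, leaving the list and *npgs unchanged.
 */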
869 
870 /*
871  * delete a span from the memscrub list
872  * subtract from memscrub_phys_pages
873  */
874 int
875 memscrub_delete_span(pfn_t pfn, pgcnt_t pages)
876 {
877 	ms_paddr_t address = (ms_paddr_t)pfn << PAGESHIFT;
878 	uint64_t bytes = (uint64_t)pages << PAGESHIFT;
879 	struct memlist *dst, *next;
880 	int retval = 0;
881 
882 	mutex_enter(&memscrub_lock);
883 
884 #ifdef MEMSCRUB_DEBUG
885 	memscrub_printmemlist("memscrub_memlist Before", memscrub_memlist);
886 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
887 	cmn_err(CE_CONT, "memscrub_delete_span: 0x%llx 0x%llx\n",
888 	    address, bytes);
889 #endif /* MEMSCRUB_DEBUG */
890 
891 	/*
892 	 * find struct memlist containing page
893 	 */
894 	for (next = memscrub_memlist; next != NULL; next = next->next) {
895 		if ((address >= next->address) &&
896 		    (address < next->address + next->size))
897 			break;
898 	}
899 
900 	/*
901 	 * if start address not in list
902 	 */
903 	if (next == NULL) {
904 		retval = -1;
905 		goto delete_done;
906 	}
907 
908 	/*
909 	 * error if size goes off end of this struct memlist
910 	 */
911 	if (address + bytes > next->address + next->size) {
912 		retval = -1;
913 		goto delete_done;
914 	}
915 
916 	/*
917 	 * pages at beginning of struct memlist
918 	 */
919 	if (address == next->address) {
920 		/*
921 		 * if start & size match, delete from list
922 		 */
923 		if (bytes == next->size) {
924 			if (next == memscrub_memlist)
925 				memscrub_memlist = next->next;
926 			if (next->prev != NULL)
927 				next->prev->next = next->next;
928 			if (next->next != NULL)
929 				next->next->prev = next->prev;
930 
931 			kmem_free(next, sizeof (struct memlist));
932 		} else {
933 			/*
934 			 * increment start address by bytes
935 			 */
936 			next->address += bytes;
937 			next->size -= bytes;
938 		}
939 		goto delete_done;
940 	}
941 
942 	/*
943 	 * pages at end of struct memlist
944 	 */
945 	if (address + bytes == next->address + next->size) {
946 		/*
947 		 * decrement size by bytes
948 		 */
949 		next->size -= bytes;
950 		goto delete_done;
951 	}
952 
953 	/*
954 	 * delete a span in the middle of the struct memlist
955 	 */
956 	{
957 		/*
958 		 * create a new struct memlist
959 		 */
960 		dst = (struct memlist *)
961 		    kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
962 
963 		if (dst == NULL) {
964 			retval = -1;
965 			goto delete_done;
966 		}
967 
968 		/*
969 		 * the new struct memlist (dst) covers the range
970 		 * starting just after the deleted span
971 		 */
972 		dst->address = address + bytes;
973 		dst->size = (next->address + next->size) - dst->address;
974 
975 		/*
976 		 * the existing struct memlist is trimmed to end
977 		 * at the start of the deleted span
978 		 */
979 		next->size = address - next->address;
980 
981 		/*
982 		 * link in new memlist after old
983 		 */
984 		dst->next = next->next;
985 		dst->prev = next;
986 
987 		if (next->next != NULL)
988 			next->next->prev = dst;
989 		next->next = dst;
990 	}
991 
992 delete_done:
993 	if (retval != -1) {
994 		memscrub_phys_pages -= pages;
995 		if (memscrub_phys_pages == 0)
996 			disable_memscrub = 1;
997 	}
998 
999 #ifdef MEMSCRUB_DEBUG
1000 	memscrub_printmemlist("memscrub_memlist After", memscrub_memlist);
1001 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
1002 #endif /* MEMSCRUB_DEBUG */
1003 
1004 	mutex_exit(&memscrub_lock);
1005 	return (retval);
1006 }
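
/*
 * A made-up example of the middle-split case above: deleting [8MB, 16MB)
 * from a list holding the single entry [0, 32MB) trims that entry to
 * [0, 8MB), allocates a new entry for [16MB, 32MB), links it in after the
 * old one and subtracts the deleted 8MB worth of pages from
 * memscrub_phys_pages.
 */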
1007 
1008 static void
1009 memscrub_scan(uint_t blks, ms_paddr_t src)
1010 {
1011 	uint_t 		psz, bpp, pgsread;
1012 	pfn_t		pfn;
1013 	ms_paddr_t	pa;
1014 	caddr_t		va;
1015 	on_trap_data_t	otd;
1016 
1017 	extern void memscrub_read(caddr_t src, uint_t blks);
1018 
1019 	ASSERT(mutex_owned(&memscrub_lock));
1020 
1021 	pgsread = 0;
1022 	pa = src;
1023 
1024 	while (blks != 0) {
1025 		/* Ensure the PA is properly aligned */
1026 		if (((pa & MMU_PAGEMASK4M) == pa) &&
1027 			(blks >= MEMSCRUB_BPP4M)) {
1028 			psz = MMU_PAGESIZE4M;
1029 			bpp = MEMSCRUB_BPP4M;
1030 		} else if (((pa & MMU_PAGEMASK512K) == pa) &&
1031 			(blks >= MEMSCRUB_BPP512K)) {
1032 			psz = MMU_PAGESIZE512K;
1033 			bpp = MEMSCRUB_BPP512K;
1034 		} else if (((pa & MMU_PAGEMASK64K) == pa) &&
1035 			(blks >= MEMSCRUB_BPP64K)) {
1036 			psz = MMU_PAGESIZE64K;
1037 			bpp = MEMSCRUB_BPP64K;
1038 		} else if ((pa & MMU_PAGEMASK) == pa) {
1039 			psz = MMU_PAGESIZE;
1040 			bpp = MEMSCRUB_BPP;
1041 		} else {
1042 			if (memscrub_verbose) {
1043 				cmn_err(CE_NOTE, "Memory scrubber ignoring "
1044 				    "non-page aligned block starting at 0x%"
1045 				    PRIx64, src);
1046 			}
1047 			return;
1048 		}
1049 		if (blks < bpp) bpp = blks;
1050 
1051 #ifdef MEMSCRUB_DEBUG
1052 		cmn_err(CE_NOTE, "Going to run psz=%x, "
1053 		    "bpp=%x pa=%llx\n", psz, bpp, pa);
1054 #endif /* MEMSCRUB_DEBUG */
1055 
1056 		/*
1057 		 * MEMSCRUBBASE is a 4MB aligned page in the
1058 		 * kernel so that we can quickly map the PA
1059 		 * to a VA for the block loads performed in
1060 		 * memscrub_read.
1061 		 */
1062 		pfn = mmu_btop(pa);
1063 		va = (caddr_t)MEMSCRUBBASE;
1064 		hat_devload(kas.a_hat, va, psz, pfn, PROT_READ,
1065 			HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
1066 
1067 		/*
1068 		 * Can't allow the memscrubber to migrate across CPUs as
1069 		 * we need to know whether CEEN is enabled for the current
1070 		 * CPU to enable us to scrub the memory. Don't use
1071 		 * kpreempt_disable as the time we take to scan a span (even
1072 		 * without cpu_check_ce having to manually cpu_check_block)
1073 		 * is too long to hold a higher priority thread (eg, RT)
1074 		 * off cpu.
1075 		 */
1076 		thread_affinity_set(curthread, CPU_CURRENT);
1077 
1078 		/*
1079 		 * Protect read scrub from async faults.  For now, we simply
1080 		 * maintain a count of such faults caught.
1081 		 */
1082 
1083 		if (!on_trap(&otd, OT_DATA_EC)) {
1084 			memscrub_read(va, bpp);
1085 			/*
1086 			 * Check if CEs require logging
1087 			 */
1088 			cpu_check_ce(SCRUBBER_CEEN_CHECK,
1089 			    (uint64_t)pa, va, psz);
1090 			no_trap();
1091 			thread_affinity_clear(curthread);
1092 		} else {
1093 			no_trap();
1094 			thread_affinity_clear(curthread);
1095 
1096 			/*
1097 			 * Got an async error..
1098 			 * Try rescanning it at MMU_PAGESIZE
1099 			 * granularity if we were trying to
1100 			 * read at a larger page size.
1101 			 * This is to ensure we continue to
1102 			 * scan the rest of the span.
1103 			 */
1104 			if (psz > MMU_PAGESIZE) {
1105 			    caddr_t vaddr = va;
1106 			    ms_paddr_t paddr = pa;
1107 			    int tmp = 0;
1108 			    for (; tmp < bpp; tmp += MEMSCRUB_BPP) {
1109 				thread_affinity_set(curthread, CPU_CURRENT);
1110 				if (!on_trap(&otd, OT_DATA_EC)) {
1111 				    memscrub_read(vaddr, MEMSCRUB_BPP);
1112 				    cpu_check_ce(SCRUBBER_CEEN_CHECK,
1113 					(uint64_t)paddr, vaddr, MMU_PAGESIZE);
1114 				    no_trap();
1115 				} else {
1116 				    no_trap();
1117 				    memscrub_counts.errors_found.value.ui32++;
1118 				}
1119 				thread_affinity_clear(curthread);
1120 				vaddr += MMU_PAGESIZE;
1121 				paddr += MMU_PAGESIZE;
1122 			    }
1123 			}
1124 		}
1125 		hat_unload(kas.a_hat, va, psz, HAT_UNLOAD_UNLOCK);
1126 
1127 		blks -= bpp;
1128 		pa += psz;
1129 		pgsread++;
1130 	}
1131 	if (memscrub_verbose) {
1132 		cmn_err(CE_NOTE, "Memory scrubber read 0x%x pages starting "
1133 		    "at 0x%" PRIx64, pgsread, src);
1134 	}
1135 }
1136 
1137 /*
1138  * The memory add/delete callback mechanism does not pass in the
1139  * page ranges. The phys_install list has been updated though, so
1140  * create a new scrub list from it.
1141  */
1142 
1143 static int
1144 new_memscrub(void)
1145 {
1146 	struct memlist *src, *list, *old_list;
1147 	uint_t npgs;
1148 
1149 	/*
1150 	 * copy phys_install to memscrub_memlist
1151 	 */
1152 	list = NULL;
1153 	npgs = 0;
1154 	memlist_read_lock();
1155 	for (src = phys_install; src; src = src->next) {
1156 		if (memscrub_add_span_gen((pfn_t)(src->address >> PAGESHIFT),
1157 		    (pgcnt_t)(src->size >> PAGESHIFT), &list, &npgs)) {
1158 			memlist_read_unlock();
1159 			while (list) {
1160 				struct memlist *el;
1161 
1162 				el = list;
1163 				list = list->next;
1164 				kmem_free(el, sizeof (struct memlist));
1165 			}
1166 			return (-1);
1167 		}
1168 	}
1169 	memlist_read_unlock();
1170 
1171 	mutex_enter(&memscrub_lock);
1172 	memscrub_phys_pages = npgs;
1173 	old_list = memscrub_memlist;
1174 	memscrub_memlist = list;
1175 	mutex_exit(&memscrub_lock);
1176 
1177 	while (old_list) {
1178 		struct memlist *el;
1179 
1180 		el = old_list;
1181 		old_list = old_list->next;
1182 		kmem_free(el, sizeof (struct memlist));
1183 	}
1184 	return (0);
1185 }
1186 
1187 /*ARGSUSED*/
1188 static void
1189 memscrub_mem_config_post_add(
1190 	void *arg,
1191 	pgcnt_t delta_pages)
1192 {
1193 	/*
1194 	 * We increment pause_memscrub before entering new_memscrub(). This
1195 	 * will force the memscrubber to sleep, allowing the DR callback
1196 	 * thread to acquire memscrub_lock in new_memscrub(). The use of
1197 	 * atomic_add_32() allows concurrent memory DR operations to use the
1198 	 * callbacks safely.
1199 	 */
1200 	atomic_add_32(&pause_memscrub, 1);
1201 	ASSERT(pause_memscrub != 0);
1202 
1203 	/*
1204 	 * "Don't care" if this fails and the new memory goes unscrubbed.
1205 	 */
1206 	(void) new_memscrub();
1207 
1208 	/* Restore the pause setting. */
1209 	atomic_add_32(&pause_memscrub, -1);
1210 }
1211 
1212 /*ARGSUSED*/
1213 static int
1214 memscrub_mem_config_pre_del(
1215 	void *arg,
1216 	pgcnt_t delta_pages)
1217 {
1218 	/* Nothing to do. */
1219 	return (0);
1220 }
1221 
1222 /*ARGSUSED*/
1223 static void
1224 memscrub_mem_config_post_del(
1225 	void *arg,
1226 	pgcnt_t delta_pages,
1227 	int cancelled)
1228 {
1229 	/*
1230 	 * We increment pause_memscrub before entering new_memscrub(). This
1231 	 * will force the memscrubber to sleep, allowing the DR callback
1232 	 * thread to acquire memscrub_lock in new_memscrub(). The use of
1233 	 * atomic_add_32() allows concurrent memory DR operations to use the
1234 	 * callbacks safely.
1235 	 */
1236 	atomic_add_32(&pause_memscrub, 1);
1237 	ASSERT(pause_memscrub != 0);
1238 
1239 	/*
1240 	 * Must stop scrubbing deleted memory as it may be disconnected.
1241 	 */
1242 	if (new_memscrub()) {
1243 		disable_memscrub = 1;
1244 	}
1245 
1246 	/* Restore the pause setting. */
1247 	atomic_add_32(&pause_memscrub, -1);
1248 }
1249 
1250 static kphysm_setup_vector_t memscrub_mem_config_vec = {
1251 	KPHYSM_SETUP_VECTOR_VERSION,
1252 	memscrub_mem_config_post_add,
1253 	memscrub_mem_config_pre_del,
1254 	memscrub_mem_config_post_del,
1255 };
1256 
1257 static void
1258 memscrub_init_mem_config(void)
1259 {
1260 	int ret;
1261 
1262 	ret = kphysm_setup_func_register(&memscrub_mem_config_vec,
1263 	    (void *)NULL);
1264 	ASSERT(ret == 0);
1265 }
1266 
1267 static void
1268 memscrub_uninit_mem_config(void)
1269 {
1270 	/* This call is OK if the register call was not done. */
1271 	kphysm_setup_func_unregister(&memscrub_mem_config_vec, (void *)NULL);
1272 }
1273