xref: /illumos-gate/usr/src/uts/i86pc/os/memscrub.c (revision f498645a3eecf2ddd304b4ea9c7f1b4c155ff79e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 /*
31  * i86pc Memory Scrubbing
32  *
33  * On detection of a correctable memory ECC error, the i86pc hardware
34  * returns the corrected data to the requester and may re-write it
35  * to memory (DRAM or NVRAM). Machines which do not re-write this to
36  * memory should add an NMI handler to correct and rewrite.
37  *
38  * Scrubbing thus reduces the likelihood that multiple transient errors
39  * will occur in the same memory word, making uncorrectable errors due
40  * to transients less likely.
41  *
42  * Thus is born the desire that every memory location be periodically
43  * accessed.
44  *
45  * This file implements a memory scrubbing thread.  This scrubber
46  * guarantees that all of physical memory is accessed periodically
47  * (memscrub_period_sec -- 12 hours).
48  *
49  * It attempts to do this as unobtrusively as possible.  The thread
50  * schedules itself to wake up at an interval such that if it reads
51  * memscrub_span_pages (4MB) on each wakeup, it will read all of physical
52  * memory in memscrub_period_sec (12 hours).
53  *
54  * The scrubber uses the REP LODS instruction, so it reads 4MB in 0.15 secs
 * (on P5-200).
55  * When it completes a span, if all the CPUs are idle, it reads another span.
56  * Typically it soaks up idle time this way to reach its deadline early
57  * -- and sleeps until the next period begins.
58  *
59  * Maximal Cost Estimate:  8GB @ xxMB/s = xxx seconds spent in 640 wakeups
60  * that run for 0.15 seconds at intervals of 67 seconds.
61  *
62  * In practice, the scrubber finds enough idle time to finish in a few
63  * minutes, and sleeps until its 12 hour deadline.
64  *
65  * The scrubber maintains a private copy of the phys_install memory list
66  * to keep track of what memory should be scrubbed.
67  *
68  * The following parameters can be set via /etc/system
69  *
70  * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (4MB)
71  * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
72  * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (0)
73  * memscrub_delay_start_sec = (10 seconds)
74  * disable_memscrub = (0)
75  *
76  * the scrubber will exit (or never be started) if it finds the variable
77  * "disable_memscrub" set.
78  *
79  * MEMSCRUB_DFL_SPAN_PAGES  is based on the guess that 0.15 sec
80  * is a "good" amount of minimum time for the thread to run at a time.
81  *
82  * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
83  * twice the frequency the hardware folk estimated would be necessary.
84  *
85  * MEMSCRUB_DFL_THREAD_PRI (0) is based on the assumption that nearly
86  * any other use of the system should be higher priority than scrubbing.
87  */
88 
89 #include <sys/types.h>
90 #include <sys/systm.h>		/* timeout, types, t_lock */
91 #include <sys/cmn_err.h>
92 #include <sys/sysmacros.h>	/* MIN */
93 #include <sys/memlist.h>	/* memlist */
94 #include <sys/kmem.h>		/* KMEM_NOSLEEP */
95 #include <sys/cpuvar.h>		/* ncpus_online */
96 #include <sys/debug.h>		/* ASSERTs */
97 #include <sys/vmem.h>
98 #include <sys/mman.h>
99 #include <vm/seg_kmem.h>
100 #include <vm/seg_kpm.h>
101 #include <vm/hat_i86.h>
102 
103 static caddr_t	memscrub_window;
104 static void	*memscrub_pte;
105 
106 /*
107  * Global Data:
108  */
109 /*
110  * scan all of physical memory at least once every MEMSCRUB_PERIOD_SEC
111  */
112 #define	MEMSCRUB_DFL_PERIOD_SEC	(12 * 60 * 60)	/* 12 hours */
113 
114 /*
115  * start only if at least MEMSCRUB_MIN_PAGES in system
116  */
117 #define	MEMSCRUB_MIN_PAGES	((32 * 1024 * 1024) / PAGESIZE)
118 
119 /*
120  * scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration
121  */
122 #define	MEMSCRUB_DFL_SPAN_PAGES	((4 * 1024 * 1024) / PAGESIZE)
123 
124 /*
125  * almost anything is higher priority than scrubbing
126  */
127 #define	MEMSCRUB_DFL_THREAD_PRI	0
128 
129 /*
130  * we can patch these defaults in /etc/system if necessary
131  */
132 uint_t disable_memscrub = 0;
133 static uint_t disable_memscrub_quietly = 0;
134 pgcnt_t memscrub_min_pages = MEMSCRUB_MIN_PAGES;
135 pgcnt_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
136 time_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
137 uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
138 time_t memscrub_delay_start_sec = 10;
139 
/*
 * Forward declarations of the file-local routines that are used before
 * they are defined.
 */
static int memscrub_add_span(uint64_t start, uint64_t bytes);
static int system_is_idle(void);
static void memscrubber(void);
146 
147 /*
148  * Static Data
149  */
150 static struct memlist *memscrub_memlist;
151 static uint_t memscrub_phys_pages;
152 
153 static kcondvar_t memscrub_cv;
154 static kmutex_t memscrub_lock;
155 
156 /*
157  * memscrub_lock protects memscrub_memlist
158  */
159 uint_t memscrub_scans_done;
160 
161 uint_t memscrub_done_early;
162 uint_t memscrub_early_sec;
163 
164 uint_t memscrub_done_late;
165 time_t memscrub_late_sec;
166 
167 /*
168  * create memscrub_memlist from phys_install list
169  * initialize locks, set memscrub_phys_pages.
170  */
171 void
172 memscrub_init()
173 {
174 	struct memlist *src;
175 
176 	if (physmem < memscrub_min_pages)
177 		return;
178 
179 	if (!kpm_enable) {
180 		memscrub_window = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
181 		memscrub_pte = hat_mempte_setup(memscrub_window);
182 	}
183 
184 	/*
185 	 * copy phys_install to memscrub_memlist
186 	 */
187 	for (src = phys_install; src; src = src->next) {
188 		if (memscrub_add_span(src->address, src->size)) {
189 			cmn_err(CE_WARN,
190 			    "Memory scrubber failed to initialize\n");
191 			return;
192 		}
193 	}
194 
195 	mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
196 	cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);
197 
198 	/*
199 	 * create memscrubber thread
200 	 */
201 	(void) thread_create(NULL, 0, (void (*)())memscrubber, NULL, 0, &p0,
202 	    TS_RUN, memscrub_thread_pri);
203 }
204 
205 /*
206  * Function to cause the software memscrubber to exit quietly if the
207  * platform support has located a hardware scrubber and enabled it.
208  */
209 void
210 memscrub_disable(void)
211 {
212 	disable_memscrub_quietly = 1;
213 }
214 
215 #ifdef MEMSCRUB_DEBUG
216 void
217 memscrub_printmemlist(char *title, struct memlist *listp)
218 {
219 	struct memlist *list;
220 
221 	cmn_err(CE_CONT, "%s:\n", title);
222 
223 	for (list = listp; list; list = list->next) {
224 		cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
225 		    list->address, list->size);
226 	}
227 }
228 #endif /* MEMSCRUB_DEBUG */
229 
230 /* ARGSUSED */
231 void
232 memscrub_wakeup(void *c)
233 {
234 	/*
235 	 * grab mutex to guarantee that our wakeup call
236 	 * arrives after we go to sleep -- so we can't sleep forever.
237 	 */
238 	mutex_enter(&memscrub_lock);
239 	cv_signal(&memscrub_cv);
240 	mutex_exit(&memscrub_lock);
241 }
242 
243 /*
244  * this calculation doesn't account for the time that the actual scan
245  * consumes -- so we'd fall slightly behind schedule with this
246  * interval_sec.  but the idle loop optimization below usually makes us
247  * come in way ahead of schedule.
248  */
249 static int
250 compute_interval_sec()
251 {
252 	if (memscrub_phys_pages <= memscrub_span_pages)
253 		return (memscrub_period_sec);
254 	else
255 		return (memscrub_period_sec/
256 			(memscrub_phys_pages/memscrub_span_pages));
257 }
258 
259 void
260 memscrubber()
261 {
262 	time_t deadline;
263 	uint64_t mlp_last_addr;
264 	uint64_t mlp_next_addr;
265 	int reached_end = 1;
266 	time_t interval_sec = 0;
267 	struct memlist *mlp;
268 
269 	extern void scan_memory(caddr_t, size_t);
270 
271 	if (memscrub_memlist == NULL) {
272 		cmn_err(CE_WARN, "memscrub_memlist not initialized.");
273 		goto memscrub_exit;
274 	}
275 
276 	mlp = memscrub_memlist;
277 	mlp_next_addr = mlp->address;
278 	mlp_last_addr = mlp->address + mlp->size;
279 
280 	deadline = gethrestime_sec() + memscrub_delay_start_sec;
281 
282 	for (;;) {
283 		if (disable_memscrub || disable_memscrub_quietly)
284 			break;
285 
286 		mutex_enter(&memscrub_lock);
287 
288 		/*
289 		 * did we just reach the end of memory?
290 		 */
291 		if (reached_end) {
292 			time_t now = gethrestime_sec();
293 
294 			if (now >= deadline) {
295 				memscrub_done_late++;
296 				memscrub_late_sec += (now - deadline);
297 				/*
298 				 * past deadline, start right away
299 				 */
300 				interval_sec = 0;
301 
302 				deadline = now + memscrub_period_sec;
303 			} else {
304 				/*
305 				 * we finished ahead of schedule.
306 				 * wait till previous dealine before re-start.
307 				 */
308 				interval_sec = deadline - now;
309 				memscrub_done_early++;
310 				memscrub_early_sec += interval_sec;
311 				deadline += memscrub_period_sec;
312 			}
313 		} else {
314 			interval_sec = compute_interval_sec();
315 		}
316 
317 		/*
318 		 * hit the snooze bar
319 		 */
320 		(void) timeout(memscrub_wakeup, NULL, interval_sec * hz);
321 
322 		/*
323 		 * go to sleep
324 		 */
325 		cv_wait(&memscrub_cv, &memscrub_lock);
326 
327 		mutex_exit(&memscrub_lock);
328 
329 		do {
330 			pgcnt_t pages = memscrub_span_pages;
331 			uint64_t address = mlp_next_addr;
332 
333 			if (disable_memscrub || disable_memscrub_quietly)
334 				break;
335 
336 			mutex_enter(&memscrub_lock);
337 
338 			/*
339 			 * Make sure we don't try to scan beyond the end of
340 			 * the current memlist.  If we would, then resize
341 			 * our scan target for this iteration, and prepare
342 			 * to read the next memlist entry on the next
343 			 * iteration.
344 			 */
345 			reached_end = 0;
346 			if (address + mmu_ptob(pages) >= mlp_last_addr) {
347 				pages = mmu_btop(mlp_last_addr - address);
348 				mlp = mlp->next;
349 				if (mlp == NULL) {
350 					reached_end = 1;
351 					mlp = memscrub_memlist;
352 				}
353 				mlp_next_addr = mlp->address;
354 				mlp_last_addr = mlp->address + mlp->size;
355 			} else {
356 				mlp_next_addr += mmu_ptob(pages);
357 			}
358 
359 			mutex_exit(&memscrub_lock);
360 
361 			while (pages--) {
362 				pfn_t pfn = btop(address);
363 
364 				/*
365 				 * Without segkpm, the memscrubber cannot
366 				 * be allowed to migrate across CPUs, as
367 				 * the CPU-specific mapping of
368 				 * memscrub_window would be incorrect.
369 				 * With segkpm, switching CPUs is legal, but
370 				 * inefficient.  We don't use
371 				 * kpreempt_disable as it might hold a
372 				 * higher priority thread (eg, RT) too long
373 				 * off CPU.
374 				 */
375 				thread_affinity_set(curthread, CPU_CURRENT);
376 				if (kpm_enable)
377 					memscrub_window = hat_kpm_pfn2va(pfn);
378 				else
379 					hat_mempte_remap(pfn, memscrub_window,
380 					    memscrub_pte,
381 					    PROT_READ, HAT_LOAD_NOCONSIST);
382 
383 				scan_memory(memscrub_window, PAGESIZE);
384 
385 				thread_affinity_clear(curthread);
386 				address += MMU_PAGESIZE;
387 			}
388 
389 			memscrub_scans_done++;
390 		} while (!reached_end && system_is_idle());
391 	}
392 
393 memscrub_exit:
394 
395 	if (!disable_memscrub_quietly)
396 		cmn_err(CE_NOTE, "memory scrubber exiting.");
397 
398 	cv_destroy(&memscrub_cv);
399 
400 	thread_exit();
401 }
402 
403 
404 /*
405  * return 1 if we're MP and all the other CPUs are idle
406  */
407 static int
408 system_is_idle()
409 {
410 	int cpu_id;
411 	int found = 0;
412 
413 	if (1 == ncpus_online)
414 		return (0);
415 
416 	for (cpu_id = 0; cpu_id < NCPU; ++cpu_id) {
417 		if (!cpu[cpu_id])
418 			continue;
419 
420 		found++;
421 
422 		if (cpu[cpu_id]->cpu_thread != cpu[cpu_id]->cpu_idle_thread) {
423 			if (CPU->cpu_id == cpu_id &&
424 			    CPU->cpu_disp->disp_nrunnable == 0)
425 				continue;
426 			return (0);
427 		}
428 
429 		if (found == ncpus)
430 			break;
431 	}
432 	return (1);
433 }
434 
435 /*
436  * add a span to the memscrub list
437  */
438 static int
439 memscrub_add_span(uint64_t start, uint64_t bytes)
440 {
441 	struct memlist *dst;
442 	struct memlist *prev, *next;
443 	uint64_t end = start + bytes - 1;
444 	int retval = 0;
445 
446 	mutex_enter(&memscrub_lock);
447 
448 #ifdef MEMSCRUB_DEBUG
449 	memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
450 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
451 	cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
452 		" size: 0x%llx\n", start, bytes);
453 #endif /* MEMSCRUB_DEBUG */
454 
455 	/*
456 	 * Scan through the list to find the proper place to install it.
457 	 */
458 	prev = NULL;
459 	next = memscrub_memlist;
460 	while (next) {
461 		uint64_t ns = next->address;
462 		uint64_t ne = next->address + next->size - 1;
463 
464 		/*
465 		 * If this span overlaps with an existing span, then
466 		 * something has gone horribly wrong with the phys_install
467 		 * list.  In fact, I'm surprised we made it this far.
468 		 */
469 		if ((start >= ns && start <= ne) || (end >= ns && end <= ne) ||
470 		    (start < ns && end > ne))
471 			panic("memscrub found overlapping memory ranges "
472 			    "(0x%p-0x%p) and (0x%p-0x%p)",
473 			    (void *)(uintptr_t)start, (void *)(uintptr_t)end,
474 			    (void *)(uintptr_t)ns, (void *)(uintptr_t)ne);
475 
476 		/*
477 		 * New span can be appended to an existing one.
478 		 */
479 		if (start == ne + 1) {
480 			next->size += bytes;
481 			goto add_done;
482 		}
483 
484 		/*
485 		 * New span can be prepended to an existing one.
486 		 */
487 		if (end + 1 == ns) {
488 			next->size += bytes;
489 			next->address = start;
490 			goto add_done;
491 		}
492 
493 		/*
494 		 * If the next span has a higher start address than the new
495 		 * one, then we have found the right spot for our
496 		 * insertion.
497 		 */
498 		if (ns > start)
499 			break;
500 
501 		prev = next;
502 		next = next->next;
503 	}
504 
505 	/*
506 	 * allocate a new struct memlist
507 	 */
508 	dst = kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
509 	if (dst == NULL) {
510 		retval = -1;
511 		goto add_done;
512 	}
513 	dst->address = start;
514 	dst->size = bytes;
515 	dst->prev = prev;
516 	dst->next = next;
517 
518 	if (prev)
519 		prev->next = dst;
520 	else
521 		memscrub_memlist = dst;
522 
523 	if (next)
524 		next->prev = dst;
525 
526 add_done:
527 
528 	if (retval != -1)
529 		memscrub_phys_pages += mmu_btop(bytes);
530 
531 #ifdef MEMSCRUB_DEBUG
532 	memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
533 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
534 #endif /* MEMSCRUB_DEBUG */
535 
536 	mutex_exit(&memscrub_lock);
537 	return (retval);
538 }
539