xref: /illumos-gate/usr/src/uts/i86pc/os/memscrub.c (revision cc6c5292fa8a241fe50604cf6a918edfbf7cd7d2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * i86pc Memory Scrubbing
31  *
32  * On detection of a correctable memory ECC error, the i86pc hardware
33  * returns the corrected data to the requester and may re-write it
34  * to memory (DRAM or NVRAM). Machines which do not re-write this to
35  * memory should add an NMI handler to correct and rewrite.
36  *
 * Scrubbing thus reduces the likelihood that multiple transient errors
 * will occur in the same memory word, making uncorrectable errors due
 * to transients less likely.
40  *
41  * Thus is born the desire that every memory location be periodically
42  * accessed.
43  *
44  * This file implements a memory scrubbing thread.  This scrubber
45  * guarantees that all of physical memory is accessed periodically
46  * (memscrub_period_sec -- 12 hours).
47  *
 * It attempts to do this as unobtrusively as possible.  The thread
 * schedules itself to wake up at an interval such that if it reads
 * memscrub_span_pages (4MB) on each wakeup, it will read all of physical
 * memory in memscrub_period_sec (12 hours).
52  *
 * The scrubber uses the REP LODS instruction, so it reads 4MB in 0.15 secs
 * (on P5-200).
54  * When it completes a span, if all the CPUs are idle, it reads another span.
55  * Typically it soaks up idle time this way to reach its deadline early
56  * -- and sleeps until the next period begins.
57  *
58  * Maximal Cost Estimate:  8GB @ xxMB/s = xxx seconds spent in 640 wakeups
59  * that run for 0.15 seconds at intervals of 67 seconds.
60  *
61  * In practice, the scrubber finds enough idle time to finish in a few
62  * minutes, and sleeps until its 12 hour deadline.
63  *
64  * The scrubber maintains a private copy of the phys_install memory list
65  * to keep track of what memory should be scrubbed.
66  *
67  * The following parameters can be set via /etc/system
68  *
69  * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (4MB)
70  * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
71  * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (0)
72  * memscrub_delay_start_sec = (10 seconds)
73  * disable_memscrub = (0)
74  *
75  * the scrubber will exit (or never be started) if it finds the variable
76  * "disable_memscrub" set.
77  *
78  * MEMSCRUB_DFL_SPAN_PAGES  is based on the guess that 0.15 sec
79  * is a "good" amount of minimum time for the thread to run at a time.
80  *
81  * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
82  * twice the frequency the hardware folk estimated would be necessary.
83  *
84  * MEMSCRUB_DFL_THREAD_PRI (0) is based on the assumption that nearly
85  * any other use of the system should be higher priority than scrubbing.
86  */
87 
88 #include <sys/types.h>
89 #include <sys/systm.h>		/* timeout, types, t_lock */
90 #include <sys/cmn_err.h>
91 #include <sys/sysmacros.h>	/* MIN */
92 #include <sys/memlist.h>	/* memlist */
93 #include <sys/kmem.h>		/* KMEM_NOSLEEP */
94 #include <sys/cpuvar.h>		/* ncpus_online */
95 #include <sys/debug.h>		/* ASSERTs */
96 #include <sys/vmem.h>
97 #include <sys/mman.h>
98 #include <vm/seg_kmem.h>
99 #include <vm/seg_kpm.h>
100 #include <vm/hat_i86.h>
101 
/*
 * Kernel virtual address through which the page being scrubbed is read.
 * When kpm_enable is clear, memscrub_init() allocates one page of heap_arena
 * for it and memscrubber() re-points that mapping at each physical page via
 * memscrub_pte.  When kpm_enable is set, memscrubber() instead overwrites it
 * with the address returned by hat_kpm_pfn2va() for each page.
 */
static caddr_t	memscrub_window;
/* value returned by hat_mempte_setup() for memscrub_window (non-kpm case) */
static void	*memscrub_pte;
104 
/*
 * Global Data:
 *
 * Compile-time defaults for the patchable tunables declared below.
 */
/*
 * scan all of physical memory at least once every MEMSCRUB_DFL_PERIOD_SEC
 */
#define	MEMSCRUB_DFL_PERIOD_SEC	(12 * 60 * 60)	/* 12 hours */

/*
 * start only if at least MEMSCRUB_MIN_PAGES in system (32MB worth of pages)
 */
#define	MEMSCRUB_MIN_PAGES	((32 * 1024 * 1024) / PAGESIZE)

/*
 * scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration (4MB worth of pages)
 */
#define	MEMSCRUB_DFL_SPAN_PAGES	((4 * 1024 * 1024) / PAGESIZE)

/*
 * almost anything is higher priority than scrubbing
 */
#define	MEMSCRUB_DFL_THREAD_PRI	0
127 
/*
 * we can patch these defaults in /etc/system if necessary
 *
 * disable_memscrub		nonzero makes the memscrubber thread exit
 * memscrub_min_pages		memscrub_init() does nothing if physmem is
 *				below this many pages
 * memscrub_span_pages		pages read per scan iteration
 * memscrub_period_sec		target time for one full pass over memory
 * memscrub_thread_pri		priority passed to thread_create()
 * memscrub_delay_start_sec	delay before the first pass is due
 */
uint_t disable_memscrub = 0;
pgcnt_t memscrub_min_pages = MEMSCRUB_MIN_PAGES;
pgcnt_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
time_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
time_t memscrub_delay_start_sec = 10;
137 
/*
 * Static Routines
 *
 * Forward declarations for file-local routines that are referenced
 * before their definitions.
 */
static void memscrubber(void);
static int system_is_idle(void);
static int memscrub_add_span(uint64_t, uint64_t);
144 
/*
 * Static Data
 */
/*
 * memscrub_lock protects memscrub_memlist
 */
static struct memlist *memscrub_memlist;	/* spans to be scrubbed */
static uint_t memscrub_phys_pages;	/* pages added via memscrub_add_span */

static kcondvar_t memscrub_cv;		/* memscrubber sleeps on this */
static kmutex_t memscrub_lock;

/*
 * Statistics maintained by memscrubber().
 */
uint_t memscrub_scans_done;	/* number of spans scanned */

uint_t memscrub_done_early;	/* passes that finished before the deadline */
uint_t memscrub_early_sec;	/* total seconds by which those were early */

uint_t memscrub_done_late;	/* passes that finished at/after the deadline */
time_t memscrub_late_sec;	/* total seconds by which those were late */
163 
164 /*
165  * create memscrub_memlist from phys_install list
166  * initialize locks, set memscrub_phys_pages.
167  */
168 void
169 memscrub_init()
170 {
171 	struct memlist *src;
172 
173 	if (physmem < memscrub_min_pages)
174 		return;
175 
176 	if (!kpm_enable) {
177 		memscrub_window = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
178 		memscrub_pte = hat_mempte_setup(memscrub_window);
179 	}
180 
181 	/*
182 	 * copy phys_install to memscrub_memlist
183 	 */
184 	for (src = phys_install; src; src = src->next) {
185 		if (memscrub_add_span(src->address, src->size)) {
186 			cmn_err(CE_WARN,
187 			    "Memory scrubber failed to initialize\n");
188 			return;
189 		}
190 	}
191 
192 	mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
193 	cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);
194 
195 	/*
196 	 * create memscrubber thread
197 	 */
198 	(void) thread_create(NULL, 0, (void (*)())memscrubber, NULL, 0, &p0,
199 	    TS_RUN, memscrub_thread_pri);
200 }
201 
#ifdef MEMSCRUB_DEBUG
/*
 * Debug aid: print `title' followed by the address and size of every
 * entry on the memlist starting at `listp'.
 */
void
memscrub_printmemlist(char *title, struct memlist *listp)
{
	struct memlist *list;

	cmn_err(CE_CONT, "%s:\n", title);

	for (list = listp; list; list = list->next) {
		/*
		 * Cast explicitly so the arguments match %llx no matter
		 * how the memlist fields are typedef'd on this platform.
		 */
		cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
		    (unsigned long long)list->address,
		    (unsigned long long)list->size);
	}
}
#endif /* MEMSCRUB_DEBUG */
216 
/*
 * Callback armed by memscrubber() through timeout(); it wakes the
 * scrubber thread sleeping on memscrub_cv.  The argument is unused.
 */
/* ARGSUSED */
void
memscrub_wakeup(void *c)
{
	/*
	 * grab mutex to guarantee that our wakeup call
	 * arrives after we go to sleep -- so we can't sleep forever.
	 * (memscrubber() arms the timeout while holding memscrub_lock and
	 * only releases it inside cv_wait().)
	 */
	mutex_enter(&memscrub_lock);
	cv_signal(&memscrub_cv);
	mutex_exit(&memscrub_lock);
}
229 
230 /*
231  * this calculation doesn't account for the time that the actual scan
232  * consumes -- so we'd fall slightly behind schedule with this
233  * interval_sec.  but the idle loop optimization below usually makes us
234  * come in way ahead of schedule.
235  */
236 static int
237 compute_interval_sec()
238 {
239 	if (memscrub_phys_pages <= memscrub_span_pages)
240 		return (memscrub_period_sec);
241 	else
242 		return (memscrub_period_sec/
243 			(memscrub_phys_pages/memscrub_span_pages));
244 }
245 
246 void
247 memscrubber()
248 {
249 	time_t deadline;
250 	uint64_t mlp_last_addr;
251 	uint64_t mlp_next_addr;
252 	int reached_end = 1;
253 	time_t interval_sec = 0;
254 	struct memlist *mlp;
255 
256 	extern void scan_memory(caddr_t, size_t);
257 
258 	if (memscrub_memlist == NULL) {
259 		cmn_err(CE_WARN, "memscrub_memlist not initialized.");
260 		goto memscrub_exit;
261 	}
262 
263 	mlp = memscrub_memlist;
264 	mlp_next_addr = mlp->address;
265 	mlp_last_addr = mlp->address + mlp->size;
266 
267 	deadline = gethrestime_sec() + memscrub_delay_start_sec;
268 
269 	for (;;) {
270 		if (disable_memscrub)
271 			break;
272 
273 		mutex_enter(&memscrub_lock);
274 
275 		/*
276 		 * did we just reach the end of memory?
277 		 */
278 		if (reached_end) {
279 			time_t now = gethrestime_sec();
280 
281 			if (now >= deadline) {
282 				memscrub_done_late++;
283 				memscrub_late_sec += (now - deadline);
284 				/*
285 				 * past deadline, start right away
286 				 */
287 				interval_sec = 0;
288 
289 				deadline = now + memscrub_period_sec;
290 			} else {
291 				/*
292 				 * we finished ahead of schedule.
293 				 * wait till previous dealine before re-start.
294 				 */
295 				interval_sec = deadline - now;
296 				memscrub_done_early++;
297 				memscrub_early_sec += interval_sec;
298 				deadline += memscrub_period_sec;
299 			}
300 		} else {
301 			interval_sec = compute_interval_sec();
302 		}
303 
304 		/*
305 		 * hit the snooze bar
306 		 */
307 		(void) timeout(memscrub_wakeup, NULL, interval_sec * hz);
308 
309 		/*
310 		 * go to sleep
311 		 */
312 		cv_wait(&memscrub_cv, &memscrub_lock);
313 
314 		mutex_exit(&memscrub_lock);
315 
316 		do {
317 			pgcnt_t pages = memscrub_span_pages;
318 			uint64_t address = mlp_next_addr;
319 
320 			if (disable_memscrub)
321 				break;
322 
323 			mutex_enter(&memscrub_lock);
324 
325 			/*
326 			 * Make sure we don't try to scan beyond the end of
327 			 * the current memlist.  If we would, then resize
328 			 * our scan target for this iteration, and prepare
329 			 * to read the next memlist entry on the next
330 			 * iteration.
331 			 */
332 			reached_end = 0;
333 			if (address + mmu_ptob(pages) >= mlp_last_addr) {
334 				pages = mmu_btop(mlp_last_addr - address);
335 				mlp = mlp->next;
336 				if (mlp == NULL) {
337 					reached_end = 1;
338 					mlp = memscrub_memlist;
339 				}
340 				mlp_next_addr = mlp->address;
341 				mlp_last_addr = mlp->address + mlp->size;
342 			} else {
343 				mlp_next_addr += mmu_ptob(pages);
344 			}
345 
346 			mutex_exit(&memscrub_lock);
347 
348 			while (pages--) {
349 				pfn_t pfn = btop(address);
350 
351 				/*
352 				 * Without segkpm, the memscrubber cannot
353 				 * be allowed to migrate across CPUs, as
354 				 * the CPU-specific mapping of
355 				 * memscrub_window would be incorrect.
356 				 * With segkpm, switching CPUs is legal, but
357 				 * inefficient.  We don't use
358 				 * kpreempt_disable as it might hold a
359 				 * higher priority thread (eg, RT) too long
360 				 * off CPU.
361 				 */
362 				thread_affinity_set(curthread, CPU_CURRENT);
363 				if (kpm_enable)
364 					memscrub_window = hat_kpm_pfn2va(pfn);
365 				else
366 					hat_mempte_remap(pfn, memscrub_window,
367 					    memscrub_pte,
368 					    PROT_READ, HAT_LOAD_NOCONSIST);
369 
370 				scan_memory(memscrub_window, PAGESIZE);
371 
372 				thread_affinity_clear(curthread);
373 				address += MMU_PAGESIZE;
374 			}
375 
376 			memscrub_scans_done++;
377 		} while (!reached_end && system_is_idle());
378 	}
379 
380 memscrub_exit:
381 
382 	cmn_err(CE_NOTE, "memory scrubber exiting.");
383 
384 	cv_destroy(&memscrub_cv);
385 
386 	thread_exit();
387 }
388 
389 
390 /*
391  * return 1 if we're MP and all the other CPUs are idle
392  */
393 static int
394 system_is_idle()
395 {
396 	int cpu_id;
397 	int found = 0;
398 
399 	if (1 == ncpus_online)
400 		return (0);
401 
402 	for (cpu_id = 0; cpu_id < NCPU; ++cpu_id) {
403 		if (!cpu[cpu_id])
404 			continue;
405 
406 		found++;
407 
408 		if (cpu[cpu_id]->cpu_thread != cpu[cpu_id]->cpu_idle_thread) {
409 			if (CPU->cpu_id == cpu_id &&
410 			    CPU->cpu_disp->disp_nrunnable == 0)
411 				continue;
412 			return (0);
413 		}
414 
415 		if (found == ncpus)
416 			break;
417 	}
418 	return (1);
419 }
420 
421 /*
422  * add a span to the memscrub list
423  */
424 static int
425 memscrub_add_span(uint64_t start, uint64_t bytes)
426 {
427 	struct memlist *dst;
428 	struct memlist *prev, *next;
429 	uint64_t end = start + bytes - 1;
430 	int retval = 0;
431 
432 	mutex_enter(&memscrub_lock);
433 
434 #ifdef MEMSCRUB_DEBUG
435 	memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
436 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
437 	cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
438 		" size: 0x%llx\n", start, bytes);
439 #endif /* MEMSCRUB_DEBUG */
440 
441 	/*
442 	 * Scan through the list to find the proper place to install it.
443 	 */
444 	prev = NULL;
445 	next = memscrub_memlist;
446 	while (next) {
447 		uint64_t ns = next->address;
448 		uint64_t ne = next->address + next->size - 1;
449 
450 		/*
451 		 * If this span overlaps with an existing span, then
452 		 * something has gone horribly wrong with the phys_install
453 		 * list.  In fact, I'm surprised we made it this far.
454 		 */
455 		if ((start >= ns && start <= ne) || (end >= ns && end <= ne) ||
456 		    (start < ns && end > ne))
457 			panic("memscrub found overlapping memory ranges "
458 			    "(0x%p-0x%p) and (0x%p-0x%p)",
459 			    (void *)(uintptr_t)start, (void *)(uintptr_t)end,
460 			    (void *)(uintptr_t)ns, (void *)(uintptr_t)ne);
461 
462 		/*
463 		 * New span can be appended to an existing one.
464 		 */
465 		if (start == ne + 1) {
466 			next->size += bytes;
467 			goto add_done;
468 		}
469 
470 		/*
471 		 * New span can be prepended to an existing one.
472 		 */
473 		if (end + 1 == ns) {
474 			next->size += bytes;
475 			next->address = start;
476 			goto add_done;
477 		}
478 
479 		/*
480 		 * If the next span has a higher start address than the new
481 		 * one, then we have found the right spot for our
482 		 * insertion.
483 		 */
484 		if (ns > start)
485 			break;
486 
487 		prev = next;
488 		next = next->next;
489 	}
490 
491 	/*
492 	 * allocate a new struct memlist
493 	 */
494 	dst = kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
495 	if (dst == NULL) {
496 		retval = -1;
497 		goto add_done;
498 	}
499 	dst->address = start;
500 	dst->size = bytes;
501 	dst->prev = prev;
502 	dst->next = next;
503 
504 	if (prev)
505 		prev->next = dst;
506 	else
507 		memscrub_memlist = dst;
508 
509 	if (next)
510 		next->prev = dst;
511 
512 add_done:
513 
514 	if (retval != -1)
515 		memscrub_phys_pages += mmu_btop(bytes);
516 
517 #ifdef MEMSCRUB_DEBUG
518 	memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
519 	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
520 #endif /* MEMSCRUB_DEBUG */
521 
522 	mutex_exit(&memscrub_lock);
523 	return (retval);
524 }
525