xref: /linux/mm/memory-failure.c (revision d524dac9279b6a41ffdf7ff7958c577f2e387db6)
1 /*
2  * Copyright (C) 2008, 2009 Intel Corporation
3  * Authors: Andi Kleen, Fengguang Wu
4  *
5  * This software may be redistributed and/or modified under the terms of
6  * the GNU General Public License ("GPL") version 2 only as published by the
7  * Free Software Foundation.
8  *
9  * High level machine check handler. Handles pages reported by the
10  * hardware as being corrupted usually due to a multi-bit ECC memory or cache
11  * failure.
12  *
13  * In addition there is a "soft offline" entry point that allows stopping the
14  * use of suspicious, not-yet-corrupted pages without killing anything.
15  *
16  * Handles page cache pages in various states.	The tricky part
17  * here is that we can access any page asynchronously with respect to
18  * other VM users, because memory failures could happen anytime and
19  * anywhere. This could violate some of their assumptions. This is why
20  * this code has to be extremely careful. Generally it tries to use
21  * normal locking rules, as in get the standard locks, even if that means
22  * the error handling takes potentially a long time.
23  *
24  * There are several operations here with worse-than-linear complexity because
25  * of unsuitable VM data structures. For example the operation to map back
26  * from RMAP chains to processes has to walk the complete process list and
27  * has nonlinear complexity in the number of processes. But since memory corruptions
28  * are rare we hope to get away with this. This avoids impacting the core
29  * VM.
30  */
31 
32 /*
33  * Notebook:
34  * - hugetlb needs more code
35  * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
36  * - pass bad pages to kdump next kernel
37  */
38 #include <linux/kernel.h>
39 #include <linux/mm.h>
40 #include <linux/page-flags.h>
41 #include <linux/kernel-page-flags.h>
42 #include <linux/sched.h>
43 #include <linux/ksm.h>
44 #include <linux/rmap.h>
45 #include <linux/pagemap.h>
46 #include <linux/swap.h>
47 #include <linux/backing-dev.h>
48 #include <linux/migrate.h>
49 #include <linux/page-isolation.h>
50 #include <linux/suspend.h>
51 #include <linux/slab.h>
52 #include <linux/swapops.h>
53 #include <linux/hugetlb.h>
54 #include <linux/memory_hotplug.h>
55 #include "internal.h"
56 
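/*
 * Sysctl tunables.  memory_failure_early_kill: when non-zero, processes
 * that merely have a corrupted page mapped are sent SIGBUS while the error
 * is handled, instead of only when they actually touch the bad data; tasks
 * can override this with the PF_MCE_PROCESS/PF_MCE_EARLY flags (see
 * task_early_kill()).  memory_failure_recovery: when zero, any reported
 * memory failure panics the machine instead of attempting recovery.
 */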
57 int sysctl_memory_failure_early_kill __read_mostly = 0;
58 
59 int sysctl_memory_failure_recovery __read_mostly = 1;
60 
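/*
 * Running count of currently hwpoisoned (base) pages; updated when pages
 * are poisoned, unpoisoned or soft-offlined below.
 */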
61 atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
62 
63 #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
64 
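/*
 * Filter knobs for error-injection testing: hwpoison_filter() below lets an
 * injected error proceed only when the target page matches the configured
 * device, page-flag and (optionally) memcg constraints.  ~0U in the dev
 * major/minor fields means "match any device".
 */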
65 u32 hwpoison_filter_enable = 0;
66 u32 hwpoison_filter_dev_major = ~0U;
67 u32 hwpoison_filter_dev_minor = ~0U;
68 u64 hwpoison_filter_flags_mask;
69 u64 hwpoison_filter_flags_value;
70 EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
71 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
72 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
73 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
74 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
75 
76 static int hwpoison_filter_dev(struct page *p)
77 {
78 	struct address_space *mapping;
79 	dev_t dev;
80 
81 	if (hwpoison_filter_dev_major == ~0U &&
82 	    hwpoison_filter_dev_minor == ~0U)
83 		return 0;
84 
85 	/*
86 	 * page_mapping() does not accept slab pages.
87 	 */
88 	if (PageSlab(p))
89 		return -EINVAL;
90 
91 	mapping = page_mapping(p);
92 	if (mapping == NULL || mapping->host == NULL)
93 		return -EINVAL;
94 
95 	dev = mapping->host->i_sb->s_dev;
96 	if (hwpoison_filter_dev_major != ~0U &&
97 	    hwpoison_filter_dev_major != MAJOR(dev))
98 		return -EINVAL;
99 	if (hwpoison_filter_dev_minor != ~0U &&
100 	    hwpoison_filter_dev_minor != MINOR(dev))
101 		return -EINVAL;
102 
103 	return 0;
104 }
105 
106 static int hwpoison_filter_flags(struct page *p)
107 {
108 	if (!hwpoison_filter_flags_mask)
109 		return 0;
110 
111 	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
112 				    hwpoison_filter_flags_value)
113 		return 0;
114 	else
115 		return -EINVAL;
116 }
117 
118 /*
119  * This allows stress tests to limit test scope to a collection of tasks
120  * by putting them under some memcg. This prevents killing unrelated/important
121  * processes such as /sbin/init. Note that the target task may share clean
122  * pages with init (e.g. libc text), which is harmless. If the target task
123  * shares _dirty_ pages with another task B, the test scheme must make sure B
124  * is also included in the memcg. Finally, due to race conditions this filter
125  * can only guarantee that the page either belongs to the memcg tasks, or is
126  * a freed page.
127  */
128 #ifdef	CONFIG_CGROUP_MEM_RES_CTLR_SWAP
129 u64 hwpoison_filter_memcg;
130 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
131 static int hwpoison_filter_task(struct page *p)
132 {
133 	struct mem_cgroup *mem;
134 	struct cgroup_subsys_state *css;
135 	unsigned long ino;
136 
137 	if (!hwpoison_filter_memcg)
138 		return 0;
139 
140 	mem = try_get_mem_cgroup_from_page(p);
141 	if (!mem)
142 		return -EINVAL;
143 
144 	css = mem_cgroup_css(mem);
145 	/* root_mem_cgroup has NULL dentries */
146 	if (!css->cgroup->dentry)
147 		return -EINVAL;
148 
149 	ino = css->cgroup->dentry->d_inode->i_ino;
150 	css_put(css);
151 
152 	if (ino != hwpoison_filter_memcg)
153 		return -EINVAL;
154 
155 	return 0;
156 }
157 #else
158 static int hwpoison_filter_task(struct page *p) { return 0; }
159 #endif
160 
161 int hwpoison_filter(struct page *p)
162 {
163 	if (!hwpoison_filter_enable)
164 		return 0;
165 
166 	if (hwpoison_filter_dev(p))
167 		return -EINVAL;
168 
169 	if (hwpoison_filter_flags(p))
170 		return -EINVAL;
171 
172 	if (hwpoison_filter_task(p))
173 		return -EINVAL;
174 
175 	return 0;
176 }
177 #else
178 int hwpoison_filter(struct page *p)
179 {
180 	return 0;
181 }
182 #endif
183 
184 EXPORT_SYMBOL_GPL(hwpoison_filter);
185 
186 /*
187  * Send all the processes who have the page mapped an ``action optional''
188  * signal.
189  */
190 static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
191 			unsigned long pfn, struct page *page)
192 {
193 	struct siginfo si;
194 	int ret;
195 
196 	printk(KERN_ERR
197 		"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
198 		pfn, t->comm, t->pid);
199 	si.si_signo = SIGBUS;
200 	si.si_errno = 0;
201 	si.si_code = BUS_MCEERR_AO;
202 	si.si_addr = (void *)addr;
203 #ifdef __ARCH_SI_TRAPNO
204 	si.si_trapno = trapno;
205 #endif
206 	si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
207 	/*
208 	 * Don't use force here, it's convenient if the signal
209 	 * can be temporarily blocked.
210 	 * This could cause a loop when the user sets SIGBUS
211 	 * to SIG_IGN, but hopefully no one will do that?
212 	 */
213 	ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
214 	if (ret < 0)
215 		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
216 		       t->comm, t->pid, ret);
217 	return ret;
218 }
219 
220 /*
221  * When an unknown page type is encountered, drain as many buffers as possible
222  * in the hope of turning the page into an LRU or free page, which we can handle.
223  */
224 void shake_page(struct page *p, int access)
225 {
226 	if (!PageSlab(p)) {
227 		lru_add_drain_all();
228 		if (PageLRU(p))
229 			return;
230 		drain_all_pages();
231 		if (PageLRU(p) || is_free_buddy_page(p))
232 			return;
233 	}
234 
235 	/*
236 	 * Only call shrink_slab here (which would also
237 	 * shrink other caches) if access is not potentially fatal.
238 	 */
239 	if (access) {
240 		int nr;
241 		do {
242 			nr = shrink_slab(1000, GFP_KERNEL, 1000);
243 			if (page_count(p) == 1)
244 				break;
245 		} while (nr > 10);
246 	}
247 }
248 EXPORT_SYMBOL_GPL(shake_page);
249 
250 /*
251  * Kill all processes that have a poisoned page mapped and then isolate
252  * the page.
253  *
254  * General strategy:
255  * Find all processes having the page mapped and kill them.
256  * But we keep a page reference around so that the page is not
257  * actually freed yet.
258  * Then stash the page away
259  *
260  * There's no convenient way to get back to mapped processes
261  * from the VMAs. So do a brute-force search over all
262  * running processes.
263  *
264  * Remember that machine checks are not common (or rather
265  * if they are common you have other problems), so this shouldn't
266  * be a performance issue.
267  *
268  * Also there are some races possible while we get from the
269  * error detection to actually handling it.
270  */
271 
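/*
 * One pending kill: the task to signal, the user virtual address at which
 * the poisoned page is mapped in that task, and whether that address
 * lookup succeeded (if not, the task gets SIGKILL instead of SIGBUS).
 */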
272 struct to_kill {
273 	struct list_head nd;
274 	struct task_struct *tsk;
275 	unsigned long addr;
276 	char addr_valid;
277 };
278 
279 /*
280  * Failure handling: if we can't find or can't kill a process there's
281  * not much we can do. We just print a message and otherwise ignore it.
282  */
283 
284 /*
285  * Schedule a process for later kill.
286  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
287  * TBD would GFP_NOIO be enough?
288  */
289 static void add_to_kill(struct task_struct *tsk, struct page *p,
290 		       struct vm_area_struct *vma,
291 		       struct list_head *to_kill,
292 		       struct to_kill **tkc)
293 {
294 	struct to_kill *tk;
295 
296 	if (*tkc) {
297 		tk = *tkc;
298 		*tkc = NULL;
299 	} else {
300 		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
301 		if (!tk) {
302 			printk(KERN_ERR
303 		"MCE: Out of memory while machine check handling\n");
304 			return;
305 		}
306 	}
307 	tk->addr = page_address_in_vma(p, vma);
308 	tk->addr_valid = 1;
309 
310 	/*
311 	 * In theory we don't have to kill when the page was
312 	 * munmapped. But it could also be a mremap. Since that's
313 	 * likely very rare, kill anyway just out of paranoia, but use
314 	 * a SIGKILL because the error is not contained anymore.
315 	 */
316 	if (tk->addr == -EFAULT) {
317 		pr_info("MCE: Unable to find user space address %lx in %s\n",
318 			page_to_pfn(p), tsk->comm);
319 		tk->addr_valid = 0;
320 	}
321 	get_task_struct(tsk);
322 	tk->tsk = tsk;
323 	list_add_tail(&tk->nd, to_kill);
324 }
325 
326 /*
327  * Kill the processes that have been collected earlier.
328  *
329  * Only do anything when DOIT is set, otherwise just free the list
330  * (this is used for clean pages which do not need killing).
331  * Also, when FAIL is set, do a force kill because something went
332  * wrong earlier.
333  */
334 static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
335 			  int fail, struct page *page, unsigned long pfn)
336 {
337 	struct to_kill *tk, *next;
338 
339 	list_for_each_entry_safe (tk, next, to_kill, nd) {
340 		if (doit) {
341 			/*
342 			 * In case something went wrong with munmapping
343 			 * make sure the process doesn't catch the
344 			 * signal and then access the memory. Just kill it.
345 			 */
346 			if (fail || tk->addr_valid == 0) {
347 				printk(KERN_ERR
348 		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
349 					pfn, tk->tsk->comm, tk->tsk->pid);
350 				force_sig(SIGKILL, tk->tsk);
351 			}
352 
353 			/*
354 			 * In theory the process could have mapped
355 			 * something else on the address in-between. We could
356 			 * check for that, but we need to tell the
357 			 * process anyways.
358 			 */
359 			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
360 					      pfn, page) < 0)
361 				printk(KERN_ERR
362 		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
363 					pfn, tk->tsk->comm, tk->tsk->pid);
364 		}
365 		put_task_struct(tk->tsk);
366 		kfree(tk);
367 	}
368 }
369 
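/*
 * Should this task be sent an early SIGBUS merely for having the page
 * mapped?  The per-process PF_MCE_PROCESS/PF_MCE_EARLY flags override the
 * global sysctl_memory_failure_early_kill default; kernel threads (no mm)
 * are never killed here.
 */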
370 static int task_early_kill(struct task_struct *tsk)
371 {
372 	if (!tsk->mm)
373 		return 0;
374 	if (tsk->flags & PF_MCE_PROCESS)
375 		return !!(tsk->flags & PF_MCE_EARLY);
376 	return sysctl_memory_failure_early_kill;
377 }
378 
379 /*
380  * Collect processes when the error hit an anonymous page.
381  */
382 static void collect_procs_anon(struct page *page, struct list_head *to_kill,
383 			      struct to_kill **tkc)
384 {
385 	struct vm_area_struct *vma;
386 	struct task_struct *tsk;
387 	struct anon_vma *av;
388 
389 	if (!PageHuge(page) && unlikely(split_huge_page(page)))
390 		return;
391 	read_lock(&tasklist_lock);
392 	av = page_lock_anon_vma(page);
393 	if (av == NULL)	/* Not actually mapped anymore */
394 		goto out;
395 	for_each_process (tsk) {
396 		struct anon_vma_chain *vmac;
397 
398 		if (!task_early_kill(tsk))
399 			continue;
400 		list_for_each_entry(vmac, &av->head, same_anon_vma) {
401 			vma = vmac->vma;
402 			if (!page_mapped_in_vma(page, vma))
403 				continue;
404 			if (vma->vm_mm == tsk->mm)
405 				add_to_kill(tsk, page, vma, to_kill, tkc);
406 		}
407 	}
408 	page_unlock_anon_vma(av);
409 out:
410 	read_unlock(&tasklist_lock);
411 }
412 
413 /*
414  * Collect processes when the error hit a file mapped page.
415  */
416 static void collect_procs_file(struct page *page, struct list_head *to_kill,
417 			      struct to_kill **tkc)
418 {
419 	struct vm_area_struct *vma;
420 	struct task_struct *tsk;
421 	struct prio_tree_iter iter;
422 	struct address_space *mapping = page->mapping;
423 
424 	/*
425 	 * A note on the locking order between the two locks.
426 	 * We don't rely on this particular order.
427 	 * If you have some other code that needs a different order
428 	 * feel free to switch them around. Or add a reverse link
429 	 * from mm_struct to task_struct, then this could be all
430 	 * done without taking tasklist_lock and looping over all tasks.
431 	 */
432 
433 	read_lock(&tasklist_lock);
434 	spin_lock(&mapping->i_mmap_lock);
435 	for_each_process(tsk) {
436 		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
437 
438 		if (!task_early_kill(tsk))
439 			continue;
440 
441 		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
442 				      pgoff) {
443 			/*
444 			 * Send early kill signal to tasks where a vma covers
445 			 * the page but the corrupted page is not necessarily
446 			 * mapped in its pte.
447 			 * Assume applications that requested early kill want
448 			 * to be informed of all such data corruptions.
449 			 */
450 			if (vma->vm_mm == tsk->mm)
451 				add_to_kill(tsk, page, vma, to_kill, tkc);
452 		}
453 	}
454 	spin_unlock(&mapping->i_mmap_lock);
455 	read_unlock(&tasklist_lock);
456 }
457 
458 /*
459  * Collect the processes that have the corrupted page mapped, so they can be killed.
460  * This is done in two steps for locking reasons.
461  * First preallocate one tokill structure outside the spin locks,
462  * so that we can kill at least one process reasonably reliably.
463  */
464 static void collect_procs(struct page *page, struct list_head *tokill)
465 {
466 	struct to_kill *tk;
467 
468 	if (!page->mapping)
469 		return;
470 
471 	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
472 	if (!tk)
473 		return;
474 	if (PageAnon(page))
475 		collect_procs_anon(page, tokill, &tk);
476 	else
477 		collect_procs_file(page, tokill, &tk);
478 	kfree(tk);
479 }
480 
481 /*
482  * Error handlers for various types of pages.
483  */
484 
485 enum outcome {
486 	IGNORED,	/* Error: cannot be handled */
487 	FAILED,		/* Error: handling failed */
488 	DELAYED,	/* Will be handled later */
489 	RECOVERED,	/* Successfully recovered */
490 };
491 
492 static const char *action_name[] = {
493 	[IGNORED] = "Ignored",
494 	[FAILED] = "Failed",
495 	[DELAYED] = "Delayed",
496 	[RECOVERED] = "Recovered",
497 };
498 
499 /*
500  * XXX: It is possible that a page is isolated from LRU cache,
501  * and then kept in the swap cache, or fails to be removed from the page cache.
502  * The page count will stop it from being freed by unpoison.
503  * Stress tests should be aware of this memory leak problem.
504  */
505 static int delete_from_lru_cache(struct page *p)
506 {
507 	if (!isolate_lru_page(p)) {
508 		/*
509 		 * Clear the page flags the buddy system is sensitive to, so that
510 		 * it won't complain when the page is unpoisoned and freed.
511 		 */
512 		ClearPageActive(p);
513 		ClearPageUnevictable(p);
514 		/*
515 		 * drop the page count elevated by isolate_lru_page()
516 		 */
517 		page_cache_release(p);
518 		return 0;
519 	}
520 	return -EIO;
521 }
522 
523 /*
524  * Error hit kernel page.
525  * Do nothing; try to be lucky and not touch it. For a few cases we
526  * could be more sophisticated.
527  */
528 static int me_kernel(struct page *p, unsigned long pfn)
529 {
530 	return IGNORED;
531 }
532 
533 /*
534  * Page in unknown state. Do nothing.
535  */
536 static int me_unknown(struct page *p, unsigned long pfn)
537 {
538 	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
539 	return FAILED;
540 }
541 
542 /*
543  * Clean (or cleaned) page cache page.
544  */
545 static int me_pagecache_clean(struct page *p, unsigned long pfn)
546 {
547 	int err;
548 	int ret = FAILED;
549 	struct address_space *mapping;
550 
551 	delete_from_lru_cache(p);
552 
553 	/*
554 	 * For anonymous pages we're done; the only reference left
555 	 * should be the one memory_failure() holds.
556 	 */
557 	if (PageAnon(p))
558 		return RECOVERED;
559 
560 	/*
561 	 * Now truncate the page in the page cache. This is really
562 	 * more like a "temporary hole punch".
563 	 * Don't do this for block devices when someone else
564 	 * has a reference, because it could be file system metadata
565 	 * and that's not safe to truncate.
566 	 */
567 	mapping = page_mapping(p);
568 	if (!mapping) {
569 		/*
570 		 * Page has been torn down in the meantime.
571 		 */
572 		return FAILED;
573 	}
574 
575 	/*
576 	 * Truncation is a bit tricky. Enable it per file system for now.
577 	 *
578 	 * Open: to take i_mutex or not for this? Right now we don't.
579 	 */
580 	if (mapping->a_ops->error_remove_page) {
581 		err = mapping->a_ops->error_remove_page(mapping, p);
582 		if (err != 0) {
583 			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
584 					pfn, err);
585 		} else if (page_has_private(p) &&
586 				!try_to_release_page(p, GFP_NOIO)) {
587 			pr_info("MCE %#lx: failed to release buffers\n", pfn);
588 		} else {
589 			ret = RECOVERED;
590 		}
591 	} else {
592 		/*
593 		 * If the file system doesn't support it, just invalidate.
594 		 * This fails on dirty pages or anything with private data.
595 		 */
596 		if (invalidate_inode_page(p))
597 			ret = RECOVERED;
598 		else
599 			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
600 				pfn);
601 	}
602 	return ret;
603 }
604 
605 /*
606  * Dirty page cache page.
607  * Issues: when the error hit a hole page the error is not properly
608  * propagated.
609  */
610 static int me_pagecache_dirty(struct page *p, unsigned long pfn)
611 {
612 	struct address_space *mapping = page_mapping(p);
613 
614 	SetPageError(p);
615 	/* TBD: print more information about the file. */
616 	if (mapping) {
617 		/*
618 		 * IO error will be reported by write(), fsync(), etc.
619 		 * who check the mapping.
620 		 * This way the application knows that something went
621 		 * wrong with its dirty file data.
622 		 *
623 		 * There's one open issue:
624 		 *
625 		 * The EIO will be only reported on the next IO
626 		 * operation and then cleared through the IO map.
627 		 * Normally Linux has two mechanisms to pass IO error
628 		 * first through the AS_EIO flag in the address space
629 		 * and then through the PageError flag in the page.
630 		 * Since we drop pages on memory failure handling the
631 		 * only mechanism open to use is through AS_EIO.
632 		 *
633 		 * This has the disadvantage that it gets cleared on
634 		 * the first operation that returns an error, while
635 		 * the PageError bit is more sticky and only cleared
636 		 * when the page is reread or dropped.  If an
637 		 * application assumes it will always get an error on
638 		 * fsync, but does other operations on the fd before,
639 		 * and the page is dropped in between, then the error
640 		 * will not be properly reported.
641 		 *
642 		 * This can already happen even without hwpoisoned
643 		 * pages: first on metadata IO errors (which only
644 		 * report through AS_EIO) or when the page is dropped
645 		 * at the wrong time.
646 		 *
647 		 * So right now we assume that the application DTRT on
648 		 * the first EIO, but we're not worse than other parts
649 		 * of the kernel.
650 		 */
651 		mapping_set_error(mapping, EIO);
652 	}
653 
654 	return me_pagecache_clean(p, pfn);
655 }
656 
657 /*
658  * Clean and dirty swap cache.
659  *
660  * Dirty swap cache page is tricky to handle. The page could live both in page
661  * cache and swap cache (i.e. the page is freshly swapped in). So it could be
662  * referenced concurrently by 2 types of PTEs:
663  * normal PTEs and swap PTEs. We try to handle them consistently by calling
664  * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
665  * and then
666  *      - clear dirty bit to prevent IO
667  *      - remove from LRU
668  *      - but keep in the swap cache, so that when we return to it on
669  *        a later page fault, we know the application is accessing
670  *        corrupted data and shall be killed (we installed simple
671  *        interception code in do_swap_page to catch it).
672  *
673  * Clean swap cache pages can be directly isolated. A later page fault will
674  * bring in the known good data from disk.
675  */
676 static int me_swapcache_dirty(struct page *p, unsigned long pfn)
677 {
678 	ClearPageDirty(p);
679 	/* Trigger EIO in shmem: */
680 	ClearPageUptodate(p);
681 
682 	if (!delete_from_lru_cache(p))
683 		return DELAYED;
684 	else
685 		return FAILED;
686 }
687 
688 static int me_swapcache_clean(struct page *p, unsigned long pfn)
689 {
690 	delete_from_swap_cache(p);
691 
692 	if (!delete_from_lru_cache(p))
693 		return RECOVERED;
694 	else
695 		return FAILED;
696 }
697 
698 /*
699  * Huge pages. Needs work.
700  * Issues:
701  * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
702  *   To narrow down kill region to one page, we need to break up pmd.
703  */
704 static int me_huge_page(struct page *p, unsigned long pfn)
705 {
706 	int res = 0;
707 	struct page *hpage = compound_head(p);
708 	/*
709 	 * We can safely recover from an error on a free or reserved (i.e.
710 	 * not in-use) hugepage by dequeuing it from the freelist.
711 	 * To check whether a hugepage is in-use or not, we can't use
712 	 * page->lru because it can be used in other hugepage operations,
713 	 * such as __unmap_hugepage_range() and gather_surplus_pages().
714 	 * So instead we use page_mapping() and PageAnon().
715 	 * We assume that this function is called with page lock held,
716 	 * so there is no race between isolation and mapping/unmapping.
717 	 */
718 	if (!(page_mapping(hpage) || PageAnon(hpage))) {
719 		res = dequeue_hwpoisoned_huge_page(hpage);
720 		if (!res)
721 			return RECOVERED;
722 	}
723 	return DELAYED;
724 }
725 
726 /*
727  * Various page states we can handle.
728  *
729  * A page state is defined by its current page->flags bits.
730  * The table matches them in order and calls the right handler.
731  *
732  * This is quite tricky because we can access the page at any time
733  * in its life cycle, so all accesses have to be extremely careful.
734  *
735  * This is not complete. More states could be added.
736  * For any missing state don't attempt recovery.
737  */
738 
739 #define dirty		(1UL << PG_dirty)
740 #define sc		(1UL << PG_swapcache)
741 #define unevict		(1UL << PG_unevictable)
742 #define mlock		(1UL << PG_mlocked)
743 #define writeback	(1UL << PG_writeback)
744 #define lru		(1UL << PG_lru)
745 #define swapbacked	(1UL << PG_swapbacked)
746 #define head		(1UL << PG_head)
747 #define tail		(1UL << PG_tail)
748 #define compound	(1UL << PG_compound)
749 #define slab		(1UL << PG_slab)
750 #define reserved	(1UL << PG_reserved)
751 
752 static struct page_state {
753 	unsigned long mask;
754 	unsigned long res;
755 	char *msg;
756 	int (*action)(struct page *p, unsigned long pfn);
757 } error_states[] = {
758 	{ reserved,	reserved,	"reserved kernel",	me_kernel },
759 	/*
760 	 * free pages are specially detected outside this table:
761 	 * PG_buddy pages only make up a small fraction of all free pages.
762 	 */
763 
764 	/*
765 	 * Could in theory check if slab page is free or if we can drop
766 	 * currently unused objects without touching them. But just
767 	 * treat it as standard kernel for now.
768 	 */
769 	{ slab,		slab,		"kernel slab",	me_kernel },
770 
771 #ifdef CONFIG_PAGEFLAGS_EXTENDED
772 	{ head,		head,		"huge",		me_huge_page },
773 	{ tail,		tail,		"huge",		me_huge_page },
774 #else
775 	{ compound,	compound,	"huge",		me_huge_page },
776 #endif
777 
778 	{ sc|dirty,	sc|dirty,	"swapcache",	me_swapcache_dirty },
779 	{ sc|dirty,	sc,		"swapcache",	me_swapcache_clean },
780 
781 	{ unevict|dirty, unevict|dirty,	"unevictable LRU", me_pagecache_dirty},
782 	{ unevict,	unevict,	"unevictable LRU", me_pagecache_clean},
783 
784 	{ mlock|dirty,	mlock|dirty,	"mlocked LRU",	me_pagecache_dirty },
785 	{ mlock,	mlock,		"mlocked LRU",	me_pagecache_clean },
786 
787 	{ lru|dirty,	lru|dirty,	"LRU",		me_pagecache_dirty },
788 	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },
789 
790 	/*
791 	 * Catchall entry: must be at end.
792 	 */
793 	{ 0,		0,		"unknown page state",	me_unknown },
794 };
795 
796 #undef dirty
797 #undef sc
798 #undef unevict
799 #undef mlock
800 #undef writeback
801 #undef lru
802 #undef swapbacked
803 #undef head
804 #undef tail
805 #undef compound
806 #undef slab
807 #undef reserved
808 
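/* Log the handling outcome for one poisoned pfn. */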
809 static void action_result(unsigned long pfn, char *msg, int result)
810 {
811 	struct page *page = pfn_to_page(pfn);
812 
813 	printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
814 		pfn,
815 		PageDirty(page) ? "dirty " : "",
816 		msg, action_name[result]);
817 }
818 
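/*
 * Run the handler for the matched page state and verify that it dropped
 * every reference it could: afterwards only the reference taken by
 * __memory_failure() should remain (a dirty swap cache page handled as
 * DELAYED legitimately keeps its swap cache reference as well).
 */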
819 static int page_action(struct page_state *ps, struct page *p,
820 			unsigned long pfn)
821 {
822 	int result;
823 	int count;
824 
825 	result = ps->action(p, pfn);
826 	action_result(pfn, ps->msg, result);
827 
828 	count = page_count(p) - 1;
829 	if (ps->action == me_swapcache_dirty && result == DELAYED)
830 		count--;
831 	if (count != 0) {
832 		printk(KERN_ERR
833 		       "MCE %#lx: %s page still referenced by %d users\n",
834 		       pfn, ps->msg, count);
835 		result = FAILED;
836 	}
837 
838 	/* Could do more checks here if page looks ok */
839 	/*
840 	 * Could adjust zone counters here to correct for the missing page.
841 	 */
842 
843 	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
844 }
845 
846 /*
847  * Do all that is necessary to remove user space mappings. Unmap
848  * the pages and send SIGBUS to the processes if the data was dirty.
849  */
850 static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
851 				  int trapno)
852 {
853 	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
854 	struct address_space *mapping;
855 	LIST_HEAD(tokill);
856 	int ret;
857 	int kill = 1;
858 	struct page *hpage = compound_head(p);
859 
860 	if (PageReserved(p) || PageSlab(p))
861 		return SWAP_SUCCESS;
862 
863 	/*
864 	 * This check implies we don't kill processes if their pages
865 	 * are in the swap cache early. Those are always late kills.
866 	 */
867 	if (!page_mapped(hpage))
868 		return SWAP_SUCCESS;
869 
870 	if (PageKsm(p))
871 		return SWAP_FAIL;
872 
873 	if (PageSwapCache(p)) {
874 		printk(KERN_ERR
875 		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
876 		ttu |= TTU_IGNORE_HWPOISON;
877 	}
878 
879 	/*
880 	 * Propagate the dirty bit from PTEs to struct page first, because we
881 	 * need this to decide if we should kill or just drop the page.
882 	 * XXX: the dirty test could be racy: set_page_dirty() may not always
883 	 * be called inside page lock (it's recommended but not enforced).
884 	 */
885 	mapping = page_mapping(hpage);
886 	if (!PageDirty(hpage) && mapping &&
887 	    mapping_cap_writeback_dirty(mapping)) {
888 		if (page_mkclean(hpage)) {
889 			SetPageDirty(hpage);
890 		} else {
891 			kill = 0;
892 			ttu |= TTU_IGNORE_HWPOISON;
893 			printk(KERN_INFO
894 	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
895 				pfn);
896 		}
897 	}
898 
899 	/*
900 	 * First collect all the processes that have the page
901 	 * mapped in dirty form.  This has to be done before try_to_unmap,
902 	 * because ttu takes the rmap data structures down.
903 	 *
904 	 * Error handling: We ignore errors here because
905 	 * there's nothing that can be done.
906 	 */
907 	if (kill)
908 		collect_procs(hpage, &tokill);
909 
910 	ret = try_to_unmap(hpage, ttu);
911 	if (ret != SWAP_SUCCESS)
912 		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
913 				pfn, page_mapcount(hpage));
914 
915 	/*
916 	 * Now that the dirty bit has been propagated to the
917 	 * struct page and all unmaps done we can decide if
918 	 * killing is needed or not.  Only kill when the page
919 	 * was dirty, otherwise the tokill list is merely
920 	 * freed.  When there was a problem unmapping earlier
921 	 * use a more forceful, uncatchable kill to prevent
922 	 * any accesses to the poisoned memory.
923 	 */
924 	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
925 		      ret != SWAP_SUCCESS, p, pfn);
926 
927 	return ret;
928 }
929 
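/*
 * Poison containment is done per hugepage: set or clear PG_hwpoison on
 * every subpage of the compound page.
 */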
930 static void set_page_hwpoison_huge_page(struct page *hpage)
931 {
932 	int i;
933 	int nr_pages = 1 << compound_trans_order(hpage);
934 	for (i = 0; i < nr_pages; i++)
935 		SetPageHWPoison(hpage + i);
936 }
937 
938 static void clear_page_hwpoison_huge_page(struct page *hpage)
939 {
940 	int i;
941 	int nr_pages = 1 << compound_trans_order(hpage);
942 	for (i = 0; i < nr_pages; i++)
943 		ClearPageHWPoison(hpage + i);
944 }
945 
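/*
 * Core of the recovery path; memory_failure() below is the wrapper used by
 * the architecture machine check code.  @flags may contain
 * MF_COUNT_INCREASED when the caller already holds a reference on the page.
 */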
946 int __memory_failure(unsigned long pfn, int trapno, int flags)
947 {
948 	struct page_state *ps;
949 	struct page *p;
950 	struct page *hpage;
951 	int res;
952 	unsigned int nr_pages;
953 
954 	if (!sysctl_memory_failure_recovery)
955 		panic("Memory failure from trap %d on page %lx", trapno, pfn);
956 
957 	if (!pfn_valid(pfn)) {
958 		printk(KERN_ERR
959 		       "MCE %#lx: memory outside kernel control\n",
960 		       pfn);
961 		return -ENXIO;
962 	}
963 
964 	p = pfn_to_page(pfn);
965 	hpage = compound_head(p);
966 	if (TestSetPageHWPoison(p)) {
967 		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
968 		return 0;
969 	}
970 
971 	nr_pages = 1 << compound_trans_order(hpage);
972 	atomic_long_add(nr_pages, &mce_bad_pages);
973 
974 	/*
975 	 * We need/can do nothing about count=0 pages.
976 	 * 1) it's a free page, and therefore in safe hand:
977 	 *    prep_new_page() will be the gate keeper.
978 	 * 2) it's a free hugepage, which is also safe:
979 	 *    an affected hugepage will be dequeued from hugepage freelist,
980 	 *    so there's no concern about reusing it ever after.
981 	 * 3) it's part of a non-compound high order page.
982 	 *    Implies some kernel user: cannot stop them from
983 	 *    R/W the page; let's pray that the page has been
984 	 *    used and will be freed some time later.
985 	 * In fact it's dangerous to directly bump up page count from 0,
986 	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
987 	 */
988 	if (!(flags & MF_COUNT_INCREASED) &&
989 		!get_page_unless_zero(hpage)) {
990 		if (is_free_buddy_page(p)) {
991 			action_result(pfn, "free buddy", DELAYED);
992 			return 0;
993 		} else if (PageHuge(hpage)) {
994 			/*
995 			 * Check "just unpoisoned", "filter hit", and
996 			 * "race with other subpage."
997 			 */
998 			lock_page_nosync(hpage);
999 			if (!PageHWPoison(hpage)
1000 			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1001 			    || (p != hpage && TestSetPageHWPoison(hpage))) {
1002 				atomic_long_sub(nr_pages, &mce_bad_pages);
1003 				return 0;
1004 			}
1005 			set_page_hwpoison_huge_page(hpage);
1006 			res = dequeue_hwpoisoned_huge_page(hpage);
1007 			action_result(pfn, "free huge",
1008 				      res ? IGNORED : DELAYED);
1009 			unlock_page(hpage);
1010 			return res;
1011 		} else {
1012 			action_result(pfn, "high order kernel", IGNORED);
1013 			return -EBUSY;
1014 		}
1015 	}
1016 
1017 	/*
1018 	 * We ignore non-LRU pages for good reasons.
1019 	 * - PG_locked is only well defined for LRU pages and a few others
1020 	 * - to avoid races with __set_page_locked()
1021 	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
1022 	 * The check (unnecessarily) ignores LRU pages being isolated and
1023 	 * walked by the page reclaim code, however that's not a big loss.
1024 	 */
1025 	if (!PageLRU(p) && !PageHuge(p))
1026 		shake_page(p, 0);
1027 	if (!PageLRU(p) && !PageHuge(p)) {
1028 		/*
1029 		 * shake_page could have turned it free.
1030 		 */
1031 		if (is_free_buddy_page(p)) {
1032 			action_result(pfn, "free buddy, 2nd try", DELAYED);
1033 			return 0;
1034 		}
1035 		action_result(pfn, "non LRU", IGNORED);
1036 		put_page(p);
1037 		return -EBUSY;
1038 	}
1039 
1040 	/*
1041 	 * Lock the page and wait for writeback to finish.
1042 	 * It's very difficult to mess with pages currently under IO
1043 	 * and in many cases impossible, so we just avoid it here.
1044 	 */
1045 	lock_page_nosync(hpage);
1046 
1047 	/*
1048 	 * unpoison always clears PG_hwpoison inside the page lock
1049 	 */
1050 	if (!PageHWPoison(p)) {
1051 		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1052 		res = 0;
1053 		goto out;
1054 	}
1055 	if (hwpoison_filter(p)) {
1056 		if (TestClearPageHWPoison(p))
1057 			atomic_long_sub(nr_pages, &mce_bad_pages);
1058 		unlock_page(hpage);
1059 		put_page(hpage);
1060 		return 0;
1061 	}
1062 
1063 	/*
1064 	 * For an error on a tail page, we should set PG_hwpoison
1065 	 * on the head page to show that the hugepage is hwpoisoned.
1066 	 */
1067 	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
1068 		action_result(pfn, "hugepage already hardware poisoned",
1069 				IGNORED);
1070 		unlock_page(hpage);
1071 		put_page(hpage);
1072 		return 0;
1073 	}
1074 	/*
1075 	 * Set PG_hwpoison on all pages in an error hugepage,
1076 	 * because containment is done in hugepage unit for now.
1077 	 * Since we have done TestSetPageHWPoison() for the head page with
1078 	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
1079 	 */
1080 	if (PageHuge(p))
1081 		set_page_hwpoison_huge_page(hpage);
1082 
1083 	wait_on_page_writeback(p);
1084 
1085 	/*
1086 	 * Now take care of user space mappings.
1087 	 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
1088 	 */
1089 	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1090 		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1091 		res = -EBUSY;
1092 		goto out;
1093 	}
1094 
1095 	/*
1096 	 * Torn down by someone else?
1097 	 */
1098 	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1099 		action_result(pfn, "already truncated LRU", IGNORED);
1100 		res = -EBUSY;
1101 		goto out;
1102 	}
1103 
1104 	res = -EBUSY;
1105 	for (ps = error_states;; ps++) {
1106 		if ((p->flags & ps->mask) == ps->res) {
1107 			res = page_action(ps, p, pfn);
1108 			break;
1109 		}
1110 	}
1111 out:
1112 	unlock_page(hpage);
1113 	return res;
1114 }
1115 EXPORT_SYMBOL_GPL(__memory_failure);
1116 
1117 /**
1118  * memory_failure - Handle memory failure of a page.
1119  * @pfn: Page Number of the corrupted page
1120  * @trapno: Trap number reported in the signal to user space.
1121  *
1122  * This function is called by the low level machine check code
1123  * of an architecture when it detects hardware memory corruption
1124  * of a page. It tries its best to recover, which includes
1125  * dropping pages, killing processes etc.
1126  *
1127  * The function is primarily of use for corruptions that
1128  * happen outside the current execution context (e.g. when
1129  * detected by a background scrubber)
1130  *
1131  * Must run in process context (e.g. a work queue) with interrupts
1132  * enabled and no spinlocks held.
1133  */
1134 void memory_failure(unsigned long pfn, int trapno)
1135 {
1136 	__memory_failure(pfn, trapno, 0);
1137 }
1138 
1139 /**
1140  * unpoison_memory - Unpoison a previously poisoned page
1141  * @pfn: Page number of the to be unpoisoned page
1142  *
1143  * Software-unpoison a page that has been poisoned by
1144  * memory_failure() earlier.
1145  *
1146  * This is only done on the software level, so it only works
1147  * for Linux-injected failures, not real hardware failures.
1148  *
1149  * Returns 0 for success, otherwise -errno.
1150  */
1151 int unpoison_memory(unsigned long pfn)
1152 {
1153 	struct page *page;
1154 	struct page *p;
1155 	int freeit = 0;
1156 	unsigned int nr_pages;
1157 
1158 	if (!pfn_valid(pfn))
1159 		return -ENXIO;
1160 
1161 	p = pfn_to_page(pfn);
1162 	page = compound_head(p);
1163 
1164 	if (!PageHWPoison(p)) {
1165 		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1166 		return 0;
1167 	}
1168 
1169 	nr_pages = 1 << compound_trans_order(page);
1170 
1171 	if (!get_page_unless_zero(page)) {
1172 		/*
1173 		 * Since a HWPoisoned hugepage should have a non-zero refcount,
1174 		 * a race between memory failure and unpoison seems to have happened.
1175 		 * In such a case unpoison fails and memory failure runs
1176 		 * to the end.
1177 		 */
1178 		if (PageHuge(page)) {
1179 			pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1180 			return 0;
1181 		}
1182 		if (TestClearPageHWPoison(p))
1183 			atomic_long_sub(nr_pages, &mce_bad_pages);
1184 		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1185 		return 0;
1186 	}
1187 
1188 	lock_page_nosync(page);
1189 	/*
1190 	 * This test is racy because PG_hwpoison is set outside of page lock.
1191 	 * That's acceptable because that won't trigger kernel panic. Instead,
1192 	 * the PG_hwpoison page will be caught and isolated on the entrance to
1193 	 * the free buddy page pool.
1194 	 */
1195 	if (TestClearPageHWPoison(page)) {
1196 		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1197 		atomic_long_sub(nr_pages, &mce_bad_pages);
1198 		freeit = 1;
1199 		if (PageHuge(page))
1200 			clear_page_hwpoison_huge_page(page);
1201 	}
1202 	unlock_page(page);
1203 
1204 	put_page(page);
1205 	if (freeit)
1206 		put_page(page);
1207 
1208 	return 0;
1209 }
1210 EXPORT_SYMBOL(unpoison_memory);
1211 
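/*
 * Allocation callback for the migration done by soft offlining: allocate
 * the replacement page (or hugepage) on the same node as the source page.
 */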
1212 static struct page *new_page(struct page *p, unsigned long private, int **x)
1213 {
1214 	int nid = page_to_nid(p);
1215 	if (PageHuge(p))
1216 		return alloc_huge_page_node(page_hstate(compound_head(p)),
1217 						   nid);
1218 	else
1219 		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1220 }
1221 
1222 /*
1223  * Safely get reference count of an arbitrary page.
1224  * Returns 0 for a free page, -EIO for a zero refcount page
1225  * that is not free, and 1 for any other page type.
1226  * For 1 the page is returned with increased page count, otherwise not.
1227  */
1228 static int get_any_page(struct page *p, unsigned long pfn, int flags)
1229 {
1230 	int ret;
1231 
1232 	if (flags & MF_COUNT_INCREASED)
1233 		return 1;
1234 
1235 	/*
1236 	 * The lock_memory_hotplug prevents a race with memory hotplug.
1237 	 * This is a big hammer; a finer-grained approach would be nicer.
1238 	 */
1239 	lock_memory_hotplug();
1240 
1241 	/*
1242 	 * Isolate the page, so that it doesn't get reallocated if it
1243 	 * was free.
1244 	 */
1245 	set_migratetype_isolate(p);
1246 	/*
1247 	 * When the target page is a free hugepage, just remove it
1248 	 * from free hugepage list.
1249 	 */
1250 	if (!get_page_unless_zero(compound_head(p))) {
1251 		if (PageHuge(p)) {
1252 			pr_info("get_any_page: %#lx free huge page\n", pfn);
1253 			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1254 		} else if (is_free_buddy_page(p)) {
1255 			pr_info("get_any_page: %#lx free buddy page\n", pfn);
1256 			/* Set hwpoison bit while page is still isolated */
1257 			SetPageHWPoison(p);
1258 			ret = 0;
1259 		} else {
1260 			pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1261 				pfn, p->flags);
1262 			ret = -EIO;
1263 		}
1264 	} else {
1265 		/* Not a free page */
1266 		ret = 1;
1267 	}
1268 	unset_migratetype_isolate(p);
1269 	unlock_memory_hotplug();
1270 	return ret;
1271 }
1272 
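/*
 * Soft offline a hugepage: migrate its contents to a freshly allocated
 * hugepage, then mark every subpage hwpoisoned and pull the old hugepage
 * off the free list.  The elevated page count is kept so the poisoned
 * hugepage is never reused.
 */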
1273 static int soft_offline_huge_page(struct page *page, int flags)
1274 {
1275 	int ret;
1276 	unsigned long pfn = page_to_pfn(page);
1277 	struct page *hpage = compound_head(page);
1278 	LIST_HEAD(pagelist);
1279 
1280 	ret = get_any_page(page, pfn, flags);
1281 	if (ret < 0)
1282 		return ret;
1283 	if (ret == 0)
1284 		goto done;
1285 
1286 	if (PageHWPoison(hpage)) {
1287 		put_page(hpage);
1288 		pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
1289 		return -EBUSY;
1290 	}
1291 
1292 	/* Keep page count to indicate a given hugepage is isolated. */
1293 
1294 	list_add(&hpage->lru, &pagelist);
1295 	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1296 				true);
1297 	if (ret) {
1298 		putback_lru_pages(&pagelist);
1299 		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1300 			 pfn, ret, page->flags);
1301 		if (ret > 0)
1302 			ret = -EIO;
1303 		return ret;
1304 	}
1305 done:
1306 	if (!PageHWPoison(hpage))
1307 		atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
1308 	set_page_hwpoison_huge_page(hpage);
1309 	dequeue_hwpoisoned_huge_page(hpage);
1310 	/* keep elevated page count for bad page */
1311 	return ret;
1312 }
1313 
1314 /**
1315  * soft_offline_page - Soft offline a page.
1316  * @page: page to offline
1317  * @flags: flags. Same as memory_failure().
1318  *
1319  * Returns 0 on success, otherwise negated errno.
1320  *
1321  * Soft offline a page, by migration or invalidation,
1322  * without killing anything. This is for the case when
1323  * a page is not corrupted yet (so it's still valid to access),
1324  * but has had a number of corrected errors and is better taken
1325  * out.
1326  *
1327  * The actual policy on when to do that is maintained by
1328  * user space.
1329  *
1330  * This should never impact any application or cause data loss,
1331  * however it might take some time.
1332  *
1333  * This is not a 100% solution for all memory, but tries to be
1334  * ``good enough'' for the majority of memory.
1335  */
1336 int soft_offline_page(struct page *page, int flags)
1337 {
1338 	int ret;
1339 	unsigned long pfn = page_to_pfn(page);
1340 
1341 	if (PageHuge(page))
1342 		return soft_offline_huge_page(page, flags);
1343 
1344 	ret = get_any_page(page, pfn, flags);
1345 	if (ret < 0)
1346 		return ret;
1347 	if (ret == 0)
1348 		goto done;
1349 
1350 	/*
1351 	 * Page cache page we can handle?
1352 	 */
1353 	if (!PageLRU(page)) {
1354 		/*
1355 		 * Try to free it.
1356 		 */
1357 		put_page(page);
1358 		shake_page(page, 1);
1359 
1360 		/*
1361 		 * Did it turn free?
1362 		 */
1363 		ret = get_any_page(page, pfn, 0);
1364 		if (ret < 0)
1365 			return ret;
1366 		if (ret == 0)
1367 			goto done;
1368 	}
1369 	if (!PageLRU(page)) {
1370 		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1371 				pfn, page->flags);
1372 		return -EIO;
1373 	}
1374 
1375 	lock_page(page);
1376 	wait_on_page_writeback(page);
1377 
1378 	/*
1379 	 * Synchronized using the page lock with memory_failure()
1380 	 */
1381 	if (PageHWPoison(page)) {
1382 		unlock_page(page);
1383 		put_page(page);
1384 		pr_info("soft offline: %#lx page already poisoned\n", pfn);
1385 		return -EBUSY;
1386 	}
1387 
1388 	/*
1389 	 * Try to invalidate first. This should work for
1390 	 * non dirty unmapped page cache pages.
1391 	 */
1392 	ret = invalidate_inode_page(page);
1393 	unlock_page(page);
1394 
1395 	/*
1396 	 * Drop count because page migration doesn't like raised
1397 	 * counts. The page could get re-allocated, but if it becomes
1398 	 * LRU the isolation will just fail.
1399 	 * RED-PEN: it would be better to keep it isolated here, but we
1400 	 * would need to fix isolation locking first.
1401 	 */
1402 	put_page(page);
1403 	if (ret == 1) {
1404 		ret = 0;
1405 		pr_info("soft_offline: %#lx: invalidated\n", pfn);
1406 		goto done;
1407 	}
1408 
1409 	/*
1410 	 * Simple invalidation didn't work.
1411 	 * Try to migrate to a new page instead. migrate.c
1412 	 * handles a large number of cases for us.
1413 	 */
1414 	ret = isolate_lru_page(page);
1415 	if (!ret) {
1416 		LIST_HEAD(pagelist);
1417 
1418 		list_add(&page->lru, &pagelist);
1419 		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1420 								0, true);
1421 		if (ret) {
1422 			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1423 				pfn, ret, page->flags);
1424 			if (ret > 0)
1425 				ret = -EIO;
1426 		}
1427 	} else {
1428 		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1429 				pfn, ret, page_count(page), page->flags);
1430 	}
1431 	if (ret)
1432 		return ret;
1433 
1434 done:
1435 	atomic_long_add(1, &mce_bad_pages);
1436 	SetPageHWPoison(page);
1437 	/* keep elevated page count for bad page */
1438 	return ret;
1439 }
1440 
1441 /*
1442  * The caller must hold current->mm->mmap_sem in read mode.
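 *
 * Walks the page tables for @addr and reports whether the pte there is a
 * hwpoison swap entry, i.e. whether the page mapped at that address was
 * poisoned and unmapped.  Huge mappings and non-present entries return 0.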
1443  */
1444 int is_hwpoison_address(unsigned long addr)
1445 {
1446 	pgd_t *pgdp;
1447 	pud_t pud, *pudp;
1448 	pmd_t pmd, *pmdp;
1449 	pte_t pte, *ptep;
1450 	swp_entry_t entry;
1451 
1452 	pgdp = pgd_offset(current->mm, addr);
1453 	if (!pgd_present(*pgdp))
1454 		return 0;
1455 	pudp = pud_offset(pgdp, addr);
1456 	pud = *pudp;
1457 	if (!pud_present(pud) || pud_large(pud))
1458 		return 0;
1459 	pmdp = pmd_offset(pudp, addr);
1460 	pmd = *pmdp;
1461 	if (!pmd_present(pmd) || pmd_large(pmd))
1462 		return 0;
1463 	ptep = pte_offset_map(pmdp, addr);
1464 	pte = *ptep;
1465 	pte_unmap(ptep);
1466 	if (!is_swap_pte(pte))
1467 		return 0;
1468 	entry = pte_to_swp_entry(pte);
1469 	return is_hwpoison_entry(entry);
1470 }
1471 EXPORT_SYMBOL_GPL(is_hwpoison_address);
1472