xref: /linux/mm/memory_hotplug.c (revision 895931232d9358e0016f580f26b336c29c9528cc)
1 /*
2  *  linux/mm/memory_hotplug.c
3  *
4  *  Copyright (C)
5  */
6 
7 #include <linux/stddef.h>
8 #include <linux/mm.h>
9 #include <linux/sched/signal.h>
10 #include <linux/swap.h>
11 #include <linux/interrupt.h>
12 #include <linux/pagemap.h>
13 #include <linux/compiler.h>
14 #include <linux/export.h>
15 #include <linux/pagevec.h>
16 #include <linux/writeback.h>
17 #include <linux/slab.h>
18 #include <linux/sysctl.h>
19 #include <linux/cpu.h>
20 #include <linux/memory.h>
21 #include <linux/memremap.h>
22 #include <linux/memory_hotplug.h>
23 #include <linux/highmem.h>
24 #include <linux/vmalloc.h>
25 #include <linux/ioport.h>
26 #include <linux/delay.h>
27 #include <linux/migrate.h>
28 #include <linux/page-isolation.h>
29 #include <linux/pfn.h>
30 #include <linux/suspend.h>
31 #include <linux/mm_inline.h>
32 #include <linux/firmware-map.h>
33 #include <linux/stop_machine.h>
34 #include <linux/hugetlb.h>
35 #include <linux/memblock.h>
36 #include <linux/bootmem.h>
37 #include <linux/compaction.h>
38 
39 #include <asm/tlbflush.h>
40 
41 #include "internal.h"
42 
43 /*
44  * online_page_callback contains a pointer to the current page onlining
45  * function. Initially it is generic_online_page(). If required, it can be
46  * changed by calling set_online_page_callback() to register a callback and
47  * restore_online_page_callback() to restore the generic callback.
48  */
49 
50 static void generic_online_page(struct page *page);
51 
52 static online_page_callback_t online_page_callback = generic_online_page;
53 static DEFINE_MUTEX(online_page_callback_lock);
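
/*
 * Illustrative sketch (not part of this file): a ballooning driver that wants
 * to intercept newly onlined pages can plug in roughly like this, where
 * my_online_page() is a hypothetical callback used only for illustration:
 *
 *	static void my_online_page(struct page *page)
 *	{
 *		__online_page_set_limits(page);
 *		... decide whether to hand the page to the page allocator ...
 *	}
 *
 *	rc = set_online_page_callback(&my_online_page);
 *	...
 *	restore_online_page_callback(&my_online_page);
 */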
54 
55 /* The same as the cpu_hotplug lock, but for memory hotplug. */
56 static struct {
57 	struct task_struct *active_writer;
58 	struct mutex lock; /* Synchronizes accesses to refcount, */
59 	/*
60 	 * It also blocks new readers during
61 	 * an ongoing mem hotplug operation.
62 	 */
63 	int refcount;
64 
65 #ifdef CONFIG_DEBUG_LOCK_ALLOC
66 	struct lockdep_map dep_map;
67 #endif
68 } mem_hotplug = {
69 	.active_writer = NULL,
70 	.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
71 	.refcount = 0,
72 #ifdef CONFIG_DEBUG_LOCK_ALLOC
73 	.dep_map = {.name = "mem_hotplug.lock" },
74 #endif
75 };
76 
77 /* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
78 #define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
79 #define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
80 #define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)
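
/*
 * Usage sketch (illustrative): code that merely needs memory hotplug to be
 * held off while it runs takes the reader side,
 *
 *	get_online_mems();
 *	... walk zones/sections without them disappearing ...
 *	put_online_mems();
 *
 * whereas the hotplug operations themselves take the writer side with
 * mem_hotplug_begin()/mem_hotplug_done(), which waits for all readers to
 * drain before proceeding.
 */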
81 
82 bool movable_node_enabled = false;
83 
84 #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
85 bool memhp_auto_online;
86 #else
87 bool memhp_auto_online = true;
88 #endif
89 EXPORT_SYMBOL_GPL(memhp_auto_online);
90 
91 static int __init setup_memhp_default_state(char *str)
92 {
93 	if (!strcmp(str, "online"))
94 		memhp_auto_online = true;
95 	else if (!strcmp(str, "offline"))
96 		memhp_auto_online = false;
97 
98 	return 1;
99 }
100 __setup("memhp_default_state=", setup_memhp_default_state);
101 
102 void get_online_mems(void)
103 {
104 	might_sleep();
105 	if (mem_hotplug.active_writer == current)
106 		return;
107 	memhp_lock_acquire_read();
108 	mutex_lock(&mem_hotplug.lock);
109 	mem_hotplug.refcount++;
110 	mutex_unlock(&mem_hotplug.lock);
111 
112 }
113 
114 void put_online_mems(void)
115 {
116 	if (mem_hotplug.active_writer == current)
117 		return;
118 	mutex_lock(&mem_hotplug.lock);
119 
120 	if (WARN_ON(!mem_hotplug.refcount))
121 		mem_hotplug.refcount++; /* try to fix things up */
122 
123 	if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
124 		wake_up_process(mem_hotplug.active_writer);
125 	mutex_unlock(&mem_hotplug.lock);
126 	memhp_lock_release();
127 
128 }
129 
130 /* Serializes write accesses to mem_hotplug.active_writer. */
131 static DEFINE_MUTEX(memory_add_remove_lock);
132 
133 void mem_hotplug_begin(void)
134 {
135 	mutex_lock(&memory_add_remove_lock);
136 
137 	mem_hotplug.active_writer = current;
138 
139 	memhp_lock_acquire();
140 	for (;;) {
141 		mutex_lock(&mem_hotplug.lock);
142 		if (likely(!mem_hotplug.refcount))
143 			break;
144 		__set_current_state(TASK_UNINTERRUPTIBLE);
145 		mutex_unlock(&mem_hotplug.lock);
146 		schedule();
147 	}
148 }
149 
150 void mem_hotplug_done(void)
151 {
152 	mem_hotplug.active_writer = NULL;
153 	mutex_unlock(&mem_hotplug.lock);
154 	memhp_lock_release();
155 	mutex_unlock(&memory_add_remove_lock);
156 }
157 
158 /* add this memory to iomem resource */
159 static struct resource *register_memory_resource(u64 start, u64 size)
160 {
161 	struct resource *res;
162 	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
163 	if (!res)
164 		return ERR_PTR(-ENOMEM);
165 
166 	res->name = "System RAM";
167 	res->start = start;
168 	res->end = start + size - 1;
169 	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
170 	if (request_resource(&iomem_resource, res) < 0) {
171 		pr_debug("System RAM resource %pR cannot be added\n", res);
172 		kfree(res);
173 		return ERR_PTR(-EEXIST);
174 	}
175 	return res;
176 }
177 
178 static void release_memory_resource(struct resource *res)
179 {
180 	if (!res)
181 		return;
182 	release_resource(res);
183 	kfree(res);
184 	return;
185 }
186 
187 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
188 void get_page_bootmem(unsigned long info,  struct page *page,
189 		      unsigned long type)
190 {
191 	page->freelist = (void *)type;
192 	SetPagePrivate(page);
193 	set_page_private(page, info);
194 	page_ref_inc(page);
195 }
196 
197 void put_page_bootmem(struct page *page)
198 {
199 	unsigned long type;
200 
201 	type = (unsigned long) page->freelist;
202 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
203 	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
204 
205 	if (page_ref_dec_return(page) == 1) {
206 		page->freelist = NULL;
207 		ClearPagePrivate(page);
208 		set_page_private(page, 0);
209 		INIT_LIST_HEAD(&page->lru);
210 		free_reserved_page(page);
211 	}
212 }
213 
214 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
215 #ifndef CONFIG_SPARSEMEM_VMEMMAP
216 static void register_page_bootmem_info_section(unsigned long start_pfn)
217 {
218 	unsigned long *usemap, mapsize, section_nr, i;
219 	struct mem_section *ms;
220 	struct page *page, *memmap;
221 
222 	section_nr = pfn_to_section_nr(start_pfn);
223 	ms = __nr_to_section(section_nr);
224 
225 	/* Get section's memmap address */
226 	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
227 
228 	/*
229 	 * Get page for the memmap's phys address
230 	 * XXX: need more consideration for sparse_vmemmap...
231 	 */
232 	page = virt_to_page(memmap);
233 	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
234 	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
235 
236 	/* remember memmap's page */
237 	for (i = 0; i < mapsize; i++, page++)
238 		get_page_bootmem(section_nr, page, SECTION_INFO);
239 
240 	usemap = __nr_to_section(section_nr)->pageblock_flags;
241 	page = virt_to_page(usemap);
242 
243 	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
244 
245 	for (i = 0; i < mapsize; i++, page++)
246 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
247 
248 }
249 #else /* CONFIG_SPARSEMEM_VMEMMAP */
250 static void register_page_bootmem_info_section(unsigned long start_pfn)
251 {
252 	unsigned long *usemap, mapsize, section_nr, i;
253 	struct mem_section *ms;
254 	struct page *page, *memmap;
255 
256 	if (!pfn_valid(start_pfn))
257 		return;
258 
259 	section_nr = pfn_to_section_nr(start_pfn);
260 	ms = __nr_to_section(section_nr);
261 
262 	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
263 
264 	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
265 
266 	usemap = __nr_to_section(section_nr)->pageblock_flags;
267 	page = virt_to_page(usemap);
268 
269 	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
270 
271 	for (i = 0; i < mapsize; i++, page++)
272 		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
273 }
274 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
275 
276 void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
277 {
278 	unsigned long i, pfn, end_pfn, nr_pages;
279 	int node = pgdat->node_id;
280 	struct page *page;
281 
282 	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
283 	page = virt_to_page(pgdat);
284 
285 	for (i = 0; i < nr_pages; i++, page++)
286 		get_page_bootmem(node, page, NODE_INFO);
287 
288 	pfn = pgdat->node_start_pfn;
289 	end_pfn = pgdat_end_pfn(pgdat);
290 
291 	/* register section info */
292 	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
293 		/*
294 		 * Some platforms can assign the same pfn to multiple nodes - on
295 		 * node0 as well as nodeN.  To avoid registering a pfn against
296 		 * multiple nodes we check that this pfn does not already
297 		 * reside in some other node.
298 		 */
299 		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
300 			register_page_bootmem_info_section(pfn);
301 	}
302 }
303 #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
304 
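/*
 * Add a single memory section starting at phys_start_pfn to node nid:
 * allocate its memmap via sparse_add_one_section(), mark its pages reserved
 * so nobody stumbles over half-initialized struct pages, and optionally
 * register the corresponding memory block device. Returns -EEXIST if the
 * section is already present.
 */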
305 static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
306 		bool want_memblock)
307 {
308 	int ret;
309 	int i;
310 
311 	if (pfn_valid(phys_start_pfn))
312 		return -EEXIST;
313 
314 	ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
315 	if (ret < 0)
316 		return ret;
317 
318 	/*
319 	 * Make all the pages reserved so that nobody will stumble over a
320 	 * half-initialized state.
321 	 * FIXME: We also have to associate it with a node because pfn_to_node
322 	 * relies on the page having the proper node.
323 	 */
324 	for (i = 0; i < PAGES_PER_SECTION; i++) {
325 		unsigned long pfn = phys_start_pfn + i;
326 		struct page *page;
327 		if (!pfn_valid(pfn))
328 			continue;
329 
330 		page = pfn_to_page(pfn);
331 		set_page_node(page, nid);
332 		SetPageReserved(page);
333 	}
334 
335 	if (!want_memblock)
336 		return 0;
337 
338 	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
339 }
340 
341 /*
342  * Reasonably generic function for adding memory.  It is
343  * expected that archs that support memory hotplug will
344  * call this function after deciding the zone to which to
345  * add the new pages.
346  */
347 int __ref __add_pages(int nid, unsigned long phys_start_pfn,
348 			unsigned long nr_pages, bool want_memblock)
349 {
350 	unsigned long i;
351 	int err = 0;
352 	int start_sec, end_sec;
353 	struct vmem_altmap *altmap;
354 
355 	/* during mem_map initialization, align the hot-added range to section boundaries */
356 	start_sec = pfn_to_section_nr(phys_start_pfn);
357 	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
358 
359 	altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
360 	if (altmap) {
361 		/*
362 		 * Validate altmap is within bounds of the total request
363 		 */
364 		if (altmap->base_pfn != phys_start_pfn
365 				|| vmem_altmap_offset(altmap) > nr_pages) {
366 			pr_warn_once("memory add fail, invalid altmap\n");
367 			err = -EINVAL;
368 			goto out;
369 		}
370 		altmap->alloc = 0;
371 	}
372 
373 	for (i = start_sec; i <= end_sec; i++) {
374 		err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
375 
376 		/*
377 		 * EEXIST is finally dealt with by the ioresource collision
378 		 * check, see add_memory() => register_memory_resource().
379 		 * A warning will be printed if there is a collision.
380 		 */
381 		if (err && (err != -EEXIST))
382 			break;
383 		err = 0;
384 	}
385 	vmemmap_populate_print_last();
386 out:
387 	return err;
388 }
389 EXPORT_SYMBOL_GPL(__add_pages);
390 
391 #ifdef CONFIG_MEMORY_HOTREMOVE
392 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */
393 static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
394 				     unsigned long start_pfn,
395 				     unsigned long end_pfn)
396 {
397 	struct mem_section *ms;
398 
399 	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
400 		ms = __pfn_to_section(start_pfn);
401 
402 		if (unlikely(!valid_section(ms)))
403 			continue;
404 
405 		if (unlikely(pfn_to_nid(start_pfn) != nid))
406 			continue;
407 
408 		if (zone && zone != page_zone(pfn_to_page(start_pfn)))
409 			continue;
410 
411 		return start_pfn;
412 	}
413 
414 	return 0;
415 }
416 
417 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */
418 static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
419 				    unsigned long start_pfn,
420 				    unsigned long end_pfn)
421 {
422 	struct mem_section *ms;
423 	unsigned long pfn;
424 
425 	/* pfn is the end pfn of a memory section. */
426 	pfn = end_pfn - 1;
427 	for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
428 		ms = __pfn_to_section(pfn);
429 
430 		if (unlikely(!valid_section(ms)))
431 			continue;
432 
433 		if (unlikely(pfn_to_nid(pfn) != nid))
434 			continue;
435 
436 		if (zone && zone != page_zone(pfn_to_page(pfn)))
437 			continue;
438 
439 		return pfn;
440 	}
441 
442 	return 0;
443 }
444 
445 static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
446 			     unsigned long end_pfn)
447 {
448 	unsigned long zone_start_pfn = zone->zone_start_pfn;
449 	unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
450 	unsigned long zone_end_pfn = z;
451 	unsigned long pfn;
452 	struct mem_section *ms;
453 	int nid = zone_to_nid(zone);
454 
455 	zone_span_writelock(zone);
456 	if (zone_start_pfn == start_pfn) {
457 		/*
458 		 * If the section is the smallest section in the zone, we need
459 		 * to shrink zone->zone_start_pfn and zone->spanned_pages.
460 		 * In this case, find the second smallest valid mem_section
461 		 * for shrinking the zone.
462 		 */
463 		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
464 						zone_end_pfn);
465 		if (pfn) {
466 			zone->zone_start_pfn = pfn;
467 			zone->spanned_pages = zone_end_pfn - pfn;
468 		}
469 	} else if (zone_end_pfn == end_pfn) {
470 		/*
471 		 * If the section is the biggest section in the zone, we need
472 		 * to shrink zone->spanned_pages.
473 		 * In this case, find the second biggest valid mem_section for
474 		 * shrinking the zone.
475 		 */
476 		pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
477 					       start_pfn);
478 		if (pfn)
479 			zone->spanned_pages = pfn - zone_start_pfn + 1;
480 	}
481 
482 	/*
483 	 * The section is neither the biggest nor the smallest mem_section in
484 	 * the zone; it only creates a hole in the zone. In this case we need
485 	 * not change the zone span. But the zone may now contain nothing but
486 	 * holes, so check whether any valid section is left.
487 	 */
488 	pfn = zone_start_pfn;
489 	for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
490 		ms = __pfn_to_section(pfn);
491 
492 		if (unlikely(!valid_section(ms)))
493 			continue;
494 
495 		if (page_zone(pfn_to_page(pfn)) != zone)
496 			continue;
497 
498 		 /* If this is the section being removed, skip it */
499 		if (start_pfn == pfn)
500 			continue;
501 
502 		/* We found another valid section; the zone still has memory */
503 		zone_span_writeunlock(zone);
504 		return;
505 	}
506 
507 	/* The zone has no valid section */
508 	zone->zone_start_pfn = 0;
509 	zone->spanned_pages = 0;
510 	zone_span_writeunlock(zone);
511 }
512 
513 static void shrink_pgdat_span(struct pglist_data *pgdat,
514 			      unsigned long start_pfn, unsigned long end_pfn)
515 {
516 	unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
517 	unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
518 	unsigned long pgdat_end_pfn = p;
519 	unsigned long pfn;
520 	struct mem_section *ms;
521 	int nid = pgdat->node_id;
522 
523 	if (pgdat_start_pfn == start_pfn) {
524 		/*
525 		 * If the section is the smallest section in the pgdat, we need
526 		 * to shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
527 		 * In this case, find the second smallest valid mem_section
528 		 * for shrinking the pgdat.
529 		 */
530 		pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
531 						pgdat_end_pfn);
532 		if (pfn) {
533 			pgdat->node_start_pfn = pfn;
534 			pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
535 		}
536 	} else if (pgdat_end_pfn == end_pfn) {
537 		/*
538 		 * If the section is the biggest section in the pgdat, we need
539 		 * to shrink pgdat->node_spanned_pages.
540 		 * In this case, find the second biggest valid mem_section for
541 		 * shrinking the pgdat.
542 		 */
543 		pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
544 					       start_pfn);
545 		if (pfn)
546 			pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
547 	}
548 
549 	/*
550 	 * If the section is neither the biggest nor the smallest mem_section
551 	 * in the pgdat, it only creates a hole in the pgdat. In this case we
552 	 * need not change the pgdat span.
553 	 * But the pgdat may now contain nothing but holes, so check whether
554 	 * any valid section is left.
555 	 */
556 	pfn = pgdat_start_pfn;
557 	for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
558 		ms = __pfn_to_section(pfn);
559 
560 		if (unlikely(!valid_section(ms)))
561 			continue;
562 
563 		if (pfn_to_nid(pfn) != nid)
564 			continue;
565 
566 		 /* If this is the section being removed, skip it */
567 		if (start_pfn == pfn)
568 			continue;
569 
570 		/* We found another valid section; the pgdat still has memory */
571 		return;
572 	}
573 
574 	/* The pgdat has no valid section */
575 	pgdat->node_start_pfn = 0;
576 	pgdat->node_spanned_pages = 0;
577 }
578 
579 static void __remove_zone(struct zone *zone, unsigned long start_pfn)
580 {
581 	struct pglist_data *pgdat = zone->zone_pgdat;
582 	int nr_pages = PAGES_PER_SECTION;
584 	unsigned long flags;
585 
588 	pgdat_resize_lock(zone->zone_pgdat, &flags);
589 	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
590 	shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
591 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
592 }
593 
594 static int __remove_section(struct zone *zone, struct mem_section *ms,
595 		unsigned long map_offset)
596 {
597 	unsigned long start_pfn;
598 	int scn_nr;
599 	int ret = -EINVAL;
600 
601 	if (!valid_section(ms))
602 		return ret;
603 
604 	ret = unregister_memory_section(ms);
605 	if (ret)
606 		return ret;
607 
608 	scn_nr = __section_nr(ms);
609 	start_pfn = section_nr_to_pfn(scn_nr);
610 	__remove_zone(zone, start_pfn);
611 
612 	sparse_remove_one_section(zone, ms, map_offset);
613 	return 0;
614 }
615 
616 /**
617  * __remove_pages() - remove sections of pages from a zone
618  * @zone: zone from which pages need to be removed
619  * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
620  * @nr_pages: number of pages to remove (must be multiple of section size)
621  *
622  * Generic helper function to remove section mappings and sysfs entries
623  * for the section of the memory we are removing. The caller needs to make
624  * sure that pages are marked reserved and zones are adjusted properly by
625  * calling offline_pages().
626  */
627 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
628 		 unsigned long nr_pages)
629 {
630 	unsigned long i;
631 	unsigned long map_offset = 0;
632 	int sections_to_remove, ret = 0;
633 
634 	/* In the ZONE_DEVICE case the device driver owns the memory region */
635 	if (is_dev_zone(zone)) {
636 		struct page *page = pfn_to_page(phys_start_pfn);
637 		struct vmem_altmap *altmap;
638 
639 		altmap = to_vmem_altmap((unsigned long) page);
640 		if (altmap)
641 			map_offset = vmem_altmap_offset(altmap);
642 	} else {
643 		resource_size_t start, size;
644 
645 		start = phys_start_pfn << PAGE_SHIFT;
646 		size = nr_pages * PAGE_SIZE;
647 
648 		ret = release_mem_region_adjustable(&iomem_resource, start,
649 					size);
650 		if (ret) {
651 			resource_size_t endres = start + size - 1;
652 
653 			pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
654 					&start, &endres, ret);
655 		}
656 	}
657 
658 	clear_zone_contiguous(zone);
659 
660 	/*
661 	 * We can only remove entire sections
662 	 */
663 	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
664 	BUG_ON(nr_pages % PAGES_PER_SECTION);
665 
666 	sections_to_remove = nr_pages / PAGES_PER_SECTION;
667 	for (i = 0; i < sections_to_remove; i++) {
668 		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
669 
670 		ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
671 		map_offset = 0;
672 		if (ret)
673 			break;
674 	}
675 
676 	set_zone_contiguous(zone);
677 
678 	return ret;
679 }
680 #endif /* CONFIG_MEMORY_HOTREMOVE */
681 
682 int set_online_page_callback(online_page_callback_t callback)
683 {
684 	int rc = -EINVAL;
685 
686 	get_online_mems();
687 	mutex_lock(&online_page_callback_lock);
688 
689 	if (online_page_callback == generic_online_page) {
690 		online_page_callback = callback;
691 		rc = 0;
692 	}
693 
694 	mutex_unlock(&online_page_callback_lock);
695 	put_online_mems();
696 
697 	return rc;
698 }
699 EXPORT_SYMBOL_GPL(set_online_page_callback);
700 
701 int restore_online_page_callback(online_page_callback_t callback)
702 {
703 	int rc = -EINVAL;
704 
705 	get_online_mems();
706 	mutex_lock(&online_page_callback_lock);
707 
708 	if (online_page_callback == callback) {
709 		online_page_callback = generic_online_page;
710 		rc = 0;
711 	}
712 
713 	mutex_unlock(&online_page_callback_lock);
714 	put_online_mems();
715 
716 	return rc;
717 }
718 EXPORT_SYMBOL_GPL(restore_online_page_callback);
719 
720 void __online_page_set_limits(struct page *page)
721 {
722 }
723 EXPORT_SYMBOL_GPL(__online_page_set_limits);
724 
725 void __online_page_increment_counters(struct page *page)
726 {
727 	adjust_managed_page_count(page, 1);
728 }
729 EXPORT_SYMBOL_GPL(__online_page_increment_counters);
730 
731 void __online_page_free(struct page *page)
732 {
733 	__free_reserved_page(page);
734 }
735 EXPORT_SYMBOL_GPL(__online_page_free);
736 
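/* Default onlining path: account the page as managed and free it to the buddy allocator. */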
737 static void generic_online_page(struct page *page)
738 {
739 	__online_page_set_limits(page);
740 	__online_page_increment_counters(page);
741 	__online_page_free(page);
742 }
743 
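/*
 * walk_system_ram_range() callback: hand every page in the range to the
 * registered online_page_callback (provided the range is still marked
 * reserved), mark the containing sections online and report the number of
 * onlined pages back through *arg.
 */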
744 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
745 			void *arg)
746 {
747 	unsigned long i;
748 	unsigned long onlined_pages = *(unsigned long *)arg;
749 	struct page *page;
750 
751 	if (PageReserved(pfn_to_page(start_pfn)))
752 		for (i = 0; i < nr_pages; i++) {
753 			page = pfn_to_page(start_pfn + i);
754 			(*online_page_callback)(page);
755 			onlined_pages++;
756 		}
757 
758 	online_mem_sections(start_pfn, start_pfn + nr_pages);
759 
760 	*(unsigned long *)arg = onlined_pages;
761 	return 0;
762 }
763 
764 /* check which states in node_states will be changed when onlining memory */
765 static void node_states_check_changes_online(unsigned long nr_pages,
766 	struct zone *zone, struct memory_notify *arg)
767 {
768 	int nid = zone_to_nid(zone);
769 	enum zone_type zone_last = ZONE_NORMAL;
770 
771 	/*
772 	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
773 	 * contains nodes which have zones of 0...ZONE_NORMAL,
774 	 * set zone_last to ZONE_NORMAL.
775 	 *
776 	 * If we have neither HIGHMEM nor a movable node,
777 	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
778 	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
779 	 */
780 	if (N_MEMORY == N_NORMAL_MEMORY)
781 		zone_last = ZONE_MOVABLE;
782 
783 	/*
784 	 * If the memory to be onlined is in a zone of 0...zone_last, and
785 	 * the zones of 0...zone_last don't have memory before onlining, we will
786 	 * need to set the node in node_states[N_NORMAL_MEMORY] after
787 	 * the memory is onlined.
788 	 */
789 	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
790 		arg->status_change_nid_normal = nid;
791 	else
792 		arg->status_change_nid_normal = -1;
793 
794 #ifdef CONFIG_HIGHMEM
795 	/*
796 	 * If we have movable node, node_states[N_HIGH_MEMORY]
797 	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
798 	 * set zone_last to ZONE_HIGHMEM.
799 	 *
800 	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
801 	 * contains nodes which have zones of 0...ZONE_MOVABLE,
802 	 * set zone_last to ZONE_MOVABLE.
803 	 */
804 	zone_last = ZONE_HIGHMEM;
805 	if (N_MEMORY == N_HIGH_MEMORY)
806 		zone_last = ZONE_MOVABLE;
807 
808 	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
809 		arg->status_change_nid_high = nid;
810 	else
811 		arg->status_change_nid_high = -1;
812 #else
813 	arg->status_change_nid_high = arg->status_change_nid_normal;
814 #endif
815 
816 	/*
817 	 * If the node doesn't have memory before onlining, we will need to
818 	 * set the node in node_states[N_MEMORY] after the memory
819 	 * is onlined.
820 	 */
821 	if (!node_state(nid, N_MEMORY))
822 		arg->status_change_nid = nid;
823 	else
824 		arg->status_change_nid = -1;
825 }
826 
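/* Set the node_states[] bits that node_states_check_changes_online() flagged. */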
827 static void node_states_set_node(int node, struct memory_notify *arg)
828 {
829 	if (arg->status_change_nid_normal >= 0)
830 		node_set_state(node, N_NORMAL_MEMORY);
831 
832 	if (arg->status_change_nid_high >= 0)
833 		node_set_state(node, N_HIGH_MEMORY);
834 
835 	node_set_state(node, N_MEMORY);
836 }
837 
838 bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
839 {
840 	struct pglist_data *pgdat = NODE_DATA(nid);
841 	struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
842 	struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
843 
844 	/*
845 	 * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
846 	 * physically before ZONE_MOVABLE. All we need is that they do not
847 	 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
848 	 * though so let's stick with it for simplicity for now.
849 	 * TODO make sure we do not overlap with ZONE_DEVICE
850 	 */
851 	if (online_type == MMOP_ONLINE_KERNEL) {
852 		if (zone_is_empty(movable_zone))
853 			return true;
854 		return movable_zone->zone_start_pfn >= pfn + nr_pages;
855 	} else if (online_type == MMOP_ONLINE_MOVABLE) {
856 		return zone_end_pfn(default_zone) <= pfn;
857 	}
858 
859 	/* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
860 	return online_type == MMOP_ONLINE_KEEP;
861 }
862 
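/*
 * Grow the zone span to cover [start_pfn, start_pfn + nr_pages). Called by
 * move_pfn_range_to_zone() under zone_span_writelock().
 */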
863 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
864 		unsigned long nr_pages)
865 {
866 	unsigned long old_end_pfn = zone_end_pfn(zone);
867 
868 	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
869 		zone->zone_start_pfn = start_pfn;
870 
871 	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
872 }
873 
874 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
875 		unsigned long nr_pages)
876 {
877 	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
878 
879 	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
880 		pgdat->node_start_pfn = start_pfn;
881 
882 	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
883 }
884 
885 void __ref move_pfn_range_to_zone(struct zone *zone,
886 		unsigned long start_pfn, unsigned long nr_pages)
887 {
888 	struct pglist_data *pgdat = zone->zone_pgdat;
889 	int nid = pgdat->node_id;
890 	unsigned long flags;
891 
892 	if (zone_is_empty(zone))
893 		init_currently_empty_zone(zone, start_pfn, nr_pages);
894 
895 	clear_zone_contiguous(zone);
896 
897 	/* TODO: pgdat is locked with irqsave while the zone is not; this matches the old behaviour */
898 	pgdat_resize_lock(pgdat, &flags);
899 	zone_span_writelock(zone);
900 	resize_zone_range(zone, start_pfn, nr_pages);
901 	zone_span_writeunlock(zone);
902 	resize_pgdat_range(pgdat, start_pfn, nr_pages);
903 	pgdat_resize_unlock(pgdat, &flags);
904 
905 	/*
906 	 * TODO now we have a visible range of pages which are not associated
907 	 * with their zone properly. Not nice, but set_pfnblock_flags_mask
908 	 * expects the zone to span the pfn range. All the pages in the range
909 	 * are reserved, so nobody should be touching them and we should be safe.
910 	 */
911 	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
912 
913 	set_zone_contiguous(zone);
914 }
915 
916 /*
917  * Returns a default kernel memory zone for the given pfn range.
918  * If no kernel zone covers this pfn range it will automatically fall
919  * back to ZONE_NORMAL.
920  */
921 struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
922 		unsigned long nr_pages)
923 {
924 	struct pglist_data *pgdat = NODE_DATA(nid);
925 	int zid;
926 
927 	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
928 		struct zone *zone = &pgdat->node_zones[zid];
929 
930 		if (zone_intersects(zone, start_pfn, nr_pages))
931 			return zone;
932 	}
933 
934 	return &pgdat->node_zones[ZONE_NORMAL];
935 }
936 
937 /*
938  * Associates the given pfn range with the given node and the zone appropriate
939  * for the given online type.
940  */
941 static struct zone * __meminit move_pfn_range(int online_type, int nid,
942 		unsigned long start_pfn, unsigned long nr_pages)
943 {
944 	struct pglist_data *pgdat = NODE_DATA(nid);
945 	struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
946 
947 	if (online_type == MMOP_ONLINE_KEEP) {
948 		struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
949 		/*
950 		 * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but uses the
951 		 * movable zone if that is not possible (e.g. we are within
952 		 * or past the existing movable zone).
953 		 */
954 		if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
955 					MMOP_ONLINE_KERNEL))
956 			zone = movable_zone;
957 	} else if (online_type == MMOP_ONLINE_MOVABLE) {
958 		zone = &pgdat->node_zones[ZONE_MOVABLE];
959 	}
960 
961 	move_pfn_range_to_zone(zone, start_pfn, nr_pages);
962 	return zone;
963 }
964 
965 /* Must be protected by mem_hotplug_begin() */
966 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
967 {
968 	unsigned long flags;
969 	unsigned long onlined_pages = 0;
970 	struct zone *zone;
971 	int need_zonelists_rebuild = 0;
972 	int nid;
973 	int ret;
974 	struct memory_notify arg;
975 
976 	nid = pfn_to_nid(pfn);
977 	if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
978 		return -EINVAL;
979 
980 	/* associate pfn range with the zone */
981 	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
982 
983 	arg.start_pfn = pfn;
984 	arg.nr_pages = nr_pages;
985 	node_states_check_changes_online(nr_pages, zone, &arg);
986 
987 	ret = memory_notify(MEM_GOING_ONLINE, &arg);
988 	ret = notifier_to_errno(ret);
989 	if (ret)
990 		goto failed_addition;
991 
992 	/*
993 	 * If this zone is not populated, then it is not in the zonelist.
994 	 * This means the page allocator ignores this zone.
995 	 * So, the zonelist must be updated after onlining.
996 	 */
997 	mutex_lock(&zonelists_mutex);
998 	if (!populated_zone(zone)) {
999 		need_zonelists_rebuild = 1;
1000 		build_all_zonelists(NULL, zone);
1001 	}
1002 
1003 	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
1004 		online_pages_range);
1005 	if (ret) {
1006 		if (need_zonelists_rebuild)
1007 			zone_pcp_reset(zone);
1008 		mutex_unlock(&zonelists_mutex);
1009 		goto failed_addition;
1010 	}
1011 
1012 	zone->present_pages += onlined_pages;
1013 
1014 	pgdat_resize_lock(zone->zone_pgdat, &flags);
1015 	zone->zone_pgdat->node_present_pages += onlined_pages;
1016 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
1017 
1018 	if (onlined_pages) {
1019 		node_states_set_node(nid, &arg);
1020 		if (need_zonelists_rebuild)
1021 			build_all_zonelists(NULL, NULL);
1022 		else
1023 			zone_pcp_update(zone);
1024 	}
1025 
1026 	mutex_unlock(&zonelists_mutex);
1027 
1028 	init_per_zone_wmark_min();
1029 
1030 	if (onlined_pages) {
1031 		kswapd_run(nid);
1032 		kcompactd_run(nid);
1033 	}
1034 
1035 	vm_total_pages = nr_free_pagecache_pages();
1036 
1037 	writeback_set_ratelimit();
1038 
1039 	if (onlined_pages)
1040 		memory_notify(MEM_ONLINE, &arg);
1041 	return 0;
1042 
1043 failed_addition:
1044 	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
1045 		 (unsigned long long) pfn << PAGE_SHIFT,
1046 		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
1047 	memory_notify(MEM_CANCEL_ONLINE, &arg);
1048 	return ret;
1049 }
1050 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
1051 
1052 static void reset_node_present_pages(pg_data_t *pgdat)
1053 {
1054 	struct zone *z;
1055 
1056 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
1057 		z->present_pages = 0;
1058 
1059 	pgdat->node_present_pages = 0;
1060 }
1061 
1062 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1063 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1064 {
1065 	struct pglist_data *pgdat;
1066 	unsigned long zones_size[MAX_NR_ZONES] = {0};
1067 	unsigned long zholes_size[MAX_NR_ZONES] = {0};
1068 	unsigned long start_pfn = PFN_DOWN(start);
1069 
1070 	pgdat = NODE_DATA(nid);
1071 	if (!pgdat) {
1072 		pgdat = arch_alloc_nodedata(nid);
1073 		if (!pgdat)
1074 			return NULL;
1075 
1076 		arch_refresh_nodedata(nid, pgdat);
1077 	} else {
1078 		/*
1079 		 * Reset the nr_zones, order and classzone_idx before reuse.
1080 		 * Note that kswapd will init kswapd_classzone_idx properly
1081 		 * when it starts in the near future.
1082 		 */
1083 		pgdat->nr_zones = 0;
1084 		pgdat->kswapd_order = 0;
1085 		pgdat->kswapd_classzone_idx = 0;
1086 	}
1087 
1088 	/* we can use NODE_DATA(nid) from here */
1089 
1090 	/* init node's zones as empty zones, we don't have any present pages.*/
1091 	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
1092 	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
1093 
1094 	/*
1095 	 * The node we allocated has no zone fallback lists. To avoid
1096 	 * accessing an uninitialized zonelist, build it here.
1097 	 */
1098 	mutex_lock(&zonelists_mutex);
1099 	build_all_zonelists(pgdat, NULL);
1100 	mutex_unlock(&zonelists_mutex);
1101 
1102 	/*
1103 	 * zone->managed_pages is set to an approximate value in
1104 	 * free_area_init_core(), which will cause
1105 	 * /sys/devices/system/node/nodeX/meminfo to show wrong data.
1106 	 * So reset it to 0 before any memory is onlined.
1107 	 */
1108 	reset_node_managed_pages(pgdat);
1109 
1110 	/*
1111 	 * When memory is hot-added, all the memory is in offline state. So
1112 	 * clear all zones' present_pages because they will be updated in
1113 	 * online_pages() and offline_pages().
1114 	 */
1115 	reset_node_present_pages(pgdat);
1116 
1117 	return pgdat;
1118 }
1119 
1120 static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
1121 {
1122 	arch_refresh_nodedata(nid, NULL);
1123 	free_percpu(pgdat->per_cpu_nodestats);
1124 	arch_free_nodedata(pgdat);
1125 	return;
1126 }
1127 
1128 
1129 /**
1130  * try_online_node - online a node if offlined
1131  *
1132  * Called by cpu_up() to online a node that has no onlined memory.
1133  */
1134 int try_online_node(int nid)
1135 {
1136 	pg_data_t	*pgdat;
1137 	int	ret;
1138 
1139 	if (node_online(nid))
1140 		return 0;
1141 
1142 	mem_hotplug_begin();
1143 	pgdat = hotadd_new_pgdat(nid, 0);
1144 	if (!pgdat) {
1145 		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
1146 		ret = -ENOMEM;
1147 		goto out;
1148 	}
1149 	node_set_online(nid);
1150 	ret = register_one_node(nid);
1151 	BUG_ON(ret);
1152 
1153 	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
1154 		mutex_lock(&zonelists_mutex);
1155 		build_all_zonelists(NULL, NULL);
1156 		mutex_unlock(&zonelists_mutex);
1157 	}
1158 
1159 out:
1160 	mem_hotplug_done();
1161 	return ret;
1162 }
1163 
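/*
 * Hot(un)pluggable ranges must be section aligned: both the start address and
 * the size have to be multiples of the section size (e.g. 128MB on x86_64).
 */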
1164 static int check_hotplug_memory_range(u64 start, u64 size)
1165 {
1166 	u64 start_pfn = PFN_DOWN(start);
1167 	u64 nr_pages = size >> PAGE_SHIFT;
1168 
1169 	/* Memory range must be aligned to the section size */
1170 	if ((start_pfn & ~PAGE_SECTION_MASK) ||
1171 	    (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
1172 		pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
1173 				(unsigned long long)start,
1174 				(unsigned long long)size);
1175 		return -EINVAL;
1176 	}
1177 
1178 	return 0;
1179 }
1180 
1181 static int online_memory_block(struct memory_block *mem, void *arg)
1182 {
1183 	return device_online(&mem->dev);
1184 }
1185 
1186 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1187 int __ref add_memory_resource(int nid, struct resource *res, bool online)
1188 {
1189 	u64 start, size;
1190 	pg_data_t *pgdat = NULL;
1191 	bool new_pgdat;
1192 	bool new_node;
1193 	int ret;
1194 
1195 	start = res->start;
1196 	size = resource_size(res);
1197 
1198 	ret = check_hotplug_memory_range(start, size);
1199 	if (ret)
1200 		return ret;
1201 
1202 	{	/* Stupid hack to suppress address-never-null warning */
1203 		void *p = NODE_DATA(nid);
1204 		new_pgdat = !p;
1205 	}
1206 
1207 	mem_hotplug_begin();
1208 
1209 	/*
1210 	 * Add new range to memblock so that when hotadd_new_pgdat() is called
1211 	 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
1212 	 * this new range and calculate total pages correctly.  The range will
1213 	 * be removed at hot-remove time.
1214 	 */
1215 	memblock_add_node(start, size, nid);
1216 
1217 	new_node = !node_online(nid);
1218 	if (new_node) {
1219 		pgdat = hotadd_new_pgdat(nid, start);
1220 		ret = -ENOMEM;
1221 		if (!pgdat)
1222 			goto error;
1223 	}
1224 
1225 	/* call arch's memory hotadd */
1226 	ret = arch_add_memory(nid, start, size, true);
1227 
1228 	if (ret < 0)
1229 		goto error;
1230 
1231 	/* we online node here. we can't roll back from here. */
1232 	node_set_online(nid);
1233 
1234 	if (new_node) {
1235 		unsigned long start_pfn = start >> PAGE_SHIFT;
1236 		unsigned long nr_pages = size >> PAGE_SHIFT;
1237 
1238 		ret = __register_one_node(nid);
1239 		if (ret)
1240 			goto register_fail;
1241 
1242 		/*
1243 		 * link memory sections under this node. This is already
1244 		 * done when creating memory sections in register_new_memory,
1245 		 * but that depends on the node being registered, so offline
1246 		 * nodes have to go through register_node.
1247 		 * TODO clean up this mess.
1248 		 */
1249 		ret = link_mem_sections(nid, start_pfn, nr_pages);
1250 register_fail:
1251 		/*
1252 		 * If the sysfs file of the new node can't be created, cpus on
1253 		 * the node can't be hot-added. There is no way to roll back now,
1254 		 * so catch it with BUG_ON(), reluctantly.
1255 		 */
1256 		BUG_ON(ret);
1257 	}
1258 
1259 	/* create new memmap entry */
1260 	firmware_map_add_hotplug(start, start + size, "System RAM");
1261 
1262 	/* online pages if requested */
1263 	if (online)
1264 		walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
1265 				  NULL, online_memory_block);
1266 
1267 	goto out;
1268 
1269 error:
1270 	/* rollback pgdat allocation and others */
1271 	if (new_pgdat)
1272 		rollback_node_hotadd(nid, pgdat);
1273 	memblock_remove(start, size);
1274 
1275 out:
1276 	mem_hotplug_done();
1277 	return ret;
1278 }
1279 EXPORT_SYMBOL_GPL(add_memory_resource);
1280 
1281 int __ref add_memory(int nid, u64 start, u64 size)
1282 {
1283 	struct resource *res;
1284 	int ret;
1285 
1286 	res = register_memory_resource(start, size);
1287 	if (IS_ERR(res))
1288 		return PTR_ERR(res);
1289 
1290 	ret = add_memory_resource(nid, res, memhp_auto_online);
1291 	if (ret < 0)
1292 		release_memory_resource(res);
1293 	return ret;
1294 }
1295 EXPORT_SYMBOL_GPL(add_memory);
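
/*
 * Illustrative sketch (values made up, error handling elided): a platform
 * driver that discovered a new, section-aligned memory range would typically
 * do something like
 *
 *	nid = memory_add_physaddr_to_nid(start);
 *	rc = add_memory(nid, start, size);
 *
 * and rely on memhp_auto_online (or udev rules) to bring the new memory
 * blocks online.
 */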
1296 
1297 #ifdef CONFIG_MEMORY_HOTREMOVE
1298 /*
1299  * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
1300  * set and the size of the free page is given by page_order(). Using this,
1301  * the function determines if the pageblock contains only free pages.
1302  * Due to buddy constraints, a free page at least the size of a pageblock will
1303  * be located at the start of the pageblock.
1304  */
1305 static inline int pageblock_free(struct page *page)
1306 {
1307 	return PageBuddy(page) && page_order(page) >= pageblock_order;
1308 }
1309 
1310 /* Return the start of the next active pageblock after a given page */
1311 static struct page *next_active_pageblock(struct page *page)
1312 {
1313 	/* Ensure the starting page is pageblock-aligned */
1314 	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
1315 
1316 	/* If the entire pageblock is free, move to the end of free page */
1317 	if (pageblock_free(page)) {
1318 		int order;
1319 		/* Be careful: we don't hold any locks, so page_order() can change. */
1320 		order = page_order(page);
1321 		if ((order < MAX_ORDER) && (order >= pageblock_order))
1322 			return page + (1 << order);
1323 	}
1324 
1325 	return page + pageblock_nr_pages;
1326 }
1327 
1328 /* Checks if this range of memory is likely to be hot-removable. */
1329 bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
1330 {
1331 	struct page *page = pfn_to_page(start_pfn);
1332 	struct page *end_page = page + nr_pages;
1333 
1334 	/* Check the starting page of each pageblock within the range */
1335 	for (; page < end_page; page = next_active_pageblock(page)) {
1336 		if (!is_pageblock_removable_nolock(page))
1337 			return false;
1338 		cond_resched();
1339 	}
1340 
1341 	/* All pageblocks in the memory block are likely to be hot-removable */
1342 	return true;
1343 }
1344 
1345 /*
1346  * Confirm that all pages in the range [start_pfn, end_pfn) belong to the
1347  * same zone. If so, return 1 and set *valid_start/*valid_end to the valid range.
1348  */
1349 int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
1350 			 unsigned long *valid_start, unsigned long *valid_end)
1351 {
1352 	unsigned long pfn, sec_end_pfn;
1353 	unsigned long start, end;
1354 	struct zone *zone = NULL;
1355 	struct page *page;
1356 	int i;
1357 	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
1358 	     pfn < end_pfn;
1359 	     pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
1360 		/* Make sure the memory section is present first */
1361 		if (!present_section_nr(pfn_to_section_nr(pfn)))
1362 			continue;
1363 		for (; pfn < sec_end_pfn && pfn < end_pfn;
1364 		     pfn += MAX_ORDER_NR_PAGES) {
1365 			i = 0;
1366 			/* This is just a CONFIG_HOLES_IN_ZONE check.*/
1367 			while ((i < MAX_ORDER_NR_PAGES) &&
1368 				!pfn_valid_within(pfn + i))
1369 				i++;
1370 			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
1371 				continue;
1372 			page = pfn_to_page(pfn + i);
1373 			if (zone && page_zone(page) != zone)
1374 				return 0;
1375 			if (!zone)
1376 				start = pfn + i;
1377 			zone = page_zone(page);
1378 			end = pfn + MAX_ORDER_NR_PAGES;
1379 		}
1380 	}
1381 
1382 	if (zone) {
1383 		*valid_start = start;
1384 		*valid_end = min(end, end_pfn);
1385 		return 1;
1386 	} else {
1387 		return 0;
1388 	}
1389 }
1390 
1391 /*
1392  * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
1393  * non-lru movable pages and hugepages). We scan by pfn because it's much
1394  * easier than walking linked lists. This function returns the pfn
1395  * of the first movable page if one is found, otherwise 0.
1396  */
1397 static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1398 {
1399 	unsigned long pfn;
1400 	struct page *page;
1401 	for (pfn = start; pfn < end; pfn++) {
1402 		if (pfn_valid(pfn)) {
1403 			page = pfn_to_page(pfn);
1404 			if (PageLRU(page))
1405 				return pfn;
1406 			if (__PageMovable(page))
1407 				return pfn;
1408 			if (PageHuge(page)) {
1409 				if (page_huge_active(page))
1410 					return pfn;
1411 				else
1412 					pfn = round_up(pfn + 1,
1413 						1 << compound_order(page)) - 1;
1414 			}
1415 		}
1416 	}
1417 	return 0;
1418 }
1419 
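/*
 * migrate_pages() allocation callback: allocate the destination page,
 * preferring any node other than the one being offlined; hugepages are
 * taken from the next node that has memory.
 */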
1420 static struct page *new_node_page(struct page *page, unsigned long private,
1421 		int **result)
1422 {
1423 	gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
1424 	int nid = page_to_nid(page);
1425 	nodemask_t nmask = node_states[N_MEMORY];
1426 	struct page *new_page = NULL;
1427 
1428 	/*
1429 	 * TODO: allocate a destination hugepage from the nearest neighbor node,
1430 	 * in accordance with the memory policy of the user process if possible. For
1431 	 * now as a simple work-around, we use the next node for destination.
1432 	 */
1433 	if (PageHuge(page))
1434 		return alloc_huge_page_node(page_hstate(compound_head(page)),
1435 					next_node_in(nid, nmask));
1436 
1437 	node_clear(nid, nmask);
1438 
1439 	if (PageHighMem(page)
1440 	    || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
1441 		gfp_mask |= __GFP_HIGHMEM;
1442 
1443 	if (!nodes_empty(nmask))
1444 		new_page = __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask);
1445 	if (!new_page)
1446 		new_page = __alloc_pages(gfp_mask, 0, nid);
1447 
1448 	return new_page;
1449 }
1450 
1451 #define NR_OFFLINE_AT_ONCE_PAGES	(256)
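/*
 * Isolate up to NR_OFFLINE_AT_ONCE_PAGES pages in [start_pfn, end_pfn) and
 * migrate them off the range (LRU pages, non-lru movable pages and hugepages).
 * Returns 0 on success, non-zero if some pages could not be isolated or
 * migrated.
 */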
1452 static int
1453 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1454 {
1455 	unsigned long pfn;
1456 	struct page *page;
1457 	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
1458 	int not_managed = 0;
1459 	int ret = 0;
1460 	LIST_HEAD(source);
1461 
1462 	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
1463 		if (!pfn_valid(pfn))
1464 			continue;
1465 		page = pfn_to_page(pfn);
1466 
1467 		if (PageHuge(page)) {
1468 			struct page *head = compound_head(page);
1469 			pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1470 			if (compound_order(head) > PFN_SECTION_SHIFT) {
1471 				ret = -EBUSY;
1472 				break;
1473 			}
1474 			if (isolate_huge_page(page, &source))
1475 				move_pages -= 1 << compound_order(head);
1476 			continue;
1477 		}
1478 
1479 		if (!get_page_unless_zero(page))
1480 			continue;
1481 		/*
1482 		 * We can skip free pages. And we can handle pages on the
1483 		 * LRU as well as non-lru movable pages.
1484 		 */
1485 		if (PageLRU(page))
1486 			ret = isolate_lru_page(page);
1487 		else
1488 			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1489 		if (!ret) { /* Success */
1490 			put_page(page);
1491 			list_add_tail(&page->lru, &source);
1492 			move_pages--;
1493 			if (!__PageMovable(page))
1494 				inc_node_page_state(page, NR_ISOLATED_ANON +
1495 						    page_is_file_cache(page));
1496 
1497 		} else {
1498 #ifdef CONFIG_DEBUG_VM
1499 			pr_alert("failed to isolate pfn %lx\n", pfn);
1500 			dump_page(page, "isolation failed");
1501 #endif
1502 			put_page(page);
1503 			/* Because we don't hold the big zone->lock, we should
1504 			   check this again here. */
1505 			if (page_count(page)) {
1506 				not_managed++;
1507 				ret = -EBUSY;
1508 				break;
1509 			}
1510 		}
1511 	}
1512 	if (!list_empty(&source)) {
1513 		if (not_managed) {
1514 			putback_movable_pages(&source);
1515 			goto out;
1516 		}
1517 
1518 		/* Allocate a new page from the nearest neighbor node */
1519 		ret = migrate_pages(&source, new_node_page, NULL, 0,
1520 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1521 		if (ret)
1522 			putback_movable_pages(&source);
1523 	}
1524 out:
1525 	return ret;
1526 }
1527 
1528 /*
1529  * remove from free_area[] and mark all as Reserved.
1530  */
1531 static int
1532 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
1533 			void *data)
1534 {
1535 	__offline_isolated_pages(start, start + nr_pages);
1536 	return 0;
1537 }
1538 
1539 static void
1540 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
1541 {
1542 	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
1543 				offline_isolated_pages_cb);
1544 }
1545 
1546 /*
1547  * Check that all pages in the range, recorded as a memory resource, are isolated.
1548  */
1549 static int
1550 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
1551 			void *data)
1552 {
1553 	int ret;
1554 	long offlined = *(long *)data;
1555 	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
1556 	offlined = nr_pages;
1557 	if (!ret)
1558 		*(long *)data += offlined;
1559 	return ret;
1560 }
1561 
1562 static long
1563 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
1564 {
1565 	long offlined = 0;
1566 	int ret;
1567 
1568 	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
1569 			check_pages_isolated_cb);
1570 	if (ret < 0)
1571 		offlined = (long)ret;
1572 	return offlined;
1573 }
1574 
1575 static int __init cmdline_parse_movable_node(char *p)
1576 {
1577 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1578 	movable_node_enabled = true;
1579 #else
1580 	pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
1581 #endif
1582 	return 0;
1583 }
1584 early_param("movable_node", cmdline_parse_movable_node);
1585 
1586 /* check which state of node_states will be changed when offline memory */
1587 static void node_states_check_changes_offline(unsigned long nr_pages,
1588 		struct zone *zone, struct memory_notify *arg)
1589 {
1590 	struct pglist_data *pgdat = zone->zone_pgdat;
1591 	unsigned long present_pages = 0;
1592 	enum zone_type zt, zone_last = ZONE_NORMAL;
1593 
1594 	/*
1595 	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
1596 	 * contains nodes which have zones of 0...ZONE_NORMAL,
1597 	 * set zone_last to ZONE_NORMAL.
1598 	 *
1599 	 * If we don't have HIGHMEM nor movable node,
1600 	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
1601 	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1602 	 */
1603 	if (N_MEMORY == N_NORMAL_MEMORY)
1604 		zone_last = ZONE_MOVABLE;
1605 
1606 	/*
1607 	 * check whether node_states[N_NORMAL_MEMORY] will be changed.
1608 	 * If the memory to be offlined is in a zone of 0...zone_last,
1609 	 * and it is the last present memory there, 0...zone_last will
1610 	 * become empty after the offline, thus we can determine that we will
1611 	 * need to clear the node from node_states[N_NORMAL_MEMORY].
1612 	 */
1613 	for (zt = 0; zt <= zone_last; zt++)
1614 		present_pages += pgdat->node_zones[zt].present_pages;
1615 	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1616 		arg->status_change_nid_normal = zone_to_nid(zone);
1617 	else
1618 		arg->status_change_nid_normal = -1;
1619 
1620 #ifdef CONFIG_HIGHMEM
1621 	/*
1622 	 * If we have movable node, node_states[N_HIGH_MEMORY]
1623 	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
1624 	 * set zone_last to ZONE_HIGHMEM.
1625 	 *
1626 	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
1627 	 * contains nodes which have zones of 0...ZONE_MOVABLE,
1628 	 * set zone_last to ZONE_MOVABLE.
1629 	 */
1630 	zone_last = ZONE_HIGHMEM;
1631 	if (N_MEMORY == N_HIGH_MEMORY)
1632 		zone_last = ZONE_MOVABLE;
1633 
1634 	for (; zt <= zone_last; zt++)
1635 		present_pages += pgdat->node_zones[zt].present_pages;
1636 	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1637 		arg->status_change_nid_high = zone_to_nid(zone);
1638 	else
1639 		arg->status_change_nid_high = -1;
1640 #else
1641 	arg->status_change_nid_high = arg->status_change_nid_normal;
1642 #endif
1643 
1644 	/*
1645 	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1646 	 */
1647 	zone_last = ZONE_MOVABLE;
1648 
1649 	/*
1650 	 * check whether node_states[N_HIGH_MEMORY] will be changed.
1651 	 * If we try to offline the last present @nr_pages from the node,
1652 	 * we can determine that we will need to clear the node from
1653 	 * node_states[N_HIGH_MEMORY].
1654 	 */
1655 	for (; zt <= zone_last; zt++)
1656 		present_pages += pgdat->node_zones[zt].present_pages;
1657 	if (nr_pages >= present_pages)
1658 		arg->status_change_nid = zone_to_nid(zone);
1659 	else
1660 		arg->status_change_nid = -1;
1661 }
1662 
1663 static void node_states_clear_node(int node, struct memory_notify *arg)
1664 {
1665 	if (arg->status_change_nid_normal >= 0)
1666 		node_clear_state(node, N_NORMAL_MEMORY);
1667 
1668 	if ((N_MEMORY != N_NORMAL_MEMORY) &&
1669 	    (arg->status_change_nid_high >= 0))
1670 		node_clear_state(node, N_HIGH_MEMORY);
1671 
1672 	if ((N_MEMORY != N_HIGH_MEMORY) &&
1673 	    (arg->status_change_nid >= 0))
1674 		node_clear_state(node, N_MEMORY);
1675 }
1676 
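/*
 * Offline the pageblock-aligned range [start_pfn, end_pfn): isolate it,
 * migrate all movable pages out, dissolve free hugepages, verify that the
 * whole range is isolated and then take it away from the page allocator,
 * adjusting zone/node accounting along the way. Gives up with -EAGAIN once
 * @timeout jiffies have passed.
 */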
1677 static int __ref __offline_pages(unsigned long start_pfn,
1678 		  unsigned long end_pfn, unsigned long timeout)
1679 {
1680 	unsigned long pfn, nr_pages, expire;
1681 	long offlined_pages;
1682 	int ret, drain, retry_max, node;
1683 	unsigned long flags;
1684 	unsigned long valid_start, valid_end;
1685 	struct zone *zone;
1686 	struct memory_notify arg;
1687 
1688 	/* at least, alignment against pageblock is necessary */
1689 	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1690 		return -EINVAL;
1691 	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
1692 		return -EINVAL;
1693 	/* This makes hotplug much easier... and readable.
1694 	   We assume this for now. */
1695 	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
1696 		return -EINVAL;
1697 
1698 	zone = page_zone(pfn_to_page(valid_start));
1699 	node = zone_to_nid(zone);
1700 	nr_pages = end_pfn - start_pfn;
1701 
1702 	/* set above range as isolated */
1703 	ret = start_isolate_page_range(start_pfn, end_pfn,
1704 				       MIGRATE_MOVABLE, true);
1705 	if (ret)
1706 		return ret;
1707 
1708 	arg.start_pfn = start_pfn;
1709 	arg.nr_pages = nr_pages;
1710 	node_states_check_changes_offline(nr_pages, zone, &arg);
1711 
1712 	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
1713 	ret = notifier_to_errno(ret);
1714 	if (ret)
1715 		goto failed_removal;
1716 
1717 	pfn = start_pfn;
1718 	expire = jiffies + timeout;
1719 	drain = 0;
1720 	retry_max = 5;
1721 repeat:
1722 	/* start memory hot removal */
1723 	ret = -EAGAIN;
1724 	if (time_after(jiffies, expire))
1725 		goto failed_removal;
1726 	ret = -EINTR;
1727 	if (signal_pending(current))
1728 		goto failed_removal;
1729 	ret = 0;
1730 	if (drain) {
1731 		lru_add_drain_all();
1732 		cond_resched();
1733 		drain_all_pages(zone);
1734 	}
1735 
1736 	pfn = scan_movable_pages(start_pfn, end_pfn);
1737 	if (pfn) { /* We have movable pages */
1738 		ret = do_migrate_range(pfn, end_pfn);
1739 		if (!ret) {
1740 			drain = 1;
1741 			goto repeat;
1742 		} else {
1743 			if (ret < 0)
1744 				if (--retry_max == 0)
1745 					goto failed_removal;
1746 			yield();
1747 			drain = 1;
1748 			goto repeat;
1749 		}
1750 	}
1751 	/* drain all zones' lru pagevecs; this is asynchronous... */
1752 	lru_add_drain_all();
1753 	yield();
1754 	/* drain pcp pages, this is synchronous. */
1755 	drain_all_pages(zone);
1756 	/*
1757 	 * dissolve free hugepages in the memory block before actually doing the
1758 	 * offline, in order to keep hugetlbfs's object counting consistent.
1759 	 */
1760 	ret = dissolve_free_huge_pages(start_pfn, end_pfn);
1761 	if (ret)
1762 		goto failed_removal;
1763 	/* check again */
1764 	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1765 	if (offlined_pages < 0) {
1766 		ret = -EBUSY;
1767 		goto failed_removal;
1768 	}
1769 	pr_info("Offlined Pages %ld\n", offlined_pages);
1770 	/* OK, all of our target range is isolated.
1771 	   We cannot roll back from this point. */
1772 	offline_isolated_pages(start_pfn, end_pfn);
1773 	/* reset pagetype flags and make the migrate type MOVABLE */
1774 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1775 	/* removal success */
1776 	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1777 	zone->present_pages -= offlined_pages;
1778 
1779 	pgdat_resize_lock(zone->zone_pgdat, &flags);
1780 	zone->zone_pgdat->node_present_pages -= offlined_pages;
1781 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
1782 
1783 	init_per_zone_wmark_min();
1784 
1785 	if (!populated_zone(zone)) {
1786 		zone_pcp_reset(zone);
1787 		mutex_lock(&zonelists_mutex);
1788 		build_all_zonelists(NULL, NULL);
1789 		mutex_unlock(&zonelists_mutex);
1790 	} else
1791 		zone_pcp_update(zone);
1792 
1793 	node_states_clear_node(node, &arg);
1794 	if (arg.status_change_nid >= 0) {
1795 		kswapd_stop(node);
1796 		kcompactd_stop(node);
1797 	}
1798 
1799 	vm_total_pages = nr_free_pagecache_pages();
1800 	writeback_set_ratelimit();
1801 
1802 	memory_notify(MEM_OFFLINE, &arg);
1803 	return 0;
1804 
1805 failed_removal:
1806 	pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
1807 		 (unsigned long long) start_pfn << PAGE_SHIFT,
1808 		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
1809 	memory_notify(MEM_CANCEL_OFFLINE, &arg);
1810 	/* pushback to free area */
1811 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1812 	return ret;
1813 }
1814 
1815 /* Must be protected by mem_hotplug_begin() */
1816 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1817 {
1818 	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1819 }
1820 #endif /* CONFIG_MEMORY_HOTREMOVE */
1821 
1822 /**
1823  * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1824  * @start_pfn: start pfn of the memory range
1825  * @end_pfn: end pfn of the memory range
1826  * @arg: argument passed to func
1827  * @func: callback for each memory section walked
1828  *
1829  * This function walks through all present mem sections in range
1830  * [start_pfn, end_pfn) and calls func on each mem section.
1831  *
1832  * Returns the return value of func.
1833  */
1834 int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1835 		void *arg, int (*func)(struct memory_block *, void *))
1836 {
1837 	struct memory_block *mem = NULL;
1838 	struct mem_section *section;
1839 	unsigned long pfn, section_nr;
1840 	int ret;
1841 
1842 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1843 		section_nr = pfn_to_section_nr(pfn);
1844 		if (!present_section_nr(section_nr))
1845 			continue;
1846 
1847 		section = __nr_to_section(section_nr);
1848 		/* same memblock? */
1849 		if (mem)
1850 			if ((section_nr >= mem->start_section_nr) &&
1851 			    (section_nr <= mem->end_section_nr))
1852 				continue;
1853 
1854 		mem = find_memory_block_hinted(section, mem);
1855 		if (!mem)
1856 			continue;
1857 
1858 		ret = func(mem, arg);
1859 		if (ret) {
1860 			kobject_put(&mem->dev.kobj);
1861 			return ret;
1862 		}
1863 	}
1864 
1865 	if (mem)
1866 		kobject_put(&mem->dev.kobj);
1867 
1868 	return 0;
1869 }
1870 
1871 #ifdef CONFIG_MEMORY_HOTREMOVE
1872 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
1873 {
1874 	int ret = !is_memblock_offlined(mem);
1875 
1876 	if (unlikely(ret)) {
1877 		phys_addr_t beginpa, endpa;
1878 
1879 		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1880 		endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
1881 		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
1882 			&beginpa, &endpa);
1883 	}
1884 
1885 	return ret;
1886 }
1887 
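/*
 * Return -EBUSY if any present cpu still belongs to this node, since such a
 * node cannot be offlined.
 */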
1888 static int check_cpu_on_node(pg_data_t *pgdat)
1889 {
1890 	int cpu;
1891 
1892 	for_each_present_cpu(cpu) {
1893 		if (cpu_to_node(cpu) == pgdat->node_id)
1894 			/*
1895 			 * a cpu on this node has not been removed, so we can't
1896 			 * offline this node.
1897 			 */
1898 			return -EBUSY;
1899 	}
1900 
1901 	return 0;
1902 }
1903 
1904 static void unmap_cpu_on_node(pg_data_t *pgdat)
1905 {
1906 #ifdef CONFIG_ACPI_NUMA
1907 	int cpu;
1908 
1909 	for_each_possible_cpu(cpu)
1910 		if (cpu_to_node(cpu) == pgdat->node_id)
1911 			numa_clear_node(cpu);
1912 #endif
1913 }
1914 
1915 static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1916 {
1917 	int ret;
1918 
1919 	ret = check_cpu_on_node(pgdat);
1920 	if (ret)
1921 		return ret;
1922 
1923 	/*
1924 	 * the node is about to be offlined when we get here, so we can
1925 	 * clear the cpu_to_node() mappings now.
1926 	 */
1927 
1928 	unmap_cpu_on_node(pgdat);
1929 	return 0;
1930 }
1931 
1932 /**
1933  * try_offline_node
1934  *
1935  * Offline a node if all memory sections and cpus of the node are removed.
1936  *
1937  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1938  * and online/offline operations before this call.
1939  */
1940 void try_offline_node(int nid)
1941 {
1942 	pg_data_t *pgdat = NODE_DATA(nid);
1943 	unsigned long start_pfn = pgdat->node_start_pfn;
1944 	unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1945 	unsigned long pfn;
1946 
1947 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1948 		unsigned long section_nr = pfn_to_section_nr(pfn);
1949 
1950 		if (!present_section_nr(section_nr))
1951 			continue;
1952 
1953 		if (pfn_to_nid(pfn) != nid)
1954 			continue;
1955 
1956 		/*
1957 		 * some memory sections of this node have not been removed, so we
1958 		 * can't offline the node now.
1959 		 */
1960 		return;
1961 	}
1962 
1963 	if (check_and_unmap_cpu_on_node(pgdat))
1964 		return;
1965 
1966 	/*
1967 	 * all memory and cpus of this node have been removed; we can offline
1968 	 * this node now.
1969 	 */
1970 	node_set_offline(nid);
1971 	unregister_one_node(nid);
1972 }
1973 EXPORT_SYMBOL(try_offline_node);
1974 
1975 /**
1976  * remove_memory
1977  *
1978  * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1979  * and online/offline operations before this call, as required by
1980  * try_offline_node().
1981  */
1982 void __ref remove_memory(int nid, u64 start, u64 size)
1983 {
1984 	int ret;
1985 
1986 	BUG_ON(check_hotplug_memory_range(start, size));
1987 
1988 	mem_hotplug_begin();
1989 
1990 	/*
1991 	 * All memory blocks must be offlined before removing memory.  Check
1992 	 * whether all memory blocks in question are offline and trigger a BUG()
1993 	 * if this is not the case.
1994 	 */
1995 	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1996 				check_memblock_offlined_cb);
1997 	if (ret)
1998 		BUG();
1999 
2000 	/* remove memmap entry */
2001 	firmware_map_remove(start, start + size, "System RAM");
2002 	memblock_free(start, size);
2003 	memblock_remove(start, size);
2004 
2005 	arch_remove_memory(start, size);
2006 
2007 	try_offline_node(nid);
2008 
2009 	mem_hotplug_done();
2010 }
2011 EXPORT_SYMBOL_GPL(remove_memory);
2012 #endif /* CONFIG_MEMORY_HOTREMOVE */
2013