xref: /linux/mm/hugetlb.c (revision badfa4361cb116fd9af71aaa2ea470236a8aa25b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Generic hugetlb support.
4  * (C) Nadia Yvette Chambers, April 2004
5  */
6 #include <linux/list.h>
7 #include <linux/init.h>
8 #include <linux/mm.h>
9 #include <linux/seq_file.h>
10 #include <linux/sysctl.h>
11 #include <linux/highmem.h>
12 #include <linux/mmu_notifier.h>
13 #include <linux/nodemask.h>
14 #include <linux/pagemap.h>
15 #include <linux/mempolicy.h>
16 #include <linux/compiler.h>
17 #include <linux/cpumask.h>
18 #include <linux/cpuset.h>
19 #include <linux/mutex.h>
20 #include <linux/memblock.h>
21 #include <linux/minmax.h>
22 #include <linux/sysfs.h>
23 #include <linux/slab.h>
24 #include <linux/sched/mm.h>
25 #include <linux/mmdebug.h>
26 #include <linux/sched/signal.h>
27 #include <linux/rmap.h>
28 #include <linux/string_choices.h>
29 #include <linux/string_helpers.h>
30 #include <linux/swap.h>
31 #include <linux/swapops.h>
32 #include <linux/jhash.h>
33 #include <linux/numa.h>
34 #include <linux/llist.h>
35 #include <linux/cma.h>
36 #include <linux/migrate.h>
37 #include <linux/nospec.h>
38 #include <linux/delayacct.h>
39 #include <linux/memory.h>
40 #include <linux/mm_inline.h>
41 #include <linux/padata.h>
42 
43 #include <asm/page.h>
44 #include <asm/pgalloc.h>
45 #include <asm/tlb.h>
46 #include <asm/setup.h>
47 
48 #include <linux/io.h>
49 #include <linux/hugetlb.h>
50 #include <linux/hugetlb_cgroup.h>
51 #include <linux/node.h>
52 #include <linux/page_owner.h>
53 #include "internal.h"
54 #include "hugetlb_vmemmap.h"
55 #include "hugetlb_cma.h"
56 #include <linux/page-isolation.h>
57 
/* Number of hstates registered so far; index of the default huge page size. */
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

/* Nodes eligible for boot-time hugetlb allocation, and per-node boot lists. */
__initdata nodemask_t hugetlb_bootmem_nodes;
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
/* Per-hstate count of boot pages later found invalid and discarded. */
static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;

/*
 * Due to ordering constraints across the init code for various
 * architectures, hugetlb hstate cmdline parameters can't simply
 * be early_param. early_param might call the setup function
 * before valid hugetlb page sizes are determined, leading to
 * incorrect rejection of valid hugepagesz= options.
 *
 * So, record the parameters early and consume them whenever the
 * init code is ready for them, by calling hugetlb_parse_params().
 */

/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */
#define HUGE_MAX_CMDLINE_ARGS	(2 * HUGE_MAX_HSTATE + 1)
/* A recorded command-line option: its value string and deferred handler. */
struct hugetlb_cmdline {
	char *val;
	int (*setup)(char *val);
};

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
static unsigned long hugepage_allocation_threads __initdata;

/* Storage for copies of the recorded parameter strings, and fill indexes. */
static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata;
static int hstate_cmdline_index __initdata;
static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata;
static int hugetlb_param_index __initdata;
static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
static __init void hugetlb_parse_params(void);

/*
 * Declare an early_param whose handler merely records the option via
 * hugetlb_add_param(); the real setup function runs later from
 * hugetlb_parse_params().
 */
#define hugetlb_early_param(str, func) \
static __init int func##args(char *s) \
{ \
	return hugetlb_add_param(s, func); \
} \
early_param(str, func##args)

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes __ro_after_init;
struct mutex *hugetlb_fault_mutex_table __ro_after_init;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, bool take_locks);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
/* Return a hugetlb folio to its allocator: CMA if it came from there,
 * otherwise the normal page allocator via folio_put().
 */
static void hugetlb_free_folio(struct folio *folio)
{
	if (!folio_test_hugetlb_cma(folio))
		folio_put(folio);
	else
		hugetlb_cma_free_folio(folio);
}
136 
137 /*
138  * Check if the hstate represents gigantic pages but gigantic page
139  * runtime support is not available. This is a common condition used to
140  * skip operations that cannot be performed on gigantic pages when runtime
141  * support is disabled.
142  */
143 static inline bool hstate_is_gigantic_no_runtime(struct hstate *h)
144 {
145 	return hstate_is_gigantic(h) && !gigantic_page_runtime_supported();
146 }
147 
148 static inline bool subpool_is_free(struct hugepage_subpool *spool)
149 {
150 	if (spool->count)
151 		return false;
152 	if (spool->max_hpages != -1)
153 		return spool->used_hpages == 0;
154 	if (spool->min_hpages != -1)
155 		return spool->rsv_hpages == spool->min_hpages;
156 
157 	return true;
158 }
159 
160 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
161 						unsigned long irq_flags)
162 {
163 	spin_unlock_irqrestore(&spool->lock, irq_flags);
164 
165 	/* If no pages are used, and no other handles to the subpool
166 	 * remain, give up any reservations based on minimum size and
167 	 * free the subpool */
168 	if (subpool_is_free(spool)) {
169 		if (spool->min_hpages != -1)
170 			hugetlb_acct_memory(spool->hstate,
171 						-spool->min_hpages);
172 		kfree(spool);
173 	}
174 }
175 
176 struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
177 						long min_hpages)
178 {
179 	struct hugepage_subpool *spool;
180 
181 	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
182 	if (!spool)
183 		return NULL;
184 
185 	spin_lock_init(&spool->lock);
186 	spool->count = 1;
187 	spool->max_hpages = max_hpages;
188 	spool->hstate = h;
189 	spool->min_hpages = min_hpages;
190 
191 	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
192 		kfree(spool);
193 		return NULL;
194 	}
195 	spool->rsv_hpages = min_hpages;
196 
197 	return spool;
198 }
199 
/*
 * Drop one reference on @spool.  The lock is handed off to
 * unlock_or_release_subpool(), which drops it and frees the subpool if
 * this was the last reference and no pages remain in use.
 */
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	unsigned long flags;

	spin_lock_irqsave(&spool->lock, flags);
	BUG_ON(!spool->count);
	spool->count--;
	unlock_or_release_subpool(spool, flags);
}
209 
/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				      long delta)
{
	long ret = delta;

	/* No subpool: all @delta pages must come from the global pool. */
	if (!spool)
		return ret;

	spin_lock_irq(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock_irq(&spool->lock);
	return ret;
}
256 
/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;
	unsigned long flags;

	if (!spool)
		return delta;

	spin_lock_irqsave(&spool->lock, flags);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	 /* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
		/* Pages returned while below the minimum refill rsv_hpages
		 * rather than the global reserve; only the excess (if any)
		 * is returned to the caller.
		 */
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}

	/*
	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
	 * quota reference, free it now.
	 */
	unlock_or_release_subpool(spool, flags);

	return ret;
}
297 
/* Subpool backing @vma's hugetlbfs file (via its inode). */
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
	return subpool_inode(file_inode(vma->vm_file));
}
302 
303 /*
304  * hugetlb vma_lock helper routines
305  */
306 void hugetlb_vma_lock_read(struct vm_area_struct *vma)
307 {
308 	if (__vma_shareable_lock(vma)) {
309 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
310 
311 		down_read(&vma_lock->rw_sema);
312 	} else if (__vma_private_lock(vma)) {
313 		struct resv_map *resv_map = vma_resv_map(vma);
314 
315 		down_read(&resv_map->rw_sema);
316 	}
317 }
318 
319 void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
320 {
321 	if (__vma_shareable_lock(vma)) {
322 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
323 
324 		up_read(&vma_lock->rw_sema);
325 	} else if (__vma_private_lock(vma)) {
326 		struct resv_map *resv_map = vma_resv_map(vma);
327 
328 		up_read(&resv_map->rw_sema);
329 	}
330 }
331 
332 void hugetlb_vma_lock_write(struct vm_area_struct *vma)
333 {
334 	if (__vma_shareable_lock(vma)) {
335 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
336 
337 		down_write(&vma_lock->rw_sema);
338 	} else if (__vma_private_lock(vma)) {
339 		struct resv_map *resv_map = vma_resv_map(vma);
340 
341 		down_write(&resv_map->rw_sema);
342 	}
343 }
344 
345 void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
346 {
347 	if (__vma_shareable_lock(vma)) {
348 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
349 
350 		up_write(&vma_lock->rw_sema);
351 	} else if (__vma_private_lock(vma)) {
352 		struct resv_map *resv_map = vma_resv_map(vma);
353 
354 		up_write(&resv_map->rw_sema);
355 	}
356 }
357 
358 int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
359 {
360 
361 	if (__vma_shareable_lock(vma)) {
362 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
363 
364 		return down_write_trylock(&vma_lock->rw_sema);
365 	} else if (__vma_private_lock(vma)) {
366 		struct resv_map *resv_map = vma_resv_map(vma);
367 
368 		return down_write_trylock(&resv_map->rw_sema);
369 	}
370 
371 	return 1;
372 }
373 
374 void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
375 {
376 	if (__vma_shareable_lock(vma)) {
377 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
378 
379 		lockdep_assert_held(&vma_lock->rw_sema);
380 	} else if (__vma_private_lock(vma)) {
381 		struct resv_map *resv_map = vma_resv_map(vma);
382 
383 		lockdep_assert_held(&resv_map->rw_sema);
384 	}
385 }
386 
387 void hugetlb_vma_lock_release(struct kref *kref)
388 {
389 	struct hugetlb_vma_lock *vma_lock = container_of(kref,
390 			struct hugetlb_vma_lock, refs);
391 
392 	kfree(vma_lock);
393 }
394 
/*
 * Detach @vma_lock from its vma, release the write lock and drop the kref.
 * Caller holds vma_lock->rw_sema for write.
 */
static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
{
	struct vm_area_struct *vma = vma_lock->vma;

	/*
	 * vma_lock structure may or not be released as a result of put,
	 * it certainly will no longer be attached to vma so clear pointer.
	 * Semaphore synchronizes access to vma_lock->vma field.
	 */
	vma_lock->vma = NULL;
	vma->vm_private_data = NULL;
	up_write(&vma_lock->rw_sema);
	kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
}
409 
410 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
411 {
412 	if (__vma_shareable_lock(vma)) {
413 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
414 
415 		__hugetlb_vma_unlock_write_put(vma_lock);
416 	} else if (__vma_private_lock(vma)) {
417 		struct resv_map *resv_map = vma_resv_map(vma);
418 
419 		/* no free for anon vmas, but still need to unlock */
420 		up_write(&resv_map->rw_sema);
421 	}
422 }
423 
424 static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
425 {
426 	/*
427 	 * Only present in sharable vmas.
428 	 */
429 	if (!vma || !__vma_shareable_lock(vma))
430 		return;
431 
432 	if (vma->vm_private_data) {
433 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
434 
435 		down_write(&vma_lock->rw_sema);
436 		__hugetlb_vma_unlock_write_put(vma_lock);
437 	}
438 }
439 
440 /*
441  * vma specific semaphore used for pmd sharing and fault/truncation
442  * synchronization
443  */
444 int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
445 {
446 	struct hugetlb_vma_lock *vma_lock;
447 
448 	/* Only establish in (flags) sharable vmas */
449 	if (!vma || !(vma->vm_flags & VM_MAYSHARE))
450 		return 0;
451 
452 	/* Should never get here with non-NULL vm_private_data */
453 	if (vma->vm_private_data)
454 		return -EINVAL;
455 
456 	vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
457 	if (!vma_lock) {
458 		/*
459 		 * If we can not allocate structure, then vma can not
460 		 * participate in pmd sharing.  This is only a possible
461 		 * performance enhancement and memory saving issue.
462 		 * However, the lock is also used to synchronize page
463 		 * faults with truncation.  If the lock is not present,
464 		 * unlikely races could leave pages in a file past i_size
465 		 * until the file is removed.  Warn in the unlikely case of
466 		 * allocation failure.
467 		 */
468 		pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
469 		return -EINVAL;
470 	}
471 
472 	kref_init(&vma_lock->refs);
473 	init_rwsem(&vma_lock->rw_sema);
474 	vma_lock->vma = vma;
475 	vma->vm_private_data = vma_lock;
476 
477 	return 0;
478 }
479 
480 /* Helper that removes a struct file_region from the resv_map cache and returns
481  * it for use.
482  */
483 static struct file_region *
484 get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
485 {
486 	struct file_region *nrg;
487 
488 	VM_BUG_ON(resv->region_cache_count <= 0);
489 
490 	resv->region_cache_count--;
491 	nrg = list_first_entry(&resv->region_cache, struct file_region, link);
492 	list_del(&nrg->link);
493 
494 	nrg->from = from;
495 	nrg->to = to;
496 
497 	return nrg;
498 }
499 
/*
 * Copy cgroup uncharge bookkeeping from @rg to @nrg, taking an extra css
 * reference so each file_region holds its own.  No-op without
 * CONFIG_CGROUP_HUGETLB.
 */
static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
					      struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	nrg->reservation_counter = rg->reservation_counter;
	nrg->css = rg->css;
	if (rg->css)
		css_get(rg->css);
#endif
}
510 
/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
						struct hstate *h,
						struct resv_map *resv,
						struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (h_cg) {
		nrg->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		nrg->css = &h_cg->css;
		/*
		 * The caller will hold exactly one h_cg->css reference for the
		 * whole contiguous reservation region. But this area might be
		 * scattered when there are already some file_regions reside in
		 * it. As a result, many file_regions may share only one css
		 * reference. In order to ensure that one file_region must hold
		 * exactly one h_cg->css reference, we should do css_get for
		 * each file_region and leave the reference held by caller
		 * untouched.
		 */
		css_get(&h_cg->css);
		if (!resv->pages_per_hpage)
			resv->pages_per_hpage = pages_per_huge_page(h);
		/* pages_per_hpage should be the same for all entries in
		 * a resv_map.
		 */
		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
	} else {
		/* No cgroup: clear so put_uncharge_info() is a no-op. */
		nrg->reservation_counter = NULL;
		nrg->css = NULL;
	}
#endif
}
545 
/* Drop the css reference held by @rg's uncharge info, if any. */
static void put_uncharge_info(struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (rg->css)
		css_put(rg->css);
#endif
}
553 
/*
 * Two regions may be coalesced only if they would be uncharged to the same
 * cgroup counter.  Always true without CONFIG_CGROUP_HUGETLB.
 */
static bool has_same_uncharge_info(struct file_region *rg,
				   struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
	return rg->reservation_counter == org->reservation_counter &&
	       rg->css == org->css;

#else
	return true;
#endif
}
565 
/*
 * Merge @rg with its immediate neighbours in resv->regions when they are
 * contiguous and share uncharge info, freeing the absorbed entries.
 * Caller holds resv->lock.
 */
static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
	struct file_region *nrg, *prg;

	/* Merge with the predecessor if it ends exactly where @rg starts. */
	prg = list_prev_entry(rg, link);
	if (&prg->link != &resv->regions && prg->to == rg->from &&
	    has_same_uncharge_info(prg, rg)) {
		prg->to = rg->to;

		list_del(&rg->link);
		put_uncharge_info(rg);
		kfree(rg);

		/* Continue merging from the surviving (predecessor) entry. */
		rg = prg;
	}

	/* Merge with the successor if it starts exactly where @rg ends. */
	nrg = list_next_entry(rg, link);
	if (&nrg->link != &resv->regions && nrg->from == rg->to &&
	    has_same_uncharge_info(nrg, rg)) {
		nrg->from = rg->from;

		list_del(&rg->link);
		put_uncharge_info(rg);
		kfree(rg);
	}
}
592 
593 static inline long
594 hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
595 		     long to, struct hstate *h, struct hugetlb_cgroup *cg,
596 		     long *regions_needed)
597 {
598 	struct file_region *nrg;
599 
600 	if (!regions_needed) {
601 		nrg = get_file_region_entry_from_cache(map, from, to);
602 		record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
603 		list_add(&nrg->link, rg);
604 		coalesce_file_region(map, nrg);
605 	} else
606 		*regions_needed += 1;
607 
608 	return to - from;
609 }
610 
/*
 * Must be called with resv->lock held.
 *
 * Calling this with regions_needed != NULL will count the number of pages
 * to be added but will not modify the linked list. And regions_needed will
 * indicate the number of file_regions needed in the cache to carry out to add
 * the regions for this range.
 *
 * Returns the number of pages in [f, t) not yet covered by existing
 * regions (i.e. the pages added, or that would be added).
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
				     struct hugetlb_cgroup *h_cg,
				     struct hstate *h, long *regions_needed)
{
	long add = 0;
	struct list_head *head = &resv->regions;
	/* Invariant: everything in [f, last_accounted_offset) is covered. */
	long last_accounted_offset = f;
	struct file_region *iter, *trg = NULL;
	struct list_head *rg = NULL;

	if (regions_needed)
		*regions_needed = 0;

	/* In this loop, we essentially handle an entry for the range
	 * [last_accounted_offset, iter->from), at every iteration, with some
	 * bounds checking.
	 */
	list_for_each_entry_safe(iter, trg, head, link) {
		/* Skip irrelevant regions that start before our range. */
		if (iter->from < f) {
			/* If this region ends after the last accounted offset,
			 * then we need to update last_accounted_offset.
			 */
			if (iter->to > last_accounted_offset)
				last_accounted_offset = iter->to;
			continue;
		}

		/* When we find a region that starts beyond our range, we've
		 * finished.
		 */
		if (iter->from >= t) {
			/* Remember where a trailing region must be inserted. */
			rg = iter->link.prev;
			break;
		}

		/* Add an entry for last_accounted_offset -> iter->from, and
		 * update last_accounted_offset.
		 */
		if (iter->from > last_accounted_offset)
			add += hugetlb_resv_map_add(resv, iter->link.prev,
						    last_accounted_offset,
						    iter->from, h, h_cg,
						    regions_needed);

		last_accounted_offset = iter->to;
	}

	/* Handle the case where our range extends beyond
	 * last_accounted_offset.
	 */
	if (!rg)
		rg = head->prev;
	if (last_accounted_offset < t)
		add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
					    t, h, h_cg, regions_needed);

	return add;
}
678 
/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
 *
 * On success, returns 0 with the lock re-acquired.  On -ENOMEM the lock is
 * left dropped (the failure happens while unlocked and the out_of_memory
 * path does not retake it) - callers return the error without unlocking.
 */
static int allocate_file_region_entries(struct resv_map *resv,
					int regions_needed)
	__must_hold(&resv->lock)
{
	LIST_HEAD(allocated_regions);
	int to_allocate = 0, i = 0;
	struct file_region *trg = NULL, *rg = NULL;

	VM_BUG_ON(regions_needed < 0);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * the number of in progress add operations plus regions_needed.
	 *
	 * This is a while loop because when we drop the lock, some other call
	 * to region_add or region_del may have consumed some region_entries,
	 * so we keep looping here until we finally have enough entries for
	 * (adds_in_progress + regions_needed).
	 */
	while (resv->region_cache_count <
	       (resv->adds_in_progress + regions_needed)) {
		to_allocate = resv->adds_in_progress + regions_needed -
			      resv->region_cache_count;

		/* At this point, we should have enough entries in the cache
		 * for all the existing adds_in_progress. We should only be
		 * needing to allocate for regions_needed.
		 */
		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

		/* Drop the lock so we may sleep in kmalloc(). */
		spin_unlock(&resv->lock);
		for (i = 0; i < to_allocate; i++) {
			trg = kmalloc(sizeof(*trg), GFP_KERNEL);
			if (!trg)
				goto out_of_memory;
			list_add(&trg->link, &allocated_regions);
		}

		spin_lock(&resv->lock);

		list_splice(&allocated_regions, &resv->region_cache);
		resv->region_cache_count += to_allocate;
	}

	return 0;

out_of_memory:
	list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
		list_del(&rg->link);
		kfree(rg);
	}
	return -ENOMEM;
}
734 
/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  Regions will be taken from the cache to fill in this range.
 * Sufficient regions should exist in the cache due to the previous
 * call to region_chg with the same range, but in some cases the cache will not
 * have sufficient entries due to races with other code doing region_add or
 * region_del.  The extra needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map.  This number is greater
 * than or equal to zero.  If file_region entries needed to be allocated for
 * this operation and we were not able to allocate, it returns -ENOMEM.
 * region_add of regions of length 1 never allocate file_regions and cannot
 * fail; region_chg will always allocate at least 1 entry and a region_add for
 * 1 page will only require at most 1 entry.
 */
static long region_add(struct resv_map *resv, long f, long t,
		       long in_regions_needed, struct hstate *h,
		       struct hugetlb_cgroup *h_cg)
{
	long add = 0, actual_regions_needed = 0;

	spin_lock(&resv->lock);
retry:

	/* Count how many regions are actually needed to execute this add. */
	add_reservation_in_range(resv, f, t, NULL, NULL,
				 &actual_regions_needed);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * this add operation. Note that actual_regions_needed may be greater
	 * than in_regions_needed, as the resv_map may have been modified since
	 * the region_chg call. In this case, we need to make sure that we
	 * allocate extra entries, such that we have enough for all the
	 * existing adds_in_progress, plus the excess needed for this
	 * operation.
	 */
	if (actual_regions_needed > in_regions_needed &&
	    resv->region_cache_count <
		    resv->adds_in_progress +
			    (actual_regions_needed - in_regions_needed)) {
		/* region_add operation of range 1 should never need to
		 * allocate file_region entries.
		 */
		VM_BUG_ON(t - f <= 1);

		if (allocate_file_region_entries(
			    resv, actual_regions_needed - in_regions_needed)) {
			/* Failure leaves resv->lock dropped; return as-is. */
			return -ENOMEM;
		}

		/* Lock was dropped during allocation; recount from scratch. */
		goto retry;
	}

	add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);

	/* The placeholders reserved by region_chg are now consumed. */
	resv->adds_in_progress -= in_regions_needed;

	spin_unlock(&resv->lock);
	return add;
}
798 
/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  A number of new file_region structures is added to the cache as a
 * placeholder, for the subsequent region_add call to use. At least 1
 * file_region structure is added.
 *
 * out_regions_needed is the number of regions added to the
 * resv->adds_in_progress.  This value needs to be provided to a follow up call
 * to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater or equal to
 * zero.  -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t,
		       long *out_regions_needed)
{
	long chg = 0;

	spin_lock(&resv->lock);

	/* Count how many hugepages in this range are NOT represented. */
	chg = add_reservation_in_range(resv, f, t, NULL, NULL,
				       out_regions_needed);

	/* Guarantee at least one placeholder for the region_add to come. */
	if (*out_regions_needed == 0)
		*out_regions_needed = 1;

	/* On failure, allocate_file_region_entries() leaves the lock dropped. */
	if (allocate_file_region_entries(resv, *out_regions_needed))
		return -ENOMEM;

	resv->adds_in_progress += *out_regions_needed;

	spin_unlock(&resv->lock);
	return chg;
}
841 
/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter. regions_needed
 * is the value returned by the region_chg call, it is used to decrement
 * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t,
			 long regions_needed)
{
	spin_lock(&resv->lock);
	VM_BUG_ON(!resv->region_cache_count);
	resv->adds_in_progress -= regions_needed;
	spin_unlock(&resv->lock);
}
863 
/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *trg;
	struct file_region *nrg = NULL;
	long del = 0;

retry:
	spin_lock(&resv->lock);
	list_for_each_entry_safe(rg, trg, head, link) {
		/*
		 * Skip regions before the range to be deleted.  file_region
		 * ranges are normally of the form [from, to).  However, there
		 * may be a "placeholder" entry in the map which is of the form
		 * (from, to) with from == to.  Check for placeholder entries
		 * at the beginning of the range to be deleted.
		 */
		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
			continue;

		if (rg->from >= t)
			break;

		if (f > rg->from && t < rg->to) { /* Must split region */
			/*
			 * Check for an entry in the cache before dropping
			 * lock and attempting allocation.
			 */
			if (!nrg &&
			    resv->region_cache_count > resv->adds_in_progress) {
				nrg = list_first_entry(&resv->region_cache,
							struct file_region,
							link);
				list_del(&nrg->link);
				resv->region_cache_count--;
			}

			if (!nrg) {
				/* Drop the lock to sleep in kmalloc, then
				 * restart the walk with the entry in hand.
				 */
				spin_unlock(&resv->lock);
				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
				if (!nrg)
					return -ENOMEM;
				goto retry;
			}

			del += t - f;
			hugetlb_cgroup_uncharge_file_region(
				resv, rg, t - f, false);

			/* New entry for end of split region */
			nrg->from = t;
			nrg->to = rg->to;

			copy_hugetlb_cgroup_uncharge_info(nrg, rg);

			INIT_LIST_HEAD(&nrg->link);

			/* Original entry is trimmed */
			rg->to = f;

			list_add(&nrg->link, &rg->link);
			nrg = NULL;
			/* A split can only happen once; nothing right of t. */
			break;
		}

		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
			del += rg->to - rg->from;
			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    rg->to - rg->from, true);
			list_del(&rg->link);
			kfree(rg);
			continue;
		}

		if (f <= rg->from) {	/* Trim beginning of region */
			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    t - rg->from, false);

			del += t - rg->from;
			rg->from = t;
		} else {		/* Trim end of region */
			hugetlb_cgroup_uncharge_file_region(resv, rg,
							    rg->to - f, false);

			del += rg->to - f;
			rg->to = f;
		}
	}

	spin_unlock(&resv->lock);
	/* Free the spare entry if the retry path ended up not needing it. */
	kfree(nrg);
	return del;
}
971 
972 /*
973  * A rare out of memory error was encountered which prevented removal of
974  * the reserve map region for a page.  The huge page itself was free'ed
975  * and removed from the page cache.  This routine will adjust the subpool
976  * usage count, and the global reserve count if needed.  By incrementing
977  * these counts, the reserve map entry which could not be deleted will
978  * appear as a "reserved" entry instead of simply dangling with incorrect
979  * counts.
980  */
981 void hugetlb_fix_reserve_counts(struct inode *inode)
982 {
983 	struct hugepage_subpool *spool = subpool_inode(inode);
984 	long rsv_adjust;
985 	bool reserved = false;
986 
987 	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
988 	if (rsv_adjust > 0) {
989 		struct hstate *h = hstate_inode(inode);
990 
991 		if (!hugetlb_acct_memory(h, 1))
992 			reserved = true;
993 	} else if (!rsv_adjust) {
994 		reserved = true;
995 	}
996 
997 	if (!reserved)
998 		pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
999 }
1000 
1001 /*
1002  * Count and return the number of huge pages in the reserve map
1003  * that intersect with the range [f, t).
1004  */
1005 static long region_count(struct resv_map *resv, long f, long t)
1006 {
1007 	struct list_head *head = &resv->regions;
1008 	struct file_region *rg;
1009 	long chg = 0;
1010 
1011 	spin_lock(&resv->lock);
1012 	/* Locate each segment we overlap with, and count that overlap. */
1013 	list_for_each_entry(rg, head, link) {
1014 		long seg_from;
1015 		long seg_to;
1016 
1017 		if (rg->to <= f)
1018 			continue;
1019 		if (rg->from >= t)
1020 			break;
1021 
1022 		seg_from = max(rg->from, f);
1023 		seg_to = min(rg->to, t);
1024 
1025 		chg += seg_to - seg_from;
1026 	}
1027 	spin_unlock(&resv->lock);
1028 
1029 	return chg;
1030 }
1031 
1032 /*
1033  * Convert the address within this vma to the page offset within
1034  * the mapping, huge page units here.
1035  */
1036 static pgoff_t vma_hugecache_offset(struct hstate *h,
1037 			struct vm_area_struct *vma, unsigned long address)
1038 {
1039 	return ((address - vma->vm_start) >> huge_page_shift(h)) +
1040 			(vma->vm_pgoff >> huge_page_order(h));
1041 }
1042 
1043 /**
1044  * vma_kernel_pagesize - Page size granularity for this VMA.
1045  * @vma: The user mapping.
1046  *
1047  * Folios in this VMA will be aligned to, and at least the size of the
1048  * number of bytes returned by this function.
1049  *
1050  * Return: The default size of the folios allocated when backing a VMA.
1051  */
1052 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
1053 {
1054 	if (vma->vm_ops && vma->vm_ops->pagesize)
1055 		return vma->vm_ops->pagesize(vma);
1056 	return PAGE_SIZE;
1057 }
1058 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
1059 
1060 /*
1061  * Return the page size being used by the MMU to back a VMA. In the majority
1062  * of cases, the page size used by the kernel matches the MMU size. On
1063  * architectures where it differs, an architecture-specific 'strong'
1064  * version of this symbol is required.
1065  */
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	/* Default: the MMU page size matches the kernel's page size for the VMA. */
	return vma_kernel_pagesize(vma);
}
1070 
1071 /*
1072  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
1073  * bits of the reservation map pointer, which are always clear due to
1074  * alignment.
1075  */
1076 #define HPAGE_RESV_OWNER    (1UL << 0)
1077 #define HPAGE_RESV_UNMAPPED (1UL << 1)
1078 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
1079 
1080 /*
1081  * These helpers are used to track how many pages are reserved for
1082  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
1083  * is guaranteed to have their future faults succeed.
1084  *
1085  * With the exception of hugetlb_dup_vma_private() which is called at fork(),
1086  * the reserve counters are updated with the hugetlb_lock held. It is safe
1087  * to reset the VMA at fork() time as it is not in use yet and there is no
1088  * chance of the global counters getting corrupted as a result of the values.
1089  *
1090  * The private mapping reservation is represented in a subtly different
1091  * manner to a shared mapping.  A shared mapping has a region map associated
1092  * with the underlying file, this region map represents the backing file
1093  * pages which have ever had a reservation assigned which this persists even
1094  * after the page is instantiated.  A private mapping has a region map
1095  * associated with the original mmap which is attached to all VMAs which
1096  * reference it, this region map represents those offsets which have consumed
1097  * reservation ie. where pages have been instantiated.
1098  */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	/* Raw value: resv_map pointer with HPAGE_RESV_* flags in the low bits. */
	return (unsigned long)vma->vm_private_data;
}
1103 
/* Store the combined resv_map pointer / HPAGE_RESV_* flag word. */
static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	vma->vm_private_data = (void *)value;
}
1109 
/*
 * Record (or clear) where hugetlb cgroup reservation uncharges for this
 * resv_map should be directed.  Passing NULL for @h_cg or @h disables
 * cgroup accounting on the map.
 */
static void
resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
					  struct hugetlb_cgroup *h_cg,
					  struct hstate *h)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (h_cg && h) {
		resv_map->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		resv_map->pages_per_hpage = pages_per_huge_page(h);
		resv_map->css = &h_cg->css;
	} else {
		/* No cgroup accounting for this reservation map. */
		resv_map->reservation_counter = NULL;
		resv_map->pages_per_hpage = 0;
		resv_map->css = NULL;
	}
#endif
}
1128 
1129 struct resv_map *resv_map_alloc(void)
1130 {
1131 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
1132 	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
1133 
1134 	if (!resv_map || !rg) {
1135 		kfree(resv_map);
1136 		kfree(rg);
1137 		return NULL;
1138 	}
1139 
1140 	kref_init(&resv_map->refs);
1141 	spin_lock_init(&resv_map->lock);
1142 	INIT_LIST_HEAD(&resv_map->regions);
1143 	init_rwsem(&resv_map->rw_sema);
1144 
1145 	resv_map->adds_in_progress = 0;
1146 	/*
1147 	 * Initialize these to 0. On shared mappings, 0's here indicate these
1148 	 * fields don't do cgroup accounting. On private mappings, these will be
1149 	 * re-initialized to the proper values, to indicate that hugetlb cgroup
1150 	 * reservations are to be un-charged from here.
1151 	 */
1152 	resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
1153 
1154 	INIT_LIST_HEAD(&resv_map->region_cache);
1155 	list_add(&rg->link, &resv_map->region_cache);
1156 	resv_map->region_cache_count = 1;
1157 
1158 	return resv_map;
1159 }
1160 
/* Final kref release: free the resv_map and everything it owns. */
void resv_map_release(struct kref *ref)
{
	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
	struct list_head *head = &resv_map->region_cache;
	struct file_region *rg, *trg;

	/* Clear out any active regions before we release the map. */
	region_del(resv_map, 0, LONG_MAX);

	/* ... and any entries left in the cache */
	list_for_each_entry_safe(rg, trg, head, link) {
		list_del(&rg->link);
		kfree(rg);
	}

	/* All in-flight region additions must have completed by now. */
	VM_BUG_ON(resv_map->adds_in_progress);

	kfree(resv_map);
}
1180 
/* Return the reservation map stashed in the inode's own address_space. */
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
	/*
	 * At inode evict time, i_mapping may not point to the original
	 * address space within the inode.  This original address space
	 * contains the pointer to the resv_map.  So, always use the
	 * address space embedded within the inode.
	 * The VERY common case is inode->mapping == &inode->i_data but,
	 * this may not be true for device special inodes.
	 */
	return (struct resv_map *)(&inode->i_data)->i_private_data;
}
1193 
1194 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
1195 {
1196 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1197 	if (vma->vm_flags & VM_MAYSHARE) {
1198 		struct address_space *mapping = vma->vm_file->f_mapping;
1199 		struct inode *inode = mapping->host;
1200 
1201 		return inode_resv_map(inode);
1202 
1203 	} else {
1204 		return (struct resv_map *)(get_vma_private_data(vma) &
1205 							~HPAGE_RESV_MASK);
1206 	}
1207 }
1208 
/* OR HPAGE_RESV_* @flags into a private mapping's reservation data. */
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
	VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma);
	/* Flag bits only exist for MAP_PRIVATE reservations. */
	VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}
1216 
/* Store the private reservation map pointer in a VMA descriptor. */
static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
{
	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
	/* Only private hugetlb mappings carry a resv_map in private_data. */
	VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);

	desc->private_data = map;
}
1224 
/* OR HPAGE_RESV_* @flags into a VMA descriptor's private data word. */
static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
{
	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
	/* Flag bits only exist for MAP_PRIVATE reservations. */
	VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);

	desc->private_data = (void *)((unsigned long)desc->private_data | flags);
}
1232 
/* Test an HPAGE_RESV_* flag bit in the VMA's reservation data. */
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

	return (get_vma_private_data(vma) & flag) != 0;
}
1239 
/* Test an HPAGE_RESV_* flag bit in a VMA descriptor's private data. */
static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
{
	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));

	return ((unsigned long)desc->private_data) & flag;
}
1246 
/*
 * True when @vma owns a private reservation map: not a shared mapping,
 * carries a non-NULL map pointer, and has HPAGE_RESV_OWNER set.
 */
bool __vma_private_lock(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & VM_MAYSHARE) &&
		get_vma_private_data(vma) & ~HPAGE_RESV_MASK &&
		is_vma_resv_set(vma, HPAGE_RESV_OWNER);
}
1253 
1254 void hugetlb_dup_vma_private(struct vm_area_struct *vma)
1255 {
1256 	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1257 	/*
1258 	 * Clear vm_private_data
1259 	 * - For shared mappings this is a per-vma semaphore that may be
1260 	 *   allocated in a subsequent call to hugetlb_vm_op_open.
1261 	 *   Before clearing, make sure pointer is not associated with vma
1262 	 *   as this will leak the structure.  This is the case when called
1263 	 *   via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
1264 	 *   been called to allocate a new structure.
1265 	 * - For MAP_PRIVATE mappings, this is the reserve map which does
1266 	 *   not apply to children.  Faults generated by the children are
1267 	 *   not guaranteed to succeed, even if read-only.
1268 	 */
1269 	if (vma->vm_flags & VM_MAYSHARE) {
1270 		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
1271 
1272 		if (vma_lock && vma_lock->vma != vma)
1273 			vma->vm_private_data = NULL;
1274 	} else
1275 		vma->vm_private_data = NULL;
1276 }
1277 
1278 /*
1279  * Reset and decrement one ref on hugepage private reservation.
1280  * Called with mm->mmap_lock writer semaphore held.
1281  * This function should be only used by mremap and operate on
1282  * same sized vma. It should never come here with last ref on the
1283  * reservation.
1284  */
void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	/*
	 * Clear the old hugetlb private page reservation.
	 * It has already been transferred to new_vma.
	 *
	 * During a mremap() operation of a hugetlb vma we call move_vma()
	 * which copies vma into new_vma and unmaps vma. After the copy
	 * operation both new_vma and vma share a reference to the resv_map
	 * struct, and at that point vma is about to be unmapped. We don't
	 * want to return the reservation to the pool at unmap of vma because
	 * the reservation still lives on in new_vma, so simply decrement the
	 * ref here and remove the resv_map reference from this vma.
	 */
	struct resv_map *reservations = vma_resv_map(vma);

	if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		/* Drop the cgroup uncharge info along with our reference. */
		resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
		kref_put(&reservations->refs, resv_map_release);
	}

	/* Finally detach the (shared) map pointer from this vma. */
	hugetlb_dup_vma_private(vma);
}
1308 
1309 static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
1310 {
1311 	int nid = folio_nid(folio);
1312 
1313 	lockdep_assert_held(&hugetlb_lock);
1314 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
1315 
1316 	list_move(&folio->lru, &h->hugepage_freelists[nid]);
1317 	h->free_huge_pages++;
1318 	h->free_huge_pages_node[nid]++;
1319 	folio_set_hugetlb_freed(folio);
1320 }
1321 
1322 static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
1323 								int nid)
1324 {
1325 	struct folio *folio;
1326 	bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1327 
1328 	lockdep_assert_held(&hugetlb_lock);
1329 	list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) {
1330 		if (pin && !folio_is_longterm_pinnable(folio))
1331 			continue;
1332 
1333 		if (folio_test_hwpoison(folio))
1334 			continue;
1335 
1336 		if (is_migrate_isolate_page(&folio->page))
1337 			continue;
1338 
1339 		list_move(&folio->lru, &h->hugepage_activelist);
1340 		folio_ref_unfreeze(folio, 1);
1341 		folio_clear_hugetlb_freed(folio);
1342 		h->free_huge_pages--;
1343 		h->free_huge_pages_node[nid]--;
1344 		return folio;
1345 	}
1346 
1347 	return NULL;
1348 }
1349 
/*
 * Dequeue a free hugetlb folio from the first suitable node in @nid's
 * zonelist, restricted by @nmask and the caller's cpuset.
 */
static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask,
							int nid, nodemask_t *nmask)
{
	unsigned int cpuset_mems_cookie;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;
	int node = NUMA_NO_NODE;

	/* 'nid' should not be NUMA_NO_NODE. Try to catch any misuse of it and rectify. */
	if (nid == NUMA_NO_NODE)
		nid = numa_node_id();

	zonelist = node_zonelist(nid, gfp_mask);

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();
	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
		struct folio *folio;

		if (!cpuset_zone_allowed(zone, gfp_mask))
			continue;
		/*
		 * no need to ask again on the same node. Pool is node rather than
		 * zone aware
		 */
		if (zone_to_nid(zone) == node)
			continue;
		node = zone_to_nid(zone);

		folio = dequeue_hugetlb_folio_node_exact(h, node);
		if (folio)
			return folio;
	}
	/* Restart if the cpuset's mems_allowed changed while we scanned. */
	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;

	return NULL;
}
1389 
static unsigned long available_huge_pages(struct hstate *h)
{
	/* Free pages not already promised to existing reservations. */
	return h->free_huge_pages - h->resv_huge_pages;
}
1394 
1395 static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
1396 				struct vm_area_struct *vma,
1397 				unsigned long address, long gbl_chg)
1398 {
1399 	struct folio *folio = NULL;
1400 	struct mempolicy *mpol;
1401 	gfp_t gfp_mask;
1402 	nodemask_t *nodemask;
1403 	int nid;
1404 
1405 	/*
1406 	 * gbl_chg==1 means the allocation requires a new page that was not
1407 	 * reserved before.  Making sure there's at least one free page.
1408 	 */
1409 	if (gbl_chg && !available_huge_pages(h))
1410 		goto err;
1411 
1412 	gfp_mask = htlb_alloc_mask(h);
1413 	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1414 
1415 	if (mpol_is_preferred_many(mpol)) {
1416 		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
1417 							nid, nodemask);
1418 
1419 		/* Fallback to all nodes if page==NULL */
1420 		nodemask = NULL;
1421 	}
1422 
1423 	if (!folio)
1424 		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
1425 							nid, nodemask);
1426 
1427 	mpol_cond_put(mpol);
1428 	return folio;
1429 
1430 err:
1431 	return NULL;
1432 }
1433 
1434 /*
1435  * common helper functions for hstate_next_node_to_{alloc|free}.
1436  * We may have allocated or freed a huge page based on a different
1437  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
1438  * be outside of *nodes_allowed.  Ensure that we use an allowed
1439  * node for alloc or free.
1440  */
/* Advance @nid to the next node in @nodes_allowed, wrapping at the end. */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	nid = next_node_in(nid, *nodes_allowed);
	VM_BUG_ON(nid >= MAX_NUMNODES);

	return nid;
}
1448 
/* Return @nid if it is in @nodes_allowed, else the next allowed node. */
static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	if (!node_isset(nid, *nodes_allowed))
		nid = next_node_allowed(nid, nodes_allowed);
	return nid;
}
1455 
1456 /*
1457  * returns the previously saved node ["this node"] from which to
1458  * allocate a persistent huge page for the pool and advance the
1459  * next node from which to allocate, handling wrap at end of node
1460  * mask.
1461  */
static int hstate_next_node_to_alloc(int *next_node,
					nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	/* Use the saved node, clamped to the allowed mask ... */
	nid = get_valid_node_allowed(*next_node, nodes_allowed);
	/* ... and advance the cursor for the next caller. */
	*next_node = next_node_allowed(nid, nodes_allowed);

	return nid;
}
1474 
1475 /*
1476  * helper for remove_pool_hugetlb_folio() - return the previously saved
1477  * node ["this node"] from which to free a huge page.  Advance the
1478  * next node id whether or not we find a free huge page to free so
1479  * that the next attempt to free addresses the next node.
1480  */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	/* Use the hstate's saved node, clamped to the allowed mask ... */
	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
	/* ... and advance the per-hstate cursor unconditionally. */
	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);

	return nid;
}
1492 
/*
 * Visit each node in @mask exactly once (nodes_weight iterations),
 * yielding the round-robin allocation node in @node and advancing
 * *next_node as it goes.
 */
#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_alloc(next_node, mask)) || 1);	\
		nr_nodes--)

/*
 * Same round-robin walk for freeing: yields each allowed node once in
 * @node, advancing hs->next_nid_to_free on every step.
 */
#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
		nr_nodes--)
1504 
1505 #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1506 #ifdef CONFIG_CONTIG_ALLOC
/*
 * Allocate a gigantic folio of 2^@order pages, trying CMA first and
 * falling back to the contig allocator (unless CMA is exclusive).
 * On success the folio's refcount is frozen to zero.
 */
static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask,
		int nid, nodemask_t *nodemask)
{
	struct folio *folio;
	bool retried = false;

retry:
	folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask);
	if (!folio) {
		/* When CMA has exclusive use, do not fall back to buddy. */
		if (hugetlb_cma_exclusive_alloc())
			return NULL;

		folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask);
		if (!folio)
			return NULL;
	}

	/* Freeze refcount 1 -> 0; fails if someone holds an extra reference. */
	if (folio_ref_freeze(folio, 1))
		return folio;

	pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio));
	hugetlb_free_folio(folio);
	/* The extra reference may be transient: retry the allocation once. */
	if (!retried) {
		retried = true;
		goto retry;
	}
	return NULL;
}
1535 
1536 #else /* !CONFIG_CONTIG_ALLOC */
static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid,
					  nodemask_t *nodemask)
{
	/* Gigantic folios cannot be allocated without CONFIG_CONTIG_ALLOC. */
	return NULL;
}
1542 #endif /* CONFIG_CONTIG_ALLOC */
1543 
1544 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid,
					  nodemask_t *nodemask)
{
	/* This architecture has no gigantic page support at all. */
	return NULL;
}
1550 #endif
1551 
1552 /*
1553  * Remove hugetlb folio from lists.
1554  * If vmemmap exists for the folio, clear the hugetlb flag so that the
1555  * folio appears as just a compound page.  Otherwise, wait until after
1556  * allocating vmemmap to clear the flag.
1557  *
1558  * Must be called with hugetlb lock held.
1559  */
static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
							bool adjust_surplus)
{
	int nid = folio_nid(folio);

	/* Cgroup charges must already be gone before removal. */
	VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
	VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);

	lockdep_assert_held(&hugetlb_lock);
	/* Gigantic pages without runtime support are never taken apart. */
	if (hstate_is_gigantic_no_runtime(h))
		return;

	list_del(&folio->lru);

	/* A folio still flagged "freed" is accounted on the free lists. */
	if (folio_test_hugetlb_freed(folio)) {
		folio_clear_hugetlb_freed(folio);
		h->free_huge_pages--;
		h->free_huge_pages_node[nid]--;
	}
	if (adjust_surplus) {
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	}

	/*
	 * We can only clear the hugetlb flag after allocating vmemmap
	 * pages.  Otherwise, someone (memory error handling) may try to write
	 * to tail struct pages.
	 */
	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		__folio_clear_hugetlb(folio);

	h->nr_huge_pages--;
	h->nr_huge_pages_node[nid]--;
}
1595 
/*
 * Undo remove_hugetlb_folio(): put a vmemmap-optimized folio back into
 * the pool (as surplus if @adjust_surplus) and onto the free lists.
 * Must be called with hugetlb_lock held.
 */
static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
			     bool adjust_surplus)
{
	int nid = folio_nid(folio);

	VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);

	lockdep_assert_held(&hugetlb_lock);

	INIT_LIST_HEAD(&folio->lru);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;

	if (adjust_surplus) {
		h->surplus_huge_pages++;
		h->surplus_huge_pages_node[nid]++;
	}

	__folio_set_hugetlb(folio);
	folio_change_private(folio, NULL);
	/*
	 * We have to set hugetlb_vmemmap_optimized again as above
	 * folio_change_private(folio, NULL) cleared it.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	arch_clear_hugetlb_flags(folio);
	enqueue_hugetlb_folio(h, folio);
}
1625 
/*
 * Tear a folio out of hugetlb entirely and release it to the page
 * allocator: restore vmemmap if needed, clear the hugetlb flag, move
 * any hwpoison marking to the affected subpages and free the folio.
 */
static void __update_and_free_hugetlb_folio(struct hstate *h,
						struct folio *folio)
{
	bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);

	/* Gigantic pages without runtime support can never be freed here. */
	if (hstate_is_gigantic_no_runtime(h))
		return;

	/*
	 * If we don't know which subpages are hwpoisoned, we can't free
	 * the hugepage, so it's leaked intentionally.
	 */
	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
		return;

	/*
	 * If folio is not vmemmap optimized (!clear_flag), then the folio
	 * is no longer identified as a hugetlb page.  hugetlb_vmemmap_restore_folio
	 * can only be passed hugetlb pages and will BUG otherwise.
	 */
	if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) {
		spin_lock_irq(&hugetlb_lock);
		/*
		 * If we cannot allocate vmemmap pages, just refuse to free the
		 * page and put the page back on the hugetlb free list and treat
		 * as a surplus page.
		 */
		add_hugetlb_folio(h, folio, true);
		spin_unlock_irq(&hugetlb_lock);
		return;
	}

	/*
	 * If vmemmap pages were allocated above, then we need to clear the
	 * hugetlb flag under the hugetlb lock.
	 */
	if (folio_test_hugetlb(folio)) {
		spin_lock_irq(&hugetlb_lock);
		__folio_clear_hugetlb(folio);
		spin_unlock_irq(&hugetlb_lock);
	}

	/*
	 * Move PageHWPoison flag from head page to the raw error pages,
	 * which makes any healthy subpages reusable.
	 */
	if (unlikely(folio_test_hwpoison(folio)))
		folio_clear_hugetlb_hwpoison(folio);

	/* Give the folio a normal refcount of one before freeing it. */
	folio_ref_unfreeze(folio, 1);

	hugetlb_free_folio(folio);
}
1679 
1680 /*
1681  * As update_and_free_hugetlb_folio() can be called under any context, so we cannot
1682  * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
1683  * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
1684  * the vmemmap pages.
1685  *
1686  * free_hpage_workfn() locklessly retrieves the linked list of pages to be
1687  * freed and frees them one-by-one. As the page->mapping pointer is going
1688  * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
1689  * structure of a lockless linked list of huge pages to be freed.
1690  */
1691 static LLIST_HEAD(hpage_freelist);
1692 
/* Work item: drain hpage_freelist and free each deferred hugetlb folio. */
static void free_hpage_workfn(struct work_struct *work)
{
	struct llist_node *node;

	/* Take the whole pending list atomically; new frees start a fresh list. */
	node = llist_del_all(&hpage_freelist);

	while (node) {
		struct folio *folio;
		struct hstate *h;

		/* The llist_node was overlaid on folio->mapping; recover the folio. */
		folio = container_of((struct address_space **)node,
				     struct folio, mapping);
		node = node->next;
		folio->mapping = NULL;
		/*
		 * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
		 * folio_hstate() is going to trigger because a previous call to
		 * remove_hugetlb_folio() will clear the hugetlb bit, so do
		 * not use folio_hstate() directly.
		 */
		h = size_to_hstate(folio_size(folio));

		__update_and_free_hugetlb_folio(h, folio);

		cond_resched();
	}
}
1720 static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1721 
/* Wait for deferred frees; only vmemmap-optimizable hstates ever defer. */
static inline void flush_free_hpage_work(struct hstate *h)
{
	if (hugetlb_vmemmap_optimizable(h))
		flush_work(&free_hpage_work);
}
1727 
/*
 * Free @folio, deferring to a workqueue when called from atomic context
 * (@atomic) and vmemmap pages would need to be allocated first.
 */
static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
				 bool atomic)
{
	/* No vmemmap allocation needed, or we may block: free synchronously. */
	if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
		__update_and_free_hugetlb_folio(h, folio);
		return;
	}

	/*
	 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
	 *
	 * Only call schedule_work() if hpage_freelist is previously
	 * empty. Otherwise, schedule_work() had been called but the workfn
	 * hasn't retrieved the list yet.
	 */
	if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
		schedule_work(&free_hpage_work);
}
1746 
/*
 * Error path for a failed bulk vmemmap restore: free whatever folios did
 * get their vmemmap back, or salvage one folio individually so the bulk
 * operation can be retried by the caller.
 */
static void bulk_vmemmap_restore_error(struct hstate *h,
					struct list_head *folio_list,
					struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;

	if (!list_empty(non_hvo_folios)) {
		/*
		 * Free any restored hugetlb pages so that restore of the
		 * entire list can be retried.
		 * The idea is that in the common case of ENOMEM errors freeing
		 * hugetlb pages with vmemmap we will free up memory so that we
		 * can allocate vmemmap for more hugetlb pages.
		 */
		list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
			list_del(&folio->lru);
			spin_lock_irq(&hugetlb_lock);
			__folio_clear_hugetlb(folio);
			spin_unlock_irq(&hugetlb_lock);
			update_and_free_hugetlb_folio(h, folio, false);
			cond_resched();
		}
	} else {
		/*
		 * In the case where there are no folios which can be
		 * immediately freed, we loop through the list trying to restore
		 * vmemmap individually in the hope that someone elsewhere may
		 * have done something to cause success (such as freeing some
		 * memory).  If unable to restore a hugetlb page, the hugetlb
		 * page is made a surplus page and removed from the list.
		 * If are able to restore vmemmap and free one hugetlb page, we
		 * quit processing the list to retry the bulk operation.
		 */
		list_for_each_entry_safe(folio, t_folio, folio_list, lru)
			if (hugetlb_vmemmap_restore_folio(h, folio)) {
				/* Restore failed: park the folio as a surplus page. */
				list_del(&folio->lru);
				spin_lock_irq(&hugetlb_lock);
				add_hugetlb_folio(h, folio, true);
				spin_unlock_irq(&hugetlb_lock);
			} else {
				/* Restore succeeded: free this one and retry the bulk. */
				list_del(&folio->lru);
				spin_lock_irq(&hugetlb_lock);
				__folio_clear_hugetlb(folio);
				spin_unlock_irq(&hugetlb_lock);
				update_and_free_hugetlb_folio(h, folio, false);
				cond_resched();
				break;
			}
	}
}
1797 
/*
 * Free every folio on @folio_list, restoring vmemmap in bulk first and
 * retrying via the error path until the whole list has been processed.
 */
static void update_and_free_pages_bulk(struct hstate *h,
						struct list_head *folio_list)
{
	long ret;
	struct folio *folio, *t_folio;
	LIST_HEAD(non_hvo_folios);

	/*
	 * First allocate required vmemmmap (if necessary) for all folios.
	 * Carefully handle errors and free up any available hugetlb pages
	 * in an effort to make forward progress.
	 */
retry:
	ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios);
	if (ret < 0) {
		bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios);
		goto retry;
	}

	/*
	 * At this point, list should be empty, ret should be >= 0 and there
	 * should only be pages on the non_hvo_folios list.
	 * Do note that the non_hvo_folios list could be empty.
	 * Without HVO enabled, ret will be 0 and there is no need to call
	 * __folio_clear_hugetlb as this was done previously.
	 */
	VM_WARN_ON(!list_empty(folio_list));
	VM_WARN_ON(ret < 0);
	if (!list_empty(&non_hvo_folios) && ret) {
		spin_lock_irq(&hugetlb_lock);
		list_for_each_entry(folio, &non_hvo_folios, lru)
			__folio_clear_hugetlb(folio);
		spin_unlock_irq(&hugetlb_lock);
	}

	/* All folios now have their vmemmap; free them one by one. */
	list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) {
		update_and_free_hugetlb_folio(h, folio, false);
		cond_resched();
	}
}
1838 
/* Map a huge page size in bytes to its hstate, or NULL if none matches. */
struct hstate *size_to_hstate(unsigned long size)
{
	struct hstate *h;

	/* Linear scan is fine: the number of hstates is tiny. */
	for_each_hstate(h) {
		if (huge_page_size(h) == size)
			return h;
	}
	return NULL;
}
1849 
void free_huge_folio(struct folio *folio)
{
	/*
	 * Can't pass hstate in here because it is called from the
	 * generic mm code.
	 */
	struct hstate *h = folio_hstate(folio);
	int nid = folio_nid(folio);
	struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
	bool restore_reserve;
	unsigned long flags;

	/* The last reference and last mapping must be gone by now. */
	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
	VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);

	hugetlb_set_folio_subpool(folio, NULL);
	if (folio_test_anon(folio))
		__ClearPageAnonExclusive(&folio->page);
	folio->mapping = NULL;
	restore_reserve = folio_test_hugetlb_restore_reserve(folio);
	folio_clear_hugetlb_restore_reserve(folio);

	/*
	 * If HPageRestoreReserve was set on page, page allocation consumed a
	 * reservation.  If the page was associated with a subpool, there
	 * would have been a page reserved in the subpool before allocation
	 * via hugepage_subpool_get_pages().  Since we are 'restoring' the
	 * reservation, do not call hugepage_subpool_put_pages() as this will
	 * remove the reserved page from the subpool.
	 */
	if (!restore_reserve) {
		/*
		 * A return code of zero implies that the subpool will be
		 * under its minimum size if the reservation is not restored
		 * after page is free.  Therefore, force restore_reserve
		 * operation.
		 */
		if (hugepage_subpool_put_pages(spool, 1) == 0)
			restore_reserve = true;
	}

	spin_lock_irqsave(&hugetlb_lock, flags);
	folio_clear_hugetlb_migratable(folio);
	hugetlb_cgroup_uncharge_folio(hstate_index(h),
				     pages_per_huge_page(h), folio);
	hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
					  pages_per_huge_page(h), folio);
	lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h));
	mem_cgroup_uncharge(folio);
	if (restore_reserve)
		h->resv_huge_pages++;

	if (folio_test_hugetlb_temporary(folio)) {
		/* Temporary folios are freed outright, not returned to the pool. */
		remove_hugetlb_folio(h, folio, false);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
		update_and_free_hugetlb_folio(h, folio, true);
	} else if (h->surplus_huge_pages_node[nid]) {
		/* remove the page from active list */
		remove_hugetlb_folio(h, folio, true);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
		update_and_free_hugetlb_folio(h, folio, true);
	} else {
		/* Return the folio to the free lists for reuse. */
		arch_clear_hugetlb_flags(folio);
		enqueue_hugetlb_folio(h, folio);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
	}
}
1917 
1918 /*
1919  * Must be called with the hugetlb lock held
1920  */
/* Account a freshly created hugetlb folio in the global and per-node totals. */
static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
	lockdep_assert_held(&hugetlb_lock);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[folio_nid(folio)]++;
}
1927 
/* Mark a freshly allocated folio as hugetlb and clear its pool metadata. */
static void init_new_hugetlb_folio(struct folio *folio)
{
	__folio_set_hugetlb(folio);
	INIT_LIST_HEAD(&folio->lru);
	hugetlb_set_folio_subpool(folio, NULL);
	set_hugetlb_cgroup(folio, NULL);
	set_hugetlb_cgroup_rsvd(folio, NULL);
}
1936 
1937 /*
1938  * Find and lock address space (mapping) in write mode.
1939  *
1940  * Upon entry, the folio is locked which means that folio_mapping() is
1941  * stable.  Due to locking order, we can only trylock_write.  If we can
1942  * not get the lock, simply return NULL to caller.
1943  */
1944 struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
1945 {
1946 	struct address_space *mapping = folio_mapping(folio);
1947 
1948 	if (!mapping)
1949 		return mapping;
1950 
1951 	if (i_mmap_trylock_write(mapping))
1952 		return mapping;
1953 
1954 	return NULL;
1955 }
1956 
1957 static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask,
1958 		int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry)
1959 {
1960 	struct folio *folio;
1961 	bool alloc_try_hard = true;
1962 
1963 	/*
1964 	 * By default we always try hard to allocate the folio with
1965 	 * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating folios in
1966 	 * a loop (to adjust global huge page counts) and previous allocation
1967 	 * failed, do not continue to try hard on the same node.  Use the
1968 	 * node_alloc_noretry bitmap to manage this state information.
1969 	 */
1970 	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1971 		alloc_try_hard = false;
1972 	if (alloc_try_hard)
1973 		gfp_mask |= __GFP_RETRY_MAYFAIL;
1974 
1975 	folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask);
1976 
1977 	/*
1978 	 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a
1979 	 * folio this indicates an overall state change.  Clear bit so
1980 	 * that we resume normal 'try hard' allocations.
1981 	 */
1982 	if (node_alloc_noretry && folio && !alloc_try_hard)
1983 		node_clear(nid, *node_alloc_noretry);
1984 
1985 	/*
1986 	 * If we tried hard to get a folio but failed, set bit so that
1987 	 * subsequent attempts will not try as hard until there is an
1988 	 * overall state change.
1989 	 */
1990 	if (node_alloc_noretry && !folio && alloc_try_hard)
1991 		node_set(nid, *node_alloc_noretry);
1992 
1993 	if (!folio) {
1994 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1995 		return NULL;
1996 	}
1997 
1998 	__count_vm_event(HTLB_BUDDY_PGALLOC);
1999 	return folio;
2000 }
2001 
2002 static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
2003 		gfp_t gfp_mask, int nid, nodemask_t *nmask,
2004 		nodemask_t *node_alloc_noretry)
2005 {
2006 	struct folio *folio;
2007 	int order = huge_page_order(h);
2008 
2009 	if (nid == NUMA_NO_NODE)
2010 		nid = numa_mem_id();
2011 
2012 	if (order_is_gigantic(order))
2013 		folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask);
2014 	else
2015 		folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask,
2016 						  node_alloc_noretry);
2017 	if (folio)
2018 		init_new_hugetlb_folio(folio);
2019 	return folio;
2020 }
2021 
2022 /*
2023  * Common helper to allocate a fresh hugetlb folio. All specific allocators
2024  * should use this function to get new hugetlb folio
2025  *
2026  * Note that returned folio is 'frozen':  ref count of head page and all tail
2027  * pages is zero, and the accounting must be done in the caller.
2028  */
2029 static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
2030 		gfp_t gfp_mask, int nid, nodemask_t *nmask)
2031 {
2032 	struct folio *folio;
2033 
2034 	folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
2035 	if (folio)
2036 		hugetlb_vmemmap_optimize_folio(h, folio);
2037 	return folio;
2038 }
2039 
2040 static void prep_and_add_allocated_folios(struct hstate *h,
2041 					struct list_head *folio_list)
2042 {
2043 	unsigned long flags;
2044 	struct folio *folio, *tmp_f;
2045 
2046 	/* Send list for bulk vmemmap optimization processing */
2047 	hugetlb_vmemmap_optimize_folios(h, folio_list);
2048 
2049 	/* Add all new pool pages to free lists in one lock cycle */
2050 	spin_lock_irqsave(&hugetlb_lock, flags);
2051 	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
2052 		account_new_hugetlb_folio(h, folio);
2053 		enqueue_hugetlb_folio(h, folio);
2054 	}
2055 	spin_unlock_irqrestore(&hugetlb_lock, flags);
2056 }
2057 
2058 /*
2059  * Allocates a fresh hugetlb page in a node interleaved manner.  The page
2060  * will later be added to the appropriate hugetlb pool.
2061  */
2062 static struct folio *alloc_pool_huge_folio(struct hstate *h,
2063 					nodemask_t *nodes_allowed,
2064 					nodemask_t *node_alloc_noretry,
2065 					int *next_node)
2066 {
2067 	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2068 	int nr_nodes, node;
2069 
2070 	for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) {
2071 		struct folio *folio;
2072 
2073 		folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node,
2074 					nodes_allowed, node_alloc_noretry);
2075 		if (folio)
2076 			return folio;
2077 	}
2078 
2079 	return NULL;
2080 }
2081 
2082 /*
2083  * Remove huge page from pool from next node to free.  Attempt to keep
2084  * persistent huge pages more or less balanced over allowed nodes.
2085  * This routine only 'removes' the hugetlb page.  The caller must make
2086  * an additional call to free the page to low level allocators.
2087  * Called with hugetlb_lock locked.
2088  */
2089 static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
2090 		nodemask_t *nodes_allowed, bool acct_surplus)
2091 {
2092 	int nr_nodes, node;
2093 	struct folio *folio = NULL;
2094 
2095 	lockdep_assert_held(&hugetlb_lock);
2096 	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2097 		/*
2098 		 * If we're returning unused surplus pages, only examine
2099 		 * nodes with surplus pages.
2100 		 */
2101 		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
2102 		    !list_empty(&h->hugepage_freelists[node])) {
2103 			folio = list_entry(h->hugepage_freelists[node].next,
2104 					  struct folio, lru);
2105 			remove_hugetlb_folio(h, folio, acct_surplus);
2106 			break;
2107 		}
2108 	}
2109 
2110 	return folio;
2111 }
2112 
2113 /*
2114  * Dissolve a given free hugetlb folio into free buddy pages. This function
2115  * does nothing for in-use hugetlb folios and non-hugetlb folios.
2116  * This function returns values like below:
2117  *
2118  *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
2119  *           when the system is under memory pressure and the feature of
2120  *           freeing unused vmemmap pages associated with each hugetlb page
2121  *           is enabled.
2122  *  -EBUSY:  failed to dissolved free hugepages or the hugepage is in-use
2123  *           (allocated or reserved.)
2124  *       0:  successfully dissolved free hugepages or the page is not a
2125  *           hugepage (considered as already dissolved)
2126  */
int dissolve_free_hugetlb_folio(struct folio *folio)
{
	int rc = -EBUSY;

retry:
	/* Not to disrupt normal path by vainly holding hugetlb_lock */
	if (!folio_test_hugetlb(folio))
		return 0;

	spin_lock_irq(&hugetlb_lock);
	/* Re-check under the lock: the folio may have just been dissolved. */
	if (!folio_test_hugetlb(folio)) {
		rc = 0;
		goto out;
	}

	/* A zero ref count means the folio is free (or about to be freed). */
	if (!folio_ref_count(folio)) {
		struct hstate *h = folio_hstate(folio);
		bool adjust_surplus = false;

		/* Do not dissolve pages needed to satisfy reservations. */
		if (!available_huge_pages(h))
			goto out;

		/*
		 * We should make sure that the page is already on the free list
		 * when it is dissolved.
		 */
		if (unlikely(!folio_test_hugetlb_freed(folio))) {
			spin_unlock_irq(&hugetlb_lock);
			cond_resched();

			/*
			 * Theoretically, we should return -EBUSY when we
			 * encounter this race. In fact, we have a chance
			 * to successfully dissolve the page if we do a
			 * retry. Because the race window is quite small.
			 * If we seize this opportunity, it is an optimization
			 * for increasing the success rate of dissolving page.
			 */
			goto retry;
		}

		if (h->surplus_huge_pages_node[folio_nid(folio)])
			adjust_surplus = true;
		remove_hugetlb_folio(h, folio, adjust_surplus);
		h->max_huge_pages--;
		spin_unlock_irq(&hugetlb_lock);

		/*
		 * Normally update_and_free_hugetlb_folio will allocate required vmemmap
		 * before freeing the page.  update_and_free_hugetlb_folio will fail to
		 * free the page if it can not allocate required vmemmap.  We
		 * need to adjust max_huge_pages if the page is not freed.
		 * Attempt to allocate vmemmap here so that we can take
		 * appropriate action on failure.
		 *
		 * The folio_test_hugetlb check here is because
		 * remove_hugetlb_folio will clear hugetlb folio flag for
		 * non-vmemmap optimized hugetlb folios.
		 */
		if (folio_test_hugetlb(folio)) {
			rc = hugetlb_vmemmap_restore_folio(h, folio);
			if (rc) {
				/*
				 * vmemmap restore failed: undo the removal and
				 * report the error to the caller.
				 */
				spin_lock_irq(&hugetlb_lock);
				add_hugetlb_folio(h, folio, adjust_surplus);
				h->max_huge_pages++;
				goto out;
			}
		} else
			rc = 0;

		update_and_free_hugetlb_folio(h, folio, false);
		return rc;
	}
out:
	spin_unlock_irq(&hugetlb_lock);
	return rc;
}
2204 
2205 /*
2206  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
2207  * make specified memory blocks removable from the system.
2208  * Note that this will dissolve a free gigantic hugepage completely, if any
2209  * part of it lies within the given range.
2210  * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
2211  * free hugetlb folios that were dissolved before that error are lost.
2212  */
2213 int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
2214 {
2215 	unsigned long pfn;
2216 	struct folio *folio;
2217 	int rc = 0;
2218 	unsigned int order;
2219 	struct hstate *h;
2220 
2221 	if (!hugepages_supported())
2222 		return rc;
2223 
2224 	order = huge_page_order(&default_hstate);
2225 	for_each_hstate(h)
2226 		order = min(order, huge_page_order(h));
2227 
2228 	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
2229 		folio = pfn_folio(pfn);
2230 		rc = dissolve_free_hugetlb_folio(folio);
2231 		if (rc)
2232 			break;
2233 	}
2234 
2235 	return rc;
2236 }
2237 
2238 /*
2239  * Allocates a fresh surplus page from the page allocator.
2240  */
static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
				gfp_t gfp_mask,	int nid, nodemask_t *nmask)
{
	struct folio *folio = NULL;

	/* Gigantic pages cannot be allocated as surplus at runtime. */
	if (hstate_is_gigantic_no_runtime(h))
		return NULL;

	/* Cheap pre-check before paying for a fresh allocation. */
	spin_lock_irq(&hugetlb_lock);
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
		goto out_unlock;
	spin_unlock_irq(&hugetlb_lock);

	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
	if (!folio)
		return NULL;

	spin_lock_irq(&hugetlb_lock);
	/*
	 * nr_huge_pages needs to be adjusted within the same lock cycle
	 * as surplus_pages, otherwise it might confuse
	 * persistent_huge_pages() momentarily.
	 */
	account_new_hugetlb_folio(h, folio);

	/*
	 * We could have raced with the pool size change.
	 * Double check that and simply deallocate the new page
	 * if we would end up overcommitting the surpluses. Abuse
	 * temporary page to workaround the nasty free_huge_folio
	 * codeflow
	 */
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
		folio_set_hugetlb_temporary(folio);
		spin_unlock_irq(&hugetlb_lock);
		free_huge_folio(folio);
		return NULL;
	}

	h->surplus_huge_pages++;
	h->surplus_huge_pages_node[folio_nid(folio)]++;

out_unlock:
	spin_unlock_irq(&hugetlb_lock);

	return folio;
}
2288 
2289 static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
2290 				     int nid, nodemask_t *nmask)
2291 {
2292 	struct folio *folio;
2293 
2294 	if (hstate_is_gigantic(h))
2295 		return NULL;
2296 
2297 	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
2298 	if (!folio)
2299 		return NULL;
2300 
2301 	spin_lock_irq(&hugetlb_lock);
2302 	account_new_hugetlb_folio(h, folio);
2303 	spin_unlock_irq(&hugetlb_lock);
2304 
2305 	/* fresh huge pages are frozen */
2306 	folio_ref_unfreeze(folio, 1);
2307 	/*
2308 	 * We do not account these pages as surplus because they are only
2309 	 * temporary and will be released properly on the last reference
2310 	 */
2311 	folio_set_hugetlb_temporary(folio);
2312 
2313 	return folio;
2314 }
2315 
2316 /*
2317  * Use the VMA's mpolicy to allocate a huge page from the buddy.
2318  */
2319 static
2320 struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
2321 		struct vm_area_struct *vma, unsigned long addr)
2322 {
2323 	struct folio *folio = NULL;
2324 	struct mempolicy *mpol;
2325 	gfp_t gfp_mask = htlb_alloc_mask(h);
2326 	int nid;
2327 	nodemask_t *nodemask;
2328 
2329 	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
2330 	if (mpol_is_preferred_many(mpol)) {
2331 		gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2332 
2333 		folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask);
2334 
2335 		/* Fallback to all nodes if page==NULL */
2336 		nodemask = NULL;
2337 	}
2338 
2339 	if (!folio)
2340 		folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask);
2341 	mpol_cond_put(mpol);
2342 	return folio;
2343 }
2344 
2345 struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
2346 		nodemask_t *nmask, gfp_t gfp_mask)
2347 {
2348 	struct folio *folio;
2349 
2350 	spin_lock_irq(&hugetlb_lock);
2351 	if (!h->resv_huge_pages) {
2352 		spin_unlock_irq(&hugetlb_lock);
2353 		return NULL;
2354 	}
2355 
2356 	folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid,
2357 					       nmask);
2358 	if (folio)
2359 		h->resv_huge_pages--;
2360 
2361 	spin_unlock_irq(&hugetlb_lock);
2362 	return folio;
2363 }
2364 
2365 /* folio migration callback function */
2366 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
2367 		nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback)
2368 {
2369 	spin_lock_irq(&hugetlb_lock);
2370 	if (available_huge_pages(h)) {
2371 		struct folio *folio;
2372 
2373 		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
2374 						preferred_nid, nmask);
2375 		if (folio) {
2376 			spin_unlock_irq(&hugetlb_lock);
2377 			return folio;
2378 		}
2379 	}
2380 	spin_unlock_irq(&hugetlb_lock);
2381 
2382 	/* We cannot fallback to other nodes, as we could break the per-node pool. */
2383 	if (!allow_alloc_fallback)
2384 		gfp_mask |= __GFP_THISNODE;
2385 
2386 	return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
2387 }
2388 
/*
 * Return the current task's MPOL_BIND nodemask when it should constrain
 * hugetlb allocations, or NULL when no policy restriction applies.
 */
static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
{
#ifdef CONFIG_NUMA
	struct mempolicy *mpol = get_task_policy(current);

	/*
	 * Only enforce MPOL_BIND policy which overlaps with cpuset policy
	 * (from policy_nodemask) specifically for hugetlb case
	 */
	if (mpol->mode == MPOL_BIND &&
		(apply_policy_zone(mpol, gfp_zone(gfp)) &&
		 cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
		return &mpol->nodes;
#endif
	/* !CONFIG_NUMA, or the policy does not apply: no mask. */
	return NULL;
}
2405 
2406 /*
2407  * Increase the hugetlb pool such that it can accommodate a reservation
2408  * of size 'delta'.
2409  */
static int gather_surplus_pages(struct hstate *h, long delta)
	__must_hold(&hugetlb_lock)
{
	LIST_HEAD(surplus_list);
	struct folio *folio, *tmp;
	int ret;
	long i;
	long needed, allocated;
	bool alloc_ok = true;
	nodemask_t *mbind_nodemask, alloc_nodemask;

	/* Constrain allocations by MPOL_BIND (if any) and the cpuset mask. */
	mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
	if (mbind_nodemask)
		nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed);
	else
		alloc_nodemask = cpuset_current_mems_allowed;

	lockdep_assert_held(&hugetlb_lock);
	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
	if (needed <= 0) {
		/* Enough free pages already exist; just commit the delta. */
		h->resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;

	ret = -ENOMEM;
retry:
	/* Drop the lock while calling the page allocator (it may sleep). */
	spin_unlock_irq(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		folio = NULL;

		/*
		 * It is okay to use NUMA_NO_NODE because we use numa_mem_id()
		 * down the road to pick the current node if that is the case.
		 */
		folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
						    NUMA_NO_NODE, &alloc_nodemask);
		if (!folio) {
			alloc_ok = false;
			break;
		}
		list_add(&folio->lru, &surplus_list);
		cond_resched();
	}
	allocated += i;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock_irq(&hugetlb_lock);
	needed = (h->resv_huge_pages + delta) -
			(h->free_huge_pages + allocated);
	if (needed > 0) {
		if (alloc_ok)
			goto retry;
		/*
		 * We were not able to allocate enough pages to
		 * satisfy the entire reservation so we free what
		 * we've allocated so far.
		 */
		goto free;
	}
	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	h->resv_huge_pages += delta;
	ret = 0;

	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(folio, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		/* Add the page to the hugetlb allocator */
		enqueue_hugetlb_folio(h, folio);
	}
free:
	spin_unlock_irq(&hugetlb_lock);

	/*
	 * Free unnecessary surplus pages to the buddy allocator.
	 * Pages have no ref count, call free_huge_folio directly.
	 */
	list_for_each_entry_safe(folio, tmp, &surplus_list, lru)
		free_huge_folio(folio);
	spin_lock_irq(&hugetlb_lock);

	return ret;
}
2506 
2507 /*
2508  * This routine has two main purposes:
2509  * 1) Decrement the reservation count (resv_huge_pages) by the value passed
2510  *    in unused_resv_pages.  This corresponds to the prior adjustments made
2511  *    to the associated reservation map.
2512  * 2) Free any unused surplus pages that may have been allocated to satisfy
2513  *    the reservation.  As many as unused_resv_pages may be freed.
2514  */
2515 static void return_unused_surplus_pages(struct hstate *h,
2516 					unsigned long unused_resv_pages)
2517 {
2518 	unsigned long nr_pages;
2519 	LIST_HEAD(page_list);
2520 
2521 	lockdep_assert_held(&hugetlb_lock);
2522 	/* Uncommit the reservation */
2523 	h->resv_huge_pages -= unused_resv_pages;
2524 
2525 	if (hstate_is_gigantic_no_runtime(h))
2526 		goto out;
2527 
2528 	/*
2529 	 * Part (or even all) of the reservation could have been backed
2530 	 * by pre-allocated pages. Only free surplus pages.
2531 	 */
2532 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2533 
2534 	/*
2535 	 * We want to release as many surplus pages as possible, spread
2536 	 * evenly across all nodes with memory. Iterate across these nodes
2537 	 * until we can no longer free unreserved surplus pages. This occurs
2538 	 * when the nodes with surplus pages have no free pages.
2539 	 * remove_pool_hugetlb_folio() will balance the freed pages across the
2540 	 * on-line nodes with memory and will handle the hstate accounting.
2541 	 */
2542 	while (nr_pages--) {
2543 		struct folio *folio;
2544 
2545 		folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1);
2546 		if (!folio)
2547 			goto out;
2548 
2549 		list_add(&folio->lru, &page_list);
2550 	}
2551 
2552 out:
2553 	spin_unlock_irq(&hugetlb_lock);
2554 	update_and_free_pages_bulk(h, &page_list);
2555 	spin_lock_irq(&hugetlb_lock);
2556 }
2557 
2558 
2559 /*
2560  * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
2561  * are used by the huge page allocation routines to manage reservations.
2562  *
2563  * vma_needs_reservation is called to determine if the huge page at addr
2564  * within the vma has an associated reservation.  If a reservation is
2565  * needed, the value 1 is returned.  The caller is then responsible for
2566  * managing the global reservation and subpool usage counts.  After
2567  * the huge page has been allocated, vma_commit_reservation is called
2568  * to add the page to the reservation map.  If the page allocation fails,
2569  * the reservation must be ended instead of committed.  vma_end_reservation
2570  * is called in such cases.
2571  *
2572  * In the normal case, vma_commit_reservation returns the same value
2573  * as the preceding vma_needs_reservation call.  The only time this
2574  * is not the case is if a reserve map was changed between calls.  It
2575  * is the responsibility of the caller to notice the difference and
2576  * take appropriate action.
2577  *
2578  * vma_add_reservation is used in error paths where a reservation must
2579  * be restored when a newly allocated huge page must be freed.  It is
2580  * to be called after calling vma_needs_reservation to determine if a
2581  * reservation exists.
2582  *
2583  * vma_del_reservation is used in error paths where an entry in the reserve
2584  * map was created during huge page allocation and must be removed.  It is to
2585  * be called after calling vma_needs_reservation to determine if a reservation
2586  * exists.
2587  */
enum vma_resv_mode {
	VMA_NEEDS_RESV,		/* query whether a reservation is needed */
	VMA_COMMIT_RESV,	/* commit a reservation after allocation */
	VMA_END_RESV,		/* abort a reservation after alloc failure */
	VMA_ADD_RESV,		/* error path: restore a reservation entry */
	VMA_DEL_RESV,		/* error path: remove a created map entry */
};
static long __vma_reservation_common(struct hstate *h,
				struct vm_area_struct *vma, unsigned long addr,
				enum vma_resv_mode mode)
{
	struct resv_map *resv;
	pgoff_t idx;
	long ret;
	long dummy_out_regions_needed;

	/* No reserve map: callers treat this as "reservation exists". */
	resv = vma_resv_map(vma);
	if (!resv)
		return 1;

	idx = vma_hugecache_offset(h, vma, addr);
	switch (mode) {
	case VMA_NEEDS_RESV:
		ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
		/* We assume that vma_reservation_* routines always operate on
		 * 1 page, and that adding to resv map a 1 page entry can only
		 * ever require 1 region.
		 */
		VM_BUG_ON(dummy_out_regions_needed != 1);
		break;
	case VMA_COMMIT_RESV:
		ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
		/* region_add calls of range 1 should never fail. */
		VM_BUG_ON(ret < 0);
		break;
	case VMA_END_RESV:
		region_abort(resv, idx, idx + 1, 1);
		ret = 0;
		break;
	case VMA_ADD_RESV:
		if (vma->vm_flags & VM_MAYSHARE) {
			ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
			/* region_add calls of range 1 should never fail. */
			VM_BUG_ON(ret < 0);
		} else {
			region_abort(resv, idx, idx + 1, 1);
			ret = region_del(resv, idx, idx + 1);
		}
		break;
	case VMA_DEL_RESV:
		if (vma->vm_flags & VM_MAYSHARE) {
			region_abort(resv, idx, idx + 1, 1);
			ret = region_del(resv, idx, idx + 1);
		} else {
			ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
			/* region_add calls of range 1 should never fail. */
			VM_BUG_ON(ret < 0);
		}
		break;
	default:
		BUG();
	}

	if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
		return ret;
	/*
	 * We know private mapping must have HPAGE_RESV_OWNER set.
	 *
	 * In most cases, reserves always exist for private mappings.
	 * However, a file associated with mapping could have been
	 * hole punched or truncated after reserves were consumed.
	 * A subsequent fault on such a range will not use reserves.
	 * Subtle - The reserve map for private mappings has the
	 * opposite meaning than that of shared mappings.  If NO
	 * entry is in the reserve map, it means a reservation exists.
	 * If an entry exists in the reserve map, it means the
	 * reservation has already been consumed.  As a result, the
	 * return value of this routine is the opposite of the
	 * value returned from reserve map manipulation routines above.
	 */
	if (ret > 0)
		return 0;
	if (ret == 0)
		return 1;
	return ret;
}
2674 
/* Query whether the huge page at @addr needs a new reservation. */
static long vma_needs_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
}
2680 
/* Commit a previously checked reservation after a successful allocation. */
static long vma_commit_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
}
2686 
/* End (abort) a reservation when the page allocation failed. */
static void vma_end_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
}
2692 
/* Error path: restore a reservation for a huge page that must be freed. */
static long vma_add_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}
2698 
/* Error path: remove a reserve map entry created during allocation. */
static long vma_del_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
}
2704 
2705 /*
2706  * This routine is called to restore reservation information on error paths.
2707  * It should ONLY be called for folios allocated via alloc_hugetlb_folio(),
2708  * and the hugetlb mutex should remain held when calling this routine.
2709  *
2710  * It handles two specific cases:
2711  * 1) A reservation was in place and the folio consumed the reservation.
2712  *    hugetlb_restore_reserve is set in the folio.
2713  * 2) No reservation was in place for the page, so hugetlb_restore_reserve is
2714  *    not set.  However, alloc_hugetlb_folio always updates the reserve map.
2715  *
2716  * In case 1, free_huge_folio later in the error path will increment the
2717  * global reserve count.  But, free_huge_folio does not have enough context
2718  * to adjust the reservation map.  This case deals primarily with private
2719  * mappings.  Adjust the reserve map here to be consistent with global
2720  * reserve count adjustments to be made by free_huge_folio.  Make sure the
2721  * reserve map indicates there is a reservation present.
2722  *
2723  * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
2724  */
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
			unsigned long address, struct folio *folio)
{
	/* rc < 0: reserve map OOM; otherwise see __vma_reservation_common. */
	long rc = vma_needs_reservation(h, vma, address);

	/* Case 1 (see above): the folio consumed a reservation. */
	if (folio_test_hugetlb_restore_reserve(folio)) {
		if (unlikely(rc < 0))
			/*
			 * Rare out of memory condition in reserve map
			 * manipulation.  Clear hugetlb_restore_reserve so
			 * that global reserve count will not be incremented
			 * by free_huge_folio.  This will make it appear
			 * as though the reservation for this folio was
			 * consumed.  This may prevent the task from
			 * faulting in the folio at a later time.  This
			 * is better than inconsistent global huge page
			 * accounting of reserve counts.
			 */
			folio_clear_hugetlb_restore_reserve(folio);
		else if (rc)
			(void)vma_add_reservation(h, vma, address);
		else
			vma_end_reservation(h, vma, address);
	} else {
		/* Case 2: no reservation was consumed; undo map changes. */
		if (!rc) {
			/*
			 * This indicates there is an entry in the reserve map
			 * not added by alloc_hugetlb_folio.  We know it was added
			 * before the alloc_hugetlb_folio call, otherwise
			 * hugetlb_restore_reserve would be set on the folio.
			 * Remove the entry so that a subsequent allocation
			 * does not consume a reservation.
			 */
			rc = vma_del_reservation(h, vma, address);
			if (rc < 0)
				/*
				 * VERY rare out of memory condition.  Since
				 * we can not delete the entry, set
				 * hugetlb_restore_reserve so that the reserve
				 * count will be incremented when the folio
				 * is freed.  This reserve will be consumed
				 * on a subsequent allocation.
				 */
				folio_set_hugetlb_restore_reserve(folio);
		} else if (rc < 0) {
			/*
			 * Rare out of memory condition from
			 * vma_needs_reservation call.  Memory allocation is
			 * only attempted if a new entry is needed.  Therefore,
			 * this implies there is not an entry in the
			 * reserve map.
			 *
			 * For shared mappings, no entry in the map indicates
			 * no reservation.  We are done.
			 */
			if (!(vma->vm_flags & VM_MAYSHARE))
				/*
				 * For private mappings, no entry indicates
				 * a reservation is present.  Since we can
				 * not add an entry, set hugetlb_restore_reserve
				 * on the folio so reserve count will be
				 * incremented when freed.  This reserve will
				 * be consumed on a subsequent allocation.
				 */
				folio_set_hugetlb_restore_reserve(folio);
		} else
			/*
			 * No reservation present, do nothing
			 */
			 vma_end_reservation(h, vma, address);
	}
}
2797 
2798 /*
2799  * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
2800  * the old one
2801  * @old_folio: Old folio to dissolve
2802  * @list: List to isolate the page in case we need to
2803  * Returns 0 on success, otherwise negated error.
2804  */
static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
			struct list_head *list)
{
	gfp_t gfp_mask;
	struct hstate *h;
	int nid = folio_nid(old_folio);
	struct folio *new_folio = NULL;
	int ret = 0;

retry:
	/*
	 * The old_folio might have been dissolved from under our feet, so make sure
	 * to carefully check the state under the lock.
	 */
	spin_lock_irq(&hugetlb_lock);
	if (!folio_test_hugetlb(old_folio)) {
		/*
		 * Freed from under us. Drop new_folio too.
		 */
		goto free_new;
	} else if (folio_ref_count(old_folio)) {
		bool isolated;

		/*
		 * Someone has grabbed the folio, try to isolate it here.
		 * Fail with -EBUSY if not possible.
		 */
		spin_unlock_irq(&hugetlb_lock);
		isolated = folio_isolate_hugetlb(old_folio, list);
		ret = isolated ? 0 : -EBUSY;
		/* Re-take the lock so free_new's unlock stays balanced. */
		spin_lock_irq(&hugetlb_lock);
		goto free_new;
	} else if (!folio_test_hugetlb_freed(old_folio)) {
		/*
		 * Folio's refcount is 0 but it has not been enqueued in the
		 * freelist yet. Race window is small, so we can succeed here if
		 * we retry.
		 */
		spin_unlock_irq(&hugetlb_lock);
		cond_resched();
		goto retry;
	} else {
		h = folio_hstate(old_folio);
		/* First pass: allocate the replacement, then re-validate. */
		if (!new_folio) {
			spin_unlock_irq(&hugetlb_lock);
			gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
			new_folio = alloc_fresh_hugetlb_folio(h, gfp_mask,
							      nid, NULL);
			if (!new_folio)
				return -ENOMEM;
			goto retry;
		}

		/*
		 * Ok, old_folio is still a genuine free hugepage. Remove it from
		 * the freelist and decrease the counters. These will be
		 * incremented again when calling account_new_hugetlb_folio()
		 * and enqueue_hugetlb_folio() for new_folio. The counters will
		 * remain stable since this happens under the lock.
		 */
		remove_hugetlb_folio(h, old_folio, false);

		/*
		 * Ref count on new_folio is already zero as it was dropped
		 * earlier.  It can be directly added to the pool free list.
		 */
		account_new_hugetlb_folio(h, new_folio);
		enqueue_hugetlb_folio(h, new_folio);

		/*
		 * Folio has been replaced, we can safely free the old one.
		 */
		spin_unlock_irq(&hugetlb_lock);
		update_and_free_hugetlb_folio(h, old_folio, false);
	}

	return ret;

free_new:
	spin_unlock_irq(&hugetlb_lock);
	if (new_folio)
		update_and_free_hugetlb_folio(h, new_folio, false);

	return ret;
}
2890 
/*
 * isolate_or_dissolve_huge_folio - handle a hugetlb folio that stands in
 * the way of a caller such as alloc_contig_range().
 * @folio: the folio to isolate or dissolve
 * @list: list an isolated folio is placed on
 *
 * An in-use folio (elevated refcount) is isolated onto @list; a free folio
 * is replaced with a freshly allocated one via
 * alloc_and_dissolve_hugetlb_folio().  The unlocked checks here are
 * intentionally racy; alloc_and_dissolve_hugetlb_folio() re-validates
 * everything under hugetlb_lock.
 *
 * Returns 0 on success (including when @folio is no longer hugetlb),
 * -ENOMEM for gigantic folios, or -EBUSY if the folio could not be handled.
 */
int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
{
	int ret = -EBUSY;

	/* Not to disrupt normal path by vainly holding hugetlb_lock */
	if (!folio_test_hugetlb(folio))
		return 0;

	/*
	 * Fence off gigantic pages as there is a cyclic dependency between
	 * alloc_contig_range and them. Return -ENOMEM as this has the effect
	 * of bailing out right away without further retrying.
	 */
	if (order_is_gigantic(folio_order(folio)))
		return -ENOMEM;

	if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
		ret = 0;
	else if (!folio_ref_count(folio))
		ret = alloc_and_dissolve_hugetlb_folio(folio, list);

	return ret;
}
2914 
2915 /*
2916  *  replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
2917  *  range with new folios.
2918  *  @start_pfn: start pfn of the given pfn range
2919  *  @end_pfn: end pfn of the given pfn range
2920  *  Returns 0 on success, otherwise negated error.
2921  */
2922 int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
2923 {
2924 	struct folio *folio;
2925 	int ret = 0;
2926 
2927 	LIST_HEAD(isolate_list);
2928 
2929 	while (start_pfn < end_pfn) {
2930 		folio = pfn_folio(start_pfn);
2931 
2932 		/* Not to disrupt normal path by vainly holding hugetlb_lock */
2933 		if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) {
2934 			ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list);
2935 			if (ret)
2936 				break;
2937 
2938 			putback_movable_pages(&isolate_list);
2939 		}
2940 		start_pfn++;
2941 	}
2942 
2943 	return ret;
2944 }
2945 
2946 void wait_for_freed_hugetlb_folios(void)
2947 {
2948 	if (llist_empty(&hpage_freelist))
2949 		return;
2950 
2951 	flush_work(&free_hpage_work);
2952 }
2953 
/*
 * How alloc_hugetlb_folio() should treat the per-vma reservation for the
 * page being allocated.
 */
typedef enum {
	/*
	 * For either 0/1: we checked the per-vma resv map, and one resv
	 * count either can be reused (0), or an extra needed (1).
	 */
	MAP_CHG_REUSE = 0,
	MAP_CHG_NEEDED = 1,
	/*
	 * The per-vma resv count cannot be used (nor reused), hence a new
	 * resv count is enforced.
	 *
	 * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
	 * that currently vma_needs_reservation() has an unwanted side
	 * effect to either use end() or commit() to complete the
	 * transaction. Hence it needs to differentiate from NEEDED.
	 */
	MAP_CHG_ENFORCED = 2,
} map_chg_state;
2972 
2973 /*
2974  * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
2975  * faults of hugetlb private mappings on top of a non-page-cache folio (in
2976  * which case even if there's a private vma resv map it won't cover such
2977  * allocation).  New call sites should (probably) never set it to true!!
2978  * When it's set, the allocation will bypass all vma level reservations.
2979  */
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
				    unsigned long addr, bool cow_from_owner)
{
	struct hugepage_subpool *spool = subpool_vma(vma);
	struct hstate *h = hstate_vma(vma);
	struct folio *folio;
	long retval, gbl_chg, gbl_reserve;
	map_chg_state map_chg;
	int ret, idx;
	struct hugetlb_cgroup *h_cg = NULL;
	gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;

	idx = hstate_index(h);

	/* Whether we need a separate per-vma reservation? */
	if (cow_from_owner) {
		/*
		 * Special case!  Since it's a CoW on top of a reserved
		 * page, the private resv map doesn't count.  So it cannot
		 * consume the per-vma resv map even if it's reserved.
		 */
		map_chg = MAP_CHG_ENFORCED;
	} else {
		/*
		 * Examine the region/reserve map to determine if the process
		 * has a reservation for the page to be allocated.  A return
		 * code of zero indicates a reservation exists (no change).
		 */
		retval = vma_needs_reservation(h, vma, addr);
		if (retval < 0)
			return ERR_PTR(-ENOMEM);
		map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
	}

	/*
	 * Whether we need a separate global reservation?
	 *
	 * Processes that did not create the mapping will have no
	 * reserves as indicated by the region/reserve map. Check
	 * that the allocation will not exceed the subpool limit.
	 * Or if it can get one from the pool reservation directly.
	 */
	if (map_chg) {
		gbl_chg = hugepage_subpool_get_pages(spool, 1);
		if (gbl_chg < 0)
			goto out_end_reservation;
	} else {
		/*
		 * If we have the vma reservation ready, no need for extra
		 * global reservation.
		 */
		gbl_chg = 0;
	}

	/*
	 * If this allocation is not consuming a per-vma reservation,
	 * charge the hugetlb cgroup now.
	 */
	if (map_chg) {
		ret = hugetlb_cgroup_charge_cgroup_rsvd(
			idx, pages_per_huge_page(h), &h_cg);
		if (ret)
			goto out_subpool_put;
	}

	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
	if (ret)
		goto out_uncharge_cgroup_reservation;

	spin_lock_irq(&hugetlb_lock);
	/*
	 * gbl_chg is passed to indicate whether or not a page must be taken
	 * from the global free pool (global change).  gbl_chg == 0 indicates
	 * a reservation exists for the allocation.
	 */
	folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg);
	if (!folio) {
		/* Pool empty for this request: fall back to a fresh buddy allocation. */
		spin_unlock_irq(&hugetlb_lock);
		folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
		if (!folio)
			goto out_uncharge_cgroup;
		spin_lock_irq(&hugetlb_lock);
		list_add(&folio->lru, &h->hugepage_activelist);
		folio_ref_unfreeze(folio, 1);
		/* Fall through */
	}

	/*
	 * Either dequeued or buddy-allocated folio needs to add special
	 * mark to the folio when it consumes a global reservation.
	 */
	if (!gbl_chg) {
		folio_set_hugetlb_restore_reserve(folio);
		h->resv_huge_pages--;
	}

	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
	/* If allocation is not consuming a reservation, also store the
	 * hugetlb_cgroup pointer on the page.
	 */
	if (map_chg) {
		hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
						  h_cg, folio);
	}

	spin_unlock_irq(&hugetlb_lock);

	hugetlb_set_folio_subpool(folio, spool);

	if (map_chg != MAP_CHG_ENFORCED) {
		/* commit() is only needed if the map_chg is not enforced */
		retval = vma_commit_reservation(h, vma, addr);
		/*
		 * Check for possible race conditions. When it happens..
		 * The page was added to the reservation map between
		 * vma_needs_reservation and vma_commit_reservation.
		 * This indicates a race with hugetlb_reserve_pages.
		 * Adjust for the subpool count incremented above AND
		 * in hugetlb_reserve_pages for the same page.	Also,
		 * the reservation count added in hugetlb_reserve_pages
		 * no longer applies.
		 */
		if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
			long rsv_adjust;

			rsv_adjust = hugepage_subpool_put_pages(spool, 1);
			hugetlb_acct_memory(h, -rsv_adjust);
			if (map_chg) {
				spin_lock_irq(&hugetlb_lock);
				hugetlb_cgroup_uncharge_folio_rsvd(
				    hstate_index(h), pages_per_huge_page(h),
				    folio);
				spin_unlock_irq(&hugetlb_lock);
			}
		}
	}

	ret = mem_cgroup_charge_hugetlb(folio, gfp);
	/*
	 * Unconditionally increment NR_HUGETLB here. If it turns out that
	 * mem_cgroup_charge_hugetlb failed, then immediately free the page and
	 * decrement NR_HUGETLB.
	 */
	lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));

	if (ret == -ENOMEM) {
		free_huge_folio(folio);
		return ERR_PTR(-ENOMEM);
	}

	return folio;

out_uncharge_cgroup:
	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_uncharge_cgroup_reservation:
	if (map_chg)
		hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
						    h_cg);
out_subpool_put:
	/*
	 * put page to subpool iff the quota of subpool's rsv_hpages is used
	 * during hugepage_subpool_get_pages.
	 */
	if (map_chg && !gbl_chg) {
		gbl_reserve = hugepage_subpool_put_pages(spool, 1);
		hugetlb_acct_memory(h, -gbl_reserve);
	}


out_end_reservation:
	if (map_chg != MAP_CHG_ENFORCED)
		vma_end_reservation(h, vma, addr);
	return ERR_PTR(-ENOSPC);
}
3154 
/*
 * Allocate boot-time backing memory for one huge page of @h, preferring
 * node @nid (required when @node_exact), from early CMA if enabled for @h,
 * otherwise from memblock.  On success, a struct huge_bootmem_page is
 * placed at the start of the allocation itself and queued on the
 * huge_boot_pages list of the node the memory actually came from.
 * Returns that struct, or NULL on failure.
 */
static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact)
{
	struct huge_bootmem_page *m;
	int listnode = nid;

	if (hugetlb_early_cma(h))
		m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact);
	else {
		if (node_exact)
			m = memblock_alloc_exact_nid_raw(huge_page_size(h),
				huge_page_size(h), 0,
				MEMBLOCK_ALLOC_ACCESSIBLE, nid);
		else {
			m = memblock_alloc_try_nid_raw(huge_page_size(h),
				huge_page_size(h), 0,
				MEMBLOCK_ALLOC_ACCESSIBLE, nid);
			/*
			 * For pre-HVO to work correctly, pages need to be on
			 * the list for the node they were actually allocated
			 * from. That node may be different in the case of
			 * fallback by memblock_alloc_try_nid_raw. So,
			 * extract the actual node first.
			 */
			if (m)
				listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m)));
		}

		if (m) {
			/* Non-CMA path: no flags set, no owning CMA area. */
			m->flags = 0;
			m->cma = NULL;
		}
	}

	if (m) {
		/*
		 * Use the beginning of the huge page to store the
		 * huge_bootmem_page struct (until gather_bootmem
		 * puts them into the mem_map).
		 *
		 * Put them into a private list first because mem_map
		 * is not up yet.
		 */
		INIT_LIST_HEAD(&m->list);
		list_add(&m->list, &huge_boot_pages[listnode]);
		m->hstate = h;
	}

	return m;
}
3204 
int alloc_bootmem_huge_page(struct hstate *h, int nid)
	__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
/*
 * Boot-time allocation of a single huge page for @h.  A specific @nid must
 * be satisfied exactly; NUMA_NO_NODE distributes round-robin over
 * hugetlb_bootmem_nodes.  Returns 1 on success, 0 on failure.
 */
int __alloc_bootmem_huge_page(struct hstate *h, int nid)
{
	struct huge_bootmem_page *m = NULL; /* initialize for clang */
	int nr_nodes, node = nid;

	/* do node specific alloc */
	if (nid != NUMA_NO_NODE) {
		m = alloc_bootmem(h, node, true);
		if (!m)
			return 0;
		goto found;
	}

	/* allocate from next node when distributing huge pages */
	for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node,
				    &hugetlb_bootmem_nodes) {
		m = alloc_bootmem(h, node, false);
		if (!m)
			return 0;
		goto found;
	}

found:
	/*
	 * NOTE(review): if hugetlb_bootmem_nodes were empty the loop body
	 * above would never execute and m would still be NULL here —
	 * presumably the mask is guaranteed non-empty at this point in
	 * boot; verify.
	 */

	/*
	 * Only initialize the head struct page in memmap_init_reserved_pages,
	 * rest of the struct pages will be initialized by the HugeTLB
	 * subsystem itself.
	 * The head struct page is used to get folio information by the HugeTLB
	 * subsystem like zone id and node id.
	 */
	memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
		huge_page_size(h) - PAGE_SIZE);

	return 1;
}
3243 
3244 /* Initialize [start_page:end_page_number] tail struct pages of a hugepage */
3245 static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
3246 					unsigned long start_page_number,
3247 					unsigned long end_page_number)
3248 {
3249 	enum zone_type zone = zone_idx(folio_zone(folio));
3250 	int nid = folio_nid(folio);
3251 	struct page *page = folio_page(folio, start_page_number);
3252 	unsigned long head_pfn = folio_pfn(folio);
3253 	unsigned long pfn, end_pfn = head_pfn + end_page_number;
3254 
3255 	/*
3256 	 * As we marked all tail pages with memblock_reserved_mark_noinit(),
3257 	 * we must initialize them ourselves here.
3258 	 */
3259 	for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) {
3260 		__init_single_page(page, pfn, zone, nid);
3261 		prep_compound_tail((struct page *)folio, pfn - head_pfn);
3262 		set_page_count(page, 0);
3263 	}
3264 }
3265 
3266 static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
3267 					      struct hstate *h,
3268 					      unsigned long nr_pages)
3269 {
3270 	int ret;
3271 
3272 	/*
3273 	 * This is an open-coded prep_compound_page() whereby we avoid
3274 	 * walking pages twice by initializing/preparing+freezing them in the
3275 	 * same go.
3276 	 */
3277 	__folio_clear_reserved(folio);
3278 	__folio_set_head(folio);
3279 	ret = folio_ref_freeze(folio, 1);
3280 	VM_BUG_ON(!ret);
3281 	hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
3282 	prep_compound_head((struct page *)folio, huge_page_order(h));
3283 }
3284 
3285 static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
3286 {
3287 	return m->flags & HUGE_BOOTMEM_HVO;
3288 }
3289 
3290 static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
3291 {
3292 	return m->flags & HUGE_BOOTMEM_CMA;
3293 }
3294 
3295 /*
3296  * memblock-allocated pageblocks might not have the migrate type set
3297  * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE)
3298  * here, or MIGRATE_CMA if this was a page allocated through an early CMA
3299  * reservation.
3300  *
3301  * In case of vmemmap optimized folios, the tail vmemmap pages are mapped
3302  * read-only, but that's ok - for sparse vmemmap this does not write to
3303  * the page structure.
3304  */
3305 static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
3306 							  struct hstate *h)
3307 {
3308 	unsigned long nr_pages = pages_per_huge_page(h), i;
3309 
3310 	WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio)));
3311 
3312 	for (i = 0; i < nr_pages; i += pageblock_nr_pages) {
3313 		if (folio_test_hugetlb_cma(folio))
3314 			init_cma_pageblock(folio_page(folio, i));
3315 		else
3316 			init_pageblock_migratetype(folio_page(folio, i),
3317 					  MIGRATE_MOVABLE, false);
3318 	}
3319 }
3320 
/*
 * Run bulk vmemmap optimization over the bootmem folios in @folio_list,
 * initialize remaining tail pages for any folio HVO could not optimize,
 * fix up pageblock migrate types, and add each folio to @h's free pool.
 */
static void __init prep_and_add_bootmem_folios(struct hstate *h,
					struct list_head *folio_list)
{
	unsigned long flags;
	struct folio *folio, *tmp_f;

	/* Send list for bulk vmemmap optimization processing */
	hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list);

	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
		if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
			/*
			 * If HVO fails, initialize all tail struct pages
			 * We do not worry about potential long lock hold
			 * time as this is early in boot and there should
			 * be no contention.
			 */
			hugetlb_folio_init_tail_vmemmap(folio,
					HUGETLB_VMEMMAP_RESERVE_PAGES,
					pages_per_huge_page(h));
		}
		hugetlb_bootmem_init_migratetype(folio, h);
		/* Subdivide locks to achieve better parallel performance */
		spin_lock_irqsave(&hugetlb_lock, flags);
		account_new_hugetlb_folio(h, folio);
		enqueue_hugetlb_folio(h, folio);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
	}
}
3350 
3351 bool __init hugetlb_bootmem_page_zones_valid(int nid,
3352 					     struct huge_bootmem_page *m)
3353 {
3354 	unsigned long start_pfn;
3355 	bool valid;
3356 
3357 	if (m->flags & HUGE_BOOTMEM_ZONES_VALID) {
3358 		/*
3359 		 * Already validated, skip check.
3360 		 */
3361 		return true;
3362 	}
3363 
3364 	if (hugetlb_bootmem_page_earlycma(m)) {
3365 		valid = cma_validate_zones(m->cma);
3366 		goto out;
3367 	}
3368 
3369 	start_pfn = virt_to_phys(m) >> PAGE_SHIFT;
3370 
3371 	valid = !pfn_range_intersects_zones(nid, start_pfn,
3372 			pages_per_huge_page(m->hstate));
3373 out:
3374 	if (!valid)
3375 		hstate_boot_nrinvalid[hstate_index(m->hstate)]++;
3376 
3377 	return valid;
3378 }
3379 
3380 /*
3381  * Free a bootmem page that was found to be invalid (intersecting with
3382  * multiple zones).
3383  *
3384  * Since it intersects with multiple zones, we can't just do a free
3385  * operation on all pages at once, but instead have to walk all
3386  * pages, freeing them one by one.
3387  */
3388 static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page,
3389 					     struct hstate *h)
3390 {
3391 	unsigned long npages = pages_per_huge_page(h);
3392 	unsigned long pfn;
3393 
3394 	while (npages--) {
3395 		pfn = page_to_pfn(page);
3396 		__init_page_from_nid(pfn, nid);
3397 		free_reserved_page(page);
3398 		page++;
3399 	}
3400 }
3401 
3402 /*
3403  * Put bootmem huge pages into the standard lists after mem_map is up.
3404  * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
3405  */
static void __init gather_bootmem_prealloc_node(unsigned long nid)
{
	LIST_HEAD(folio_list);
	struct huge_bootmem_page *m, *tm;
	struct hstate *h = NULL, *prev_h = NULL;

	list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
		struct page *page = virt_to_page(m);
		struct folio *folio = (void *)page;

		h = m->hstate;
		if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
			/*
			 * Can't use this page. Initialize the
			 * page structures if that hasn't already
			 * been done, and give them to the page
			 * allocator.
			 */
			hugetlb_bootmem_free_invalid_page(nid, page, h);
			continue;
		}

		/*
		 * It is possible to have multiple huge page sizes (hstates)
		 * in this list.  If so, process each size separately.
		 */
		if (h != prev_h && prev_h != NULL)
			prep_and_add_bootmem_folios(prev_h, &folio_list);
		prev_h = h;

		VM_BUG_ON(!hstate_is_gigantic(h));
		WARN_ON(folio_ref_count(folio) != 1);

		hugetlb_folio_init_vmemmap(folio, h,
					   HUGETLB_VMEMMAP_RESERVE_PAGES);
		init_new_hugetlb_folio(folio);

		if (hugetlb_bootmem_page_prehvo(m))
			/*
			 * If pre-HVO was done, just set the
			 * flag, the HVO code will then skip
			 * this folio.
			 */
			folio_set_hugetlb_vmemmap_optimized(folio);

		if (hugetlb_bootmem_page_earlycma(m))
			folio_set_hugetlb_cma(folio);

		list_add(&folio->lru, &folio_list);

		/*
		 * We need to restore the 'stolen' pages to totalram_pages
		 * in order to fix confusing memory reports from free(1) and
		 * other side-effects, like CommitLimit going negative.
		 *
		 * For CMA pages, this is done in init_cma_pageblock
		 * (via hugetlb_bootmem_init_migratetype), so skip it here.
		 */
		if (!folio_test_hugetlb_cma(folio))
			adjust_managed_page_count(page, pages_per_huge_page(h));
		cond_resched();
	}

	/*
	 * Flush the final batch.  NOTE(review): h is NULL when the node's
	 * list was empty — presumably the callee tolerates that together
	 * with an empty list; verify.
	 */
	prep_and_add_bootmem_folios(h, &folio_list);
}
3471 
3472 static void __init gather_bootmem_prealloc_parallel(unsigned long start,
3473 						    unsigned long end, void *arg)
3474 {
3475 	int nid;
3476 
3477 	for (nid = start; nid < end; nid++)
3478 		gather_bootmem_prealloc_node(nid);
3479 }
3480 
3481 static void __init gather_bootmem_prealloc(void)
3482 {
3483 	struct padata_mt_job job = {
3484 		.thread_fn	= gather_bootmem_prealloc_parallel,
3485 		.fn_arg		= NULL,
3486 		.start		= 0,
3487 		.size		= nr_node_ids,
3488 		.align		= 1,
3489 		.min_chunk	= 1,
3490 		.max_threads	= num_node_state(N_MEMORY),
3491 		.numa_aware	= true,
3492 	};
3493 
3494 	padata_do_multithreaded(&job);
3495 }
3496 
/*
 * Boot-time allocation of h->max_huge_pages_node[nid] huge pages confined
 * to one node, used when the user requested per-node counts.  Scales down
 * the per-node and global targets if allocation falls short.
 */
static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
{
	unsigned long i;
	char buf[32];
	LIST_HEAD(folio_list);

	for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
		if (hstate_is_gigantic(h)) {
			/* Gigantic pages come from memblock, added to pools later. */
			if (!alloc_bootmem_huge_page(h, nid))
				break;
		} else {
			struct folio *folio;
			gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;

			folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
					&node_states[N_MEMORY], NULL);
			if (!folio)
				break;
			list_add(&folio->lru, &folio_list);
		}
		cond_resched();
	}

	if (!list_empty(&folio_list))
		prep_and_add_allocated_folios(h, &folio_list);

	if (i == h->max_huge_pages_node[nid])
		return;

	/* Allocation fell short: warn and shrink the targets to what we got. */
	string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
	pr_warn("HugeTLB: allocating %u of page size %s failed node%d.  Only allocated %lu hugepages.\n",
		h->max_huge_pages_node[nid], buf, nid, i);
	h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
	h->max_huge_pages_node[nid] = i;
}
3532 
3533 static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h)
3534 {
3535 	int i;
3536 	bool node_specific_alloc = false;
3537 
3538 	for_each_online_node(i) {
3539 		if (h->max_huge_pages_node[i] > 0) {
3540 			hugetlb_hstate_alloc_pages_onenode(h, i);
3541 			node_specific_alloc = true;
3542 		}
3543 	}
3544 
3545 	return node_specific_alloc;
3546 }
3547 
3548 static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h)
3549 {
3550 	if (allocated < h->max_huge_pages) {
3551 		char buf[32];
3552 
3553 		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3554 		pr_warn("HugeTLB: allocating %lu of page size %s failed.  Only allocated %lu hugepages.\n",
3555 			h->max_huge_pages, buf, allocated);
3556 		h->max_huge_pages = allocated;
3557 	}
3558 }
3559 
/*
 * padata worker: allocate (end - start) pool huge pages for the hstate
 * passed via @arg, interleaving across memory nodes.
 */
static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg)
{
	struct hstate *h = (struct hstate *)arg;
	int i, num = end - start;
	nodemask_t node_alloc_noretry;
	LIST_HEAD(folio_list);
	int next_node = first_online_node;

	/* Bit mask controlling how hard we retry per-node allocations.*/
	nodes_clear(node_alloc_noretry);

	for (i = 0; i < num; ++i) {
		struct folio *folio;

		/*
		 * When HVO can reclaim memory and we are out of available
		 * memory, commit the batch collected so far before
		 * continuing, so its vmemmap can be freed.
		 */
		if (hugetlb_vmemmap_optimizable_size(h) &&
		    (si_mem_available() == 0) && !list_empty(&folio_list)) {
			prep_and_add_allocated_folios(h, &folio_list);
			INIT_LIST_HEAD(&folio_list);
		}
		folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
						&node_alloc_noretry, &next_node);
		if (!folio)
			break;

		list_move(&folio->lru, &folio_list);
		cond_resched();
	}

	prep_and_add_allocated_folios(h, &folio_list);
}
3590 
3591 static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
3592 {
3593 	unsigned long i;
3594 
3595 	for (i = 0; i < h->max_huge_pages; ++i) {
3596 		if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
3597 			break;
3598 		cond_resched();
3599 	}
3600 
3601 	return i;
3602 }
3603 
/*
 * Multithreaded boot-time allocation of non-gigantic pool pages for @h,
 * retrying while HVO keeps freeing memory back.  Returns the number of
 * pages in the pool afterwards.
 */
static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
{
	struct padata_mt_job job = {
		.fn_arg		= h,
		.align		= 1,
		.numa_aware	= true
	};

	unsigned long jiffies_start;
	unsigned long jiffies_end;
	unsigned long remaining;

	job.thread_fn	= hugetlb_pages_alloc_boot_node;

	/*
	 * job.max_threads is 25% of the available cpu threads by default.
	 *
	 * On large servers with terabytes of memory, huge page allocation
	 * can consume a considerably amount of time.
	 *
	 * Tests below show how long it takes to allocate 1 TiB of memory
	 * with 2MiB huge pages. Using more threads can significantly
	 * improve allocation time.
	 *
	 * +-----------------------+-------+-------+-------+-------+-------+
	 * | threads               |   8   |   16  |   32  |   64  |   128 |
	 * +-----------------------+-------+-------+-------+-------+-------+
	 * | skylake      144 cpus |   44s |   22s |   16s |   19s |   20s |
	 * | cascade lake 192 cpus |   39s |   20s |   11s |   10s |    9s |
	 * +-----------------------+-------+-------+-------+-------+-------+
	 */
	if (hugepage_allocation_threads == 0) {
		hugepage_allocation_threads = num_online_cpus() / 4;
		hugepage_allocation_threads = max(hugepage_allocation_threads, 1);
	}

	job.max_threads	= hugepage_allocation_threads;

	jiffies_start = jiffies;
	do {
		remaining = h->max_huge_pages - h->nr_huge_pages;

		job.start     = h->nr_huge_pages;
		job.size      = remaining;
		job.min_chunk = remaining / hugepage_allocation_threads;
		padata_do_multithreaded(&job);

		if (h->nr_huge_pages == h->max_huge_pages)
			break;

		/*
		 * Retry only if the vmemmap optimization might have been able to free
		 * some memory back to the system.
		 */
		if (!hugetlb_vmemmap_optimizable(h))
			break;

		/* Continue if progress was made in last iteration */
	} while (remaining != (h->max_huge_pages - h->nr_huge_pages));

	jiffies_end = jiffies;

	pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
		jiffies_to_msecs(jiffies_end - jiffies_start),
		hugepage_allocation_threads);

	return h->nr_huge_pages;
}
3671 
3672 /*
3673  * NOTE: this routine is called in different contexts for gigantic and
3674  * non-gigantic pages.
3675  * - For gigantic pages, this is called early in the boot process and
3676  *   pages are allocated from memblock allocated or something similar.
3677  *   Gigantic pages are actually added to pools later with the routine
3678  *   gather_bootmem_prealloc.
3679  * - For non-gigantic pages, this is called later in the boot process after
3680  *   all of mm is up and functional.  Pages are allocated from buddy and
3681  *   then added to hugetlb pools.
3682  */
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
	unsigned long allocated;

	/*
	 * Skip gigantic hugepages allocation if early CMA
	 * reservations are not available.
	 */
	if (hstate_is_gigantic(h) && hugetlb_cma_total_size() &&
	    !hugetlb_early_cma(h)) {
		pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
		return;
	}

	/* Nothing requested for this page size. */
	if (!h->max_huge_pages)
		return;

	/* do node specific alloc */
	if (hugetlb_hstate_alloc_pages_specific_nodes(h))
		return;

	/* below will do all node balanced alloc */
	if (hstate_is_gigantic(h))
		allocated = hugetlb_gigantic_pages_alloc_boot(h);
	else
		allocated = hugetlb_pages_alloc_boot(h);

	/* Warn and shrink max_huge_pages if the request was not fully met. */
	hugetlb_hstate_alloc_pages_errcheck(allocated, h);
}
3712 
/*
 * Late-boot initialization of every registered hstate: reset round-robin
 * node cursors, allocate non-gigantic boot pools, and compute each
 * hstate's demote order (the largest smaller hstate it can demote to).
 */
static void __init hugetlb_init_hstates(void)
{
	struct hstate *h, *h2;

	for_each_hstate(h) {
		/*
		 * Always reset to first_memory_node here, even if
		 * next_nid_to_alloc was set before - we can't
		 * reference hugetlb_bootmem_nodes after init, and
		 * first_memory_node is right for all further allocations.
		 */
		h->next_nid_to_alloc = first_memory_node;
		h->next_nid_to_free = first_memory_node;

		/* oversize hugepages were init'ed in early boot */
		if (!hstate_is_gigantic(h))
			hugetlb_hstate_alloc_pages(h);

		/*
		 * Set demote order for each hstate.  Note that
		 * h->demote_order is initially 0.
		 * - We can not demote gigantic pages if runtime freeing
		 *   is not supported, so skip this.
		 * - If CMA allocation is possible, we can not demote
		 *   HUGETLB_PAGE_ORDER or smaller size pages.
		 */
		if (hstate_is_gigantic_no_runtime(h))
			continue;
		if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER)
			continue;
		for_each_hstate(h2) {
			if (h2 == h)
				continue;
			if (h2->order < h->order &&
			    h2->order > h->demote_order)
				h->demote_order = h2->order;
		}
	}
}
3752 
/*
 * Log a per-hstate summary of boot-time preallocation, discounting bootmem
 * pages found invalid (bad zone placement), and report how much vmemmap
 * HVO can free per page.
 */
static void __init report_hugepages(void)
{
	struct hstate *h;
	unsigned long nrinvalid;

	for_each_hstate(h) {
		char buf[32];

		/* Invalid bootmem pages were discarded; reduce the target. */
		nrinvalid = hstate_boot_nrinvalid[hstate_index(h)];
		h->max_huge_pages -= nrinvalid;

		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
		pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
			buf, h->nr_huge_pages);
		if (nrinvalid)
			pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
					buf, nrinvalid, str_plural(nrinvalid));
		pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
			hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
	}
}
3774 
#ifdef CONFIG_HIGHMEM
/*
 * Shrink the pool toward @count by freeing lowmem (non-highmem) free huge
 * pages on the allowed nodes.  Called and returns with hugetlb_lock held;
 * the lock is dropped only around the actual page freeing.
 */
static void try_to_free_low(struct hstate *h, unsigned long count,
						nodemask_t *nodes_allowed)
{
	int i;
	LIST_HEAD(page_list);

	lockdep_assert_held(&hugetlb_lock);
	if (hstate_is_gigantic(h))
		return;

	/*
	 * Collect pages to be freed on a list, and free after dropping lock
	 */
	for_each_node_mask(i, *nodes_allowed) {
		struct folio *folio, *next;
		struct list_head *freel = &h->hugepage_freelists[i];
		list_for_each_entry_safe(folio, next, freel, lru) {
			if (count >= h->nr_huge_pages)
				goto out;
			if (folio_test_highmem(folio))
				continue;
			remove_hugetlb_folio(h, folio, false);
			list_add(&folio->lru, &page_list);
		}
	}

out:
	/* Freeing may sleep; do it without the lock, then re-take it. */
	spin_unlock_irq(&hugetlb_lock);
	update_and_free_pages_bulk(h, &page_list);
	spin_lock_irq(&hugetlb_lock);
}
#else
/* Without highmem there is no lowmem pressure to relieve; no-op. */
static inline void try_to_free_low(struct hstate *h, unsigned long count,
						nodemask_t *nodes_allowed)
{
}
#endif
3813 
3814 /*
3815  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
3816  * balanced by operating on them in a round-robin fashion.
3817  * Returns 1 if an adjustment was made.
3818  */
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
				int delta)
{
	int nr_nodes, node;

	lockdep_assert_held(&hugetlb_lock);
	VM_BUG_ON(delta != -1 && delta != 1);

	if (delta < 0) {
		/* Decrement: pick the next node that has surplus pages. */
		for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) {
			if (h->surplus_huge_pages_node[node])
				goto found;
		}
	} else {
		/* Increment: pick a node where not every page is already surplus. */
		for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
			if (h->surplus_huge_pages_node[node] <
					h->nr_huge_pages_node[node])
				goto found;
		}
	}
	return 0;

found:
	h->surplus_huge_pages += delta;
	h->surplus_huge_pages_node[node] += delta;
	return 1;
}
3846 
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
/*
 * Resize the pool of hstate @h to @count persistent huge pages, allocating
 * and freeing only on the nodes in @nodes_allowed.  @nid is NUMA_NO_NODE
 * for a global request; otherwise @count is the target for node @nid and
 * is translated into an equivalent global target below.
 *
 * Returns 0 on success (a request that could only be partially satisfied
 * still returns 0; the achieved size is written back to max_huge_pages),
 * -ENOMEM if the per-node retry mask cannot be allocated, or -EINVAL on
 * an attempt to grow a gigantic pool without runtime allocation support.
 */
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
			      nodemask_t *nodes_allowed)
{
	unsigned long persistent_free_count;
	unsigned long min_count;
	unsigned long allocated;
	struct folio *folio;
	LIST_HEAD(page_list);
	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);

	/*
	 * Bit mask controlling how hard we retry per-node allocations.
	 * If we can not allocate the bit mask, do not attempt to allocate
	 * the requested huge pages.
	 */
	if (node_alloc_noretry)
		nodes_clear(*node_alloc_noretry);
	else
		return -ENOMEM;

	/*
	 * resize_lock mutex prevents concurrent adjustments to number of
	 * pages in hstate via the proc/sysfs interfaces.
	 */
	mutex_lock(&h->resize_lock);
	flush_free_hpage_work(h);
	spin_lock_irq(&hugetlb_lock);

	/*
	 * Check for a node specific request.
	 * Changing node specific huge page count may require a corresponding
	 * change to the global count.  In any case, the passed node mask
	 * (nodes_allowed) will restrict alloc/free to the specified node.
	 */
	if (nid != NUMA_NO_NODE) {
		unsigned long old_count = count;

		count += persistent_huge_pages(h) -
			 (h->nr_huge_pages_node[nid] -
			  h->surplus_huge_pages_node[nid]);
		/*
		 * User may have specified a large count value which caused the
		 * above calculation to overflow.  In this case, they wanted
		 * to allocate as many huge pages as possible.  Set count to
		 * largest possible value to align with their intention.
		 */
		if (count < old_count)
			count = ULONG_MAX;
	}

	/*
	 * Gigantic pages runtime allocation depend on the capability for large
	 * page range allocation.
	 * If the system does not provide this feature, return an error when
	 * the user tries to allocate gigantic pages but let the user free the
	 * boottime allocated gigantic pages.
	 */
	if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
		if (count > persistent_huge_pages(h)) {
			spin_unlock_irq(&hugetlb_lock);
			mutex_unlock(&h->resize_lock);
			NODEMASK_FREE(node_alloc_noretry);
			return -EINVAL;
		}
		/* Fall through to decrease pool */
	}

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_surplus_hugetlb_folio() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
		if (!adjust_pool_surplus(h, nodes_allowed, -1))
			break;
	}

	/* Allocate fresh pages outside the lock until the target is reached. */
	allocated = 0;
	while (count > (persistent_huge_pages(h) + allocated)) {
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_folio will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock_irq(&hugetlb_lock);

		/* yield cpu to avoid soft lockup */
		cond_resched();

		folio = alloc_pool_huge_folio(h, nodes_allowed,
						node_alloc_noretry,
						&h->next_nid_to_alloc);
		if (!folio) {
			/* Allocation failed: add what we got, then give up. */
			prep_and_add_allocated_folios(h, &page_list);
			spin_lock_irq(&hugetlb_lock);
			goto out;
		}

		list_add(&folio->lru, &page_list);
		allocated++;

		/* Bail for signals. Probably ctrl-c from user */
		if (signal_pending(current)) {
			prep_and_add_allocated_folios(h, &page_list);
			spin_lock_irq(&hugetlb_lock);
			goto out;
		}

		spin_lock_irq(&hugetlb_lock);
	}

	/* Add allocated pages to the pool */
	if (!list_empty(&page_list)) {
		spin_unlock_irq(&hugetlb_lock);
		prep_and_add_allocated_folios(h, &page_list);
		spin_lock_irq(&hugetlb_lock);
	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_surplus_hugetlb_folio() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls are changed, or the surplus pages go out of use.
	 *
	 * min_count is the expected number of persistent pages, we
	 * shouldn't calculate min_count by using
	 * resv_huge_pages + persistent_huge_pages() - free_huge_pages,
	 * because there may exist free surplus huge pages, and this will
	 * lead to subtracting twice. Free surplus huge pages come from HVO
	 * failing to restore vmemmap, see comments in the callers of
	 * hugetlb_vmemmap_restore_folio(). Thus, we should calculate
	 * persistent free count first.
	 */
	persistent_free_count = h->free_huge_pages;
	if (h->free_huge_pages > persistent_huge_pages(h)) {
		if (h->free_huge_pages > h->surplus_huge_pages)
			persistent_free_count -= h->surplus_huge_pages;
		else
			persistent_free_count = 0;
	}
	min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count;
	min_count = max(count, min_count);
	try_to_free_low(h, min_count, nodes_allowed);

	/*
	 * Collect pages to be removed on list without dropping lock
	 */
	while (min_count < persistent_huge_pages(h)) {
		folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0);
		if (!folio)
			break;

		list_add(&folio->lru, &page_list);
	}
	/* free the pages after dropping lock */
	spin_unlock_irq(&hugetlb_lock);
	update_and_free_pages_bulk(h, &page_list);
	flush_free_hpage_work(h);
	spin_lock_irq(&hugetlb_lock);

	/*
	 * Any remaining excess persistent pages (e.g. in-use pages that
	 * could not be freed above) are converted to surplus so the pool
	 * shrinks as they are released.
	 */
	while (count < persistent_huge_pages(h)) {
		if (!adjust_pool_surplus(h, nodes_allowed, 1))
			break;
	}
out:
	/* Record the size actually achieved. */
	h->max_huge_pages = persistent_huge_pages(h);
	spin_unlock_irq(&hugetlb_lock);
	mutex_unlock(&h->resize_lock);

	NODEMASK_FREE(node_alloc_noretry);

	return 0;
}
4036 
/*
 * Split each huge folio on @src_list into folios of @dst's (smaller) order
 * and add them to @dst's pool.  Folios whose vmemmap could not be restored
 * are skipped and remain on @src_list for the caller to return to @src.
 * Returns the result of hugetlb_vmemmap_restore_folios().
 */
static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
				       struct list_head *src_list)
{
	long rc;
	struct folio *folio, *next;
	LIST_HEAD(dst_list);
	LIST_HEAD(ret_list);

	rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list);
	list_splice_init(&ret_list, src_list);

	/*
	 * Taking target hstate mutex synchronizes with set_max_huge_pages.
	 * Without the mutex, pages added to target hstate could be marked
	 * as surplus.
	 *
	 * Note that we already hold src->resize_lock.  To prevent deadlock,
	 * use the convention of always taking larger size hstate mutex first.
	 */
	mutex_lock(&dst->resize_lock);

	list_for_each_entry_safe(folio, next, src_list, lru) {
		int i;
		bool cma;

		/* Vmemmap restore failed; cannot split this folio. */
		if (folio_test_hugetlb_vmemmap_optimized(folio))
			continue;

		cma = folio_test_hugetlb_cma(folio);

		list_del(&folio->lru);

		split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
		pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst));

		/* Carve the src folio into dst-order pieces. */
		for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
			struct page *page = folio_page(folio, i);
			/* Careful: see __split_huge_page_tail() */
			struct folio *new_folio = (struct folio *)page;

			clear_compound_head(page);
			prep_compound_page(page, dst->order);

			new_folio->mapping = NULL;
			init_new_hugetlb_folio(new_folio);
			/* Copy the CMA flag so that it is freed correctly */
			if (cma)
				folio_set_hugetlb_cma(new_folio);
			list_add(&new_folio->lru, &dst_list);
		}
	}

	prep_and_add_allocated_folios(dst, &dst_list);

	mutex_unlock(&dst->resize_lock);

	return rc;
}
4095 
/*
 * Demote up to @nr_to_demote free huge pages of @src (restricted to
 * @nodes_allowed) to the hstate matching src->demote_order.  Drops and
 * reacquires hugetlb_lock around the actual demotion work.
 * Returns the number of src pages demoted, or a negative errno.
 */
static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
				  unsigned long nr_to_demote)
	__must_hold(&hugetlb_lock)
{
	int nr_nodes, node;
	struct hstate *dst;
	long rc = 0;
	long nr_demoted = 0;

	lockdep_assert_held(&hugetlb_lock);

	/* We should never get here if no demote order */
	if (!src->demote_order) {
		pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
		return -EINVAL;		/* internal error */
	}
	dst = size_to_hstate(PAGE_SIZE << src->demote_order);

	for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) {
		LIST_HEAD(list);
		struct folio *folio, *next;

		/* Pull candidate folios off this node's free list. */
		list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) {
			/* Hwpoisoned pages must not be demoted. */
			if (folio_test_hwpoison(folio))
				continue;

			remove_hugetlb_folio(src, folio, false);
			list_add(&folio->lru, &list);

			if (++nr_demoted == nr_to_demote)
				break;
		}

		spin_unlock_irq(&hugetlb_lock);

		rc = demote_free_hugetlb_folios(src, dst, &list);

		spin_lock_irq(&hugetlb_lock);

		/* Folios left on the list failed demotion; put them back. */
		list_for_each_entry_safe(folio, next, &list, lru) {
			list_del(&folio->lru);
			add_hugetlb_folio(src, folio, false);

			nr_demoted--;
		}

		if (rc < 0 || nr_demoted == nr_to_demote)
			break;
	}

	/*
	 * Not absolutely necessary, but for consistency update max_huge_pages
	 * based on pool changes for the demoted page.
	 */
	src->max_huge_pages -= nr_demoted;
	dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst));

	if (rc < 0)
		return rc;

	if (nr_demoted)
		return nr_demoted;
	/*
	 * Only way to get here is if all pages on free lists are poisoned.
	 * Return -EBUSY so that caller will not retry.
	 */
	return -EBUSY;
}
4164 
/* Helpers defining read-only/write-only/read-write hstate sysfs attributes. */
#define HSTATE_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

#define HSTATE_ATTR_WO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_WO(_name)

#define HSTATE_ATTR(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RW(_name)

/* Global (non per-node) sysfs kobjects, one per hstate, under "hugepages". */
static struct kobject *hugepages_kobj;
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];

static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
4178 
4179 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
4180 {
4181 	int i;
4182 
4183 	for (i = 0; i < HUGE_MAX_HSTATE; i++)
4184 		if (hstate_kobjs[i] == kobj) {
4185 			if (nidp)
4186 				*nidp = NUMA_NO_NODE;
4187 			return &hstates[i];
4188 		}
4189 
4190 	return kobj_to_node_hstate(kobj, nidp);
4191 }
4192 
4193 static ssize_t nr_hugepages_show_common(struct kobject *kobj,
4194 					struct kobj_attribute *attr, char *buf)
4195 {
4196 	struct hstate *h;
4197 	unsigned long nr_huge_pages;
4198 	int nid;
4199 
4200 	h = kobj_to_hstate(kobj, &nid);
4201 	if (nid == NUMA_NO_NODE)
4202 		nr_huge_pages = h->nr_huge_pages;
4203 	else
4204 		nr_huge_pages = h->nr_huge_pages_node[nid];
4205 
4206 	return sysfs_emit(buf, "%lu\n", nr_huge_pages);
4207 }
4208 
4209 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
4210 					   struct hstate *h, int nid,
4211 					   unsigned long count, size_t len)
4212 {
4213 	int err;
4214 	nodemask_t nodes_allowed, *n_mask;
4215 
4216 	if (hstate_is_gigantic_no_runtime(h))
4217 		return -EINVAL;
4218 
4219 	if (nid == NUMA_NO_NODE) {
4220 		/*
4221 		 * global hstate attribute
4222 		 */
4223 		if (!(obey_mempolicy &&
4224 				init_nodemask_of_mempolicy(&nodes_allowed)))
4225 			n_mask = &node_states[N_MEMORY];
4226 		else
4227 			n_mask = &nodes_allowed;
4228 	} else {
4229 		/*
4230 		 * Node specific request.  count adjustment happens in
4231 		 * set_max_huge_pages() after acquiring hugetlb_lock.
4232 		 */
4233 		init_nodemask_of_node(&nodes_allowed, nid);
4234 		n_mask = &nodes_allowed;
4235 	}
4236 
4237 	err = set_max_huge_pages(h, count, nid, n_mask);
4238 
4239 	return err ? err : len;
4240 }
4241 
4242 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
4243 					 struct kobject *kobj, const char *buf,
4244 					 size_t len)
4245 {
4246 	struct hstate *h;
4247 	unsigned long count;
4248 	int nid;
4249 	int err;
4250 
4251 	err = kstrtoul(buf, 10, &count);
4252 	if (err)
4253 		return err;
4254 
4255 	h = kobj_to_hstate(kobj, &nid);
4256 	return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
4257 }
4258 
/* sysfs "nr_hugepages": pool size, ignoring the task's mempolicy. */
static ssize_t nr_hugepages_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return nr_hugepages_show_common(kobj, attr, buf);
}

static ssize_t nr_hugepages_store(struct kobject *kobj,
	       struct kobj_attribute *attr, const char *buf, size_t len)
{
	return nr_hugepages_store_common(false, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages);
4271 
#ifdef CONFIG_NUMA

/*
 * hstate attribute for optionally mempolicy-based constraint on persistent
 * huge page alloc/free.
 */
static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   char *buf)
{
	return nr_hugepages_show_common(kobj, attr, buf);
}

/* Same as nr_hugepages_store, but honors the writer's mempolicy. */
static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
	       struct kobj_attribute *attr, const char *buf, size_t len)
{
	return nr_hugepages_store_common(true, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages_mempolicy);
#endif
4292 
4293 
/* sysfs "nr_overcommit_hugepages": max surplus pages allowed for this hstate. */
static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	struct hstate *h = kobj_to_hstate(kobj, NULL);
	return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
}

static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long input;
	struct hstate *h = kobj_to_hstate(kobj, NULL);

	/* Overcommit requires runtime allocation; reject if unsupported. */
	if (hstate_is_gigantic_no_runtime(h))
		return -EINVAL;

	err = kstrtoul(buf, 10, &input);
	if (err)
		return err;

	/* The counter is read/written under hugetlb_lock elsewhere too. */
	spin_lock_irq(&hugetlb_lock);
	h->nr_overcommit_huge_pages = input;
	spin_unlock_irq(&hugetlb_lock);

	return count;
}
HSTATE_ATTR(nr_overcommit_hugepages);
4322 
4323 static ssize_t free_hugepages_show(struct kobject *kobj,
4324 					struct kobj_attribute *attr, char *buf)
4325 {
4326 	struct hstate *h;
4327 	unsigned long free_huge_pages;
4328 	int nid;
4329 
4330 	h = kobj_to_hstate(kobj, &nid);
4331 	if (nid == NUMA_NO_NODE)
4332 		free_huge_pages = h->free_huge_pages;
4333 	else
4334 		free_huge_pages = h->free_huge_pages_node[nid];
4335 
4336 	return sysfs_emit(buf, "%lu\n", free_huge_pages);
4337 }
4338 HSTATE_ATTR_RO(free_hugepages);
4339 
/* sysfs "resv_hugepages": reserved page count (global attribute only). */
static ssize_t resv_hugepages_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	struct hstate *h = kobj_to_hstate(kobj, NULL);
	return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
}
HSTATE_ATTR_RO(resv_hugepages);
4347 
4348 static ssize_t surplus_hugepages_show(struct kobject *kobj,
4349 					struct kobj_attribute *attr, char *buf)
4350 {
4351 	struct hstate *h;
4352 	unsigned long surplus_huge_pages;
4353 	int nid;
4354 
4355 	h = kobj_to_hstate(kobj, &nid);
4356 	if (nid == NUMA_NO_NODE)
4357 		surplus_huge_pages = h->surplus_huge_pages;
4358 	else
4359 		surplus_huge_pages = h->surplus_huge_pages_node[nid];
4360 
4361 	return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
4362 }
4363 HSTATE_ATTR_RO(surplus_hugepages);
4364 
/*
 * sysfs "demote": writing N demotes up to N free huge pages of this hstate
 * (optionally restricted to the node a per-node attribute hangs off) to
 * the size currently configured via "demote_size".
 */
static ssize_t demote_store(struct kobject *kobj,
	       struct kobj_attribute *attr, const char *buf, size_t len)
{
	unsigned long nr_demote;
	unsigned long nr_available;
	nodemask_t nodes_allowed, *n_mask;
	struct hstate *h;
	int err;
	int nid;

	err = kstrtoul(buf, 10, &nr_demote);
	if (err)
		return err;
	h = kobj_to_hstate(kobj, &nid);

	if (nid != NUMA_NO_NODE) {
		init_nodemask_of_node(&nodes_allowed, nid);
		n_mask = &nodes_allowed;
	} else {
		n_mask = &node_states[N_MEMORY];
	}

	/* Synchronize with other sysfs operations modifying huge pages */
	mutex_lock(&h->resize_lock);
	spin_lock_irq(&hugetlb_lock);

	while (nr_demote) {
		long rc;

		/*
		 * Check for available pages to demote each time thorough the
		 * loop as demote_pool_huge_page will drop hugetlb_lock.
		 */
		if (nid != NUMA_NO_NODE)
			nr_available = h->free_huge_pages_node[nid];
		else
			nr_available = h->free_huge_pages;
		/* Reserved pages must stay at their current size. */
		nr_available -= h->resv_huge_pages;
		if (!nr_available)
			break;

		rc = demote_pool_huge_page(h, n_mask, nr_demote);
		if (rc < 0) {
			err = rc;
			break;
		}

		/* rc is the number of pages demoted this iteration. */
		nr_demote -= rc;
	}

	spin_unlock_irq(&hugetlb_lock);
	mutex_unlock(&h->resize_lock);

	if (err)
		return err;
	return len;
}
HSTATE_ATTR_WO(demote);
4423 
4424 static ssize_t demote_size_show(struct kobject *kobj,
4425 					struct kobj_attribute *attr, char *buf)
4426 {
4427 	struct hstate *h = kobj_to_hstate(kobj, NULL);
4428 	unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
4429 
4430 	return sysfs_emit(buf, "%lukB\n", demote_size);
4431 }
4432 
/*
 * sysfs "demote_size": set the size pages of this hstate are demoted to.
 * The target must itself be a valid hstate of at least HUGETLB_PAGE_ORDER
 * and strictly smaller than this hstate.
 */
static ssize_t demote_size_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	struct hstate *h, *demote_hstate;
	unsigned long demote_size;
	unsigned int demote_order;

	demote_size = (unsigned long)memparse(buf, NULL);

	/* The requested size must correspond to an existing hstate. */
	demote_hstate = size_to_hstate(demote_size);
	if (!demote_hstate)
		return -EINVAL;
	demote_order = demote_hstate->order;
	if (demote_order < HUGETLB_PAGE_ORDER)
		return -EINVAL;

	/* demote order must be smaller than hstate order */
	h = kobj_to_hstate(kobj, NULL);
	if (demote_order >= h->order)
		return -EINVAL;

	/* resize_lock synchronizes access to demote size and writes */
	mutex_lock(&h->resize_lock);
	h->demote_order = demote_order;
	mutex_unlock(&h->resize_lock);

	return count;
}
HSTATE_ATTR(demote_size);
4463 
/* Attributes present on every global hstate sysfs directory. */
static struct attribute *hstate_attrs[] = {
	&nr_hugepages_attr.attr,
	&nr_overcommit_hugepages_attr.attr,
	&free_hugepages_attr.attr,
	&resv_hugepages_attr.attr,
	&surplus_hugepages_attr.attr,
#ifdef CONFIG_NUMA
	&nr_hugepages_mempolicy_attr.attr,
#endif
	NULL,
};

static const struct attribute_group hstate_attr_group = {
	.attrs = hstate_attrs,
};

/* Demote attributes; only created for hstates with a demote_order. */
static struct attribute *hstate_demote_attrs[] = {
	&demote_size_attr.attr,
	&demote_attr.attr,
	NULL,
};

static const struct attribute_group hstate_demote_attr_group = {
	.attrs = hstate_demote_attrs,
};
4489 
/*
 * Create the sysfs directory and attribute group for @h under @parent,
 * recording the new kobject in @hstate_kobjs[hstate_index(h)].  Demote
 * attributes are additionally created for hstates with a demote_order.
 * Returns 0 or a negative errno; on failure nothing is left registered.
 */
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
				    struct kobject **hstate_kobjs,
				    const struct attribute_group *hstate_attr_group)
{
	int retval;
	int hi = hstate_index(h);

	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
	if (!hstate_kobjs[hi])
		return -ENOMEM;

	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
	if (retval) {
		kobject_put(hstate_kobjs[hi]);
		hstate_kobjs[hi] = NULL;
		return retval;
	}

	if (h->demote_order) {
		retval = sysfs_create_group(hstate_kobjs[hi],
					    &hstate_demote_attr_group);
		if (retval) {
			pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
			/* Undo the work above so no partial state remains. */
			sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
			kobject_put(hstate_kobjs[hi]);
			hstate_kobjs[hi] = NULL;
			return retval;
		}
	}

	return 0;
}
4522 
#ifdef CONFIG_NUMA
/*
 * Set at the end of hugetlb_sysfs_init(); hugetlb_register_node() is a
 * no-op until then.
 */
static bool hugetlb_sysfs_initialized __ro_after_init;

/*
 * node_hstate/s - associate per node hstate attributes, via their kobjects,
 * with node devices in node_devices[] using a parallel array.  The array
 * index of a node device or _hstate == node id.
 * This is here to avoid any static dependency of the node device driver, in
 * the base kernel, on the hugetlb module.
 */
struct node_hstate {
	struct kobject		*hugepages_kobj;
	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
};
static struct node_hstate node_hstates[MAX_NUMNODES];

/*
 * A subset of global hstate attributes for node devices
 */
static struct attribute *per_node_hstate_attrs[] = {
	&nr_hugepages_attr.attr,
	&free_hugepages_attr.attr,
	&surplus_hugepages_attr.attr,
	NULL,
};

static const struct attribute_group per_node_hstate_attr_group = {
	.attrs = per_node_hstate_attrs,
};
4552 
/*
 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
 * Returns node id via non-NULL nidp.
 * BUGs if @kobj is not one of the registered per-node hstate kobjects.
 */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
{
	int nid;

	for (nid = 0; nid < nr_node_ids; nid++) {
		struct node_hstate *nhs = &node_hstates[nid];
		int i;
		for (i = 0; i < HUGE_MAX_HSTATE; i++)
			if (nhs->hstate_kobjs[i] == kobj) {
				if (nidp)
					*nidp = nid;
				return &hstates[i];
			}
	}

	BUG();
	return NULL;
}
4575 
/*
 * Unregister hstate attributes from a single node device.
 * No-op if no hstate attributes attached.
 */
void hugetlb_unregister_node(struct node *node)
{
	struct hstate *h;
	struct node_hstate *nhs = &node_hstates[node->dev.id];

	if (!nhs->hugepages_kobj)
		return;		/* no hstate attributes */

	for_each_hstate(h) {
		int idx = hstate_index(h);
		struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];

		/* May be partially registered after a failed register. */
		if (!hstate_kobj)
			continue;
		/* Demote group only exists for hstates with a demote_order. */
		if (h->demote_order)
			sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
		sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
		kobject_put(hstate_kobj);
		nhs->hstate_kobjs[idx] = NULL;
	}

	kobject_put(nhs->hugepages_kobj);
	nhs->hugepages_kobj = NULL;
}
4604 
4605 
/*
 * Register hstate attributes for a single node device.
 * No-op if attributes already registered.
 */
void hugetlb_register_node(struct node *node)
{
	struct hstate *h;
	struct node_hstate *nhs = &node_hstates[node->dev.id];
	int err;

	/* hstates are not ready before hugetlb_sysfs_init() has run. */
	if (!hugetlb_sysfs_initialized)
		return;

	if (nhs->hugepages_kobj)
		return;		/* already allocated */

	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
							&node->dev.kobj);
	if (!nhs->hugepages_kobj)
		return;

	for_each_hstate(h) {
		err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
						nhs->hstate_kobjs,
						&per_node_hstate_attr_group);
		if (err) {
			pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
				h->name, node->dev.id);
			/* Tear down everything registered so far. */
			hugetlb_unregister_node(node);
			break;
		}
	}
}
4639 
/*
 * hugetlb init time:  register hstate attributes for all registered node
 * devices of nodes that have memory.  All on-line nodes should have
 * registered their associated device by this time.
 * Called from hugetlb_sysfs_init() after hugetlb_sysfs_initialized is set.
 */
static void __init hugetlb_register_all_nodes(void)
{
	int nid;

	for_each_online_node(nid)
		hugetlb_register_node(node_devices[nid]);
}
4652 #else	/* !CONFIG_NUMA */
4653 
4654 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
4655 {
4656 	BUG();
4657 	if (nidp)
4658 		*nidp = -1;
4659 	return NULL;
4660 }
4661 
/* !CONFIG_NUMA: no per-node sysfs directories to create. */
static void hugetlb_register_all_nodes(void) { }
4663 
4664 #endif
4665 
/*
 * Create /sys/kernel/mm/hugepages and one subdirectory per hstate, then
 * register the per-node attributes.  Failure to add a single hstate is
 * logged but not fatal; the remaining hstates are still registered.
 */
static void __init hugetlb_sysfs_init(void)
{
	struct hstate *h;
	int err;

	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
	if (!hugepages_kobj)
		return;

	for_each_hstate(h) {
		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
					 hstate_kobjs, &hstate_attr_group);
		if (err)
			pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
	}

#ifdef CONFIG_NUMA
	/* Allow hugetlb_register_node() to do real work from now on. */
	hugetlb_sysfs_initialized = true;
#endif
	hugetlb_register_all_nodes();
}
4687 
/* sysctl registration only exists when CONFIG_SYSCTL is enabled. */
#ifdef CONFIG_SYSCTL
static void hugetlb_sysctl_init(void);
#else
static inline void hugetlb_sysctl_init(void) { }
#endif
4693 
/*
 * Main hugetlb initialization (subsys_initcall): ensures the HPAGE_SIZE
 * hstate exists, resolves the default hstate and any implicitly specified
 * page counts from the command line, collects boot-allocated pages into
 * the pools, and registers the sysfs/sysctl/cgroup interfaces and the
 * fault mutex table.
 */
static int __init hugetlb_init(void)
{
	int i;

	/* All hugetlb page flags must fit in folio->private. */
	BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
			__NR_HPAGEFLAGS);
	BUILD_BUG_ON_INVALID(HUGETLB_PAGE_ORDER > MAX_FOLIO_ORDER);

	if (!hugepages_supported()) {
		if (hugetlb_max_hstate || default_hstate_max_huge_pages)
			pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
		return 0;
	}

	/*
	 * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists.  Some
	 * architectures depend on setup being done here.
	 */
	hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
	if (!parsed_default_hugepagesz) {
		/*
		 * If we did not parse a default huge page size, set
		 * default_hstate_idx to HPAGE_SIZE hstate. And, if the
		 * number of huge pages for this default size was implicitly
		 * specified, set that here as well.
		 * Note that the implicit setting will overwrite an explicit
		 * setting.  A warning will be printed in this case.
		 */
		default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
		if (default_hstate_max_huge_pages) {
			if (default_hstate.max_huge_pages) {
				char buf[32];

				string_get_size(huge_page_size(&default_hstate),
					1, STRING_UNITS_2, buf, 32);
				pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
					default_hstate.max_huge_pages, buf);
				pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
					default_hstate_max_huge_pages);
			}
			default_hstate.max_huge_pages =
				default_hstate_max_huge_pages;

			/* Propagate any per-node "hugepages=N:x" requests. */
			for_each_online_node(i)
				default_hstate.max_huge_pages_node[i] =
					default_hugepages_in_node[i];
		}
	}

	hugetlb_cma_check();
	hugetlb_init_hstates();
	gather_bootmem_prealloc();
	report_hugepages();

	hugetlb_sysfs_init();
	hugetlb_cgroup_file_init();
	hugetlb_sysctl_init();

	/* Scale the fault mutex table with the number of possible CPUs. */
#ifdef CONFIG_SMP
	num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
#else
	num_fault_mutexes = 1;
#endif
	hugetlb_fault_mutex_table =
		kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
			      GFP_KERNEL);
	BUG_ON(!hugetlb_fault_mutex_table);

	for (i = 0; i < num_fault_mutexes; i++)
		mutex_init(&hugetlb_fault_mutex_table[i]);
	return 0;
}
subsys_initcall(hugetlb_init);
4767 
/* Overwritten by architectures with more huge page sizes */
/* The generic default only accepts the base huge page size. */
bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
{
	return size == HPAGE_SIZE;
}
4773 
4774 void __init hugetlb_add_hstate(unsigned int order)
4775 {
4776 	struct hstate *h;
4777 	unsigned long i;
4778 
4779 	if (size_to_hstate(PAGE_SIZE << order)) {
4780 		return;
4781 	}
4782 	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
4783 	BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
4784 	WARN_ON(order > MAX_FOLIO_ORDER);
4785 	h = &hstates[hugetlb_max_hstate++];
4786 	__mutex_init(&h->resize_lock, "resize mutex", &h->resize_key);
4787 	h->order = order;
4788 	h->mask = ~(huge_page_size(h) - 1);
4789 	for (i = 0; i < MAX_NUMNODES; ++i)
4790 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
4791 	INIT_LIST_HEAD(&h->hugepage_activelist);
4792 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
4793 					huge_page_size(h)/SZ_1K);
4794 
4795 	parsed_hstate = h;
4796 }
4797 
/*
 * Weak default: node-specific "hugepages=N:x" allocation is assumed to
 * work; architectures that cannot honor it override this.
 */
bool __init __weak hugetlb_node_alloc_supported(void)
{
	return true;
}
4802 
4803 static void __init hugepages_clear_pages_in_node(void)
4804 {
4805 	if (!hugetlb_max_hstate) {
4806 		default_hstate_max_huge_pages = 0;
4807 		memset(default_hugepages_in_node, 0,
4808 			sizeof(default_hugepages_in_node));
4809 	} else {
4810 		parsed_hstate->max_huge_pages = 0;
4811 		memset(parsed_hstate->max_huge_pages_node, 0,
4812 			sizeof(parsed_hstate->max_huge_pages_node));
4813 	}
4814 }
4815 
/*
 * Save an early command line parameter string @s and its handler @setup
 * for later replay by hugetlb_parse_params().  The string is copied into
 * a persistent static buffer.  Returns 0, or -EINVAL when the parameter
 * table or string buffer is full.
 */
static __init int hugetlb_add_param(char *s, int (*setup)(char *))
{
	size_t len;
	char *p;

	if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS)
		return -EINVAL;

	/* +1 keeps the NUL terminator with the copied string. */
	len = strlen(s) + 1;
	if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf))
		return -EINVAL;

	p = &hstate_cmdline_buf[hstate_cmdline_index];
	memcpy(p, s, len);
	hstate_cmdline_index += len;

	hugetlb_params[hugetlb_param_index].val = p;
	hugetlb_params[hugetlb_param_index].setup = setup;

	hugetlb_param_index++;

	return 0;
}
4839 
4840 static __init void hugetlb_parse_params(void)
4841 {
4842 	int i;
4843 	struct hugetlb_cmdline *hcp;
4844 
4845 	for (i = 0; i < hugetlb_param_index; i++) {
4846 		hcp = &hugetlb_params[i];
4847 
4848 		hcp->setup(hcp->val);
4849 	}
4850 
4851 	hugetlb_cma_validate_params();
4852 }
4853 
/*
 * hugepages command line processing
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
 * specification.  If not, ignore the hugepages value.  hugepages can also
 * be the first huge page command line option in which case it implicitly
 * specifies the number of huge pages for the default size.
 */
static int __init hugepages_setup(char *s)
{
	unsigned long *mhp;
	static unsigned long *last_mhp;	/* detects back-to-back hugepages= for the same hstate */
	int node = NUMA_NO_NODE;
	int count;
	unsigned long tmp;
	char *p = s;

	/* A preceding invalid hugepagesz= invalidates this hugepages= too. */
	if (!parsed_valid_hugepagesz) {
		pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
		parsed_valid_hugepagesz = true;
		return -EINVAL;
	}

	/*
	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
	 * yet, so this hugepages= parameter goes to the "default hstate".
	 * Otherwise, it goes with the previously parsed hugepagesz or
	 * default_hugepagesz.
	 */
	else if (!hugetlb_max_hstate)
		mhp = &default_hstate_max_huge_pages;
	else
		mhp = &parsed_hstate->max_huge_pages;

	if (mhp == last_mhp) {
		pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
		return 1;
	}

	/*
	 * Accept either a plain count ("hugepages=N") or a per-node list
	 * ("hugepages=0:N,1:M,...").  The two forms cannot be mixed.
	 */
	while (*p) {
		count = 0;
		if (sscanf(p, "%lu%n", &tmp, &count) != 1)
			goto invalid;
		/* Parameter is node format */
		if (p[count] == ':') {
			if (!hugetlb_node_alloc_supported()) {
				pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
				return 1;
			}
			if (tmp >= MAX_NUMNODES || !node_online(tmp))
				goto invalid;
			node = array_index_nospec(tmp, MAX_NUMNODES);
			p += count + 1;
			/* Parse hugepages */
			if (sscanf(p, "%lu%n", &tmp, &count) != 1)
				goto invalid;
			if (!hugetlb_max_hstate)
				default_hugepages_in_node[node] = tmp;
			else
				parsed_hstate->max_huge_pages_node[node] = tmp;
			/* The overall total is the sum of the per-node counts. */
			*mhp += tmp;
			/* Go to parse next node*/
			if (p[count] == ',')
				p += count + 1;
			else
				break;
		} else {
			/* A plain count is only valid at the very start of the string. */
			if (p != s)
				goto invalid;
			*mhp = tmp;
			break;
		}
	}

	last_mhp = mhp;

	return 0;

invalid:
	pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
	/* Drop anything recorded before the parse error. */
	hugepages_clear_pages_in_node();
	return -EINVAL;
}
4936 hugetlb_early_param("hugepages", hugepages_setup);
4937 
4938 /*
4939  * hugepagesz command line processing
4940  * A specific huge page size can only be specified once with hugepagesz.
4941  * hugepagesz is followed by hugepages on the command line.  The global
4942  * variable 'parsed_valid_hugepagesz' is used to determine if prior
4943  * hugepagesz argument was valid.
4944  */
static int __init hugepagesz_setup(char *s)
{
	unsigned long size;
	struct hstate *h;

	/* Stays false on any error so a following hugepages= is ignored too. */
	parsed_valid_hugepagesz = false;
	/* memparse() accepts K/M/G/... size suffixes. */
	size = (unsigned long)memparse(s, NULL);

	if (!arch_hugetlb_valid_size(size)) {
		pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
		return -EINVAL;
	}

	h = size_to_hstate(size);
	if (h) {
		/*
		 * hstate for this size already exists.  This is normally
		 * an error, but is allowed if the existing hstate is the
		 * default hstate.  More specifically, it is only allowed if
		 * the number of huge pages for the default hstate was not
		 * previously specified.
		 */
		if (!parsed_default_hugepagesz ||  h != &default_hstate ||
		    default_hstate.max_huge_pages) {
			pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
			return -EINVAL;
		}

		/*
		 * No need to call hugetlb_add_hstate() as hstate already
		 * exists.  But, do set parsed_hstate so that a following
		 * hugepages= parameter will be applied to this hstate.
		 */
		parsed_hstate = h;
		parsed_valid_hugepagesz = true;
		return 0;
	}

	/* New size: register an hstate for it (also sets parsed_hstate). */
	hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
	parsed_valid_hugepagesz = true;
	return 0;
}
4987 hugetlb_early_param("hugepagesz", hugepagesz_setup);
4988 
4989 /*
4990  * default_hugepagesz command line input
4991  * Only one instance of default_hugepagesz allowed on command line.
4992  */
static int __init default_hugepagesz_setup(char *s)
{
	unsigned long size;
	int i;

	parsed_valid_hugepagesz = false;
	/* Only one default_hugepagesz= is accepted per boot. */
	if (parsed_default_hugepagesz) {
		pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
		return -EINVAL;
	}

	/* memparse() accepts K/M/G/... size suffixes. */
	size = (unsigned long)memparse(s, NULL);

	if (!arch_hugetlb_valid_size(size)) {
		pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
		return -EINVAL;
	}

	/* Register (or re-use) the hstate and make it the default. */
	hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
	parsed_valid_hugepagesz = true;
	parsed_default_hugepagesz = true;
	default_hstate_idx = hstate_index(size_to_hstate(size));

	/*
	 * The number of default huge pages (for this size) could have been
	 * specified as the first hugetlb parameter: hugepages=X.  If so,
	 * then default_hstate_max_huge_pages is set.  If the default huge
	 * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be
	 * allocated here from bootmem allocator.
	 */
	if (default_hstate_max_huge_pages) {
		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
		/*
		 * Since this is an early parameter, we can't check
		 * NUMA node state yet, so loop through MAX_NUMNODES.
		 */
		for (i = 0; i < MAX_NUMNODES; i++) {
			if (default_hugepages_in_node[i] != 0)
				default_hstate.max_huge_pages_node[i] =
					default_hugepages_in_node[i];
		}
		/* The deferred request has been consumed. */
		default_hstate_max_huge_pages = 0;
	}

	return 0;
}
5039 hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);
5040 
/*
 * Record every node that has at least one page of memory (per the
 * memblock ranges) in hugetlb_bootmem_nodes.  A non-empty mask is left
 * untouched, which makes repeated calls idempotent.
 */
void __init hugetlb_bootmem_set_nodes(void)
{
	int i, nid;
	unsigned long start_pfn, end_pfn;

	/* Already populated - nothing to do. */
	if (!nodes_empty(hugetlb_bootmem_nodes))
		return;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		/* Only nodes with a non-empty range count. */
		if (end_pfn > start_pfn)
			node_set(nid, hugetlb_bootmem_nodes);
	}
}
5054 
/* Set once hugetlb_bootmem_alloc() has run; guards against re-entry. */
static bool __hugetlb_bootmem_allocated __initdata;

/* Has the one-shot boot-time hugetlb allocation pass completed? */
bool __init hugetlb_bootmem_allocated(void)
{
	return __hugetlb_bootmem_allocated;
}
5061 
/*
 * One-shot boot-time setup: determine the nodes to allocate from,
 * initialize the per-node boot page lists, replay the buffered hugetlb
 * command line parameters, and allocate the gigantic hstates' pages.
 */
void __init hugetlb_bootmem_alloc(void)
{
	struct hstate *h;
	int i;

	/* May be reached more than once; only the first call does work. */
	if (__hugetlb_bootmem_allocated)
		return;

	hugetlb_bootmem_set_nodes();

	for (i = 0; i < MAX_NUMNODES; i++)
		INIT_LIST_HEAD(&huge_boot_pages[i]);

	/* Applies hugepagesz=/hugepages=/... collected earlier. */
	hugetlb_parse_params();

	for_each_hstate(h) {
		h->next_nid_to_alloc = first_online_node;

		/* Only gigantic sizes must be allocated this early. */
		if (hstate_is_gigantic(h))
			hugetlb_hstate_alloc_pages(h);
	}

	__hugetlb_bootmem_allocated = true;
}
5086 
5087 /*
5088  * hugepage_alloc_threads command line parsing.
5089  *
5090  * When set, use this specific number of threads for the boot
5091  * allocation of hugepages.
5092  */
5093 static int __init hugepage_alloc_threads_setup(char *s)
5094 {
5095 	unsigned long allocation_threads;
5096 
5097 	if (kstrtoul(s, 0, &allocation_threads) != 0)
5098 		return 1;
5099 
5100 	if (allocation_threads == 0)
5101 		return 1;
5102 
5103 	hugepage_allocation_threads = allocation_threads;
5104 
5105 	return 1;
5106 }
5107 __setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup);
5108 
/*
 * Count the free huge pages of @h on the nodes the current task may
 * allocate from: the intersection of cpuset_current_mems_allowed and,
 * when a memory-bind policy applies, the policy's nodemask.
 */
static unsigned int allowed_mems_nr(struct hstate *h)
{
	int node;
	unsigned int nr = 0;
	nodemask_t *mbind_nodemask;
	unsigned int *array = h->free_huge_pages_node;
	gfp_t gfp_mask = htlb_alloc_mask(h);

	/* NULL when no bind-style policy restricts the nodes. */
	mbind_nodemask = policy_mbind_nodemask(gfp_mask);
	for_each_node_mask(node, cpuset_current_mems_allowed) {
		if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
			nr += array[node];
	}

	return nr;
}
5125 
5126 #ifdef CONFIG_SYSCTL
/*
 * Run proc_doulongvec_minmax() against the caller-supplied @out value
 * instead of @table->data, without mutating the shared table.
 */
static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write,
					  void *buffer, size_t *length,
					  loff_t *ppos, unsigned long *out)
{
	struct ctl_table dup_table;

	/*
	 * In order to avoid races with __do_proc_doulongvec_minmax(), we
	 * can duplicate the @table and alter the duplicate of it.
	 */
	dup_table = *table;
	dup_table.data = out;

	return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
}
5142 
/*
 * Shared implementation for the nr_hugepages and nr_hugepages_mempolicy
 * sysctls, operating on the default hstate.  Reads report
 * h->max_huge_pages; writes resize the pool via
 * __nr_hugepages_store_common(), honoring the task's mempolicy when
 * @obey_mempolicy is true.
 */
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
			 const struct ctl_table *table, int write,
			 void *buffer, size_t *length, loff_t *ppos)
{
	struct hstate *h = &default_hstate;
	unsigned long tmp = h->max_huge_pages;
	int ret;

	if (!hugepages_supported())
		return -EOPNOTSUPP;

	/* Copy in (write) or out (read) via the stack variable. */
	ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
					     &tmp);
	if (ret)
		goto out;

	if (write)
		ret = __nr_hugepages_store_common(obey_mempolicy, h,
						  NUMA_NO_NODE, tmp, *length);
out:
	return ret;
}
5165 
5166 static int hugetlb_sysctl_handler(const struct ctl_table *table, int write,
5167 			  void *buffer, size_t *length, loff_t *ppos)
5168 {
5169 
5170 	return hugetlb_sysctl_handler_common(false, table, write,
5171 							buffer, length, ppos);
5172 }
5173 
5174 #ifdef CONFIG_NUMA
5175 static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write,
5176 			  void *buffer, size_t *length, loff_t *ppos)
5177 {
5178 	return hugetlb_sysctl_handler_common(true, table, write,
5179 							buffer, length, ppos);
5180 }
5181 #endif /* CONFIG_NUMA */
5182 
/*
 * vm.nr_overcommit_hugepages handler for the default hstate.  Writes
 * are rejected for gigantic sizes that cannot be allocated at runtime;
 * otherwise the new limit is stored under hugetlb_lock.
 */
static int hugetlb_overcommit_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	struct hstate *h = &default_hstate;
	unsigned long tmp;
	int ret;

	if (!hugepages_supported())
		return -EOPNOTSUPP;

	tmp = h->nr_overcommit_huge_pages;

	/* Surplus gigantic pages can't be created on demand. */
	if (write && hstate_is_gigantic_no_runtime(h))
		return -EINVAL;

	ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
					     &tmp);
	if (ret)
		goto out;

	if (write) {
		spin_lock_irq(&hugetlb_lock);
		h->nr_overcommit_huge_pages = tmp;
		spin_unlock_irq(&hugetlb_lock);
	}
out:
	return ret;
}
5211 
5212 static const struct ctl_table hugetlb_table[] = {
5213 	{
5214 		.procname	= "nr_hugepages",
5215 		.data		= NULL,
5216 		.maxlen		= sizeof(unsigned long),
5217 		.mode		= 0644,
5218 		.proc_handler	= hugetlb_sysctl_handler,
5219 	},
5220 #ifdef CONFIG_NUMA
5221 	{
5222 		.procname       = "nr_hugepages_mempolicy",
5223 		.data           = NULL,
5224 		.maxlen         = sizeof(unsigned long),
5225 		.mode           = 0644,
5226 		.proc_handler   = &hugetlb_mempolicy_sysctl_handler,
5227 	},
5228 #endif
5229 	{
5230 		.procname	= "hugetlb_shm_group",
5231 		.data		= &sysctl_hugetlb_shm_group,
5232 		.maxlen		= sizeof(gid_t),
5233 		.mode		= 0644,
5234 		.proc_handler	= proc_dointvec,
5235 	},
5236 	{
5237 		.procname	= "nr_overcommit_hugepages",
5238 		.data		= NULL,
5239 		.maxlen		= sizeof(unsigned long),
5240 		.mode		= 0644,
5241 		.proc_handler	= hugetlb_overcommit_handler,
5242 	},
5243 };
5244 
/* Register the hugetlb sysctl table under "vm" at boot. */
static void __init hugetlb_sysctl_init(void)
{
	register_sysctl_init("vm", hugetlb_table);
}
5249 #endif /* CONFIG_SYSCTL */
5250 
/*
 * Emit the hugetlb meminfo section into @m: the detailed counters for
 * the default hstate plus a "Hugetlb:" byte total spanning every
 * configured huge page size.
 */
void hugetlb_report_meminfo(struct seq_file *m)
{
	struct hstate *h;
	unsigned long total = 0;

	if (!hugepages_supported())
		return;

	for_each_hstate(h) {
		unsigned long count = h->nr_huge_pages;

		/* Accumulate bytes across all sizes for the final line. */
		total += huge_page_size(h) * count;

		/* The per-field breakdown covers the default size only. */
		if (h == &default_hstate)
			seq_printf(m,
				   "HugePages_Total:   %5lu\n"
				   "HugePages_Free:    %5lu\n"
				   "HugePages_Rsvd:    %5lu\n"
				   "HugePages_Surp:    %5lu\n"
				   "Hugepagesize:   %8lu kB\n",
				   count,
				   h->free_huge_pages,
				   h->resv_huge_pages,
				   h->surplus_huge_pages,
				   huge_page_size(h) / SZ_1K);
	}

	seq_printf(m, "Hugetlb:        %8lu kB\n", total / SZ_1K);
}
5280 
/*
 * Append the default hstate's counters for node @nid to @buf at offset
 * @len (sysfs style).  Returns the number of bytes emitted, or 0 when
 * huge pages are unsupported.
 */
int hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
	struct hstate *h = &default_hstate;

	if (!hugepages_supported())
		return 0;

	return sysfs_emit_at(buf, len,
			     "Node %d HugePages_Total: %5u\n"
			     "Node %d HugePages_Free:  %5u\n"
			     "Node %d HugePages_Surp:  %5u\n",
			     nid, h->nr_huge_pages_node[nid],
			     nid, h->free_huge_pages_node[nid],
			     nid, h->surplus_huge_pages_node[nid]);
}
5296 
/*
 * Dump node @nid's hugetlb counters for every hstate to the kernel log,
 * one line per page size.
 */
void hugetlb_show_meminfo_node(int nid)
{
	struct hstate *h;

	if (!hugepages_supported())
		return;

	for_each_hstate(h)
		printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
			nid,
			h->nr_huge_pages_node[nid],
			h->free_huge_pages_node[nid],
			h->surplus_huge_pages_node[nid],
			huge_page_size(h) / SZ_1K);
}
5312 
/* Report @mm's hugetlb usage counter, converted to kB, into @m. */
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
{
	seq_printf(m, "HugetlbPages:\t%8lu kB\n",
		   K(atomic_long_read(&mm->hugetlb_usage)));
}
5318 
5319 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
5320 unsigned long hugetlb_total_pages(void)
5321 {
5322 	struct hstate *h;
5323 	unsigned long nr_total_pages = 0;
5324 
5325 	for_each_hstate(h)
5326 		nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
5327 	return nr_total_pages;
5328 }
5329 
/*
 * Adjust the global reservation accounting for @h by @delta pages.
 * Positive deltas may grow the pool with surplus pages and are checked
 * (best effort) against the pages available to the task's cpuset and
 * mempolicy; negative deltas give surplus pages back.  Returns 0 on
 * success, -ENOMEM when the reservation cannot be satisfied.
 */
static int hugetlb_acct_memory(struct hstate *h, long delta)
{
	int ret = -ENOMEM;

	if (!delta)
		return 0;

	spin_lock_irq(&hugetlb_lock);
	/*
	 * When cpuset is configured, it breaks the strict hugetlb page
	 * reservation as the accounting is done on a global variable. Such
	 * reservation is completely rubbish in the presence of cpuset because
	 * the reservation is not checked against page availability for the
	 * current cpuset. Application can still potentially OOM'ed by kernel
	 * with lack of free htlb page in cpuset that the task is in.
	 * Attempt to enforce strict accounting with cpuset is almost
	 * impossible (or too ugly) because cpuset is too fluid that
	 * task or memory node can be dynamically moved between cpusets.
	 *
	 * The change of semantics for shared hugetlb mapping with cpuset is
	 * undesirable. However, in order to preserve some of the semantics,
	 * we fall back to check against current free page availability as
	 * a best attempt and hopefully to minimize the impact of changing
	 * semantics that cpuset has.
	 *
	 * Apart from cpuset, we also have memory policy mechanism that
	 * also determines from which node the kernel will allocate memory
	 * in a NUMA system. So similar to cpuset, we also should consider
	 * the memory policy of the current task. Similar to the description
	 * above.
	 */
	if (delta > 0) {
		if (gather_surplus_pages(h, delta) < 0)
			goto out;

		if (delta > allowed_mems_nr(h)) {
			/* Undo the surplus growth; ret remains -ENOMEM. */
			return_unused_surplus_pages(h, delta);
			goto out;
		}
	}

	ret = 0;
	if (delta < 0)
		return_unused_surplus_pages(h, (unsigned long) -delta);

out:
	spin_unlock_irq(&hugetlb_lock);
	return ret;
}
5379 
/*
 * vm_ops->open for hugetlb VMAs, called when a VMA is copied.  Takes a
 * reference on a privately owned reservation map and sets up a fresh
 * per-VMA lock for sharable mappings.
 */
static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
	struct resv_map *resv = vma_resv_map(vma);

	/*
	 * HPAGE_RESV_OWNER indicates a private mapping.
	 * This new VMA should share its siblings reservation map if present.
	 * The VMA will only ever have a valid reservation map pointer where
	 * it is being copied for another still existing VMA.  As that VMA
	 * has a reference to the reservation map it cannot disappear until
	 * after this open call completes.  It is therefore safe to take a
	 * new reference here without additional locking.
	 */
	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
		kref_get(&resv->refs);
	}

	/*
	 * vma_lock structure for sharable mappings is vma specific.
	 * Clear old pointer (if copied via vm_area_dup) and allocate
	 * new structure.  Before clearing, make sure vma_lock is not
	 * for this vma.
	 */
	if (vma->vm_flags & VM_MAYSHARE) {
		struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

		if (vma_lock) {
			if (vma_lock->vma != vma) {
				vma->vm_private_data = NULL;
				hugetlb_vma_lock_alloc(vma);
			} else
				pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
		} else
			hugetlb_vma_lock_alloc(vma);
	}
}
5417 
/*
 * vm_ops->close for hugetlb VMAs.  Frees the per-VMA lock and, when
 * this VMA owns the reservation map, returns unused reservations to
 * the subpool/global pool, uncharges the cgroup counter and drops the
 * reservation map reference.
 */
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
	struct hstate *h = hstate_vma(vma);
	struct resv_map *resv;
	struct hugepage_subpool *spool = subpool_vma(vma);
	unsigned long reserve, start, end;
	long gbl_reserve;

	hugetlb_vma_lock_free(vma);

	resv = vma_resv_map(vma);
	if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
		return;

	/* Range of this VMA in huge-page-sized units. */
	start = vma_hugecache_offset(h, vma, vma->vm_start);
	end = vma_hugecache_offset(h, vma, vma->vm_end);

	/* Reservations not recorded as consumed in the resv map. */
	reserve = (end - start) - region_count(resv, start, end);
	hugetlb_cgroup_uncharge_counter(resv, start, end);
	if (reserve) {
		/*
		 * Decrement reserve counts.  The global reserve count may be
		 * adjusted if the subpool has a minimum size.
		 */
		gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
		hugetlb_acct_memory(h, -gbl_reserve);
	}

	kref_put(&resv->refs, resv_map_release);
}
5448 
5449 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
5450 {
5451 	if (addr & ~(huge_page_mask(hstate_vma(vma))))
5452 		return -EINVAL;
5453 	return 0;
5454 }
5455 
void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
{
	/*
	 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
	 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
	 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
	 * This function is called in the middle of a VMA split operation, with
	 * MM, VMA and rmap all write-locked to prevent concurrent page table
	 * walks (except hardware and gup_fast()).
	 */
	vma_assert_write_locked(vma);
	i_mmap_assert_write_locked(vma->vm_file->f_mapping);

	/* Nothing to do if the split point is already PUD aligned. */
	if (addr & ~PUD_MASK) {
		unsigned long floor = addr & PUD_MASK;
		unsigned long ceil = floor + PUD_SIZE;

		/* Only unshare if the PUD interval lies fully inside the VMA. */
		if (floor >= vma->vm_start && ceil <= vma->vm_end) {
			/*
			 * Locking:
			 * Use take_locks=false here.
			 * The file rmap lock is already held.
			 * The hugetlb VMA lock can't be taken when we already
			 * hold the file rmap lock, and we don't need it because
			 * its purpose is to synchronize against concurrent page
			 * table walks, which are not possible thanks to the
			 * locks held by our caller.
			 */
			hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
		}
	}
}
5488 
/* vm_ops->pagesize: the huge page size backing this VMA. */
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
	return huge_page_size(hstate_vma(vma));
}
5493 
5494 /*
5495  * We cannot handle pagefaults against hugetlb pages at all.  They cause
5496  * handle_mm_fault() to try to instantiate regular-sized pages in the
5497  * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
5498  * this far.
5499  */
static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
{
	BUG();
	return 0;	/* not reached */
}
5505 
5506 /*
5507  * When a new function is introduced to vm_operations_struct and added
5508  * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
5509  * This is because under System V memory model, mappings created via
5510  * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
5511  * their original vm_ops are overwritten with shm_vm_ops.
5512  */
5513 const struct vm_operations_struct hugetlb_vm_ops = {
5514 	.fault = hugetlb_vm_op_fault,
5515 	.open = hugetlb_vm_op_open,
5516 	.close = hugetlb_vm_op_close,
5517 	.may_split = hugetlb_vm_op_split,
5518 	.pagesize = hugetlb_vm_op_pagesize,
5519 };
5520 
5521 static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio,
5522 		bool try_mkwrite)
5523 {
5524 	pte_t entry = folio_mk_pte(folio, vma->vm_page_prot);
5525 	unsigned int shift = huge_page_shift(hstate_vma(vma));
5526 
5527 	if (try_mkwrite && (vma->vm_flags & VM_WRITE)) {
5528 		entry = pte_mkwrite_novma(pte_mkdirty(entry));
5529 	} else {
5530 		entry = pte_wrprotect(entry);
5531 	}
5532 	entry = pte_mkyoung(entry);
5533 	entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
5534 
5535 	return entry;
5536 }
5537 
/*
 * Make the huge PTE at @ptep writable and dirty, updating the MMU cache
 * when the architecture reports that the access flags actually changed.
 */
static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(vma->vm_mm, address, ptep)));
	if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
		update_mmu_cache(vma, address, ptep);
}
5547 
5548 static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
5549 					 unsigned long address, pte_t *ptep)
5550 {
5551 	if (vma->vm_flags & VM_WRITE)
5552 		set_huge_ptep_writable(vma, address, ptep);
5553 }
5554 
5555 bool is_hugetlb_entry_migration(pte_t pte)
5556 {
5557 	swp_entry_t swp;
5558 
5559 	if (huge_pte_none(pte) || pte_present(pte))
5560 		return false;
5561 	swp = pte_to_swp_entry(pte);
5562 	if (is_migration_entry(swp))
5563 		return true;
5564 	else
5565 		return false;
5566 }
5567 
5568 bool is_hugetlb_entry_hwpoisoned(pte_t pte)
5569 {
5570 	swp_entry_t swp;
5571 
5572 	if (huge_pte_none(pte) || pte_present(pte))
5573 		return false;
5574 	swp = pte_to_swp_entry(pte);
5575 	if (is_hwpoison_entry(swp))
5576 		return true;
5577 	else
5578 		return false;
5579 }
5580 
/*
 * Install @new_folio at @ptep: mark it up to date, hook it into the
 * anon rmap, write the PTE (propagating the uffd-wp bit from @old when
 * the VMA has uffd-wp armed) and bump the hugetlb RSS counter.  The
 * caller holds the page table lock for @ptep.
 */
static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
		      struct folio *new_folio, pte_t old, unsigned long sz)
{
	pte_t newpte = make_huge_pte(vma, new_folio, true);

	__folio_mark_uptodate(new_folio);
	hugetlb_add_new_anon_rmap(new_folio, vma, addr);
	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
		newpte = huge_pte_mkuffd_wp(newpte);
	set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz);
	hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
	folio_set_hugetlb_migratable(new_folio);
}
5595 
/*
 * Copy the hugetlb page table entries of @src_vma (in @src) into
 * @dst_vma (in @dst), e.g. at fork time.  For COW (private) mappings
 * the source PTEs are write-protected so parent and child share the
 * pages read-only; when duplicating the anon rmap fails (pinned
 * pages), the data is copied into a freshly allocated folio for the
 * child instead.  Returns 0 on success or a negative errno.
 */
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *dst_vma,
			    struct vm_area_struct *src_vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct folio *pte_folio;
	unsigned long addr;
	bool cow = is_cow_mapping(src_vma->vm_flags);
	struct hstate *h = hstate_vma(src_vma);
	unsigned long sz = huge_page_size(h);
	unsigned long npages = pages_per_huge_page(h);
	struct mmu_notifier_range range;
	unsigned long last_addr_mask;
	int ret = 0;

	if (cow) {
		/*
		 * Source PTEs will be write-protected, so secondary MMUs
		 * must be notified and the seqcount taken to fence off
		 * concurrent fast-GUP.
		 */
		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src,
					src_vma->vm_start,
					src_vma->vm_end);
		mmu_notifier_invalidate_range_start(&range);
		vma_assert_write_locked(src_vma);
		raw_write_seqcount_begin(&src->write_protect_seq);
	} else {
		/*
		 * For shared mappings the vma lock must be held before
		 * calling hugetlb_walk() in the src vma. Otherwise, the
		 * returned ptep could go away if part of a shared pmd and
		 * another thread calls huge_pmd_unshare.
		 */
		hugetlb_vma_lock_read(src_vma);
	}

	last_addr_mask = hugetlb_mask_last_page(h);
	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
		spinlock_t *src_ptl, *dst_ptl;
		src_pte = hugetlb_walk(src_vma, addr, sz);
		if (!src_pte) {
			/* Skip the rest of the range this missing table covers. */
			addr |= last_addr_mask;
			continue;
		}
		dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
		if (!dst_pte) {
			ret = -ENOMEM;
			break;
		}

#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
		/* If the pagetables are shared, there is nothing to do */
		if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) {
			addr |= last_addr_mask;
			continue;
		}
#endif

		dst_ptl = huge_pte_lock(h, dst, dst_pte);
		src_ptl = huge_pte_lockptr(h, src, src_pte);
		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
		entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
again:
		if (huge_pte_none(entry)) {
			/*
			 * Skip if src entry none.
			 */
			;
		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
			/* Propagate the poison marker to the child. */
			if (!userfaultfd_wp(dst_vma))
				entry = huge_pte_clear_uffd_wp(entry);
			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
		} else if (unlikely(is_hugetlb_entry_migration(entry))) {
			swp_entry_t swp_entry = pte_to_swp_entry(entry);
			bool uffd_wp = pte_swp_uffd_wp(entry);

			if (!is_readable_migration_entry(swp_entry) && cow) {
				/*
				 * COW mappings require pages in both
				 * parent and child to be set to read.
				 */
				swp_entry = make_readable_migration_entry(
							swp_offset(swp_entry));
				entry = swp_entry_to_pte(swp_entry);
				if (userfaultfd_wp(src_vma) && uffd_wp)
					entry = pte_swp_mkuffd_wp(entry);
				set_huge_pte_at(src, addr, src_pte, entry, sz);
			}
			if (!userfaultfd_wp(dst_vma))
				entry = huge_pte_clear_uffd_wp(entry);
			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
		} else if (unlikely(is_pte_marker(entry))) {
			/* Copy only the markers the child VMA supports. */
			pte_marker marker = copy_pte_marker(
				pte_to_swp_entry(entry), dst_vma);

			if (marker)
				set_huge_pte_at(dst, addr, dst_pte,
						make_pte_marker(marker), sz);
		} else {
			/* Present entry: share the existing folio. */
			entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
			pte_folio = page_folio(pte_page(entry));
			folio_get(pte_folio);

			/*
			 * Failing to duplicate the anon rmap is a rare case
			 * where we see pinned hugetlb pages while they're
			 * prone to COW. We need to do the COW earlier during
			 * fork.
			 *
			 * When pre-allocating the page or copying data, we
			 * need to be without the pgtable locks since we could
			 * sleep during the process.
			 */
			if (!folio_test_anon(pte_folio)) {
				hugetlb_add_file_rmap(pte_folio);
			} else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) {
				pte_t src_pte_old = entry;
				struct folio *new_folio;

				/* Drop both PTE locks: allocation/copy may sleep. */
				spin_unlock(src_ptl);
				spin_unlock(dst_ptl);
				/* Do not use reserve as it's private owned */
				new_folio = alloc_hugetlb_folio(dst_vma, addr, false);
				if (IS_ERR(new_folio)) {
					folio_put(pte_folio);
					ret = PTR_ERR(new_folio);
					break;
				}
				ret = copy_user_large_folio(new_folio, pte_folio,
							    addr, dst_vma);
				folio_put(pte_folio);
				if (ret) {
					folio_put(new_folio);
					break;
				}

				/* Install the new hugetlb folio if src pte stable */
				dst_ptl = huge_pte_lock(h, dst, dst_pte);
				src_ptl = huge_pte_lockptr(h, src, src_pte);
				spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
				entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
				if (!pte_same(src_pte_old, entry)) {
					/* Source changed while unlocked: retry this entry. */
					restore_reserve_on_error(h, dst_vma, addr,
								new_folio);
					folio_put(new_folio);
					/* huge_ptep of dst_pte won't change as in child */
					goto again;
				}
				hugetlb_install_folio(dst_vma, dst_pte, addr,
						      new_folio, src_pte_old, sz);
				spin_unlock(src_ptl);
				spin_unlock(dst_ptl);
				continue;
			}

			if (cow) {
				/*
				 * No need to notify as we are downgrading page
				 * table protection not changing it to point
				 * to a new page.
				 *
				 * See Documentation/mm/mmu_notifier.rst
				 */
				huge_ptep_set_wrprotect(src, addr, src_pte);
				entry = huge_pte_wrprotect(entry);
			}

			if (!userfaultfd_wp(dst_vma))
				entry = huge_pte_clear_uffd_wp(entry);

			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
			hugetlb_count_add(npages, dst);
		}
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
	}

	if (cow) {
		raw_write_seqcount_end(&src->write_protect_seq);
		mmu_notifier_invalidate_range_end(&range);
	} else {
		hugetlb_vma_unlock_read(src_vma);
	}

	return ret;
}
5778 
/*
 * Relocate one huge PTE from @src_pte/@old_addr to @dst_pte/@new_addr
 * under both page table locks.  When the VMA has userfaultfd armed
 * without remap-event support, uffd-wp state is cleared in transit and
 * a bare uffd-wp marker is dropped entirely.
 */
static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
			  unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
			  unsigned long sz)
{
	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
	struct hstate *h = hstate_vma(vma);
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *src_ptl, *dst_ptl;
	pte_t pte;

	dst_ptl = huge_pte_lock(h, mm, dst_pte);
	src_ptl = huge_pte_lockptr(h, mm, src_pte);

	/*
	 * We don't have to worry about the ordering of src and dst ptlocks
	 * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
	 */
	if (src_ptl != dst_ptl)
		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);

	/* A pure uffd-wp marker is discarded rather than moved. */
	if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
		huge_pte_clear(mm, new_addr, dst_pte, sz);
	else {
		if (need_clear_uffd_wp) {
			/* Strip the wp bit from present and swap entries alike. */
			if (pte_present(pte))
				pte = huge_pte_clear_uffd_wp(pte);
			else if (is_swap_pte(pte))
				pte = pte_swp_clear_uffd_wp(pte);
		}
		set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
	}

	if (src_ptl != dst_ptl)
		spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
}
5817 
/*
 * Move the hugetlb page table entries covering [old_addr, old_addr+len)
 * in @vma to @new_addr in @new_vma (mremap path).  Shared PMDs are
 * unshared rather than moved.  Returns the number of bytes actually
 * relocated (may be less than @len if allocating a destination page
 * table fails).
 */
int move_hugetlb_page_tables(struct vm_area_struct *vma,
			     struct vm_area_struct *new_vma,
			     unsigned long old_addr, unsigned long new_addr,
			     unsigned long len)
{
	struct hstate *h = hstate_vma(vma);
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long sz = huge_page_size(h);
	struct mm_struct *mm = vma->vm_mm;
	unsigned long old_end = old_addr + len;
	unsigned long last_addr_mask;
	pte_t *src_pte, *dst_pte;
	struct mmu_notifier_range range;
	bool shared_pmd = false;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
				old_end);
	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
	/*
	 * In case of shared PMDs, we should cover the maximum possible
	 * range.
	 */
	flush_cache_range(vma, range.start, range.end);

	mmu_notifier_invalidate_range_start(&range);
	last_addr_mask = hugetlb_mask_last_page(h);
	/* Prevent race with file truncation */
	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(mapping);
	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
		src_pte = hugetlb_walk(vma, old_addr, sz);
		if (!src_pte) {
			/* Skip the rest of the range this missing table covers. */
			old_addr |= last_addr_mask;
			new_addr |= last_addr_mask;
			continue;
		}
		if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
			continue;

		/* Shared PMDs are unshared, not moved; skip their range. */
		if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
			shared_pmd = true;
			old_addr |= last_addr_mask;
			new_addr |= last_addr_mask;
			continue;
		}

		dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
		if (!dst_pte)
			break;

		move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
	}

	/* Unsharing may have touched the wider range; flush it all. */
	if (shared_pmd)
		flush_hugetlb_tlb_range(vma, range.start, range.end);
	else
		flush_hugetlb_tlb_range(vma, old_end - len, old_end);
	mmu_notifier_invalidate_range_end(&range);
	i_mmap_unlock_write(mapping);
	hugetlb_vma_unlock_write(vma);

	return len + old_addr - old_end;
}
5881 
/*
 * Unmap the hugetlb range [start, end) of @vma into @tlb.  If @folio is
 * non-NULL, only that specific folio is unmapped and the walk stops once
 * it has been handled; otherwise the whole range is zapped.
 *
 * Callers arrange for i_mmap_rwsem and the hugetlb vma lock to be held
 * (see __hugetlb_zap_begin() and unmap_ref_private()); the force_flush
 * comment at the end relies on that.
 */
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
			    unsigned long start, unsigned long end,
			    struct folio *folio, zap_flags_t zap_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	const bool folio_provided = !!folio;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	spinlock_t *ptl;
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);
	bool adjust_reservation;
	unsigned long last_addr_mask;
	bool force_flush = false;

	WARN_ON(!is_vm_hugetlb_page(vma));
	/* start/end must be huge-page aligned */
	BUG_ON(start & ~huge_page_mask(h));
	BUG_ON(end & ~huge_page_mask(h));

	/*
	 * This is a hugetlb vma, all the pte entries should point
	 * to huge page.
	 */
	tlb_change_page_size(tlb, sz);
	tlb_start_vma(tlb, vma);

	last_addr_mask = hugetlb_mask_last_page(h);
	address = start;
	for (; address < end; address += sz) {
		ptep = hugetlb_walk(vma, address, sz);
		if (!ptep) {
			/* No page table here; skip the gap it would cover. */
			address |= last_addr_mask;
			continue;
		}

		ptl = huge_pte_lock(h, mm, ptep);
		if (huge_pmd_unshare(mm, vma, address, ptep)) {
			spin_unlock(ptl);
			/*
			 * Record the whole PUD range in the gather and force
			 * a flush before returning (see comment at the end).
			 */
			tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
			force_flush = true;
			address |= last_addr_mask;
			continue;
		}

		pte = huge_ptep_get(mm, address, ptep);
		if (huge_pte_none(pte)) {
			spin_unlock(ptl);
			continue;
		}

		/*
		 * Migrating hugepage or HWPoisoned hugepage is already
		 * unmapped and its refcount is dropped, so just clear pte here.
		 */
		if (unlikely(!pte_present(pte))) {
			/*
			 * If the pte was wr-protected by uffd-wp in any of the
			 * swap forms, meanwhile the caller does not want to
			 * drop the uffd-wp bit in this zap, then replace the
			 * pte with a marker.
			 */
			if (pte_swp_uffd_wp_any(pte) &&
			    !(zap_flags & ZAP_FLAG_DROP_MARKER))
				set_huge_pte_at(mm, address, ptep,
						make_pte_marker(PTE_MARKER_UFFD_WP),
						sz);
			else
				huge_pte_clear(mm, address, ptep, sz);
			spin_unlock(ptl);
			continue;
		}

		/*
		 * If a folio is supplied, it is because a specific
		 * folio is being unmapped, not a range. Ensure the folio we
		 * are about to unmap is the actual folio of interest.
		 */
		if (folio_provided) {
			if (folio != page_folio(pte_page(pte))) {
				spin_unlock(ptl);
				continue;
			}
			/*
			 * Mark the VMA as having unmapped its page so that
			 * future faults in this VMA will fail rather than
			 * looking like data was lost
			 */
			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
		} else {
			folio = page_folio(pte_page(pte));
		}

		pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
		tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
		if (huge_pte_dirty(pte))
			folio_mark_dirty(folio);
		/* Leave a uffd-wp pte marker if needed */
		if (huge_pte_uffd_wp(pte) &&
		    !(zap_flags & ZAP_FLAG_DROP_MARKER))
			set_huge_pte_at(mm, address, ptep,
					make_pte_marker(PTE_MARKER_UFFD_WP),
					sz);
		hugetlb_count_sub(pages_per_huge_page(h), mm);
		hugetlb_remove_rmap(folio);
		spin_unlock(ptl);

		/*
		 * Restore the reservation for anonymous page, otherwise the
		 * backing page could be stolen by someone.
		 * If we are freeing a surplus, do not set the restore
		 * reservation bit.
		 */
		adjust_reservation = false;

		spin_lock_irq(&hugetlb_lock);
		if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
		    folio_test_anon(folio)) {
			folio_set_hugetlb_restore_reserve(folio);
			/* Reservation to be adjusted after the spin lock */
			adjust_reservation = true;
		}
		spin_unlock_irq(&hugetlb_lock);

		/*
		 * Adjust the reservation for the region that will have the
		 * reserve restored. Keep in mind that vma_needs_reservation() changes
		 * resv->adds_in_progress if it succeeds. If this is not done,
		 * do_exit() will not see it, and will keep the reservation
		 * forever.
		 */
		if (adjust_reservation) {
			int rc = vma_needs_reservation(h, vma, address);

			if (rc < 0)
				/* Presumably allocate_file_region_entries failed
				 * to allocate a file_region struct. Clear
				 * hugetlb_restore_reserve so that global reserve
				 * count will not be incremented by free_huge_folio.
				 * Act as if we consumed the reservation.
				 */
				folio_clear_hugetlb_restore_reserve(folio);
			else if (rc)
				vma_add_reservation(h, vma, address);
		}

		tlb_remove_page_size(tlb, folio_page(folio, 0),
				     folio_size(folio));
		/*
		 * If we were instructed to unmap a specific folio, we're done.
		 */
		if (folio_provided)
			break;
	}
	tlb_end_vma(tlb, vma);

	/*
	 * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
	 * could defer the flush until now, since by holding i_mmap_rwsem we
	 * guaranteed that the last reference would not be dropped. But we must
	 * do the flushing before we return, as otherwise i_mmap_rwsem will be
	 * dropped and the last reference to the shared PMDs page might be
	 * dropped as well.
	 *
	 * In theory we could defer the freeing of the PMD pages as well, but
	 * huge_pmd_unshare() relies on the exact page_count for the PMD page to
	 * detect sharing, so we cannot defer the release of the page either.
	 * Instead, do flush now.
	 */
	if (force_flush)
		tlb_flush_mmu_tlbonly(tlb);
}
6054 
/*
 * Take the locks needed before zapping a hugetlb range: widen [start, end)
 * to cover possible PMD sharing, then take the hugetlb vma lock and
 * i_mmap_rwsem in write mode.  Paired with __hugetlb_zap_end().
 */
void __hugetlb_zap_begin(struct vm_area_struct *vma,
			 unsigned long *start, unsigned long *end)
{
	if (!vma->vm_file)	/* hugetlbfs_file_mmap error */
		return;

	adjust_range_if_pmd_sharing_possible(vma, start, end);
	hugetlb_vma_lock_write(vma);
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
}
6066 
/*
 * Drop the locks taken by __hugetlb_zap_begin().  On the final unmap
 * (ZAP_FLAG_UNMAP) the vma lock is also freed, which makes the vma
 * ineligible for future PMD sharing.
 */
void __hugetlb_zap_end(struct vm_area_struct *vma,
		       struct zap_details *details)
{
	zap_flags_t zap_flags = details ? details->zap_flags : 0;

	if (!vma->vm_file)	/* hugetlbfs_file_mmap error */
		return;

	if (zap_flags & ZAP_FLAG_UNMAP) {	/* final unmap */
		/*
		 * Unlock and free the vma lock before releasing i_mmap_rwsem.
		 * When the vma_lock is freed, this makes the vma ineligible
		 * for pmd sharing.  And, i_mmap_rwsem is required to set up
		 * pmd sharing.  This is important as page tables for this
		 * unmapped range will be asynchronously deleted.  If the page
		 * tables are shared, there will be issues when accessed by
		 * someone else.
		 */
		__hugetlb_vma_unlock_write_free(vma);
	} else {
		hugetlb_vma_unlock_write(vma);
	}

	if (vma->vm_file)
		i_mmap_unlock_write(vma->vm_file->f_mapping);
}
6093 
/*
 * Zap the hugetlb range [start, end) of @vma, setting up the mmu notifier
 * and mmu_gather around __unmap_hugepage_range() — see that function for
 * the meaning of @folio and @zap_flags.  Per unmap_ref_private(), callers
 * invoke this with the i_mmap lock already held.
 */
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end, struct folio *folio,
			  zap_flags_t zap_flags)
{
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				start, end);
	/* Widen the notified range in case a shared PMD gets unshared. */
	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
	mmu_notifier_invalidate_range_start(&range);
	tlb_gather_mmu(&tlb, vma->vm_mm);

	__unmap_hugepage_range(&tlb, vma, start, end,
			       folio, zap_flags);

	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);
}
6113 
6114 /*
6115  * This is called when the original mapper is failing to COW a MAP_PRIVATE
6116  * mapping it owns the reserve page for. The intention is to unmap the page
6117  * from other VMAs and let the children be SIGKILLed if they are faulting the
6118  * same region.
6119  */
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
			      struct folio *folio, unsigned long address)
{
	struct hstate *h = hstate_vma(vma);
	struct vm_area_struct *iter_vma;
	struct address_space *mapping;
	pgoff_t pgoff;

	/*
	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
	 * from page cache lookup which is in HPAGE_SIZE units.
	 */
	address = address & huge_page_mask(h);	/* huge-page align */
	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
	mapping = vma->vm_file->f_mapping;

	/*
	 * Take the mapping lock for the duration of the table walk. As
	 * this mapping should be shared between all the VMAs,
	 * __unmap_hugepage_range() is called as the lock is already held
	 */
	i_mmap_lock_write(mapping);
	/* Walk every VMA mapping this file offset. */
	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
		/* Do not unmap the current VMA */
		if (iter_vma == vma)
			continue;

		/*
		 * Shared VMAs have their own reserves and do not affect
		 * MAP_PRIVATE accounting but it is possible that a shared
		 * VMA is using the same page so check and skip such VMAs.
		 */
		if (iter_vma->vm_flags & VM_MAYSHARE)
			continue;

		/*
		 * Unmap the page from other VMAs without their own reserves.
		 * They get marked to be SIGKILLed if they fault in these
		 * areas. This is because a future no-page fault on this VMA
		 * could insert a zeroed page instead of the data existing
		 * from the time of fork. This would look like data corruption
		 */
		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
			unmap_hugepage_range(iter_vma, address,
					     address + huge_page_size(h),
					     folio, 0);
	}
	i_mmap_unlock_write(mapping);
}
6170 
6171 /*
6172  * hugetlb_wp() should be called with page lock of the original hugepage held.
6173  * Called with hugetlb_fault_mutex_table held and pte_page locked so we
6174  * cannot race with other handlers or page migration.
6175  * Keep the pte_same checks anyway to make transition from the mutex easier.
6176  */
6177 static vm_fault_t hugetlb_wp(struct vm_fault *vmf)
6178 {
6179 	struct vm_area_struct *vma = vmf->vma;
6180 	struct mm_struct *mm = vma->vm_mm;
6181 	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
6182 	pte_t pte = huge_ptep_get(mm, vmf->address, vmf->pte);
6183 	struct hstate *h = hstate_vma(vma);
6184 	struct folio *old_folio;
6185 	struct folio *new_folio;
6186 	bool cow_from_owner = 0;
6187 	vm_fault_t ret = 0;
6188 	struct mmu_notifier_range range;
6189 
6190 	/*
6191 	 * Never handle CoW for uffd-wp protected pages.  It should be only
6192 	 * handled when the uffd-wp protection is removed.
6193 	 *
6194 	 * Note that only the CoW optimization path (in hugetlb_no_page())
6195 	 * can trigger this, because hugetlb_fault() will always resolve
6196 	 * uffd-wp bit first.
6197 	 */
6198 	if (!unshare && huge_pte_uffd_wp(pte))
6199 		return 0;
6200 
6201 	/* Let's take out MAP_SHARED mappings first. */
6202 	if (vma->vm_flags & VM_MAYSHARE) {
6203 		set_huge_ptep_writable(vma, vmf->address, vmf->pte);
6204 		return 0;
6205 	}
6206 
6207 	old_folio = page_folio(pte_page(pte));
6208 
6209 	delayacct_wpcopy_start();
6210 
6211 retry_avoidcopy:
6212 	/*
6213 	 * If no-one else is actually using this page, we're the exclusive
6214 	 * owner and can reuse this page.
6215 	 *
6216 	 * Note that we don't rely on the (safer) folio refcount here, because
6217 	 * copying the hugetlb folio when there are unexpected (temporary)
6218 	 * folio references could harm simple fork()+exit() users when
6219 	 * we run out of free hugetlb folios: we would have to kill processes
6220 	 * in scenarios that used to work. As a side effect, there can still
6221 	 * be leaks between processes, for example, with FOLL_GET users.
6222 	 */
6223 	if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
6224 		if (!PageAnonExclusive(&old_folio->page)) {
6225 			folio_move_anon_rmap(old_folio, vma);
6226 			SetPageAnonExclusive(&old_folio->page);
6227 		}
6228 		if (likely(!unshare))
6229 			set_huge_ptep_maybe_writable(vma, vmf->address,
6230 						     vmf->pte);
6231 
6232 		delayacct_wpcopy_end();
6233 		return 0;
6234 	}
6235 	VM_BUG_ON_PAGE(folio_test_anon(old_folio) &&
6236 		       PageAnonExclusive(&old_folio->page), &old_folio->page);
6237 
6238 	/*
6239 	 * If the process that created a MAP_PRIVATE mapping is about to perform
6240 	 * a COW due to a shared page count, attempt to satisfy the allocation
6241 	 * without using the existing reserves.
6242 	 * In order to determine where this is a COW on a MAP_PRIVATE mapping it
6243 	 * is enough to check whether the old_folio is anonymous. This means that
6244 	 * the reserve for this address was consumed. If reserves were used, a
6245 	 * partial faulted mapping at the fime of fork() could consume its reserves
6246 	 * on COW instead of the full address range.
6247 	 */
6248 	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
6249 	    folio_test_anon(old_folio))
6250 		cow_from_owner = true;
6251 
6252 	folio_get(old_folio);
6253 
6254 	/*
6255 	 * Drop page table lock as buddy allocator may be called. It will
6256 	 * be acquired again before returning to the caller, as expected.
6257 	 */
6258 	spin_unlock(vmf->ptl);
6259 	new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner);
6260 
6261 	if (IS_ERR(new_folio)) {
6262 		/*
6263 		 * If a process owning a MAP_PRIVATE mapping fails to COW,
6264 		 * it is due to references held by a child and an insufficient
6265 		 * huge page pool. To guarantee the original mappers
6266 		 * reliability, unmap the page from child processes. The child
6267 		 * may get SIGKILLed if it later faults.
6268 		 */
6269 		if (cow_from_owner) {
6270 			struct address_space *mapping = vma->vm_file->f_mapping;
6271 			pgoff_t idx;
6272 			u32 hash;
6273 
6274 			folio_put(old_folio);
6275 			/*
6276 			 * Drop hugetlb_fault_mutex and vma_lock before
6277 			 * unmapping.  unmapping needs to hold vma_lock
6278 			 * in write mode.  Dropping vma_lock in read mode
6279 			 * here is OK as COW mappings do not interact with
6280 			 * PMD sharing.
6281 			 *
6282 			 * Reacquire both after unmap operation.
6283 			 */
6284 			idx = vma_hugecache_offset(h, vma, vmf->address);
6285 			hash = hugetlb_fault_mutex_hash(mapping, idx);
6286 			hugetlb_vma_unlock_read(vma);
6287 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6288 
6289 			unmap_ref_private(mm, vma, old_folio, vmf->address);
6290 
6291 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
6292 			hugetlb_vma_lock_read(vma);
6293 			spin_lock(vmf->ptl);
6294 			vmf->pte = hugetlb_walk(vma, vmf->address,
6295 					huge_page_size(h));
6296 			if (likely(vmf->pte &&
6297 				   pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte)))
6298 				goto retry_avoidcopy;
6299 			/*
6300 			 * race occurs while re-acquiring page table
6301 			 * lock, and our job is done.
6302 			 */
6303 			delayacct_wpcopy_end();
6304 			return 0;
6305 		}
6306 
6307 		ret = vmf_error(PTR_ERR(new_folio));
6308 		goto out_release_old;
6309 	}
6310 
6311 	/*
6312 	 * When the original hugepage is shared one, it does not have
6313 	 * anon_vma prepared.
6314 	 */
6315 	ret = __vmf_anon_prepare(vmf);
6316 	if (unlikely(ret))
6317 		goto out_release_all;
6318 
6319 	if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) {
6320 		ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h));
6321 		goto out_release_all;
6322 	}
6323 	__folio_mark_uptodate(new_folio);
6324 
6325 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address,
6326 				vmf->address + huge_page_size(h));
6327 	mmu_notifier_invalidate_range_start(&range);
6328 
6329 	/*
6330 	 * Retake the page table lock to check for racing updates
6331 	 * before the page tables are altered
6332 	 */
6333 	spin_lock(vmf->ptl);
6334 	vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
6335 	if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
6336 		pte_t newpte = make_huge_pte(vma, new_folio, !unshare);
6337 
6338 		/* Break COW or unshare */
6339 		huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
6340 		hugetlb_remove_rmap(old_folio);
6341 		hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address);
6342 		if (huge_pte_uffd_wp(pte))
6343 			newpte = huge_pte_mkuffd_wp(newpte);
6344 		set_huge_pte_at(mm, vmf->address, vmf->pte, newpte,
6345 				huge_page_size(h));
6346 		folio_set_hugetlb_migratable(new_folio);
6347 		/* Make the old page be freed below */
6348 		new_folio = old_folio;
6349 	}
6350 	spin_unlock(vmf->ptl);
6351 	mmu_notifier_invalidate_range_end(&range);
6352 out_release_all:
6353 	/*
6354 	 * No restore in case of successful pagetable update (Break COW or
6355 	 * unshare)
6356 	 */
6357 	if (new_folio != old_folio)
6358 		restore_reserve_on_error(h, vma, vmf->address, new_folio);
6359 	folio_put(new_folio);
6360 out_release_old:
6361 	folio_put(old_folio);
6362 
6363 	spin_lock(vmf->ptl); /* Caller expects lock to be held */
6364 
6365 	delayacct_wpcopy_end();
6366 	return ret;
6367 }
6368 
6369 /*
6370  * Return whether there is a pagecache page to back given address within VMA.
6371  */
6372 bool hugetlbfs_pagecache_present(struct hstate *h,
6373 				 struct vm_area_struct *vma, unsigned long address)
6374 {
6375 	struct address_space *mapping = vma->vm_file->f_mapping;
6376 	pgoff_t idx = linear_page_index(vma, address);
6377 	struct folio *folio;
6378 
6379 	folio = filemap_get_folio(mapping, idx);
6380 	if (IS_ERR(folio))
6381 		return false;
6382 	folio_put(folio);
6383 	return true;
6384 }
6385 
/*
 * Insert a freshly allocated hugetlb @folio into @mapping at hugepage
 * index @idx.  On success the folio is left locked, marked dirty, and the
 * inode's block count is bumped; returns 0, or the -errno from
 * __filemap_add_folio() (in which case the folio is unlocked again).
 */
int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
			   pgoff_t idx)
{
	struct inode *inode = mapping->host;
	struct hstate *h = hstate_inode(inode);
	int err;

	/* Convert hugepage index to the base-page index the cache uses. */
	idx <<= huge_page_order(h);
	__folio_set_locked(folio);
	err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);

	if (unlikely(err)) {
		__folio_clear_locked(folio);
		return err;
	}
	/* Now in the cache: the folio no longer carries a restore-reserve. */
	folio_clear_hugetlb_restore_reserve(folio);

	/*
	 * mark folio dirty so that it will not be removed from cache/file
	 * by non-hugetlbfs specific code paths.
	 */
	folio_mark_dirty(folio);

	spin_lock(&inode->i_lock);
	inode->i_blocks += blocks_per_huge_page(h);
	spin_unlock(&inode->i_lock);
	return 0;
}
6414 
/*
 * Hand a hugetlb fault over to userfaultfd for @reason (MISSING/MINOR).
 * Drops the vma lock and the hugetlb fault mutex — both held by the
 * caller — before calling handle_userfault(), which may itself drop
 * mmap_lock.
 */
static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf,
						  struct address_space *mapping,
						  unsigned long reason)
{
	u32 hash;

	/*
	 * vma_lock and hugetlb_fault_mutex must be dropped before handling
	 * userfault. Also mmap_lock could be dropped due to handling
	 * userfault, any vma operation should be careful from here.
	 */
	hugetlb_vma_unlock_read(vmf->vma);
	hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	return handle_userfault(vmf, reason);
}
6431 
6432 /*
6433  * Recheck pte with pgtable lock.  Returns true if pte didn't change, or
6434  * false if pte changed or is changing.
6435  */
6436 static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned long addr,
6437 			       pte_t *ptep, pte_t old_pte)
6438 {
6439 	spinlock_t *ptl;
6440 	bool same;
6441 
6442 	ptl = huge_pte_lock(h, mm, ptep);
6443 	same = pte_same(huge_ptep_get(mm, addr, ptep), old_pte);
6444 	spin_unlock(ptl);
6445 
6446 	return same;
6447 }
6448 
/*
 * Handle a hugetlb fault for which no page is mapped yet: look the folio
 * up in the page cache or allocate and install a fresh one, handling
 * userfaultfd MISSING/MINOR events and the private-COW optimization on
 * the way.
 *
 * Called with the hugetlb vma lock (read) and the per-index hugetlb fault
 * mutex held; both are dropped before returning (see hugetlb_fault()).
 */
static vm_fault_t hugetlb_no_page(struct address_space *mapping,
			struct vm_fault *vmf)
{
	u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
	bool new_folio, new_anon_folio = false;
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct hstate *h = hstate_vma(vma);
	vm_fault_t ret = VM_FAULT_SIGBUS;
	bool folio_locked = true;
	struct folio *folio;
	unsigned long size;
	pte_t new_pte;

	/*
	 * Currently, we are forced to kill the process in the event the
	 * original mapper has unmapped pages from the child due to a failed
	 * COW/unsharing. Warn that such a situation has occurred as it may not
	 * be obvious.
	 */
	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
		pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
			   current->pid);
		goto out;
	}

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
	new_folio = false;
	folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff);
	if (IS_ERR(folio)) {
		/* No folio in the cache: racing truncation? */
		size = i_size_read(mapping->host) >> huge_page_shift(h);
		if (vmf->pgoff >= size)
			goto out;
		/* Check for page in userfault range */
		if (userfaultfd_missing(vma)) {
			/*
			 * Since hugetlb_no_page() was examining pte
			 * without pgtable lock, we need to re-test under
			 * lock because the pte may not be stable and could
			 * have changed from under us.  Try to detect
			 * either changed or during-changing ptes and retry
			 * properly when needed.
			 *
			 * Note that userfaultfd is actually fine with
			 * false positives (e.g. caused by pte changed),
			 * but not wrong logical events (e.g. caused by
			 * reading a pte during changing).  The latter can
			 * confuse the userspace, so the strictness is very
			 * much preferred.  E.g., MISSING event should
			 * never happen on the page after UFFDIO_COPY has
			 * correctly installed the page and returned.
			 */
			if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) {
				ret = 0;
				goto out;
			}

			return hugetlb_handle_userfault(vmf, mapping,
							VM_UFFD_MISSING);
		}

		if (!(vma->vm_flags & VM_MAYSHARE)) {
			ret = __vmf_anon_prepare(vmf);
			if (unlikely(ret))
				goto out;
		}

		folio = alloc_hugetlb_folio(vma, vmf->address, false);
		if (IS_ERR(folio)) {
			/*
			 * Returning error will result in faulting task being
			 * sent SIGBUS.  The hugetlb fault mutex prevents two
			 * tasks from racing to fault in the same page which
			 * could result in false unable to allocate errors.
			 * Page migration does not take the fault mutex, but
			 * does a clear then write of pte's under page table
			 * lock.  Page fault code could race with migration,
			 * notice the clear pte and try to allocate a page
			 * here.  Before returning error, get ptl and make
			 * sure there really is no pte entry.
			 */
			if (hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte))
				ret = vmf_error(PTR_ERR(folio));
			else
				ret = 0;
			goto out;
		}
		folio_zero_user(folio, vmf->real_address);
		__folio_mark_uptodate(folio);
		new_folio = true;

		if (vma->vm_flags & VM_MAYSHARE) {
			int err = hugetlb_add_to_page_cache(folio, mapping,
							vmf->pgoff);
			if (err) {
				/*
				 * err can't be -EEXIST which implies someone
				 * else consumed the reservation since hugetlb
				 * fault mutex is held when add a hugetlb page
				 * to the page cache. So it's safe to call
				 * restore_reserve_on_error() here.
				 */
				restore_reserve_on_error(h, vma, vmf->address,
							folio);
				folio_put(folio);
				ret = VM_FAULT_SIGBUS;
				goto out;
			}
		} else {
			/* Private mapping: the new folio will be anonymous. */
			new_anon_folio = true;
			folio_lock(folio);
		}
	} else {
		/*
		 * If a memory error occurs between mmap() and fault, some
		 * processes don't have a hwpoisoned swap entry for the errored
		 * virtual address. So we need to block hugepage fault by
		 * PG_hwpoison bit check.
		 */
		if (unlikely(folio_test_hwpoison(folio))) {
			ret = VM_FAULT_HWPOISON_LARGE |
				VM_FAULT_SET_HINDEX(hstate_index(h));
			goto backout_unlocked;
		}

		/* Check for page in userfault range. */
		if (userfaultfd_minor(vma)) {
			folio_unlock(folio);
			folio_put(folio);
			/* See comment in userfaultfd_missing() block above */
			if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) {
				ret = 0;
				goto out;
			}
			return hugetlb_handle_userfault(vmf, mapping,
							VM_UFFD_MINOR);
		}
	}

	/*
	 * If we are going to COW a private mapping later, we examine the
	 * pending reservations for this page now. This will ensure that
	 * any allocations necessary to record that reservation occur outside
	 * the spinlock.
	 */
	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
		if (vma_needs_reservation(h, vma, vmf->address) < 0) {
			ret = VM_FAULT_OOM;
			goto backout_unlocked;
		}
		/* Just decrements count, does not deallocate */
		vma_end_reservation(h, vma, vmf->address);
	}

	vmf->ptl = huge_pte_lock(h, mm, vmf->pte);
	ret = 0;
	/* If pte changed from under us, retry */
	if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte))
		goto backout;

	if (new_anon_folio)
		hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
	else
		hugetlb_add_file_rmap(folio);
	new_pte = make_huge_pte(vma, folio, vma->vm_flags & VM_SHARED);
	/*
	 * If this pte was previously wr-protected, keep it wr-protected even
	 * if populated.
	 */
	if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
		new_pte = huge_pte_mkuffd_wp(new_pte);
	set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));

	hugetlb_count_add(pages_per_huge_page(h), mm);
	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
		/*
		 * No need to keep file folios locked. See comment in
		 * hugetlb_fault().
		 */
		if (!new_anon_folio) {
			folio_locked = false;
			folio_unlock(folio);
		}
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_wp(vmf);
	}

	spin_unlock(vmf->ptl);

	/*
	 * Only set hugetlb_migratable in newly allocated pages.  Existing pages
	 * found in the pagecache may not have hugetlb_migratable if they have
	 * been isolated for migration.
	 */
	if (new_folio)
		folio_set_hugetlb_migratable(folio);

	if (folio_locked)
		folio_unlock(folio);
out:
	hugetlb_vma_unlock_read(vma);

	/*
	 * We must check to release the per-VMA lock. __vmf_anon_prepare() is
	 * the only way ret can be set to VM_FAULT_RETRY.
	 */
	if (unlikely(ret & VM_FAULT_RETRY))
		vma_end_read(vma);

	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	return ret;

backout:
	spin_unlock(vmf->ptl);
backout_unlocked:
	/* We only need to restore reservations for private mappings */
	if (new_anon_folio)
		restore_reserve_on_error(h, vma, vmf->address, folio);

	folio_unlock(folio);
	folio_put(folio);
	goto out;
}
6674 
#ifdef CONFIG_SMP
/*
 * Hash (mapping, index) onto one of the num_fault_mutexes hugetlb fault
 * mutexes, so that faults on the same page serialize while unrelated
 * faults spread across independent mutexes.  The final mask acts as a
 * modulo, which relies on num_fault_mutexes being a power of two.
 */
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
	unsigned long key[2];
	u32 hash;

	key[0] = (unsigned long) mapping;
	key[1] = idx;

	hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);

	return hash & (num_fault_mutexes - 1);
}
#else
/*
 * For uniprocessor systems we always use a single mutex, so just
 * return 0 and avoid the hashing overhead.
 */
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
	return 0;
}
#endif
6698 
6699 vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
6700 			unsigned long address, unsigned int flags)
6701 {
6702 	vm_fault_t ret;
6703 	u32 hash;
6704 	struct folio *folio = NULL;
6705 	struct hstate *h = hstate_vma(vma);
6706 	struct address_space *mapping;
6707 	bool need_wait_lock = false;
6708 	struct vm_fault vmf = {
6709 		.vma = vma,
6710 		.address = address & huge_page_mask(h),
6711 		.real_address = address,
6712 		.flags = flags,
6713 		.pgoff = vma_hugecache_offset(h, vma,
6714 				address & huge_page_mask(h)),
6715 		/* TODO: Track hugetlb faults using vm_fault */
6716 
6717 		/*
6718 		 * Some fields may not be initialized, be careful as it may
6719 		 * be hard to debug if called functions make assumptions
6720 		 */
6721 	};
6722 
6723 	/*
6724 	 * Serialize hugepage allocation and instantiation, so that we don't
6725 	 * get spurious allocation failures if two CPUs race to instantiate
6726 	 * the same page in the page cache.
6727 	 */
6728 	mapping = vma->vm_file->f_mapping;
6729 	hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff);
6730 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
6731 
6732 	/*
6733 	 * Acquire vma lock before calling huge_pte_alloc and hold
6734 	 * until finished with vmf.pte.  This prevents huge_pmd_unshare from
6735 	 * being called elsewhere and making the vmf.pte no longer valid.
6736 	 */
6737 	hugetlb_vma_lock_read(vma);
6738 	vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
6739 	if (!vmf.pte) {
6740 		hugetlb_vma_unlock_read(vma);
6741 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6742 		return VM_FAULT_OOM;
6743 	}
6744 
6745 	vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte);
6746 	if (huge_pte_none_mostly(vmf.orig_pte)) {
6747 		if (is_pte_marker(vmf.orig_pte)) {
6748 			pte_marker marker =
6749 				pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
6750 
6751 			if (marker & PTE_MARKER_POISONED) {
6752 				ret = VM_FAULT_HWPOISON_LARGE |
6753 				      VM_FAULT_SET_HINDEX(hstate_index(h));
6754 				goto out_mutex;
6755 			} else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
6756 				/* This isn't supported in hugetlb. */
6757 				ret = VM_FAULT_SIGSEGV;
6758 				goto out_mutex;
6759 			}
6760 		}
6761 
6762 		/*
6763 		 * Other PTE markers should be handled the same way as none PTE.
6764 		 *
		 * hugetlb_no_page will drop the vma lock and hugetlb fault
		 * mutex internally, which makes us return immediately.
6767 		 */
6768 		return hugetlb_no_page(mapping, &vmf);
6769 	}
6770 
6771 	ret = 0;
6772 
6773 	/* Not present, either a migration or a hwpoisoned entry */
6774 	if (!pte_present(vmf.orig_pte)) {
6775 		if (is_hugetlb_entry_migration(vmf.orig_pte)) {
6776 			/*
6777 			 * Release the hugetlb fault lock now, but retain
6778 			 * the vma lock, because it is needed to guard the
6779 			 * huge_pte_lockptr() later in
6780 			 * migration_entry_wait_huge(). The vma lock will
6781 			 * be released there.
6782 			 */
6783 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6784 			migration_entry_wait_huge(vma, vmf.address, vmf.pte);
6785 			return 0;
6786 		} else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte))
6787 			ret = VM_FAULT_HWPOISON_LARGE |
6788 			    VM_FAULT_SET_HINDEX(hstate_index(h));
6789 		goto out_mutex;
6790 	}
6791 
6792 	/*
6793 	 * If we are going to COW/unshare the mapping later, we examine the
6794 	 * pending reservations for this page now. This will ensure that any
6795 	 * allocations necessary to record that reservation occur outside the
6796 	 * spinlock.
6797 	 */
6798 	if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
6799 	    !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
6800 		if (vma_needs_reservation(h, vma, vmf.address) < 0) {
6801 			ret = VM_FAULT_OOM;
6802 			goto out_mutex;
6803 		}
6804 		/* Just decrements count, does not deallocate */
6805 		vma_end_reservation(h, vma, vmf.address);
6806 	}
6807 
6808 	vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
6809 
6810 	/* Check for a racing update before calling hugetlb_wp() */
6811 	if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(mm, vmf.address, vmf.pte))))
6812 		goto out_ptl;
6813 
6814 	/* Handle userfault-wp first, before trying to lock more pages */
6815 	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(mm, vmf.address, vmf.pte)) &&
6816 	    (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
6817 		if (!userfaultfd_wp_async(vma)) {
6818 			spin_unlock(vmf.ptl);
6819 			hugetlb_vma_unlock_read(vma);
6820 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6821 			return handle_userfault(&vmf, VM_UFFD_WP);
6822 		}
6823 
6824 		vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
6825 		set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
6826 				huge_page_size(hstate_vma(vma)));
6827 		/* Fallthrough to CoW */
6828 	}
6829 
6830 	if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
6831 		if (!huge_pte_write(vmf.orig_pte)) {
6832 			/*
6833 			 * Anonymous folios need to be lock since hugetlb_wp()
6834 			 * checks whether we can re-use the folio exclusively
6835 			 * for us in case we are the only user of it.
6836 			 */
6837 			folio = page_folio(pte_page(vmf.orig_pte));
6838 			if (folio_test_anon(folio) && !folio_trylock(folio)) {
6839 				need_wait_lock = true;
6840 				goto out_ptl;
6841 			}
6842 			folio_get(folio);
6843 			ret = hugetlb_wp(&vmf);
6844 			if (folio_test_anon(folio))
6845 				folio_unlock(folio);
6846 			folio_put(folio);
6847 			goto out_ptl;
6848 		} else if (likely(flags & FAULT_FLAG_WRITE)) {
6849 			vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
6850 		}
6851 	}
6852 	vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
6853 	if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
6854 						flags & FAULT_FLAG_WRITE))
6855 		update_mmu_cache(vma, vmf.address, vmf.pte);
6856 out_ptl:
6857 	spin_unlock(vmf.ptl);
6858 out_mutex:
6859 	hugetlb_vma_unlock_read(vma);
6860 
6861 	/*
6862 	 * We must check to release the per-VMA lock. __vmf_anon_prepare() in
6863 	 * hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY.
6864 	 */
6865 	if (unlikely(ret & VM_FAULT_RETRY))
6866 		vma_end_read(vma);
6867 
6868 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6869 	/*
6870 	 * hugetlb_wp drops all the locks, but the folio lock, before trying to
6871 	 * unmap the folio from other processes. During that window, if another
6872 	 * process mapping that folio faults in, it will take the mutex and then
6873 	 * it will wait on folio_lock, causing an ABBA deadlock.
6874 	 * Use trylock instead and bail out if we fail.
6875 	 *
	 * Ideally, we should hold a refcount on the folio we wait for, but we do
	 * not want to use the folio after it becomes unlocked, but rather just
	 * wait for it to become unlocked, so hopefully the next fault succeeds on
	 * the trylock.
6880 	 */
6881 	if (need_wait_lock)
6882 		folio_wait_locked(folio);
6883 	return ret;
6884 }
6885 
6886 #ifdef CONFIG_USERFAULTFD
6887 /*
6888  * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
6889  */
6890 static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
6891 		struct vm_area_struct *vma, unsigned long address)
6892 {
6893 	struct mempolicy *mpol;
6894 	nodemask_t *nodemask;
6895 	struct folio *folio;
6896 	gfp_t gfp_mask;
6897 	int node;
6898 
6899 	gfp_mask = htlb_alloc_mask(h);
6900 	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
6901 	/*
6902 	 * This is used to allocate a temporary hugetlb to hold the copied
6903 	 * content, which will then be copied again to the final hugetlb
6904 	 * consuming a reservation. Set the alloc_fallback to false to indicate
6905 	 * that breaking the per-node hugetlb pool is not allowed in this case.
6906 	 */
6907 	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false);
6908 	mpol_cond_put(mpol);
6909 
6910 	return folio;
6911 }
6912 
6913 /*
6914  * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
6915  * with modifications for hugetlb pages.
6916  */
6917 int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
6918 			     struct vm_area_struct *dst_vma,
6919 			     unsigned long dst_addr,
6920 			     unsigned long src_addr,
6921 			     uffd_flags_t flags,
6922 			     struct folio **foliop)
6923 {
6924 	struct mm_struct *dst_mm = dst_vma->vm_mm;
6925 	bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE);
6926 	bool wp_enabled = (flags & MFILL_ATOMIC_WP);
6927 	struct hstate *h = hstate_vma(dst_vma);
6928 	struct address_space *mapping = dst_vma->vm_file->f_mapping;
6929 	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
6930 	unsigned long size = huge_page_size(h);
6931 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
6932 	pte_t _dst_pte;
6933 	spinlock_t *ptl;
6934 	int ret = -ENOMEM;
6935 	struct folio *folio;
6936 	bool folio_in_pagecache = false;
6937 
6938 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
6939 		ptl = huge_pte_lock(h, dst_mm, dst_pte);
6940 
6941 		/* Don't overwrite any existing PTEs (even markers) */
6942 		if (!huge_pte_none(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
6943 			spin_unlock(ptl);
6944 			return -EEXIST;
6945 		}
6946 
6947 		_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
6948 		set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size);
6949 
6950 		/* No need to invalidate - it was non-present before */
6951 		update_mmu_cache(dst_vma, dst_addr, dst_pte);
6952 
6953 		spin_unlock(ptl);
6954 		return 0;
6955 	}
6956 
6957 	if (is_continue) {
6958 		ret = -EFAULT;
6959 		folio = filemap_lock_hugetlb_folio(h, mapping, idx);
6960 		if (IS_ERR(folio))
6961 			goto out;
6962 		folio_in_pagecache = true;
6963 	} else if (!*foliop) {
6964 		/* If a folio already exists, then it's UFFDIO_COPY for
6965 		 * a non-missing case. Return -EEXIST.
6966 		 */
6967 		if (vm_shared &&
6968 		    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
6969 			ret = -EEXIST;
6970 			goto out;
6971 		}
6972 
6973 		folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
6974 		if (IS_ERR(folio)) {
6975 			pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE);
6976 			if (actual_pte) {
6977 				ret = -EEXIST;
6978 				goto out;
6979 			}
6980 			ret = -ENOMEM;
6981 			goto out;
6982 		}
6983 
6984 		ret = copy_folio_from_user(folio, (const void __user *) src_addr,
6985 					   false);
6986 
6987 		/* fallback to copy_from_user outside mmap_lock */
6988 		if (unlikely(ret)) {
6989 			ret = -ENOENT;
6990 			/* Free the allocated folio which may have
6991 			 * consumed a reservation.
6992 			 */
6993 			restore_reserve_on_error(h, dst_vma, dst_addr, folio);
6994 			folio_put(folio);
6995 
6996 			/* Allocate a temporary folio to hold the copied
6997 			 * contents.
6998 			 */
6999 			folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr);
7000 			if (!folio) {
7001 				ret = -ENOMEM;
7002 				goto out;
7003 			}
7004 			*foliop = folio;
7005 			/* Set the outparam foliop and return to the caller to
7006 			 * copy the contents outside the lock. Don't free the
7007 			 * folio.
7008 			 */
7009 			goto out;
7010 		}
7011 	} else {
7012 		if (vm_shared &&
7013 		    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
7014 			folio_put(*foliop);
7015 			ret = -EEXIST;
7016 			*foliop = NULL;
7017 			goto out;
7018 		}
7019 
7020 		folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
7021 		if (IS_ERR(folio)) {
7022 			folio_put(*foliop);
7023 			ret = -ENOMEM;
7024 			*foliop = NULL;
7025 			goto out;
7026 		}
7027 		ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
7028 		folio_put(*foliop);
7029 		*foliop = NULL;
7030 		if (ret) {
7031 			folio_put(folio);
7032 			goto out;
7033 		}
7034 	}
7035 
7036 	/*
7037 	 * If we just allocated a new page, we need a memory barrier to ensure
7038 	 * that preceding stores to the page become visible before the
7039 	 * set_pte_at() write. The memory barrier inside __folio_mark_uptodate
7040 	 * is what we need.
7041 	 *
7042 	 * In the case where we have not allocated a new page (is_continue),
7043 	 * the page must already be uptodate. UFFDIO_CONTINUE already includes
7044 	 * an earlier smp_wmb() to ensure that prior stores will be visible
7045 	 * before the set_pte_at() write.
7046 	 */
7047 	if (!is_continue)
7048 		__folio_mark_uptodate(folio);
7049 	else
7050 		WARN_ON_ONCE(!folio_test_uptodate(folio));
7051 
7052 	/* Add shared, newly allocated pages to the page cache. */
7053 	if (vm_shared && !is_continue) {
7054 		ret = -EFAULT;
7055 		if (idx >= (i_size_read(mapping->host) >> huge_page_shift(h)))
7056 			goto out_release_nounlock;
7057 
7058 		/*
7059 		 * Serialization between remove_inode_hugepages() and
7060 		 * hugetlb_add_to_page_cache() below happens through the
7061 		 * hugetlb_fault_mutex_table that here must be hold by
7062 		 * the caller.
7063 		 */
7064 		ret = hugetlb_add_to_page_cache(folio, mapping, idx);
7065 		if (ret)
7066 			goto out_release_nounlock;
7067 		folio_in_pagecache = true;
7068 	}
7069 
7070 	ptl = huge_pte_lock(h, dst_mm, dst_pte);
7071 
7072 	ret = -EIO;
7073 	if (folio_test_hwpoison(folio))
7074 		goto out_release_unlock;
7075 
7076 	/*
7077 	 * We allow to overwrite a pte marker: consider when both MISSING|WP
7078 	 * registered, we firstly wr-protect a none pte which has no page cache
7079 	 * page backing it, then access the page.
7080 	 */
7081 	ret = -EEXIST;
7082 	if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte)))
7083 		goto out_release_unlock;
7084 
7085 	if (folio_in_pagecache)
7086 		hugetlb_add_file_rmap(folio);
7087 	else
7088 		hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr);
7089 
7090 	/*
7091 	 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
7092 	 * with wp flag set, don't set pte write bit.
7093 	 */
7094 	_dst_pte = make_huge_pte(dst_vma, folio,
7095 				 !wp_enabled && !(is_continue && !vm_shared));
7096 	/*
7097 	 * Always mark UFFDIO_COPY page dirty; note that this may not be
7098 	 * extremely important for hugetlbfs for now since swapping is not
7099 	 * supported, but we should still be clear in that this page cannot be
7100 	 * thrown away at will, even if write bit not set.
7101 	 */
7102 	_dst_pte = huge_pte_mkdirty(_dst_pte);
7103 	_dst_pte = pte_mkyoung(_dst_pte);
7104 
7105 	if (wp_enabled)
7106 		_dst_pte = huge_pte_mkuffd_wp(_dst_pte);
7107 
7108 	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size);
7109 
7110 	hugetlb_count_add(pages_per_huge_page(h), dst_mm);
7111 
7112 	/* No need to invalidate - it was non-present before */
7113 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
7114 
7115 	spin_unlock(ptl);
7116 	if (!is_continue)
7117 		folio_set_hugetlb_migratable(folio);
7118 	if (vm_shared || is_continue)
7119 		folio_unlock(folio);
7120 	ret = 0;
7121 out:
7122 	return ret;
7123 out_release_unlock:
7124 	spin_unlock(ptl);
7125 	if (vm_shared || is_continue)
7126 		folio_unlock(folio);
7127 out_release_nounlock:
7128 	if (!folio_in_pagecache)
7129 		restore_reserve_on_error(h, dst_vma, dst_addr, folio);
7130 	folio_put(folio);
7131 	goto out;
7132 }
7133 #endif /* CONFIG_USERFAULTFD */
7134 
/*
 * hugetlb_change_protection() - apply @newprot and/or uffd-wp changes
 * (@cp_flags) to the huge PTEs covering [@address, @end) of @vma.
 *
 * Returns the number of base pages changed (huge PTEs updated, shifted by
 * the hstate order), or a negative errno; -ENOMEM if a page table needed to
 * install a uffd-wp marker could not be allocated.
 */
long hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end,
		pgprot_t newprot, unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;
	struct hstate *h = hstate_vma(vma);
	long pages = 0, psize = huge_page_size(h);
	bool shared_pmd = false;
	struct mmu_notifier_range range;
	unsigned long last_addr_mask;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

	/*
	 * In the case of shared PMDs, the area to flush could be beyond
	 * start/end.  Set range.start/range.end to cover the maximum possible
	 * range if PMD sharing is possible.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
				0, mm, start, end);
	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);

	BUG_ON(address >= end);
	flush_cache_range(vma, range.start, range.end);

	mmu_notifier_invalidate_range_start(&range);
	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	last_addr_mask = hugetlb_mask_last_page(h);
	for (; address < end; address += psize) {
		spinlock_t *ptl;
		ptep = hugetlb_walk(vma, address, psize);
		if (!ptep) {
			if (!uffd_wp) {
				/*
				 * No page table here and no marker to
				 * install: skip to the last page covered by
				 * this missing entry.
				 */
				address |= last_addr_mask;
				continue;
			}
			/*
			 * Userfaultfd wr-protect requires pgtable
			 * pre-allocations to install pte markers.
			 */
			ptep = huge_pte_alloc(mm, vma, address, psize);
			if (!ptep) {
				pages = -ENOMEM;
				break;
			}
		}
		ptl = huge_pte_lock(h, mm, ptep);
		if (huge_pmd_unshare(mm, vma, address, ptep)) {
			/*
			 * When uffd-wp is enabled on the vma, unshare
			 * shouldn't happen at all.  Warn about it if it
			 * happened due to some reason.
			 */
			WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
			pages++;
			spin_unlock(ptl);
			shared_pmd = true;
			address |= last_addr_mask;
			continue;
		}
		pte = huge_ptep_get(mm, address, ptep);
		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
			/* Nothing to do. */
		} else if (unlikely(is_hugetlb_entry_migration(pte))) {
			/*
			 * Migration entry: downgrade writable entries to
			 * read-only and carry the uffd-wp bit across.
			 */
			swp_entry_t entry = pte_to_swp_entry(pte);
			struct folio *folio = pfn_swap_entry_folio(entry);
			pte_t newpte = pte;

			if (is_writable_migration_entry(entry)) {
				if (folio_test_anon(folio))
					entry = make_readable_exclusive_migration_entry(
								swp_offset(entry));
				else
					entry = make_readable_migration_entry(
								swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				pages++;
			}

			if (uffd_wp)
				newpte = pte_swp_mkuffd_wp(newpte);
			else if (uffd_wp_resolve)
				newpte = pte_swp_clear_uffd_wp(newpte);
			if (!pte_same(pte, newpte))
				set_huge_pte_at(mm, address, ptep, newpte, psize);
		} else if (unlikely(is_pte_marker(pte))) {
			/*
			 * Do nothing on a poison marker; page is
			 * corrupted, permissions do not apply. Here
			 * pte_marker_uffd_wp()==true implies !poison
			 * because they're mutual exclusive.
			 */
			if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
				/* Safe to modify directly (non-present->none). */
				huge_pte_clear(mm, address, ptep, psize);
		} else if (!huge_pte_none(pte)) {
			/* Present pte: rewrite protection bits. */
			pte_t old_pte;
			unsigned int shift = huge_page_shift(hstate_vma(vma));

			old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
			pte = huge_pte_modify(old_pte, newprot);
			pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
			if (uffd_wp)
				pte = huge_pte_mkuffd_wp(pte);
			else if (uffd_wp_resolve)
				pte = huge_pte_clear_uffd_wp(pte);
			huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
			pages++;
		} else {
			/* None pte */
			if (unlikely(uffd_wp))
				/* Safe to modify directly (none->non-present). */
				set_huge_pte_at(mm, address, ptep,
						make_pte_marker(PTE_MARKER_UFFD_WP),
						psize);
		}
		spin_unlock(ptl);

		cond_resched();
	}
	/*
	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
	 * may have cleared our pud entry and done put_page on the page table:
	 * once we release i_mmap_rwsem, another task can do the final put_page
	 * and that page table be reused and filled with junk.  If we actually
	 * did unshare a page of pmds, flush the range corresponding to the pud.
	 */
	if (shared_pmd)
		flush_hugetlb_tlb_range(vma, range.start, range.end);
	else
		flush_hugetlb_tlb_range(vma, start, end);
	/*
	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
	 * downgrading page table protection not changing it to point to a new
	 * page.
	 *
	 * See Documentation/mm/mmu_notifier.rst
	 */
	i_mmap_unlock_write(vma->vm_file->f_mapping);
	hugetlb_vma_unlock_write(vma);
	mmu_notifier_invalidate_range_end(&range);

	return pages > 0 ? (pages << h->order) : pages;
}
7283 
7284 /*
7285  * Update the reservation map for the range [from, to].
7286  *
7287  * Returns the number of entries that would be added to the reservation map
7288  * associated with the range [from, to].  This number is greater or equal to
7289  * zero. -EINVAL or -ENOMEM is returned in case of any errors.
7290  */
7291 
7292 long hugetlb_reserve_pages(struct inode *inode,
7293 		long from, long to,
7294 		struct vm_area_desc *desc,
7295 		vm_flags_t vm_flags)
7296 {
7297 	long chg = -1, add = -1, spool_resv, gbl_resv;
7298 	struct hstate *h = hstate_inode(inode);
7299 	struct hugepage_subpool *spool = subpool_inode(inode);
7300 	struct resv_map *resv_map;
7301 	struct hugetlb_cgroup *h_cg = NULL;
7302 	long gbl_reserve, regions_needed = 0;
7303 
7304 	/* This should never happen */
7305 	if (from > to) {
7306 		VM_WARN(1, "%s called with a negative range\n", __func__);
7307 		return -EINVAL;
7308 	}
7309 
7310 	/*
7311 	 * Only apply hugepage reservation if asked. At fault time, an
7312 	 * attempt will be made for VM_NORESERVE to allocate a page
7313 	 * without using reserves
7314 	 */
7315 	if (vm_flags & VM_NORESERVE)
7316 		return 0;
7317 
7318 	/*
7319 	 * Shared mappings base their reservation on the number of pages that
7320 	 * are already allocated on behalf of the file. Private mappings need
7321 	 * to reserve the full area even if read-only as mprotect() may be
7322 	 * called to make the mapping read-write. Assume !desc is a shm mapping
7323 	 */
7324 	if (!desc || desc->vm_flags & VM_MAYSHARE) {
7325 		/*
7326 		 * resv_map can not be NULL as hugetlb_reserve_pages is only
7327 		 * called for inodes for which resv_maps were created (see
7328 		 * hugetlbfs_get_inode).
7329 		 */
7330 		resv_map = inode_resv_map(inode);
7331 
7332 		chg = region_chg(resv_map, from, to, &regions_needed);
7333 	} else {
7334 		/* Private mapping. */
7335 		resv_map = resv_map_alloc();
7336 		if (!resv_map)
7337 			goto out_err;
7338 
7339 		chg = to - from;
7340 
7341 		set_vma_desc_resv_map(desc, resv_map);
7342 		set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER);
7343 	}
7344 
7345 	if (chg < 0)
7346 		goto out_err;
7347 
7348 	if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
7349 				chg * pages_per_huge_page(h), &h_cg) < 0)
7350 		goto out_err;
7351 
7352 	if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
7353 		/* For private mappings, the hugetlb_cgroup uncharge info hangs
7354 		 * of the resv_map.
7355 		 */
7356 		resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
7357 	}
7358 
7359 	/*
7360 	 * There must be enough pages in the subpool for the mapping. If
7361 	 * the subpool has a minimum size, there may be some global
7362 	 * reservations already in place (gbl_reserve).
7363 	 */
7364 	gbl_reserve = hugepage_subpool_get_pages(spool, chg);
7365 	if (gbl_reserve < 0)
7366 		goto out_uncharge_cgroup;
7367 
7368 	/*
7369 	 * Check enough hugepages are available for the reservation.
7370 	 * Hand the pages back to the subpool if there are not
7371 	 */
7372 	if (hugetlb_acct_memory(h, gbl_reserve) < 0)
7373 		goto out_put_pages;
7374 
7375 	/*
7376 	 * Account for the reservations made. Shared mappings record regions
7377 	 * that have reservations as they are shared by multiple VMAs.
7378 	 * When the last VMA disappears, the region map says how much
7379 	 * the reservation was and the page cache tells how much of
7380 	 * the reservation was consumed. Private mappings are per-VMA and
7381 	 * only the consumed reservations are tracked. When the VMA
7382 	 * disappears, the original reservation is the VMA size and the
7383 	 * consumed reservations are stored in the map. Hence, nothing
7384 	 * else has to be done for private mappings here
7385 	 */
7386 	if (!desc || desc->vm_flags & VM_MAYSHARE) {
7387 		add = region_add(resv_map, from, to, regions_needed, h, h_cg);
7388 
7389 		if (unlikely(add < 0)) {
7390 			hugetlb_acct_memory(h, -gbl_reserve);
7391 			goto out_put_pages;
7392 		} else if (unlikely(chg > add)) {
7393 			/*
7394 			 * pages in this range were added to the reserve
7395 			 * map between region_chg and region_add.  This
7396 			 * indicates a race with alloc_hugetlb_folio.  Adjust
7397 			 * the subpool and reserve counts modified above
7398 			 * based on the difference.
7399 			 */
7400 			long rsv_adjust;
7401 
7402 			/*
7403 			 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
7404 			 * reference to h_cg->css. See comment below for detail.
7405 			 */
7406 			hugetlb_cgroup_uncharge_cgroup_rsvd(
7407 				hstate_index(h),
7408 				(chg - add) * pages_per_huge_page(h), h_cg);
7409 
7410 			rsv_adjust = hugepage_subpool_put_pages(spool,
7411 								chg - add);
7412 			hugetlb_acct_memory(h, -rsv_adjust);
7413 		} else if (h_cg) {
7414 			/*
7415 			 * The file_regions will hold their own reference to
7416 			 * h_cg->css. So we should release the reference held
7417 			 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
7418 			 * done.
7419 			 */
7420 			hugetlb_cgroup_put_rsvd_cgroup(h_cg);
7421 		}
7422 	}
7423 	return chg;
7424 
7425 out_put_pages:
7426 	spool_resv = chg - gbl_reserve;
7427 	if (spool_resv) {
7428 		/* put sub pool's reservation back, chg - gbl_reserve */
7429 		gbl_resv = hugepage_subpool_put_pages(spool, spool_resv);
7430 		/*
7431 		 * subpool's reserved pages can not be put back due to race,
7432 		 * return to hstate.
7433 		 */
7434 		hugetlb_acct_memory(h, -gbl_resv);
7435 	}
7436 out_uncharge_cgroup:
7437 	hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
7438 					    chg * pages_per_huge_page(h), h_cg);
7439 out_err:
7440 	if (!desc || desc->vm_flags & VM_MAYSHARE)
7441 		/* Only call region_abort if the region_chg succeeded but the
7442 		 * region_add failed or didn't run.
7443 		 */
7444 		if (chg >= 0 && add < 0)
7445 			region_abort(resv_map, from, to, regions_needed);
7446 	if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) {
7447 		kref_put(&resv_map->refs, resv_map_release);
7448 		set_vma_desc_resv_map(desc, NULL);
7449 	}
7450 	return chg < 0 ? chg : add < 0 ? add : -EINVAL;
7451 }
7452 
7453 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
7454 								long freed)
7455 {
7456 	struct hstate *h = hstate_inode(inode);
7457 	struct resv_map *resv_map = inode_resv_map(inode);
7458 	long chg = 0;
7459 	struct hugepage_subpool *spool = subpool_inode(inode);
7460 	long gbl_reserve;
7461 
7462 	/*
7463 	 * Since this routine can be called in the evict inode path for all
7464 	 * hugetlbfs inodes, resv_map could be NULL.
7465 	 */
7466 	if (resv_map) {
7467 		chg = region_del(resv_map, start, end);
7468 		/*
7469 		 * region_del() can fail in the rare case where a region
7470 		 * must be split and another region descriptor can not be
7471 		 * allocated.  If end == LONG_MAX, it will not fail.
7472 		 */
7473 		if (chg < 0)
7474 			return chg;
7475 	}
7476 
7477 	spin_lock(&inode->i_lock);
7478 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
7479 	spin_unlock(&inode->i_lock);
7480 
7481 	/*
7482 	 * If the subpool has a minimum size, the number of global
7483 	 * reservations to be released may be adjusted.
7484 	 *
7485 	 * Note that !resv_map implies freed == 0. So (chg - freed)
7486 	 * won't go negative.
7487 	 */
7488 	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
7489 	hugetlb_acct_memory(h, -gbl_reserve);
7490 
7491 	return 0;
7492 }
7493 
7494 #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
7495 static unsigned long page_table_shareable(struct vm_area_struct *svma,
7496 				struct vm_area_struct *vma,
7497 				unsigned long addr, pgoff_t idx)
7498 {
7499 	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
7500 				svma->vm_start;
7501 	unsigned long sbase = saddr & PUD_MASK;
7502 	unsigned long s_end = sbase + PUD_SIZE;
7503 
7504 	/* Allow segments to share if only one is marked locked */
7505 	vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
7506 	vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
7507 
7508 	/*
7509 	 * match the virtual addresses, permission and the alignment of the
7510 	 * page table page.
7511 	 *
7512 	 * Also, vma_lock (vm_private_data) is required for sharing.
7513 	 */
7514 	if (pmd_index(addr) != pmd_index(saddr) ||
7515 	    vm_flags != svm_flags ||
7516 	    !range_in_vma(svma, sbase, s_end) ||
7517 	    !svma->vm_private_data)
7518 		return 0;
7519 
7520 	return saddr;
7521 }
7522 
7523 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
7524 {
7525 	unsigned long start = addr & PUD_MASK;
7526 	unsigned long end = start + PUD_SIZE;
7527 
7528 #ifdef CONFIG_USERFAULTFD
7529 	if (uffd_disable_huge_pmd_share(vma))
7530 		return false;
7531 #endif
7532 	/*
7533 	 * check on proper vm_flags and page table alignment
7534 	 */
7535 	if (!(vma->vm_flags & VM_MAYSHARE))
7536 		return false;
7537 	if (!vma->vm_private_data)	/* vma lock required for sharing */
7538 		return false;
7539 	if (!range_in_vma(vma, start, end))
7540 		return false;
7541 	return true;
7542 }
7543 
7544 /*
7545  * Determine if start,end range within vma could be mapped by shared pmd.
7546  * If yes, adjust start and end to cover range associated with possible
7547  * shared pmd mappings.
7548  */
7549 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
7550 				unsigned long *start, unsigned long *end)
7551 {
7552 	unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
7553 		v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
7554 
7555 	/*
7556 	 * vma needs to span at least one aligned PUD size, and the range
7557 	 * must be at least partially within in.
7558 	 */
7559 	if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
7560 		(*end <= v_start) || (*start >= v_end))
7561 		return;
7562 
7563 	/* Extend the range to be PUD aligned for a worst case scenario */
7564 	if (*start > v_start)
7565 		*start = ALIGN_DOWN(*start, PUD_SIZE);
7566 
7567 	if (*end < v_end)
7568 		*end = ALIGN(*end, PUD_SIZE);
7569 }
7570 
7571 /*
7572  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
7573  * and returns the corresponding pte. While this is not necessary for the
7574  * !shared pmd case because we can allocate the pmd later as well, it makes the
7575  * code much cleaner. pmd allocation is essential for the shared case because
7576  * pud has to be populated inside the same i_mmap_rwsem section - otherwise
7577  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
7578  * bad pmd for sharing.
7579  */
7580 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
7581 		      unsigned long addr, pud_t *pud)
7582 {
7583 	struct address_space *mapping = vma->vm_file->f_mapping;
7584 	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
7585 			vma->vm_pgoff;
7586 	struct vm_area_struct *svma;
7587 	unsigned long saddr;
7588 	pte_t *spte = NULL;
7589 	pte_t *pte;
7590 
7591 	i_mmap_lock_read(mapping);
7592 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
7593 		if (svma == vma)
7594 			continue;
7595 
7596 		saddr = page_table_shareable(svma, vma, addr, idx);
7597 		if (saddr) {
7598 			spte = hugetlb_walk(svma, saddr,
7599 					    vma_mmu_pagesize(svma));
7600 			if (spte) {
7601 				ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
7602 				break;
7603 			}
7604 		}
7605 	}
7606 
7607 	if (!spte)
7608 		goto out;
7609 
7610 	spin_lock(&mm->page_table_lock);
7611 	if (pud_none(*pud)) {
7612 		pud_populate(mm, pud,
7613 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
7614 		mm_inc_nr_pmds(mm);
7615 	} else {
7616 		ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
7617 	}
7618 	spin_unlock(&mm->page_table_lock);
7619 out:
7620 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
7621 	i_mmap_unlock_read(mapping);
7622 	return pte;
7623 }
7624 
7625 /*
7626  * unmap huge page backed by shared pte.
7627  *
7628  * Called with page table lock held.
7629  *
7630  * returns: 1 successfully unmapped a shared pte page
7631  *	    0 the underlying pte page is not shared, or it is the last user
7632  */
7633 int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
7634 					unsigned long addr, pte_t *ptep)
7635 {
7636 	unsigned long sz = huge_page_size(hstate_vma(vma));
7637 	pgd_t *pgd = pgd_offset(mm, addr);
7638 	p4d_t *p4d = p4d_offset(pgd, addr);
7639 	pud_t *pud = pud_offset(p4d, addr);
7640 
7641 	if (sz != PMD_SIZE)
7642 		return 0;
7643 	if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep)))
7644 		return 0;
7645 	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
7646 	hugetlb_vma_assert_locked(vma);
7647 	pud_clear(pud);
7648 	/*
7649 	 * Once our caller drops the rmap lock, some other process might be
7650 	 * using this page table as a normal, non-hugetlb page table.
7651 	 * Wait for pending gup_fast() in other threads to finish before letting
7652 	 * that happen.
7653 	 */
7654 	tlb_remove_table_sync_one();
7655 	ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
7656 	mm_dec_nr_pmds(mm);
7657 	return 1;
7658 }
7659 
7660 #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
7661 
/* Stub: PMD page table sharing is not configured; never share. */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, pud_t *pud)
{
	return NULL;
}
7667 
/* Stub: nothing is ever shared, so there is never anything to unshare. */
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
				unsigned long addr, pte_t *ptep)
{
	return 0;
}
7673 
/* Stub: without PMD sharing, no range adjustment is needed. */
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
				unsigned long *start, unsigned long *end)
{
}
7678 
/* PMD page-table sharing disabled: sharing is never wanted. */
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
	return false;
}
7683 #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
7684 
7685 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
7686 pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
7687 			unsigned long addr, unsigned long sz)
7688 {
7689 	pgd_t *pgd;
7690 	p4d_t *p4d;
7691 	pud_t *pud;
7692 	pte_t *pte = NULL;
7693 
7694 	pgd = pgd_offset(mm, addr);
7695 	p4d = p4d_alloc(mm, pgd, addr);
7696 	if (!p4d)
7697 		return NULL;
7698 	pud = pud_alloc(mm, p4d, addr);
7699 	if (pud) {
7700 		if (sz == PUD_SIZE) {
7701 			pte = (pte_t *)pud;
7702 		} else {
7703 			BUG_ON(sz != PMD_SIZE);
7704 			if (want_pmd_share(vma, addr) && pud_none(*pud))
7705 				pte = huge_pmd_share(mm, vma, addr, pud);
7706 			else
7707 				pte = (pte_t *)pmd_alloc(mm, pud, addr);
7708 		}
7709 	}
7710 
7711 	if (pte) {
7712 		pte_t pteval = ptep_get_lockless(pte);
7713 
7714 		BUG_ON(pte_present(pteval) && !pte_huge(pteval));
7715 	}
7716 
7717 	return pte;
7718 }
7719 
7720 /*
7721  * huge_pte_offset() - Walk the page table to resolve the hugepage
7722  * entry at address @addr
7723  *
7724  * Return: Pointer to page table entry (PUD or PMD) for
7725  * address @addr, or NULL if a !p*d_present() entry is encountered and the
7726  * size @sz doesn't match the hugepage size at this level of the page
7727  * table.
7728  */
7729 pte_t *huge_pte_offset(struct mm_struct *mm,
7730 		       unsigned long addr, unsigned long sz)
7731 {
7732 	pgd_t *pgd;
7733 	p4d_t *p4d;
7734 	pud_t *pud;
7735 	pmd_t *pmd;
7736 
7737 	pgd = pgd_offset(mm, addr);
7738 	if (!pgd_present(*pgd))
7739 		return NULL;
7740 	p4d = p4d_offset(pgd, addr);
7741 	if (!p4d_present(*p4d))
7742 		return NULL;
7743 
7744 	pud = pud_offset(p4d, addr);
7745 	if (sz == PUD_SIZE)
7746 		/* must be pud huge, non-present or none */
7747 		return (pte_t *)pud;
7748 	if (!pud_present(*pud))
7749 		return NULL;
7750 	/* must have a valid entry and size to go further */
7751 
7752 	pmd = pmd_offset(pud, addr);
7753 	/* must be pmd huge, non-present or none */
7754 	return (pte_t *)pmd;
7755 }
7756 
7757 /*
7758  * Return a mask that can be used to update an address to the last huge
7759  * page in a page table page mapping size.  Used to skip non-present
7760  * page table entries when linearly scanning address ranges.  Architectures
7761  * with unique huge page to page table relationships can define their own
7762  * version of this routine.
7763  */
7764 unsigned long hugetlb_mask_last_page(struct hstate *h)
7765 {
7766 	unsigned long hp_size = huge_page_size(h);
7767 
7768 	if (hp_size == PUD_SIZE)
7769 		return P4D_SIZE - PUD_SIZE;
7770 	else if (hp_size == PMD_SIZE)
7771 		return PUD_SIZE - PMD_SIZE;
7772 	else
7773 		return 0UL;
7774 }
7775 
7776 #else
7777 
7778 /* See description above.  Architectures can provide their own version. */
7779 __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
7780 {
7781 #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
7782 	if (huge_page_size(h) == PMD_SIZE)
7783 		return PUD_SIZE - PMD_SIZE;
7784 #endif
7785 	return 0UL;
7786 }
7787 
7788 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
7789 
7790 /**
7791  * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio
7792  * @folio: the folio to isolate
7793  * @list: the list to add the folio to on success
7794  *
7795  * Isolate an allocated (refcount > 0) hugetlb folio, marking it as
7796  * isolated/non-migratable, and moving it from the active list to the
7797  * given list.
7798  *
7799  * Isolation will fail if @folio is not an allocated hugetlb folio, or if
7800  * it is already isolated/non-migratable.
7801  *
7802  * On success, an additional folio reference is taken that must be dropped
7803  * using folio_putback_hugetlb() to undo the isolation.
7804  *
7805  * Return: True if isolation worked, otherwise False.
7806  */
7807 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
7808 {
7809 	bool ret = true;
7810 
7811 	spin_lock_irq(&hugetlb_lock);
7812 	if (!folio_test_hugetlb(folio) ||
7813 	    !folio_test_hugetlb_migratable(folio) ||
7814 	    !folio_try_get(folio)) {
7815 		ret = false;
7816 		goto unlock;
7817 	}
7818 	folio_clear_hugetlb_migratable(folio);
7819 	list_move_tail(&folio->lru, list);
7820 unlock:
7821 	spin_unlock_irq(&hugetlb_lock);
7822 	return ret;
7823 }
7824 
7825 int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
7826 {
7827 	int ret = 0;
7828 
7829 	*hugetlb = false;
7830 	spin_lock_irq(&hugetlb_lock);
7831 	if (folio_test_hugetlb(folio)) {
7832 		*hugetlb = true;
7833 		if (folio_test_hugetlb_freed(folio))
7834 			ret = 0;
7835 		else if (folio_test_hugetlb_migratable(folio) || unpoison)
7836 			ret = folio_try_get(folio);
7837 		else
7838 			ret = -EBUSY;
7839 	}
7840 	spin_unlock_irq(&hugetlb_lock);
7841 	return ret;
7842 }
7843 
/*
 * Thin wrapper: run __get_huge_page_for_hwpoison() with hugetlb_lock held
 * and pass its result straight back to the memory-failure code.
 */
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
				bool *migratable_cleared)
{
	int ret;

	spin_lock_irq(&hugetlb_lock);
	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
	spin_unlock_irq(&hugetlb_lock);
	return ret;
}
7854 
7855 /**
7856  * folio_putback_hugetlb - unisolate a hugetlb folio
7857  * @folio: the isolated hugetlb folio
7858  *
 * Putback/un-isolate the hugetlb folio that was previously isolated using
7860  * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it
7861  * back onto the active list.
7862  *
7863  * Will drop the additional folio reference obtained through
7864  * folio_isolate_hugetlb().
7865  */
7866 void folio_putback_hugetlb(struct folio *folio)
7867 {
7868 	spin_lock_irq(&hugetlb_lock);
7869 	folio_set_hugetlb_migratable(folio);
7870 	list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist);
7871 	spin_unlock_irq(&hugetlb_lock);
7872 	folio_put(folio);
7873 }
7874 
/*
 * Transfer hugetlb-specific state from @old_folio to @new_folio after a
 * successful migration: cgroup charge, migrate @reason for page_owner,
 * "temporary" status (with per-node surplus accounting when the node
 * changes), and finally mark the new folio migratable on the active list.
 */
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
{
	struct hstate *h = folio_hstate(old_folio);

	hugetlb_cgroup_migrate(old_folio, new_folio);
	folio_set_owner_migrate_reason(new_folio, reason);

	/*
	 * transfer temporary state of the new hugetlb folio. This is
	 * reverse to other transitions because the newpage is going to
	 * be final while the old one will be freed so it takes over
	 * the temporary status.
	 *
	 * Also note that we have to transfer the per-node surplus state
	 * here as well otherwise the global surplus count will not match
	 * the per-node's.
	 */
	if (folio_test_hugetlb_temporary(new_folio)) {
		int old_nid = folio_nid(old_folio);
		int new_nid = folio_nid(new_folio);

		folio_set_hugetlb_temporary(old_folio);
		folio_clear_hugetlb_temporary(new_folio);


		/*
		 * There is no need to transfer the per-node surplus state
		 * when we do not cross the node.
		 */
		if (new_nid == old_nid)
			return;
		spin_lock_irq(&hugetlb_lock);
		if (h->surplus_huge_pages_node[old_nid]) {
			h->surplus_huge_pages_node[old_nid]--;
			h->surplus_huge_pages_node[new_nid]++;
		}
		spin_unlock_irq(&hugetlb_lock);
	}

	/*
	 * Our old folio is isolated and has "migratable" cleared until it
	 * is putback. As migration succeeded, set the new folio "migratable"
	 * and add it to the active list.
	 */
	spin_lock_irq(&hugetlb_lock);
	folio_set_hugetlb_migratable(new_folio);
	list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist);
	spin_unlock_irq(&hugetlb_lock);
}
7924 
7925 /*
7926  * If @take_locks is false, the caller must ensure that no concurrent page table
7927  * access can happen (except for gup_fast() and hardware page walks).
7928  * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
7929  * concurrent page fault handling) and the file rmap lock.
7930  */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
				   unsigned long start,
				   unsigned long end,
				   bool take_locks)
{
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	unsigned long address;
	spinlock_t *ptl;
	pte_t *ptep;

	/* Only VM_MAYSHARE mappings can ever share PMD page tables. */
	if (!(vma->vm_flags & VM_MAYSHARE))
		return;

	if (start >= end)
		return;

	flush_cache_range(vma, start, end);
	/*
	 * No need to call adjust_range_if_pmd_sharing_possible(), because
	 * we have already done the PUD_SIZE alignment.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				start, end);
	mmu_notifier_invalidate_range_start(&range);
	if (take_locks) {
		hugetlb_vma_lock_write(vma);
		i_mmap_lock_write(vma->vm_file->f_mapping);
	} else {
		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
	}
	/* Walk in PUD_SIZE strides: a shared PMD table covers one PUD entry. */
	for (address = start; address < end; address += PUD_SIZE) {
		ptep = hugetlb_walk(vma, address, sz);
		if (!ptep)
			continue;
		ptl = huge_pte_lock(h, mm, ptep);
		huge_pmd_unshare(mm, vma, address, ptep);
		spin_unlock(ptl);
	}
	flush_hugetlb_tlb_range(vma, start, end);
	if (take_locks) {
		i_mmap_unlock_write(vma->vm_file->f_mapping);
		hugetlb_vma_unlock_write(vma);
	}
	/*
	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
	 * Documentation/mm/mmu_notifier.rst.
	 */
	mmu_notifier_invalidate_range_end(&range);
}
7983 
7984 /*
7985  * This function will unconditionally remove all the shared pmd pgtable entries
7986  * within the specific vma for a hugetlbfs memory range.
7987  */
7988 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
7989 {
7990 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
7991 			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
7992 			/* take_locks = */ true);
7993 }
7994 
7995 /*
7996  * For hugetlb, mremap() is an odd edge case - while the VMA copying is
7997  * performed, we permit both the old and new VMAs to reference the same
7998  * reservation.
7999  *
8000  * We fix this up after the operation succeeds, or if a newly allocated VMA
8001  * is closed as a result of a failure to allocate memory.
8002  */
void fixup_hugetlb_reservations(struct vm_area_struct *vma)
{
	if (!is_vm_hugetlb_page(vma))
		return;
	/* Drop the duplicated reservation reference left by mremap(). */
	clear_vma_resv_huge_pages(vma);
}
8008