/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017		Facebook Inc.
 * Copyright (C) 2017		Dennis Zhou <dennisszhou@gmail.com>
 *
 * This file is released under the GPLv2 license.
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  I.e., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * tries to allocate from the fullest chunk first.  Each chunk is managed
 * by a bitmap with metadata blocks.  The allocation map is updated on
 * every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's index.  Lastly, units are lazily backed and grow in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between bytes, bits, and blocks.  All
 * hints are managed in bits unless explicitly stated.
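 *
 * For example, assuming PCPU_MIN_ALLOC_SIZE is 4 bytes: a 4KB page
 * corresponds to 4096 / 4 = 1024 bits in the allocation map, and a
 * 48-byte request consumes 48 / 4 = 12 bits.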
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
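
/*
 * A minimal client-side sketch (illustrative only; alloc_percpu(),
 * this_cpu_inc() and free_percpu() are the standard percpu interfaces
 * layered on top of this allocator):
 *
 *	struct counter { u64 hits; };
 *	struct counter __percpu *c = alloc_percpu(struct counter);
 *
 *	if (c)
 *		this_cpu_inc(c->hits);
 *	free_percpu(c);
 */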

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/* the slots are sorted by free bytes left, 1-15 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT		5

#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr	+		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif	/* CONFIG_SMP */
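
/*
 * With the default SMP mapping above, an address and its percpu pointer
 * differ by the constant (pcpu_base_addr - __per_cpu_start), so the two
 * macros are exact inverses.  A sketch with made-up numbers: if that
 * constant is 0x1000, then __addr_to_pcpu_ptr(addr) == addr - 0x1000
 * and __pcpu_ptr_to_addr(ptr) == ptr + 0x1000.
 */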

static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */

/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);

/*
 * The number of empty populated pages, protected by pcpu_lock.  The
 * reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
	void *start_addr, *end_addr;

	if (!chunk)
		return false;

	start_addr = chunk->base_addr + chunk->start_offset;
	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
		   chunk->end_offset;

	return addr >= start_addr && addr < end_addr;
}

static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);	/* size is in bytes */
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_nr_slots - 1;
	return __pcpu_size_to_slot(size);
}
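
/*
 * For example, with PCPU_SLOT_BASE_SHIFT == 5, a chunk with 1024 free
 * bytes has fls(1024) == 11 and therefore sorts into slot
 * max(11 - 5 + 2, 1) == 8, while a chunk whose free space equals
 * pcpu_unit_size always lands in the last slot.
 */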

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
		return 0;

	return pcpu_size_to_slot(chunk->free_bytes);
}

/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}

/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}

static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}

static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
	return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->base_addr +
	       pcpu_unit_page_offset(cpu, page_idx);
}

static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
{
	*rs = find_next_zero_bit(bitmap, end, *rs);
	*re = find_next_bit(bitmap, end, *rs + 1);
}

static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
{
	*rs = find_next_bit(bitmap, end, *rs);
	*re = find_next_zero_bit(bitmap, end, *rs + 1);
}

/*
 * Bitmap region iterators.  Iterates over @bitmap between
 * [@start, @end).  @rs and @re should be integer variables and will be
 * set to the start and end index of the current region.
 */
#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end)		     \
	for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
	     (rs) < (re);						     \
	     (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))

#define pcpu_for_each_pop_region(bitmap, rs, re, start, end)		     \
	for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end));   \
	     (rs) < (re);						     \
	     (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
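
/*
 * Illustrative use of the iterators above (a sketch that counts the
 * unpopulated pages in [page_start, page_end) of a chunk):
 *
 *	int rs, re, nr_unpop = 0;
 *
 *	pcpu_for_each_unpop_region(chunk->populated, rs, re,
 *				   page_start, page_end)
 *		nr_unpop += re - rs;
 */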

/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets and address offsets.
 */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
	return chunk->alloc_map +
	       (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}

static unsigned long pcpu_off_to_block_index(int off)
{
	return off / PCPU_BITMAP_BLOCK_BITS;
}

static unsigned long pcpu_off_to_block_off(int off)
{
	return off & (PCPU_BITMAP_BLOCK_BITS - 1);
}

static unsigned long pcpu_block_off_to_off(int index, int off)
{
	return index * PCPU_BITMAP_BLOCK_BITS + off;
}
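
/*
 * For example, with PCPU_BITMAP_BLOCK_BITS == 1024 (4KB pages and
 * 4-byte allocation units), chunk offset 2500 splits into block index
 * 2500 / 1024 == 2 and block offset 2500 & 1023 == 452;
 * pcpu_block_off_to_off(2, 452) recovers 2500.
 */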

/**
 * pcpu_next_md_free_region - finds the next free area per the metadata hints
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
				     int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
			return;
		}

		/*
		 * This checks three things.  First, is there a contig_hint
		 * to check?  Second, has this hint already been checked in
		 * a prior iteration (compared via block_off)?  Third, is
		 * this the same as the right contig hint?  In the last
		 * case, it spills over into the next block and should be
		 * handled by the contig area across blocks code.
		 */
		*bits = block->contig_hint;
		if (*bits && block->contig_hint_start >= block_off &&
		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
			*bit_off = pcpu_block_off_to_off(i,
					block->contig_hint_start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;

		*bits = block->right_free;
		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
	}
}

/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
				 int align, int *bit_off, int *bits)
{
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (*bits >= alloc_bits)
				return;
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
		}

		/* check block->contig_hint */
		*bits = ALIGN(block->contig_hint_start, align) -
			block->contig_hint_start;
		/*
		 * This uses the block offset to determine if this has been
		 * checked in the prior iteration.
		 */
		if (block->contig_hint &&
		    block->contig_hint_start >= block_off &&
		    block->contig_hint >= *bits + alloc_bits) {
			*bits += alloc_bits + block->contig_hint_start -
				 block->first_free;
			*bit_off = pcpu_block_off_to_off(i, block->first_free);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;

		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
				 align);
		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
		*bit_off = pcpu_block_off_to_off(i, *bit_off);
		if (*bits >= alloc_bits)
			return;
	}

	/* no valid offsets were found - fail condition */
	*bit_off = pcpu_chunk_map_bits(chunk);
}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
	     (bit_off) += (bits) + 1,					\
	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits));				      \
	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
	     (bit_off) += (bits),					      \
	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits)))

/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
 * This is to facilitate passing through whitelisted flags.  The
 * returned memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;

	if (size <= PAGE_SIZE)
		return kzalloc(size, gfp);
	else
		return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr)
{
	kvfree(ptr);
}
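
/*
 * Typical pairing (an illustrative sketch; metadata arrays are sized at
 * runtime, so callers let pcpu_mem_zalloc() choose between kzalloc()
 * and vzalloc() for them):
 *
 *	unsigned long *bitmap;
 *
 *	bitmap = pcpu_mem_zalloc(BITS_TO_LONGS(bits) * sizeof(long),
 *				 GFP_KERNEL);
 *	if (!bitmap)
 *		return -ENOMEM;
 *	...
 *	pcpu_mem_free(bitmap);
 */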

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
		if (oslot < nslot)
			list_move(&chunk->list, &pcpu_slot[nslot]);
		else
			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
	}
}

/**
 * pcpu_cnt_pop_pages - counts populated backing pages in range
 * @chunk: chunk of interest
 * @bit_off: start offset
 * @bits: size of area to check
 *
 * Calculates the number of populated pages in the region
 * [page_start, page_end).  This keeps track of how many empty populated
 * pages are available and is used to decide if async work should be
 * scheduled.
 *
 * RETURNS:
 * The number of populated pages.
 */
static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
				     int bits)
{
	int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
	int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

	if (page_start >= page_end)
		return 0;

	/*
	 * bitmap_weight counts the number of bits set in a bitmap up to
	 * the specified number of bits.  This is counting the populated
	 * pages up to page_end and then subtracting the populated pages
	 * up to page_start to count the populated pages in
	 * [page_start, page_end).
	 */
	return bitmap_weight(chunk->populated, page_end) -
	       bitmap_weight(chunk->populated, page_start);
}
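
/*
 * For example, with 4KB pages and 4-byte allocation units: bit_off ==
 * 1000 and bits == 100 cover bytes [4000, 4400), so PFN_UP(4000) == 1
 * and PFN_DOWN(4400) == 1.  The range is empty and the result is 0;
 * partially covered pages are deliberately not counted.
 */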

/**
 * pcpu_chunk_update - updates the chunk metadata given a free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * This updates the chunk's contig hint and starting offset given a free area.
 * It chooses the best starting offset if the contig hints are equal.
 */
static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
{
	if (bits > chunk->contig_bits) {
		chunk->contig_bits_start = bit_off;
		chunk->contig_bits = bits;
	} else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
		   (!bit_off ||
		    __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
		/* use the start with the best alignment */
		chunk->contig_bits_start = bit_off;
	}
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 *
 * Iterates over the metadata blocks to find the largest contig area.
 * It also counts the populated pages and uses the delta to update the
 * global count.
 *
 * Updates:
 *      chunk->contig_bits
 *      chunk->contig_bits_start
 *      nr_empty_pop_pages (chunk and global)
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
{
	int bit_off, bits, nr_empty_pop_pages;

	/* clear metadata */
	chunk->contig_bits = 0;

	bit_off = chunk->first_bit;
	bits = nr_empty_pop_pages = 0;
	pcpu_for_each_md_free_region(chunk, bit_off, bits) {
		pcpu_chunk_update(chunk, bit_off, bits);

		nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
	}

	/*
	 * Keep track of nr_empty_pop_pages.
	 *
	 * The chunk maintains the previous number of free pages it held,
	 * so the delta is used to update the global counter.  The reserved
	 * chunk is not part of the free page count as its pages are
	 * populated at init and are dedicated to serving reserved
	 * allocations.
	 */
	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages +=
			(nr_empty_pop_pages - chunk->nr_empty_pop_pages);

	chunk->nr_empty_pop_pages = nr_empty_pop_pages;
}

/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
	int contig = end - start;

	block->first_free = min(block->first_free, start);
	if (start == 0)
		block->left_free = contig;

	if (end == PCPU_BITMAP_BLOCK_BITS)
		block->right_free = contig;

	if (contig > block->contig_hint) {
		block->contig_hint_start = start;
		block->contig_hint = contig;
	} else if (block->contig_hint_start && contig == block->contig_hint &&
		   (!start || __ffs(start) > __ffs(block->contig_hint_start))) {
		/* use the start with the best alignment */
		block->contig_hint_start = start;
	}
}
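
/*
 * For example, with PCPU_BITMAP_BLOCK_BITS == 1024: recording the free
 * region [0, 256) sets first_free = 0 and left_free = 256, and if the
 * previous contig_hint was 200 it becomes 256 with contig_hint_start 0.
 */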

/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free and updates the block
 * metadata accordingly.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
	struct pcpu_block_md *block = chunk->md_blocks + index;
	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
	int rs, re;	/* region start, region end */

	/* clear hints */
	block->contig_hint = 0;
	block->left_free = block->right_free = 0;

	/* iterate over free areas and update the contig hints */
	pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free,
				   PCPU_BITMAP_BLOCK_BITS) {
		pcpu_block_update(block, rs, re);
	}
}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
					 int bits)
{
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */

	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
	s_index = pcpu_off_to_block_index(bit_off);
	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
	s_off = pcpu_off_to_block_off(bit_off);
	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

	s_block = chunk->md_blocks + s_index;
	e_block = chunk->md_blocks + e_index;

	/*
	 * Update s_block.
	 * block->first_free must be updated if the allocation takes its place.
	 * If the allocation breaks the contig_hint, a scan is required to
	 * restore this hint.
	 */
	if (s_off == s_block->first_free)
		s_block->first_free = find_next_zero_bit(
					pcpu_index_alloc_map(chunk, s_index),
					PCPU_BITMAP_BLOCK_BITS,
					s_off + bits);

	if (s_off >= s_block->contig_hint_start &&
	    s_off < s_block->contig_hint_start + s_block->contig_hint) {
		/* block contig hint is broken - scan to fix it */
		pcpu_block_refresh_hint(chunk, s_index);
	} else {
		/* update left and right contig manually */
		s_block->left_free = min(s_block->left_free, s_off);
		if (s_index == e_index)
			s_block->right_free = min_t(int, s_block->right_free,
					PCPU_BITMAP_BLOCK_BITS - e_off);
		else
			s_block->right_free = 0;
	}

	/*
	 * Update e_block.
	 */
	if (s_index != e_index) {
		/*
		 * When the allocation is across blocks, the end is along
		 * the left part of the e_block.
		 */
		e_block->first_free = find_next_zero_bit(
				pcpu_index_alloc_map(chunk, e_index),
				PCPU_BITMAP_BLOCK_BITS, e_off);

		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
			/* reset the block */
			e_block++;
		} else {
			if (e_off > e_block->contig_hint_start) {
				/* contig hint is broken - scan to fix it */
				pcpu_block_refresh_hint(chunk, e_index);
			} else {
				e_block->left_free = 0;
				e_block->right_free =
					min_t(int, e_block->right_free,
					      PCPU_BITMAP_BLOCK_BITS - e_off);
			}
		}

		/* update in-between md_blocks */
		for (block = s_block + 1; block < e_block; block++) {
			block->contig_hint = 0;
			block->left_free = 0;
			block->right_free = 0;
		}
	}

	/*
	 * The only time a full chunk scan is required is if the chunk
	 * contig hint is broken.  Otherwise, it means a smaller space
	 * was used and therefore the chunk contig hint is still correct.
	 */
	if (bit_off >= chunk->contig_bits_start &&
	    bit_off < chunk->contig_bits_start + chunk->contig_bits)
		pcpu_chunk_refresh_hint(chunk);
}

/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk->contig_bits.  chunk->contig_bits
 * may be off by up to a page, but it will never be more than the available
 * space.  If the contig hint is contained in one block, it will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
					int bits)
{
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */
	int start, end;		/* start and end of the whole free area */

	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
	s_index = pcpu_off_to_block_index(bit_off);
	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
	s_off = pcpu_off_to_block_off(bit_off);
	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

	s_block = chunk->md_blocks + s_index;
	e_block = chunk->md_blocks + e_index;

	/*
	 * Check if the freed area aligns with the block->contig_hint.
	 * If it does, then the scan to find the beginning/end of the
	 * larger free area can be avoided.
	 *
	 * start and end refer to the beginning and end of the free area
	 * within their respective blocks.  This is not necessarily
	 * the entire free area as it may span blocks past the beginning
	 * or end of the block.
	 */
	start = s_off;
	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
		start = s_block->contig_hint_start;
	} else {
		/*
		 * Scan backwards to find the extent of the free area.
		 * find_last_bit returns the starting bit, so if the start bit
		 * is returned, that means there was no last bit and the
		 * remainder of the chunk is free.
		 */
		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
					  start);
		start = (start == l_bit) ? 0 : l_bit + 1;
	}

	end = e_off;
	if (e_off == e_block->contig_hint_start)
		end = e_block->contig_hint_start + e_block->contig_hint;
	else
		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
				    PCPU_BITMAP_BLOCK_BITS, end);

	/* update s_block */
	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
	pcpu_block_update(s_block, start, e_off);

	/* the free spans across blocks */
	if (s_index != e_index) {
		/* update e_block */
		pcpu_block_update(e_block, 0, end);

		/* reset md_blocks in the middle */
		for (block = s_block + 1; block < e_block; block++) {
			block->first_free = 0;
			block->contig_hint_start = 0;
			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
			block->left_free = PCPU_BITMAP_BLOCK_BITS;
			block->right_free = PCPU_BITMAP_BLOCK_BITS;
		}
	}

	/*
	 * Refresh chunk metadata when the free makes a page free, a block
	 * free, or spans across blocks.  The contig hint may be off by up to
	 * a page, but if the hint is contained in a block, it will be accurate
	 * with the else condition below.
	 */
	if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
	     ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
	    s_index != e_index)
		pcpu_chunk_refresh_hint(chunk);
	else
		pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
				  s_block->contig_hint);
}

/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * True if the backing pages are populated, false otherwise.
 * @next_off is set to skip over unpopulated blocks in pcpu_find_block_fit.
 */
89040064aecSDennis Zhou (Facebook) static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
89140064aecSDennis Zhou (Facebook) 			      int *next_off)
89240064aecSDennis Zhou (Facebook) {
89340064aecSDennis Zhou (Facebook) 	int page_start, page_end, rs, re;
89440064aecSDennis Zhou (Facebook) 
89540064aecSDennis Zhou (Facebook) 	page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
89640064aecSDennis Zhou (Facebook) 	page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
89740064aecSDennis Zhou (Facebook) 
89840064aecSDennis Zhou (Facebook) 	rs = page_start;
89940064aecSDennis Zhou (Facebook) 	pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
90040064aecSDennis Zhou (Facebook) 	if (rs >= page_end)
90140064aecSDennis Zhou (Facebook) 		return true;
90240064aecSDennis Zhou (Facebook) 
90340064aecSDennis Zhou (Facebook) 	*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
90440064aecSDennis Zhou (Facebook) 	return false;
90540064aecSDennis Zhou (Facebook) }
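/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096 and
 * PCPU_MIN_ALLOC_SIZE == 4): for bit_off == 1000 and bits == 100 the
 * region covers bytes [4000, 4400) and hence pages [0, 2), so both
 * pages must be populated for the check to succeed.
 */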
90640064aecSDennis Zhou (Facebook) 
90740064aecSDennis Zhou (Facebook) /**
90840064aecSDennis Zhou (Facebook)  * pcpu_find_block_fit - finds the block index to start searching
90940064aecSDennis Zhou (Facebook)  * @chunk: chunk of interest
91040064aecSDennis Zhou (Facebook)  * @alloc_bits: size of request in allocation units
91140064aecSDennis Zhou (Facebook)  * @align: alignment of area (max PAGE_SIZE bytes)
91240064aecSDennis Zhou (Facebook)  * @pop_only: use populated regions only
91340064aecSDennis Zhou (Facebook)  *
914b4c2116cSDennis Zhou (Facebook)  * Given a chunk and an allocation spec, find the offset to begin searching
915b4c2116cSDennis Zhou (Facebook)  * for a free region.  This iterates over the bitmap metadata blocks to
916b4c2116cSDennis Zhou (Facebook)  * find an offset that is guaranteed to fit the requirements.  It is
917b4c2116cSDennis Zhou (Facebook)  * not quite first fit: if the allocation does not fit in the contig hint
918b4c2116cSDennis Zhou (Facebook)  * of a block or chunk, that block or chunk is skipped.  This errs on the side of caution
919b4c2116cSDennis Zhou (Facebook)  * to prevent excess iteration.  Poor alignment can cause the allocator to
920b4c2116cSDennis Zhou (Facebook)  * skip over blocks and chunks that have valid free areas.
921b4c2116cSDennis Zhou (Facebook)  *
92240064aecSDennis Zhou (Facebook)  * RETURNS:
92340064aecSDennis Zhou (Facebook)  * The offset in the bitmap to begin searching.
92440064aecSDennis Zhou (Facebook)  * -1 if no offset is found.
92540064aecSDennis Zhou (Facebook)  */
92640064aecSDennis Zhou (Facebook) static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
92740064aecSDennis Zhou (Facebook) 			       size_t align, bool pop_only)
92840064aecSDennis Zhou (Facebook) {
929b4c2116cSDennis Zhou (Facebook) 	int bit_off, bits, next_off;
93040064aecSDennis Zhou (Facebook) 
93113f96637SDennis Zhou (Facebook) 	/*
93213f96637SDennis Zhou (Facebook) 	 * Check to see if the allocation can fit in the chunk's contig hint.
93313f96637SDennis Zhou (Facebook) 	 * This is an optimization to prevent scanning: if the request
93413f96637SDennis Zhou (Facebook) 	 * cannot fit in the global hint, memory pressure is assumed and
93513f96637SDennis Zhou (Facebook) 	 * creating a new chunk would happen soon anyway.
93613f96637SDennis Zhou (Facebook) 	 */
93713f96637SDennis Zhou (Facebook) 	bit_off = ALIGN(chunk->contig_bits_start, align) -
93813f96637SDennis Zhou (Facebook) 		  chunk->contig_bits_start;
93913f96637SDennis Zhou (Facebook) 	if (bit_off + alloc_bits > chunk->contig_bits)
94013f96637SDennis Zhou (Facebook) 		return -1;
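	/*
	 * Illustrative example of the hint check above: with
	 * contig_bits_start == 5 and align == 4 (in allocation units),
	 * ALIGN(5, 4) == 8 gives bit_off == 3; a request for 10 bits
	 * then needs 13 bits of hint and is rejected even when
	 * contig_bits == 12 would fit it unaligned.
	 */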
94113f96637SDennis Zhou (Facebook) 
942b4c2116cSDennis Zhou (Facebook) 	bit_off = chunk->first_bit;
943b4c2116cSDennis Zhou (Facebook) 	bits = 0;
944b4c2116cSDennis Zhou (Facebook) 	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
94540064aecSDennis Zhou (Facebook) 		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
946b4c2116cSDennis Zhou (Facebook) 						   &next_off))
94740064aecSDennis Zhou (Facebook) 			break;
94840064aecSDennis Zhou (Facebook) 
949b4c2116cSDennis Zhou (Facebook) 		bit_off = next_off;
95040064aecSDennis Zhou (Facebook) 		bits = 0;
95140064aecSDennis Zhou (Facebook) 	}
95240064aecSDennis Zhou (Facebook) 
95340064aecSDennis Zhou (Facebook) 	if (bit_off == pcpu_chunk_map_bits(chunk))
95440064aecSDennis Zhou (Facebook) 		return -1;
95540064aecSDennis Zhou (Facebook) 
95640064aecSDennis Zhou (Facebook) 	return bit_off;
95740064aecSDennis Zhou (Facebook) }
95840064aecSDennis Zhou (Facebook) 
95940064aecSDennis Zhou (Facebook) /**
96040064aecSDennis Zhou (Facebook)  * pcpu_alloc_area - allocates an area from a pcpu_chunk
96140064aecSDennis Zhou (Facebook)  * @chunk: chunk of interest
96240064aecSDennis Zhou (Facebook)  * @alloc_bits: size of request in allocation units
96340064aecSDennis Zhou (Facebook)  * @align: alignment of area (max PAGE_SIZE)
96440064aecSDennis Zhou (Facebook)  * @start: bit_off to start searching
96540064aecSDennis Zhou (Facebook)  *
96640064aecSDennis Zhou (Facebook)  * This function takes in a @start offset to begin searching to fit an
967b4c2116cSDennis Zhou (Facebook)  * allocation of @alloc_bits with alignment @align.  It needs to scan
968b4c2116cSDennis Zhou (Facebook)  * the allocation map because, if the request fits within the block's
969b4c2116cSDennis Zhou (Facebook)  * contig hint, @start will only be block->first_free.  This is an attempt
970b4c2116cSDennis Zhou (Facebook)  * to fill the allocation prior to breaking the contig hint.  The allocation
971b4c2116cSDennis Zhou (Facebook)  * and boundary maps are updated accordingly once a valid free area is
972b4c2116cSDennis Zhou (Facebook)  * confirmed.
97340064aecSDennis Zhou (Facebook)  *
97440064aecSDennis Zhou (Facebook)  * RETURNS:
97540064aecSDennis Zhou (Facebook)  * Allocated addr offset in @chunk on success.
97640064aecSDennis Zhou (Facebook)  * -1 if no matching area is found.
97740064aecSDennis Zhou (Facebook)  */
97840064aecSDennis Zhou (Facebook) static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
97940064aecSDennis Zhou (Facebook) 			   size_t align, int start)
98040064aecSDennis Zhou (Facebook) {
98140064aecSDennis Zhou (Facebook) 	size_t align_mask = (align) ? (align - 1) : 0;
98240064aecSDennis Zhou (Facebook) 	int bit_off, end, oslot;
9839f7dcf22STejun Heo 
9844f996e23STejun Heo 	lockdep_assert_held(&pcpu_lock);
9854f996e23STejun Heo 
98640064aecSDennis Zhou (Facebook) 	oslot = pcpu_chunk_slot(chunk);
987833af842STejun Heo 
988833af842STejun Heo 	/*
98940064aecSDennis Zhou (Facebook) 	 * Search to find a fit.
990833af842STejun Heo 	 */
991b4c2116cSDennis Zhou (Facebook) 	end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
99240064aecSDennis Zhou (Facebook) 	bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
99340064aecSDennis Zhou (Facebook) 					     alloc_bits, align_mask);
99440064aecSDennis Zhou (Facebook) 	if (bit_off >= end)
995a16037c8STejun Heo 		return -1;
996a16037c8STejun Heo 
99740064aecSDennis Zhou (Facebook) 	/* update alloc map */
99840064aecSDennis Zhou (Facebook) 	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
999a16037c8STejun Heo 
100040064aecSDennis Zhou (Facebook) 	/* update boundary map */
100140064aecSDennis Zhou (Facebook) 	set_bit(bit_off, chunk->bound_map);
100240064aecSDennis Zhou (Facebook) 	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
100340064aecSDennis Zhou (Facebook) 	set_bit(bit_off + alloc_bits, chunk->bound_map);
1004a16037c8STejun Heo 
100540064aecSDennis Zhou (Facebook) 	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
100640064aecSDennis Zhou (Facebook) 
100786b442fbSDennis Zhou (Facebook) 	/* update first free bit */
100886b442fbSDennis Zhou (Facebook) 	if (bit_off == chunk->first_bit)
100986b442fbSDennis Zhou (Facebook) 		chunk->first_bit = find_next_zero_bit(
101086b442fbSDennis Zhou (Facebook) 					chunk->alloc_map,
101186b442fbSDennis Zhou (Facebook) 					pcpu_chunk_map_bits(chunk),
101286b442fbSDennis Zhou (Facebook) 					bit_off + alloc_bits);
101386b442fbSDennis Zhou (Facebook) 
1014ca460b3cSDennis Zhou (Facebook) 	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
101540064aecSDennis Zhou (Facebook) 
101640064aecSDennis Zhou (Facebook) 	pcpu_chunk_relocate(chunk, oslot);
101740064aecSDennis Zhou (Facebook) 
101840064aecSDennis Zhou (Facebook) 	return bit_off * PCPU_MIN_ALLOC_SIZE;
1019a16037c8STejun Heo }
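/*
 * Illustrative sketch of the bookkeeping above: an allocation of 3 bits
 * at bit_off == 4 sets alloc_map bits 4-6, sets bound_map bits 4 and 7
 * (the starts of this area and of the next one), and clears bound_map
 * bits 5-6.  pcpu_free_area() later recovers the size by searching
 * bound_map for the next set bit after the start.
 */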
1020a16037c8STejun Heo 
1021a16037c8STejun Heo /**
102240064aecSDennis Zhou (Facebook)  * pcpu_free_area - frees the allocation at the given offset
1023fbf59bc9STejun Heo  * @chunk: chunk of interest
102440064aecSDennis Zhou (Facebook)  * @off: addr offset into chunk
1025fbf59bc9STejun Heo  *
102640064aecSDennis Zhou (Facebook)  * This function determines the size of an allocation to free using
102740064aecSDennis Zhou (Facebook)  * the boundary bitmap and clears the allocation map.
1028fbf59bc9STejun Heo  */
102940064aecSDennis Zhou (Facebook) static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
1030fbf59bc9STejun Heo {
103140064aecSDennis Zhou (Facebook) 	int bit_off, bits, end, oslot;
1032fbf59bc9STejun Heo 
10335ccd30e4SDennis Zhou 	lockdep_assert_held(&pcpu_lock);
103430a5b536SDennis Zhou 	pcpu_stats_area_dealloc(chunk);
10355ccd30e4SDennis Zhou 
103640064aecSDennis Zhou (Facebook) 	oslot = pcpu_chunk_slot(chunk);
1037723ad1d9SAl Viro 
103840064aecSDennis Zhou (Facebook) 	bit_off = off / PCPU_MIN_ALLOC_SIZE;
1039fbf59bc9STejun Heo 
104040064aecSDennis Zhou (Facebook) 	/* find end index */
104140064aecSDennis Zhou (Facebook) 	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
104240064aecSDennis Zhou (Facebook) 			    bit_off + 1);
104340064aecSDennis Zhou (Facebook) 	bits = end - bit_off;
104440064aecSDennis Zhou (Facebook) 	bitmap_clear(chunk->alloc_map, bit_off, bits);
10453d331ad7SAl Viro 
104640064aecSDennis Zhou (Facebook) 	/* update metadata */
104740064aecSDennis Zhou (Facebook) 	chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
1048fbf59bc9STejun Heo 
104986b442fbSDennis Zhou (Facebook) 	/* update first free bit */
105086b442fbSDennis Zhou (Facebook) 	chunk->first_bit = min(chunk->first_bit, bit_off);
105186b442fbSDennis Zhou (Facebook) 
1052ca460b3cSDennis Zhou (Facebook) 	pcpu_block_update_hint_free(chunk, bit_off, bits);
1053b539b87fSTejun Heo 
1054fbf59bc9STejun Heo 	pcpu_chunk_relocate(chunk, oslot);
1055fbf59bc9STejun Heo }
1056fbf59bc9STejun Heo 
1057ca460b3cSDennis Zhou (Facebook) static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
1058ca460b3cSDennis Zhou (Facebook) {
1059ca460b3cSDennis Zhou (Facebook) 	struct pcpu_block_md *md_block;
1060ca460b3cSDennis Zhou (Facebook) 
1061ca460b3cSDennis Zhou (Facebook) 	for (md_block = chunk->md_blocks;
1062ca460b3cSDennis Zhou (Facebook) 	     md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
1063ca460b3cSDennis Zhou (Facebook) 	     md_block++) {
1064ca460b3cSDennis Zhou (Facebook) 		md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
1065ca460b3cSDennis Zhou (Facebook) 		md_block->left_free = PCPU_BITMAP_BLOCK_BITS;
1066ca460b3cSDennis Zhou (Facebook) 		md_block->right_free = PCPU_BITMAP_BLOCK_BITS;
1067ca460b3cSDennis Zhou (Facebook) 	}
1068ca460b3cSDennis Zhou (Facebook) }
1069ca460b3cSDennis Zhou (Facebook) 
107040064aecSDennis Zhou (Facebook) /**
107140064aecSDennis Zhou (Facebook)  * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
107240064aecSDennis Zhou (Facebook)  * @tmp_addr: the start of the region served
107340064aecSDennis Zhou (Facebook)  * @map_size: size of the region served
107440064aecSDennis Zhou (Facebook)  *
107540064aecSDennis Zhou (Facebook)  * This is responsible for creating the chunks that serve the first chunk.  The
107640064aecSDennis Zhou (Facebook)  * base_addr is @tmp_addr aligned down to a page boundary, while the region
107740064aecSDennis Zhou (Facebook)  * end is aligned up.  Offsets are tracked to determine the region served.
107840064aecSDennis Zhou (Facebook)  * All this is done to appease the bitmap allocator by avoiding partial blocks.
107940064aecSDennis Zhou (Facebook)  *
108040064aecSDennis Zhou (Facebook)  * RETURNS:
108140064aecSDennis Zhou (Facebook)  * Chunk serving the region at @tmp_addr of @map_size.
108240064aecSDennis Zhou (Facebook)  */
1083c0ebfdc3SDennis Zhou (Facebook) static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
108440064aecSDennis Zhou (Facebook) 							 int map_size)
108510edf5b0SDennis Zhou (Facebook) {
108610edf5b0SDennis Zhou (Facebook) 	struct pcpu_chunk *chunk;
1087ca460b3cSDennis Zhou (Facebook) 	unsigned long aligned_addr, lcm_align;
108840064aecSDennis Zhou (Facebook) 	int start_offset, offset_bits, region_size, region_bits;
1089c0ebfdc3SDennis Zhou (Facebook) 
1090c0ebfdc3SDennis Zhou (Facebook) 	/* region calculations */
1091c0ebfdc3SDennis Zhou (Facebook) 	aligned_addr = tmp_addr & PAGE_MASK;
1092c0ebfdc3SDennis Zhou (Facebook) 
1093c0ebfdc3SDennis Zhou (Facebook) 	start_offset = tmp_addr - aligned_addr;
10946b9d7c8eSDennis Zhou (Facebook) 
1095ca460b3cSDennis Zhou (Facebook) 	/*
1096ca460b3cSDennis Zhou (Facebook) 	 * Align the end of the region with the LCM of PAGE_SIZE and
1097ca460b3cSDennis Zhou (Facebook) 	 * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
1098ca460b3cSDennis Zhou (Facebook) 	 * the other.
1099ca460b3cSDennis Zhou (Facebook) 	 */
1100ca460b3cSDennis Zhou (Facebook) 	lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
1101ca460b3cSDennis Zhou (Facebook) 	region_size = ALIGN(start_offset + map_size, lcm_align);
110210edf5b0SDennis Zhou (Facebook) 
1103c0ebfdc3SDennis Zhou (Facebook) 	/* allocate chunk */
11048ab16c43SDennis Zhou (Facebook) 	chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) +
11058ab16c43SDennis Zhou (Facebook) 				    BITS_TO_LONGS(region_size >> PAGE_SHIFT) *
11068ab16c43SDennis Zhou (Facebook) 				    sizeof(chunk->populated[0]), 0);
1107c0ebfdc3SDennis Zhou (Facebook) 
110810edf5b0SDennis Zhou (Facebook) 	INIT_LIST_HEAD(&chunk->list);
1109c0ebfdc3SDennis Zhou (Facebook) 
1110c0ebfdc3SDennis Zhou (Facebook) 	chunk->base_addr = (void *)aligned_addr;
111110edf5b0SDennis Zhou (Facebook) 	chunk->start_offset = start_offset;
11126b9d7c8eSDennis Zhou (Facebook) 	chunk->end_offset = region_size - chunk->start_offset - map_size;
1113c0ebfdc3SDennis Zhou (Facebook) 
11148ab16c43SDennis Zhou (Facebook) 	chunk->nr_pages = region_size >> PAGE_SHIFT;
111540064aecSDennis Zhou (Facebook) 	region_bits = pcpu_chunk_map_bits(chunk);
1116c0ebfdc3SDennis Zhou (Facebook) 
1117ca460b3cSDennis Zhou (Facebook) 	chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) *
111840064aecSDennis Zhou (Facebook) 					       sizeof(chunk->alloc_map[0]), 0);
1119ca460b3cSDennis Zhou (Facebook) 	chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) *
112040064aecSDennis Zhou (Facebook) 					       sizeof(chunk->bound_map[0]), 0);
1121ca460b3cSDennis Zhou (Facebook) 	chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) *
1122ca460b3cSDennis Zhou (Facebook) 					       sizeof(chunk->md_blocks[0]), 0);
1123ca460b3cSDennis Zhou (Facebook) 	pcpu_init_md_blocks(chunk);
112410edf5b0SDennis Zhou (Facebook) 
112510edf5b0SDennis Zhou (Facebook) 	/* manage populated page bitmap */
112610edf5b0SDennis Zhou (Facebook) 	chunk->immutable = true;
11278ab16c43SDennis Zhou (Facebook) 	bitmap_fill(chunk->populated, chunk->nr_pages);
11288ab16c43SDennis Zhou (Facebook) 	chunk->nr_populated = chunk->nr_pages;
112940064aecSDennis Zhou (Facebook) 	chunk->nr_empty_pop_pages =
113040064aecSDennis Zhou (Facebook) 		pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
113140064aecSDennis Zhou (Facebook) 				   map_size / PCPU_MIN_ALLOC_SIZE);
113210edf5b0SDennis Zhou (Facebook) 
113340064aecSDennis Zhou (Facebook) 	chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
113440064aecSDennis Zhou (Facebook) 	chunk->free_bytes = map_size;
1135c0ebfdc3SDennis Zhou (Facebook) 
1136c0ebfdc3SDennis Zhou (Facebook) 	if (chunk->start_offset) {
1137c0ebfdc3SDennis Zhou (Facebook) 		/* hide the beginning of the bitmap */
113840064aecSDennis Zhou (Facebook) 		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
113940064aecSDennis Zhou (Facebook) 		bitmap_set(chunk->alloc_map, 0, offset_bits);
114040064aecSDennis Zhou (Facebook) 		set_bit(0, chunk->bound_map);
114140064aecSDennis Zhou (Facebook) 		set_bit(offset_bits, chunk->bound_map);
1142ca460b3cSDennis Zhou (Facebook) 
114386b442fbSDennis Zhou (Facebook) 		chunk->first_bit = offset_bits;
114486b442fbSDennis Zhou (Facebook) 
1145ca460b3cSDennis Zhou (Facebook) 		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
1146c0ebfdc3SDennis Zhou (Facebook) 	}
1147c0ebfdc3SDennis Zhou (Facebook) 
11486b9d7c8eSDennis Zhou (Facebook) 	if (chunk->end_offset) {
11496b9d7c8eSDennis Zhou (Facebook) 		/* hide the end of the bitmap */
115040064aecSDennis Zhou (Facebook) 		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
115140064aecSDennis Zhou (Facebook) 		bitmap_set(chunk->alloc_map,
115240064aecSDennis Zhou (Facebook) 			   pcpu_chunk_map_bits(chunk) - offset_bits,
115340064aecSDennis Zhou (Facebook) 			   offset_bits);
115440064aecSDennis Zhou (Facebook) 		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
115540064aecSDennis Zhou (Facebook) 			chunk->bound_map);
115640064aecSDennis Zhou (Facebook) 		set_bit(region_bits, chunk->bound_map);
11576b9d7c8eSDennis Zhou (Facebook) 
1158ca460b3cSDennis Zhou (Facebook) 		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
1159ca460b3cSDennis Zhou (Facebook) 					     - offset_bits, offset_bits);
1160ca460b3cSDennis Zhou (Facebook) 	}
116140064aecSDennis Zhou (Facebook) 
116210edf5b0SDennis Zhou (Facebook) 	return chunk;
116310edf5b0SDennis Zhou (Facebook) }
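/*
 * Worked example (illustrative, assuming lcm_align == PAGE_SIZE == 0x1000):
 * for tmp_addr == 0x1234 and map_size == 0x3000, aligned_addr == 0x1000,
 * start_offset == 0x234, region_size == ALIGN(0x3234, 0x1000) == 0x4000,
 * and end_offset == 0x4000 - 0x234 - 0x3000 == 0xdcc.  Both offsets are
 * then hidden from the allocation bitmap above.
 */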
116410edf5b0SDennis Zhou (Facebook) 
116547504ee0SDennis Zhou static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
11666081089fSTejun Heo {
11676081089fSTejun Heo 	struct pcpu_chunk *chunk;
116840064aecSDennis Zhou (Facebook) 	int region_bits;
11696081089fSTejun Heo 
117047504ee0SDennis Zhou 	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
11716081089fSTejun Heo 	if (!chunk)
11726081089fSTejun Heo 		return NULL;
11736081089fSTejun Heo 
11746081089fSTejun Heo 	INIT_LIST_HEAD(&chunk->list);
1175c0ebfdc3SDennis Zhou (Facebook) 	chunk->nr_pages = pcpu_unit_pages;
117640064aecSDennis Zhou (Facebook) 	region_bits = pcpu_chunk_map_bits(chunk);
117740064aecSDennis Zhou (Facebook) 
117840064aecSDennis Zhou (Facebook) 	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
117947504ee0SDennis Zhou 					   sizeof(chunk->alloc_map[0]), gfp);
118040064aecSDennis Zhou (Facebook) 	if (!chunk->alloc_map)
118140064aecSDennis Zhou (Facebook) 		goto alloc_map_fail;
118240064aecSDennis Zhou (Facebook) 
118340064aecSDennis Zhou (Facebook) 	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
118447504ee0SDennis Zhou 					   sizeof(chunk->bound_map[0]), gfp);
118540064aecSDennis Zhou (Facebook) 	if (!chunk->bound_map)
118640064aecSDennis Zhou (Facebook) 		goto bound_map_fail;
118740064aecSDennis Zhou (Facebook) 
1188ca460b3cSDennis Zhou (Facebook) 	chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
118947504ee0SDennis Zhou 					   sizeof(chunk->md_blocks[0]), gfp);
1190ca460b3cSDennis Zhou (Facebook) 	if (!chunk->md_blocks)
1191ca460b3cSDennis Zhou (Facebook) 		goto md_blocks_fail;
1192ca460b3cSDennis Zhou (Facebook) 
1193ca460b3cSDennis Zhou (Facebook) 	pcpu_init_md_blocks(chunk);
1194ca460b3cSDennis Zhou (Facebook) 
119540064aecSDennis Zhou (Facebook) 	/* init metadata */
119640064aecSDennis Zhou (Facebook) 	chunk->contig_bits = region_bits;
119740064aecSDennis Zhou (Facebook) 	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
1198c0ebfdc3SDennis Zhou (Facebook) 
11996081089fSTejun Heo 	return chunk;
120040064aecSDennis Zhou (Facebook) 
1201ca460b3cSDennis Zhou (Facebook) md_blocks_fail:
1202ca460b3cSDennis Zhou (Facebook) 	pcpu_mem_free(chunk->bound_map);
120340064aecSDennis Zhou (Facebook) bound_map_fail:
120440064aecSDennis Zhou (Facebook) 	pcpu_mem_free(chunk->alloc_map);
120540064aecSDennis Zhou (Facebook) alloc_map_fail:
120640064aecSDennis Zhou (Facebook) 	pcpu_mem_free(chunk);
120740064aecSDennis Zhou (Facebook) 
120840064aecSDennis Zhou (Facebook) 	return NULL;
12096081089fSTejun Heo }
12106081089fSTejun Heo 
12116081089fSTejun Heo static void pcpu_free_chunk(struct pcpu_chunk *chunk)
12126081089fSTejun Heo {
12136081089fSTejun Heo 	if (!chunk)
12146081089fSTejun Heo 		return;
1215*6685b357SMike Rapoport 	pcpu_mem_free(chunk->md_blocks);
121640064aecSDennis Zhou (Facebook) 	pcpu_mem_free(chunk->bound_map);
121740064aecSDennis Zhou (Facebook) 	pcpu_mem_free(chunk->alloc_map);
12181d5cfdb0STetsuo Handa 	pcpu_mem_free(chunk);
12196081089fSTejun Heo }
12206081089fSTejun Heo 
1221b539b87fSTejun Heo /**
1222b539b87fSTejun Heo  * pcpu_chunk_populated - post-population bookkeeping
1223b539b87fSTejun Heo  * @chunk: pcpu_chunk which got populated
1224b539b87fSTejun Heo  * @page_start: the start page
1225b539b87fSTejun Heo  * @page_end: the end page
122640064aecSDennis Zhou (Facebook)  * @for_alloc: if this is to populate for allocation
1227b539b87fSTejun Heo  *
1228b539b87fSTejun Heo  * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
1229b539b87fSTejun Heo  * the bookkeeping information accordingly.  Must be called after each
1230b539b87fSTejun Heo  * successful population.
123140064aecSDennis Zhou (Facebook)  *
123240064aecSDennis Zhou (Facebook)  * If @for_alloc is set, do not increment pcpu_nr_empty_pop_pages because
123340064aecSDennis Zhou (Facebook)  * the pages are being populated to serve an allocation in that area.
1234b539b87fSTejun Heo  */
123540064aecSDennis Zhou (Facebook) static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
123640064aecSDennis Zhou (Facebook) 				 int page_end, bool for_alloc)
1237b539b87fSTejun Heo {
1238b539b87fSTejun Heo 	int nr = page_end - page_start;
1239b539b87fSTejun Heo 
1240b539b87fSTejun Heo 	lockdep_assert_held(&pcpu_lock);
1241b539b87fSTejun Heo 
1242b539b87fSTejun Heo 	bitmap_set(chunk->populated, page_start, nr);
1243b539b87fSTejun Heo 	chunk->nr_populated += nr;
12447e8a6304SDennis Zhou (Facebook) 	pcpu_nr_populated += nr;
124540064aecSDennis Zhou (Facebook) 
124640064aecSDennis Zhou (Facebook) 	if (!for_alloc) {
12470cecf50cSDennis Zhou (Facebook) 		chunk->nr_empty_pop_pages += nr;
1248b539b87fSTejun Heo 		pcpu_nr_empty_pop_pages += nr;
1249b539b87fSTejun Heo 	}
125040064aecSDennis Zhou (Facebook) }
1251b539b87fSTejun Heo 
1252b539b87fSTejun Heo /**
1253b539b87fSTejun Heo  * pcpu_chunk_depopulated - post-depopulation bookkeeping
1254b539b87fSTejun Heo  * @chunk: pcpu_chunk which got depopulated
1255b539b87fSTejun Heo  * @page_start: the start page
1256b539b87fSTejun Heo  * @page_end: the end page
1257b539b87fSTejun Heo  *
1258b539b87fSTejun Heo  * Pages in [@page_start,@page_end) have been depopulated from @chunk.
1259b539b87fSTejun Heo  * Update the bookkeeping information accordingly.  Must be called after
1260b539b87fSTejun Heo  * each successful depopulation.
1261b539b87fSTejun Heo  */
1262b539b87fSTejun Heo static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
1263b539b87fSTejun Heo 				   int page_start, int page_end)
1264b539b87fSTejun Heo {
1265b539b87fSTejun Heo 	int nr = page_end - page_start;
1266b539b87fSTejun Heo 
1267b539b87fSTejun Heo 	lockdep_assert_held(&pcpu_lock);
1268b539b87fSTejun Heo 
1269b539b87fSTejun Heo 	bitmap_clear(chunk->populated, page_start, nr);
1270b539b87fSTejun Heo 	chunk->nr_populated -= nr;
12710cecf50cSDennis Zhou (Facebook) 	chunk->nr_empty_pop_pages -= nr;
1272b539b87fSTejun Heo 	pcpu_nr_empty_pop_pages -= nr;
12737e8a6304SDennis Zhou (Facebook) 	pcpu_nr_populated -= nr;
1274b539b87fSTejun Heo }
1275b539b87fSTejun Heo 
1276fbf59bc9STejun Heo /*
12779f645532STejun Heo  * Chunk management implementation.
1278fbf59bc9STejun Heo  *
12799f645532STejun Heo  * To allow different implementations, chunk alloc/free and
12809f645532STejun Heo  * [de]population are implemented in a separate file which is pulled
12819f645532STejun Heo  * into this file and compiled together.  The following functions
12829f645532STejun Heo  * should be implemented.
1283ccea34b5STejun Heo  *
12849f645532STejun Heo  * pcpu_populate_chunk		- populate the specified range of a chunk
12859f645532STejun Heo  * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
12869f645532STejun Heo  * pcpu_create_chunk		- create a new chunk
12879f645532STejun Heo  * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
12889f645532STejun Heo  * pcpu_addr_to_page		- translate address to the corresponding page
12899f645532STejun Heo  * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
1290fbf59bc9STejun Heo  */
129115d9f3d1SDennis Zhou static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
129247504ee0SDennis Zhou 			       int page_start, int page_end, gfp_t gfp);
129315d9f3d1SDennis Zhou static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
129415d9f3d1SDennis Zhou 				  int page_start, int page_end);
129547504ee0SDennis Zhou static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
12969f645532STejun Heo static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
12979f645532STejun Heo static struct page *pcpu_addr_to_page(void *addr);
12989f645532STejun Heo static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
1299fbf59bc9STejun Heo 
1300b0c9778bSTejun Heo #ifdef CONFIG_NEED_PER_CPU_KM
1301b0c9778bSTejun Heo #include "percpu-km.c"
1302b0c9778bSTejun Heo #else
13039f645532STejun Heo #include "percpu-vm.c"
1304b0c9778bSTejun Heo #endif
1305fbf59bc9STejun Heo 
1306fbf59bc9STejun Heo /**
130788999a89STejun Heo  * pcpu_chunk_addr_search - determine chunk containing specified address
130888999a89STejun Heo  * @addr: address for which the chunk needs to be determined.
130988999a89STejun Heo  *
1310c0ebfdc3SDennis Zhou (Facebook)  * This is an internal function that handles all but static allocations.
1311c0ebfdc3SDennis Zhou (Facebook)  * Static percpu address values should never be passed into the allocator.
1312c0ebfdc3SDennis Zhou (Facebook)  *
131388999a89STejun Heo  * RETURNS:
131488999a89STejun Heo  * The address of the found chunk.
131588999a89STejun Heo  */
131688999a89STejun Heo static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
131788999a89STejun Heo {
1318c0ebfdc3SDennis Zhou (Facebook) 	/* is it in the dynamic region (first chunk)? */
1319560f2c23SDennis Zhou (Facebook) 	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
1320c0ebfdc3SDennis Zhou (Facebook) 		return pcpu_first_chunk;
1321c0ebfdc3SDennis Zhou (Facebook) 
1322c0ebfdc3SDennis Zhou (Facebook) 	/* is it in the reserved region? */
1323560f2c23SDennis Zhou (Facebook) 	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
132488999a89STejun Heo 		return pcpu_reserved_chunk;
132588999a89STejun Heo 
132688999a89STejun Heo 	/*
132788999a89STejun Heo 	 * The address is relative to unit0 which might be unused and
132888999a89STejun Heo 	 * thus unmapped.  Offset the address to the unit space of the
132988999a89STejun Heo 	 * current processor before looking it up in the vmalloc
133088999a89STejun Heo 	 * space.  Note that any possible cpu id can be used here, so
133188999a89STejun Heo 	 * there's no need to worry about preemption or cpu hotplug.
133288999a89STejun Heo 	 */
133388999a89STejun Heo 	addr += pcpu_unit_offsets[raw_smp_processor_id()];
13349f645532STejun Heo 	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
133588999a89STejun Heo }
133688999a89STejun Heo 
133788999a89STejun Heo /**
1338edcb4639STejun Heo  * pcpu_alloc - the percpu allocator
1339cae3aeb8STejun Heo  * @size: size of area to allocate in bytes
1340fbf59bc9STejun Heo  * @align: alignment of area (max PAGE_SIZE)
1341edcb4639STejun Heo  * @reserved: allocate from the reserved chunk if available
13425835d96eSTejun Heo  * @gfp: allocation flags
1343fbf59bc9STejun Heo  *
13445835d96eSTejun Heo  * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
13450ea7eeecSDaniel Borkmann  * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
13460ea7eeecSDaniel Borkmann  * then no warning will be triggered on invalid or failed allocation
13470ea7eeecSDaniel Borkmann  * requests.
1348fbf59bc9STejun Heo  *
1349fbf59bc9STejun Heo  * RETURNS:
1350fbf59bc9STejun Heo  * Percpu pointer to the allocated area on success, NULL on failure.
1351fbf59bc9STejun Heo  */
13525835d96eSTejun Heo static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
13535835d96eSTejun Heo 				 gfp_t gfp)
1354fbf59bc9STejun Heo {
1355554fef1cSDennis Zhou 	/* whitelisted flags that can be passed to the backing allocators */
1356554fef1cSDennis Zhou 	gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
13570ea7eeecSDaniel Borkmann 	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
13580ea7eeecSDaniel Borkmann 	bool do_warn = !(gfp & __GFP_NOWARN);
1359f2badb0cSTejun Heo 	static int warn_limit = 10;
1360fbf59bc9STejun Heo 	struct pcpu_chunk *chunk;
1361f2badb0cSTejun Heo 	const char *err;
136240064aecSDennis Zhou (Facebook) 	int slot, off, cpu, ret;
1363403a91b1SJiri Kosina 	unsigned long flags;
1364f528f0b8SCatalin Marinas 	void __percpu *ptr;
136540064aecSDennis Zhou (Facebook) 	size_t bits, bit_align;
1366fbf59bc9STejun Heo 
1367723ad1d9SAl Viro 	/*
136840064aecSDennis Zhou (Facebook) 	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
136940064aecSDennis Zhou (Facebook) 	 * therefore alignment must be a minimum of that many bytes.
137040064aecSDennis Zhou (Facebook) 	 * An allocation may have internal fragmentation of up to
137140064aecSDennis Zhou (Facebook) 	 * PCPU_MIN_ALLOC_SIZE - 1 bytes from rounding up.
1372723ad1d9SAl Viro 	 */
1373d2f3c384SDennis Zhou (Facebook) 	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
1374d2f3c384SDennis Zhou (Facebook) 		align = PCPU_MIN_ALLOC_SIZE;
1375723ad1d9SAl Viro 
1376d2f3c384SDennis Zhou (Facebook) 	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
137740064aecSDennis Zhou (Facebook) 	bits = size >> PCPU_MIN_ALLOC_SHIFT;
137840064aecSDennis Zhou (Facebook) 	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
13792f69fa82SViro 
13803ca45a46Szijun_hu 	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
13813ca45a46Szijun_hu 		     !is_power_of_2(align))) {
13820ea7eeecSDaniel Borkmann 		WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
1383756a025fSJoe Perches 		     size, align);
1384fbf59bc9STejun Heo 		return NULL;
1385fbf59bc9STejun Heo 	}
1386fbf59bc9STejun Heo 
1387f52ba1feSKirill Tkhai 	if (!is_atomic) {
1388f52ba1feSKirill Tkhai 		/*
1389f52ba1feSKirill Tkhai 		 * pcpu_balance_workfn() allocates memory under this mutex,
1390f52ba1feSKirill Tkhai 		 * and it may wait for memory reclaim. Allow current task
1391f52ba1feSKirill Tkhai 		 * to become OOM victim, in case of memory pressure.
1392f52ba1feSKirill Tkhai 		 */
1393f52ba1feSKirill Tkhai 		if (gfp & __GFP_NOFAIL)
13946710e594STejun Heo 			mutex_lock(&pcpu_alloc_mutex);
1395f52ba1feSKirill Tkhai 		else if (mutex_lock_killable(&pcpu_alloc_mutex))
1396f52ba1feSKirill Tkhai 			return NULL;
1397f52ba1feSKirill Tkhai 	}
13986710e594STejun Heo 
1399403a91b1SJiri Kosina 	spin_lock_irqsave(&pcpu_lock, flags);
1400fbf59bc9STejun Heo 
1401edcb4639STejun Heo 	/* serve reserved allocations from the reserved chunk if available */
1402edcb4639STejun Heo 	if (reserved && pcpu_reserved_chunk) {
1403edcb4639STejun Heo 		chunk = pcpu_reserved_chunk;
1404833af842STejun Heo 
140540064aecSDennis Zhou (Facebook) 		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
140640064aecSDennis Zhou (Facebook) 		if (off < 0) {
1407833af842STejun Heo 			err = "alloc from reserved chunk failed";
1408ccea34b5STejun Heo 			goto fail_unlock;
1409f2badb0cSTejun Heo 		}
1410833af842STejun Heo 
141140064aecSDennis Zhou (Facebook) 		off = pcpu_alloc_area(chunk, bits, bit_align, off);
1412edcb4639STejun Heo 		if (off >= 0)
1413edcb4639STejun Heo 			goto area_found;
1414833af842STejun Heo 
1415f2badb0cSTejun Heo 		err = "alloc from reserved chunk failed";
1416ccea34b5STejun Heo 		goto fail_unlock;
1417edcb4639STejun Heo 	}
1418edcb4639STejun Heo 
1419ccea34b5STejun Heo restart:
1420edcb4639STejun Heo 	/* search through normal chunks */
1421fbf59bc9STejun Heo 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
1422fbf59bc9STejun Heo 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
142340064aecSDennis Zhou (Facebook) 			off = pcpu_find_block_fit(chunk, bits, bit_align,
142440064aecSDennis Zhou (Facebook) 						  is_atomic);
142540064aecSDennis Zhou (Facebook) 			if (off < 0)
1426fbf59bc9STejun Heo 				continue;
1427ccea34b5STejun Heo 
142840064aecSDennis Zhou (Facebook) 			off = pcpu_alloc_area(chunk, bits, bit_align, off);
1429fbf59bc9STejun Heo 			if (off >= 0)
1430fbf59bc9STejun Heo 				goto area_found;
143140064aecSDennis Zhou (Facebook) 
1432fbf59bc9STejun Heo 		}
1433fbf59bc9STejun Heo 	}
1434fbf59bc9STejun Heo 
1435403a91b1SJiri Kosina 	spin_unlock_irqrestore(&pcpu_lock, flags);
1436ccea34b5STejun Heo 
1437b38d08f3STejun Heo 	/*
1438b38d08f3STejun Heo 	 * No space left.  Create a new chunk.  We don't want multiple
1439b38d08f3STejun Heo 	 * tasks to create chunks simultaneously.  Serialize and create iff
1440b38d08f3STejun Heo 	 * there's still no empty chunk after grabbing the mutex.
1441b38d08f3STejun Heo 	 */
144211df02bfSDennis Zhou 	if (is_atomic) {
144311df02bfSDennis Zhou 		err = "atomic alloc failed, no space left";
14445835d96eSTejun Heo 		goto fail;
144511df02bfSDennis Zhou 	}
14465835d96eSTejun Heo 
1447b38d08f3STejun Heo 	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
1448554fef1cSDennis Zhou 		chunk = pcpu_create_chunk(pcpu_gfp);
1449f2badb0cSTejun Heo 		if (!chunk) {
1450f2badb0cSTejun Heo 			err = "failed to allocate new chunk";
1451b38d08f3STejun Heo 			goto fail;
1452f2badb0cSTejun Heo 		}
1453ccea34b5STejun Heo 
1454403a91b1SJiri Kosina 		spin_lock_irqsave(&pcpu_lock, flags);
1455fbf59bc9STejun Heo 		pcpu_chunk_relocate(chunk, -1);
1456b38d08f3STejun Heo 	} else {
1457b38d08f3STejun Heo 		spin_lock_irqsave(&pcpu_lock, flags);
1458b38d08f3STejun Heo 	}
1459b38d08f3STejun Heo 
1460ccea34b5STejun Heo 	goto restart;
1461fbf59bc9STejun Heo 
1462fbf59bc9STejun Heo area_found:
146330a5b536SDennis Zhou 	pcpu_stats_area_alloc(chunk, size);
1464403a91b1SJiri Kosina 	spin_unlock_irqrestore(&pcpu_lock, flags);
1465ccea34b5STejun Heo 
1466dca49645STejun Heo 	/* populate if not all pages are already there */
14675835d96eSTejun Heo 	if (!is_atomic) {
1468e04d3208STejun Heo 		int page_start, page_end, rs, re;
1469e04d3208STejun Heo 
1470dca49645STejun Heo 		page_start = PFN_DOWN(off);
1471dca49645STejun Heo 		page_end = PFN_UP(off + size);
1472dca49645STejun Heo 
147391e914c5SDennis Zhou (Facebook) 		pcpu_for_each_unpop_region(chunk->populated, rs, re,
147491e914c5SDennis Zhou (Facebook) 					   page_start, page_end) {
1475dca49645STejun Heo 			WARN_ON(chunk->immutable);
1476dca49645STejun Heo 
1477554fef1cSDennis Zhou 			ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
1478b38d08f3STejun Heo 
1479403a91b1SJiri Kosina 			spin_lock_irqsave(&pcpu_lock, flags);
1480b38d08f3STejun Heo 			if (ret) {
148140064aecSDennis Zhou (Facebook) 				pcpu_free_area(chunk, off);
1482f2badb0cSTejun Heo 				err = "failed to populate";
1483ccea34b5STejun Heo 				goto fail_unlock;
1484fbf59bc9STejun Heo 			}
148540064aecSDennis Zhou (Facebook) 			pcpu_chunk_populated(chunk, rs, re, true);
1486b38d08f3STejun Heo 			spin_unlock_irqrestore(&pcpu_lock, flags);
1487dca49645STejun Heo 		}
1488dca49645STejun Heo 
1489ccea34b5STejun Heo 		mutex_unlock(&pcpu_alloc_mutex);
1490e04d3208STejun Heo 	}
1491ccea34b5STejun Heo 
14921a4d7607STejun Heo 	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
14931a4d7607STejun Heo 		pcpu_schedule_balance_work();
14941a4d7607STejun Heo 
1495dca49645STejun Heo 	/* clear the areas and return address relative to base address */
1496dca49645STejun Heo 	for_each_possible_cpu(cpu)
1497dca49645STejun Heo 		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1498dca49645STejun Heo 
1499f528f0b8SCatalin Marinas 	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
15008a8c35faSLarry Finger 	kmemleak_alloc_percpu(ptr, size, gfp);
1501df95e795SDennis Zhou 
1502df95e795SDennis Zhou 	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
1503df95e795SDennis Zhou 			chunk->base_addr, off, ptr);
1504df95e795SDennis Zhou 
1505f528f0b8SCatalin Marinas 	return ptr;
1506ccea34b5STejun Heo 
1507ccea34b5STejun Heo fail_unlock:
1508403a91b1SJiri Kosina 	spin_unlock_irqrestore(&pcpu_lock, flags);
1509b38d08f3STejun Heo fail:
1510df95e795SDennis Zhou 	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
1511df95e795SDennis Zhou 
15120ea7eeecSDaniel Borkmann 	if (!is_atomic && do_warn && warn_limit) {
1513870d4b12SJoe Perches 		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
15145835d96eSTejun Heo 			size, align, is_atomic, err);
1515f2badb0cSTejun Heo 		dump_stack();
1516f2badb0cSTejun Heo 		if (!--warn_limit)
1517870d4b12SJoe Perches 			pr_info("limit reached, disabling warnings\n");
1518f2badb0cSTejun Heo 	}
15191a4d7607STejun Heo 	if (is_atomic) {
15201a4d7607STejun Heo 		/* see the flag handling in pcpu_balance_workfn() */
15211a4d7607STejun Heo 		pcpu_atomic_alloc_failed = true;
15221a4d7607STejun Heo 		pcpu_schedule_balance_work();
15236710e594STejun Heo 	} else {
15246710e594STejun Heo 		mutex_unlock(&pcpu_alloc_mutex);
15251a4d7607STejun Heo 	}
1526ccea34b5STejun Heo 	return NULL;
1527fbf59bc9STejun Heo }
1528edcb4639STejun Heo 
1529edcb4639STejun Heo /**
15305835d96eSTejun Heo  * __alloc_percpu_gfp - allocate dynamic percpu area
1531edcb4639STejun Heo  * @size: size of area to allocate in bytes
1532edcb4639STejun Heo  * @align: alignment of area (max PAGE_SIZE)
15335835d96eSTejun Heo  * @gfp: allocation flags
1534edcb4639STejun Heo  *
15355835d96eSTejun Heo  * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
15365835d96eSTejun Heo  * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
15370ea7eeecSDaniel Borkmann  * be called from any context but is a lot more likely to fail. If @gfp
15380ea7eeecSDaniel Borkmann  * has __GFP_NOWARN then no warning will be triggered on invalid or failed
15390ea7eeecSDaniel Borkmann  * allocation requests.
1540ccea34b5STejun Heo  *
1541edcb4639STejun Heo  * RETURNS:
1542edcb4639STejun Heo  * Percpu pointer to the allocated area on success, NULL on failure.
1543edcb4639STejun Heo  */
15445835d96eSTejun Heo void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
15455835d96eSTejun Heo {
15465835d96eSTejun Heo 	return pcpu_alloc(size, align, false, gfp);
15475835d96eSTejun Heo }
15485835d96eSTejun Heo EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
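/*
 * Usage sketch (illustrative, not part of this file; example_counters is
 * a hypothetical name):
 *
 *	static u64 __percpu *example_counters;
 *
 *	example_counters = alloc_percpu_gfp(u64, GFP_NOWAIT);
 *	if (!example_counters)
 *		return -ENOMEM;
 *
 * Without GFP_KERNEL in @gfp the request is atomic and can only be
 * served from already populated pages.
 */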
15495835d96eSTejun Heo 
15505835d96eSTejun Heo /**
15515835d96eSTejun Heo  * __alloc_percpu - allocate dynamic percpu area
15525835d96eSTejun Heo  * @size: size of area to allocate in bytes
15535835d96eSTejun Heo  * @align: alignment of area (max PAGE_SIZE)
15545835d96eSTejun Heo  *
15555835d96eSTejun Heo  * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
15565835d96eSTejun Heo  */
155743cf38ebSTejun Heo void __percpu *__alloc_percpu(size_t size, size_t align)
1558edcb4639STejun Heo {
15595835d96eSTejun Heo 	return pcpu_alloc(size, align, false, GFP_KERNEL);
1560edcb4639STejun Heo }
1561fbf59bc9STejun Heo EXPORT_SYMBOL_GPL(__alloc_percpu);
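/*
 * Usage sketch (illustrative, not part of this file): the full lifecycle
 * of a dynamically allocated per-cpu hit counter.  The name "hits" is
 * hypothetical; alloc_percpu(), this_cpu_inc(), per_cpu_ptr() and
 * free_percpu() are the APIs exercised.
 *
 *	static unsigned long __percpu *hits;
 *	unsigned long sum = 0;
 *	int cpu;
 *
 *	hits = alloc_percpu(unsigned long);
 *	this_cpu_inc(*hits);
 *	for_each_possible_cpu(cpu)
 *		sum += *per_cpu_ptr(hits, cpu);
 *	free_percpu(hits);
 *
 * alloc_percpu() zero-fills the area, this_cpu_inc() is a preemption-safe
 * update of the local copy, and the summation walks every possible cpu's
 * copy before the area is returned to the allocator.
 */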
1562fbf59bc9STejun Heo 
1563edcb4639STejun Heo /**
1564edcb4639STejun Heo  * __alloc_reserved_percpu - allocate reserved percpu area
1565edcb4639STejun Heo  * @size: size of area to allocate in bytes
1566edcb4639STejun Heo  * @align: alignment of area (max PAGE_SIZE)
1567edcb4639STejun Heo  *
15689329ba97STejun Heo  * Allocate zero-filled percpu area of @size bytes aligned at @align
15699329ba97STejun Heo  * from reserved percpu area if arch has set it up; otherwise,
15709329ba97STejun Heo  * allocation is served from the same dynamic area.  Might sleep.
15719329ba97STejun Heo  * Might trigger writeouts.
1572edcb4639STejun Heo  *
1573ccea34b5STejun Heo  * CONTEXT:
1574ccea34b5STejun Heo  * Does GFP_KERNEL allocation.
1575ccea34b5STejun Heo  *
1576edcb4639STejun Heo  * RETURNS:
1577edcb4639STejun Heo  * Percpu pointer to the allocated area on success, NULL on failure.
1578edcb4639STejun Heo  */
157943cf38ebSTejun Heo void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1580edcb4639STejun Heo {
15815835d96eSTejun Heo 	return pcpu_alloc(size, align, true, GFP_KERNEL);
1582edcb4639STejun Heo }
1583edcb4639STejun Heo 
1584a56dbddfSTejun Heo /**
15851a4d7607STejun Heo  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
1586a56dbddfSTejun Heo  * @work: unused
1587a56dbddfSTejun Heo  *
158847504ee0SDennis Zhou  * Reclaim all fully free chunks except for the first one.  This is also
158947504ee0SDennis Zhou  * responsible for maintaining the pool of empty populated pages.  However,
159047504ee0SDennis Zhou  * it is possible that this is called when physical memory is scarce, causing
159147504ee0SDennis Zhou  * the OOM killer to be triggered.  We should avoid doing so until an actual
159247504ee0SDennis Zhou  * allocation fails, as it is possible that requests can still be
159347504ee0SDennis Zhou  * serviced from already backed regions.
1594a56dbddfSTejun Heo  */
1595fe6bd8c3STejun Heo static void pcpu_balance_workfn(struct work_struct *work)
1596fbf59bc9STejun Heo {
159747504ee0SDennis Zhou 	/* gfp flags passed to underlying allocators */
1598554fef1cSDennis Zhou 	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
1599fe6bd8c3STejun Heo 	LIST_HEAD(to_free);
1600fe6bd8c3STejun Heo 	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
1601a56dbddfSTejun Heo 	struct pcpu_chunk *chunk, *next;
16021a4d7607STejun Heo 	int slot, nr_to_pop, ret;
1603a56dbddfSTejun Heo 
16041a4d7607STejun Heo 	/*
16051a4d7607STejun Heo 	 * There's no reason to keep around multiple unused chunks and VM
16061a4d7607STejun Heo 	 * areas can be scarce.  Destroy all free chunks except for one.
16071a4d7607STejun Heo 	 */
1608ccea34b5STejun Heo 	mutex_lock(&pcpu_alloc_mutex);
1609ccea34b5STejun Heo 	spin_lock_irq(&pcpu_lock);
1610a56dbddfSTejun Heo 
1611fe6bd8c3STejun Heo 	list_for_each_entry_safe(chunk, next, free_head, list) {
16128d408b4bSTejun Heo 		WARN_ON(chunk->immutable);
1613a56dbddfSTejun Heo 
1614a56dbddfSTejun Heo 		/* spare the first one */
1615fe6bd8c3STejun Heo 		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
1616a56dbddfSTejun Heo 			continue;
1617a56dbddfSTejun Heo 
1618fe6bd8c3STejun Heo 		list_move(&chunk->list, &to_free);
1619a56dbddfSTejun Heo 	}
1620a56dbddfSTejun Heo 
1621ccea34b5STejun Heo 	spin_unlock_irq(&pcpu_lock);
1622a56dbddfSTejun Heo 
1623fe6bd8c3STejun Heo 	list_for_each_entry_safe(chunk, next, &to_free, list) {
1624a93ace48STejun Heo 		int rs, re;
1625dca49645STejun Heo 
162691e914c5SDennis Zhou (Facebook) 		pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
162791e914c5SDennis Zhou (Facebook) 					 chunk->nr_pages) {
1628a93ace48STejun Heo 			pcpu_depopulate_chunk(chunk, rs, re);
1629b539b87fSTejun Heo 			spin_lock_irq(&pcpu_lock);
1630b539b87fSTejun Heo 			pcpu_chunk_depopulated(chunk, rs, re);
1631b539b87fSTejun Heo 			spin_unlock_irq(&pcpu_lock);
1632a93ace48STejun Heo 		}
16336081089fSTejun Heo 		pcpu_destroy_chunk(chunk);
1634accd4f36SEric Dumazet 		cond_resched();
1635fbf59bc9STejun Heo 	}
1636971f3918STejun Heo 
16371a4d7607STejun Heo 	/*
16381a4d7607STejun Heo 	 * Ensure there are a certain number of free populated pages for
16391a4d7607STejun Heo 	 * atomic allocs.  Fill up from the most packed so that atomic
16401a4d7607STejun Heo 	 * allocs don't increase fragmentation.  If atomic allocation
16411a4d7607STejun Heo 	 * failed previously, always populate the maximum amount.  This
16421a4d7607STejun Heo 	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
16431a4d7607STejun Heo 	 * failing indefinitely; however, large atomic allocs are not
16441a4d7607STejun Heo 	 * something we support properly and can be highly unreliable and
16451a4d7607STejun Heo 	 * inefficient.
16461a4d7607STejun Heo 	 */
16471a4d7607STejun Heo retry_pop:
16481a4d7607STejun Heo 	if (pcpu_atomic_alloc_failed) {
16491a4d7607STejun Heo 		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
16501a4d7607STejun Heo 		/* best effort anyway, don't worry about synchronization */
16511a4d7607STejun Heo 		pcpu_atomic_alloc_failed = false;
16521a4d7607STejun Heo 	} else {
16531a4d7607STejun Heo 		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
16541a4d7607STejun Heo 				  pcpu_nr_empty_pop_pages,
16551a4d7607STejun Heo 				  0, PCPU_EMPTY_POP_PAGES_HIGH);
16561a4d7607STejun Heo 	}
16571a4d7607STejun Heo 
16581a4d7607STejun Heo 	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
16591a4d7607STejun Heo 		int nr_unpop = 0, rs, re;
16601a4d7607STejun Heo 
16611a4d7607STejun Heo 		if (!nr_to_pop)
16621a4d7607STejun Heo 			break;
16631a4d7607STejun Heo 
16641a4d7607STejun Heo 		spin_lock_irq(&pcpu_lock);
16651a4d7607STejun Heo 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
16668ab16c43SDennis Zhou (Facebook) 			nr_unpop = chunk->nr_pages - chunk->nr_populated;
16671a4d7607STejun Heo 			if (nr_unpop)
16681a4d7607STejun Heo 				break;
16691a4d7607STejun Heo 		}
16701a4d7607STejun Heo 		spin_unlock_irq(&pcpu_lock);
16711a4d7607STejun Heo 
16721a4d7607STejun Heo 		if (!nr_unpop)
16731a4d7607STejun Heo 			continue;
16741a4d7607STejun Heo 
16751a4d7607STejun Heo 		/* @chunk can't go away while pcpu_alloc_mutex is held */
167691e914c5SDennis Zhou (Facebook) 		pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
167791e914c5SDennis Zhou (Facebook) 					   chunk->nr_pages) {
16781a4d7607STejun Heo 			int nr = min(re - rs, nr_to_pop);
16791a4d7607STejun Heo 
168047504ee0SDennis Zhou 			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
16811a4d7607STejun Heo 			if (!ret) {
16821a4d7607STejun Heo 				nr_to_pop -= nr;
16831a4d7607STejun Heo 				spin_lock_irq(&pcpu_lock);
168440064aecSDennis Zhou (Facebook) 				pcpu_chunk_populated(chunk, rs, rs + nr, false);
16851a4d7607STejun Heo 				spin_unlock_irq(&pcpu_lock);
16861a4d7607STejun Heo 			} else {
16871a4d7607STejun Heo 				nr_to_pop = 0;
16881a4d7607STejun Heo 			}
16891a4d7607STejun Heo 
16901a4d7607STejun Heo 			if (!nr_to_pop)
16911a4d7607STejun Heo 				break;
16921a4d7607STejun Heo 		}
16931a4d7607STejun Heo 	}
16941a4d7607STejun Heo 
16951a4d7607STejun Heo 	if (nr_to_pop) {
16961a4d7607STejun Heo 		/* ran out of chunks to populate, create a new one and retry */
169747504ee0SDennis Zhou 		chunk = pcpu_create_chunk(gfp);
16981a4d7607STejun Heo 		if (chunk) {
16991a4d7607STejun Heo 			spin_lock_irq(&pcpu_lock);
17001a4d7607STejun Heo 			pcpu_chunk_relocate(chunk, -1);
17011a4d7607STejun Heo 			spin_unlock_irq(&pcpu_lock);
17021a4d7607STejun Heo 			goto retry_pop;
17031a4d7607STejun Heo 		}
17041a4d7607STejun Heo 	}
17051a4d7607STejun Heo 
1706971f3918STejun Heo 	mutex_unlock(&pcpu_alloc_mutex);
1707a56dbddfSTejun Heo }
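/*
 * Example of the repopulation target above (illustrative, assuming the
 * default PCPU_EMPTY_POP_PAGES_HIGH of 4): with one empty populated page
 * remaining, nr_to_pop == clamp(4 - 1, 0, 4) == 3; after an atomic
 * allocation failure the full high watermark of 4 pages is populated
 * instead.
 */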
1708fbf59bc9STejun Heo 
1709fbf59bc9STejun Heo /**
1710fbf59bc9STejun Heo  * free_percpu - free percpu area
1711fbf59bc9STejun Heo  * @ptr: pointer to area to free
1712fbf59bc9STejun Heo  *
1713ccea34b5STejun Heo  * Free percpu area @ptr.
1714ccea34b5STejun Heo  *
1715ccea34b5STejun Heo  * CONTEXT:
1716ccea34b5STejun Heo  * Can be called from atomic context.
1717fbf59bc9STejun Heo  */
171843cf38ebSTejun Heo void free_percpu(void __percpu *ptr)
1719fbf59bc9STejun Heo {
1720129182e5SAndrew Morton 	void *addr;
1721fbf59bc9STejun Heo 	struct pcpu_chunk *chunk;
1722ccea34b5STejun Heo 	unsigned long flags;
172340064aecSDennis Zhou (Facebook) 	int off;
1724fbf59bc9STejun Heo 
1725fbf59bc9STejun Heo 	if (!ptr)
1726fbf59bc9STejun Heo 		return;
1727fbf59bc9STejun Heo 
1728f528f0b8SCatalin Marinas 	kmemleak_free_percpu(ptr);
1729f528f0b8SCatalin Marinas 
1730129182e5SAndrew Morton 	addr = __pcpu_ptr_to_addr(ptr);
1731129182e5SAndrew Morton 
1732ccea34b5STejun Heo 	spin_lock_irqsave(&pcpu_lock, flags);
1733fbf59bc9STejun Heo 
1734fbf59bc9STejun Heo 	chunk = pcpu_chunk_addr_search(addr);
1735bba174f5STejun Heo 	off = addr - chunk->base_addr;
1736fbf59bc9STejun Heo 
173740064aecSDennis Zhou (Facebook) 	pcpu_free_area(chunk, off);
1738fbf59bc9STejun Heo 
1739a56dbddfSTejun Heo 	/* if there is more than one fully free chunk, wake up the grim reaper */
174040064aecSDennis Zhou (Facebook) 	if (chunk->free_bytes == pcpu_unit_size) {
1741fbf59bc9STejun Heo 		struct pcpu_chunk *pos;
1742fbf59bc9STejun Heo 
1743a56dbddfSTejun Heo 		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
1744fbf59bc9STejun Heo 			if (pos != chunk) {
17451a4d7607STejun Heo 				pcpu_schedule_balance_work();
1746fbf59bc9STejun Heo 				break;
1747fbf59bc9STejun Heo 			}
1748fbf59bc9STejun Heo 	}
1749fbf59bc9STejun Heo 
1750df95e795SDennis Zhou 	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
1751df95e795SDennis Zhou 
1752ccea34b5STejun Heo 	spin_unlock_irqrestore(&pcpu_lock, flags);
1753fbf59bc9STejun Heo }
1754fbf59bc9STejun Heo EXPORT_SYMBOL_GPL(free_percpu);
1755fbf59bc9STejun Heo 
1756383776faSThomas Gleixner bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
1757383776faSThomas Gleixner {
1758383776faSThomas Gleixner #ifdef CONFIG_SMP
1759383776faSThomas Gleixner 	const size_t static_size = __per_cpu_end - __per_cpu_start;
1760383776faSThomas Gleixner 	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
1761383776faSThomas Gleixner 	unsigned int cpu;
1762383776faSThomas Gleixner 
1763383776faSThomas Gleixner 	for_each_possible_cpu(cpu) {
1764383776faSThomas Gleixner 		void *start = per_cpu_ptr(base, cpu);
1765383776faSThomas Gleixner 		void *va = (void *)addr;
1766383776faSThomas Gleixner 
1767383776faSThomas Gleixner 		if (va >= start && va < start + static_size) {
17688ce371f9SPeter Zijlstra 			if (can_addr) {
1769383776faSThomas Gleixner 				*can_addr = (unsigned long) (va - start);
17708ce371f9SPeter Zijlstra 				*can_addr += (unsigned long)
17718ce371f9SPeter Zijlstra 					per_cpu_ptr(base, get_boot_cpu_id());
17728ce371f9SPeter Zijlstra 			}
1773383776faSThomas Gleixner 			return true;
1774383776faSThomas Gleixner 		}
1775383776faSThomas Gleixner 	}
1776383776faSThomas Gleixner #endif
1777383776faSThomas Gleixner 	/* on UP, can't distinguish from other static vars, always false */
1778383776faSThomas Gleixner 	return false;
1779383776faSThomas Gleixner }
1780383776faSThomas Gleixner 
17813b034b0dSVivek Goyal /**
178210fad5e4STejun Heo  * is_kernel_percpu_address - test whether address is from static percpu area
178310fad5e4STejun Heo  * @addr: address to test
178410fad5e4STejun Heo  *
178510fad5e4STejun Heo  * Test whether @addr belongs to the in-kernel static percpu area.  Module
178610fad5e4STejun Heo  * static percpu areas are not considered.  For those, use
178710fad5e4STejun Heo  * is_module_percpu_address().
178810fad5e4STejun Heo  *
178910fad5e4STejun Heo  * RETURNS:
179010fad5e4STejun Heo  * %true if @addr is from in-kernel static percpu area, %false otherwise.
179110fad5e4STejun Heo  */
179210fad5e4STejun Heo bool is_kernel_percpu_address(unsigned long addr)
179310fad5e4STejun Heo {
1794383776faSThomas Gleixner 	return __is_kernel_percpu_address(addr, NULL);
179510fad5e4STejun Heo }
179610fad5e4STejun Heo 
179710fad5e4STejun Heo /**
17983b034b0dSVivek Goyal  * per_cpu_ptr_to_phys - convert translated percpu address to physical address
17993b034b0dSVivek Goyal  * @addr: the address to be converted to physical address
18003b034b0dSVivek Goyal  *
18013b034b0dSVivek Goyal  * Given @addr, which is a dereferenceable address obtained via one of the
18023b034b0dSVivek Goyal  * percpu access macros, this function translates it into its physical
18033b034b0dSVivek Goyal  * address.  The caller is responsible for ensuring @addr stays valid
18043b034b0dSVivek Goyal  * until this function finishes.
18053b034b0dSVivek Goyal  *
180667589c71SDave Young  * percpu allocator has special setup for the first chunk, which currently
180767589c71SDave Young  * supports either embedding in linear address space or vmalloc mapping,
180867589c71SDave Young  * supports either embedding in linear address space or vmalloc mapping,
180967589c71SDave Young  * and, from the second chunk onward, the backing allocator (currently
 * either vm or km) provides translation.
181067589c71SDave Young  *
1811bffc4375SYannick Guerrini  * The addr could be translated simply without checking whether it falls into
181267589c71SDave Young  * the first chunk.  But the current code better reflects how the percpu
181367589c71SDave Young  * allocator actually works, and the verification can discover bugs both in
181467589c71SDave Young  * the percpu allocator itself and in per_cpu_ptr_to_phys() callers.  So we
181567589c71SDave Young  * keep the current code.
181667589c71SDave Young  *
18173b034b0dSVivek Goyal  * RETURNS:
18183b034b0dSVivek Goyal  * The physical address for @addr.
18193b034b0dSVivek Goyal  */
18203b034b0dSVivek Goyal phys_addr_t per_cpu_ptr_to_phys(void *addr)
18213b034b0dSVivek Goyal {
18229983b6f0STejun Heo 	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
18239983b6f0STejun Heo 	bool in_first_chunk = false;
1824a855b84cSTejun Heo 	unsigned long first_low, first_high;
18259983b6f0STejun Heo 	unsigned int cpu;
18269983b6f0STejun Heo 
18279983b6f0STejun Heo 	/*
1828a855b84cSTejun Heo 	 * The following test on first_low/high isn't strictly
18299983b6f0STejun Heo 	 * necessary but will speed up lookups of addresses which
18309983b6f0STejun Heo 	 * aren't in the first chunk.
1831c0ebfdc3SDennis Zhou (Facebook) 	 *
1832c0ebfdc3SDennis Zhou (Facebook) 	 * The address check is against full chunk sizes.  pcpu_base_addr
1833c0ebfdc3SDennis Zhou (Facebook) 	 * points to the beginning of the first chunk including the
1834c0ebfdc3SDennis Zhou (Facebook) 	 * static region.  Assumes good intent as the first chunk may
1835c0ebfdc3SDennis Zhou (Facebook) 	 * not be full (i.e. < pcpu_unit_pages in size).
18369983b6f0STejun Heo 	 */
1837c0ebfdc3SDennis Zhou (Facebook) 	first_low = (unsigned long)pcpu_base_addr +
1838c0ebfdc3SDennis Zhou (Facebook) 		    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
1839c0ebfdc3SDennis Zhou (Facebook) 	first_high = (unsigned long)pcpu_base_addr +
1840c0ebfdc3SDennis Zhou (Facebook) 		     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
1841a855b84cSTejun Heo 	if ((unsigned long)addr >= first_low &&
1842a855b84cSTejun Heo 	    (unsigned long)addr < first_high) {
18439983b6f0STejun Heo 		for_each_possible_cpu(cpu) {
18449983b6f0STejun Heo 			void *start = per_cpu_ptr(base, cpu);
18459983b6f0STejun Heo 
18469983b6f0STejun Heo 			if (addr >= start && addr < start + pcpu_unit_size) {
18479983b6f0STejun Heo 				in_first_chunk = true;
18489983b6f0STejun Heo 				break;
18499983b6f0STejun Heo 			}
18509983b6f0STejun Heo 		}
18519983b6f0STejun Heo 	}
18529983b6f0STejun Heo 
18539983b6f0STejun Heo 	if (in_first_chunk) {
1854eac522efSDavid Howells 		if (!is_vmalloc_addr(addr))
18553b034b0dSVivek Goyal 			return __pa(addr);
18563b034b0dSVivek Goyal 		else
18579f57bd4dSEugene Surovegin 			return page_to_phys(vmalloc_to_page(addr)) +
18589f57bd4dSEugene Surovegin 			       offset_in_page(addr);
1859020ec653STejun Heo 	} else
18609f57bd4dSEugene Surovegin 		return page_to_phys(pcpu_addr_to_page(addr)) +
18619f57bd4dSEugene Surovegin 		       offset_in_page(addr);
18623b034b0dSVivek Goyal }
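/*
 * Usage sketch (illustrative): translating a given cpu's copy of a
 * dynamically allocated percpu pointer, e.g. for crash dump tooling:
 *
 *	phys_addr_t pa = per_cpu_ptr_to_phys(per_cpu_ptr(ptr, cpu));
 */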
18633b034b0dSVivek Goyal 
1864fbf59bc9STejun Heo /**
1865fd1e8a1fSTejun Heo  * pcpu_alloc_alloc_info - allocate percpu allocation info
1866fd1e8a1fSTejun Heo  * @nr_groups: the number of groups
1867fd1e8a1fSTejun Heo  * @nr_units: the number of units
1868033e48fbSTejun Heo  *
1869fd1e8a1fSTejun Heo  * Allocate ai which is large enough for @nr_groups groups containing
1870fd1e8a1fSTejun Heo  * @nr_units units.  The returned ai's groups[0].cpu_map points to the
1871fd1e8a1fSTejun Heo  * cpu_map array which is long enough for @nr_units and filled with
1872fd1e8a1fSTejun Heo  * NR_CPUS.  It's the caller's responsibility to initialize the cpu_map
1873fd1e8a1fSTejun Heo  * pointers of the other groups.
1874033e48fbSTejun Heo  *
1875033e48fbSTejun Heo  * RETURNS:
1876fd1e8a1fSTejun Heo  * Pointer to the allocated pcpu_alloc_info on success, NULL on
1877fd1e8a1fSTejun Heo  * failure.
1878033e48fbSTejun Heo  */
1879fd1e8a1fSTejun Heo struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1880fd1e8a1fSTejun Heo 						      int nr_units)
1881fd1e8a1fSTejun Heo {
1882fd1e8a1fSTejun Heo 	struct pcpu_alloc_info *ai;
1883fd1e8a1fSTejun Heo 	size_t base_size, ai_size;
1884fd1e8a1fSTejun Heo 	void *ptr;
1885fd1e8a1fSTejun Heo 	int unit;
1886fd1e8a1fSTejun Heo 
1887fd1e8a1fSTejun Heo 	base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
1888fd1e8a1fSTejun Heo 			  __alignof__(ai->groups[0].cpu_map[0]));
1889fd1e8a1fSTejun Heo 	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1890fd1e8a1fSTejun Heo 
1891438a5061SNicolas Pitre 	ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE);
1892fd1e8a1fSTejun Heo 	if (!ptr)
1893fd1e8a1fSTejun Heo 		return NULL;
1894fd1e8a1fSTejun Heo 	ai = ptr;
1895fd1e8a1fSTejun Heo 	ptr += base_size;
1896fd1e8a1fSTejun Heo 
1897fd1e8a1fSTejun Heo 	ai->groups[0].cpu_map = ptr;
1898fd1e8a1fSTejun Heo 
1899fd1e8a1fSTejun Heo 	for (unit = 0; unit < nr_units; unit++)
1900fd1e8a1fSTejun Heo 		ai->groups[0].cpu_map[unit] = NR_CPUS;
1901fd1e8a1fSTejun Heo 
1902fd1e8a1fSTejun Heo 	ai->nr_groups = nr_groups;
1903fd1e8a1fSTejun Heo 	ai->__ai_size = PFN_ALIGN(ai_size);
1904fd1e8a1fSTejun Heo 
1905fd1e8a1fSTejun Heo 	return ai;
1906fd1e8a1fSTejun Heo }
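
/*
 * Illustrative sketch (not part of the original source): a caller with,
 * say, two groups of two units each is expected to finish the setup that
 * pcpu_alloc_alloc_info() leaves to it by pointing the second group's
 * cpu_map into the shared array.  The counts below are hypothetical.
 *
 *	struct pcpu_alloc_info *ai = pcpu_alloc_alloc_info(2, 4);
 *
 *	if (ai) {
 *		ai->groups[0].nr_units = 2;
 *		ai->groups[1].cpu_map = ai->groups[0].cpu_map + 2;
 *		ai->groups[1].nr_units = 2;
 *	}
 */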
1907fd1e8a1fSTejun Heo 
1908fd1e8a1fSTejun Heo /**
1909fd1e8a1fSTejun Heo  * pcpu_free_alloc_info - free percpu allocation info
1910fd1e8a1fSTejun Heo  * @ai: pcpu_alloc_info to free
1911fd1e8a1fSTejun Heo  *
1912fd1e8a1fSTejun Heo  * Free @ai which was allocated by pcpu_alloc_alloc_info().
1913fd1e8a1fSTejun Heo  */
1914fd1e8a1fSTejun Heo void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1915fd1e8a1fSTejun Heo {
1916999c17e3SSantosh Shilimkar 	memblock_free_early(__pa(ai), ai->__ai_size);
1917fd1e8a1fSTejun Heo }
1918fd1e8a1fSTejun Heo 
1919fd1e8a1fSTejun Heo /**
1920fd1e8a1fSTejun Heo  * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1921fd1e8a1fSTejun Heo  * @lvl: loglevel
1922fd1e8a1fSTejun Heo  * @ai: allocation info to dump
1923fd1e8a1fSTejun Heo  *
1924fd1e8a1fSTejun Heo  * Print out information about @ai using loglevel @lvl.
1925fd1e8a1fSTejun Heo  */
1926fd1e8a1fSTejun Heo static void pcpu_dump_alloc_info(const char *lvl,
1927fd1e8a1fSTejun Heo 				 const struct pcpu_alloc_info *ai)
1928033e48fbSTejun Heo {
1929fd1e8a1fSTejun Heo 	int group_width = 1, cpu_width = 1, width;
1930033e48fbSTejun Heo 	char empty_str[] = "--------";
1931fd1e8a1fSTejun Heo 	int alloc = 0, alloc_end = 0;
1932fd1e8a1fSTejun Heo 	int group, v;
1933fd1e8a1fSTejun Heo 	int upa, apl;	/* units per alloc, allocs per line */
1934033e48fbSTejun Heo 
1935fd1e8a1fSTejun Heo 	v = ai->nr_groups;
1936033e48fbSTejun Heo 	while (v /= 10)
1937fd1e8a1fSTejun Heo 		group_width++;
1938033e48fbSTejun Heo 
1939fd1e8a1fSTejun Heo 	v = num_possible_cpus();
1940fd1e8a1fSTejun Heo 	while (v /= 10)
1941fd1e8a1fSTejun Heo 		cpu_width++;
1942fd1e8a1fSTejun Heo 	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1943033e48fbSTejun Heo 
1944fd1e8a1fSTejun Heo 	upa = ai->alloc_size / ai->unit_size;
1945fd1e8a1fSTejun Heo 	width = upa * (cpu_width + 1) + group_width + 3;
1946fd1e8a1fSTejun Heo 	apl = rounddown_pow_of_two(max(60 / width, 1));
1947033e48fbSTejun Heo 
1948fd1e8a1fSTejun Heo 	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1949fd1e8a1fSTejun Heo 	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1950fd1e8a1fSTejun Heo 	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1951fd1e8a1fSTejun Heo 
1952fd1e8a1fSTejun Heo 	for (group = 0; group < ai->nr_groups; group++) {
1953fd1e8a1fSTejun Heo 		const struct pcpu_group_info *gi = &ai->groups[group];
1954fd1e8a1fSTejun Heo 		int unit = 0, unit_end = 0;
1955fd1e8a1fSTejun Heo 
1956fd1e8a1fSTejun Heo 		BUG_ON(gi->nr_units % upa);
1957fd1e8a1fSTejun Heo 		for (alloc_end += gi->nr_units / upa;
1958fd1e8a1fSTejun Heo 		     alloc < alloc_end; alloc++) {
1959fd1e8a1fSTejun Heo 			if (!(alloc % apl)) {
19601170532bSJoe Perches 				pr_cont("\n");
1961fd1e8a1fSTejun Heo 				printk("%spcpu-alloc: ", lvl);
1962033e48fbSTejun Heo 			}
19631170532bSJoe Perches 			pr_cont("[%0*d] ", group_width, group);
1964fd1e8a1fSTejun Heo 
1965fd1e8a1fSTejun Heo 			for (unit_end += upa; unit < unit_end; unit++)
1966fd1e8a1fSTejun Heo 				if (gi->cpu_map[unit] != NR_CPUS)
19671170532bSJoe Perches 					pr_cont("%0*d ",
19681170532bSJoe Perches 						cpu_width, gi->cpu_map[unit]);
1969033e48fbSTejun Heo 				else
19701170532bSJoe Perches 					pr_cont("%s ", empty_str);
1971033e48fbSTejun Heo 		}
1972fd1e8a1fSTejun Heo 	}
19731170532bSJoe Perches 	pr_cont("\n");
1974033e48fbSTejun Heo }
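
/*
 * For reference, a dump produced by the format strings above might look
 * like the following; all values are purely illustrative:
 *
 *	pcpu-alloc: s112640 r8192 d28672 u262144 alloc=1*2097152
 *	pcpu-alloc: [0] 0 1 2 3 4 5 6 7
 */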
1975033e48fbSTejun Heo 
1976fbf59bc9STejun Heo /**
19778d408b4bSTejun Heo  * pcpu_setup_first_chunk - initialize the first percpu chunk
1978fd1e8a1fSTejun Heo  * @ai: pcpu_alloc_info describing how the percpu area is shaped
197938a6be52STejun Heo  * @base_addr: mapped address
1980fbf59bc9STejun Heo  *
19818d408b4bSTejun Heo  * Initialize the first percpu chunk which contains the kernel static
19828d408b4bSTejun Heo  * percpu area.  This function is to be called from the arch percpu
198338a6be52STejun Heo  * area setup path.
19848d408b4bSTejun Heo  *
1985fd1e8a1fSTejun Heo  * @ai contains all information necessary to initialize the first
1986fd1e8a1fSTejun Heo  * chunk and prime the dynamic percpu allocator.
19878d408b4bSTejun Heo  *
1988fd1e8a1fSTejun Heo  * @ai->static_size is the size of static percpu area.
1989fd1e8a1fSTejun Heo  *
1990fd1e8a1fSTejun Heo  * @ai->reserved_size, if non-zero, specifies the number of bytes to
1991edcb4639STejun Heo  * reserve after the static area in the first chunk.  This reserves
1992edcb4639STejun Heo  * the first chunk such that it's available only through reserved
1993edcb4639STejun Heo  * percpu allocation.  This is primarily used to serve module percpu
1994edcb4639STejun Heo  * static areas on architectures where the addressing model has
1995edcb4639STejun Heo  * limited offset range for symbol relocations to guarantee module
1996edcb4639STejun Heo  * percpu symbols fall inside the relocatable range.
1997edcb4639STejun Heo  *
1998fd1e8a1fSTejun Heo  * @ai->dyn_size determines the number of bytes available for dynamic
1999fd1e8a1fSTejun Heo  * allocation in the first chunk.  The area between @ai->static_size +
2000fd1e8a1fSTejun Heo  * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
20016074d5b0STejun Heo  *
2002fd1e8a1fSTejun Heo  * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
2003fd1e8a1fSTejun Heo  * and equal to or larger than @ai->static_size + @ai->reserved_size +
2004fd1e8a1fSTejun Heo  * @ai->dyn_size.
20058d408b4bSTejun Heo  *
2006fd1e8a1fSTejun Heo  * @ai->atom_size is the allocation atom size and used as alignment
2007fd1e8a1fSTejun Heo  * for vm areas.
20088d408b4bSTejun Heo  *
2009fd1e8a1fSTejun Heo  * @ai->alloc_size is the allocation size and is always a multiple of
2010fd1e8a1fSTejun Heo  * @ai->atom_size.  This is larger than @ai->atom_size if
2011fd1e8a1fSTejun Heo  * @ai->unit_size is larger than @ai->atom_size.
2012fd1e8a1fSTejun Heo  *
2013fd1e8a1fSTejun Heo  * @ai->nr_groups and @ai->groups describe virtual memory layout of
2014fd1e8a1fSTejun Heo  * percpu areas.  Units which should be colocated are put into the
2015fd1e8a1fSTejun Heo  * same group.  Dynamic VM areas will be allocated according to these
2016fd1e8a1fSTejun Heo  * groupings.  If @ai->nr_groups is zero, a single group containing
2017fd1e8a1fSTejun Heo  * all units is assumed.
20188d408b4bSTejun Heo  *
201938a6be52STejun Heo  * The caller should have mapped the first chunk at @base_addr and
202038a6be52STejun Heo  * copied static data to each unit.
2021fbf59bc9STejun Heo  *
2022c0ebfdc3SDennis Zhou (Facebook)  * The first chunk will always contain a static and a dynamic region.
2023c0ebfdc3SDennis Zhou (Facebook)  * However, the static region is not managed by any chunk.  If the first
2024c0ebfdc3SDennis Zhou (Facebook)  * chunk also contains a reserved region, it is served by two chunks -
2025c0ebfdc3SDennis Zhou (Facebook)  * one for the reserved region and one for the dynamic region.  They
2026c0ebfdc3SDennis Zhou (Facebook)  * share the same vm, but use offset regions in the area allocation map.
2027c0ebfdc3SDennis Zhou (Facebook)  * The chunk serving the dynamic region is circulated in the chunk slots
2028c0ebfdc3SDennis Zhou (Facebook)  * and available for dynamic allocation like any other chunk.
2029edcb4639STejun Heo  *
2030fbf59bc9STejun Heo  * RETURNS:
2031fb435d52STejun Heo  * 0 on success, -errno on failure.
2032fbf59bc9STejun Heo  */
2033fb435d52STejun Heo int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
2034fd1e8a1fSTejun Heo 				  void *base_addr)
2035fbf59bc9STejun Heo {
2036b9c39442SDennis Zhou (Facebook) 	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
2037d2f3c384SDennis Zhou (Facebook) 	size_t static_size, dyn_size;
20380c4169c3SDennis Zhou (Facebook) 	struct pcpu_chunk *chunk;
20396563297cSTejun Heo 	unsigned long *group_offsets;
20406563297cSTejun Heo 	size_t *group_sizes;
2041fb435d52STejun Heo 	unsigned long *unit_off;
2042fbf59bc9STejun Heo 	unsigned int cpu;
2043fd1e8a1fSTejun Heo 	int *unit_map;
2044fd1e8a1fSTejun Heo 	int group, unit, i;
2045c0ebfdc3SDennis Zhou (Facebook) 	int map_size;
2046c0ebfdc3SDennis Zhou (Facebook) 	unsigned long tmp_addr;
2047fbf59bc9STejun Heo 
2048635b75fcSTejun Heo #define PCPU_SETUP_BUG_ON(cond)	do {					\
2049635b75fcSTejun Heo 	if (unlikely(cond)) {						\
2050870d4b12SJoe Perches 		pr_emerg("failed to initialize, %s\n", #cond);		\
2051870d4b12SJoe Perches 		pr_emerg("cpu_possible_mask=%*pb\n",			\
2052807de073STejun Heo 			 cpumask_pr_args(cpu_possible_mask));		\
2053635b75fcSTejun Heo 		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
2054635b75fcSTejun Heo 		BUG();							\
2055635b75fcSTejun Heo 	}								\
2056635b75fcSTejun Heo } while (0)
2057635b75fcSTejun Heo 
20582f39e637STejun Heo 	/* sanity checks */
2059635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
2060bbddff05STejun Heo #ifdef CONFIG_SMP
2061635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(!ai->static_size);
2062f09f1243SAlexander Kuleshov 	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
2063bbddff05STejun Heo #endif
2064635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(!base_addr);
2065f09f1243SAlexander Kuleshov 	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
2066635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
2067f09f1243SAlexander Kuleshov 	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
2068635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
2069ca460b3cSDennis Zhou (Facebook) 	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
2070099a19d9STejun Heo 	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
2071fb29a2ccSDennis Zhou (Facebook) 	PCPU_SETUP_BUG_ON(!ai->dyn_size);
2072d2f3c384SDennis Zhou (Facebook) 	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
2073ca460b3cSDennis Zhou (Facebook) 	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
2074ca460b3cSDennis Zhou (Facebook) 			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
20759f645532STejun Heo 	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
20768d408b4bSTejun Heo 
20776563297cSTejun Heo 	/* process group information and build config tables accordingly */
2078999c17e3SSantosh Shilimkar 	group_offsets = memblock_virt_alloc(ai->nr_groups *
2079999c17e3SSantosh Shilimkar 					     sizeof(group_offsets[0]), 0);
2080999c17e3SSantosh Shilimkar 	group_sizes = memblock_virt_alloc(ai->nr_groups *
2081999c17e3SSantosh Shilimkar 					   sizeof(group_sizes[0]), 0);
2082999c17e3SSantosh Shilimkar 	unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
2083999c17e3SSantosh Shilimkar 	unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
20842f39e637STejun Heo 
2085fd1e8a1fSTejun Heo 	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
2086ffe0d5a5STejun Heo 		unit_map[cpu] = UINT_MAX;
2087a855b84cSTejun Heo 
2088a855b84cSTejun Heo 	pcpu_low_unit_cpu = NR_CPUS;
2089a855b84cSTejun Heo 	pcpu_high_unit_cpu = NR_CPUS;
20902f39e637STejun Heo 
2091fd1e8a1fSTejun Heo 	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
2092fd1e8a1fSTejun Heo 		const struct pcpu_group_info *gi = &ai->groups[group];
20932f39e637STejun Heo 
20946563297cSTejun Heo 		group_offsets[group] = gi->base_offset;
20956563297cSTejun Heo 		group_sizes[group] = gi->nr_units * ai->unit_size;
20966563297cSTejun Heo 
2097fd1e8a1fSTejun Heo 		for (i = 0; i < gi->nr_units; i++) {
2098fd1e8a1fSTejun Heo 			cpu = gi->cpu_map[i];
2099fd1e8a1fSTejun Heo 			if (cpu == NR_CPUS)
2100fd1e8a1fSTejun Heo 				continue;
2101fd1e8a1fSTejun Heo 
21029f295664SDan Carpenter 			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
2103635b75fcSTejun Heo 			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
2104635b75fcSTejun Heo 			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
2105fd1e8a1fSTejun Heo 
2106fd1e8a1fSTejun Heo 			unit_map[cpu] = unit + i;
2107fb435d52STejun Heo 			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
2108fb435d52STejun Heo 
2109a855b84cSTejun Heo 			/* determine low/high unit_cpu */
2110a855b84cSTejun Heo 			if (pcpu_low_unit_cpu == NR_CPUS ||
2111a855b84cSTejun Heo 			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
2112a855b84cSTejun Heo 				pcpu_low_unit_cpu = cpu;
2113a855b84cSTejun Heo 			if (pcpu_high_unit_cpu == NR_CPUS ||
2114a855b84cSTejun Heo 			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
2115a855b84cSTejun Heo 				pcpu_high_unit_cpu = cpu;
21160fc0531eSLinus Torvalds 		}
21170fc0531eSLinus Torvalds 	}
2118fd1e8a1fSTejun Heo 	pcpu_nr_units = unit;
21192f39e637STejun Heo 
21202f39e637STejun Heo 	for_each_possible_cpu(cpu)
2121635b75fcSTejun Heo 		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
2122635b75fcSTejun Heo 
2123635b75fcSTejun Heo 	/* we're done parsing the input, undefine BUG macro and dump config */
2124635b75fcSTejun Heo #undef PCPU_SETUP_BUG_ON
2125bcbea798STejun Heo 	pcpu_dump_alloc_info(KERN_DEBUG, ai);
21262f39e637STejun Heo 
21276563297cSTejun Heo 	pcpu_nr_groups = ai->nr_groups;
21286563297cSTejun Heo 	pcpu_group_offsets = group_offsets;
21296563297cSTejun Heo 	pcpu_group_sizes = group_sizes;
2130fd1e8a1fSTejun Heo 	pcpu_unit_map = unit_map;
2131fb435d52STejun Heo 	pcpu_unit_offsets = unit_off;
21322f39e637STejun Heo 
21332f39e637STejun Heo 	/* determine basic parameters */
2134fd1e8a1fSTejun Heo 	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
2135d9b55eebSTejun Heo 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
21366563297cSTejun Heo 	pcpu_atom_size = ai->atom_size;
2137ce3141a2STejun Heo 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
2138ce3141a2STejun Heo 		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
2139cafe8816STejun Heo 
214030a5b536SDennis Zhou 	pcpu_stats_save_ai(ai);
214130a5b536SDennis Zhou 
2142d9b55eebSTejun Heo 	/*
2143d9b55eebSTejun Heo 	 * Allocate chunk slots.  The additional last slot is for
2144d9b55eebSTejun Heo 	 * empty chunks.
2145d9b55eebSTejun Heo 	 */
2146d9b55eebSTejun Heo 	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
2147999c17e3SSantosh Shilimkar 	pcpu_slot = memblock_virt_alloc(
2148999c17e3SSantosh Shilimkar 			pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
2149fbf59bc9STejun Heo 	for (i = 0; i < pcpu_nr_slots; i++)
2150fbf59bc9STejun Heo 		INIT_LIST_HEAD(&pcpu_slot[i]);
2151fbf59bc9STejun Heo 
2152edcb4639STejun Heo 	/*
2153d2f3c384SDennis Zhou (Facebook) 	 * The end of the static region needs to be aligned with the
2154d2f3c384SDennis Zhou (Facebook) 	 * minimum allocation size as this offsets the reserved and
2155d2f3c384SDennis Zhou (Facebook) 	 * dynamic region.  The first chunk ends page aligned by
2156d2f3c384SDennis Zhou (Facebook) 	 * expanding the dynamic region, therefore the dynamic region
2157d2f3c384SDennis Zhou (Facebook) 	 * can be shrunk to compensate while still staying above the
2158d2f3c384SDennis Zhou (Facebook) 	 * configured sizes.
2159d2f3c384SDennis Zhou (Facebook) 	 */
2160d2f3c384SDennis Zhou (Facebook) 	static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
2161d2f3c384SDennis Zhou (Facebook) 	dyn_size = ai->dyn_size - (static_size - ai->static_size);
2162d2f3c384SDennis Zhou (Facebook) 
2163d2f3c384SDennis Zhou (Facebook) 	/*
2164c0ebfdc3SDennis Zhou (Facebook) 	 * Initialize first chunk.
2165c0ebfdc3SDennis Zhou (Facebook) 	 * If the reserved_size is non-zero, this initializes the reserved
2166c0ebfdc3SDennis Zhou (Facebook) 	 * chunk.  If the reserved_size is zero, the reserved chunk is NULL
2167c0ebfdc3SDennis Zhou (Facebook) 	 * and the dynamic region is initialized here.  The first chunk,
2168c0ebfdc3SDennis Zhou (Facebook) 	 * pcpu_first_chunk, will always point to the chunk that serves
2169c0ebfdc3SDennis Zhou (Facebook) 	 * the dynamic region.
2170edcb4639STejun Heo 	 */
2171d2f3c384SDennis Zhou (Facebook) 	tmp_addr = (unsigned long)base_addr + static_size;
2172d2f3c384SDennis Zhou (Facebook) 	map_size = ai->reserved_size ?: dyn_size;
217340064aecSDennis Zhou (Facebook) 	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
217461ace7faSTejun Heo 
2175edcb4639STejun Heo 	/* init dynamic chunk if necessary */
2176b9c39442SDennis Zhou (Facebook) 	if (ai->reserved_size) {
21770c4169c3SDennis Zhou (Facebook) 		pcpu_reserved_chunk = chunk;
2178b9c39442SDennis Zhou (Facebook) 
2179d2f3c384SDennis Zhou (Facebook) 		tmp_addr = (unsigned long)base_addr + static_size +
2180c0ebfdc3SDennis Zhou (Facebook) 			   ai->reserved_size;
2181d2f3c384SDennis Zhou (Facebook) 		map_size = dyn_size;
218240064aecSDennis Zhou (Facebook) 		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
2183edcb4639STejun Heo 	}
2184edcb4639STejun Heo 
21852441d15cSTejun Heo 	/* link the first chunk in */
21860c4169c3SDennis Zhou (Facebook) 	pcpu_first_chunk = chunk;
21870cecf50cSDennis Zhou (Facebook) 	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
2188ae9e6bc9STejun Heo 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
2189fbf59bc9STejun Heo 
21907e8a6304SDennis Zhou (Facebook) 	/* include all regions of the first chunk */
21917e8a6304SDennis Zhou (Facebook) 	pcpu_nr_populated += PFN_DOWN(size_sum);
21927e8a6304SDennis Zhou (Facebook) 
219330a5b536SDennis Zhou 	pcpu_stats_chunk_alloc();
2194df95e795SDennis Zhou 	trace_percpu_create_chunk(base_addr);
219530a5b536SDennis Zhou 
2196fbf59bc9STejun Heo 	/* we're done */
2197bba174f5STejun Heo 	pcpu_base_addr = base_addr;
2198fb435d52STejun Heo 	return 0;
2199fbf59bc9STejun Heo }
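
/*
 * Worked example with hypothetical sizes: for ai->static_size = 112640,
 * ai->reserved_size = 8192, ai->dyn_size = 28672 and ai->unit_size =
 * 262144, each unit is laid out as
 *
 *	< static 112640 | reserved 8192 | dynamic 28672 | unused >
 *
 * with the reserved chunk managing the region at base_addr + 112640 and
 * the dynamic chunk the region at base_addr + 112640 + 8192 (modulo the
 * PCPU_MIN_ALLOC_SIZE alignment of the static region applied above).
 */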
220066c3a757STejun Heo 
2201bbddff05STejun Heo #ifdef CONFIG_SMP
2202bbddff05STejun Heo 
220317f3609cSAndi Kleen const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
2204f58dc01bSTejun Heo 	[PCPU_FC_AUTO]	= "auto",
2205f58dc01bSTejun Heo 	[PCPU_FC_EMBED]	= "embed",
2206f58dc01bSTejun Heo 	[PCPU_FC_PAGE]	= "page",
2207f58dc01bSTejun Heo };
220866c3a757STejun Heo 
2209f58dc01bSTejun Heo enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
2210f58dc01bSTejun Heo 
2211f58dc01bSTejun Heo static int __init percpu_alloc_setup(char *str)
221266c3a757STejun Heo {
22135479c78aSCyrill Gorcunov 	if (!str)
22145479c78aSCyrill Gorcunov 		return -EINVAL;
22155479c78aSCyrill Gorcunov 
2216f58dc01bSTejun Heo 	if (0)
2217f58dc01bSTejun Heo 		/* nada */;
2218f58dc01bSTejun Heo #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
2219f58dc01bSTejun Heo 	else if (!strcmp(str, "embed"))
2220f58dc01bSTejun Heo 		pcpu_chosen_fc = PCPU_FC_EMBED;
2221f58dc01bSTejun Heo #endif
2222f58dc01bSTejun Heo #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
2223f58dc01bSTejun Heo 	else if (!strcmp(str, "page"))
2224f58dc01bSTejun Heo 		pcpu_chosen_fc = PCPU_FC_PAGE;
2225f58dc01bSTejun Heo #endif
2226f58dc01bSTejun Heo 	else
2227870d4b12SJoe Perches 		pr_warn("unknown allocator %s specified\n", str);
222866c3a757STejun Heo 
2229f58dc01bSTejun Heo 	return 0;
223066c3a757STejun Heo }
2231f58dc01bSTejun Heo early_param("percpu_alloc", percpu_alloc_setup);
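
/*
 * For example, booting with "percpu_alloc=page" selects the page
 * allocator on kernels built with CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK,
 * and "percpu_alloc=embed" selects the embed allocator where available;
 * any other value hits the warning above and leaves the default,
 * PCPU_FC_AUTO, in place.
 */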
223266c3a757STejun Heo 
22333c9a024fSTejun Heo /*
22343c9a024fSTejun Heo  * pcpu_embed_first_chunk() is used by the generic percpu setup.
22353c9a024fSTejun Heo  * Build it if the arch config requires it or if the generic setup is
22363c9a024fSTejun Heo  * going to be used.
22373c9a024fSTejun Heo  */
223808fc4580STejun Heo #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
223908fc4580STejun Heo 	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
22403c9a024fSTejun Heo #define BUILD_EMBED_FIRST_CHUNK
22413c9a024fSTejun Heo #endif
22423c9a024fSTejun Heo 
22433c9a024fSTejun Heo /* build pcpu_page_first_chunk() iff needed by the arch config */
22443c9a024fSTejun Heo #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
22453c9a024fSTejun Heo #define BUILD_PAGE_FIRST_CHUNK
22463c9a024fSTejun Heo #endif
22473c9a024fSTejun Heo 
22483c9a024fSTejun Heo /* pcpu_build_alloc_info() is used by both embed and page first chunk */
22493c9a024fSTejun Heo #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
22503c9a024fSTejun Heo /**
2251fbf59bc9STejun Heo  * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
2252fbf59bc9STejun Heo  * @reserved_size: the size of reserved percpu area in bytes
2253fbf59bc9STejun Heo  * @dyn_size: minimum free size for dynamic allocation in bytes
2254fbf59bc9STejun Heo  * @atom_size: allocation atom size
2255fbf59bc9STejun Heo  * @cpu_distance_fn: callback to determine distance between cpus, optional
2256fbf59bc9STejun Heo  *
2257fbf59bc9STejun Heo  * This function determines grouping of units, their mappings to cpus
2258fbf59bc9STejun Heo  * and other parameters considering needed percpu size, allocation
2259fbf59bc9STejun Heo  * atom size and distances between CPUs.
2260fbf59bc9STejun Heo  *
2261bffc4375SYannick Guerrini  * Group sizes are always multiples of atom size, and CPUs within
2262fbf59bc9STejun Heo  * LOCAL_DISTANCE of each other both ways are grouped together and
2263fbf59bc9STejun Heo  * share space for units in the same group.  The returned configuration
2264fbf59bc9STejun Heo  * is guaranteed to place CPUs on different nodes in different groups
2265fbf59bc9STejun Heo  * and to use >=75% of the allocated virtual address space.
2266fbf59bc9STejun Heo  *
2267fbf59bc9STejun Heo  * RETURNS:
2268fbf59bc9STejun Heo  * On success, pointer to the new allocation_info is returned.  On
2269fbf59bc9STejun Heo  * failure, ERR_PTR value is returned.
2270fbf59bc9STejun Heo  */
2271fbf59bc9STejun Heo static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
2272fbf59bc9STejun Heo 				size_t reserved_size, size_t dyn_size,
2273fbf59bc9STejun Heo 				size_t atom_size,
2274fbf59bc9STejun Heo 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
2275fbf59bc9STejun Heo {
2276fbf59bc9STejun Heo 	static int group_map[NR_CPUS] __initdata;
2277fbf59bc9STejun Heo 	static int group_cnt[NR_CPUS] __initdata;
2278fbf59bc9STejun Heo 	const size_t static_size = __per_cpu_end - __per_cpu_start;
2279fbf59bc9STejun Heo 	int nr_groups = 1, nr_units = 0;
2280fbf59bc9STejun Heo 	size_t size_sum, min_unit_size, alloc_size;
2281fbf59bc9STejun Heo 	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
2282fbf59bc9STejun Heo 	int last_allocs, group, unit;
2283fbf59bc9STejun Heo 	unsigned int cpu, tcpu;
2284fbf59bc9STejun Heo 	struct pcpu_alloc_info *ai;
2285fbf59bc9STejun Heo 	unsigned int *cpu_map;
2286fbf59bc9STejun Heo 
2287fbf59bc9STejun Heo 	/* this function may be called multiple times */
2288fbf59bc9STejun Heo 	memset(group_map, 0, sizeof(group_map));
2289fbf59bc9STejun Heo 	memset(group_cnt, 0, sizeof(group_cnt));
2290fbf59bc9STejun Heo 
2291fbf59bc9STejun Heo 	/* calculate size_sum and ensure dyn_size is enough for early alloc */
2292fbf59bc9STejun Heo 	size_sum = PFN_ALIGN(static_size + reserved_size +
2293fbf59bc9STejun Heo 			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
2294fbf59bc9STejun Heo 	dyn_size = size_sum - static_size - reserved_size;
2295fbf59bc9STejun Heo 
2296fbf59bc9STejun Heo 	/*
2297fbf59bc9STejun Heo 	 * Determine min_unit_size, alloc_size and max_upa such that
2298fbf59bc9STejun Heo 	 * alloc_size is multiple of atom_size and is the smallest
229925985edcSLucas De Marchi 	 * which can accommodate 4k aligned segments which are equal to
2300fbf59bc9STejun Heo 	 * or larger than min_unit_size.
2301fbf59bc9STejun Heo 	 */
2302fbf59bc9STejun Heo 	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
2303fbf59bc9STejun Heo 
23049c015162SDennis Zhou (Facebook) 	/* determine the maximum # of units that can fit in an allocation */
2305fbf59bc9STejun Heo 	alloc_size = roundup(min_unit_size, atom_size);
2306fbf59bc9STejun Heo 	upa = alloc_size / min_unit_size;
2307f09f1243SAlexander Kuleshov 	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
2308fbf59bc9STejun Heo 		upa--;
2309fbf59bc9STejun Heo 	max_upa = upa;
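
	/*
	 * Worked example (hypothetical sizes): with min_unit_size = 45056
	 * (44k) and atom_size = 2M, alloc_size = 2M and upa starts at 46.
	 * 46 neither divides alloc_size evenly nor yields a page-aligned
	 * unit, so upa is walked down until both conditions hold, giving
	 * max_upa = 32 and 64k units.
	 */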
2310fbf59bc9STejun Heo 
2311fbf59bc9STejun Heo 	/* group cpus according to their proximity */
2312fbf59bc9STejun Heo 	for_each_possible_cpu(cpu) {
2313fbf59bc9STejun Heo 		group = 0;
2314fbf59bc9STejun Heo 	next_group:
2315fbf59bc9STejun Heo 		for_each_possible_cpu(tcpu) {
2316fbf59bc9STejun Heo 			if (cpu == tcpu)
2317fbf59bc9STejun Heo 				break;
2318fbf59bc9STejun Heo 			if (group_map[tcpu] == group && cpu_distance_fn &&
2319fbf59bc9STejun Heo 			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
2320fbf59bc9STejun Heo 			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
2321fbf59bc9STejun Heo 				group++;
2322fbf59bc9STejun Heo 				nr_groups = max(nr_groups, group + 1);
2323fbf59bc9STejun Heo 				goto next_group;
2324fbf59bc9STejun Heo 			}
2325fbf59bc9STejun Heo 		}
2326fbf59bc9STejun Heo 		group_map[cpu] = group;
2327fbf59bc9STejun Heo 		group_cnt[group]++;
2328fbf59bc9STejun Heo 	}
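
	/*
	 * E.g. on a hypothetical two-node machine with cpus 0-1 on node 0
	 * and cpus 2-3 on node 1, cpu_distance_fn() exceeds LOCAL_DISTANCE
	 * across the nodes, so the loop above yields
	 * group_map = { 0, 0, 1, 1 } and group_cnt = { 2, 2 }.
	 */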
2329fbf59bc9STejun Heo 
2330fbf59bc9STejun Heo 	/*
23319c015162SDennis Zhou (Facebook) 	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
23329c015162SDennis Zhou (Facebook) 	 * Expand the unit_size until we use >= 75% of the units allocated.
23339c015162SDennis Zhou (Facebook) 	 * This is related to atom_size, which could be much larger than the unit_size.
2334fbf59bc9STejun Heo 	 */
2335fbf59bc9STejun Heo 	last_allocs = INT_MAX;
2336fbf59bc9STejun Heo 	for (upa = max_upa; upa; upa--) {
2337fbf59bc9STejun Heo 		int allocs = 0, wasted = 0;
2338fbf59bc9STejun Heo 
2339f09f1243SAlexander Kuleshov 		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
2340fbf59bc9STejun Heo 			continue;
2341fbf59bc9STejun Heo 
2342fbf59bc9STejun Heo 		for (group = 0; group < nr_groups; group++) {
2343fbf59bc9STejun Heo 			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
2344fbf59bc9STejun Heo 			allocs += this_allocs;
2345fbf59bc9STejun Heo 			wasted += this_allocs * upa - group_cnt[group];
2346fbf59bc9STejun Heo 		}
2347fbf59bc9STejun Heo 
2348fbf59bc9STejun Heo 		/*
2349fbf59bc9STejun Heo 		 * Don't accept if wastage is over 1/3.  The
2350fbf59bc9STejun Heo 		 * greater-than comparison ensures upa==1 always
2351fbf59bc9STejun Heo 		 * passes the following check.
2352fbf59bc9STejun Heo 		 */
2353fbf59bc9STejun Heo 		if (wasted > num_possible_cpus() / 3)
2354fbf59bc9STejun Heo 			continue;
2355fbf59bc9STejun Heo 
2356fbf59bc9STejun Heo 		/* and then don't consume more memory */
2357fbf59bc9STejun Heo 		if (allocs > last_allocs)
2358fbf59bc9STejun Heo 			break;
2359fbf59bc9STejun Heo 		last_allocs = allocs;
2360fbf59bc9STejun Heo 		best_upa = upa;
2361fbf59bc9STejun Heo 	}
2362fbf59bc9STejun Heo 	upa = best_upa;
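
	/*
	 * Illustration with made-up numbers: for upa = 4 and group_cnt =
	 * { 3, 1 } (four possible cpus), allocs = 2 and wasted = 4; as
	 * 4 > 4 / 3, that upa is rejected and a smaller upa with less
	 * waste wins out.
	 */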
2363fbf59bc9STejun Heo 
2364fbf59bc9STejun Heo 	/* allocate and fill alloc_info */
2365fbf59bc9STejun Heo 	for (group = 0; group < nr_groups; group++)
2366fbf59bc9STejun Heo 		nr_units += roundup(group_cnt[group], upa);
2367fbf59bc9STejun Heo 
2368fbf59bc9STejun Heo 	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
2369fbf59bc9STejun Heo 	if (!ai)
2370fbf59bc9STejun Heo 		return ERR_PTR(-ENOMEM);
2371fbf59bc9STejun Heo 	cpu_map = ai->groups[0].cpu_map;
2372fbf59bc9STejun Heo 
2373fbf59bc9STejun Heo 	for (group = 0; group < nr_groups; group++) {
2374fbf59bc9STejun Heo 		ai->groups[group].cpu_map = cpu_map;
2375fbf59bc9STejun Heo 		cpu_map += roundup(group_cnt[group], upa);
2376fbf59bc9STejun Heo 	}
2377fbf59bc9STejun Heo 
2378fbf59bc9STejun Heo 	ai->static_size = static_size;
2379fbf59bc9STejun Heo 	ai->reserved_size = reserved_size;
2380fbf59bc9STejun Heo 	ai->dyn_size = dyn_size;
2381fbf59bc9STejun Heo 	ai->unit_size = alloc_size / upa;
2382fbf59bc9STejun Heo 	ai->atom_size = atom_size;
2383fbf59bc9STejun Heo 	ai->alloc_size = alloc_size;
2384fbf59bc9STejun Heo 
2385fbf59bc9STejun Heo 	for (group = 0, unit = 0; group_cnt[group]; group++) {
2386fbf59bc9STejun Heo 		struct pcpu_group_info *gi = &ai->groups[group];
2387fbf59bc9STejun Heo 
2388fbf59bc9STejun Heo 		/*
2389fbf59bc9STejun Heo 		 * Initialize base_offset as if all groups are located
2390fbf59bc9STejun Heo 		 * back-to-back.  The caller should update this to
2391fbf59bc9STejun Heo 		 * reflect actual allocation.
2392fbf59bc9STejun Heo 		 */
2393fbf59bc9STejun Heo 		gi->base_offset = unit * ai->unit_size;
2394fbf59bc9STejun Heo 
2395fbf59bc9STejun Heo 		for_each_possible_cpu(cpu)
2396fbf59bc9STejun Heo 			if (group_map[cpu] == group)
2397fbf59bc9STejun Heo 				gi->cpu_map[gi->nr_units++] = cpu;
2398fbf59bc9STejun Heo 		gi->nr_units = roundup(gi->nr_units, upa);
2399fbf59bc9STejun Heo 		unit += gi->nr_units;
2400fbf59bc9STejun Heo 	}
2401fbf59bc9STejun Heo 	BUG_ON(unit != nr_units);
2402fbf59bc9STejun Heo 
2403fbf59bc9STejun Heo 	return ai;
2404fbf59bc9STejun Heo }
24053c9a024fSTejun Heo #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
2406fbf59bc9STejun Heo 
24073c9a024fSTejun Heo #if defined(BUILD_EMBED_FIRST_CHUNK)
240866c3a757STejun Heo /**
240966c3a757STejun Heo  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
241066c3a757STejun Heo  * @reserved_size: the size of reserved percpu area in bytes
24114ba6ce25STejun Heo  * @dyn_size: minimum free size for dynamic allocation in bytes
2412c8826dd5STejun Heo  * @atom_size: allocation atom size
2413c8826dd5STejun Heo  * @cpu_distance_fn: callback to determine distance between cpus, optional
2414c8826dd5STejun Heo  * @alloc_fn: function to allocate percpu page
241525985edcSLucas De Marchi  * @free_fn: function to free percpu page
241666c3a757STejun Heo  *
241766c3a757STejun Heo  * This is a helper to ease setting up the embedded first percpu chunk and
241866c3a757STejun Heo  * can be called where pcpu_setup_first_chunk() is expected.
241966c3a757STejun Heo  *
242066c3a757STejun Heo  * If this function is used to setup the first chunk, it is allocated
2421c8826dd5STejun Heo  * by calling @alloc_fn and used as-is without being mapped into
2422c8826dd5STejun Heo  * vmalloc area.  Allocations are always whole multiples of @atom_size
2423c8826dd5STejun Heo  * aligned to @atom_size.
2424c8826dd5STejun Heo  *
2425c8826dd5STejun Heo  * This enables the first chunk to piggy back on the linear physical
2426c8826dd5STejun Heo  * mapping which often uses larger page sizes.  Please note that this
2427c8826dd5STejun Heo  * can result in very sparse cpu->unit mapping on NUMA machines thus
2428c8826dd5STejun Heo  * requiring large vmalloc address space.  Don't use this allocator if
2429c8826dd5STejun Heo  * vmalloc space is not orders of magnitude larger than distances
2430c8826dd5STejun Heo  * between node memory addresses (ie. 32bit NUMA machines).
243166c3a757STejun Heo  *
24324ba6ce25STejun Heo  * @dyn_size specifies the minimum dynamic area size.
243366c3a757STejun Heo  *
243466c3a757STejun Heo  * If the needed size is smaller than the minimum or specified unit
2435c8826dd5STejun Heo  * size, the leftover is returned using @free_fn.
243666c3a757STejun Heo  *
243766c3a757STejun Heo  * RETURNS:
2438fb435d52STejun Heo  * 0 on success, -errno on failure.
243966c3a757STejun Heo  */
24404ba6ce25STejun Heo int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
2441c8826dd5STejun Heo 				  size_t atom_size,
2442c8826dd5STejun Heo 				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
2443c8826dd5STejun Heo 				  pcpu_fc_alloc_fn_t alloc_fn,
2444c8826dd5STejun Heo 				  pcpu_fc_free_fn_t free_fn)
244566c3a757STejun Heo {
2446c8826dd5STejun Heo 	void *base = (void *)ULONG_MAX;
2447c8826dd5STejun Heo 	void **areas = NULL;
2448fd1e8a1fSTejun Heo 	struct pcpu_alloc_info *ai;
244993c76b6bSzijun_hu 	size_t size_sum, areas_size;
245093c76b6bSzijun_hu 	unsigned long max_distance;
24519b739662Szijun_hu 	int group, i, highest_group, rc;
245266c3a757STejun Heo 
2453c8826dd5STejun Heo 	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
2454c8826dd5STejun Heo 				   cpu_distance_fn);
2455fd1e8a1fSTejun Heo 	if (IS_ERR(ai))
2456fd1e8a1fSTejun Heo 		return PTR_ERR(ai);
245766c3a757STejun Heo 
2458fd1e8a1fSTejun Heo 	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
2459c8826dd5STejun Heo 	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
246066c3a757STejun Heo 
2461999c17e3SSantosh Shilimkar 	areas = memblock_virt_alloc_nopanic(areas_size, 0);
2462c8826dd5STejun Heo 	if (!areas) {
2463fb435d52STejun Heo 		rc = -ENOMEM;
2464c8826dd5STejun Heo 		goto out_free;
2465fa8a7094STejun Heo 	}
246666c3a757STejun Heo 
24679b739662Szijun_hu 	/* allocate, copy and determine base address & max_distance */
24689b739662Szijun_hu 	highest_group = 0;
2469c8826dd5STejun Heo 	for (group = 0; group < ai->nr_groups; group++) {
2470c8826dd5STejun Heo 		struct pcpu_group_info *gi = &ai->groups[group];
2471c8826dd5STejun Heo 		unsigned int cpu = NR_CPUS;
2472c8826dd5STejun Heo 		void *ptr;
247366c3a757STejun Heo 
2474c8826dd5STejun Heo 		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
2475c8826dd5STejun Heo 			cpu = gi->cpu_map[i];
2476c8826dd5STejun Heo 		BUG_ON(cpu == NR_CPUS);
2477c8826dd5STejun Heo 
2478c8826dd5STejun Heo 		/* allocate space for the whole group */
2479c8826dd5STejun Heo 		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
2480c8826dd5STejun Heo 		if (!ptr) {
2481c8826dd5STejun Heo 			rc = -ENOMEM;
2482c8826dd5STejun Heo 			goto out_free_areas;
2483c8826dd5STejun Heo 		}
2484f528f0b8SCatalin Marinas 		/* kmemleak tracks the percpu allocations separately */
2485f528f0b8SCatalin Marinas 		kmemleak_free(ptr);
2486c8826dd5STejun Heo 		areas[group] = ptr;
2487c8826dd5STejun Heo 
2488c8826dd5STejun Heo 		base = min(ptr, base);
24899b739662Szijun_hu 		if (ptr > areas[highest_group])
24909b739662Szijun_hu 			highest_group = group;
24919b739662Szijun_hu 	}
24929b739662Szijun_hu 	max_distance = areas[highest_group] - base;
24939b739662Szijun_hu 	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
24949b739662Szijun_hu 
24959b739662Szijun_hu 	/* warn if maximum distance is further than 75% of vmalloc space */
24969b739662Szijun_hu 	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
24979b739662Szijun_hu 		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
24989b739662Szijun_hu 				max_distance, VMALLOC_TOTAL);
24999b739662Szijun_hu #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
25009b739662Szijun_hu 		/* and fail if we have fallback */
25019b739662Szijun_hu 		rc = -EINVAL;
25029b739662Szijun_hu 		goto out_free_areas;
25039b739662Szijun_hu #endif
250442b64281STejun Heo 	}
250542b64281STejun Heo 
250642b64281STejun Heo 	/*
250742b64281STejun Heo 	 * Copy data and free unused parts.  This should happen after all
250842b64281STejun Heo 	 * allocations are complete; otherwise, we may end up with
250942b64281STejun Heo 	 * overlapping groups.
251042b64281STejun Heo 	 */
251142b64281STejun Heo 	for (group = 0; group < ai->nr_groups; group++) {
251242b64281STejun Heo 		struct pcpu_group_info *gi = &ai->groups[group];
251342b64281STejun Heo 		void *ptr = areas[group];
2514c8826dd5STejun Heo 
2515c8826dd5STejun Heo 		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
2516c8826dd5STejun Heo 			if (gi->cpu_map[i] == NR_CPUS) {
2517c8826dd5STejun Heo 				/* unused unit, free whole */
2518c8826dd5STejun Heo 				free_fn(ptr, ai->unit_size);
2519c8826dd5STejun Heo 				continue;
2520c8826dd5STejun Heo 			}
2521c8826dd5STejun Heo 			/* copy and return the unused part */
2522fd1e8a1fSTejun Heo 			memcpy(ptr, __per_cpu_load, ai->static_size);
2523c8826dd5STejun Heo 			free_fn(ptr + size_sum, ai->unit_size - size_sum);
2524c8826dd5STejun Heo 		}
252566c3a757STejun Heo 	}
252666c3a757STejun Heo 
2527c8826dd5STejun Heo 	/* base address is now known, determine group base offsets */
25286ea529a2STejun Heo 	for (group = 0; group < ai->nr_groups; group++)
2529c8826dd5STejun Heo 		ai->groups[group].base_offset = areas[group] - base;
2531c8826dd5STejun Heo 
2532870d4b12SJoe Perches 	pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
2533fd1e8a1fSTejun Heo 		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
2534fd1e8a1fSTejun Heo 		ai->dyn_size, ai->unit_size);
253566c3a757STejun Heo 
2536fb435d52STejun Heo 	rc = pcpu_setup_first_chunk(ai, base);
2537c8826dd5STejun Heo 	goto out_free;
2538c8826dd5STejun Heo 
2539c8826dd5STejun Heo out_free_areas:
2540c8826dd5STejun Heo 	for (group = 0; group < ai->nr_groups; group++)
2541f851c8d8SMichael Holzheu 		if (areas[group])
2542c8826dd5STejun Heo 			free_fn(areas[group],
2543c8826dd5STejun Heo 				ai->groups[group].nr_units * ai->unit_size);
2544c8826dd5STejun Heo out_free:
2545fd1e8a1fSTejun Heo 	pcpu_free_alloc_info(ai);
2546c8826dd5STejun Heo 	if (areas)
2547999c17e3SSantosh Shilimkar 		memblock_free_early(__pa(areas), areas_size);
2548fb435d52STejun Heo 	return rc;
2549d4b95f80STejun Heo }
25503c9a024fSTejun Heo #endif /* BUILD_EMBED_FIRST_CHUNK */
2551d4b95f80STejun Heo 
25523c9a024fSTejun Heo #ifdef BUILD_PAGE_FIRST_CHUNK
2553d4b95f80STejun Heo /**
255400ae4064STejun Heo  * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
2555d4b95f80STejun Heo  * @reserved_size: the size of reserved percpu area in bytes
2556d4b95f80STejun Heo  * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
255725985edcSLucas De Marchi  * @free_fn: function to free percpu page, always called with PAGE_SIZE
2558d4b95f80STejun Heo  * @populate_pte_fn: function to populate pte
2559d4b95f80STejun Heo  *
256000ae4064STejun Heo  * This is a helper to ease setting up a page-remapped first percpu
256100ae4064STejun Heo  * chunk and can be called where pcpu_setup_first_chunk() is expected.
2562d4b95f80STejun Heo  *
2563d4b95f80STejun Heo  * This is the basic allocator.  The static percpu area is allocated
2564d4b95f80STejun Heo  * page-by-page into the vmalloc area.
2565d4b95f80STejun Heo  *
2566d4b95f80STejun Heo  * RETURNS:
2567fb435d52STejun Heo  * 0 on success, -errno on failure.
2568d4b95f80STejun Heo  */
2569fb435d52STejun Heo int __init pcpu_page_first_chunk(size_t reserved_size,
2570d4b95f80STejun Heo 				 pcpu_fc_alloc_fn_t alloc_fn,
2571d4b95f80STejun Heo 				 pcpu_fc_free_fn_t free_fn,
2572d4b95f80STejun Heo 				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
2573d4b95f80STejun Heo {
25748f05a6a6STejun Heo 	static struct vm_struct vm;
2575fd1e8a1fSTejun Heo 	struct pcpu_alloc_info *ai;
257600ae4064STejun Heo 	char psize_str[16];
2577ce3141a2STejun Heo 	int unit_pages;
2578d4b95f80STejun Heo 	size_t pages_size;
2579ce3141a2STejun Heo 	struct page **pages;
2580fb435d52STejun Heo 	int unit, i, j, rc;
25818f606604Szijun_hu 	int upa;
25828f606604Szijun_hu 	int nr_g0_units;
2583d4b95f80STejun Heo 
258400ae4064STejun Heo 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
258500ae4064STejun Heo 
25864ba6ce25STejun Heo 	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
2587fd1e8a1fSTejun Heo 	if (IS_ERR(ai))
2588fd1e8a1fSTejun Heo 		return PTR_ERR(ai);
2589fd1e8a1fSTejun Heo 	BUG_ON(ai->nr_groups != 1);
25908f606604Szijun_hu 	upa = ai->alloc_size / ai->unit_size;
25918f606604Szijun_hu 	nr_g0_units = roundup(num_possible_cpus(), upa);
25928f606604Szijun_hu 	if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
25938f606604Szijun_hu 		pcpu_free_alloc_info(ai);
25948f606604Szijun_hu 		return -EINVAL;
25958f606604Szijun_hu 	}
2596fd1e8a1fSTejun Heo 
2597fd1e8a1fSTejun Heo 	unit_pages = ai->unit_size >> PAGE_SHIFT;
2598d4b95f80STejun Heo 
2599d4b95f80STejun Heo 	/* unaligned allocations can't be freed, round up to page size */
2600fd1e8a1fSTejun Heo 	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
2601fd1e8a1fSTejun Heo 			       sizeof(pages[0]));
2602999c17e3SSantosh Shilimkar 	pages = memblock_virt_alloc(pages_size, 0);
2603d4b95f80STejun Heo 
26048f05a6a6STejun Heo 	/* allocate pages */
2605d4b95f80STejun Heo 	j = 0;
26068f606604Szijun_hu 	for (unit = 0; unit < num_possible_cpus(); unit++) {
2607fd1e8a1fSTejun Heo 		unsigned int cpu = ai->groups[0].cpu_map[unit];
26088f606604Szijun_hu 		for (i = 0; i < unit_pages; i++) {
2609d4b95f80STejun Heo 			void *ptr;
2610d4b95f80STejun Heo 
26113cbc8565STejun Heo 			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
2612d4b95f80STejun Heo 			if (!ptr) {
2613870d4b12SJoe Perches 				pr_warn("failed to allocate %s page for cpu%u\n",
2614598d8091SJoe Perches 						psize_str, cpu);
2615d4b95f80STejun Heo 				goto enomem;
2616d4b95f80STejun Heo 			}
2617f528f0b8SCatalin Marinas 			/* kmemleak tracks the percpu allocations separately */
2618f528f0b8SCatalin Marinas 			kmemleak_free(ptr);
2619ce3141a2STejun Heo 			pages[j++] = virt_to_page(ptr);
2620d4b95f80STejun Heo 		}
26218f606604Szijun_hu 	}
2622d4b95f80STejun Heo 
26238f05a6a6STejun Heo 	/* allocate vm area, map the pages and copy static data */
26248f05a6a6STejun Heo 	vm.flags = VM_ALLOC;
2625fd1e8a1fSTejun Heo 	vm.size = num_possible_cpus() * ai->unit_size;
26268f05a6a6STejun Heo 	vm_area_register_early(&vm, PAGE_SIZE);
26278f05a6a6STejun Heo 
2628fd1e8a1fSTejun Heo 	for (unit = 0; unit < num_possible_cpus(); unit++) {
26291d9d3257STejun Heo 		unsigned long unit_addr =
2630fd1e8a1fSTejun Heo 			(unsigned long)vm.addr + unit * ai->unit_size;
26318f05a6a6STejun Heo 
2632ce3141a2STejun Heo 		for (i = 0; i < unit_pages; i++)
26338f05a6a6STejun Heo 			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
26348f05a6a6STejun Heo 
26358f05a6a6STejun Heo 		/* pte already populated, the following shouldn't fail */
2636fb435d52STejun Heo 		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
2637ce3141a2STejun Heo 				      unit_pages);
2638fb435d52STejun Heo 		if (rc < 0)
2639fb435d52STejun Heo 			panic("failed to map percpu area, err=%d\n", rc);
26408f05a6a6STejun Heo 
26418f05a6a6STejun Heo 		/*
26428f05a6a6STejun Heo 		 * FIXME: Archs with virtual cache should flush local
26438f05a6a6STejun Heo 		 * cache for the linear mapping here - something
26448f05a6a6STejun Heo 		 * equivalent to flush_cache_vmap() on the local cpu.
26458f05a6a6STejun Heo 		 * flush_cache_vmap() can't be used as most supporting
26468f05a6a6STejun Heo 		 * data structures are not set up yet.
26478f05a6a6STejun Heo 		 */
26488f05a6a6STejun Heo 
26498f05a6a6STejun Heo 		/* copy static data */
2650fd1e8a1fSTejun Heo 		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
265166c3a757STejun Heo 	}
265266c3a757STejun Heo 
265366c3a757STejun Heo 	/* we're ready, commit */
2654870d4b12SJoe Perches 	pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
2655fd1e8a1fSTejun Heo 		unit_pages, psize_str, vm.addr, ai->static_size,
2656fd1e8a1fSTejun Heo 		ai->reserved_size, ai->dyn_size);
265766c3a757STejun Heo 
2658fb435d52STejun Heo 	rc = pcpu_setup_first_chunk(ai, vm.addr);
2659d4b95f80STejun Heo 	goto out_free_ar;
2660d4b95f80STejun Heo 
2661d4b95f80STejun Heo enomem:
2662d4b95f80STejun Heo 	while (--j >= 0)
2663ce3141a2STejun Heo 		free_fn(page_address(pages[j]), PAGE_SIZE);
2664fb435d52STejun Heo 	rc = -ENOMEM;
2665d4b95f80STejun Heo out_free_ar:
2666999c17e3SSantosh Shilimkar 	memblock_free_early(__pa(pages), pages_size);
2667fd1e8a1fSTejun Heo 	pcpu_free_alloc_info(ai);
2668fb435d52STejun Heo 	return rc;
266966c3a757STejun Heo }
26703c9a024fSTejun Heo #endif /* BUILD_PAGE_FIRST_CHUNK */
2671d4b95f80STejun Heo 
2672bbddff05STejun Heo #ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
26738c4bfc6eSTejun Heo /*
2674bbddff05STejun Heo  * Generic SMP percpu area setup.
2675e74e3962STejun Heo  *
2676e74e3962STejun Heo  * The embedding helper is used because its behavior closely resembles
2677e74e3962STejun Heo  * the original non-dynamic generic percpu area setup.  This is
2678e74e3962STejun Heo  * important because many archs have addressing restrictions and might
2679e74e3962STejun Heo  * fail if the percpu area is located far away from the previous
2680e74e3962STejun Heo  * location.  As an added bonus, in non-NUMA cases, embedding is
2681e74e3962STejun Heo  * generally a good idea TLB-wise because percpu area can piggy back
2682e74e3962STejun Heo  * on the physical linear memory mapping which uses large page
2683e74e3962STejun Heo  * mappings on applicable archs.
2684e74e3962STejun Heo  */
2685e74e3962STejun Heo unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
2686e74e3962STejun Heo EXPORT_SYMBOL(__per_cpu_offset);
2687e74e3962STejun Heo 
2688c8826dd5STejun Heo static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
2689c8826dd5STejun Heo 				       size_t align)
2690c8826dd5STejun Heo {
2691999c17e3SSantosh Shilimkar 	return memblock_virt_alloc_from_nopanic(
2692999c17e3SSantosh Shilimkar 			size, align, __pa(MAX_DMA_ADDRESS));
2693c8826dd5STejun Heo }
2694c8826dd5STejun Heo 
2695c8826dd5STejun Heo static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
2696c8826dd5STejun Heo {
2697999c17e3SSantosh Shilimkar 	memblock_free_early(__pa(ptr), size);
2698c8826dd5STejun Heo }
2699c8826dd5STejun Heo 
2700e74e3962STejun Heo void __init setup_per_cpu_areas(void)
2701e74e3962STejun Heo {
2702e74e3962STejun Heo 	unsigned long delta;
2703e74e3962STejun Heo 	unsigned int cpu;
2704fb435d52STejun Heo 	int rc;
2705e74e3962STejun Heo 
2706e74e3962STejun Heo 	/*
2707e74e3962STejun Heo 	 * Always reserve area for module percpu variables.  That's
2708e74e3962STejun Heo 	 * what the legacy allocator did.
2709e74e3962STejun Heo 	 */
2710fb435d52STejun Heo 	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2711c8826dd5STejun Heo 				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2712c8826dd5STejun Heo 				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2713fb435d52STejun Heo 	if (rc < 0)
2714bbddff05STejun Heo 		panic("Failed to initialize percpu areas.");
2715e74e3962STejun Heo 
2716e74e3962STejun Heo 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2717e74e3962STejun Heo 	for_each_possible_cpu(cpu)
2718fb435d52STejun Heo 		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
2719e74e3962STejun Heo }
2720e74e3962STejun Heo #endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */
2721099a19d9STejun Heo 
2722bbddff05STejun Heo #else	/* CONFIG_SMP */
2723bbddff05STejun Heo 
2724bbddff05STejun Heo /*
2725bbddff05STejun Heo  * UP percpu area setup.
2726bbddff05STejun Heo  *
2727bbddff05STejun Heo  * UP always uses km-based percpu allocator with identity mapping.
2728bbddff05STejun Heo  * Static percpu variables are indistinguishable from the usual static
2729bbddff05STejun Heo  * variables and don't require any special preparation.
2730bbddff05STejun Heo  */
2731bbddff05STejun Heo void __init setup_per_cpu_areas(void)
2732bbddff05STejun Heo {
2733bbddff05STejun Heo 	const size_t unit_size =
2734bbddff05STejun Heo 		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
2735bbddff05STejun Heo 					 PERCPU_DYNAMIC_RESERVE));
2736bbddff05STejun Heo 	struct pcpu_alloc_info *ai;
2737bbddff05STejun Heo 	void *fc;
2738bbddff05STejun Heo 
2739bbddff05STejun Heo 	ai = pcpu_alloc_alloc_info(1, 1);
2740999c17e3SSantosh Shilimkar 	fc = memblock_virt_alloc_from_nopanic(unit_size,
2741999c17e3SSantosh Shilimkar 					      PAGE_SIZE,
2742999c17e3SSantosh Shilimkar 					      __pa(MAX_DMA_ADDRESS));
2743bbddff05STejun Heo 	if (!ai || !fc)
2744bbddff05STejun Heo 		panic("Failed to allocate memory for percpu areas.");
2745100d13c3SCatalin Marinas 	/* kmemleak tracks the percpu allocations separately */
2746100d13c3SCatalin Marinas 	kmemleak_free(fc);
2747bbddff05STejun Heo 
2748bbddff05STejun Heo 	ai->dyn_size = unit_size;
2749bbddff05STejun Heo 	ai->unit_size = unit_size;
2750bbddff05STejun Heo 	ai->atom_size = unit_size;
2751bbddff05STejun Heo 	ai->alloc_size = unit_size;
2752bbddff05STejun Heo 	ai->groups[0].nr_units = 1;
2753bbddff05STejun Heo 	ai->groups[0].cpu_map[0] = 0;
2754bbddff05STejun Heo 
2755bbddff05STejun Heo 	if (pcpu_setup_first_chunk(ai, fc) < 0)
2756bbddff05STejun Heo 		panic("Failed to initialize percpu areas.");
2757438a5061SNicolas Pitre 	pcpu_free_alloc_info(ai);
2758bbddff05STejun Heo }
2759bbddff05STejun Heo 
2760bbddff05STejun Heo #endif	/* CONFIG_SMP */
2761bbddff05STejun Heo 
2762099a19d9STejun Heo /*
27637e8a6304SDennis Zhou (Facebook)  * pcpu_nr_pages - calculate total number of populated backing pages
27647e8a6304SDennis Zhou (Facebook)  *
27657e8a6304SDennis Zhou (Facebook)  * This reflects the number of pages populated to back chunks.  Metadata is
27667e8a6304SDennis Zhou (Facebook)  * excluded from the number exposed in meminfo as the number of backing pages
27677e8a6304SDennis Zhou (Facebook)  * scales with the number of cpus and can quickly outweigh the memory used for
27687e8a6304SDennis Zhou (Facebook)  * metadata.  It also keeps this calculation nice and simple.
27697e8a6304SDennis Zhou (Facebook)  *
27707e8a6304SDennis Zhou (Facebook)  * RETURNS:
27717e8a6304SDennis Zhou (Facebook)  * Total number of populated backing pages in use by the allocator.
27727e8a6304SDennis Zhou (Facebook)  */
27737e8a6304SDennis Zhou (Facebook) unsigned long pcpu_nr_pages(void)
27747e8a6304SDennis Zhou (Facebook) {
27757e8a6304SDennis Zhou (Facebook) 	return pcpu_nr_populated * pcpu_nr_units;
27767e8a6304SDennis Zhou (Facebook) }
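
/*
 * For instance (numbers purely illustrative): with pcpu_nr_units = 8
 * and pcpu_nr_populated = 12, this reports 96 backing pages, i.e. 384k
 * with 4k pages.
 */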
27777e8a6304SDennis Zhou (Facebook) 
27787e8a6304SDennis Zhou (Facebook) /*
27791a4d7607STejun Heo  * The percpu allocator is initialized early during boot when neither slab nor
27801a4d7607STejun Heo  * workqueue is available.  Plug async management until everything is up
27811a4d7607STejun Heo  * and running.
27821a4d7607STejun Heo  */
27831a4d7607STejun Heo static int __init percpu_enable_async(void)
27841a4d7607STejun Heo {
27851a4d7607STejun Heo 	pcpu_async_enabled = true;
27861a4d7607STejun Heo 	return 0;
27871a4d7607STejun Heo }
27881a4d7607STejun Heo subsys_initcall(percpu_enable_async);
2789