/*
 * linux/mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * This is the percpu allocator which can handle both static and
 * dynamic areas.  Percpu areas are allocated in chunks in the vmalloc
 * area.  Each chunk consists of a boot-time determined number of
 * units and the first chunk is used for static percpu variables in
 * the kernel image (special boot time alloc/init handling necessary
 * as these areas need to be brought up before allocation services are
 * running).  Units grow as necessary and all units grow or shrink in
 * unison.  When a chunk is filled up, another chunk is allocated (in
 * the vmalloc area).
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done in offset-size areas of single unit space.  Ie,
 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
 * c1:u1, c1:u2 and c1:u3.  On UMA, units correspond directly to
 * cpus.  On NUMA, the mapping can be non-linear and even sparse.
 * Percpu access can be done by configuring percpu base registers
 * according to the cpu to unit mapping and pcpu_unit_size.
 *
 * There are usually many small percpu allocations, many of them being
 * as small as 4 bytes.  The allocator organizes chunks into lists
 * according to free size and tries to allocate from the fullest one.
 * Each chunk keeps the maximum contiguous area size hint which is
 * guaranteed to be equal to or larger than the maximum contiguous
 * area in the chunk.  This helps the allocator not to iterate the
 * chunk maps unnecessarily.
 *
 * Allocation state in each chunk is kept using an array of integers
 * on chunk->map.  A positive value in the map represents a free
 * region and a negative one an allocated region.  Allocation inside a
 * chunk is done by scanning this map sequentially and serving the
 * first matching entry.  This is mostly copied from the
 * percpu_modalloc() allocator.  Chunks can be determined from the
 * address using the index field in the page struct.  The index field
 * contains a pointer to the chunk.
 *
 * To use this allocator, arch code should do the following:
 *
 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */

#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>

#define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */

/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr	\
		 + (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr	\
		 - (unsigned long)__per_cpu_start)
#endif

struct pcpu_chunk {
	struct list_head	list;		/* linked to pcpu_slot lists */
	int			free_size;	/* free bytes in the chunk */
	int			contig_hint;	/* max contiguous size hint */
	void			*base_addr;	/* base address of this chunk */
	int			map_used;	/* # of map entries used */
	int			map_alloc;	/* # of map entries allocated */
	int			*map;		/* allocation map */
	struct vm_struct	**vms;		/* mapped vmalloc regions */
	bool			immutable;	/* no [de]population allowed */
	unsigned long		populated[];	/* populated bitmap */
};

static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
static int pcpu_nr_units __read_mostly;
static int pcpu_atom_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;
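/*
 * Illustrative sketch, not part of the allocator: the default macros
 * above apply the same constant offset in opposite directions, so the
 * translation round-trips for any address inside a percpu area:
 *
 *	void *ptr = __addr_to_pcpu_ptr(addr);
 *	BUG_ON(__pcpu_ptr_to_addr(ptr) != addr);
 */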
/* cpus with the lowest and highest unit numbers */
static unsigned int pcpu_first_unit_cpu __read_mostly;
static unsigned int pcpu_last_unit_cpu __read_mostly;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __read_mostly;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __read_mostly;	/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __read_mostly;
static const unsigned long *pcpu_group_offsets __read_mostly;
static const size_t *pcpu_group_sizes __read_mostly;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
static struct pcpu_chunk *pcpu_first_chunk;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  The size of the
 * reserved region is in pcpu_reserved_chunk_limit.  When the reserved
 * area doesn't exist, the following variables contain NULL and 0
 * respectively.
 */
static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;

/*
 * Synchronization rules.
 *
 * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
 * protects allocation/reclaim paths, chunks, populated bitmap and
 * vmalloc mapping.  The latter is a spinlock and protects the index
 * data structures - chunk slots, chunks and area maps in chunks.
 *
 * During allocation, pcpu_alloc_mutex is kept locked all the time and
 * pcpu_lock is grabbed and released as necessary.  All actual memory
 * allocations are done using GFP_KERNEL with pcpu_lock released.
 *
 * Free path accesses and alters only the index data structures, so it
 * can be safely called from atomic context.  When memory needs to be
 * returned to the system, free path schedules reclaim_work which
 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
 * reclaimed, releases both locks and frees the chunks.  Note that
 * it's necessary to grab both locks to remove a chunk from
 * circulation as the allocation path might be referencing the chunk
 * with only pcpu_alloc_mutex locked.
 */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */

static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */

/* reclaim work to release fully free chunks, scheduled from free path */
static void pcpu_reclaim(struct work_struct *work);
static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);

static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);	/* size is in bytes */
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_nr_slots - 1;
	return __pcpu_size_to_slot(size);
}

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
		return 0;

	return pcpu_size_to_slot(chunk->free_size);
}

static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
		(page_idx << PAGE_SHIFT);
}

static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
				    unsigned int cpu, int page_idx)
{
	/* must not be used on pre-mapped chunk */
	WARN_ON(chunk->immutable);

	return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}

/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}

/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}
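/*
 * Worked example, for illustration only: with PCPU_SLOT_BASE_SHIFT == 5,
 * __pcpu_size_to_slot() groups chunks by the log2 of their free size,
 * e.g.
 *
 *	__pcpu_size_to_slot(32)   == max(fls(32)   - 5 + 2, 1) == 3
 *	__pcpu_size_to_slot(4096) == max(fls(4096) - 5 + 2, 1) == 10
 *
 * so chunks with similar amounts of free space end up on the same
 * pcpu_slot list.
 */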
static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
{
	*rs = find_next_zero_bit(chunk->populated, end, *rs);
	*re = find_next_bit(chunk->populated, end, *rs + 1);
}

static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
{
	*rs = find_next_bit(chunk->populated, end, *rs);
	*re = find_next_zero_bit(chunk->populated, end, *rs + 1);
}

/*
 * (Un)populated page region iterators.  Iterate over (un)populated
 * page regions between @start and @end in @chunk.  @rs and @re should
 * be integer variables and will be set to start and end page index of
 * the current region.
 */
#define pcpu_for_each_unpop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
	     (rs) < (re);						    \
	     (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))

#define pcpu_for_each_pop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
	     (rs) < (re);						    \
	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))

/**
 * pcpu_mem_alloc - allocate memory
 * @size: bytes to allocate
 *
 * Allocate @size bytes.  If @size is not larger than PAGE_SIZE,
 * kzalloc() is used; otherwise, vmalloc() is used.  The returned
 * memory is always zeroed.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_alloc(size_t size)
{
	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	else {
		void *ptr = vmalloc(size);
		if (ptr)
			memset(ptr, 0, size);
		return ptr;
	}
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 * @size: size of the area
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_alloc().
 */
static void pcpu_mem_free(void *ptr, size_t size)
{
	if (size <= PAGE_SIZE)
		kfree(ptr);
	else
		vfree(ptr);
}
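/*
 * Usage sketch, for illustration only: walking the populated page
 * ranges of a chunk with the iterators above.
 *
 *	int rs, re;
 *
 *	pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages)
 *		pr_debug("pages [%d,%d) are populated\n", rs, re);
 */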
/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
		if (oslot < nslot)
			list_move(&chunk->list, &pcpu_slot[nslot]);
		else
			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
	}
}

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
	void *first_start = pcpu_first_chunk->base_addr;

	/* is it in the first chunk? */
	if (addr >= first_start && addr < first_start + pcpu_unit_size) {
		/* is it in the reserved area? */
		if (addr < first_start + pcpu_reserved_chunk_limit)
			return pcpu_reserved_chunk;
		return pcpu_first_chunk;
	}

	/*
	 * The address is relative to unit0 which might be unused and
	 * thus unmapped.  Offset the address to the unit space of the
	 * current processor before looking it up in the vmalloc
	 * space.  Note that any possible cpu id can be used here, so
	 * there's no need to worry about preemption or cpu hotplug.
	 */
	addr += pcpu_unit_offsets[raw_smp_processor_id()];
	return pcpu_get_page_chunk(vmalloc_to_page(addr));
}
/**
 * pcpu_extend_area_map - extend area map for allocation
 * @chunk: target chunk
 *
 * Extend area map of @chunk so that it can accommodate an allocation.
 * A single allocation can split an area into three areas, so this
 * function makes sure that @chunk->map has at least two extra slots.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
 * if area map is extended.
 *
 * RETURNS:
 * 0 if noop, 1 if successfully extended, -errno on failure.
 */
static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
{
	int new_alloc;
	int *new;
	size_t size;

	/* has enough? */
	if (chunk->map_alloc >= chunk->map_used + 2)
		return 0;

	spin_unlock_irq(&pcpu_lock);

	new_alloc = PCPU_DFL_MAP_ALLOC;
	while (new_alloc < chunk->map_used + 2)
		new_alloc *= 2;

	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
	if (!new) {
		spin_lock_irq(&pcpu_lock);
		return -ENOMEM;
	}

	/*
	 * Acquire pcpu_lock and switch to new area map.  Only free
	 * could have happened in between, so map_used couldn't have
	 * grown.
	 */
	spin_lock_irq(&pcpu_lock);
	BUG_ON(new_alloc < chunk->map_used + 2);

	size = chunk->map_alloc * sizeof(chunk->map[0]);
	memcpy(new, chunk->map, size);

	/*
	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
	 * one of the first chunks and still using static map.
	 */
	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
		pcpu_mem_free(chunk->map, size);

	chunk->map_alloc = new_alloc;
	chunk->map = new;

	/* pcpu_lock was dropped and reacquired, tell the caller */
	return 1;
}
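/*
 * Example, for illustration only: a chunk with map_used == 40 needs
 * map_alloc >= 42, so new_alloc above doubles 16 -> 32 -> 64 and the
 * map is reallocated with 64 entries.
 */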
/**
 * pcpu_split_block - split a map block
 * @chunk: chunk of interest
 * @i: index of map block to split
 * @head: head size in bytes (can be 0)
 * @tail: tail size in bytes (can be 0)
 *
 * Split the @i'th map block into two or three blocks.  If @head is
 * non-zero, @head bytes block is inserted before block @i moving it
 * to @i+1 and reducing its size by @head bytes.
 *
 * If @tail is non-zero, the target block, which can be @i or @i+1
 * depending on @head, is reduced by @tail bytes and @tail byte block
 * is inserted after the target block.
 *
 * @chunk->map must have enough free slots to accommodate the split.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
			     int head, int tail)
{
	int nr_extra = !!head + !!tail;

	BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);

	/* insert new subblocks */
	memmove(&chunk->map[i + nr_extra], &chunk->map[i],
		sizeof(chunk->map[0]) * (chunk->map_used - i));
	chunk->map_used += nr_extra;

	if (head) {
		chunk->map[i + 1] = chunk->map[i] - head;
		chunk->map[i++] = head;
	}
	if (tail) {
		chunk->map[i++] -= tail;
		chunk->map[i] = tail;
	}
}

/**
 * pcpu_alloc_area - allocate area from a pcpu_chunk
 * @chunk: chunk of interest
 * @size: wanted size in bytes
 * @align: wanted align
 *
 * Try to allocate @size bytes area aligned at @align from @chunk.
 * Note that this function only allocates the offset.  It doesn't
 * populate or map the area.
 *
 * @chunk->map must have at least two free slots.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated offset in @chunk on success, -1 if no matching area is
 * found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
{
	int oslot = pcpu_chunk_slot(chunk);
	int max_contig = 0;
	int i, off;

	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
		bool is_last = i + 1 == chunk->map_used;
		int head, tail;

		/* extra for alignment requirement */
		head = ALIGN(off, align) - off;
		BUG_ON(i == 0 && head != 0);

		if (chunk->map[i] < 0)
			continue;
		if (chunk->map[i] < head + size) {
			max_contig = max(chunk->map[i], max_contig);
			continue;
		}

		/*
		 * If head is small or the previous block is free,
		 * merge'em.  Note that 'small' is defined as smaller
		 * than sizeof(int), which is very small but isn't too
		 * uncommon for percpu allocations.
		 */
		if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
			if (chunk->map[i - 1] > 0)
				chunk->map[i - 1] += head;
			else {
				chunk->map[i - 1] -= head;
				chunk->free_size -= head;
			}
			chunk->map[i] -= head;
			off += head;
			head = 0;
		}

		/* if tail is small, just keep it around */
		tail = chunk->map[i] - head - size;
		if (tail < sizeof(int))
			tail = 0;

		/* split if warranted */
		if (head || tail) {
			pcpu_split_block(chunk, i, head, tail);
			if (head) {
				i++;
				off += head;
				max_contig = max(chunk->map[i - 1], max_contig);
			}
			if (tail)
				max_contig = max(chunk->map[i + 1], max_contig);
		}

		/* update hint and mark allocated */
		if (is_last)
			chunk->contig_hint = max_contig; /* fully scanned */
		else
			chunk->contig_hint = max(chunk->contig_hint,
						 max_contig);

		chunk->free_size -= chunk->map[i];
		chunk->map[i] = -chunk->map[i];

		pcpu_chunk_relocate(chunk, oslot);
		return off;
	}

	chunk->contig_hint = max_contig;	/* fully scanned */
	pcpu_chunk_relocate(chunk, oslot);

	/* tell the upper layer that this chunk has no matching area */
	return -1;
}
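/*
 * Worked example, for illustration only: allocating size == 128,
 * align == 4 from a chunk whose map is { -512, 768 } (512 bytes
 * allocated, then 768 bytes free).  The free block at off == 512 is
 * split with head == 0 and tail == 640, giving { -512, 128, 640 },
 * then entry 1 is negated to mark it allocated:
 *
 *	{ -512, -128, 640 }, free_size -= 128, return 512
 */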
/**
 * pcpu_free_area - free area to a pcpu_chunk
 * @chunk: chunk of interest
 * @freeme: offset of area to free
 *
 * Free the area starting at @freeme in @chunk.  Note that this
 * function only modifies the allocation map.  It doesn't depopulate
 * or unmap the area.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
{
	int oslot = pcpu_chunk_slot(chunk);
	int i, off;

	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
		if (off == freeme)
			break;
	BUG_ON(off != freeme);
	BUG_ON(chunk->map[i] > 0);

	chunk->map[i] = -chunk->map[i];
	chunk->free_size += chunk->map[i];

	/* merge with previous? */
	if (i > 0 && chunk->map[i - 1] >= 0) {
		chunk->map[i - 1] += chunk->map[i];
		chunk->map_used--;
		memmove(&chunk->map[i], &chunk->map[i + 1],
			(chunk->map_used - i) * sizeof(chunk->map[0]));
		i--;
	}
	/* merge with next? */
	if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
		chunk->map[i] += chunk->map[i + 1];
		chunk->map_used--;
		memmove(&chunk->map[i + 1], &chunk->map[i + 2],
			(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
	}

	chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
	pcpu_chunk_relocate(chunk, oslot);
}
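/*
 * Continuing the worked example above (illustrative): freeing the
 * area at off == 512 from map { -512, -128, 640 } flips entry 1 back
 * to free and merges it with the free block on its right:
 *
 *	{ -512, -128, 640 } -> { -512, 128, 640 } -> { -512, 768 }
 */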
/**
 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
 * @chunk: chunk of interest
 * @bitmapp: output parameter for bitmap
 * @may_alloc: may allocate the array
 *
 * Returns pointer to array of pointers to struct page and bitmap,
 * both of which can be indexed with pcpu_page_idx().  The returned
 * array is cleared to zero and *@bitmapp is copied from
 * @chunk->populated.  Note that there is only one array and bitmap
 * and access exclusion is the caller's responsibility.
 *
 * CONTEXT:
 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
 * Otherwise, don't care.
 *
 * RETURNS:
 * Pointer to temp pages array on success, NULL on failure.
 */
static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
					       unsigned long **bitmapp,
					       bool may_alloc)
{
	static struct page **pages;
	static unsigned long *bitmap;
	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
	size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
			     sizeof(unsigned long);

	if (!pages || !bitmap) {
		if (may_alloc && !pages)
			pages = pcpu_mem_alloc(pages_size);
		if (may_alloc && !bitmap)
			bitmap = pcpu_mem_alloc(bitmap_size);
		if (!pages || !bitmap)
			return NULL;
	}

	memset(pages, 0, pages_size);
	bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);

	*bitmapp = bitmap;
	return pages;
}

/**
 * pcpu_free_pages - free pages which were allocated for @chunk
 * @chunk: chunk pages were allocated for
 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
 * @populated: populated bitmap
 * @page_start: page index of the first page to be freed
 * @page_end: page index of the last page to be freed + 1
 *
 * Free pages [@page_start, @page_end) in @pages for all units.
 * The pages were allocated for @chunk.
 */
static void pcpu_free_pages(struct pcpu_chunk *chunk,
			    struct page **pages, unsigned long *populated,
			    int page_start, int page_end)
{
	unsigned int cpu;
	int i;

	for_each_possible_cpu(cpu) {
		for (i = page_start; i < page_end; i++) {
			struct page *page = pages[pcpu_page_idx(cpu, i)];

			if (page)
				__free_page(page);
		}
	}
}
/**
 * pcpu_alloc_pages - allocates pages for @chunk
 * @chunk: target chunk
 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
 * @populated: populated bitmap
 * @page_start: page index of the first page to be allocated
 * @page_end: page index of the last page to be allocated + 1
 *
 * Allocate pages [@page_start,@page_end) into @pages for all units.
 * The allocation is for @chunk.  Percpu core doesn't care about the
 * content of @pages and will pass it verbatim to pcpu_map_pages().
 */
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
			    struct page **pages, unsigned long *populated,
			    int page_start, int page_end)
{
	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
	unsigned int cpu;
	int i;

	for_each_possible_cpu(cpu) {
		for (i = page_start; i < page_end; i++) {
			struct page **pagep = &pages[pcpu_page_idx(cpu, i)];

			*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
			if (!*pagep) {
				pcpu_free_pages(chunk, pages, populated,
						page_start, page_end);
				return -ENOMEM;
			}
		}
	}
	return 0;
}

/**
 * pcpu_pre_unmap_flush - flush cache prior to unmapping
 * @chunk: chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages in [@page_start,@page_end) of @chunk are about to be
 * unmapped.  Flush cache.  As each flushing trial can be very
 * expensive, issue flush on the whole region at once rather than
 * doing it for each cpu.  This could be an overkill but is more
 * scalable.
 */
static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
				 int page_start, int page_end)
{
	flush_cache_vunmap(
		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
}

static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
	unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
}
/**
 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array which can be used to pass information to free
 * @populated: populated bitmap
 * @page_start: page index of the first page to unmap
 * @page_end: page index of the last page to unmap + 1
 *
 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 * Corresponding elements in @pages were cleared by the caller and can
 * be used to carry information to pcpu_free_pages() which will be
 * called after all unmaps are finished.  The caller should call
 * proper pre/post flush functions.
 */
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
			     struct page **pages, unsigned long *populated,
			     int page_start, int page_end)
{
	unsigned int cpu;
	int i;

	for_each_possible_cpu(cpu) {
		for (i = page_start; i < page_end; i++) {
			struct page *page;

			page = pcpu_chunk_page(chunk, cpu, i);
			WARN_ON(!page);
			pages[pcpu_page_idx(cpu, i)] = page;
		}
		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
				   page_end - page_start);
	}

	for (i = page_start; i < page_end; i++)
		__clear_bit(i, populated);
}

/**
 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
 * TLB for the regions.  This can be skipped if the area is to be
 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
 *
 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
 * for the whole region.
 */
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
				      int page_start, int page_end)
{
	flush_tlb_kernel_range(
		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
}

static int __pcpu_map_pages(unsigned long addr, struct page **pages,
			    int nr_pages)
{
	return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
					PAGE_KERNEL, pages);
}
/**
 * pcpu_map_pages - map pages into a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array containing pages to be mapped
 * @populated: populated bitmap
 * @page_start: page index of the first page to map
 * @page_end: page index of the last page to map + 1
 *
 * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
 * caller is responsible for calling pcpu_post_map_flush() after all
 * mappings are complete.
 *
 * This function is responsible for setting corresponding bits in
 * @chunk->populated bitmap and whatever is necessary for reverse
 * lookup (addr -> chunk).
 */
static int pcpu_map_pages(struct pcpu_chunk *chunk,
			  struct page **pages, unsigned long *populated,
			  int page_start, int page_end)
{
	unsigned int cpu, tcpu;
	int i, err;

	for_each_possible_cpu(cpu) {
		err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
				       &pages[pcpu_page_idx(cpu, page_start)],
				       page_end - page_start);
		if (err < 0)
			goto err;
	}

	/* mapping successful, link chunk and mark populated */
	for (i = page_start; i < page_end; i++) {
		for_each_possible_cpu(cpu)
			pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
					    chunk);
		__set_bit(i, populated);
	}

	return 0;

err:
	for_each_possible_cpu(tcpu) {
		if (tcpu == cpu)
			break;
		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
				   page_end - page_start);
	}
	return err;
}

/**
 * pcpu_post_map_flush - flush cache after mapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
 * cache.
 *
 * As with pcpu_pre_unmap_flush(), cache flushing also is done at once
 * for the whole region.
 */
static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
				int page_start, int page_end)
{
	flush_cache_vmap(
		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
}
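/*
 * Illustrative call order, mirroring the populate/depopulate paths
 * below; this is a sketch, not a separate API:
 *
 *	pcpu_pre_unmap_flush(chunk, rs, re);	  (cache, before unmap)
 *	pcpu_unmap_pages(chunk, pages, populated, rs, re);
 *	pcpu_post_unmap_tlb_flush(chunk, rs, re); (tlb, after unmap)
 *
 *	pcpu_map_pages(chunk, pages, populated, rs, re);
 *	pcpu_post_map_flush(chunk, rs, re);	  (cache, after map)
 */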
/**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
 * @off: offset to the area to depopulate
 * @size: size of the area to depopulate in bytes
 *
 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 * from @chunk.  The cache is flushed before unmapping; the TLB flush
 * is left to vmalloc which handles it lazily.
 *
 * CONTEXT:
 * pcpu_alloc_mutex.
 */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
	int page_start = PFN_DOWN(off);
	int page_end = PFN_UP(off + size);
	struct page **pages;
	unsigned long *populated;
	int rs, re;

	/* quick path, check whether it's empty already */
	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
		if (rs == page_start && re == page_end)
			return;
		break;
	}

	/* immutable chunks can't be depopulated */
	WARN_ON(chunk->immutable);

	/*
	 * If control reaches here, there must have been at least one
	 * successful population attempt so the temp pages array must
	 * be available now.
	 */
	pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
	BUG_ON(!pages);

	/* unmap and free */
	pcpu_pre_unmap_flush(chunk, page_start, page_end);

	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
		pcpu_unmap_pages(chunk, pages, populated, rs, re);

	/* no need to flush tlb, vmalloc will handle it lazily */

	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
		pcpu_free_pages(chunk, pages, populated, rs, re);

	/* commit new bitmap */
	bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
}
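/*
 * Example, for illustration only: with 4k pages, an area at
 * off == 3000 with size == 2000 spans bytes [3000, 5000), so
 * page_start == PFN_DOWN(3000) == 0 and page_end == PFN_UP(5000) == 2;
 * both pages backing the area are (de)populated even though the area
 * only partially covers them.
 */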
/**
 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 * @chunk: chunk of interest
 * @off: offset to the area to populate
 * @size: size of the area to populate in bytes
 *
 * For each cpu, populate and map pages [@page_start,@page_end) into
 * @chunk.  The area is cleared on return.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
	int page_start = PFN_DOWN(off);
	int page_end = PFN_UP(off + size);
	int free_end = page_start, unmap_end = page_start;
	struct page **pages;
	unsigned long *populated;
	unsigned int cpu;
	int rs, re, rc;

	/* quick path, check whether all pages are already there */
	pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
		if (rs == page_start && re == page_end)
			goto clear;
		break;
	}

	/* need to allocate and map pages, this chunk can't be immutable */
	WARN_ON(chunk->immutable);

	pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
	if (!pages)
		return -ENOMEM;

	/* alloc and map */
	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
		rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
		if (rc)
			goto err_free;
		free_end = re;
	}

	pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
		rc = pcpu_map_pages(chunk, pages, populated, rs, re);
		if (rc)
			goto err_unmap;
		unmap_end = re;
	}
	pcpu_post_map_flush(chunk, page_start, page_end);

	/* commit new bitmap */
	bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
clear:
	for_each_possible_cpu(cpu)
		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
	return 0;

err_unmap:
	pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
	pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
		pcpu_unmap_pages(chunk, pages, populated, rs, re);
	pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
err_free:
	pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
		pcpu_free_pages(chunk, pages, populated, rs, re);
	return rc;
}

static void free_pcpu_chunk(struct pcpu_chunk *chunk)
{
	if (!chunk)
		return;
	if (chunk->vms)
		pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
	kfree(chunk);
}
static struct pcpu_chunk *alloc_pcpu_chunk(void)
{
	struct pcpu_chunk *chunk;

	chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
	if (!chunk)
		return NULL;

	chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
	if (!chunk->map) {
		/* bail out if the map allocation failed */
		kfree(chunk);
		return NULL;
	}
	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
	chunk->map[chunk->map_used++] = pcpu_unit_size;

	chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
				       pcpu_nr_groups, pcpu_atom_size,
				       GFP_KERNEL);
	if (!chunk->vms) {
		free_pcpu_chunk(chunk);
		return NULL;
	}

	INIT_LIST_HEAD(&chunk->list);
	chunk->free_size = pcpu_unit_size;
	chunk->contig_hint = pcpu_unit_size;
	chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];

	return chunk;
}
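/*
 * Note, illustrative: a chunk returned by alloc_pcpu_chunk() starts
 * with a single free region covering the whole unit, i.e.
 * map == { pcpu_unit_size }, map_used == 1, and both free_size and
 * contig_hint equal to pcpu_unit_size.
 */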
/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 *
 * Allocate percpu area of @size bytes aligned at @align.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
	struct pcpu_chunk *chunk;
	int slot, off;

	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
		WARN(true, "illegal size (%zu) or align (%zu) for "
		     "percpu allocation\n", size, align);
		return NULL;
	}

	mutex_lock(&pcpu_alloc_mutex);
	spin_lock_irq(&pcpu_lock);

	/* serve reserved allocations from the reserved chunk if available */
	if (reserved && pcpu_reserved_chunk) {
		chunk = pcpu_reserved_chunk;
		if (size > chunk->contig_hint ||
		    pcpu_extend_area_map(chunk) < 0)
			goto fail_unlock;
		off = pcpu_alloc_area(chunk, size, align);
		if (off >= 0)
			goto area_found;
		goto fail_unlock;
	}

restart:
	/* search through normal chunks */
	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			if (size > chunk->contig_hint)
				continue;

			switch (pcpu_extend_area_map(chunk)) {
			case 0:
				break;
			case 1:
				goto restart;	/* pcpu_lock dropped, restart */
			default:
				goto fail_unlock;
			}

			off = pcpu_alloc_area(chunk, size, align);
			if (off >= 0)
				goto area_found;
		}
	}

	/* hmmm... no space left, create a new chunk */
	spin_unlock_irq(&pcpu_lock);

	chunk = alloc_pcpu_chunk();
	if (!chunk)
		goto fail_unlock_mutex;

	spin_lock_irq(&pcpu_lock);
	pcpu_chunk_relocate(chunk, -1);
	goto restart;

area_found:
	spin_unlock_irq(&pcpu_lock);

	/* populate, map and clear the area */
	if (pcpu_populate_chunk(chunk, off, size)) {
		spin_lock_irq(&pcpu_lock);
		pcpu_free_area(chunk, off);
		goto fail_unlock;
	}

	mutex_unlock(&pcpu_alloc_mutex);

	/* return address relative to base address */
	return __addr_to_pcpu_ptr(chunk->base_addr + off);

fail_unlock:
	spin_unlock_irq(&pcpu_lock);
fail_unlock_mutex:
	mutex_unlock(&pcpu_alloc_mutex);
	return NULL;
}

/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Allocate percpu area of @size bytes aligned at @align.  Might
 * sleep.  Might trigger writeouts.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void *__alloc_percpu(size_t size, size_t align)
{
	return pcpu_alloc(size, align, false);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
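/*
 * Usage sketch, for illustration only ('struct foo' and 'field' are
 * made-up names):
 *
 *	struct foo *p = __alloc_percpu(sizeof(*p), __alignof__(*p));
 *	int cpu;
 *
 *	if (p)
 *		for_each_possible_cpu(cpu)
 *			per_cpu_ptr(p, cpu)->field = 0;
 *	free_percpu(p);
 */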
1159edcb4639STejun Heo */ 1160edcb4639STejun Heo void *__alloc_reserved_percpu(size_t size, size_t align) 1161edcb4639STejun Heo { 1162edcb4639STejun Heo return pcpu_alloc(size, align, true); 1163edcb4639STejun Heo } 1164edcb4639STejun Heo 1165a56dbddfSTejun Heo /** 1166a56dbddfSTejun Heo * pcpu_reclaim - reclaim fully free chunks, workqueue function 1167a56dbddfSTejun Heo * @work: unused 1168a56dbddfSTejun Heo * 1169a56dbddfSTejun Heo * Reclaim all fully free chunks except for the first one. 1170ccea34b5STejun Heo * 1171ccea34b5STejun Heo * CONTEXT: 1172ccea34b5STejun Heo * workqueue context. 1173a56dbddfSTejun Heo */ 1174a56dbddfSTejun Heo static void pcpu_reclaim(struct work_struct *work) 1175fbf59bc9STejun Heo { 1176a56dbddfSTejun Heo LIST_HEAD(todo); 1177a56dbddfSTejun Heo struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; 1178a56dbddfSTejun Heo struct pcpu_chunk *chunk, *next; 1179a56dbddfSTejun Heo 1180ccea34b5STejun Heo mutex_lock(&pcpu_alloc_mutex); 1181ccea34b5STejun Heo spin_lock_irq(&pcpu_lock); 1182a56dbddfSTejun Heo 1183a56dbddfSTejun Heo list_for_each_entry_safe(chunk, next, head, list) { 11848d408b4bSTejun Heo WARN_ON(chunk->immutable); 1185a56dbddfSTejun Heo 1186a56dbddfSTejun Heo /* spare the first one */ 1187a56dbddfSTejun Heo if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 1188a56dbddfSTejun Heo continue; 1189a56dbddfSTejun Heo 1190a56dbddfSTejun Heo list_move(&chunk->list, &todo); 1191a56dbddfSTejun Heo } 1192a56dbddfSTejun Heo 1193ccea34b5STejun Heo spin_unlock_irq(&pcpu_lock); 1194a56dbddfSTejun Heo 1195a56dbddfSTejun Heo list_for_each_entry_safe(chunk, next, &todo, list) { 1196ce3141a2STejun Heo pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); 1197fbf59bc9STejun Heo free_pcpu_chunk(chunk); 1198fbf59bc9STejun Heo } 1199971f3918STejun Heo 1200971f3918STejun Heo mutex_unlock(&pcpu_alloc_mutex); 1201a56dbddfSTejun Heo } 1202fbf59bc9STejun Heo 1203fbf59bc9STejun Heo /** 1204fbf59bc9STejun Heo * free_percpu - free percpu area 1205fbf59bc9STejun Heo * @ptr: pointer to area to free 1206fbf59bc9STejun Heo * 1207ccea34b5STejun Heo * Free percpu area @ptr. 1208ccea34b5STejun Heo * 1209ccea34b5STejun Heo * CONTEXT: 1210ccea34b5STejun Heo * Can be called from atomic context. 
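 *
 * A minimal usage sketch (illustrative only, not part of this file),
 * pairing free_percpu() with __alloc_percpu() above:
 *
 *	int cpu, sum = 0;
 *	int *cnt = __alloc_percpu(sizeof(int), __alignof__(int));
 *
 *	if (cnt) {
 *		for_each_possible_cpu(cpu)
 *			sum += *per_cpu_ptr(cnt, cpu);
 *		free_percpu(cnt);
 *	}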
1211fbf59bc9STejun Heo  */
1212fbf59bc9STejun Heo void free_percpu(void *ptr)
1213fbf59bc9STejun Heo {
1214fbf59bc9STejun Heo 	void *addr = __pcpu_ptr_to_addr(ptr);
1215fbf59bc9STejun Heo 	struct pcpu_chunk *chunk;
1216ccea34b5STejun Heo 	unsigned long flags;
1217fbf59bc9STejun Heo 	int off;
1218fbf59bc9STejun Heo 
1219fbf59bc9STejun Heo 	if (!ptr)
1220fbf59bc9STejun Heo 		return;
1221fbf59bc9STejun Heo 
1222ccea34b5STejun Heo 	spin_lock_irqsave(&pcpu_lock, flags);
1223fbf59bc9STejun Heo 
1224fbf59bc9STejun Heo 	chunk = pcpu_chunk_addr_search(addr);
1225bba174f5STejun Heo 	off = addr - chunk->base_addr;
1226fbf59bc9STejun Heo 
1227fbf59bc9STejun Heo 	pcpu_free_area(chunk, off);
1228fbf59bc9STejun Heo 
1229a56dbddfSTejun Heo 	/* if there is more than one fully free chunk, wake up the grim reaper */
1230fbf59bc9STejun Heo 	if (chunk->free_size == pcpu_unit_size) {
1231fbf59bc9STejun Heo 		struct pcpu_chunk *pos;
1232fbf59bc9STejun Heo 
1233a56dbddfSTejun Heo 		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
1234fbf59bc9STejun Heo 			if (pos != chunk) {
1235a56dbddfSTejun Heo 				schedule_work(&pcpu_reclaim_work);
1236fbf59bc9STejun Heo 				break;
1237fbf59bc9STejun Heo 			}
1238fbf59bc9STejun Heo 	}
1239fbf59bc9STejun Heo 
1240ccea34b5STejun Heo 	spin_unlock_irqrestore(&pcpu_lock, flags);
1241fbf59bc9STejun Heo }
1242fbf59bc9STejun Heo EXPORT_SYMBOL_GPL(free_percpu);
1243fbf59bc9STejun Heo 
1244033e48fbSTejun Heo static inline size_t pcpu_calc_fc_sizes(size_t static_size,
1245033e48fbSTejun Heo 					size_t reserved_size,
1246033e48fbSTejun Heo 					ssize_t *dyn_sizep)
1247033e48fbSTejun Heo {
1248033e48fbSTejun Heo 	size_t size_sum;
1249033e48fbSTejun Heo 
1250033e48fbSTejun Heo 	size_sum = PFN_ALIGN(static_size + reserved_size +
1251033e48fbSTejun Heo 			     (*dyn_sizep >= 0 ? *dyn_sizep : 0));
1252033e48fbSTejun Heo 	if (*dyn_sizep != 0)
1253033e48fbSTejun Heo 		*dyn_sizep = size_sum - static_size - reserved_size;
1254033e48fbSTejun Heo 
1255033e48fbSTejun Heo 	return size_sum;
1256033e48fbSTejun Heo }
1257033e48fbSTejun Heo 
1258fbf59bc9STejun Heo /**
1259fd1e8a1fSTejun Heo  * pcpu_alloc_alloc_info - allocate percpu allocation info
1260fd1e8a1fSTejun Heo  * @nr_groups: the number of groups
1261fd1e8a1fSTejun Heo  * @nr_units: the number of units
1262033e48fbSTejun Heo  *
1263fd1e8a1fSTejun Heo  * Allocate ai which is large enough for @nr_groups groups containing
1264fd1e8a1fSTejun Heo  * @nr_units units.  The returned ai's groups[0].cpu_map points to the
1265fd1e8a1fSTejun Heo  * cpu_map array which is long enough for @nr_units and filled with
1266fd1e8a1fSTejun Heo  * NR_CPUS.  It's the caller's responsibility to initialize the
1267fd1e8a1fSTejun Heo  * cpu_map pointers of the other groups.
1268033e48fbSTejun Heo  *
1269033e48fbSTejun Heo  * RETURNS:
1270fd1e8a1fSTejun Heo  * Pointer to the allocated pcpu_alloc_info on success, NULL on
1271fd1e8a1fSTejun Heo  * failure.
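 *
 * The returned ai is carved out of a single bootmem block laid out as
 * below (a sketch of the allocation done in the function body, not a
 * separate structure):
 *
 *	[pcpu_alloc_info][groups[0..nr_groups-1]][cpu_map[0..nr_units-1]]
 *
 * with the cpu_map portion aligned to __alignof__(cpu_map[0]).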
1272033e48fbSTejun Heo  */
1273fd1e8a1fSTejun Heo struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1274fd1e8a1fSTejun Heo 						      int nr_units)
1275fd1e8a1fSTejun Heo {
1276fd1e8a1fSTejun Heo 	struct pcpu_alloc_info *ai;
1277fd1e8a1fSTejun Heo 	size_t base_size, ai_size;
1278fd1e8a1fSTejun Heo 	void *ptr;
1279fd1e8a1fSTejun Heo 	int unit;
1280fd1e8a1fSTejun Heo 
1281fd1e8a1fSTejun Heo 	base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
1282fd1e8a1fSTejun Heo 			  __alignof__(ai->groups[0].cpu_map[0]));
1283fd1e8a1fSTejun Heo 	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1284fd1e8a1fSTejun Heo 
1285fd1e8a1fSTejun Heo 	ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
1286fd1e8a1fSTejun Heo 	if (!ptr)
1287fd1e8a1fSTejun Heo 		return NULL;
1288fd1e8a1fSTejun Heo 	ai = ptr;
1289fd1e8a1fSTejun Heo 	ptr += base_size;
1290fd1e8a1fSTejun Heo 
1291fd1e8a1fSTejun Heo 	ai->groups[0].cpu_map = ptr;
1292fd1e8a1fSTejun Heo 
1293fd1e8a1fSTejun Heo 	for (unit = 0; unit < nr_units; unit++)
1294fd1e8a1fSTejun Heo 		ai->groups[0].cpu_map[unit] = NR_CPUS;
1295fd1e8a1fSTejun Heo 
1296fd1e8a1fSTejun Heo 	ai->nr_groups = nr_groups;
1297fd1e8a1fSTejun Heo 	ai->__ai_size = PFN_ALIGN(ai_size);
1298fd1e8a1fSTejun Heo 
1299fd1e8a1fSTejun Heo 	return ai;
1300fd1e8a1fSTejun Heo }
1301fd1e8a1fSTejun Heo 
1302fd1e8a1fSTejun Heo /**
1303fd1e8a1fSTejun Heo  * pcpu_free_alloc_info - free percpu allocation info
1304fd1e8a1fSTejun Heo  * @ai: pcpu_alloc_info to free
1305fd1e8a1fSTejun Heo  *
1306fd1e8a1fSTejun Heo  * Free @ai which was allocated by pcpu_alloc_alloc_info().
1307fd1e8a1fSTejun Heo  */
1308fd1e8a1fSTejun Heo void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1309fd1e8a1fSTejun Heo {
1310fd1e8a1fSTejun Heo 	free_bootmem(__pa(ai), ai->__ai_size);
1311fd1e8a1fSTejun Heo }
1312fd1e8a1fSTejun Heo 
1313fd1e8a1fSTejun Heo /**
1314fd1e8a1fSTejun Heo  * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1315edcb4639STejun Heo  * @reserved_size: the size of reserved percpu area in bytes
1316cafe8816STejun Heo  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1317fd1e8a1fSTejun Heo  * @atom_size: allocation atom size
1318fd1e8a1fSTejun Heo  * @cpu_distance_fn: callback to determine distance between cpus, optional
1319fd1e8a1fSTejun Heo  *
1320fd1e8a1fSTejun Heo  * This function determines grouping of units, their mappings to cpus
1321fd1e8a1fSTejun Heo  * and other parameters considering needed percpu size, allocation
1322fd1e8a1fSTejun Heo  * atom size and distances between CPUs.
1323fd1e8a1fSTejun Heo  *
1324fd1e8a1fSTejun Heo  * Groups are always multiples of atom size and CPUs which are of
1325fd1e8a1fSTejun Heo  * LOCAL_DISTANCE both ways are grouped together and share space for
1326fd1e8a1fSTejun Heo  * units in the same group.  The returned configuration is guaranteed
1327fd1e8a1fSTejun Heo  * to have CPUs on different nodes in different groups and >=75% usage
1328fd1e8a1fSTejun Heo  * of allocated virtual address space.
1329fd1e8a1fSTejun Heo  *
1330fd1e8a1fSTejun Heo  * RETURNS:
1331fd1e8a1fSTejun Heo  * On success, pointer to the new allocation_info is returned.  On
1332fd1e8a1fSTejun Heo  * failure, ERR_PTR value is returned.
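 *
 * As a worked example (numbers illustrative only): with a size_sum of
 * 40k and a 2M @atom_size, min_unit_size is 40k, alloc_size becomes
 * 2M and upa starts at 51; the loop below then lowers it to 32, the
 * largest value for which 2M / upa is page aligned and divides 2M
 * evenly, giving 64k units.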
1333fd1e8a1fSTejun Heo  */
1334fd1e8a1fSTejun Heo struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1335fd1e8a1fSTejun Heo 				size_t reserved_size, ssize_t dyn_size,
1336fd1e8a1fSTejun Heo 				size_t atom_size,
1337033e48fbSTejun Heo 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1338033e48fbSTejun Heo {
1339033e48fbSTejun Heo 	static int group_map[NR_CPUS] __initdata;
1340033e48fbSTejun Heo 	static int group_cnt[NR_CPUS] __initdata;
1341033e48fbSTejun Heo 	const size_t static_size = __per_cpu_end - __per_cpu_start;
1342fd1e8a1fSTejun Heo 	int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
1343033e48fbSTejun Heo 	size_t size_sum, min_unit_size, alloc_size;
1344033e48fbSTejun Heo 	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
1345fd1e8a1fSTejun Heo 	int last_allocs, group, unit;
1346033e48fbSTejun Heo 	unsigned int cpu, tcpu;
1347fd1e8a1fSTejun Heo 	struct pcpu_alloc_info *ai;
1348fd1e8a1fSTejun Heo 	unsigned int *cpu_map;
1349033e48fbSTejun Heo 
1350fb59e72eSTejun Heo 	/* this function may be called multiple times */
1351fb59e72eSTejun Heo 	memset(group_map, 0, sizeof(group_map));
1352fb59e72eSTejun Heo 	memset(group_cnt, 0, sizeof(group_cnt));
1353fb59e72eSTejun Heo 
1354033e48fbSTejun Heo 	/*
1355033e48fbSTejun Heo 	 * Determine min_unit_size, alloc_size and max_upa such that
1356fd1e8a1fSTejun Heo 	 * alloc_size is a multiple of atom_size and is the smallest
1357033e48fbSTejun Heo 	 * which can accommodate 4k aligned segments which are equal to
1358033e48fbSTejun Heo 	 * or larger than min_unit_size.
1359033e48fbSTejun Heo 	 */
1360fd1e8a1fSTejun Heo 	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1361033e48fbSTejun Heo 	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1362033e48fbSTejun Heo 
1363fd1e8a1fSTejun Heo 	alloc_size = roundup(min_unit_size, atom_size);
1364033e48fbSTejun Heo 	upa = alloc_size / min_unit_size;
1365033e48fbSTejun Heo 	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1366033e48fbSTejun Heo 		upa--;
1367033e48fbSTejun Heo 	max_upa = upa;
1368033e48fbSTejun Heo 
1369033e48fbSTejun Heo 	/* group cpus according to their proximity */
1370033e48fbSTejun Heo 	for_each_possible_cpu(cpu) {
1371033e48fbSTejun Heo 		group = 0;
1372033e48fbSTejun Heo 	next_group:
1373033e48fbSTejun Heo 		for_each_possible_cpu(tcpu) {
1374033e48fbSTejun Heo 			if (cpu == tcpu)
1375033e48fbSTejun Heo 				break;
1376fd1e8a1fSTejun Heo 			if (group_map[tcpu] == group && cpu_distance_fn &&
1377033e48fbSTejun Heo 			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1378033e48fbSTejun Heo 			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1379033e48fbSTejun Heo 				group++;
1380fd1e8a1fSTejun Heo 				nr_groups = max(nr_groups, group + 1);
1381033e48fbSTejun Heo 				goto next_group;
1382033e48fbSTejun Heo 			}
1383033e48fbSTejun Heo 		}
1384033e48fbSTejun Heo 		group_map[cpu] = group;
1385033e48fbSTejun Heo 		group_cnt[group]++;
1386033e48fbSTejun Heo 		group_cnt_max = max(group_cnt_max, group_cnt[group]);
1387033e48fbSTejun Heo 	}
1388033e48fbSTejun Heo 
1389033e48fbSTejun Heo 	/*
1390033e48fbSTejun Heo 	 * Expand unit size until address space usage goes over 75%
1391033e48fbSTejun Heo 	 * and then as much as possible without using more address
1392033e48fbSTejun Heo 	 * space.
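	 * For example, a group of 5 cpus with upa == 4 needs two
	 * allocs holding 8 units and wastes 3; the check below
	 * compares wastage against num_possible_cpus() / 3, which
	 * caps waste at 25% of the total units.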
1393033e48fbSTejun Heo */ 1394033e48fbSTejun Heo last_allocs = INT_MAX; 1395033e48fbSTejun Heo for (upa = max_upa; upa; upa--) { 1396033e48fbSTejun Heo int allocs = 0, wasted = 0; 1397033e48fbSTejun Heo 1398033e48fbSTejun Heo if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) 1399033e48fbSTejun Heo continue; 1400033e48fbSTejun Heo 1401fd1e8a1fSTejun Heo for (group = 0; group < nr_groups; group++) { 1402033e48fbSTejun Heo int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); 1403033e48fbSTejun Heo allocs += this_allocs; 1404033e48fbSTejun Heo wasted += this_allocs * upa - group_cnt[group]; 1405033e48fbSTejun Heo } 1406033e48fbSTejun Heo 1407033e48fbSTejun Heo /* 1408033e48fbSTejun Heo * Don't accept if wastage is over 25%. The 1409033e48fbSTejun Heo * greater-than comparison ensures upa==1 always 1410033e48fbSTejun Heo * passes the following check. 1411033e48fbSTejun Heo */ 1412033e48fbSTejun Heo if (wasted > num_possible_cpus() / 3) 1413033e48fbSTejun Heo continue; 1414033e48fbSTejun Heo 1415033e48fbSTejun Heo /* and then don't consume more memory */ 1416033e48fbSTejun Heo if (allocs > last_allocs) 1417033e48fbSTejun Heo break; 1418033e48fbSTejun Heo last_allocs = allocs; 1419033e48fbSTejun Heo best_upa = upa; 1420033e48fbSTejun Heo } 1421fd1e8a1fSTejun Heo upa = best_upa; 1422033e48fbSTejun Heo 1423fd1e8a1fSTejun Heo /* allocate and fill alloc_info */ 1424fd1e8a1fSTejun Heo for (group = 0; group < nr_groups; group++) 1425fd1e8a1fSTejun Heo nr_units += roundup(group_cnt[group], upa); 1426fd1e8a1fSTejun Heo 1427fd1e8a1fSTejun Heo ai = pcpu_alloc_alloc_info(nr_groups, nr_units); 1428fd1e8a1fSTejun Heo if (!ai) 1429fd1e8a1fSTejun Heo return ERR_PTR(-ENOMEM); 1430fd1e8a1fSTejun Heo cpu_map = ai->groups[0].cpu_map; 1431fd1e8a1fSTejun Heo 1432fd1e8a1fSTejun Heo for (group = 0; group < nr_groups; group++) { 1433fd1e8a1fSTejun Heo ai->groups[group].cpu_map = cpu_map; 1434fd1e8a1fSTejun Heo cpu_map += roundup(group_cnt[group], upa); 1435fd1e8a1fSTejun Heo } 1436fd1e8a1fSTejun Heo 1437fd1e8a1fSTejun Heo ai->static_size = static_size; 1438fd1e8a1fSTejun Heo ai->reserved_size = reserved_size; 1439fd1e8a1fSTejun Heo ai->dyn_size = dyn_size; 1440fd1e8a1fSTejun Heo ai->unit_size = alloc_size / upa; 1441fd1e8a1fSTejun Heo ai->atom_size = atom_size; 1442fd1e8a1fSTejun Heo ai->alloc_size = alloc_size; 1443fd1e8a1fSTejun Heo 1444fd1e8a1fSTejun Heo for (group = 0, unit = 0; group_cnt[group]; group++) { 1445fd1e8a1fSTejun Heo struct pcpu_group_info *gi = &ai->groups[group]; 1446fd1e8a1fSTejun Heo 1447fd1e8a1fSTejun Heo /* 1448fd1e8a1fSTejun Heo * Initialize base_offset as if all groups are located 1449fd1e8a1fSTejun Heo * back-to-back. The caller should update this to 1450fd1e8a1fSTejun Heo * reflect actual allocation. 
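		 * For example, with 64k units the first group of four
		 * units gets base_offset 0 and the next group 256k;
		 * the arch caller later replaces these with the real
		 * gaps between its group allocations.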
1451fd1e8a1fSTejun Heo 		 */
1452fd1e8a1fSTejun Heo 		gi->base_offset = unit * ai->unit_size;
1453fd1e8a1fSTejun Heo 
1454033e48fbSTejun Heo 		for_each_possible_cpu(cpu)
1455033e48fbSTejun Heo 			if (group_map[cpu] == group)
1456fd1e8a1fSTejun Heo 				gi->cpu_map[gi->nr_units++] = cpu;
1457fd1e8a1fSTejun Heo 		gi->nr_units = roundup(gi->nr_units, upa);
1458fd1e8a1fSTejun Heo 		unit += gi->nr_units;
1459fd1e8a1fSTejun Heo 	}
1460fd1e8a1fSTejun Heo 	BUG_ON(unit != nr_units);
1461fd1e8a1fSTejun Heo 
1462fd1e8a1fSTejun Heo 	return ai;
1463033e48fbSTejun Heo }
1464033e48fbSTejun Heo 
1465fd1e8a1fSTejun Heo /**
1466fd1e8a1fSTejun Heo  * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1467fd1e8a1fSTejun Heo  * @lvl: loglevel
1468fd1e8a1fSTejun Heo  * @ai: allocation info to dump
1469fd1e8a1fSTejun Heo  *
1470fd1e8a1fSTejun Heo  * Print out information about @ai using loglevel @lvl.
1471fd1e8a1fSTejun Heo  */
1472fd1e8a1fSTejun Heo static void pcpu_dump_alloc_info(const char *lvl,
1473fd1e8a1fSTejun Heo 				 const struct pcpu_alloc_info *ai)
1474033e48fbSTejun Heo {
1475fd1e8a1fSTejun Heo 	int group_width = 1, cpu_width = 1, width;
1476033e48fbSTejun Heo 	char empty_str[] = "--------";
1477fd1e8a1fSTejun Heo 	int alloc = 0, alloc_end = 0;
1478fd1e8a1fSTejun Heo 	int group, v;
1479fd1e8a1fSTejun Heo 	int upa, apl;	/* units per alloc, allocs per line */
1480033e48fbSTejun Heo 
1481fd1e8a1fSTejun Heo 	v = ai->nr_groups;
1482033e48fbSTejun Heo 	while (v /= 10)
1483fd1e8a1fSTejun Heo 		group_width++;
1484033e48fbSTejun Heo 
1485fd1e8a1fSTejun Heo 	v = num_possible_cpus();
1486fd1e8a1fSTejun Heo 	while (v /= 10)
1487fd1e8a1fSTejun Heo 		cpu_width++;
1488fd1e8a1fSTejun Heo 	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1489033e48fbSTejun Heo 
1490fd1e8a1fSTejun Heo 	upa = ai->alloc_size / ai->unit_size;
1491fd1e8a1fSTejun Heo 	width = upa * (cpu_width + 1) + group_width + 3;
1492fd1e8a1fSTejun Heo 	apl = rounddown_pow_of_two(max(60 / width, 1));
1493033e48fbSTejun Heo 
1494fd1e8a1fSTejun Heo 	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1495fd1e8a1fSTejun Heo 	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1496fd1e8a1fSTejun Heo 	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1497fd1e8a1fSTejun Heo 
1498fd1e8a1fSTejun Heo 	for (group = 0; group < ai->nr_groups; group++) {
1499fd1e8a1fSTejun Heo 		const struct pcpu_group_info *gi = &ai->groups[group];
1500fd1e8a1fSTejun Heo 		int unit = 0, unit_end = 0;
1501fd1e8a1fSTejun Heo 
1502fd1e8a1fSTejun Heo 		BUG_ON(gi->nr_units % upa);
1503fd1e8a1fSTejun Heo 		for (alloc_end += gi->nr_units / upa;
1504fd1e8a1fSTejun Heo 		     alloc < alloc_end; alloc++) {
1505fd1e8a1fSTejun Heo 			if (!(alloc % apl)) {
1506033e48fbSTejun Heo 				printk("\n");
1507fd1e8a1fSTejun Heo 				printk("%spcpu-alloc: ", lvl);
1508033e48fbSTejun Heo 			}
1509fd1e8a1fSTejun Heo 			printk("[%0*d] ", group_width, group);
1510fd1e8a1fSTejun Heo 
1511fd1e8a1fSTejun Heo 			for (unit_end += upa; unit < unit_end; unit++)
1512fd1e8a1fSTejun Heo 				if (gi->cpu_map[unit] != NR_CPUS)
1513fd1e8a1fSTejun Heo 					printk("%0*d ", cpu_width,
1514fd1e8a1fSTejun Heo 					       gi->cpu_map[unit]);
1515033e48fbSTejun Heo 				else
1516033e48fbSTejun Heo 					printk("%s ", empty_str);
1517033e48fbSTejun Heo 		}
1518fd1e8a1fSTejun Heo 	}
1519033e48fbSTejun Heo 	printk("\n");
1520033e48fbSTejun Heo }
1521033e48fbSTejun Heo 
1522fbf59bc9STejun Heo /**
15238d408b4bSTejun Heo  * pcpu_setup_first_chunk - initialize the first percpu chunk
1524fd1e8a1fSTejun Heo  * @ai: pcpu_alloc_info describing how the percpu area is shaped
152538a6be52STejun Heo  * @base_addr: mapped address
1526fbf59bc9STejun Heo  *
15278d408b4bSTejun Heo  * Initialize the first percpu chunk which contains the kernel static
15288d408b4bSTejun Heo  * percpu area.  This function is to be called from the arch percpu
152938a6be52STejun Heo  * area setup path.
15308d408b4bSTejun Heo  *
1531fd1e8a1fSTejun Heo  * @ai contains all information necessary to initialize the first
1532fd1e8a1fSTejun Heo  * chunk and prime the dynamic percpu allocator.
15338d408b4bSTejun Heo  *
1534fd1e8a1fSTejun Heo  * @ai->static_size is the size of the static percpu area.
1535fd1e8a1fSTejun Heo  *
1536fd1e8a1fSTejun Heo  * @ai->reserved_size, if non-zero, specifies the number of bytes to
1537edcb4639STejun Heo  * reserve after the static area in the first chunk.  This reserves
1538edcb4639STejun Heo  * the first chunk such that it's available only through reserved
1539edcb4639STejun Heo  * percpu allocation.  This is primarily used to serve module percpu
1540edcb4639STejun Heo  * static areas on architectures where the addressing model has
1541edcb4639STejun Heo  * limited offset range for symbol relocations to guarantee module
1542edcb4639STejun Heo  * percpu symbols fall inside the relocatable range.
1543edcb4639STejun Heo  *
1544fd1e8a1fSTejun Heo  * @ai->dyn_size determines the number of bytes available for dynamic
1545fd1e8a1fSTejun Heo  * allocation in the first chunk.  The area between @ai->static_size +
1546fd1e8a1fSTejun Heo  * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
15476074d5b0STejun Heo  *
1548fd1e8a1fSTejun Heo  * @ai->unit_size specifies the unit size and must be aligned to
1549fd1e8a1fSTejun Heo  * PAGE_SIZE and equal to or larger than @ai->static_size +
1550fd1e8a1fSTejun Heo  * @ai->reserved_size + @ai->dyn_size.
15518d408b4bSTejun Heo  *
1552fd1e8a1fSTejun Heo  * @ai->atom_size is the allocation atom size and is used as the
1553fd1e8a1fSTejun Heo  * alignment for vm areas.
15548d408b4bSTejun Heo  *
1555fd1e8a1fSTejun Heo  * @ai->alloc_size is the allocation size and always a multiple of
1556fd1e8a1fSTejun Heo  * @ai->atom_size.  This is larger than @ai->atom_size if
1557fd1e8a1fSTejun Heo  * @ai->unit_size is larger than @ai->atom_size.
1558fd1e8a1fSTejun Heo  *
1559fd1e8a1fSTejun Heo  * @ai->nr_groups and @ai->groups describe the virtual memory layout
1560fd1e8a1fSTejun Heo  * of the percpu areas.  Units which should be colocated are put into
1561fd1e8a1fSTejun Heo  * the same group.  Dynamic VM areas will be allocated according to
1562fd1e8a1fSTejun Heo  * these groupings.  If @ai->nr_groups is zero, a single group
1563fd1e8a1fSTejun Heo  * containing all units is assumed.
15648d408b4bSTejun Heo  *
156538a6be52STejun Heo  * The caller should have mapped the first chunk at @base_addr and
156638a6be52STejun Heo  * copied static data to each unit.
1567fbf59bc9STejun Heo  *
1568edcb4639STejun Heo  * If the first chunk ends up with both reserved and dynamic areas, it
1569edcb4639STejun Heo  * is served by two chunks - one to serve the core static and reserved
1570edcb4639STejun Heo  * areas and the other for the dynamic area.  They share the same vm
1571edcb4639STejun Heo  * and page map but use different area allocation maps to stay away
1572edcb4639STejun Heo  * from each other.  The latter chunk is circulated in the chunk slots
1573edcb4639STejun Heo  * and available for dynamic allocation like any other chunk.
1574edcb4639STejun Heo  *
1575fbf59bc9STejun Heo  * RETURNS:
1576fb435d52STejun Heo  * 0 on success, -errno on failure.
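 *
 * Within each unit the first chunk is laid out as below, with the
 * reserved and dynamic parts optional:
 *
 *	<static area><reserved area><dynamic area><unused>
 *	|<-------------- @ai->unit_size ----------------->|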
1577fbf59bc9STejun Heo  */
1578fb435d52STejun Heo int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1579fd1e8a1fSTejun Heo 				  void *base_addr)
1580fbf59bc9STejun Heo {
1581*635b75fcSTejun Heo 	static char cpus_buf[4096] __initdata;
1582edcb4639STejun Heo 	static int smap[2], dmap[2];
1583fd1e8a1fSTejun Heo 	size_t dyn_size = ai->dyn_size;
1584fd1e8a1fSTejun Heo 	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1585edcb4639STejun Heo 	struct pcpu_chunk *schunk, *dchunk = NULL;
15866563297cSTejun Heo 	unsigned long *group_offsets;
15876563297cSTejun Heo 	size_t *group_sizes;
1588fb435d52STejun Heo 	unsigned long *unit_off;
1589fbf59bc9STejun Heo 	unsigned int cpu;
1590fd1e8a1fSTejun Heo 	int *unit_map;
1591fd1e8a1fSTejun Heo 	int group, unit, i;
1592fbf59bc9STejun Heo 
1593*635b75fcSTejun Heo 	cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
1594*635b75fcSTejun Heo 
1595*635b75fcSTejun Heo #define PCPU_SETUP_BUG_ON(cond)	do {					\
1596*635b75fcSTejun Heo 	if (unlikely(cond)) {						\
1597*635b75fcSTejun Heo 		pr_emerg("PERCPU: failed to initialize, %s\n", #cond);	\
1598*635b75fcSTejun Heo 		pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf);	\
1599*635b75fcSTejun Heo 		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
1600*635b75fcSTejun Heo 		BUG();							\
1601*635b75fcSTejun Heo 	}								\
1602*635b75fcSTejun Heo } while (0)
1603*635b75fcSTejun Heo 
16042f39e637STejun Heo 	/* sanity checks */
1605edcb4639STejun Heo 	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1606edcb4639STejun Heo 		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1607*635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1608*635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(!ai->static_size);
1609*635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(!base_addr);
1610*635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1611*635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1612*635b75fcSTejun Heo 	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
16138d408b4bSTejun Heo 
16146563297cSTejun Heo 	/* process group information and build config tables accordingly */
16156563297cSTejun Heo 	group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
16166563297cSTejun Heo 	group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
1617fd1e8a1fSTejun Heo 	unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
1618fb435d52STejun Heo 	unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
16192f39e637STejun Heo 
1620fd1e8a1fSTejun Heo 	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1621ffe0d5a5STejun Heo 		unit_map[cpu] = UINT_MAX;
1622fd1e8a1fSTejun Heo 	pcpu_first_unit_cpu = NR_CPUS;
16232f39e637STejun Heo 
1624fd1e8a1fSTejun Heo 	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1625fd1e8a1fSTejun Heo 		const struct pcpu_group_info *gi = &ai->groups[group];
16262f39e637STejun Heo 
16276563297cSTejun Heo 		group_offsets[group] = gi->base_offset;
16286563297cSTejun Heo 		group_sizes[group] = gi->nr_units * ai->unit_size;
16296563297cSTejun Heo 
1630fd1e8a1fSTejun Heo 		for (i = 0; i < gi->nr_units; i++) {
1631fd1e8a1fSTejun Heo 			cpu = gi->cpu_map[i];
1632fd1e8a1fSTejun Heo 			if (cpu == NR_CPUS)
1633fd1e8a1fSTejun Heo 				continue;
1634fd1e8a1fSTejun Heo 
1635*635b75fcSTejun Heo 			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
1636*635b75fcSTejun Heo 			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
1637*635b75fcSTejun Heo 			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
1638fd1e8a1fSTejun Heo 
1639fd1e8a1fSTejun Heo 			unit_map[cpu] = unit + i;
1640fb435d52STejun Heo 			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1641fb435d52STejun Heo 
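			/* remember the cpu backing the very first unit */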
1642fd1e8a1fSTejun Heo if (pcpu_first_unit_cpu == NR_CPUS) 16432f39e637STejun Heo pcpu_first_unit_cpu = cpu; 16442f39e637STejun Heo } 1645fd1e8a1fSTejun Heo } 16462f39e637STejun Heo pcpu_last_unit_cpu = cpu; 1647fd1e8a1fSTejun Heo pcpu_nr_units = unit; 16482f39e637STejun Heo 16492f39e637STejun Heo for_each_possible_cpu(cpu) 1650*635b75fcSTejun Heo PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); 1651*635b75fcSTejun Heo 1652*635b75fcSTejun Heo /* we're done parsing the input, undefine BUG macro and dump config */ 1653*635b75fcSTejun Heo #undef PCPU_SETUP_BUG_ON 1654*635b75fcSTejun Heo pcpu_dump_alloc_info(KERN_INFO, ai); 16552f39e637STejun Heo 16566563297cSTejun Heo pcpu_nr_groups = ai->nr_groups; 16576563297cSTejun Heo pcpu_group_offsets = group_offsets; 16586563297cSTejun Heo pcpu_group_sizes = group_sizes; 1659fd1e8a1fSTejun Heo pcpu_unit_map = unit_map; 1660fb435d52STejun Heo pcpu_unit_offsets = unit_off; 16612f39e637STejun Heo 16622f39e637STejun Heo /* determine basic parameters */ 1663fd1e8a1fSTejun Heo pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; 1664d9b55eebSTejun Heo pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 16656563297cSTejun Heo pcpu_atom_size = ai->atom_size; 1666ce3141a2STejun Heo pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + 1667ce3141a2STejun Heo BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); 1668cafe8816STejun Heo 1669d9b55eebSTejun Heo /* 1670d9b55eebSTejun Heo * Allocate chunk slots. The additional last slot is for 1671d9b55eebSTejun Heo * empty chunks. 1672d9b55eebSTejun Heo */ 1673d9b55eebSTejun Heo pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; 1674fbf59bc9STejun Heo pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); 1675fbf59bc9STejun Heo for (i = 0; i < pcpu_nr_slots; i++) 1676fbf59bc9STejun Heo INIT_LIST_HEAD(&pcpu_slot[i]); 1677fbf59bc9STejun Heo 1678edcb4639STejun Heo /* 1679edcb4639STejun Heo * Initialize static chunk. If reserved_size is zero, the 1680edcb4639STejun Heo * static chunk covers static area + dynamic allocation area 1681edcb4639STejun Heo * in the first chunk. If reserved_size is not zero, it 1682edcb4639STejun Heo * covers static area + reserved area (mostly used for module 1683edcb4639STejun Heo * static percpu allocation). 
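	 * With a 64k static area and an 8k reserved area, for example,
	 * smap below ends up as { -64k, 8k }: negative entries mark
	 * allocated regions and positive entries free space.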
1684edcb4639STejun Heo */ 16852441d15cSTejun Heo schunk = alloc_bootmem(pcpu_chunk_struct_size); 16862441d15cSTejun Heo INIT_LIST_HEAD(&schunk->list); 1687bba174f5STejun Heo schunk->base_addr = base_addr; 168861ace7faSTejun Heo schunk->map = smap; 168961ace7faSTejun Heo schunk->map_alloc = ARRAY_SIZE(smap); 169038a6be52STejun Heo schunk->immutable = true; 1691ce3141a2STejun Heo bitmap_fill(schunk->populated, pcpu_unit_pages); 1692edcb4639STejun Heo 1693fd1e8a1fSTejun Heo if (ai->reserved_size) { 1694fd1e8a1fSTejun Heo schunk->free_size = ai->reserved_size; 1695ae9e6bc9STejun Heo pcpu_reserved_chunk = schunk; 1696fd1e8a1fSTejun Heo pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; 1697edcb4639STejun Heo } else { 16982441d15cSTejun Heo schunk->free_size = dyn_size; 1699edcb4639STejun Heo dyn_size = 0; /* dynamic area covered */ 1700edcb4639STejun Heo } 17012441d15cSTejun Heo schunk->contig_hint = schunk->free_size; 1702fbf59bc9STejun Heo 1703fd1e8a1fSTejun Heo schunk->map[schunk->map_used++] = -ai->static_size; 170461ace7faSTejun Heo if (schunk->free_size) 170561ace7faSTejun Heo schunk->map[schunk->map_used++] = schunk->free_size; 170661ace7faSTejun Heo 1707edcb4639STejun Heo /* init dynamic chunk if necessary */ 1708edcb4639STejun Heo if (dyn_size) { 1709ce3141a2STejun Heo dchunk = alloc_bootmem(pcpu_chunk_struct_size); 1710edcb4639STejun Heo INIT_LIST_HEAD(&dchunk->list); 1711bba174f5STejun Heo dchunk->base_addr = base_addr; 1712edcb4639STejun Heo dchunk->map = dmap; 1713edcb4639STejun Heo dchunk->map_alloc = ARRAY_SIZE(dmap); 171438a6be52STejun Heo dchunk->immutable = true; 1715ce3141a2STejun Heo bitmap_fill(dchunk->populated, pcpu_unit_pages); 1716edcb4639STejun Heo 1717edcb4639STejun Heo dchunk->contig_hint = dchunk->free_size = dyn_size; 1718edcb4639STejun Heo dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; 1719edcb4639STejun Heo dchunk->map[dchunk->map_used++] = dchunk->free_size; 1720edcb4639STejun Heo } 1721edcb4639STejun Heo 17222441d15cSTejun Heo /* link the first chunk in */ 1723ae9e6bc9STejun Heo pcpu_first_chunk = dchunk ?: schunk; 1724ae9e6bc9STejun Heo pcpu_chunk_relocate(pcpu_first_chunk, -1); 1725fbf59bc9STejun Heo 1726fbf59bc9STejun Heo /* we're done */ 1727bba174f5STejun Heo pcpu_base_addr = base_addr; 1728fb435d52STejun Heo return 0; 1729fbf59bc9STejun Heo } 173066c3a757STejun Heo 1731f58dc01bSTejun Heo const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { 1732f58dc01bSTejun Heo [PCPU_FC_AUTO] = "auto", 1733f58dc01bSTejun Heo [PCPU_FC_EMBED] = "embed", 1734f58dc01bSTejun Heo [PCPU_FC_PAGE] = "page", 1735f58dc01bSTejun Heo }; 173666c3a757STejun Heo 1737f58dc01bSTejun Heo enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; 1738f58dc01bSTejun Heo 1739f58dc01bSTejun Heo static int __init percpu_alloc_setup(char *str) 174066c3a757STejun Heo { 1741f58dc01bSTejun Heo if (0) 1742f58dc01bSTejun Heo /* nada */; 1743f58dc01bSTejun Heo #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK 1744f58dc01bSTejun Heo else if (!strcmp(str, "embed")) 1745f58dc01bSTejun Heo pcpu_chosen_fc = PCPU_FC_EMBED; 1746f58dc01bSTejun Heo #endif 1747f58dc01bSTejun Heo #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1748f58dc01bSTejun Heo else if (!strcmp(str, "page")) 1749f58dc01bSTejun Heo pcpu_chosen_fc = PCPU_FC_PAGE; 1750f58dc01bSTejun Heo #endif 1751f58dc01bSTejun Heo else 1752f58dc01bSTejun Heo pr_warning("PERCPU: unknown allocator %s specified\n", str); 175366c3a757STejun Heo 1754f58dc01bSTejun Heo return 0; 175566c3a757STejun Heo } 1756f58dc01bSTejun Heo 
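/*
 * Example: booting with "percpu_alloc=page" selects the page-based
 * first chunk allocator below when the kernel is built with
 * CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK.
 */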
early_param("percpu_alloc", percpu_alloc_setup); 175766c3a757STejun Heo 175808fc4580STejun Heo #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ 175908fc4580STejun Heo !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) 176066c3a757STejun Heo /** 176166c3a757STejun Heo * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 176266c3a757STejun Heo * @reserved_size: the size of reserved percpu area in bytes 176366c3a757STejun Heo * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1764c8826dd5STejun Heo * @atom_size: allocation atom size 1765c8826dd5STejun Heo * @cpu_distance_fn: callback to determine distance between cpus, optional 1766c8826dd5STejun Heo * @alloc_fn: function to allocate percpu page 1767c8826dd5STejun Heo * @free_fn: funtion to free percpu page 176866c3a757STejun Heo * 176966c3a757STejun Heo * This is a helper to ease setting up embedded first percpu chunk and 177066c3a757STejun Heo * can be called where pcpu_setup_first_chunk() is expected. 177166c3a757STejun Heo * 177266c3a757STejun Heo * If this function is used to setup the first chunk, it is allocated 1773c8826dd5STejun Heo * by calling @alloc_fn and used as-is without being mapped into 1774c8826dd5STejun Heo * vmalloc area. Allocations are always whole multiples of @atom_size 1775c8826dd5STejun Heo * aligned to @atom_size. 1776c8826dd5STejun Heo * 1777c8826dd5STejun Heo * This enables the first chunk to piggy back on the linear physical 1778c8826dd5STejun Heo * mapping which often uses larger page size. Please note that this 1779c8826dd5STejun Heo * can result in very sparse cpu->unit mapping on NUMA machines thus 1780c8826dd5STejun Heo * requiring large vmalloc address space. Don't use this allocator if 1781c8826dd5STejun Heo * vmalloc space is not orders of magnitude larger than distances 1782c8826dd5STejun Heo * between node memory addresses (ie. 32bit NUMA machines). 178366c3a757STejun Heo * 178466c3a757STejun Heo * When @dyn_size is positive, dynamic area might be larger than 1785788e5abcSTejun Heo * specified to fill page alignment. When @dyn_size is auto, 1786788e5abcSTejun Heo * @dyn_size is just big enough to fill page alignment after static 1787788e5abcSTejun Heo * and reserved areas. 178866c3a757STejun Heo * 178966c3a757STejun Heo * If the needed size is smaller than the minimum or specified unit 1790c8826dd5STejun Heo * size, the leftover is returned using @free_fn. 179166c3a757STejun Heo * 179266c3a757STejun Heo * RETURNS: 1793fb435d52STejun Heo * 0 on success, -errno on failure. 
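 *
 * A typical invocation mirrors the generic setup_per_cpu_areas() at
 * the bottom of this file:
 *
 *	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
 *				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
 *				    NULL, alloc_fn, free_fn);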
179466c3a757STejun Heo */ 1795c8826dd5STejun Heo int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, 1796c8826dd5STejun Heo size_t atom_size, 1797c8826dd5STejun Heo pcpu_fc_cpu_distance_fn_t cpu_distance_fn, 1798c8826dd5STejun Heo pcpu_fc_alloc_fn_t alloc_fn, 1799c8826dd5STejun Heo pcpu_fc_free_fn_t free_fn) 180066c3a757STejun Heo { 1801c8826dd5STejun Heo void *base = (void *)ULONG_MAX; 1802c8826dd5STejun Heo void **areas = NULL; 1803fd1e8a1fSTejun Heo struct pcpu_alloc_info *ai; 18046ea529a2STejun Heo size_t size_sum, areas_size, max_distance; 1805c8826dd5STejun Heo int group, i, rc; 180666c3a757STejun Heo 1807c8826dd5STejun Heo ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, 1808c8826dd5STejun Heo cpu_distance_fn); 1809fd1e8a1fSTejun Heo if (IS_ERR(ai)) 1810fd1e8a1fSTejun Heo return PTR_ERR(ai); 181166c3a757STejun Heo 1812fd1e8a1fSTejun Heo size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; 1813c8826dd5STejun Heo areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); 181466c3a757STejun Heo 1815c8826dd5STejun Heo areas = alloc_bootmem_nopanic(areas_size); 1816c8826dd5STejun Heo if (!areas) { 1817fb435d52STejun Heo rc = -ENOMEM; 1818c8826dd5STejun Heo goto out_free; 1819fa8a7094STejun Heo } 182066c3a757STejun Heo 1821c8826dd5STejun Heo /* allocate, copy and determine base address */ 1822c8826dd5STejun Heo for (group = 0; group < ai->nr_groups; group++) { 1823c8826dd5STejun Heo struct pcpu_group_info *gi = &ai->groups[group]; 1824c8826dd5STejun Heo unsigned int cpu = NR_CPUS; 1825c8826dd5STejun Heo void *ptr; 182666c3a757STejun Heo 1827c8826dd5STejun Heo for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) 1828c8826dd5STejun Heo cpu = gi->cpu_map[i]; 1829c8826dd5STejun Heo BUG_ON(cpu == NR_CPUS); 1830c8826dd5STejun Heo 1831c8826dd5STejun Heo /* allocate space for the whole group */ 1832c8826dd5STejun Heo ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); 1833c8826dd5STejun Heo if (!ptr) { 1834c8826dd5STejun Heo rc = -ENOMEM; 1835c8826dd5STejun Heo goto out_free_areas; 1836c8826dd5STejun Heo } 1837c8826dd5STejun Heo areas[group] = ptr; 1838c8826dd5STejun Heo 1839c8826dd5STejun Heo base = min(ptr, base); 1840c8826dd5STejun Heo 1841c8826dd5STejun Heo for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { 1842c8826dd5STejun Heo if (gi->cpu_map[i] == NR_CPUS) { 1843c8826dd5STejun Heo /* unused unit, free whole */ 1844c8826dd5STejun Heo free_fn(ptr, ai->unit_size); 1845c8826dd5STejun Heo continue; 1846c8826dd5STejun Heo } 1847c8826dd5STejun Heo /* copy and return the unused part */ 1848fd1e8a1fSTejun Heo memcpy(ptr, __per_cpu_load, ai->static_size); 1849c8826dd5STejun Heo free_fn(ptr + size_sum, ai->unit_size - size_sum); 1850c8826dd5STejun Heo } 185166c3a757STejun Heo } 185266c3a757STejun Heo 1853c8826dd5STejun Heo /* base address is now known, determine group base offsets */ 18546ea529a2STejun Heo max_distance = 0; 18556ea529a2STejun Heo for (group = 0; group < ai->nr_groups; group++) { 1856c8826dd5STejun Heo ai->groups[group].base_offset = areas[group] - base; 18576ea529a2STejun Heo max_distance = max(max_distance, ai->groups[group].base_offset); 18586ea529a2STejun Heo } 18596ea529a2STejun Heo max_distance += ai->unit_size; 18606ea529a2STejun Heo 18616ea529a2STejun Heo /* warn if maximum distance is further than 75% of vmalloc space */ 18626ea529a2STejun Heo if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { 18636ea529a2STejun Heo pr_warning("PERCPU: max_distance=0x%lx too large for vmalloc " 18646ea529a2STejun Heo 
"space 0x%lx\n", 18656ea529a2STejun Heo max_distance, VMALLOC_END - VMALLOC_START); 18666ea529a2STejun Heo #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 18676ea529a2STejun Heo /* and fail if we have fallback */ 18686ea529a2STejun Heo rc = -EINVAL; 18696ea529a2STejun Heo goto out_free; 18706ea529a2STejun Heo #endif 18716ea529a2STejun Heo } 1872c8826dd5STejun Heo 1873004018e2STejun Heo pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", 1874fd1e8a1fSTejun Heo PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, 1875fd1e8a1fSTejun Heo ai->dyn_size, ai->unit_size); 187666c3a757STejun Heo 1877fb435d52STejun Heo rc = pcpu_setup_first_chunk(ai, base); 1878c8826dd5STejun Heo goto out_free; 1879c8826dd5STejun Heo 1880c8826dd5STejun Heo out_free_areas: 1881c8826dd5STejun Heo for (group = 0; group < ai->nr_groups; group++) 1882c8826dd5STejun Heo free_fn(areas[group], 1883c8826dd5STejun Heo ai->groups[group].nr_units * ai->unit_size); 1884c8826dd5STejun Heo out_free: 1885fd1e8a1fSTejun Heo pcpu_free_alloc_info(ai); 1886c8826dd5STejun Heo if (areas) 1887c8826dd5STejun Heo free_bootmem(__pa(areas), areas_size); 1888fb435d52STejun Heo return rc; 1889d4b95f80STejun Heo } 189008fc4580STejun Heo #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || 189108fc4580STejun Heo !CONFIG_HAVE_SETUP_PER_CPU_AREA */ 1892d4b95f80STejun Heo 189308fc4580STejun Heo #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1894d4b95f80STejun Heo /** 189500ae4064STejun Heo * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages 1896d4b95f80STejun Heo * @reserved_size: the size of reserved percpu area in bytes 1897d4b95f80STejun Heo * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE 1898d4b95f80STejun Heo * @free_fn: funtion to free percpu page, always called with PAGE_SIZE 1899d4b95f80STejun Heo * @populate_pte_fn: function to populate pte 1900d4b95f80STejun Heo * 190100ae4064STejun Heo * This is a helper to ease setting up page-remapped first percpu 190200ae4064STejun Heo * chunk and can be called where pcpu_setup_first_chunk() is expected. 1903d4b95f80STejun Heo * 1904d4b95f80STejun Heo * This is the basic allocator. Static percpu area is allocated 1905d4b95f80STejun Heo * page-by-page into vmalloc area. 1906d4b95f80STejun Heo * 1907d4b95f80STejun Heo * RETURNS: 1908fb435d52STejun Heo * 0 on success, -errno on failure. 
1909d4b95f80STejun Heo */ 1910fb435d52STejun Heo int __init pcpu_page_first_chunk(size_t reserved_size, 1911d4b95f80STejun Heo pcpu_fc_alloc_fn_t alloc_fn, 1912d4b95f80STejun Heo pcpu_fc_free_fn_t free_fn, 1913d4b95f80STejun Heo pcpu_fc_populate_pte_fn_t populate_pte_fn) 1914d4b95f80STejun Heo { 19158f05a6a6STejun Heo static struct vm_struct vm; 1916fd1e8a1fSTejun Heo struct pcpu_alloc_info *ai; 191700ae4064STejun Heo char psize_str[16]; 1918ce3141a2STejun Heo int unit_pages; 1919d4b95f80STejun Heo size_t pages_size; 1920ce3141a2STejun Heo struct page **pages; 1921fb435d52STejun Heo int unit, i, j, rc; 1922d4b95f80STejun Heo 192300ae4064STejun Heo snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); 192400ae4064STejun Heo 1925fd1e8a1fSTejun Heo ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); 1926fd1e8a1fSTejun Heo if (IS_ERR(ai)) 1927fd1e8a1fSTejun Heo return PTR_ERR(ai); 1928fd1e8a1fSTejun Heo BUG_ON(ai->nr_groups != 1); 1929fd1e8a1fSTejun Heo BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); 1930fd1e8a1fSTejun Heo 1931fd1e8a1fSTejun Heo unit_pages = ai->unit_size >> PAGE_SHIFT; 1932d4b95f80STejun Heo 1933d4b95f80STejun Heo /* unaligned allocations can't be freed, round up to page size */ 1934fd1e8a1fSTejun Heo pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * 1935fd1e8a1fSTejun Heo sizeof(pages[0])); 1936ce3141a2STejun Heo pages = alloc_bootmem(pages_size); 1937d4b95f80STejun Heo 19388f05a6a6STejun Heo /* allocate pages */ 1939d4b95f80STejun Heo j = 0; 1940fd1e8a1fSTejun Heo for (unit = 0; unit < num_possible_cpus(); unit++) 1941ce3141a2STejun Heo for (i = 0; i < unit_pages; i++) { 1942fd1e8a1fSTejun Heo unsigned int cpu = ai->groups[0].cpu_map[unit]; 1943d4b95f80STejun Heo void *ptr; 1944d4b95f80STejun Heo 19453cbc8565STejun Heo ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); 1946d4b95f80STejun Heo if (!ptr) { 194700ae4064STejun Heo pr_warning("PERCPU: failed to allocate %s page " 194800ae4064STejun Heo "for cpu%u\n", psize_str, cpu); 1949d4b95f80STejun Heo goto enomem; 1950d4b95f80STejun Heo } 1951ce3141a2STejun Heo pages[j++] = virt_to_page(ptr); 1952d4b95f80STejun Heo } 1953d4b95f80STejun Heo 19548f05a6a6STejun Heo /* allocate vm area, map the pages and copy static data */ 19558f05a6a6STejun Heo vm.flags = VM_ALLOC; 1956fd1e8a1fSTejun Heo vm.size = num_possible_cpus() * ai->unit_size; 19578f05a6a6STejun Heo vm_area_register_early(&vm, PAGE_SIZE); 19588f05a6a6STejun Heo 1959fd1e8a1fSTejun Heo for (unit = 0; unit < num_possible_cpus(); unit++) { 19601d9d3257STejun Heo unsigned long unit_addr = 1961fd1e8a1fSTejun Heo (unsigned long)vm.addr + unit * ai->unit_size; 19628f05a6a6STejun Heo 1963ce3141a2STejun Heo for (i = 0; i < unit_pages; i++) 19648f05a6a6STejun Heo populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); 19658f05a6a6STejun Heo 19668f05a6a6STejun Heo /* pte already populated, the following shouldn't fail */ 1967fb435d52STejun Heo rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], 1968ce3141a2STejun Heo unit_pages); 1969fb435d52STejun Heo if (rc < 0) 1970fb435d52STejun Heo panic("failed to map percpu area, err=%d\n", rc); 19718f05a6a6STejun Heo 19728f05a6a6STejun Heo /* 19738f05a6a6STejun Heo * FIXME: Archs with virtual cache should flush local 19748f05a6a6STejun Heo * cache for the linear mapping here - something 19758f05a6a6STejun Heo * equivalent to flush_cache_vmap() on the local cpu. 19768f05a6a6STejun Heo * flush_cache_vmap() can't be used as most supporting 19778f05a6a6STejun Heo * data structures are not set up yet. 
19788f05a6a6STejun Heo */ 19798f05a6a6STejun Heo 19808f05a6a6STejun Heo /* copy static data */ 1981fd1e8a1fSTejun Heo memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); 198266c3a757STejun Heo } 198366c3a757STejun Heo 198466c3a757STejun Heo /* we're ready, commit */ 19851d9d3257STejun Heo pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", 1986fd1e8a1fSTejun Heo unit_pages, psize_str, vm.addr, ai->static_size, 1987fd1e8a1fSTejun Heo ai->reserved_size, ai->dyn_size); 198866c3a757STejun Heo 1989fb435d52STejun Heo rc = pcpu_setup_first_chunk(ai, vm.addr); 1990d4b95f80STejun Heo goto out_free_ar; 1991d4b95f80STejun Heo 1992d4b95f80STejun Heo enomem: 1993d4b95f80STejun Heo while (--j >= 0) 1994ce3141a2STejun Heo free_fn(page_address(pages[j]), PAGE_SIZE); 1995fb435d52STejun Heo rc = -ENOMEM; 1996d4b95f80STejun Heo out_free_ar: 1997ce3141a2STejun Heo free_bootmem(__pa(pages), pages_size); 1998fd1e8a1fSTejun Heo pcpu_free_alloc_info(ai); 1999fb435d52STejun Heo return rc; 200066c3a757STejun Heo } 200108fc4580STejun Heo #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ 2002d4b95f80STejun Heo 20038c4bfc6eSTejun Heo /* 2004e74e3962STejun Heo * Generic percpu area setup. 2005e74e3962STejun Heo * 2006e74e3962STejun Heo * The embedding helper is used because its behavior closely resembles 2007e74e3962STejun Heo * the original non-dynamic generic percpu area setup. This is 2008e74e3962STejun Heo * important because many archs have addressing restrictions and might 2009e74e3962STejun Heo * fail if the percpu area is located far away from the previous 2010e74e3962STejun Heo * location. As an added bonus, in non-NUMA cases, embedding is 2011e74e3962STejun Heo * generally a good idea TLB-wise because percpu area can piggy back 2012e74e3962STejun Heo * on the physical linear memory mapping which uses large page 2013e74e3962STejun Heo * mappings on applicable archs. 2014e74e3962STejun Heo */ 2015e74e3962STejun Heo #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA 2016e74e3962STejun Heo unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 2017e74e3962STejun Heo EXPORT_SYMBOL(__per_cpu_offset); 2018e74e3962STejun Heo 2019c8826dd5STejun Heo static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, 2020c8826dd5STejun Heo size_t align) 2021c8826dd5STejun Heo { 2022c8826dd5STejun Heo return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); 2023c8826dd5STejun Heo } 2024c8826dd5STejun Heo 2025c8826dd5STejun Heo static void __init pcpu_dfl_fc_free(void *ptr, size_t size) 2026c8826dd5STejun Heo { 2027c8826dd5STejun Heo free_bootmem(__pa(ptr), size); 2028c8826dd5STejun Heo } 2029c8826dd5STejun Heo 2030e74e3962STejun Heo void __init setup_per_cpu_areas(void) 2031e74e3962STejun Heo { 2032e74e3962STejun Heo unsigned long delta; 2033e74e3962STejun Heo unsigned int cpu; 2034fb435d52STejun Heo int rc; 2035e74e3962STejun Heo 2036e74e3962STejun Heo /* 2037e74e3962STejun Heo * Always reserve area for module percpu variables. That's 2038e74e3962STejun Heo * what the legacy allocator did. 
2039e74e3962STejun Heo 	 */
2040fb435d52STejun Heo 	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2041c8826dd5STejun Heo 				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2042c8826dd5STejun Heo 				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2043fb435d52STejun Heo 	if (rc < 0)
2044e74e3962STejun Heo 		panic("Failed to initialize percpu areas.");
2045e74e3962STejun Heo 
2046e74e3962STejun Heo 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2047e74e3962STejun Heo 	for_each_possible_cpu(cpu)
2048fb435d52STejun Heo 		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
2049e74e3962STejun Heo }
2050e74e3962STejun Heo #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
2051
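/*
 * Usage sketch (illustrative only, kept out of the build): how a
 * typical client combines the dynamic percpu API above.  'struct
 * foo_stats', 'stats' and the init/exit/total functions are
 * hypothetical names, not part of this allocator.
 */
#if 0
struct foo_stats {
	unsigned long events;
};

static struct foo_stats *stats;		/* percpu pointer */

static int __init foo_init(void)
{
	/* one zeroed copy per possible cpu, naturally aligned */
	stats = __alloc_percpu(sizeof(*stats), __alignof__(*stats));
	if (!stats)
		return -ENOMEM;
	return 0;
}

static unsigned long foo_total(void)
{
	unsigned long sum = 0;
	int cpu;

	/* per_cpu_ptr() resolves the percpu pointer for a given cpu */
	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(stats, cpu)->events;
	return sum;
}

static void __exit foo_exit(void)
{
	free_percpu(stats);
}
#endif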