#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
#include <asm/stackprotector.h>

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
#else
# define DBG(x...)
#endif

DEFINE_PER_CPU(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);

#ifdef CONFIG_X86_64
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
#else
#define BOOT_PERCPU_OFFSET 0
#endif

DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);

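/*
 * __per_cpu_offset[cpu] is what gets added to a static percpu symbol's
 * address to reach @cpu's copy of the variable.  Until the real percpu
 * areas are set up below, every entry points at the boot copy via
 * BOOT_PERCPU_OFFSET.
 */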
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
        [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * On x86_64 symbols referenced from code should be reachable using
 * 32bit relocations.  Reserve space for static percpu variables in
 * modules so that they are always served from the first chunk which
 * is located at the percpu segment base.  On x86_32, anything can
 * address anywhere.  No need to reserve space in the first chunk.
 */
#ifdef CONFIG_X86_64
#define PERCPU_FIRST_CHUNK_RESERVE      PERCPU_MODULE_RESERVE
#else
#define PERCPU_FIRST_CHUNK_RESERVE      0
#endif

/**
 * pcpu_need_numa - determine whether percpu allocation needs to consider NUMA
 *
 * If NUMA is not configured or there is only one NUMA node available,
 * there is no reason to consider NUMA.  This function determines
 * whether percpu allocation should consider NUMA or not.
 *
 * RETURNS:
 * true if NUMA should be considered; otherwise, false.
 */
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
        pg_data_t *last = NULL;
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                int node = early_cpu_to_node(cpu);

                if (node_online(node) && NODE_DATA(node) &&
                    last && last != NODE_DATA(node))
                        return true;

                last = NODE_DATA(node);
        }
#endif
        return false;
}

/**
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
 * @cpu: cpu to allocate for
 * @size: allocation size in bytes
 * @align: alignment
 *
 * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
 * does the right thing for NUMA regardless of the current
 * configuration.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
                                        unsigned long align)
{
        const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
        int node = early_cpu_to_node(cpu);
        void *ptr;

        if (!node_online(node) || !NODE_DATA(node)) {
                ptr = __alloc_bootmem_nopanic(size, align, goal);
                pr_info("cpu %d has no node %d or node-local memory\n",
                        cpu, node);
                pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
                         cpu, size, __pa(ptr));
        } else {
                ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
                                                   size, align, goal);
                pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
                         "%016lx\n", cpu, size, node, __pa(ptr));
        }
        return ptr;
#else
        return __alloc_bootmem_nopanic(size, align, goal);
#endif
}

/*
 * Large page remap allocator
 *
 * This allocator uses a PMD page as the unit.  A PMD page is allocated
 * for each cpu and each is remapped into the vmalloc area using a PMD
 * mapping.  As a PMD page is quite large, only part of it is used for
 * the first chunk.  The unused part is returned to the bootmem
 * allocator.
 *
 * So, the PMD pages are mapped twice - once into the physical mapping
 * and once into the vmalloc area for the first percpu chunk.  The
 * double mapping adds some PMD TLB entry pressure, but it is still much
 * better than using only 4k mappings while remaining NUMA friendly.
 */
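/*
 * Because of that alias, pageattr needs a way to find the vmalloc-side
 * address for a page that sits in the recycled tail of a PMD page;
 * pcpu_lpage_remapped() below provides that lookup.
 */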
#ifdef CONFIG_NEED_MULTIPLE_NODES
struct pcpul_ent {
        unsigned int    cpu;
        void            *ptr;
};

static size_t pcpul_size;
static struct pcpul_ent *pcpul_map;
static struct vm_struct pcpul_vm;

static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
{
        size_t off = (size_t)pageno << PAGE_SHIFT;

        if (off >= pcpul_size)
                return NULL;

        return virt_to_page(pcpul_map[cpu].ptr + off);
}

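/*
 * @chosen is true when the user explicitly selected this allocator via
 * the "percpu_alloc=" boot parameter; in that case the NUMA and
 * vmalloc-space heuristics below are skipped and only the hard PSE
 * requirement is enforced.
 */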
static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
{
        size_t map_size, dyn_size;
        unsigned int cpu;
        int i, j;
        ssize_t ret;

        if (!chosen) {
                size_t vm_size = VMALLOC_END - VMALLOC_START;
                size_t tot_size = num_possible_cpus() * PMD_SIZE;

                /* on non-NUMA, embedding is better */
                if (!pcpu_need_numa())
                        return -EINVAL;

                /* don't consume more than 20% of vmalloc area */
                if (tot_size > vm_size / 5) {
                        pr_info("PERCPU: too large chunk size %zuMB for "
                                "large page remap\n", tot_size >> 20);
                        return -EINVAL;
                }
        }

        /* need PSE */
        if (!cpu_has_pse) {
                pr_warning("PERCPU: lpage allocator requires PSE\n");
                return -EINVAL;
        }

        /*
         * Currently supports only single page.  Supporting multiple
         * pages won't be too difficult if it ever becomes necessary.
         */
        pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
                               PERCPU_DYNAMIC_RESERVE);
        if (pcpul_size > PMD_SIZE) {
                pr_warning("PERCPU: static data is larger than large page, "
                           "can't use large page\n");
                return -EINVAL;
        }
        dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;

        /* allocate pointer array and alloc large pages */
        map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
        pcpul_map = alloc_bootmem(map_size);

        for_each_possible_cpu(cpu) {
                pcpul_map[cpu].cpu = cpu;
                pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
                                                        PMD_SIZE);
                if (!pcpul_map[cpu].ptr) {
                        pr_warning("PERCPU: failed to allocate large page "
                                   "for cpu%u\n", cpu);
                        goto enomem;
                }

                /*
                 * Only use pcpul_size bytes and give back the rest.
                 *
                 * Ingo: The 2MB up-rounding bootmem is needed to make
                 * sure the partial 2MB page is still fully RAM - it's
                 * not well-specified to have a PAT-incompatible area
                 * (unmapped RAM, device memory, etc.) in that hole.
                 */
                free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
                             PMD_SIZE - pcpul_size);

                memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
        }

        /* allocate address and map */
        pcpul_vm.flags = VM_ALLOC;
        pcpul_vm.size = num_possible_cpus() * PMD_SIZE;
        vm_area_register_early(&pcpul_vm, PMD_SIZE);

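        /*
         * Point each cpu's slot in the vmalloc area at its physical PMD
         * page: populate_extra_pmd() returns the pmd entry covering the
         * given address, which is then set to a large-page kernel
         * mapping of the page allocated above.
         */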
        for_each_possible_cpu(cpu) {
                pmd_t *pmd, pmd_v;

                pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
                                         cpu * PMD_SIZE);
                pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
                                PAGE_KERNEL_LARGE);
                set_pmd(pmd, pmd_v);
        }

        /* we're ready, commit */
        pr_info("PERCPU: Remapped at %p with large pages, static data "
                "%zu bytes\n", pcpul_vm.addr, static_size);

        ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
                                     PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
                                     PMD_SIZE, pcpul_vm.addr, NULL);

        /* sort pcpul_map array for pcpu_lpage_remapped() */
        for (i = 0; i < num_possible_cpus() - 1; i++)
                for (j = i + 1; j < num_possible_cpus(); j++)
                        if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
                                struct pcpul_ent tmp = pcpul_map[i];
                                pcpul_map[i] = pcpul_map[j];
                                pcpul_map[j] = tmp;
                        }

        return ret;

enomem:
        for_each_possible_cpu(cpu)
                if (pcpul_map[cpu].ptr)
                        free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
        free_bootmem(__pa(pcpul_map), map_size);
        return -ENOMEM;
}

/**
 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 * @kaddr: the kernel address in question
 *
 * Determine whether @kaddr falls in the pcpul recycled area.  This is
 * used by pageattr to detect VM aliases and break up the pcpu PMD
 * mapping such that the same physical page is not mapped under
 * different attributes.
 *
 * The recycled area is always at the tail of a partially used PMD
 * page.
 *
 * RETURNS:
 * The corresponding remapped pcpu address if a match is found;
 * otherwise, NULL.
 */
void *pcpu_lpage_remapped(void *kaddr)
{
        void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
        unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
        int left = 0, right = num_possible_cpus() - 1;
        int pos;

        /* pcpul in use at all? */
        if (!pcpul_map)
                return NULL;

        /* okay, perform binary search */
        while (left <= right) {
                pos = (left + right) / 2;

                if (pcpul_map[pos].ptr < pmd_addr)
                        left = pos + 1;
                else if (pcpul_map[pos].ptr > pmd_addr)
                        right = pos - 1;
                else {
                        /* it shouldn't be in the area for the first chunk */
                        WARN_ON(offset < pcpul_size);

                        return pcpul_vm.addr +
                                pcpul_map[pos].cpu * PMD_SIZE + offset;
                }
        }

        return NULL;
}
#else
static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
{
        return -EINVAL;
}
#endif

/*
 * Embedding allocator
 *
 * The first chunk is sized to just contain the static area plus
 * module and dynamic reserves and embedded into linear physical
 * mapping so that it can use PMD mapping without additional TLB
 * pressure.
 */
static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
{
        size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;

        /*
         * If large page isn't supported, there's no benefit in doing
         * this.  Also, embedding allocation doesn't play well with
         * NUMA.
         */
        if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
                return -EINVAL;

        return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
                                      reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
}

/*
 * 4k page allocator
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page and most of initialization is done by the generic
 * setup function.
 */
static struct page **pcpu4k_pages __initdata;
static int pcpu4k_nr_static_pages __initdata;

static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
        if (pageno < pcpu4k_nr_static_pages)
                return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
        return NULL;
}

static void __init pcpu4k_populate_pte(unsigned long addr)
{
        populate_extra_pte(addr);
}

static ssize_t __init setup_pcpu_4k(size_t static_size)
{
        size_t pages_size;
        unsigned int cpu;
        int i, j;
        ssize_t ret;

        pcpu4k_nr_static_pages = PFN_UP(static_size);

        /* unaligned allocations can't be freed, round up to page size */
        pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
                               * sizeof(pcpu4k_pages[0]));
        pcpu4k_pages = alloc_bootmem(pages_size);

        /* allocate and copy */
        j = 0;
        for_each_possible_cpu(cpu)
                for (i = 0; i < pcpu4k_nr_static_pages; i++) {
                        void *ptr;

                        ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
                        if (!ptr) {
                                pr_warning("PERCPU: failed to allocate "
                                           "4k page for cpu%u\n", cpu);
                                goto enomem;
                        }

                        memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
                        pcpu4k_pages[j++] = virt_to_page(ptr);
                }

        /* we're ready, commit */
        pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
                pcpu4k_nr_static_pages, static_size);

        ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
                                     PERCPU_FIRST_CHUNK_RESERVE, -1,
                                     -1, NULL, pcpu4k_populate_pte);
        goto out_free_ar;

enomem:
        while (--j >= 0)
                free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
        ret = -ENOMEM;
out_free_ar:
        free_bootmem(__pa(pcpu4k_pages), pages_size);
        return ret;
}

/* for explicit first chunk allocator selection */
static char pcpu_chosen_alloc[16] __initdata;

static int __init percpu_alloc_setup(char *str)
{
        strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
        return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

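/*
 * On 32-bit, percpu variables are reached through a segment register,
 * so each cpu gets a GDT_ENTRY_PERCPU descriptor whose base is that
 * cpu's percpu offset.  On 64-bit the percpu base lives in the GS base
 * and is set up elsewhere, so there is nothing to do here.
 */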
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
        struct desc_struct gdt;

        pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
                        0x2 | DESCTYPE_S, 0x8);
        gdt.s = 1;
        write_gdt_entry(get_cpu_gdt_table(cpu),
                        GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
}

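/*
 * Set up the first percpu chunk and switch every possible cpu over to
 * its own copy of the static percpu data.  The allocator is normally
 * picked automatically (lpage when it pays off, then embed, then 4k as
 * the last resort) but can be forced with the
 * "percpu_alloc={lpage|embed|4k}" boot parameter.
 */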
void __init setup_per_cpu_areas(void)
{
        size_t static_size = __per_cpu_end - __per_cpu_start;
        unsigned int cpu;
        unsigned long delta;
        size_t pcpu_unit_size;
        ssize_t ret;

        pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
                NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

        /*
         * Allocate percpu area.  If PSE is supported, try to make use
         * of large page mappings.  Please read comments on top of
         * each allocator for details.
         */
        ret = -EINVAL;
        if (strlen(pcpu_chosen_alloc)) {
                if (strcmp(pcpu_chosen_alloc, "4k")) {
                        if (!strcmp(pcpu_chosen_alloc, "lpage"))
                                ret = setup_pcpu_lpage(static_size, true);
                        else if (!strcmp(pcpu_chosen_alloc, "embed"))
                                ret = setup_pcpu_embed(static_size, true);
                        else
                                pr_warning("PERCPU: unknown allocator %s "
                                           "specified\n", pcpu_chosen_alloc);
                        if (ret < 0)
                                pr_warning("PERCPU: %s allocator failed (%zd), "
                                           "falling back to 4k\n",
                                           pcpu_chosen_alloc, ret);
                }
        } else {
                ret = setup_pcpu_lpage(static_size, false);
                if (ret < 0)
                        ret = setup_pcpu_embed(static_size, false);
        }
        if (ret < 0)
                ret = setup_pcpu_4k(static_size);
        if (ret < 0)
                panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
                      static_size, ret);

        pcpu_unit_size = ret;

        /* alrighty, percpu areas up and running */
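        /*
         * pcpu_base_addr is where the generic percpu code placed cpu0's
         * unit; delta turns a static percpu symbol address into cpu0's
         * runtime copy, and each further cpu sits one pcpu_unit_size
         * beyond the previous one.
         */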
        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
        for_each_possible_cpu(cpu) {
                per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
                per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
                per_cpu(cpu_number, cpu) = cpu;
                setup_percpu_segment(cpu);
                setup_stack_canary_segment(cpu);
                /*
                 * Copy data used in early init routines from the
                 * initial arrays to the per cpu data areas.  These
                 * arrays then become expendable and the *_early_ptr's
                 * are zeroed indicating that the static arrays are
                 * gone.
                 */
#ifdef CONFIG_X86_LOCAL_APIC
                per_cpu(x86_cpu_to_apicid, cpu) =
                        early_per_cpu_map(x86_cpu_to_apicid, cpu);
                per_cpu(x86_bios_cpu_apicid, cpu) =
                        early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
                per_cpu(irq_stack_ptr, cpu) =
                        per_cpu(irq_stack_union.irq_stack, cpu) +
                        IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
                per_cpu(x86_cpu_to_node_map, cpu) =
                        early_per_cpu_map(x86_cpu_to_node_map, cpu);
#endif
#endif
                /*
                 * Up to this point, the boot CPU has been using .data.init
                 * area.  Reload any changed state for the boot CPU.
                 */
                if (cpu == boot_cpu_id)
                        switch_to_new_gdt(cpu);
        }

        /* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
        early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
        early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
        early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
        /*
         * Make sure the boot cpu's node_number is correct even when the
         * boot cpu sits on a node that has no memory installed.
         */
        per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
#endif

        /* Setup node to cpumask map */
        setup_node_to_cpumask_map();

        /* Setup cpu initialized, callin, callout masks */
        setup_cpu_local_masks();
}