#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
#include <asm/stackprotector.h>

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
#else
# define DBG(x...)
#endif

DEFINE_PER_CPU(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);

#ifdef CONFIG_X86_64
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
#else
#define BOOT_PERCPU_OFFSET 0
#endif

DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);

unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
        [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * On x86_64, symbols referenced from code should be reachable using
 * 32bit relocations.  Reserve space for static percpu variables in
 * modules so that they are always served from the first chunk which
 * is located at the percpu segment base.  On x86_32, anything can
 * address anywhere.  No need to reserve space in the first chunk.
 */
#ifdef CONFIG_X86_64
#define PERCPU_FIRST_CHUNK_RESERVE      PERCPU_MODULE_RESERVE
#else
#define PERCPU_FIRST_CHUNK_RESERVE      0
#endif

/**
 * pcpu_need_numa - determine whether percpu allocation needs to consider NUMA
 *
 * If NUMA is not configured or there is only one NUMA node available,
 * there is no reason to consider NUMA.  This function determines
 * whether percpu allocation should consider NUMA or not.
 *
 * RETURNS:
 * true if NUMA should be considered; otherwise, false.
 */
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
        pg_data_t *last = NULL;
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                int node = early_cpu_to_node(cpu);

                if (node_online(node) && NODE_DATA(node) &&
                    last && last != NODE_DATA(node))
                        return true;

                last = NODE_DATA(node);
        }
#endif
        return false;
}

/**
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
 * @cpu: cpu to allocate for
 * @size: size of allocation in bytes
 * @align: alignment
 *
 * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
 * does the right thing for NUMA regardless of the current
 * configuration.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
                                        unsigned long align)
{
        const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
        int node = early_cpu_to_node(cpu);
        void *ptr;

        if (!node_online(node) || !NODE_DATA(node)) {
                ptr = __alloc_bootmem_nopanic(size, align, goal);
                pr_info("cpu %d has no node %d or node-local memory\n",
                        cpu, node);
                pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
                         cpu, size, __pa(ptr));
        } else {
                ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
                                                   size, align, goal);
                pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
                         "%016lx\n", cpu, size, node, __pa(ptr));
        }
        return ptr;
#else
        return __alloc_bootmem_nopanic(size, align, goal);
#endif
}

/*
 * Remap allocator
 *
 * This allocator uses a PMD page as the unit.  A PMD page is allocated
 * for each cpu and each is remapped into the vmalloc area using a PMD
 * mapping.  As a PMD page is quite large, only part of it is used for
 * the first chunk.  The unused part is returned to the bootmem
 * allocator.
 *
 * So, the PMD pages are mapped twice - once in the physical mapping
 * and once in the vmalloc area for the first percpu chunk.  The double
 * mapping adds pressure for one more PMD TLB entry, but is still much
 * better than using only 4k mappings while remaining NUMA friendly.
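 *
 * The unit for each cpu is allocated node-locally with
 * pcpu_alloc_bootmem(), and the vmalloc-side PMD mappings are set up
 * below in setup_pcpu_remap() via populate_extra_pmd() and set_pmd().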
 */
#ifdef CONFIG_NEED_MULTIPLE_NODES
static size_t pcpur_size __initdata;
static void **pcpur_ptrs __initdata;

static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
{
        size_t off = (size_t)pageno << PAGE_SHIFT;

        if (off >= pcpur_size)
                return NULL;

        return virt_to_page(pcpur_ptrs[cpu] + off);
}

static ssize_t __init setup_pcpu_remap(size_t static_size)
{
        static struct vm_struct vm;
        pg_data_t *last;
        size_t ptrs_size, dyn_size;
        unsigned int cpu;
        ssize_t ret;

        /*
         * If large pages aren't supported, there's no benefit in doing
         * this.  Also, on non-NUMA, embedding is better.
         */
        if (!cpu_has_pse || pcpu_need_numa())
                return -EINVAL;

        last = NULL;
        for_each_possible_cpu(cpu) {
                int node = early_cpu_to_node(cpu);

                if (node_online(node) && NODE_DATA(node) &&
                    last && last != NODE_DATA(node))
                        goto proceed;

                last = NODE_DATA(node);
        }
        return -EINVAL;

proceed:
        /*
         * Currently supports only a single page.  Supporting multiple
         * pages won't be too difficult if it ever becomes necessary.
         */
        pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
                               PERCPU_DYNAMIC_RESERVE);
        if (pcpur_size > PMD_SIZE) {
                pr_warning("PERCPU: static data is larger than large page, "
                           "can't use large page\n");
                return -EINVAL;
        }
        dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;

        /* allocate pointer array and alloc large pages */
        ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
        pcpur_ptrs = alloc_bootmem(ptrs_size);

        for_each_possible_cpu(cpu) {
                pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
                if (!pcpur_ptrs[cpu])
                        goto enomem;

                /*
                 * Only use pcpur_size bytes and give back the rest.
                 *
                 * Ingo: The 2MB up-rounding of the bootmem allocation is
                 * needed to make sure the partial 2MB page is still fully
                 * RAM - it's not well-specified to have a PAT-incompatible
                 * area (unmapped RAM, device memory, etc.) in that hole.
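                 *
                 * The tail beyond pcpur_size is handed back to bootmem
                 * right below; the memory stays in the kernel's linear
                 * mapping either way.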
                 */
                free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
                             PMD_SIZE - pcpur_size);

                memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
        }

        /* allocate address and map */
        vm.flags = VM_ALLOC;
        vm.size = num_possible_cpus() * PMD_SIZE;
        vm_area_register_early(&vm, PMD_SIZE);

        for_each_possible_cpu(cpu) {
                pmd_t *pmd;

                pmd = populate_extra_pmd((unsigned long)vm.addr
                                         + cpu * PMD_SIZE);
                set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
                                     PAGE_KERNEL_LARGE));
        }

        /* we're ready, commit */
        pr_info("PERCPU: Remapped at %p with large pages, static data "
                "%zu bytes\n", vm.addr, static_size);

        ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
                                     PERCPU_FIRST_CHUNK_RESERVE,
                                     PMD_SIZE, dyn_size, vm.addr, NULL);
        goto out_free_ar;

enomem:
        for_each_possible_cpu(cpu)
                if (pcpur_ptrs[cpu])
                        free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
        ret = -ENOMEM;
out_free_ar:
        free_bootmem(__pa(pcpur_ptrs), ptrs_size);
        return ret;
}
#else
static ssize_t __init setup_pcpu_remap(size_t static_size)
{
        return -EINVAL;
}
#endif

/*
 * Embedding allocator
 *
 * The first chunk is sized to just contain the static area plus
 * module and dynamic reserves, allocated as a contiguous area using
 * the bootmem allocator, and used as-is without being mapped into the
 * vmalloc area.  This enables the first chunk to piggyback on the
 * linear physical PMD mapping and doesn't add any additional pressure
 * to the TLB.  Note that if the needed size is smaller than the
 * minimum unit size, the leftover is returned to the bootmem
 * allocator.
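 *
 * setup_pcpu_embed() below bails out when PSE is unavailable or when
 * NUMA placement matters (see the cpu_has_pse and pcpu_need_numa()
 * checks).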
 */
static void *pcpue_ptr __initdata;
static size_t pcpue_size __initdata;
static size_t pcpue_unit_size __initdata;

static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
        size_t off = (size_t)pageno << PAGE_SHIFT;

        if (off >= pcpue_size)
                return NULL;

        return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
}

static ssize_t __init setup_pcpu_embed(size_t static_size)
{
        unsigned int cpu;
        size_t dyn_size;

        /*
         * If large pages aren't supported, there's no benefit in doing
         * this.  Also, embedding allocation doesn't play well with
         * NUMA.
         */
        if (!cpu_has_pse || pcpu_need_numa())
                return -EINVAL;

        /* allocate and copy */
        pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
                               PERCPU_DYNAMIC_RESERVE);
        pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
        dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;

        pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
                                       PAGE_SIZE);
        if (!pcpue_ptr)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                void *ptr = pcpue_ptr + cpu * pcpue_unit_size;

                free_bootmem(__pa(ptr + pcpue_size),
                             pcpue_unit_size - pcpue_size);
                memcpy(ptr, __per_cpu_load, static_size);
        }

        /* we're ready, commit */
        pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
                pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);

        return pcpu_setup_first_chunk(pcpue_get_page, static_size,
                                      PERCPU_FIRST_CHUNK_RESERVE,
                                      pcpue_unit_size, dyn_size,
                                      pcpue_ptr, NULL);
}

/*
 * 4k page allocator
 *
 * This is the basic allocator.  The static percpu area is allocated
 * page-by-page and most of the initialization is done by the generic
 * setup function.
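 *
 * It serves as the fallback when neither the remap nor the embedding
 * allocator can be used, at the cost of mapping the first chunk with
 * 4k pages only.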
 */
static struct page **pcpu4k_pages __initdata;
static int pcpu4k_nr_static_pages __initdata;

static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
        if (pageno < pcpu4k_nr_static_pages)
                return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
        return NULL;
}

static void __init pcpu4k_populate_pte(unsigned long addr)
{
        populate_extra_pte(addr);
}

static ssize_t __init setup_pcpu_4k(size_t static_size)
{
        size_t pages_size;
        unsigned int cpu;
        int i, j;
        ssize_t ret;

        pcpu4k_nr_static_pages = PFN_UP(static_size);

        /* unaligned allocations can't be freed, round up to page size */
        pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
                               * sizeof(pcpu4k_pages[0]));
        pcpu4k_pages = alloc_bootmem(pages_size);

        /* allocate and copy */
        j = 0;
        for_each_possible_cpu(cpu)
                for (i = 0; i < pcpu4k_nr_static_pages; i++) {
                        void *ptr;

                        ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
                        if (!ptr)
                                goto enomem;

                        memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
                        pcpu4k_pages[j++] = virt_to_page(ptr);
                }

        /* we're ready, commit */
        pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
                pcpu4k_nr_static_pages, static_size);

        ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
                                     PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL,
                                     pcpu4k_populate_pte);
        goto out_free_ar;

enomem:
        while (--j >= 0)
                free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
        ret = -ENOMEM;
out_free_ar:
        free_bootmem(__pa(pcpu4k_pages), pages_size);
        return ret;
}

static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
        struct desc_struct gdt;

        pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
                        0x2 | DESCTYPE_S, 0x8);
        gdt.s = 1;
        write_gdt_entry(get_cpu_gdt_table(cpu),
                        GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
}

/*
 * Great future plan:
 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
 * Always point %gs to its beginning
 */
void __init setup_per_cpu_areas(void)
{
        size_t static_size = __per_cpu_end - __per_cpu_start;
        unsigned int cpu;
        unsigned long delta;
        size_t pcpu_unit_size;
        ssize_t ret;

        pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
                NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

        /*
         * Allocate percpu area.  If PSE is supported, try to make use
         * of large page mappings.  Please read comments on top of
         * each allocator for details.
         */
        ret = setup_pcpu_remap(static_size);
        if (ret < 0)
                ret = setup_pcpu_embed(static_size);
        if (ret < 0)
                ret = setup_pcpu_4k(static_size);
        if (ret < 0)
                panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
                      static_size, ret);

        pcpu_unit_size = ret;

        /* alrighty, percpu areas up and running */
        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
        for_each_possible_cpu(cpu) {
                per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
                per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
                per_cpu(cpu_number, cpu) = cpu;
                setup_percpu_segment(cpu);
                setup_stack_canary_segment(cpu);
                /*
                 * Copy data used in early init routines from the
                 * initial arrays to the per cpu data areas.  These
                 * arrays then become expendable and the *_early_ptr's
                 * are zeroed indicating that the static arrays are
                 * gone.
                 */
#ifdef CONFIG_X86_LOCAL_APIC
                per_cpu(x86_cpu_to_apicid, cpu) =
                        early_per_cpu_map(x86_cpu_to_apicid, cpu);
                per_cpu(x86_bios_cpu_apicid, cpu) =
                        early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
                per_cpu(irq_stack_ptr, cpu) =
                        per_cpu(irq_stack_union.irq_stack, cpu) +
                        IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
                per_cpu(x86_cpu_to_node_map, cpu) =
                        early_per_cpu_map(x86_cpu_to_node_map, cpu);
#endif
#endif
                /*
                 * Up to this point, the boot CPU has been using the
                 * .data.init area.  Reload any changed state for the
                 * boot CPU.
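                 * switch_to_new_gdt() reloads the boot CPU's GDT and
                 * percpu segment base so that subsequent percpu
                 * accesses hit the newly allocated area.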
                 */
                if (cpu == boot_cpu_id)
                        switch_to_new_gdt(cpu);
        }

        /* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
        early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
        early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
        early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

        /* Setup node to cpumask map */
        setup_node_to_cpumask_map();

        /* Setup cpu initialized, callin, callout masks */
        setup_cpu_local_masks();
}