1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * mm/percpu-vm.c - vmalloc area based chunk allocation 4 * 5 * Copyright (C) 2010 SUSE Linux Products GmbH 6 * Copyright (C) 2010 Tejun Heo <tj@kernel.org> 7 * 8 * Chunks are mapped into vmalloc areas and populated page by page. 9 * This is the default chunk allocator. 10 */ 11 #include "internal.h" 12 13 static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, 14 unsigned int cpu, int page_idx) 15 { 16 /* must not be used on pre-mapped chunk */ 17 WARN_ON(chunk->immutable); 18 19 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); 20 } 21 22 /** 23 * pcpu_get_pages - get temp pages array 24 * 25 * Returns pointer to array of pointers to struct page which can be indexed 26 * with pcpu_page_idx(). Note that there is only one array and accesses 27 * should be serialized by pcpu_alloc_mutex. 28 * 29 * RETURNS: 30 * Pointer to temp pages array on success. 31 */ 32 static struct page **pcpu_get_pages(void) 33 { 34 static struct page **pages; 35 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); 36 37 lockdep_assert_held(&pcpu_alloc_mutex); 38 39 if (!pages) 40 pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); 41 return pages; 42 } 43 44 /** 45 * pcpu_free_pages - free pages which were allocated for @chunk 46 * @chunk: chunk pages were allocated for 47 * @pages: array of pages to be freed, indexed by pcpu_page_idx() 48 * @page_start: page index of the first page to be freed 49 * @page_end: page index of the last page to be freed + 1 50 * 51 * Free pages [@page_start and @page_end) in @pages for all units. 52 * The pages were allocated for @chunk. 53 */ 54 static void pcpu_free_pages(struct pcpu_chunk *chunk, 55 struct page **pages, int page_start, int page_end) 56 { 57 unsigned int cpu; 58 int i; 59 60 for_each_possible_cpu(cpu) { 61 for (i = page_start; i < page_end; i++) { 62 struct page *page = pages[pcpu_page_idx(cpu, i)]; 63 64 if (page) 65 __free_page(page); 66 } 67 } 68 } 69 70 /** 71 * pcpu_alloc_pages - allocates pages for @chunk 72 * @chunk: target chunk 73 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() 74 * @page_start: page index of the first page to be allocated 75 * @page_end: page index of the last page to be allocated + 1 76 * @gfp: allocation flags passed to the underlying allocator 77 * 78 * Allocate pages [@page_start,@page_end) into @pages for all units. 79 * The allocation is for @chunk. Percpu core doesn't care about the 80 * content of @pages and will pass it verbatim to pcpu_map_pages(). 81 */ 82 static int pcpu_alloc_pages(struct pcpu_chunk *chunk, 83 struct page **pages, int page_start, int page_end, 84 gfp_t gfp) 85 { 86 unsigned int cpu, tcpu; 87 int i; 88 89 gfp |= __GFP_HIGHMEM; 90 91 for_each_possible_cpu(cpu) { 92 for (i = page_start; i < page_end; i++) { 93 struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; 94 95 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); 96 if (!*pagep) 97 goto err; 98 } 99 } 100 return 0; 101 102 err: 103 while (--i >= page_start) 104 __free_page(pages[pcpu_page_idx(cpu, i)]); 105 106 for_each_possible_cpu(tcpu) { 107 if (tcpu == cpu) 108 break; 109 for (i = page_start; i < page_end; i++) 110 __free_page(pages[pcpu_page_idx(tcpu, i)]); 111 } 112 return -ENOMEM; 113 } 114 115 /** 116 * pcpu_pre_unmap_flush - flush cache prior to unmapping 117 * @chunk: chunk the regions to be flushed belongs to 118 * @page_start: page index of the first page to be flushed 119 * @page_end: page index of the last page to be flushed + 1 120 * 121 * Pages in [@page_start,@page_end) of @chunk are about to be 122 * unmapped. Flush cache. As each flushing trial can be very 123 * expensive, issue flush on the whole region at once rather than 124 * doing it for each cpu. This could be an overkill but is more 125 * scalable. 126 */ 127 static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, 128 int page_start, int page_end) 129 { 130 flush_cache_vunmap( 131 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), 132 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); 133 } 134 135 static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) 136 { 137 vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT)); 138 } 139 140 /** 141 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk 142 * @chunk: chunk of interest 143 * @pages: pages array which can be used to pass information to free 144 * @page_start: page index of the first page to unmap 145 * @page_end: page index of the last page to unmap + 1 146 * 147 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 148 * Corresponding elements in @pages were cleared by the caller and can 149 * be used to carry information to pcpu_free_pages() which will be 150 * called after all unmaps are finished. The caller should call 151 * proper pre/post flush functions. 152 */ 153 static void pcpu_unmap_pages(struct pcpu_chunk *chunk, 154 struct page **pages, int page_start, int page_end) 155 { 156 unsigned int cpu; 157 int i; 158 159 for_each_possible_cpu(cpu) { 160 for (i = page_start; i < page_end; i++) { 161 struct page *page; 162 163 page = pcpu_chunk_page(chunk, cpu, i); 164 WARN_ON(!page); 165 pages[pcpu_page_idx(cpu, i)] = page; 166 } 167 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), 168 page_end - page_start); 169 } 170 } 171 172 /** 173 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping 174 * @chunk: pcpu_chunk the regions to be flushed belong to 175 * @page_start: page index of the first page to be flushed 176 * @page_end: page index of the last page to be flushed + 1 177 * 178 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush 179 * TLB for the regions. This can be skipped if the area is to be 180 * returned to vmalloc as vmalloc will handle TLB flushing lazily. 181 * 182 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once 183 * for the whole region. 184 */ 185 static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, 186 int page_start, int page_end) 187 { 188 flush_tlb_kernel_range( 189 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), 190 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); 191 } 192 193 static int __pcpu_map_pages(unsigned long addr, struct page **pages, 194 int nr_pages) 195 { 196 return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), 197 PAGE_KERNEL, pages, PAGE_SHIFT); 198 } 199 200 /** 201 * pcpu_map_pages - map pages into a pcpu_chunk 202 * @chunk: chunk of interest 203 * @pages: pages array containing pages to be mapped 204 * @page_start: page index of the first page to map 205 * @page_end: page index of the last page to map + 1 206 * 207 * For each cpu, map pages [@page_start,@page_end) into @chunk. The 208 * caller is responsible for calling pcpu_post_map_flush() after all 209 * mappings are complete. 210 * 211 * This function is responsible for setting up whatever is necessary for 212 * reverse lookup (addr -> chunk). 213 */ 214 static int pcpu_map_pages(struct pcpu_chunk *chunk, 215 struct page **pages, int page_start, int page_end) 216 { 217 unsigned int cpu, tcpu; 218 int i, err; 219 220 for_each_possible_cpu(cpu) { 221 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), 222 &pages[pcpu_page_idx(cpu, page_start)], 223 page_end - page_start); 224 if (err < 0) 225 goto err; 226 227 for (i = page_start; i < page_end; i++) 228 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], 229 chunk); 230 } 231 return 0; 232 err: 233 for_each_possible_cpu(tcpu) { 234 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), 235 page_end - page_start); 236 if (tcpu == cpu) 237 break; 238 } 239 pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); 240 return err; 241 } 242 243 /** 244 * pcpu_post_map_flush - flush cache after mapping 245 * @chunk: pcpu_chunk the regions to be flushed belong to 246 * @page_start: page index of the first page to be flushed 247 * @page_end: page index of the last page to be flushed + 1 248 * 249 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush 250 * cache. 251 * 252 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once 253 * for the whole region. 254 */ 255 static void pcpu_post_map_flush(struct pcpu_chunk *chunk, 256 int page_start, int page_end) 257 { 258 flush_cache_vmap( 259 pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), 260 pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); 261 } 262 263 /** 264 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk 265 * @chunk: chunk of interest 266 * @page_start: the start page 267 * @page_end: the end page 268 * @gfp: allocation flags passed to the underlying memory allocator 269 * 270 * For each cpu, populate and map pages [@page_start,@page_end) into 271 * @chunk. 272 * 273 * CONTEXT: 274 * pcpu_alloc_mutex, does GFP_KERNEL allocation. 275 */ 276 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, 277 int page_start, int page_end, gfp_t gfp) 278 { 279 struct page **pages; 280 281 pages = pcpu_get_pages(); 282 if (!pages) 283 return -ENOMEM; 284 285 if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) 286 return -ENOMEM; 287 288 if (pcpu_map_pages(chunk, pages, page_start, page_end)) { 289 pcpu_free_pages(chunk, pages, page_start, page_end); 290 return -ENOMEM; 291 } 292 pcpu_post_map_flush(chunk, page_start, page_end); 293 294 return 0; 295 } 296 297 /** 298 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 299 * @chunk: chunk to depopulate 300 * @page_start: the start page 301 * @page_end: the end page 302 * 303 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 304 * from @chunk. 305 * 306 * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the 307 * region back to vmalloc() which will lazily flush the tlb. 308 * 309 * CONTEXT: 310 * pcpu_alloc_mutex. 311 */ 312 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, 313 int page_start, int page_end) 314 { 315 struct page **pages; 316 317 /* 318 * If control reaches here, there must have been at least one 319 * successful population attempt so the temp pages array must 320 * be available now. 321 */ 322 pages = pcpu_get_pages(); 323 BUG_ON(!pages); 324 325 /* unmap and free */ 326 pcpu_pre_unmap_flush(chunk, page_start, page_end); 327 328 pcpu_unmap_pages(chunk, pages, page_start, page_end); 329 330 pcpu_free_pages(chunk, pages, page_start, page_end); 331 } 332 333 static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) 334 { 335 struct pcpu_chunk *chunk; 336 struct vm_struct **vms; 337 338 chunk = pcpu_alloc_chunk(gfp); 339 if (!chunk) 340 return NULL; 341 342 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, 343 pcpu_nr_groups, pcpu_atom_size); 344 if (!vms) { 345 pcpu_free_chunk(chunk); 346 return NULL; 347 } 348 349 chunk->data = vms; 350 chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; 351 352 pcpu_stats_chunk_alloc(); 353 trace_percpu_create_chunk(chunk->base_addr); 354 355 return chunk; 356 } 357 358 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) 359 { 360 if (!chunk) 361 return; 362 363 pcpu_stats_chunk_dealloc(); 364 trace_percpu_destroy_chunk(chunk->base_addr); 365 366 if (chunk->data) 367 pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); 368 pcpu_free_chunk(chunk); 369 } 370 371 static struct page *pcpu_addr_to_page(void *addr) 372 { 373 return vmalloc_to_page(addr); 374 } 375 376 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) 377 { 378 /* no extra restriction */ 379 return 0; 380 } 381 382 /** 383 * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim 384 * @chunk: chunk of interest 385 * 386 * This is the entry point for percpu reclaim. If a chunk qualifies, it is then 387 * isolated and managed in separate lists at the back of pcpu_slot: sidelined 388 * and to_depopulate respectively. The to_depopulate list holds chunks slated 389 * for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once 390 * they are on this list. Once depopulated, they are moved onto the sidelined 391 * list which enables them to be pulled back in for allocation if no other chunk 392 * can suffice the allocation. 393 */ 394 static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk) 395 { 396 /* do not reclaim either the first chunk or reserved chunk */ 397 if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk) 398 return false; 399 400 /* 401 * If it is isolated, it may be on the sidelined list so move it back to 402 * the to_depopulate list. If we hit at least 1/4 pages empty pages AND 403 * there is no system-wide shortage of empty pages aside from this 404 * chunk, move it to the to_depopulate list. 405 */ 406 return ((chunk->isolated && chunk->nr_empty_pop_pages) || 407 (pcpu_nr_empty_pop_pages > 408 (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) && 409 chunk->nr_empty_pop_pages >= chunk->nr_pages / 4)); 410 } 411