// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include <asm/shmparam.h>

#include "memmap.h"
#include "kbuf.h"
#include "rsrc.h"
#include "zcrx.h"

static bool io_mem_alloc_compound(struct page **pages, int nr_pages,
				  size_t size, gfp_t gfp)
{
	struct page *page;
	int i, order;

	order = get_order(size);
	if (order > MAX_PAGE_ORDER)
		return false;
	else if (order)
		gfp |= __GFP_COMP;

	page = alloc_pages(gfp, order);
	if (!page)
		return false;

	for (i = 0; i < nr_pages; i++)
		pages[i] = page + i;

	return true;
}

struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct page **pages;
	int ret;

	if (check_add_overflow(uaddr, len, &end))
		return ERR_PTR(-EOVERFLOW);
	if (check_add_overflow(end, PAGE_SIZE - 1, &end))
		return ERR_PTR(-EOVERFLOW);

	end = end >> PAGE_SHIFT;
	start = uaddr >> PAGE_SHIFT;
	nr_pages = end - start;
	if (WARN_ON_ONCE(!nr_pages))
		return ERR_PTR(-EINVAL);
	if (WARN_ON_ONCE(nr_pages > INT_MAX))
		return ERR_PTR(-EOVERFLOW);

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
				  pages);
	/* success, mapped all pages */
	if (ret == nr_pages) {
		*npages = nr_pages;
		return pages;
	}

	/* partial map, or didn't map anything */
	if (ret >= 0) {
		/* if we did partial map, release any pages we did get */
		if (ret)
			unpin_user_pages(pages, ret);
		ret = -EFAULT;
	}
	kvfree(pages);
	return ERR_PTR(ret);
}

enum {
	/* memory was vmap'ed for the kernel, freeing the region vunmap's it */
	IO_REGION_F_VMAP		= 1,
	/* memory is provided by user and pinned by the kernel */
	IO_REGION_F_USER_PROVIDED	= 2,
	/* only the first page in the array is ref'ed */
	IO_REGION_F_SINGLE_REF		= 4,
};

void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
{
	if (mr->pages) {
		long nr_refs = mr->nr_pages;

		if (mr->flags & IO_REGION_F_SINGLE_REF)
			nr_refs = 1;

		if (mr->flags & IO_REGION_F_USER_PROVIDED)
			unpin_user_pages(mr->pages, nr_refs);
		else
			release_pages(mr->pages, nr_refs);

		kvfree(mr->pages);
	}
	if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr)
		vunmap(mr->ptr);
	if (mr->nr_pages && user)
		__io_unaccount_mem(user, mr->nr_pages);

	memset(mr, 0, sizeof(*mr));
}

static int io_region_init_ptr(struct io_mapped_region *mr)
{
	struct io_imu_folio_data ifd;
	void *ptr;

	if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
		if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) {
			mr->ptr = page_address(mr->pages[0]);
			return 0;
		}
	}
	ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL);
	if (!ptr)
		return -ENOMEM;

	mr->ptr = ptr;
	mr->flags |= IO_REGION_F_VMAP;
	return 0;
}
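
/*
 * A region is backed either by user memory that io_region_pin_pages() pins
 * long-term, or by kernel pages that io_region_allocate_pages() allocates
 * (preferring one compound allocation, in which case only the head page
 * holds a reference, see IO_REGION_F_SINGLE_REF).
 *
 * Illustrative sketch only, not part of this file: a user-backed region
 * descriptor, as validated by io_create_region() below, would be filled in
 * by the registering side roughly like this (registration uAPI not shown):
 *
 *	struct io_uring_region_desc rd = {
 *		.user_addr	= (__u64)(uintptr_t)buf,
 *		.size		= buf_size,
 *		.flags		= IORING_MEM_REGION_TYPE_USER,
 *	};
 *
 * with buf and buf_size page-aligned and non-zero, and all other fields
 * (id, mmap_offset, __resv[]) zero on input. For kernel-allocated regions
 * the offset to mmap() is returned in rd.mmap_offset instead.
 */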
static int io_region_pin_pages(struct io_mapped_region *mr,
			       struct io_uring_region_desc *reg)
{
	size_t size = io_region_size(mr);
	struct page **pages;
	int nr_pages;

	pages = io_pin_pages(reg->user_addr, size, &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);
	if (WARN_ON_ONCE(nr_pages != mr->nr_pages))
		return -EFAULT;

	mr->pages = pages;
	mr->flags |= IO_REGION_F_USER_PROVIDED;
	return 0;
}

static int io_region_allocate_pages(struct io_mapped_region *mr,
				    struct io_uring_region_desc *reg,
				    unsigned long mmap_offset)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
	size_t size = io_region_size(mr);
	unsigned long nr_allocated;
	struct page **pages;

	pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
	if (!pages)
		return -ENOMEM;

	if (io_mem_alloc_compound(pages, mr->nr_pages, size, gfp)) {
		mr->flags |= IO_REGION_F_SINGLE_REF;
		goto done;
	}

	nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE,
					     mr->nr_pages, pages);
	if (nr_allocated != mr->nr_pages) {
		if (nr_allocated)
			release_pages(pages, nr_allocated);
		kvfree(pages);
		return -ENOMEM;
	}
done:
	reg->mmap_offset = mmap_offset;
	mr->pages = pages;
	return 0;
}

int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
		     struct io_uring_region_desc *reg,
		     unsigned long mmap_offset)
{
	int nr_pages, ret;
	u64 end;

	if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
		return -EFAULT;
	if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
		return -EINVAL;
	if (reg->flags & ~IORING_MEM_REGION_TYPE_USER)
		return -EINVAL;
	/* user_addr should be set IFF it's a user memory backed region */
	if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr)
		return -EFAULT;
	if (!reg->size || reg->mmap_offset || reg->id)
		return -EINVAL;
	if ((reg->size >> PAGE_SHIFT) > INT_MAX)
		return -E2BIG;
	if ((reg->user_addr | reg->size) & ~PAGE_MASK)
		return -EINVAL;
	if (check_add_overflow(reg->user_addr, reg->size, &end))
		return -EOVERFLOW;

	nr_pages = reg->size >> PAGE_SHIFT;
	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}
	mr->nr_pages = nr_pages;

	if (reg->flags & IORING_MEM_REGION_TYPE_USER)
		ret = io_region_pin_pages(mr, reg);
	else
		ret = io_region_allocate_pages(mr, reg, mmap_offset);
	if (ret)
		goto out_free;

	ret = io_region_init_ptr(mr);
	if (ret)
		goto out_free;
	return 0;
out_free:
	io_free_region(ctx->user, mr);
	return ret;
}

static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
						   loff_t pgoff)
{
	loff_t offset = pgoff << PAGE_SHIFT;
	unsigned int id;

	switch (offset & IORING_OFF_MMAP_MASK) {
	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
		return &ctx->ring_region;
	case IORING_OFF_SQES:
		return &ctx->sq_region;
	case IORING_OFF_PBUF_RING:
		id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
		return io_pbuf_get_region(ctx, id);
	case IORING_MAP_OFF_PARAM_REGION:
		return &ctx->param_region;
	case IORING_MAP_OFF_ZCRX_REGION:
		id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT;
		return io_zcrx_get_region(ctx, id);
	}
	return NULL;
}

static void *io_region_validate_mmap(struct io_ring_ctx *ctx,
				     struct io_mapped_region *mr)
{
	lockdep_assert_held(&ctx->mmap_lock);

	if (!io_region_is_set(mr))
		return ERR_PTR(-EINVAL);
	if (mr->flags & IO_REGION_F_USER_PROVIDED)
		return ERR_PTR(-EINVAL);

	return io_region_get_ptr(mr);
}
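
/*
 * The fixed mmap offset doubles as a region selector: the bits covered by
 * IORING_OFF_MMAP_MASK pick the region class (SQ/CQ rings, SQE array,
 * provided buffer rings, the parameter region, zcrx areas), and for the
 * pbuf/zcrx cases the remaining bits carry an id, shifted by
 * IORING_OFF_PBUF_SHIFT / IORING_OFF_ZCRX_SHIFT, as decoded in
 * io_mmap_get_region() above.
 *
 * Illustrative sketch only (userspace side, not part of this file): mapping
 * a provided buffer ring for buffer group bgid looks roughly like
 *
 *	off_t off = IORING_OFF_PBUF_RING |
 *		    ((off_t)bgid << IORING_OFF_PBUF_SHIFT);
 *	void *ring = mmap(NULL, ring_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED | MAP_POPULATE, ring_fd, off);
 */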
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
					    size_t sz)
{
	struct io_ring_ctx *ctx = file->private_data;
	struct io_mapped_region *region;

	region = io_mmap_get_region(ctx, pgoff);
	if (!region)
		return ERR_PTR(-EINVAL);
	return io_region_validate_mmap(ctx, region);
}

#ifdef CONFIG_MMU

static int io_region_mmap(struct io_ring_ctx *ctx,
			  struct io_mapped_region *mr,
			  struct vm_area_struct *vma,
			  unsigned max_pages)
{
	unsigned long nr_pages = min(mr->nr_pages, max_pages);

	vm_flags_set(vma, VM_DONTEXPAND);
	return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages);
}

__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct io_ring_ctx *ctx = file->private_data;
	size_t sz = vma->vm_end - vma->vm_start;
	long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned int page_limit = UINT_MAX;
	struct io_mapped_region *region;
	void *ptr;

	guard(mutex)(&ctx->mmap_lock);

	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	switch (offset & IORING_OFF_MMAP_MASK) {
	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
		page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
		break;
	}

	region = io_mmap_get_region(ctx, vma->vm_pgoff);
	return io_region_mmap(ctx, region, vma, page_limit);
}

unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
					 unsigned long len, unsigned long pgoff,
					 unsigned long flags)
{
	struct io_ring_ctx *ctx = filp->private_data;
	void *ptr;

	/*
	 * Do not allow mapping to a user-provided address, to avoid breaking
	 * the aliasing rules. Userspace is not able to guess the offset
	 * address of a kernel-allocated memory area anyway.
	 */
	if (addr)
		return -EINVAL;

	guard(mutex)(&ctx->mmap_lock);

	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
	if (IS_ERR(ptr))
		return -ENOMEM;

	/*
	 * Some architectures have strong cache aliasing requirements.
	 * For such architectures we need a coherent mapping which aliases
	 * kernel memory *and* userspace memory. To achieve that:
	 * - use a NULL file pointer to reference physical memory, and
	 * - use the kernel virtual address of the shared io_uring context
	 *   (instead of the userspace-provided address, which has to be 0UL
	 *   anyway), and
	 * - use the same pgoff which the get_unmapped_area() uses to
	 *   calculate the page colouring.
	 * For architectures without such aliasing requirements, the
	 * architecture will return any suitable mapping because addr is 0.
	 */
	filp = NULL;
	flags |= MAP_SHARED;
	pgoff = 0;	/* has been translated to ptr above */
#ifdef SHM_COLOUR
	addr = (uintptr_t) ptr;
	pgoff = addr >> PAGE_SHIFT;
#else
	addr = 0UL;
#endif
	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
}

#else /* !CONFIG_MMU */
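
/*
 * Without an MMU, kernel pages cannot be remapped into a private userspace
 * mapping: only shared mappings are accepted, and io_uring_get_unmapped_area()
 * below hands the kernel address of the region straight back to userspace.
 */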
int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
}

unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
					 unsigned long len, unsigned long pgoff,
					 unsigned long flags)
{
	struct io_ring_ctx *ctx = file->private_data;
	void *ptr;

	guard(mutex)(&ctx->mmap_lock);

	ptr = io_uring_validate_mmap_request(file, pgoff, len);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	return (unsigned long) ptr;
}

#endif /* !CONFIG_MMU */