xref: /linux/io_uring/memmap.c (revision 6dfafbd0299a60bfb5d5e277fdf100037c7ded07)
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include <asm/shmparam.h>

#include "memmap.h"
#include "kbuf.h"
#include "rsrc.h"
#include "zcrx.h"

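/*
 * Try to satisfy the allocation with a single (compound) page of the
 * required order. On success, pages[] is filled with pointers to the
 * consecutive pages of that allocation; only the head page holds a
 * reference.
 */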
static bool io_mem_alloc_compound(struct page **pages, int nr_pages,
				  size_t size, gfp_t gfp)
{
	struct page *page;
	int i, order;

	order = get_order(size);
	if (order > MAX_PAGE_ORDER)
		return false;
	else if (order)
		gfp |= __GFP_COMP;

	page = alloc_pages(gfp, order);
	if (!page)
		return false;

	for (i = 0; i < nr_pages; i++)
		pages[i] = page + i;

	return true;
}

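/*
 * Pin the user memory backing [uaddr, uaddr + len) for long-term use and
 * return the pinned page array, with *npages set to its size. A partial
 * pin is undone and reported as -EFAULT; other failures are returned as
 * an ERR_PTR.
 */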
struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct page **pages;
	int ret;

	if (check_add_overflow(uaddr, len, &end))
		return ERR_PTR(-EOVERFLOW);
	if (check_add_overflow(end, PAGE_SIZE - 1, &end))
		return ERR_PTR(-EOVERFLOW);

	end = end >> PAGE_SHIFT;
	start = uaddr >> PAGE_SHIFT;
	nr_pages = end - start;
	if (WARN_ON_ONCE(!nr_pages))
		return ERR_PTR(-EINVAL);
	if (WARN_ON_ONCE(nr_pages > INT_MAX))
		return ERR_PTR(-EOVERFLOW);

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
					pages);
	/* success, mapped all pages */
	if (ret == nr_pages) {
		*npages = nr_pages;
		return pages;
	}

	/* partial map, or didn't map anything */
	if (ret >= 0) {
		/* if we did partial map, release any pages we did get */
		if (ret)
			unpin_user_pages(pages, ret);
		ret = -EFAULT;
	}
	kvfree(pages);
	return ERR_PTR(ret);
}

enum {
	/* memory was vmap'ed for the kernel, freeing the region vunmap's it */
	IO_REGION_F_VMAP			= 1,
	/* memory is provided by user and pinned by the kernel */
	IO_REGION_F_USER_PROVIDED		= 2,
	/* only the first page in the array is ref'ed */
	IO_REGION_F_SINGLE_REF			= 4,
};

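/*
 * Tear down a region: drop the page references (only one for single-ref
 * compound-backed regions; user-provided pages are unpinned instead of
 * released), free the page array, remove any kernel vmap and unaccount
 * the memory, then clear the region struct.
 */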
void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
{
	if (mr->pages) {
		long nr_refs = mr->nr_pages;

		if (mr->flags & IO_REGION_F_SINGLE_REF)
			nr_refs = 1;

		if (mr->flags & IO_REGION_F_USER_PROVIDED)
			unpin_user_pages(mr->pages, nr_refs);
		else
			release_pages(mr->pages, nr_refs);

		kvfree(mr->pages);
	}
	if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr)
		vunmap(mr->ptr);
	if (mr->nr_pages && user)
		__io_unaccount_mem(user, mr->nr_pages);

	memset(mr, 0, sizeof(*mr));
}

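/*
 * Set up a kernel virtual address for the region. If the pages coalesce
 * into a single folio that isn't highmem, the linear mapping is used
 * directly; otherwise the pages are vmap'ed and the region is flagged so
 * the mapping is torn down on free.
 */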
static int io_region_init_ptr(struct io_mapped_region *mr)
{
	struct io_imu_folio_data ifd;
	void *ptr;

	if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
		if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) {
			mr->ptr = page_address(mr->pages[0]);
			return 0;
		}
	}
	ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL);
	if (!ptr)
		return -ENOMEM;

	mr->ptr = ptr;
	mr->flags |= IO_REGION_F_VMAP;
	return 0;
}

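/*
 * Back the region with user memory: pin the pages at reg->user_addr and
 * mark the region as user provided so they are unpinned rather than
 * released on free.
 */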
static int io_region_pin_pages(struct io_mapped_region *mr,
			       struct io_uring_region_desc *reg)
{
	size_t size = io_region_size(mr);
	struct page **pages;
	int nr_pages;

	pages = io_pin_pages(reg->user_addr, size, &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);
	if (WARN_ON_ONCE(nr_pages != mr->nr_pages))
		return -EFAULT;

	mr->pages = pages;
	mr->flags |= IO_REGION_F_USER_PROVIDED;
	return 0;
}

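/*
 * Back the region with kernel-allocated memory: prefer one compound
 * allocation (which takes a single reference for the whole range) and
 * fall back to a bulk allocation of individual pages. The mmap offset
 * for the region is stored in the descriptor.
 */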
static int io_region_allocate_pages(struct io_mapped_region *mr,
				    struct io_uring_region_desc *reg,
				    unsigned long mmap_offset)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
	size_t size = io_region_size(mr);
	unsigned long nr_allocated;
	struct page **pages;

	pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
	if (!pages)
		return -ENOMEM;

	if (io_mem_alloc_compound(pages, mr->nr_pages, size, gfp)) {
		mr->flags |= IO_REGION_F_SINGLE_REF;
		goto done;
	}

	nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE,
					     mr->nr_pages, pages);
	if (nr_allocated != mr->nr_pages) {
		if (nr_allocated)
			release_pages(pages, nr_allocated);
		kvfree(pages);
		return -ENOMEM;
	}
done:
	reg->mmap_offset = mmap_offset;
	mr->pages = pages;
	return 0;
}

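/*
 * Validate the region descriptor, account the memory against the
 * (optional) user, back the region with either pinned user pages or
 * kernel-allocated pages, and establish a kernel address for it. On
 * failure everything set up so far is released again.
 */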
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
		     struct io_uring_region_desc *reg,
		     unsigned long mmap_offset)
{
	int nr_pages, ret;
	u64 end;

	if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
		return -EFAULT;
	if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
		return -EINVAL;
	if (reg->flags & ~IORING_MEM_REGION_TYPE_USER)
		return -EINVAL;
	/* user_addr should be set IFF it's a user memory backed region */
	if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr)
		return -EFAULT;
	if (!reg->size || reg->mmap_offset || reg->id)
		return -EINVAL;
	if ((reg->size >> PAGE_SHIFT) > INT_MAX)
		return -E2BIG;
	if ((reg->user_addr | reg->size) & ~PAGE_MASK)
		return -EINVAL;
	if (check_add_overflow(reg->user_addr, reg->size, &end))
		return -EOVERFLOW;

	nr_pages = reg->size >> PAGE_SHIFT;
	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}
	mr->nr_pages = nr_pages;

	if (reg->flags & IORING_MEM_REGION_TYPE_USER)
		ret = io_region_pin_pages(mr, reg);
	else
		ret = io_region_allocate_pages(mr, reg, mmap_offset);
	if (ret)
		goto out_free;

	ret = io_region_init_ptr(mr);
	if (ret)
		goto out_free;
	return 0;
out_free:
	io_free_region(ctx->user, mr);
	return ret;
}

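/*
 * Translate an mmap offset into the io_mapped_region it refers to, or
 * NULL if the offset doesn't name a known region.
 */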
static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
						   loff_t pgoff)
{
	loff_t offset = pgoff << PAGE_SHIFT;
	unsigned int id;

	switch (offset & IORING_OFF_MMAP_MASK) {
	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
		return &ctx->ring_region;
	case IORING_OFF_SQES:
		return &ctx->sq_region;
	case IORING_OFF_PBUF_RING:
		id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
		return io_pbuf_get_region(ctx, id);
	case IORING_MAP_OFF_PARAM_REGION:
		return &ctx->param_region;
	case IORING_MAP_OFF_ZCRX_REGION:
		id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT;
		return io_zcrx_get_region(ctx, id);
	}
	return NULL;
}

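/*
 * Only regions that have been fully set up and are backed by
 * kernel-allocated memory may be mmap'ed; user-provided regions are
 * rejected.
 */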
static void *io_region_validate_mmap(struct io_ring_ctx *ctx,
				     struct io_mapped_region *mr)
{
	lockdep_assert_held(&ctx->mmap_lock);

	if (!io_region_is_set(mr))
		return ERR_PTR(-EINVAL);
	if (mr->flags & IO_REGION_F_USER_PROVIDED)
		return ERR_PTR(-EINVAL);

	return io_region_get_ptr(mr);
}

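/*
 * Resolve an mmap offset to its region and return the region's kernel
 * address, or an ERR_PTR if the offset is unknown or the region can't
 * be mmap'ed.
 */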
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
					    size_t sz)
{
	struct io_ring_ctx *ctx = file->private_data;
	struct io_mapped_region *region;

	region = io_mmap_get_region(ctx, pgoff);
	if (!region)
		return ERR_PTR(-EINVAL);
	return io_region_validate_mmap(ctx, region);
}

#ifdef CONFIG_MMU

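/*
 * Insert the region's pages into the vma, mapping at most max_pages of
 * them. VM_DONTEXPAND prevents the mapping from being grown later.
 */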
static int io_region_mmap(struct io_ring_ctx *ctx,
			  struct io_mapped_region *mr,
			  struct vm_area_struct *vma,
			  unsigned max_pages)
{
	unsigned long nr_pages = min(mr->nr_pages, max_pages);

	vm_flags_set(vma, VM_DONTEXPAND);
	return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages);
}

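/*
 * ->mmap() handler: look up and validate the target region under the
 * mmap lock, then insert its pages. For SQ/CQ ring mappings the number
 * of pages is capped by the requested mapping size.
 */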
__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct io_ring_ctx *ctx = file->private_data;
	size_t sz = vma->vm_end - vma->vm_start;
	long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned int page_limit = UINT_MAX;
	struct io_mapped_region *region;
	void *ptr;

	guard(mutex)(&ctx->mmap_lock);

	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	switch (offset & IORING_OFF_MMAP_MASK) {
	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
		page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
		break;
	}

	region = io_mmap_get_region(ctx, vma->vm_pgoff);
	return io_region_mmap(ctx, region, vma, page_limit);
}

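/*
 * ->get_unmapped_area() handler: pick a userspace address for the
 * mapping. On SHM_COLOUR architectures the region's kernel address is
 * used so the user mapping gets a compatible cache colour.
 */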
unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
					 unsigned long len, unsigned long pgoff,
					 unsigned long flags)
{
	struct io_ring_ctx *ctx = filp->private_data;
	void *ptr;

	/*
	 * Do not allow mapping to a user-provided address, to avoid breaking
	 * the aliasing rules. Userspace is not able to guess the offset
	 * address of the kernel kmalloc()ed memory area.
	 */
	if (addr)
		return -EINVAL;

	guard(mutex)(&ctx->mmap_lock);

	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
	if (IS_ERR(ptr))
		return -ENOMEM;

	/*
	 * Some architectures have strong cache aliasing requirements.
	 * For such architectures we need a coherent mapping which aliases
	 * kernel memory *and* userspace memory. To achieve that:
	 * - use a NULL file pointer to reference physical memory, and
	 * - use the kernel virtual address of the shared io_uring context
	 *   (instead of the userspace-provided address, which has to be 0UL
	 *   anyway).
	 * - use the same pgoff which the get_unmapped_area() uses to
	 *   calculate the page colouring.
	 * For architectures without such aliasing requirements, the
	 * architecture will return any suitable mapping because addr is 0.
	 */
	filp = NULL;
	flags |= MAP_SHARED;
	pgoff = 0;	/* has been translated to ptr above */
#ifdef SHM_COLOUR
	addr = (uintptr_t) ptr;
	pgoff = addr >> PAGE_SHIFT;
#else
	addr = 0UL;
#endif
	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
}

#else /* !CONFIG_MMU */

int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
}

unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}

unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
					 unsigned long len, unsigned long pgoff,
					 unsigned long flags)
{
	struct io_ring_ctx *ctx = file->private_data;
	void *ptr;

	guard(mutex)(&ctx->mmap_lock);

	ptr = io_uring_validate_mmap_request(file, pgoff, len);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	return (unsigned long) ptr;
}

#endif /* !CONFIG_MMU */
397