xref: /linux/drivers/infiniband/core/umem_odp.c (revision e49a3eac9207e9575337f70feeb29430f6f16bb7)
1 /*
2  * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <linux/types.h>
34 #include <linux/sched.h>
35 #include <linux/sched/mm.h>
36 #include <linux/sched/task.h>
37 #include <linux/pid.h>
38 #include <linux/slab.h>
39 #include <linux/export.h>
40 #include <linux/vmalloc.h>
41 #include <linux/hugetlb.h>
42 #include <linux/interval_tree.h>
43 #include <linux/hmm.h>
44 #include <linux/hmm-dma.h>
45 #include <linux/pagemap.h>
46 
47 #include <rdma/ib_umem_odp.h>
48 
49 #include "uverbs.h"
50 
51 static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp)
52 {
53 	umem_odp->is_implicit_odp = 1;
54 	umem_odp->umem.is_odp = 1;
55 	mutex_init(&umem_odp->umem_mutex);
56 }
57 
58 static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
59 			    const struct mmu_interval_notifier_ops *ops)
60 {
61 	struct ib_device *dev = umem_odp->umem.ibdev;
62 	size_t page_size = 1UL << umem_odp->page_shift;
63 	struct hmm_dma_map *map;
64 	unsigned long start;
65 	unsigned long end;
66 	size_t nr_entries;
67 	int ret = 0;
68 
69 	umem_odp->umem.is_odp = 1;
70 	mutex_init(&umem_odp->umem_mutex);
71 
72 	start = ALIGN_DOWN(umem_odp->umem.address, page_size);
73 	if (check_add_overflow(umem_odp->umem.address,
74 			       (unsigned long)umem_odp->umem.length, &end))
75 		return -EOVERFLOW;
76 	end = ALIGN(end, page_size);
77 	if (unlikely(end < page_size))
78 		return -EOVERFLOW;
79 
80 	nr_entries = (end - start) >> PAGE_SHIFT;
81 	if (!(nr_entries * PAGE_SIZE / page_size))
82 		return -EINVAL;
83 
84 	map = &umem_odp->map;
85 	if (ib_uses_virt_dma(dev)) {
86 		map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
87 					 GFP_KERNEL | __GFP_NOWARN);
88 		if (!map->pfn_list)
89 			ret = -ENOMEM;
90 	} else
91 		ret = hmm_dma_map_alloc(dev->dma_device, map,
92 					(end - start) >> PAGE_SHIFT,
93 					1 << umem_odp->page_shift);
94 	if (ret)
95 		return ret;
96 
97 	ret = mmu_interval_notifier_insert(&umem_odp->notifier,
98 					   umem_odp->umem.owning_mm, start,
99 					   end - start, ops);
100 	if (ret)
101 		goto out_free_map;
102 
103 	return 0;
104 
105 out_free_map:
106 	if (ib_uses_virt_dma(dev))
107 		kfree(map->pfn_list);
108 	else
109 		hmm_dma_map_free(dev->dma_device, map);
110 	return ret;
111 }
112 
113 /**
114  * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
115  *
116  * Implicit ODP umems do not have a VA range and do not have any page lists.
117  * They exist only to hold the per_mm reference to help the driver create
118  * children umems.
119  *
120  * @device: IB device to create UMEM
121  * @access: ib_reg_mr access flags
122  */
123 struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
124 					       int access)
125 {
126 	struct ib_umem *umem;
127 	struct ib_umem_odp *umem_odp;
128 
129 	if (access & IB_ACCESS_HUGETLB)
130 		return ERR_PTR(-EINVAL);
131 
132 	umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
133 	if (!umem_odp)
134 		return ERR_PTR(-ENOMEM);
135 	umem = &umem_odp->umem;
136 	umem->ibdev = device;
137 	umem->writable = ib_access_writable(access);
138 	umem->owning_mm = current->mm;
139 	umem_odp->page_shift = PAGE_SHIFT;
140 
141 	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
142 	ib_init_umem_implicit_odp(umem_odp);
143 	return umem_odp;
144 }
145 EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
146 
147 /**
148  * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
149  *                           parent ODP umem
150  *
151  * @root: The parent umem enclosing the child. This must be allocated using
152  *        ib_alloc_implicit_odp_umem()
153  * @addr: The starting userspace VA
154  * @size: The length of the userspace VA
155  * @ops: MMU interval ops, currently only @invalidate
156  */
157 struct ib_umem_odp *
158 ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
159 			size_t size,
160 			const struct mmu_interval_notifier_ops *ops)
161 {
162 	/*
163 	 * Caller must ensure that root cannot be freed during the call to
164 	 * ib_alloc_odp_umem.
165 	 */
166 	struct ib_umem_odp *odp_data;
167 	struct ib_umem *umem;
168 	int ret;
169 
170 	if (WARN_ON(!root->is_implicit_odp))
171 		return ERR_PTR(-EINVAL);
172 
173 	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
174 	if (!odp_data)
175 		return ERR_PTR(-ENOMEM);
176 	umem = &odp_data->umem;
177 	umem->ibdev = root->umem.ibdev;
178 	umem->length     = size;
179 	umem->address    = addr;
180 	umem->writable   = root->umem.writable;
181 	umem->owning_mm  = root->umem.owning_mm;
182 	odp_data->page_shift = PAGE_SHIFT;
183 	odp_data->notifier.ops = ops;
184 
185 	/*
186 	 * A mmget must be held when registering a notifier, the owming_mm only
187 	 * has a mm_grab at this point.
188 	 */
189 	if (!mmget_not_zero(umem->owning_mm)) {
190 		ret = -EFAULT;
191 		goto out_free;
192 	}
193 
194 	odp_data->tgid = get_pid(root->tgid);
195 	ret = ib_init_umem_odp(odp_data, ops);
196 	if (ret)
197 		goto out_tgid;
198 	mmput(umem->owning_mm);
199 	return odp_data;
200 
201 out_tgid:
202 	put_pid(odp_data->tgid);
203 	mmput(umem->owning_mm);
204 out_free:
205 	kfree(odp_data);
206 	return ERR_PTR(ret);
207 }
208 EXPORT_SYMBOL(ib_umem_odp_alloc_child);
209 
210 /**
211  * ib_umem_odp_get - Create a umem_odp for a userspace va
212  *
213  * @device: IB device struct to get UMEM
214  * @addr: userspace virtual address to start at
215  * @size: length of region to pin
216  * @access: IB_ACCESS_xxx flags for memory being pinned
217  * @ops: MMU interval ops, currently only @invalidate
218  *
219  * The driver should use when the access flags indicate ODP memory. It avoids
220  * pinning, instead, stores the mm for future page fault handling in
221  * conjunction with MMU notifiers.
222  */
223 struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
224 				    unsigned long addr, size_t size, int access,
225 				    const struct mmu_interval_notifier_ops *ops)
226 {
227 	struct ib_umem_odp *umem_odp;
228 	int ret;
229 
230 	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
231 		return ERR_PTR(-EINVAL);
232 
233 	umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
234 	if (!umem_odp)
235 		return ERR_PTR(-ENOMEM);
236 
237 	umem_odp->umem.ibdev = device;
238 	umem_odp->umem.length = size;
239 	umem_odp->umem.address = addr;
240 	umem_odp->umem.writable = ib_access_writable(access);
241 	umem_odp->umem.owning_mm = current->mm;
242 	umem_odp->notifier.ops = ops;
243 
244 	umem_odp->page_shift = PAGE_SHIFT;
245 #ifdef CONFIG_HUGETLB_PAGE
246 	if (access & IB_ACCESS_HUGETLB)
247 		umem_odp->page_shift = HPAGE_SHIFT;
248 #endif
249 
250 	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
251 	ret = ib_init_umem_odp(umem_odp, ops);
252 	if (ret)
253 		goto err_put_pid;
254 	return umem_odp;
255 
256 err_put_pid:
257 	put_pid(umem_odp->tgid);
258 	kfree(umem_odp);
259 	return ERR_PTR(ret);
260 }
261 EXPORT_SYMBOL(ib_umem_odp_get);
262 
263 static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
264 {
265 	struct ib_device *dev = umem_odp->umem.ibdev;
266 
267 	/*
268 	 * Ensure that no more pages are mapped in the umem.
269 	 *
270 	 * It is the driver's responsibility to ensure, before calling us,
271 	 * that the hardware will not attempt to access the MR any more.
272 	 */
273 	mutex_lock(&umem_odp->umem_mutex);
274 	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
275 				    ib_umem_end(umem_odp));
276 	mutex_unlock(&umem_odp->umem_mutex);
277 	mmu_interval_notifier_remove(&umem_odp->notifier);
278 	if (ib_uses_virt_dma(dev))
279 		kfree(umem_odp->map.pfn_list);
280 	else
281 		hmm_dma_map_free(dev->dma_device, &umem_odp->map);
282 }
283 
284 void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
285 {
286 	if (!umem_odp->is_implicit_odp)
287 		ib_umem_odp_free(umem_odp);
288 
289 	put_pid(umem_odp->tgid);
290 	kfree(umem_odp);
291 }
292 EXPORT_SYMBOL(ib_umem_odp_release);
293 
/**
 * ib_umem_odp_map_dma_and_lock - Fault in userspace memory of an ODP MR and
 *                                lock the umem.
 * @umem_odp: the umem to operate on
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to cover. The range is widened to the
 *        umem's page_shift alignment, so more may be covered.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range. Only HMM_PFN_WRITE is examined here, and only when
 *               @fault is true.
 * @fault: is faulting required for the given range
 *
 * Populates the umem's PFN list for the requested range through
 * hmm_range_fault() and checks that every valid, not yet DMA-mapped entry
 * has a mapping order of at least the umem's page_shift. This function does
 * not itself call a DMA-mapping helper.
 *
 * Upon success umem_mutex is held on return so that the caller can complete
 * its device page table update; the caller must unlock it. On any error
 * return the mutex is not held.
 *
 * Return: on success, the number of page_shift-sized entries covered by the
 * aligned range (entries skipped as invalid or already mapped are counted as
 * well). On failure a negative errno: -EFAULT if the range lies outside the
 * umem, -EINVAL if the owning task or mm can no longer be referenced or an
 * entry's order is smaller than page_shift, otherwise the error returned by
 * hmm_range_fault().
 */
int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
				 u64 bcnt, u64 access_mask, bool fault)
			__acquires(&umem_odp->umem_mutex)
{
	struct task_struct *owning_process  = NULL;
	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
	int pfn_index, dma_index, ret = 0, start_idx;
	unsigned int page_shift, hmm_order, pfn_start_idx;
	unsigned long num_pfns, current_seq;
	struct hmm_range range = {};
	unsigned long timeout;

	if (user_virt < ib_umem_start(umem_odp) ||
	    user_virt + bcnt > ib_umem_end(umem_odp))
		return -EFAULT;

	page_shift = umem_odp->page_shift;

	/*
	 * Take references on the owning task and on the mm's users. If the
	 * task can no longer be looked up, or the mm has no users left, fail
	 * with -EINVAL; out_put_task copes with a NULL owning_process.
	 */
	owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
	if (!owning_process || !mmget_not_zero(owning_mm)) {
		ret = -EINVAL;
		goto out_put_task;
	}

	range.notifier = &umem_odp->notifier;
	range.start = ALIGN_DOWN(user_virt, 1UL << page_shift);
	range.end = ALIGN(user_virt + bcnt, 1UL << page_shift);
	/* pfn_list is indexed in PAGE_SIZE units relative to the umem start. */
	pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
	num_pfns = (range.end - range.start) >> PAGE_SHIFT;
	if (fault) {
		range.default_flags = HMM_PFN_REQ_FAULT;

		if (access_mask & HMM_PFN_WRITE)
			range.default_flags |= HMM_PFN_REQ_WRITE;
	}

	range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
	/* Deadline after which -EBUSY from hmm_range_fault() is not retried. */
	timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);

retry:
	/* Sample the notifier sequence before walking the page tables. */
	current_seq = range.notifier_seq =
		mmu_interval_read_begin(&umem_odp->notifier);

	mmap_read_lock(owning_mm);
	ret = hmm_range_fault(&range);
	mmap_read_unlock(owning_mm);
	if (unlikely(ret)) {
		if (ret == -EBUSY && !time_after(jiffies, timeout))
			goto retry;
		goto out_put_mm;
	}

	/* Unlike pfn_start_idx, this index is in page_shift-sized units. */
	start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift;
	dma_index = start_idx;

	/*
	 * Re-check the sequence under umem_mutex; if it changed since
	 * mmu_interval_read_begin() the PFNs just collected are discarded
	 * and the walk is redone.
	 */
	mutex_lock(&umem_odp->umem_mutex);
	if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) {
		mutex_unlock(&umem_odp->umem_mutex);
		goto retry;
	}

	/* Visit the first PFN entry of every page_shift-sized page. */
	for (pfn_index = 0; pfn_index < num_pfns;
		pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {

		/*
		 * Since we asked for hmm_range_fault() to populate
		 * pages it shouldn't return an error entry on success.
		 */
		WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
		WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
		if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
			continue;

		/* Entries that are already DMA mapped need no further checks. */
		if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
			continue;

		hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
		/* If a hugepage was detected and ODP wasn't set for, the umem
		 * page_shift will be used, the opposite case is an error.
		 */
		if (hmm_order + PAGE_SHIFT < page_shift) {
			ret = -EINVAL;
			ibdev_dbg(umem_odp->umem.ibdev,
				  "%s: un-expected hmm_order %u, page_shift %u\n",
				  __func__, hmm_order, page_shift);
			break;
		}
	}
	/* upon success the lock stays held for the caller */
	if (!ret)
		ret = dma_index - start_idx;
	else
		mutex_unlock(&umem_odp->umem_mutex);

out_put_mm:
	mmput_async(owning_mm);
out_put_task:
	if (owning_process)
		put_task_struct(owning_process);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
420 
421 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
422 				 u64 bound)
423 {
424 	struct ib_device *dev = umem_odp->umem.ibdev;
425 	u64 addr;
426 
427 	lockdep_assert_held(&umem_odp->umem_mutex);
428 
429 	virt = max_t(u64, virt, ib_umem_start(umem_odp));
430 	bound = min_t(u64, bound, ib_umem_end(umem_odp));
431 	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
432 		u64 offset = addr - ib_umem_start(umem_odp);
433 		size_t idx = offset >> umem_odp->page_shift;
434 		unsigned long pfn = umem_odp->map.pfn_list[idx];
435 
436 		if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
437 			goto clear;
438 
439 		if (pfn & HMM_PFN_WRITE) {
440 			struct page *page = hmm_pfn_to_page(pfn);
441 			struct page *head_page = compound_head(page);
442 			/*
443 			 * set_page_dirty prefers being called with
444 			 * the page lock. However, MMU notifiers are
445 			 * called sometimes with and sometimes without
446 			 * the lock. We rely on the umem_mutex instead
447 			 * to prevent other mmu notifiers from
448 			 * continuing and allowing the page mapping to
449 			 * be removed.
450 			 */
451 			set_page_dirty(head_page);
452 		}
453 		umem_odp->npages--;
454 clear:
455 		umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
456 	}
457 }
458 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
459