/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pagemap.h>

#include <rdma/ib_umem_odp.h>

#include "uverbs.h"

static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp)
{
	umem_odp->is_implicit_odp = 1;
	umem_odp->umem.is_odp = 1;
	mutex_init(&umem_odp->umem_mutex);
}

static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
			    const struct mmu_interval_notifier_ops *ops)
{
	struct ib_device *dev = umem_odp->umem.ibdev;
	size_t page_size = 1UL << umem_odp->page_shift;
	struct hmm_dma_map *map;
	unsigned long start;
	unsigned long end;
	size_t nr_entries;
	int ret = 0;

	umem_odp->umem.is_odp = 1;
	mutex_init(&umem_odp->umem_mutex);

	start = ALIGN_DOWN(umem_odp->umem.address, page_size);
	if (check_add_overflow(umem_odp->umem.address,
			       (unsigned long)umem_odp->umem.length, &end))
		return -EOVERFLOW;
	end = ALIGN(end, page_size);
	if (unlikely(end < page_size))
		return -EOVERFLOW;
	/*
	 * The mmu notifier can be called within reclaim contexts and takes the
	 * umem_mutex. This is rarely triggered in testing, so teach lockdep
	 * about it.
	 */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		fs_reclaim_acquire(GFP_KERNEL);
		mutex_lock(&umem_odp->umem_mutex);
		mutex_unlock(&umem_odp->umem_mutex);
		fs_reclaim_release(GFP_KERNEL);
	}

	nr_entries = (end - start) >> PAGE_SHIFT;
	if (!(nr_entries * PAGE_SIZE / page_size))
		return -EINVAL;

	map = &umem_odp->map;
	if (ib_uses_virt_dma(dev)) {
		map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
					 GFP_KERNEL | __GFP_NOWARN);
		if (!map->pfn_list)
			ret = -ENOMEM;
	} else
		ret = hmm_dma_map_alloc(dev->dma_device, map,
					(end - start) >> PAGE_SHIFT,
					1 << umem_odp->page_shift);
	if (ret)
		return ret;

	ret = mmu_interval_notifier_insert(&umem_odp->notifier,
					   umem_odp->umem.owning_mm, start,
					   end - start, ops);
	if (ret)
		goto out_free_map;

	return 0;

out_free_map:
	if (ib_uses_virt_dma(dev))
		kfree(map->pfn_list);
	else
		hmm_dma_map_free(dev->dma_device, map);
	return ret;
}

/**
 * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
 *
 * Implicit ODP umems do not have a VA range and do not have any page lists.
 * They exist only to hold the per_mm reference to help the driver create
 * child umems.
 *
 * @device: IB device to create UMEM
 * @access: ib_reg_mr access flags
 */
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
					       int access)
{
	struct ib_umem *umem;
	struct ib_umem_odp *umem_odp;

	if (access & IB_ACCESS_HUGETLB)
		return ERR_PTR(-EINVAL);

	umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
	if (!umem_odp)
		return ERR_PTR(-ENOMEM);
	umem = &umem_odp->umem;
	umem->ibdev = device;
	umem->writable = ib_access_writable(access);
	umem->owning_mm = current->mm;
	umem_odp->page_shift = PAGE_SHIFT;

	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	ib_init_umem_implicit_odp(umem_odp);
	return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);

/**
 * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
 *                           parent ODP umem
 *
 * @root: The parent umem enclosing the child. This must be allocated using
 *        ib_umem_odp_alloc_implicit()
 * @addr: The starting userspace VA
 * @size: The length of the userspace VA
 * @ops: MMU interval ops, currently only @invalidate
 */
struct ib_umem_odp *
ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
			size_t size,
			const struct mmu_interval_notifier_ops *ops)
{
	/*
	 * Caller must ensure that root cannot be freed during this call.
	 */
	struct ib_umem_odp *odp_data;
	struct ib_umem *umem;
	int ret;

	if (WARN_ON(!root->is_implicit_odp))
		return ERR_PTR(-EINVAL);

	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
	if (!odp_data)
		return ERR_PTR(-ENOMEM);
	umem = &odp_data->umem;
	umem->ibdev = root->umem.ibdev;
	umem->length = size;
	umem->address = addr;
	umem->writable = root->umem.writable;
	umem->owning_mm = root->umem.owning_mm;
	odp_data->page_shift = PAGE_SHIFT;
	odp_data->notifier.ops = ops;

	/*
	 * An mmget must be held when registering a notifier; the owning_mm
	 * only has an mmgrab reference at this point.
	 */
	if (!mmget_not_zero(umem->owning_mm)) {
		ret = -EFAULT;
		goto out_free;
	}

	odp_data->tgid = get_pid(root->tgid);
	ret = ib_init_umem_odp(odp_data, ops);
	if (ret)
		goto out_tgid;
	mmput(umem->owning_mm);
	return odp_data;

out_tgid:
	put_pid(odp_data->tgid);
	mmput(umem->owning_mm);
out_free:
	kfree(odp_data);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_alloc_child);
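
/*
 * Illustrative usage sketch, not part of this file: one plausible way a
 * driver could pair the two allocators above for implicit ODP. The
 * my_invalidate_ops structure and the fault_addr/fault_len variables are
 * hypothetical; only the ib_umem_odp_* calls come from this file.
 *
 *	struct ib_umem_odp *root, *child;
 *
 *	root = ib_umem_odp_alloc_implicit(device, access);
 *	if (IS_ERR(root))
 *		return PTR_ERR(root);
 *
 *	// On a page fault inside the implicit range, create a child umem
 *	// covering just the faulted region; root must stay alive across
 *	// this call.
 *	child = ib_umem_odp_alloc_child(root, fault_addr, fault_len,
 *					&my_invalidate_ops);
 *	if (IS_ERR(child))
 *		return PTR_ERR(child);
 */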

/**
 * ib_umem_odp_get - Create a umem_odp for a userspace va
 *
 * @device: IB device struct to get UMEM
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @ops: MMU interval ops, currently only @invalidate
 *
 * The driver should use this function when the access flags indicate ODP
 * memory. It avoids pinning; instead, it stores the mm for future page fault
 * handling in conjunction with MMU notifiers.
 */
struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
				    unsigned long addr, size_t size, int access,
				    const struct mmu_interval_notifier_ops *ops)
{
	struct ib_umem_odp *umem_odp;
	int ret;

	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
		return ERR_PTR(-EINVAL);

	umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
	if (!umem_odp)
		return ERR_PTR(-ENOMEM);

	umem_odp->umem.ibdev = device;
	umem_odp->umem.length = size;
	umem_odp->umem.address = addr;
	umem_odp->umem.writable = ib_access_writable(access);
	umem_odp->umem.owning_mm = current->mm;
	umem_odp->notifier.ops = ops;

	umem_odp->page_shift = PAGE_SHIFT;
#ifdef CONFIG_HUGETLB_PAGE
	if (access & IB_ACCESS_HUGETLB)
		umem_odp->page_shift = HPAGE_SHIFT;
#endif

	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	ret = ib_init_umem_odp(umem_odp, ops);
	if (ret)
		goto err_put_pid;
	return umem_odp;

err_put_pid:
	put_pid(umem_odp->tgid);
	kfree(umem_odp);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_get);

static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
{
	struct ib_device *dev = umem_odp->umem.ibdev;

	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	mutex_lock(&umem_odp->umem_mutex);
	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
				    ib_umem_end(umem_odp));
	mutex_unlock(&umem_odp->umem_mutex);
	mmu_interval_notifier_remove(&umem_odp->notifier);
	if (ib_uses_virt_dma(dev))
		kfree(umem_odp->map.pfn_list);
	else
		hmm_dma_map_free(dev->dma_device, &umem_odp->map);
}

void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
	if (!umem_odp->is_implicit_odp)
		ib_umem_odp_free(umem_odp);

	put_pid(umem_odp->tgid);
	kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);
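
/*
 * Illustrative usage sketch, not part of this file: a typical ODP
 * registration path is assumed to pair ib_umem_odp_get() with
 * ib_umem_odp_release(); my_mr_invalidate_ops and the surrounding MR
 * handling are hypothetical.
 *
 *	struct ib_umem_odp *odp;
 *
 *	odp = ib_umem_odp_get(device, start, length, access,
 *			      &my_mr_invalidate_ops);
 *	if (IS_ERR(odp))
 *		return PTR_ERR(odp);
 *
 *	// ... attach odp to the driver MR and fault pages on demand ...
 *
 *	// Only once the HW can no longer access the MR:
 *	ib_umem_odp_release(odp);
 */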

/**
 * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
 *
 * Maps the range passed in the argument to DMA addresses.
 * Upon success the ODP MR will be locked to let the caller complete its device
 * page table update.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * @umem_odp: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @fault: is faulting required for the given range
 */
int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
				 u64 bcnt, u64 access_mask, bool fault)
			__acquires(&umem_odp->umem_mutex)
{
	struct task_struct *owning_process = NULL;
	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
	int pfn_index, dma_index, ret = 0, start_idx;
	unsigned int page_shift, hmm_order, pfn_start_idx;
	unsigned long num_pfns, current_seq;
	struct hmm_range range = {};
	unsigned long timeout;

	if (user_virt < ib_umem_start(umem_odp) ||
	    user_virt + bcnt > ib_umem_end(umem_odp))
		return -EFAULT;

	page_shift = umem_odp->page_shift;

	/*
	 * owning_process is allowed to be NULL; this means the mm somehow
	 * outlived the originating process. Presumably mmget_not_zero will
	 * fail in this case.
	 */
	owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
	if (!owning_process || !mmget_not_zero(owning_mm)) {
		ret = -EINVAL;
		goto out_put_task;
	}

	range.notifier = &umem_odp->notifier;
	range.start = ALIGN_DOWN(user_virt, 1UL << page_shift);
	range.end = ALIGN(user_virt + bcnt, 1UL << page_shift);
	pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
	num_pfns = (range.end - range.start) >> PAGE_SHIFT;
	if (fault) {
		range.default_flags = HMM_PFN_REQ_FAULT;

		if (access_mask & HMM_PFN_WRITE)
			range.default_flags |= HMM_PFN_REQ_WRITE;
	}

	range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
	timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);

retry:
	current_seq = range.notifier_seq =
		mmu_interval_read_begin(&umem_odp->notifier);

	mmap_read_lock(owning_mm);
	ret = hmm_range_fault(&range);
	mmap_read_unlock(owning_mm);
	if (unlikely(ret)) {
		if (ret == -EBUSY && !time_after(jiffies, timeout))
			goto retry;
		goto out_put_mm;
	}

	start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift;
	dma_index = start_idx;

	mutex_lock(&umem_odp->umem_mutex);
	if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) {
		mutex_unlock(&umem_odp->umem_mutex);
		goto retry;
	}

	for (pfn_index = 0; pfn_index < num_pfns;
	     pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {

		/*
		 * Since we asked for hmm_range_fault() to populate
		 * pages it shouldn't return an error entry on success.
		 */
		WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
		WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
		if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
			continue;

		if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
			continue;

		hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
		/* If a hugepage was detected but ODP wasn't requested for it,
		 * the umem page_shift is still used; the opposite case, where
		 * the faulted order is smaller than the umem page_shift, is
		 * an error.
		 */
		if (hmm_order + PAGE_SHIFT < page_shift) {
			ret = -EINVAL;
			ibdev_dbg(umem_odp->umem.ibdev,
				  "%s: unexpected hmm_order %u, page_shift %u\n",
				  __func__, hmm_order, page_shift);
			break;
		}
	}
	/* upon success the lock stays held for the caller */
	if (!ret)
		ret = dma_index - start_idx;
	else
		mutex_unlock(&umem_odp->umem_mutex);

out_put_mm:
	mmput_async(owning_mm);
out_put_task:
	if (owning_process)
		put_task_struct(owning_process);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
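
/*
 * Illustrative usage sketch, not part of this file: on success
 * ib_umem_odp_map_dma_and_lock() returns with umem_mutex held, so the
 * caller is expected to update the device page table and then drop the
 * lock itself. my_driver_update_pgtable() is a hypothetical helper.
 *
 *	int npages;
 *
 *	npages = ib_umem_odp_map_dma_and_lock(odp, io_virt, bcnt,
 *					      access_mask, fault);
 *	if (npages < 0)
 *		return npages;
 *
 *	// The range cannot be invalidated while umem_mutex is held.
 *	ret = my_driver_update_pgtable(mr, odp, npages);
 *	mutex_unlock(&odp->umem_mutex);
 */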

void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
				 u64 bound)
{
	struct ib_device *dev = umem_odp->umem.ibdev;
	u64 addr;

	lockdep_assert_held(&umem_odp->umem_mutex);

	virt = max_t(u64, virt, ib_umem_start(umem_odp));
	bound = min_t(u64, bound, ib_umem_end(umem_odp));
	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
		u64 offset = addr - ib_umem_start(umem_odp);
		size_t idx = offset >> umem_odp->page_shift;
		unsigned long pfn = umem_odp->map.pfn_list[idx];

		if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
			goto clear;

		if (pfn & HMM_PFN_WRITE) {
			struct page *page = hmm_pfn_to_page(pfn);
			struct page *head_page = compound_head(page);
			/*
			 * set_page_dirty prefers being called with
			 * the page lock. However, MMU notifiers are
			 * called sometimes with and sometimes without
			 * the lock. We rely on the umem_mutex instead
			 * to prevent other mmu notifiers from
			 * continuing and allowing the page mapping to
			 * be removed.
			 */
			set_page_dirty(head_page);
		}
		umem_odp->npages--;
clear:
		umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
	}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
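
/*
 * Illustrative sketch, not part of this file: ib_umem_odp_unmap_dma_pages()
 * is assumed to be reached from the driver's mmu_interval_notifier_ops
 * @invalidate callback, which must hold umem_mutex so the dirty handling
 * above cannot race with a concurrent invalidation. my_driver_zap_pages()
 * is a hypothetical helper that removes the range from the device page
 * table first.
 *
 *	static bool my_invalidate(struct mmu_interval_notifier *mni,
 *				  const struct mmu_notifier_range *range,
 *				  unsigned long cur_seq)
 *	{
 *		struct ib_umem_odp *odp =
 *			container_of(mni, struct ib_umem_odp, notifier);
 *
 *		if (!mmu_notifier_range_blockable(range))
 *			return false;
 *
 *		mutex_lock(&odp->umem_mutex);
 *		mmu_interval_set_seq(mni, cur_seq);
 *		my_driver_zap_pages(odp, range->start, range->end);
 *		ib_umem_odp_unmap_dma_pages(odp, range->start, range->end);
 *		mutex_unlock(&odp->umem_mutex);
 *		return true;
 *	}
 */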