1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 4 /* Copyright (c) 2008-2019, IBM Corporation */ 5 6 #include <linux/gfp.h> 7 #include <rdma/ib_verbs.h> 8 #include <linux/dma-mapping.h> 9 #include <linux/slab.h> 10 #include <linux/sched/mm.h> 11 #include <linux/resource.h> 12 13 #include "siw.h" 14 #include "siw_mem.h" 15 16 /* 17 * Stag lookup is based on its index part only (24 bits). 18 * The code avoids special Stag of zero and tries to randomize 19 * STag values between 1 and SIW_STAG_MAX_INDEX. 20 */ 21 int siw_mem_add(struct siw_device *sdev, struct siw_mem *m) 22 { 23 struct xa_limit limit = XA_LIMIT(1, 0x00ffffff); 24 u32 id, next; 25 26 get_random_bytes(&next, 4); 27 next &= 0x00ffffff; 28 29 if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next, 30 GFP_KERNEL) < 0) 31 return -ENOMEM; 32 33 /* Set the STag index part */ 34 m->stag = id << 8; 35 36 siw_dbg_mem(m, "new MEM object\n"); 37 38 return 0; 39 } 40 41 /* 42 * siw_mem_id2obj() 43 * 44 * resolves memory from stag given by id. might be called from: 45 * o process context before sending out of sgl, or 46 * o in softirq when resolving target memory 47 */ 48 struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index) 49 { 50 struct siw_mem *mem; 51 52 rcu_read_lock(); 53 mem = xa_load(&sdev->mem_xa, stag_index); 54 if (likely(mem && kref_get_unless_zero(&mem->ref))) { 55 rcu_read_unlock(); 56 return mem; 57 } 58 rcu_read_unlock(); 59 60 return NULL; 61 } 62 63 static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, 64 bool dirty) 65 { 66 unpin_user_pages_dirty_lock(chunk->plist, num_pages, dirty); 67 } 68 69 void siw_umem_release(struct siw_umem *umem, bool dirty) 70 { 71 struct mm_struct *mm_s = umem->owning_mm; 72 int i, num_pages = umem->num_pages; 73 74 for (i = 0; num_pages; i++) { 75 int to_free = min_t(int, PAGES_PER_CHUNK, num_pages); 76 77 siw_free_plist(&umem->page_chunk[i], to_free, 78 umem->writable && dirty); 79 kfree(umem->page_chunk[i].plist); 80 num_pages -= to_free; 81 } 82 atomic64_sub(umem->num_pages, &mm_s->pinned_vm); 83 84 mmdrop(mm_s); 85 kfree(umem->page_chunk); 86 kfree(umem); 87 } 88 89 int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj, 90 u64 start, u64 len, int rights) 91 { 92 struct siw_device *sdev = to_siw_dev(pd->device); 93 struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL); 94 struct xa_limit limit = XA_LIMIT(1, 0x00ffffff); 95 u32 id, next; 96 97 if (!mem) 98 return -ENOMEM; 99 100 mem->mem_obj = mem_obj; 101 mem->stag_valid = 0; 102 mem->sdev = sdev; 103 mem->va = start; 104 mem->len = len; 105 mem->pd = pd; 106 mem->perms = rights & IWARP_ACCESS_MASK; 107 kref_init(&mem->ref); 108 109 get_random_bytes(&next, 4); 110 next &= 0x00ffffff; 111 112 if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next, 113 GFP_KERNEL) < 0) { 114 kfree(mem); 115 return -ENOMEM; 116 } 117 118 mr->mem = mem; 119 /* Set the STag index part */ 120 mem->stag = id << 8; 121 mr->base_mr.lkey = mr->base_mr.rkey = mem->stag; 122 123 return 0; 124 } 125 126 void siw_mr_drop_mem(struct siw_mr *mr) 127 { 128 struct siw_mem *mem = mr->mem, *found; 129 130 mem->stag_valid = 0; 131 132 /* make STag invalid visible asap */ 133 smp_mb(); 134 135 found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8); 136 WARN_ON(found != mem); 137 siw_mem_put(mem); 138 } 139 140 void siw_free_mem(struct kref *ref) 141 { 142 struct siw_mem *mem = container_of(ref, struct siw_mem, ref); 143 144 siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n"); 145 146 if (!mem->is_mw && mem->mem_obj) { 147 if (mem->is_pbl == 0) 148 siw_umem_release(mem->umem, true); 149 else 150 kfree(mem->pbl); 151 } 152 kfree(mem); 153 } 154 155 /* 156 * siw_check_mem() 157 * 158 * Check protection domain, STAG state, access permissions and 159 * address range for memory object. 160 * 161 * @pd: Protection Domain memory should belong to 162 * @mem: memory to be checked 163 * @addr: starting addr of mem 164 * @perms: requested access permissions 165 * @len: len of memory interval to be checked 166 * 167 */ 168 int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, 169 enum ib_access_flags perms, int len) 170 { 171 if (!mem->stag_valid) { 172 siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag); 173 return -E_STAG_INVALID; 174 } 175 if (mem->pd != pd) { 176 siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag); 177 return -E_PD_MISMATCH; 178 } 179 /* 180 * check access permissions 181 */ 182 if ((mem->perms & perms) < perms) { 183 siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n", 184 mem->perms, perms); 185 return -E_ACCESS_PERM; 186 } 187 /* 188 * Check if access falls into valid memory interval. 189 */ 190 if (addr < mem->va || addr + len > mem->va + mem->len) { 191 siw_dbg_pd(pd, "MEM interval len %d\n", len); 192 siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n", 193 (void *)(uintptr_t)addr, 194 (void *)(uintptr_t)(addr + len)); 195 siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n", 196 (void *)(uintptr_t)mem->va, 197 (void *)(uintptr_t)(mem->va + mem->len), 198 mem->stag); 199 200 return -E_BASE_BOUNDS; 201 } 202 return E_ACCESS_OK; 203 } 204 205 /* 206 * siw_check_sge() 207 * 208 * Check SGE for access rights in given interval 209 * 210 * @pd: Protection Domain memory should belong to 211 * @sge: SGE to be checked 212 * @mem: location of memory reference within array 213 * @perms: requested access permissions 214 * @off: starting offset in SGE 215 * @len: len of memory interval to be checked 216 * 217 * NOTE: Function references SGE's memory object (mem->obj) 218 * if not yet done. New reference is kept if check went ok and 219 * released if check failed. If mem->obj is already valid, no new 220 * lookup is being done and mem is not released it check fails. 221 */ 222 int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[], 223 enum ib_access_flags perms, u32 off, int len) 224 { 225 struct siw_device *sdev = to_siw_dev(pd->device); 226 struct siw_mem *new = NULL; 227 int rv = E_ACCESS_OK; 228 229 if (len + off > sge->length) { 230 rv = -E_BASE_BOUNDS; 231 goto fail; 232 } 233 if (*mem == NULL) { 234 new = siw_mem_id2obj(sdev, sge->lkey >> 8); 235 if (unlikely(!new)) { 236 siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey); 237 rv = -E_STAG_INVALID; 238 goto fail; 239 } 240 *mem = new; 241 } 242 /* Check if user re-registered with different STag key */ 243 if (unlikely((*mem)->stag != sge->lkey)) { 244 siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey); 245 rv = -E_STAG_INVALID; 246 goto fail; 247 } 248 rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len); 249 if (unlikely(rv)) 250 goto fail; 251 252 return 0; 253 254 fail: 255 if (new) { 256 *mem = NULL; 257 siw_mem_put(new); 258 } 259 return rv; 260 } 261 262 void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op) 263 { 264 switch (op) { 265 case SIW_OP_SEND: 266 case SIW_OP_WRITE: 267 case SIW_OP_SEND_WITH_IMM: 268 case SIW_OP_SEND_REMOTE_INV: 269 case SIW_OP_READ: 270 case SIW_OP_READ_LOCAL_INV: 271 if (!(wqe->sqe.flags & SIW_WQE_INLINE)) 272 siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge); 273 break; 274 275 case SIW_OP_RECEIVE: 276 siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge); 277 break; 278 279 case SIW_OP_READ_RESPONSE: 280 siw_unref_mem_sgl(wqe->mem, 1); 281 break; 282 283 default: 284 /* 285 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR 286 * do not hold memory references 287 */ 288 break; 289 } 290 } 291 292 int siw_invalidate_stag(struct ib_pd *pd, u32 stag) 293 { 294 struct siw_device *sdev = to_siw_dev(pd->device); 295 struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8); 296 int rv = 0; 297 298 if (unlikely(!mem)) { 299 siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag); 300 return -EINVAL; 301 } 302 if (unlikely(mem->pd != pd)) { 303 siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag); 304 rv = -EACCES; 305 goto out; 306 } 307 /* 308 * Per RDMA verbs definition, an STag may already be in invalid 309 * state if invalidation is requested. So no state check here. 310 */ 311 mem->stag_valid = 0; 312 313 siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag); 314 out: 315 siw_mem_put(mem); 316 return rv; 317 } 318 319 /* 320 * Gets physical address backed by PBL element. Address is referenced 321 * by linear byte offset into list of variably sized PB elements. 322 * Optionally, provides remaining len within current element, and 323 * current PBL index for later resume at same element. 324 */ 325 dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx) 326 { 327 int i = idx ? *idx : 0; 328 329 while (i < pbl->num_buf) { 330 struct siw_pble *pble = &pbl->pbe[i]; 331 332 if (pble->pbl_off + pble->size > off) { 333 u64 pble_off = off - pble->pbl_off; 334 335 if (len) 336 *len = pble->size - pble_off; 337 if (idx) 338 *idx = i; 339 340 return pble->addr + pble_off; 341 } 342 i++; 343 } 344 if (len) 345 *len = 0; 346 return 0; 347 } 348 349 struct siw_pbl *siw_pbl_alloc(u32 num_buf) 350 { 351 struct siw_pbl *pbl; 352 353 if (num_buf == 0) 354 return ERR_PTR(-EINVAL); 355 356 pbl = kzalloc(struct_size(pbl, pbe, num_buf), GFP_KERNEL); 357 if (!pbl) 358 return ERR_PTR(-ENOMEM); 359 360 pbl->max_buf = num_buf; 361 362 return pbl; 363 } 364 365 struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) 366 { 367 struct siw_umem *umem; 368 struct mm_struct *mm_s; 369 u64 first_page_va; 370 unsigned long mlock_limit; 371 unsigned int foll_flags = FOLL_LONGTERM; 372 int num_pages, num_chunks, i, rv = 0; 373 374 if (!can_do_mlock()) 375 return ERR_PTR(-EPERM); 376 377 if (!len) 378 return ERR_PTR(-EINVAL); 379 380 first_page_va = start & PAGE_MASK; 381 num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT; 382 num_chunks = (num_pages >> CHUNK_SHIFT) + 1; 383 384 umem = kzalloc(sizeof(*umem), GFP_KERNEL); 385 if (!umem) 386 return ERR_PTR(-ENOMEM); 387 388 mm_s = current->mm; 389 umem->owning_mm = mm_s; 390 umem->writable = writable; 391 392 mmgrab(mm_s); 393 394 if (writable) 395 foll_flags |= FOLL_WRITE; 396 397 mmap_read_lock(mm_s); 398 399 mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 400 401 if (atomic64_add_return(num_pages, &mm_s->pinned_vm) > mlock_limit) { 402 rv = -ENOMEM; 403 goto out_sem_up; 404 } 405 umem->fp_addr = first_page_va; 406 407 umem->page_chunk = 408 kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL); 409 if (!umem->page_chunk) { 410 rv = -ENOMEM; 411 goto out_sem_up; 412 } 413 for (i = 0; num_pages; i++) { 414 int nents = min_t(int, num_pages, PAGES_PER_CHUNK); 415 struct page **plist = 416 kcalloc(nents, sizeof(struct page *), GFP_KERNEL); 417 418 if (!plist) { 419 rv = -ENOMEM; 420 goto out_sem_up; 421 } 422 umem->page_chunk[i].plist = plist; 423 while (nents) { 424 rv = pin_user_pages(first_page_va, nents, foll_flags, 425 plist); 426 if (rv < 0) 427 goto out_sem_up; 428 429 umem->num_pages += rv; 430 first_page_va += rv * PAGE_SIZE; 431 plist += rv; 432 nents -= rv; 433 num_pages -= rv; 434 } 435 } 436 out_sem_up: 437 mmap_read_unlock(mm_s); 438 439 if (rv > 0) 440 return umem; 441 442 /* Adjust accounting for pages not pinned */ 443 if (num_pages) 444 atomic64_sub(num_pages, &mm_s->pinned_vm); 445 446 siw_umem_release(umem, false); 447 448 return ERR_PTR(rv); 449 } 450