// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2023 - Cornelis Networks, Inc.
 */

#include <linux/types.h>

#include "hfi.h"
#include "common.h"
#include "device.h"
#include "pinning.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "trace.h"

struct sdma_mmu_node {
	struct mmu_rb_node rb;
	struct hfi1_user_sdma_pkt_q *pq;
	struct page **pages;
	unsigned int npages;
};

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2,
			 bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);

static const struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
};

int hfi1_init_system_pinning(struct hfi1_user_sdma_pkt_q *pq)
{
	struct hfi1_devdata *dd = pq->dd;
	int ret;

	ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret)
		dd_dev_err(dd,
			   "[%u:%u] Failed to register system memory DMA support with MMU: %d\n",
			   pq->ctxt, pq->subctxt, ret);
	return ret;
}

void hfi1_free_system_pinning(struct hfi1_user_sdma_pkt_q *pq)
{
	if (pq->handler)
		hfi1_mmu_rb_unregister(pq->handler);
}

/*
 * Evict cached pinnings until at least npages pages have been cleared or
 * nothing more can be evicted; return the number of pages actually cleared.
 */
static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned int start, unsigned int npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

/* The mm_struct registered with the node's mmu_rb handler (MMU notifier) */
static inline struct mm_struct *mm_from_sdma_node(struct sdma_mmu_node *node)
{
	return node->rb.handler->mn.mm;
}

static void free_system_node(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
				   node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
	kfree(node);
}
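
/*
 * Cache entry lifetime: add_system_pinning() creates a node, pins its pages
 * and inserts it into the per-pq mmu_rb tree; find_system_node() looks
 * entries up again.  Nodes are reference counted via node->rb.refcount: the
 * tree holds one reference, and lookups and in-flight SDMA descriptors hold
 * additional ones.  free_system_node() is the final teardown step; it runs
 * from sdma_rb_remove() once the mmu_rb handler is done with the node, and
 * directly on the insert-failure path in add_system_pinning().
 */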

/*
 * kref_get()'s an additional kref on the returned rb_node to prevent rb_node
 * from being released until after rb_node is assigned to an SDMA descriptor
 * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the
 * virtual address range for rb_node is invalidated between now and then.
 */
static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler,
					      unsigned long start,
					      unsigned long end)
{
	struct mmu_rb_node *rb_node;
	unsigned long flags;

	spin_lock_irqsave(&handler->lock, flags);
	rb_node = hfi1_mmu_rb_get_first(handler, start, (end - start));
	if (!rb_node) {
		spin_unlock_irqrestore(&handler->lock, flags);
		return NULL;
	}

	/* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
	kref_get(&rb_node->refcount);
	spin_unlock_irqrestore(&handler->lock, flags);

	return container_of(rb_node, struct sdma_mmu_node, rb);
}

static int pin_system_pages(struct user_sdma_request *req,
			    uintptr_t start_address, size_t length,
			    struct sdma_mmu_node *node, int npages)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	int pinned, cleared;
	struct page **pages;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

retry:
	if (!hfi1_can_pin_pages(pq->dd, current->mm, atomic_read(&pq->n_locked),
				npages)) {
		SDMA_DBG(req, "Evicting: nlocked %u npages %u",
			 atomic_read(&pq->n_locked), npages);
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}

	SDMA_DBG(req, "Acquire user pages start_address %lx node->npages %u npages %u",
		 start_address, node->npages, npages);
	pinned = hfi1_acquire_user_pages(current->mm, start_address, npages, 0,
					 pages);

	if (pinned < 0) {
		kfree(pages);
		SDMA_DBG(req, "pinned %d", pinned);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(current->mm, pages, node->npages, pinned);
		SDMA_DBG(req, "npages %u pinned %d", npages, pinned);
		return -EFAULT;
	}
	node->rb.addr = start_address;
	node->rb.len = length;
	node->pages = pages;
	node->npages = npages;
	atomic_add(pinned, &pq->n_locked);
	SDMA_DBG(req, "done. pinned %d", pinned);
	return 0;
}
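
/*
 * add_system_pinning() below pins PFN_DOWN(len) pages.  Its only caller,
 * get_system_cache_entry(), always passes a page-aligned start and len, so
 * that is exactly the number of pages spanned by [start, start + len).
 */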

/*
 * kref refcount on *node_p will be 2 on successful addition: one kref from
 * kref_init() for mmu_rb_handler and one kref to prevent *node_p from being
 * released until after *node_p is assigned to an SDMA descriptor (struct
 * sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual
 * address range for *node_p is invalidated between now and then.
 */
static int add_system_pinning(struct user_sdma_request *req,
			      struct sdma_mmu_node **node_p,
			      unsigned long start, unsigned long len)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node;
	int ret;

	node = kzalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

	/* First kref "moves" to mmu_rb_handler */
	kref_init(&node->rb.refcount);

	/* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
	kref_get(&node->rb.refcount);

	node->pq = pq;
	ret = pin_system_pages(req, start, len, node, PFN_DOWN(len));
	if (ret == 0) {
		ret = hfi1_mmu_rb_insert(pq->handler, &node->rb);
		if (ret)
			free_system_node(node);
		else
			*node_p = node;

		return ret;
	}

	kfree(node);
	return ret;
}
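
/*
 * Return, in *node_p, a cache entry that covers at least the beginning of
 * the page-aligned range derived from req_start and req_len, pinning and
 * inserting new entries as needed.
 *
 * Worked example, assuming 4 KiB pages: a request for 0x1000 bytes at
 * 0x12345 is widened to the aligned range [0x12000, 0x14000).  If the cache
 * already holds an entry starting at 0x13000, a new entry is pinned for the
 * missing prefix [0x12000, 0x13000) and returned; the caller's next call for
 * the remaining bytes then finds the existing entry at 0x13000.
 */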
static int get_system_cache_entry(struct user_sdma_request *req,
				  struct sdma_mmu_node **node_p,
				  size_t req_start, size_t req_len)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	u64 start = ALIGN_DOWN(req_start, PAGE_SIZE);
	u64 end = PFN_ALIGN(req_start + req_len);
	int ret;

	if ((end - start) == 0) {
		SDMA_DBG(req,
			 "Request for empty cache entry req_start %lx req_len %lx start %llx end %llx",
			 req_start, req_len, start, end);
		return -EINVAL;
	}

	SDMA_DBG(req, "req_start %lx req_len %lu", req_start, req_len);

	while (1) {
		struct sdma_mmu_node *node =
			find_system_node(pq->handler, start, end);
		u64 prepend_len = 0;

		SDMA_DBG(req, "node %p start %llx end %llu", node, start, end);
		if (!node) {
			ret = add_system_pinning(req, node_p, start,
						 end - start);
			if (ret == -EEXIST) {
				/*
				 * Another execution context has inserted a
				 * conflicting entry first.
				 */
				continue;
			}
			return ret;
		}

		if (node->rb.addr <= start) {
			/*
			 * This entry covers at least part of the region. If it doesn't extend
			 * to the end, then this will be called again for the next segment.
			 */
			*node_p = node;
			return 0;
		}

		SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d",
			 node->rb.addr, kref_read(&node->rb.refcount));
		prepend_len = node->rb.addr - start;

		/*
		 * This node will not be returned, instead a new node
		 * will be. So release the reference.
		 */
		kref_put(&node->rb.refcount, hfi1_mmu_rb_release);

		/* Prepend a node to cover the beginning of the allocation */
		ret = add_system_pinning(req, node_p, start, prepend_len);
		if (ret == -EEXIST) {
			/*
			 * Another execution context has inserted a
			 * conflicting entry first.
			 */
			continue;
		}
		return ret;
	}
}

/*
 * ctx is the struct sdma_mmu_node handed to sdma_txadd_page(); its struct
 * mmu_rb_node rb is the first member, so the pointer may be viewed as either
 * type.
 */
static void sdma_mmu_rb_node_get(void *ctx)
{
	struct mmu_rb_node *node = ctx;

	kref_get(&node->refcount);
}

static void sdma_mmu_rb_node_put(void *ctx)
{
	struct sdma_mmu_node *node = ctx;

	kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
}

static int add_mapping_to_sdma_packet(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx,
				      struct sdma_mmu_node *cache_entry,
				      size_t start,
				      size_t from_this_cache_entry)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	unsigned int page_offset;
	unsigned int from_this_page;
	size_t page_index;
	void *ctx;
	int ret;

	/*
	 * Because the cache may be more fragmented than the memory that is being accessed,
	 * it's not strictly necessary to have a descriptor per cache entry.
	 */

	while (from_this_cache_entry) {
		page_index = PFN_DOWN(start - cache_entry->rb.addr);

		if (page_index >= cache_entry->npages) {
			SDMA_DBG(req,
				 "Request for page_index %zu >= cache_entry->npages %u",
				 page_index, cache_entry->npages);
			return -EINVAL;
		}

		page_offset = start - ALIGN_DOWN(start, PAGE_SIZE);
		from_this_page = PAGE_SIZE - page_offset;

		if (from_this_page < from_this_cache_entry) {
			ctx = NULL;
		} else {
			/*
			 * In the case they are equal the next line has no practical effect,
			 * but it's better to do a register to register copy than a conditional
			 * branch.
			 */
			from_this_page = from_this_cache_entry;
			ctx = cache_entry;
		}

		ret = sdma_txadd_page(pq->dd, &tx->txreq,
				      cache_entry->pages[page_index],
				      page_offset, from_this_page,
				      ctx,
				      sdma_mmu_rb_node_get,
				      sdma_mmu_rb_node_put);
		if (ret) {
			/*
			 * When there's a failure, the entire request is freed by
			 * user_sdma_send_pkts().
			 */
			SDMA_DBG(req,
				 "sdma_txadd_page failed %d page_index %lu page_offset %u from_this_page %u",
				 ret, page_index, page_offset, from_this_page);
			return ret;
		}
		start += from_this_page;
		from_this_cache_entry -= from_this_page;
	}
	return 0;
}

static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req,
					   struct user_sdma_txreq *tx,
					   struct user_sdma_iovec *iovec,
					   size_t from_this_iovec)
{
	while (from_this_iovec > 0) {
		struct sdma_mmu_node *cache_entry;
		size_t from_this_cache_entry;
		size_t start;
		int ret;

		start = (uintptr_t)iovec->iov.iov_base + iovec->offset;
		ret = get_system_cache_entry(req, &cache_entry, start,
					     from_this_iovec);
		if (ret) {
			SDMA_DBG(req, "pin system segment failed %d", ret);
			return ret;
		}

		from_this_cache_entry = cache_entry->rb.len - (start - cache_entry->rb.addr);
		if (from_this_cache_entry > from_this_iovec)
			from_this_cache_entry = from_this_iovec;

		ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start,
						 from_this_cache_entry);

		/*
		 * Done adding cache_entry to zero or more sdma_desc. Can
		 * kref_put() the "safety" kref taken under
		 * get_system_cache_entry().
		 */
		kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release);

		if (ret) {
			SDMA_DBG(req, "add system segment failed %d", ret);
			return ret;
		}

		iovec->offset += from_this_cache_entry;
		from_this_iovec -= from_this_cache_entry;
	}

	return 0;
}
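
/*
 * Reference counting in the path above: get_system_cache_entry() hands back
 * each cache entry with a "safety" kref held.  add_mapping_to_sdma_packet()
 * passes the entry as ctx only for the last descriptor it builds from that
 * entry, so that the descriptor layer can take its own reference
 * (sdma_mmu_rb_node_get()) and drop it when the descriptor is cleaned up
 * (sdma_mmu_rb_node_put()); since the descriptors of a txreq are torn down
 * together, one reference per cache entry keeps its pages pinned for all of
 * them.  The "safety" kref is released once the descriptors have been added.
 */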

/*
 * Add up to pkt_data_remaining bytes to the txreq, starting at the current
 * offset in the given iovec entry and continuing until all data has been
 * added from the iovec or the iovec entry type changes.
 *
 * On success, prior to returning, adjust pkt_data_remaining, req->iov_idx, and
 * the offset value in req->iov[req->iov_idx] to reflect the data that has been
 * consumed.
 */
int hfi1_add_pages_to_sdma_packet(struct user_sdma_request *req,
				  struct user_sdma_txreq *tx,
				  struct user_sdma_iovec *iovec,
				  u32 *pkt_data_remaining)
{
	size_t remaining_to_add = *pkt_data_remaining;
	/*
	 * Walk through iovec entries, ensure the associated pages
	 * are pinned and mapped, add data to the packet until no more
	 * data remains to be added or the iovec entry type changes.
	 */
	while (remaining_to_add > 0) {
		struct user_sdma_iovec *cur_iovec;
		size_t from_this_iovec;
		int ret;

		cur_iovec = iovec;
		from_this_iovec = iovec->iov.iov_len - iovec->offset;

		if (from_this_iovec > remaining_to_add) {
			from_this_iovec = remaining_to_add;
		} else {
			/* The current iovec entry will be consumed by this pass. */
			req->iov_idx++;
			iovec++;
		}

		ret = add_system_iovec_to_sdma_packet(req, tx, cur_iovec,
						      from_this_iovec);
		if (ret)
			return ret;

		remaining_to_add -= from_this_iovec;
	}
	*pkt_data_remaining = remaining_to_add;

	return 0;
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	/* Match on the exact start address only; len is not considered. */
	return (bool)(node->addr == addr);
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	free_system_node(node);
}