1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include <linux/circ_buf.h> 7 8 #include <drm/drm_exec.h> 9 #include <drm/drm_managed.h> 10 11 #include "xe_bo.h" 12 #include "xe_device.h" 13 #include "xe_gt_printk.h" 14 #include "xe_gt_types.h" 15 #include "xe_gt_stats.h" 16 #include "xe_hw_engine.h" 17 #include "xe_pagefault.h" 18 #include "xe_pagefault_types.h" 19 #include "xe_svm.h" 20 #include "xe_trace_bo.h" 21 #include "xe_vm.h" 22 23 /** 24 * DOC: Xe page faults 25 * 26 * Xe page faults are handled in two layers. The producer layer interacts with 27 * hardware or firmware to receive and parse faults into struct xe_pagefault, 28 * then forwards them to the consumer. The consumer layer services the faults 29 * (e.g., memory migration, page table updates) and acknowledges the result back 30 * to the producer, which then forwards the results to the hardware or firmware. 31 * The consumer uses a page fault queue sized to absorb all potential faults and 32 * a multi-threaded worker to process them. Multiple producers are supported, 33 * with a single shared consumer. 34 * 35 * xe_pagefault.c implements the consumer layer. 36 */ 37 38 static int xe_pagefault_entry_size(void) 39 { 40 /* 41 * Power of two alignment is not a hardware requirement, rather a 42 * software restriction which makes the math for page fault queue 43 * management simplier. 44 */ 45 return roundup_pow_of_two(sizeof(struct xe_pagefault)); 46 } 47 48 static int xe_pagefault_begin(struct drm_exec *exec, struct xe_vma *vma, 49 struct xe_vram_region *vram, bool need_vram_move) 50 { 51 struct xe_bo *bo = xe_vma_bo(vma); 52 struct xe_vm *vm = xe_vma_vm(vma); 53 int err; 54 55 err = xe_vm_lock_vma(exec, vma); 56 if (err) 57 return err; 58 59 if (!bo) 60 return 0; 61 62 /* 63 * Skip validate/migrate for DONTNEED/purged BOs - repopulating 64 * their pages would prevent the shrinker from reclaiming them. 65 * For non-scratch VMs there is no safe fallback so fail the fault. 66 * For scratch VMs let xe_vma_rebind() run normally; it will install 67 * scratch PTEs so the GPU gets safe zero reads instead of faulting. 68 */ 69 if (unlikely(xe_bo_madv_is_dontneed(bo) || xe_bo_is_purged(bo))) { 70 if (!xe_vm_has_scratch(vm)) 71 return -EACCES; 72 return 0; 73 } 74 75 return need_vram_move ? xe_bo_migrate(bo, vram->placement, NULL, exec) : 76 xe_bo_validate(bo, vm, true, exec); 77 } 78 79 static int xe_pagefault_handle_vma(struct xe_gt *gt, struct xe_vma *vma, 80 bool atomic) 81 { 82 struct xe_vm *vm = xe_vma_vm(vma); 83 struct xe_tile *tile = gt_to_tile(gt); 84 struct xe_validation_ctx ctx; 85 struct drm_exec exec; 86 struct dma_fence *fence; 87 int err, needs_vram; 88 89 lockdep_assert_held_write(&vm->lock); 90 91 needs_vram = xe_vma_need_vram_for_atomic(vm->xe, vma, atomic); 92 if (needs_vram < 0 || (needs_vram && xe_vma_is_userptr(vma))) 93 return needs_vram < 0 ? needs_vram : -EACCES; 94 95 xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1); 96 xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_KB, 97 xe_vma_size(vma) / SZ_1K); 98 99 trace_xe_vma_pagefault(vma); 100 101 /* Check if VMA is valid, opportunistic check only */ 102 if (xe_vm_has_valid_gpu_mapping(tile, vma->tile_present, 103 vma->tile_invalidated) && !atomic) 104 return 0; 105 106 retry_userptr: 107 if (xe_vma_is_userptr(vma) && 108 xe_vma_userptr_check_repin(to_userptr_vma(vma))) { 109 struct xe_userptr_vma *uvma = to_userptr_vma(vma); 110 111 err = xe_vma_userptr_pin_pages(uvma); 112 if (err) 113 return err; 114 } 115 116 /* Lock VM and BOs dma-resv */ 117 xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {}); 118 drm_exec_until_all_locked(&exec) { 119 err = xe_pagefault_begin(&exec, vma, tile->mem.vram, 120 needs_vram == 1); 121 drm_exec_retry_on_contention(&exec); 122 xe_validation_retry_on_oom(&ctx, &err); 123 if (err) 124 goto unlock_dma_resv; 125 126 /* Bind VMA only to the GT that has faulted */ 127 trace_xe_vma_pf_bind(vma); 128 xe_vm_set_validation_exec(vm, &exec); 129 fence = xe_vma_rebind(vm, vma, BIT(tile->id)); 130 xe_vm_set_validation_exec(vm, NULL); 131 if (IS_ERR(fence)) { 132 err = PTR_ERR(fence); 133 xe_validation_retry_on_oom(&ctx, &err); 134 goto unlock_dma_resv; 135 } 136 } 137 138 dma_fence_wait(fence, false); 139 dma_fence_put(fence); 140 141 unlock_dma_resv: 142 xe_validation_ctx_fini(&ctx); 143 if (err == -EAGAIN) 144 goto retry_userptr; 145 146 return err; 147 } 148 149 static bool 150 xe_pagefault_access_is_atomic(enum xe_pagefault_access_type access_type) 151 { 152 return (access_type & XE_PAGEFAULT_ACCESS_TYPE_MASK) == XE_PAGEFAULT_ACCESS_TYPE_ATOMIC; 153 } 154 155 static struct xe_vm *xe_pagefault_asid_to_vm(struct xe_device *xe, u32 asid) 156 { 157 struct xe_vm *vm; 158 159 down_read(&xe->usm.lock); 160 vm = xa_load(&xe->usm.asid_to_vm, asid); 161 if (vm && (xe_vm_in_fault_mode(vm) || xe_vm_has_scratch(vm))) 162 xe_vm_get(vm); 163 else 164 vm = ERR_PTR(-EINVAL); 165 up_read(&xe->usm.lock); 166 167 return vm; 168 } 169 170 static int xe_pagefault_service(struct xe_pagefault *pf) 171 { 172 struct xe_gt *gt = pf->gt; 173 struct xe_device *xe = gt_to_xe(gt); 174 struct xe_vm *vm; 175 struct xe_vma *vma = NULL; 176 int err; 177 bool atomic; 178 179 /* Producer flagged this fault to be nacked */ 180 if (pf->consumer.fault_type_level == XE_PAGEFAULT_TYPE_LEVEL_NACK) 181 return -EFAULT; 182 183 vm = xe_pagefault_asid_to_vm(xe, pf->consumer.asid); 184 if (IS_ERR(vm)) 185 return PTR_ERR(vm); 186 187 /* 188 * TODO: Change to read lock? Using write lock for simplicity. 189 */ 190 down_write(&vm->lock); 191 192 if (xe_vm_is_closed(vm)) { 193 err = -ENOENT; 194 goto unlock_vm; 195 } 196 197 vma = xe_vm_find_vma_by_addr(vm, pf->consumer.page_addr); 198 if (!vma) { 199 err = -EINVAL; 200 goto unlock_vm; 201 } 202 203 if (xe_vma_read_only(vma) && 204 pf->consumer.access_type != XE_PAGEFAULT_ACCESS_TYPE_READ) { 205 err = -EPERM; 206 goto unlock_vm; 207 } 208 209 atomic = xe_pagefault_access_is_atomic(pf->consumer.access_type); 210 211 if (xe_vma_is_cpu_addr_mirror(vma)) 212 err = xe_svm_handle_pagefault(vm, vma, gt, 213 pf->consumer.page_addr, atomic); 214 else 215 err = xe_pagefault_handle_vma(gt, vma, atomic); 216 217 unlock_vm: 218 if (!err) 219 vm->usm.last_fault_vma = vma; 220 up_write(&vm->lock); 221 xe_vm_put(vm); 222 223 return err; 224 } 225 226 static bool xe_pagefault_queue_pop(struct xe_pagefault_queue *pf_queue, 227 struct xe_pagefault *pf) 228 { 229 bool found_fault = false; 230 231 spin_lock_irq(&pf_queue->lock); 232 if (pf_queue->tail != pf_queue->head) { 233 memcpy(pf, pf_queue->data + pf_queue->tail, sizeof(*pf)); 234 pf_queue->tail = (pf_queue->tail + xe_pagefault_entry_size()) % 235 pf_queue->size; 236 found_fault = true; 237 } 238 spin_unlock_irq(&pf_queue->lock); 239 240 return found_fault; 241 } 242 243 static void xe_pagefault_print(struct xe_pagefault *pf) 244 { 245 xe_gt_info(pf->gt, "\n\tASID: %d\n" 246 "\tFaulted Address: 0x%08x%08x\n" 247 "\tFaultType: %lu\n" 248 "\tAccessType: %lu\n" 249 "\tFaultLevel: %lu\n" 250 "\tEngineClass: %d %s\n" 251 "\tEngineInstance: %d\n", 252 pf->consumer.asid, 253 upper_32_bits(pf->consumer.page_addr), 254 lower_32_bits(pf->consumer.page_addr), 255 FIELD_GET(XE_PAGEFAULT_TYPE_MASK, 256 pf->consumer.fault_type_level), 257 FIELD_GET(XE_PAGEFAULT_ACCESS_TYPE_MASK, 258 pf->consumer.access_type), 259 FIELD_GET(XE_PAGEFAULT_LEVEL_MASK, 260 pf->consumer.fault_type_level), 261 pf->consumer.engine_class, 262 xe_hw_engine_class_to_str(pf->consumer.engine_class), 263 pf->consumer.engine_instance); 264 } 265 266 static void xe_pagefault_save_to_vm(struct xe_device *xe, struct xe_pagefault *pf) 267 { 268 struct xe_vm *vm; 269 270 /* 271 * Pagefault may be asociated to VM that is not in fault mode. 272 * Perform asid_to_vm behavior, except if VM is not in fault 273 * mode, return VM anyways. 274 */ 275 down_read(&xe->usm.lock); 276 vm = xa_load(&xe->usm.asid_to_vm, pf->consumer.asid); 277 if (vm) 278 xe_vm_get(vm); 279 else 280 vm = ERR_PTR(-EINVAL); 281 up_read(&xe->usm.lock); 282 283 if (IS_ERR(vm)) 284 return; 285 286 xe_vm_add_fault_entry_pf(vm, pf); 287 288 xe_vm_put(vm); 289 } 290 291 static void xe_pagefault_queue_work(struct work_struct *w) 292 { 293 struct xe_pagefault_queue *pf_queue = 294 container_of(w, typeof(*pf_queue), worker); 295 struct xe_pagefault pf; 296 unsigned long threshold; 297 298 #define USM_QUEUE_MAX_RUNTIME_MS 20 299 threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS); 300 301 while (xe_pagefault_queue_pop(pf_queue, &pf)) { 302 int err; 303 304 if (!pf.gt) /* Fault squashed during reset */ 305 continue; 306 307 err = xe_pagefault_service(&pf); 308 if (err) { 309 xe_pagefault_save_to_vm(gt_to_xe(pf.gt), &pf); 310 if (!(pf.consumer.access_type & XE_PAGEFAULT_ACCESS_PREFETCH)) { 311 xe_pagefault_print(&pf); 312 xe_gt_info(pf.gt, "Fault response: Unsuccessful %pe\n", 313 ERR_PTR(err)); 314 } else { 315 xe_gt_stats_incr(pf.gt, XE_GT_STATS_ID_INVALID_PREFETCH_PAGEFAULT_COUNT, 1); 316 xe_gt_dbg(pf.gt, "Prefetch Fault response: Unsuccessful %pe\n", 317 ERR_PTR(err)); 318 } 319 } 320 321 pf.producer.ops->ack_fault(&pf, err); 322 323 if (time_after(jiffies, threshold)) { 324 queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w); 325 break; 326 } 327 } 328 #undef USM_QUEUE_MAX_RUNTIME_MS 329 } 330 331 static int xe_pagefault_queue_init(struct xe_device *xe, 332 struct xe_pagefault_queue *pf_queue) 333 { 334 struct xe_gt *gt; 335 int total_num_eus = 0; 336 u8 id; 337 338 for_each_gt(gt, xe, id) { 339 xe_dss_mask_t all_dss; 340 int num_dss, num_eus; 341 342 bitmap_or(all_dss, gt->fuse_topo.g_dss_mask, 343 gt->fuse_topo.c_dss_mask, XE_MAX_DSS_FUSE_BITS); 344 345 num_dss = bitmap_weight(all_dss, XE_MAX_DSS_FUSE_BITS); 346 num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss, 347 XE_MAX_EU_FUSE_BITS) * num_dss; 348 349 total_num_eus += num_eus; 350 } 351 352 xe_assert(xe, total_num_eus); 353 354 /* 355 * user can issue separate page faults per EU and per CS 356 * 357 * XXX: Multiplier required as compute UMD are getting PF queue errors 358 * without it. Follow on why this multiplier is required. 359 */ 360 #define PF_MULTIPLIER 8 361 pf_queue->size = (total_num_eus + XE_NUM_HW_ENGINES) * 362 xe_pagefault_entry_size() * PF_MULTIPLIER; 363 pf_queue->size = roundup_pow_of_two(pf_queue->size); 364 #undef PF_MULTIPLIER 365 366 drm_dbg(&xe->drm, "xe_pagefault_entry_size=%d, total_num_eus=%d, pf_queue->size=%u", 367 xe_pagefault_entry_size(), total_num_eus, pf_queue->size); 368 369 spin_lock_init(&pf_queue->lock); 370 INIT_WORK(&pf_queue->worker, xe_pagefault_queue_work); 371 372 pf_queue->data = drmm_kzalloc(&xe->drm, pf_queue->size, GFP_KERNEL); 373 if (!pf_queue->data) 374 return -ENOMEM; 375 376 return 0; 377 } 378 379 static void xe_pagefault_fini(void *arg) 380 { 381 struct xe_device *xe = arg; 382 383 destroy_workqueue(xe->usm.pf_wq); 384 } 385 386 /** 387 * xe_pagefault_init() - Page fault init 388 * @xe: xe device instance 389 * 390 * Initialize Xe page fault state. Must be done after reading fuses. 391 * 392 * Return: 0 on Success, errno on failure 393 */ 394 int xe_pagefault_init(struct xe_device *xe) 395 { 396 int err, i; 397 398 if (!xe->info.has_usm) 399 return 0; 400 401 xe->usm.pf_wq = alloc_workqueue("xe_page_fault_work_queue", 402 WQ_UNBOUND | WQ_HIGHPRI, 403 XE_PAGEFAULT_QUEUE_COUNT); 404 if (!xe->usm.pf_wq) 405 return -ENOMEM; 406 407 for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) { 408 err = xe_pagefault_queue_init(xe, xe->usm.pf_queue + i); 409 if (err) 410 goto err_out; 411 } 412 413 return devm_add_action_or_reset(xe->drm.dev, xe_pagefault_fini, xe); 414 415 err_out: 416 destroy_workqueue(xe->usm.pf_wq); 417 return err; 418 } 419 420 static void xe_pagefault_queue_reset(struct xe_device *xe, struct xe_gt *gt, 421 struct xe_pagefault_queue *pf_queue) 422 { 423 u32 i; 424 425 /* Driver load failure guard / USM not enabled guard */ 426 if (!pf_queue->data) 427 return; 428 429 /* Squash all pending faults on the GT */ 430 431 spin_lock_irq(&pf_queue->lock); 432 for (i = pf_queue->tail; i != pf_queue->head; 433 i = (i + xe_pagefault_entry_size()) % pf_queue->size) { 434 struct xe_pagefault *pf = pf_queue->data + i; 435 436 if (pf->gt == gt) 437 pf->gt = NULL; 438 } 439 spin_unlock_irq(&pf_queue->lock); 440 } 441 442 /** 443 * xe_pagefault_reset() - Page fault reset for a GT 444 * @xe: xe device instance 445 * @gt: GT being reset 446 * 447 * Reset the Xe page fault state for a GT; that is, squash any pending faults on 448 * the GT. 449 */ 450 void xe_pagefault_reset(struct xe_device *xe, struct xe_gt *gt) 451 { 452 int i; 453 454 for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) 455 xe_pagefault_queue_reset(xe, gt, xe->usm.pf_queue + i); 456 } 457 458 static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue) 459 { 460 lockdep_assert_held(&pf_queue->lock); 461 462 return CIRC_SPACE(pf_queue->head, pf_queue->tail, pf_queue->size) <= 463 xe_pagefault_entry_size(); 464 } 465 466 /** 467 * xe_pagefault_handler() - Page fault handler 468 * @xe: xe device instance 469 * @pf: Page fault 470 * 471 * Sink the page fault to a queue (i.e., a memory buffer) and queue a worker to 472 * service it. Safe to be called from IRQ or process context. Reclaim safe. 473 * 474 * Return: 0 on success, errno on failure 475 */ 476 int xe_pagefault_handler(struct xe_device *xe, struct xe_pagefault *pf) 477 { 478 struct xe_pagefault_queue *pf_queue = xe->usm.pf_queue + 479 (pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT); 480 unsigned long flags; 481 bool full; 482 483 spin_lock_irqsave(&pf_queue->lock, flags); 484 full = xe_pagefault_queue_full(pf_queue); 485 if (!full) { 486 memcpy(pf_queue->data + pf_queue->head, pf, sizeof(*pf)); 487 pf_queue->head = (pf_queue->head + xe_pagefault_entry_size()) % 488 pf_queue->size; 489 queue_work(xe->usm.pf_wq, &pf_queue->worker); 490 } else { 491 drm_warn(&xe->drm, 492 "PageFault Queue (%d) full, shouldn't be possible\n", 493 pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT); 494 } 495 spin_unlock_irqrestore(&pf_queue->lock, flags); 496 497 return full ? -ENOSPC : 0; 498 } 499