// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include <linux/circ_buf.h>

#include <drm/drm_exec.h>
#include <drm/drm_managed.h>

#include "xe_bo.h"
#include "xe_device.h"
#include "xe_gt_printk.h"
#include "xe_gt_stats.h"
#include "xe_gt_types.h"
#include "xe_hw_engine.h"
#include "xe_pagefault.h"
#include "xe_pagefault_types.h"
#include "xe_svm.h"
#include "xe_trace_bo.h"
#include "xe_vm.h"

/**
 * DOC: Xe page faults
 *
 * Xe page faults are handled in two layers. The producer layer interacts with
 * hardware or firmware to receive and parse faults into struct xe_pagefault,
 * then forwards them to the consumer. The consumer layer services the faults
 * (e.g., memory migration, page table updates) and acknowledges the result back
 * to the producer, which then forwards the results to the hardware or firmware.
 * The consumer uses page fault queues, hashed by ASID and sized to absorb all
 * potential faults, and a multi-threaded worker to process them. Multiple
 * producers are supported, with a single shared consumer.
 *
 * xe_pagefault.c implements the consumer layer.
 */

static int xe_pagefault_entry_size(void)
{
	/*
	 * Power of two alignment is not a hardware requirement, rather a
	 * software restriction which makes the math for page fault queue
	 * management simpler.
	 */
	return roundup_pow_of_two(sizeof(struct xe_pagefault));
}
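/*
 * Illustrative sketch of the queue math this enables. The numbers below are
 * assumptions, not derived from the real struct size: if
 * sizeof(struct xe_pagefault) were 48, the entry size rounds up to 64. With
 * a (power-of-two) queue size of, say, 1024 bytes, the queue holds exactly
 * 16 entries and head/tail stay entry-aligned across wrap:
 *
 *	tail = (960 + 64) % 1024;	// == 0: clean wrap, no partial entry
 *
 * Keeping both the entry size and the queue size powers of two means a whole
 * number of entries always fits, so an entry never straddles the wrap point,
 * and the CIRC_SPACE() use in xe_pagefault_queue_full() below is valid, as
 * the circ_buf helpers assume a power-of-two buffer size.
 */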
static int xe_pagefault_begin(struct drm_exec *exec, struct xe_vma *vma,
			      struct xe_vram_region *vram, bool need_vram_move)
{
	struct xe_bo *bo = xe_vma_bo(vma);
	struct xe_vm *vm = xe_vma_vm(vma);
	int err;

	err = xe_vm_lock_vma(exec, vma);
	if (err)
		return err;

	if (!bo)
		return 0;

	/*
	 * Skip validate/migrate for DONTNEED/purged BOs - repopulating
	 * their pages would prevent the shrinker from reclaiming them.
	 * For non-scratch VMs there is no safe fallback so fail the fault.
	 * For scratch VMs let xe_vma_rebind() run normally; it will install
	 * scratch PTEs so the GPU gets safe zero reads instead of faulting.
	 */
	if (unlikely(xe_bo_madv_is_dontneed(bo) || xe_bo_is_purged(bo))) {
		if (!xe_vm_has_scratch(vm))
			return -EACCES;
		return 0;
	}

	return need_vram_move ? xe_bo_migrate(bo, vram->placement, NULL, exec) :
		xe_bo_validate(bo, vm, true, exec);
}

static int xe_pagefault_handle_vma(struct xe_gt *gt, struct xe_vma *vma,
				   bool atomic)
{
	struct xe_vm *vm = xe_vma_vm(vma);
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct dma_fence *fence;
	int err, needs_vram;

	lockdep_assert_held_write(&vm->lock);

	needs_vram = xe_vma_need_vram_for_atomic(vm->xe, vma, atomic);
	if (needs_vram < 0 || (needs_vram && xe_vma_is_userptr(vma)))
		return needs_vram < 0 ? needs_vram : -EACCES;

	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1);
	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_KB,
			 xe_vma_size(vma) / SZ_1K);

	trace_xe_vma_pagefault(vma);

	/* Check if VMA is valid, opportunistic check only */
	if (xe_vm_has_valid_gpu_mapping(tile, vma->tile_present,
					vma->tile_invalidated) && !atomic)
		return 0;

retry_userptr:
	if (xe_vma_is_userptr(vma) &&
	    xe_vma_userptr_check_repin(to_userptr_vma(vma))) {
		struct xe_userptr_vma *uvma = to_userptr_vma(vma);

		err = xe_vma_userptr_pin_pages(uvma);
		if (err)
			return err;
	}

	/* Lock VM and BOs dma-resv */
	xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
	drm_exec_until_all_locked(&exec) {
		err = xe_pagefault_begin(&exec, vma, tile->mem.vram,
					 needs_vram == 1);
		drm_exec_retry_on_contention(&exec);
		xe_validation_retry_on_oom(&ctx, &err);
		if (err)
			goto unlock_dma_resv;

		/* Bind VMA only to the GT that has faulted */
		trace_xe_vma_pf_bind(vma);
		xe_vm_set_validation_exec(vm, &exec);
		fence = xe_vma_rebind(vm, vma, BIT(tile->id));
		xe_vm_set_validation_exec(vm, NULL);
		if (IS_ERR(fence)) {
			err = PTR_ERR(fence);
			xe_validation_retry_on_oom(&ctx, &err);
			goto unlock_dma_resv;
		}
	}

	dma_fence_wait(fence, false);
	dma_fence_put(fence);

unlock_dma_resv:
	xe_validation_ctx_fini(&ctx);
	if (err == -EAGAIN)
		goto retry_userptr;

	return err;
}
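/*
 * Note on the locking pattern above: the body of drm_exec_until_all_locked()
 * may execute several times, since drm_exec drops every lock and replays the
 * block whenever it hits contention, so the block must be restartable and
 * must not unlock by hand. Skeleton of the pattern, with my_lock_and_prepare()
 * as a hypothetical stand-in for xe_pagefault_begin() plus the rebind:
 *
 *	drm_exec_until_all_locked(&exec) {
 *		err = my_lock_and_prepare(&exec, vma);	// hypothetical
 *		drm_exec_retry_on_contention(&exec);	// replay on -EDEADLK
 *		if (err)
 *			goto out;	// jump past the loop, no manual unlock
 *	}
 *
 * This is why the error paths above jump straight to xe_validation_ctx_fini()
 * rather than releasing individual locks.
 */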
static bool
xe_pagefault_access_is_atomic(enum xe_pagefault_access_type access_type)
{
	return (access_type & XE_PAGEFAULT_ACCESS_TYPE_MASK) ==
		XE_PAGEFAULT_ACCESS_TYPE_ATOMIC;
}

static struct xe_vm *xe_pagefault_asid_to_vm(struct xe_device *xe, u32 asid)
{
	struct xe_vm *vm;

	down_read(&xe->usm.lock);
	vm = xa_load(&xe->usm.asid_to_vm, asid);
	if (vm && (xe_vm_in_fault_mode(vm) || xe_vm_has_scratch(vm)))
		xe_vm_get(vm);
	else
		vm = ERR_PTR(-EINVAL);
	up_read(&xe->usm.lock);

	return vm;
}

static int xe_pagefault_service(struct xe_pagefault *pf)
{
	struct xe_gt *gt = pf->gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_vm *vm;
	struct xe_vma *vma = NULL;
	int err;
	bool atomic;

	/* Producer flagged this fault to be nacked */
	if (pf->consumer.fault_type_level == XE_PAGEFAULT_TYPE_LEVEL_NACK)
		return -EFAULT;

	vm = xe_pagefault_asid_to_vm(xe, pf->consumer.asid);
	if (IS_ERR(vm))
		return PTR_ERR(vm);

	/*
	 * TODO: Change to read lock? Using write lock for simplicity.
	 */
	down_write(&vm->lock);

	if (xe_vm_is_closed(vm)) {
		err = -ENOENT;
		goto unlock_vm;
	}

	vma = xe_vm_find_vma_by_addr(vm, pf->consumer.page_addr);
	if (!vma) {
		err = -EINVAL;
		goto unlock_vm;
	}

	if (xe_vma_read_only(vma) &&
	    pf->consumer.access_type != XE_PAGEFAULT_ACCESS_TYPE_READ) {
		err = -EPERM;
		goto unlock_vm;
	}

	atomic = xe_pagefault_access_is_atomic(pf->consumer.access_type);

	if (xe_vma_is_cpu_addr_mirror(vma))
		err = xe_svm_handle_pagefault(vm, vma, gt,
					      pf->consumer.page_addr, atomic);
	else
		err = xe_pagefault_handle_vma(gt, vma, atomic);

unlock_vm:
	if (!err)
		vm->usm.last_fault_vma = vma;
	up_write(&vm->lock);
	xe_vm_put(vm);

	return err;
}

static bool xe_pagefault_queue_pop(struct xe_pagefault_queue *pf_queue,
				   struct xe_pagefault *pf)
{
	bool found_fault = false;

	spin_lock_irq(&pf_queue->lock);
	if (pf_queue->tail != pf_queue->head) {
		memcpy(pf, pf_queue->data + pf_queue->tail, sizeof(*pf));
		pf_queue->tail = (pf_queue->tail + xe_pagefault_entry_size()) %
			pf_queue->size;
		found_fault = true;
	}
	spin_unlock_irq(&pf_queue->lock);

	return found_fault;
}

static void xe_pagefault_print(struct xe_pagefault *pf)
{
	xe_gt_info(pf->gt, "\n\tASID: %d\n"
		   "\tFaulted Address: 0x%08x%08x\n"
		   "\tFaultType: %lu\n"
		   "\tAccessType: %lu\n"
		   "\tFaultLevel: %lu\n"
		   "\tEngineClass: %d %s\n"
		   "\tEngineInstance: %d\n",
		   pf->consumer.asid,
		   upper_32_bits(pf->consumer.page_addr),
		   lower_32_bits(pf->consumer.page_addr),
		   FIELD_GET(XE_PAGEFAULT_TYPE_MASK,
			     pf->consumer.fault_type_level),
		   FIELD_GET(XE_PAGEFAULT_ACCESS_TYPE_MASK,
			     pf->consumer.access_type),
		   FIELD_GET(XE_PAGEFAULT_LEVEL_MASK,
			     pf->consumer.fault_type_level),
		   pf->consumer.engine_class,
		   xe_hw_engine_class_to_str(pf->consumer.engine_class),
		   pf->consumer.engine_instance);
}

static void xe_pagefault_save_to_vm(struct xe_device *xe, struct xe_pagefault *pf)
{
	struct xe_vm *vm;

	/*
	 * The fault may be associated with a VM that is not in fault mode.
	 * Mirror xe_pagefault_asid_to_vm() behavior here, except return the
	 * VM even if it is not in fault mode.
	 */
	down_read(&xe->usm.lock);
	vm = xa_load(&xe->usm.asid_to_vm, pf->consumer.asid);
	if (vm)
		xe_vm_get(vm);
	else
		vm = ERR_PTR(-EINVAL);
	up_read(&xe->usm.lock);

	if (IS_ERR(vm))
		return;

	xe_vm_add_fault_entry_pf(vm, pf);

	xe_vm_put(vm);
}

static void xe_pagefault_queue_work(struct work_struct *w)
{
	struct xe_pagefault_queue *pf_queue =
		container_of(w, typeof(*pf_queue), worker);
	struct xe_pagefault pf;
	unsigned long threshold;

#define USM_QUEUE_MAX_RUNTIME_MS	20
	threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);

	while (xe_pagefault_queue_pop(pf_queue, &pf)) {
		int err;

		if (!pf.gt)	/* Fault squashed during reset */
			continue;

		err = xe_pagefault_service(&pf);
		if (err) {
			xe_pagefault_save_to_vm(gt_to_xe(pf.gt), &pf);
			if (!(pf.consumer.access_type & XE_PAGEFAULT_ACCESS_PREFETCH)) {
				xe_pagefault_print(&pf);
				xe_gt_info(pf.gt, "Fault response: Unsuccessful %pe\n",
					   ERR_PTR(err));
			} else {
				xe_gt_stats_incr(pf.gt, XE_GT_STATS_ID_INVALID_PREFETCH_PAGEFAULT_COUNT, 1);
				xe_gt_dbg(pf.gt, "Prefetch Fault response: Unsuccessful %pe\n",
					  ERR_PTR(err));
			}
		}

		pf.producer.ops->ack_fault(&pf, err);

		if (time_after(jiffies, threshold)) {
			queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
			break;
		}
	}
#undef USM_QUEUE_MAX_RUNTIME_MS
}
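/*
 * The loop above bounds each worker invocation to USM_QUEUE_MAX_RUNTIME_MS
 * and requeues itself rather than draining the queue in one go, so a burst
 * of faults on one queue cannot monopolize a workqueue thread. Reduced to
 * its skeleton (pop() and service() are placeholders for the helpers above):
 *
 *	threshold = jiffies + msecs_to_jiffies(BUDGET_MS);
 *	while (pop(&pf)) {
 *		service(&pf);
 *		if (time_after(jiffies, threshold)) {
 *			queue_work(wq, w);	// yield; resume later
 *			break;
 *		}
 *	}
 */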
static int xe_pagefault_queue_init(struct xe_device *xe,
				   struct xe_pagefault_queue *pf_queue)
{
	struct xe_gt *gt;
	int total_num_eus = 0;
	u8 id;

	for_each_gt(gt, xe, id) {
		xe_dss_mask_t all_dss;
		int num_dss, num_eus;

		bitmap_or(all_dss, gt->fuse_topo.g_dss_mask,
			  gt->fuse_topo.c_dss_mask, XE_MAX_DSS_FUSE_BITS);
		num_dss = bitmap_weight(all_dss, XE_MAX_DSS_FUSE_BITS);

		num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss,
					XE_MAX_EU_FUSE_BITS) * num_dss;

		total_num_eus += num_eus;
	}

	xe_assert(xe, total_num_eus);

	/*
	 * A user can issue separate page faults per EU and per CS.
	 *
	 * XXX: Multiplier required as compute UMDs are getting PF queue
	 * errors without it. Follow up on why this multiplier is required.
	 */
#define PF_MULTIPLIER	8
	pf_queue->size = (total_num_eus + XE_NUM_HW_ENGINES) *
		xe_pagefault_entry_size() * PF_MULTIPLIER;
	pf_queue->size = roundup_pow_of_two(pf_queue->size);
#undef PF_MULTIPLIER

	drm_dbg(&xe->drm, "xe_pagefault_entry_size=%d, total_num_eus=%d, pf_queue->size=%u",
		xe_pagefault_entry_size(), total_num_eus, pf_queue->size);

	spin_lock_init(&pf_queue->lock);
	INIT_WORK(&pf_queue->worker, xe_pagefault_queue_work);

	pf_queue->data = drmm_kzalloc(&xe->drm, pf_queue->size, GFP_KERNEL);
	if (!pf_queue->data)
		return -ENOMEM;

	return 0;
}
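/*
 * Worked example of the sizing above, under assumed numbers (purely
 * illustrative, not the real fuse values): with a 64-byte entry size and a
 * part exposing 1024 EUs in total, the pre-rounding size is
 *
 *	(1024 + XE_NUM_HW_ENGINES) * 64 * 8  >  512 KiB
 *
 * so roundup_pow_of_two() lands on 1 MiB for the queue. The worst case
 * modeled is one outstanding fault per EU plus one per command streamer,
 * scaled by the empirical PF_MULTIPLIER noted in the XXX comment.
 */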
static void xe_pagefault_fini(void *arg)
{
	struct xe_device *xe = arg;

	destroy_workqueue(xe->usm.pf_wq);
}

/**
 * xe_pagefault_init() - Page fault init
 * @xe: xe device instance
 *
 * Initialize Xe page fault state. Must be done after reading fuses.
 *
 * Return: 0 on success, errno on failure
 */
int xe_pagefault_init(struct xe_device *xe)
{
	int err, i;

	if (!xe->info.has_usm)
		return 0;

	xe->usm.pf_wq = alloc_workqueue("xe_page_fault_work_queue",
					WQ_UNBOUND | WQ_HIGHPRI,
					XE_PAGEFAULT_QUEUE_COUNT);
	if (!xe->usm.pf_wq)
		return -ENOMEM;

	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) {
		err = xe_pagefault_queue_init(xe, xe->usm.pf_queue + i);
		if (err)
			goto err_out;
	}

	return devm_add_action_or_reset(xe->drm.dev, xe_pagefault_fini, xe);

err_out:
	destroy_workqueue(xe->usm.pf_wq);
	return err;
}

static void xe_pagefault_queue_reset(struct xe_device *xe, struct xe_gt *gt,
				     struct xe_pagefault_queue *pf_queue)
{
	u32 i;

	/* Driver load failure guard / USM not enabled guard */
	if (!pf_queue->data)
		return;

	/* Squash all pending faults on the GT */
	spin_lock_irq(&pf_queue->lock);
	for (i = pf_queue->tail; i != pf_queue->head;
	     i = (i + xe_pagefault_entry_size()) % pf_queue->size) {
		struct xe_pagefault *pf = pf_queue->data + i;

		if (pf->gt == gt)
			pf->gt = NULL;
	}
	spin_unlock_irq(&pf_queue->lock);
}

/**
 * xe_pagefault_reset() - Page fault reset for a GT
 * @xe: xe device instance
 * @gt: GT being reset
 *
 * Reset the Xe page fault state for a GT; that is, squash any pending faults on
 * the GT.
 */
void xe_pagefault_reset(struct xe_device *xe, struct xe_gt *gt)
{
	int i;

	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i)
		xe_pagefault_queue_reset(xe, gt, xe->usm.pf_queue + i);
}

static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
{
	lockdep_assert_held(&pf_queue->lock);

	return CIRC_SPACE(pf_queue->head, pf_queue->tail, pf_queue->size) <=
		xe_pagefault_entry_size();
}

/**
 * xe_pagefault_handler() - Page fault handler
 * @xe: xe device instance
 * @pf: Page fault
 *
 * Sink the page fault to a queue (i.e., a memory buffer) and queue a worker to
 * service it. Safe to be called from IRQ or process context. Reclaim safe.
 *
 * Return: 0 on success, errno on failure
 */
int xe_pagefault_handler(struct xe_device *xe, struct xe_pagefault *pf)
{
	struct xe_pagefault_queue *pf_queue = xe->usm.pf_queue +
		(pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT);
	unsigned long flags;
	bool full;

	spin_lock_irqsave(&pf_queue->lock, flags);
	full = xe_pagefault_queue_full(pf_queue);
	if (!full) {
		memcpy(pf_queue->data + pf_queue->head, pf, sizeof(*pf));
		pf_queue->head = (pf_queue->head + xe_pagefault_entry_size()) %
			pf_queue->size;
		queue_work(xe->usm.pf_wq, &pf_queue->worker);
	} else {
		drm_warn(&xe->drm,
			 "PageFault Queue (%d) full, shouldn't be possible\n",
			 pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT);
	}
	spin_unlock_irqrestore(&pf_queue->lock, flags);

	return full ? -ENOSPC : 0;
}
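/*
 * For completeness, a sketch of the producer side described in the DOC
 * comment at the top of this file. Everything prefixed my_ is hypothetical,
 * and the ops struct name is assumed from the pf->producer.ops->ack_fault()
 * call in xe_pagefault_queue_work(); a real producer lives wherever the
 * hardware/firmware fault messages arrive:
 *
 *	static void my_ack_fault(struct xe_pagefault *pf, int err)
 *	{
 *		// Forward success/failure back to the HW/FW interface
 *	}
 *
 *	static const struct xe_pagefault_ops my_producer_ops = {
 *		.ack_fault = my_ack_fault,
 *	};
 *
 *	static int my_fault_msg_handler(struct xe_gt *gt,
 *					const struct my_fault_msg *msg)
 *	{
 *		struct xe_pagefault pf = {
 *			.gt = gt,
 *			.consumer.asid = msg->asid,
 *			.consumer.page_addr = msg->addr,
 *			// plus access/fault type and engine fields
 *			.producer.ops = &my_producer_ops,
 *		};
 *
 *		return xe_pagefault_handler(gt_to_xe(gt), &pf);
 *	}
 *
 * The consumer copies the struct into its queue, so the producer may pass a
 * stack-allocated fault and needs no cleanup on success.
 */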