1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include <linux/circ_buf.h> 7 8 #include <drm/drm_exec.h> 9 #include <drm/drm_managed.h> 10 11 #include "xe_bo.h" 12 #include "xe_device.h" 13 #include "xe_gt_printk.h" 14 #include "xe_gt_types.h" 15 #include "xe_gt_stats.h" 16 #include "xe_hw_engine.h" 17 #include "xe_pagefault.h" 18 #include "xe_pagefault_types.h" 19 #include "xe_svm.h" 20 #include "xe_trace_bo.h" 21 #include "xe_vm.h" 22 23 /** 24 * DOC: Xe page faults 25 * 26 * Xe page faults are handled in two layers. The producer layer interacts with 27 * hardware or firmware to receive and parse faults into struct xe_pagefault, 28 * then forwards them to the consumer. The consumer layer services the faults 29 * (e.g., memory migration, page table updates) and acknowledges the result back 30 * to the producer, which then forwards the results to the hardware or firmware. 31 * The consumer uses a page fault queue sized to absorb all potential faults and 32 * a multi-threaded worker to process them. Multiple producers are supported, 33 * with a single shared consumer. 34 * 35 * xe_pagefault.c implements the consumer layer. 36 */ 37 38 static int xe_pagefault_entry_size(void) 39 { 40 /* 41 * Power of two alignment is not a hardware requirement, rather a 42 * software restriction which makes the math for page fault queue 43 * management simplier. 44 */ 45 return roundup_pow_of_two(sizeof(struct xe_pagefault)); 46 } 47 48 static int xe_pagefault_begin(struct drm_exec *exec, struct xe_vma *vma, 49 struct xe_vram_region *vram, bool need_vram_move) 50 { 51 struct xe_bo *bo = xe_vma_bo(vma); 52 struct xe_vm *vm = xe_vma_vm(vma); 53 int err; 54 55 err = xe_vm_lock_vma(exec, vma); 56 if (err) 57 return err; 58 59 if (!bo) 60 return 0; 61 62 return need_vram_move ? xe_bo_migrate(bo, vram->placement, NULL, exec) : 63 xe_bo_validate(bo, vm, true, exec); 64 } 65 66 static int xe_pagefault_handle_vma(struct xe_gt *gt, struct xe_vma *vma, 67 bool atomic) 68 { 69 struct xe_vm *vm = xe_vma_vm(vma); 70 struct xe_tile *tile = gt_to_tile(gt); 71 struct xe_validation_ctx ctx; 72 struct drm_exec exec; 73 struct dma_fence *fence; 74 int err, needs_vram; 75 76 lockdep_assert_held_write(&vm->lock); 77 78 needs_vram = xe_vma_need_vram_for_atomic(vm->xe, vma, atomic); 79 if (needs_vram < 0 || (needs_vram && xe_vma_is_userptr(vma))) 80 return needs_vram < 0 ? needs_vram : -EACCES; 81 82 xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1); 83 xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_KB, 84 xe_vma_size(vma) / SZ_1K); 85 86 trace_xe_vma_pagefault(vma); 87 88 /* Check if VMA is valid, opportunistic check only */ 89 if (xe_vm_has_valid_gpu_mapping(tile, vma->tile_present, 90 vma->tile_invalidated) && !atomic) 91 return 0; 92 93 retry_userptr: 94 if (xe_vma_is_userptr(vma) && 95 xe_vma_userptr_check_repin(to_userptr_vma(vma))) { 96 struct xe_userptr_vma *uvma = to_userptr_vma(vma); 97 98 err = xe_vma_userptr_pin_pages(uvma); 99 if (err) 100 return err; 101 } 102 103 /* Lock VM and BOs dma-resv */ 104 xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {}); 105 drm_exec_until_all_locked(&exec) { 106 err = xe_pagefault_begin(&exec, vma, tile->mem.vram, 107 needs_vram == 1); 108 drm_exec_retry_on_contention(&exec); 109 xe_validation_retry_on_oom(&ctx, &err); 110 if (err) 111 goto unlock_dma_resv; 112 113 /* Bind VMA only to the GT that has faulted */ 114 trace_xe_vma_pf_bind(vma); 115 xe_vm_set_validation_exec(vm, &exec); 116 fence = xe_vma_rebind(vm, vma, BIT(tile->id)); 117 xe_vm_set_validation_exec(vm, NULL); 118 if (IS_ERR(fence)) { 119 err = PTR_ERR(fence); 120 xe_validation_retry_on_oom(&ctx, &err); 121 goto unlock_dma_resv; 122 } 123 } 124 125 dma_fence_wait(fence, false); 126 dma_fence_put(fence); 127 128 unlock_dma_resv: 129 xe_validation_ctx_fini(&ctx); 130 if (err == -EAGAIN) 131 goto retry_userptr; 132 133 return err; 134 } 135 136 static bool 137 xe_pagefault_access_is_atomic(enum xe_pagefault_access_type access_type) 138 { 139 return (access_type & XE_PAGEFAULT_ACCESS_TYPE_MASK) == XE_PAGEFAULT_ACCESS_TYPE_ATOMIC; 140 } 141 142 static struct xe_vm *xe_pagefault_asid_to_vm(struct xe_device *xe, u32 asid) 143 { 144 struct xe_vm *vm; 145 146 down_read(&xe->usm.lock); 147 vm = xa_load(&xe->usm.asid_to_vm, asid); 148 if (vm && xe_vm_in_fault_mode(vm)) 149 xe_vm_get(vm); 150 else 151 vm = ERR_PTR(-EINVAL); 152 up_read(&xe->usm.lock); 153 154 return vm; 155 } 156 157 static int xe_pagefault_service(struct xe_pagefault *pf) 158 { 159 struct xe_gt *gt = pf->gt; 160 struct xe_device *xe = gt_to_xe(gt); 161 struct xe_vm *vm; 162 struct xe_vma *vma = NULL; 163 int err; 164 bool atomic; 165 166 /* Producer flagged this fault to be nacked */ 167 if (pf->consumer.fault_type_level == XE_PAGEFAULT_TYPE_LEVEL_NACK) 168 return -EFAULT; 169 170 vm = xe_pagefault_asid_to_vm(xe, pf->consumer.asid); 171 if (IS_ERR(vm)) 172 return PTR_ERR(vm); 173 174 /* 175 * TODO: Change to read lock? Using write lock for simplicity. 176 */ 177 down_write(&vm->lock); 178 179 if (xe_vm_is_closed(vm)) { 180 err = -ENOENT; 181 goto unlock_vm; 182 } 183 184 vma = xe_vm_find_vma_by_addr(vm, pf->consumer.page_addr); 185 if (!vma) { 186 err = -EINVAL; 187 goto unlock_vm; 188 } 189 190 atomic = xe_pagefault_access_is_atomic(pf->consumer.access_type); 191 192 if (xe_vma_is_cpu_addr_mirror(vma)) 193 err = xe_svm_handle_pagefault(vm, vma, gt, 194 pf->consumer.page_addr, atomic); 195 else 196 err = xe_pagefault_handle_vma(gt, vma, atomic); 197 198 unlock_vm: 199 if (!err) 200 vm->usm.last_fault_vma = vma; 201 up_write(&vm->lock); 202 xe_vm_put(vm); 203 204 return err; 205 } 206 207 static bool xe_pagefault_queue_pop(struct xe_pagefault_queue *pf_queue, 208 struct xe_pagefault *pf) 209 { 210 bool found_fault = false; 211 212 spin_lock_irq(&pf_queue->lock); 213 if (pf_queue->tail != pf_queue->head) { 214 memcpy(pf, pf_queue->data + pf_queue->tail, sizeof(*pf)); 215 pf_queue->tail = (pf_queue->tail + xe_pagefault_entry_size()) % 216 pf_queue->size; 217 found_fault = true; 218 } 219 spin_unlock_irq(&pf_queue->lock); 220 221 return found_fault; 222 } 223 224 static void xe_pagefault_print(struct xe_pagefault *pf) 225 { 226 xe_gt_info(pf->gt, "\n\tASID: %d\n" 227 "\tFaulted Address: 0x%08x%08x\n" 228 "\tFaultType: %lu\n" 229 "\tAccessType: %lu\n" 230 "\tFaultLevel: %lu\n" 231 "\tEngineClass: %d %s\n" 232 "\tEngineInstance: %d\n", 233 pf->consumer.asid, 234 upper_32_bits(pf->consumer.page_addr), 235 lower_32_bits(pf->consumer.page_addr), 236 FIELD_GET(XE_PAGEFAULT_TYPE_MASK, 237 pf->consumer.fault_type_level), 238 FIELD_GET(XE_PAGEFAULT_ACCESS_TYPE_MASK, 239 pf->consumer.access_type), 240 FIELD_GET(XE_PAGEFAULT_LEVEL_MASK, 241 pf->consumer.fault_type_level), 242 pf->consumer.engine_class, 243 xe_hw_engine_class_to_str(pf->consumer.engine_class), 244 pf->consumer.engine_instance); 245 } 246 247 static void xe_pagefault_queue_work(struct work_struct *w) 248 { 249 struct xe_pagefault_queue *pf_queue = 250 container_of(w, typeof(*pf_queue), worker); 251 struct xe_pagefault pf; 252 unsigned long threshold; 253 254 #define USM_QUEUE_MAX_RUNTIME_MS 20 255 threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS); 256 257 while (xe_pagefault_queue_pop(pf_queue, &pf)) { 258 int err; 259 260 if (!pf.gt) /* Fault squashed during reset */ 261 continue; 262 263 err = xe_pagefault_service(&pf); 264 if (err) { 265 if (!(pf.consumer.access_type & XE_PAGEFAULT_ACCESS_PREFETCH)) { 266 xe_pagefault_print(&pf); 267 xe_gt_info(pf.gt, "Fault response: Unsuccessful %pe\n", 268 ERR_PTR(err)); 269 } else { 270 xe_gt_stats_incr(pf.gt, XE_GT_STATS_ID_INVALID_PREFETCH_PAGEFAULT_COUNT, 1); 271 xe_gt_dbg(pf.gt, "Prefetch Fault response: Unsuccessful %pe\n", 272 ERR_PTR(err)); 273 } 274 } 275 276 pf.producer.ops->ack_fault(&pf, err); 277 278 if (time_after(jiffies, threshold)) { 279 queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w); 280 break; 281 } 282 } 283 #undef USM_QUEUE_MAX_RUNTIME_MS 284 } 285 286 static int xe_pagefault_queue_init(struct xe_device *xe, 287 struct xe_pagefault_queue *pf_queue) 288 { 289 struct xe_gt *gt; 290 int total_num_eus = 0; 291 u8 id; 292 293 for_each_gt(gt, xe, id) { 294 xe_dss_mask_t all_dss; 295 int num_dss, num_eus; 296 297 bitmap_or(all_dss, gt->fuse_topo.g_dss_mask, 298 gt->fuse_topo.c_dss_mask, XE_MAX_DSS_FUSE_BITS); 299 300 num_dss = bitmap_weight(all_dss, XE_MAX_DSS_FUSE_BITS); 301 num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss, 302 XE_MAX_EU_FUSE_BITS) * num_dss; 303 304 total_num_eus += num_eus; 305 } 306 307 xe_assert(xe, total_num_eus); 308 309 /* 310 * user can issue separate page faults per EU and per CS 311 * 312 * XXX: Multiplier required as compute UMD are getting PF queue errors 313 * without it. Follow on why this multiplier is required. 314 */ 315 #define PF_MULTIPLIER 8 316 pf_queue->size = (total_num_eus + XE_NUM_HW_ENGINES) * 317 xe_pagefault_entry_size() * PF_MULTIPLIER; 318 pf_queue->size = roundup_pow_of_two(pf_queue->size); 319 #undef PF_MULTIPLIER 320 321 drm_dbg(&xe->drm, "xe_pagefault_entry_size=%d, total_num_eus=%d, pf_queue->size=%u", 322 xe_pagefault_entry_size(), total_num_eus, pf_queue->size); 323 324 spin_lock_init(&pf_queue->lock); 325 INIT_WORK(&pf_queue->worker, xe_pagefault_queue_work); 326 327 pf_queue->data = drmm_kzalloc(&xe->drm, pf_queue->size, GFP_KERNEL); 328 if (!pf_queue->data) 329 return -ENOMEM; 330 331 return 0; 332 } 333 334 static void xe_pagefault_fini(void *arg) 335 { 336 struct xe_device *xe = arg; 337 338 destroy_workqueue(xe->usm.pf_wq); 339 } 340 341 /** 342 * xe_pagefault_init() - Page fault init 343 * @xe: xe device instance 344 * 345 * Initialize Xe page fault state. Must be done after reading fuses. 346 * 347 * Return: 0 on Success, errno on failure 348 */ 349 int xe_pagefault_init(struct xe_device *xe) 350 { 351 int err, i; 352 353 if (!xe->info.has_usm) 354 return 0; 355 356 xe->usm.pf_wq = alloc_workqueue("xe_page_fault_work_queue", 357 WQ_UNBOUND | WQ_HIGHPRI, 358 XE_PAGEFAULT_QUEUE_COUNT); 359 if (!xe->usm.pf_wq) 360 return -ENOMEM; 361 362 for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) { 363 err = xe_pagefault_queue_init(xe, xe->usm.pf_queue + i); 364 if (err) 365 goto err_out; 366 } 367 368 return devm_add_action_or_reset(xe->drm.dev, xe_pagefault_fini, xe); 369 370 err_out: 371 destroy_workqueue(xe->usm.pf_wq); 372 return err; 373 } 374 375 static void xe_pagefault_queue_reset(struct xe_device *xe, struct xe_gt *gt, 376 struct xe_pagefault_queue *pf_queue) 377 { 378 u32 i; 379 380 /* Driver load failure guard / USM not enabled guard */ 381 if (!pf_queue->data) 382 return; 383 384 /* Squash all pending faults on the GT */ 385 386 spin_lock_irq(&pf_queue->lock); 387 for (i = pf_queue->tail; i != pf_queue->head; 388 i = (i + xe_pagefault_entry_size()) % pf_queue->size) { 389 struct xe_pagefault *pf = pf_queue->data + i; 390 391 if (pf->gt == gt) 392 pf->gt = NULL; 393 } 394 spin_unlock_irq(&pf_queue->lock); 395 } 396 397 /** 398 * xe_pagefault_reset() - Page fault reset for a GT 399 * @xe: xe device instance 400 * @gt: GT being reset 401 * 402 * Reset the Xe page fault state for a GT; that is, squash any pending faults on 403 * the GT. 404 */ 405 void xe_pagefault_reset(struct xe_device *xe, struct xe_gt *gt) 406 { 407 int i; 408 409 for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) 410 xe_pagefault_queue_reset(xe, gt, xe->usm.pf_queue + i); 411 } 412 413 static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue) 414 { 415 lockdep_assert_held(&pf_queue->lock); 416 417 return CIRC_SPACE(pf_queue->head, pf_queue->tail, pf_queue->size) <= 418 xe_pagefault_entry_size(); 419 } 420 421 /** 422 * xe_pagefault_handler() - Page fault handler 423 * @xe: xe device instance 424 * @pf: Page fault 425 * 426 * Sink the page fault to a queue (i.e., a memory buffer) and queue a worker to 427 * service it. Safe to be called from IRQ or process context. Reclaim safe. 428 * 429 * Return: 0 on success, errno on failure 430 */ 431 int xe_pagefault_handler(struct xe_device *xe, struct xe_pagefault *pf) 432 { 433 struct xe_pagefault_queue *pf_queue = xe->usm.pf_queue + 434 (pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT); 435 unsigned long flags; 436 bool full; 437 438 spin_lock_irqsave(&pf_queue->lock, flags); 439 full = xe_pagefault_queue_full(pf_queue); 440 if (!full) { 441 memcpy(pf_queue->data + pf_queue->head, pf, sizeof(*pf)); 442 pf_queue->head = (pf_queue->head + xe_pagefault_entry_size()) % 443 pf_queue->size; 444 queue_work(xe->usm.pf_wq, &pf_queue->worker); 445 } else { 446 drm_warn(&xe->drm, 447 "PageFault Queue (%d) full, shouldn't be possible\n", 448 pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT); 449 } 450 spin_unlock_irqrestore(&pf_queue->lock, flags); 451 452 return full ? -ENOSPC : 0; 453 } 454