// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include <linux/circ_buf.h>

#include <drm/drm_exec.h>
#include <drm/drm_managed.h>

#include "xe_bo.h"
#include "xe_device.h"
#include "xe_gt_printk.h"
#include "xe_gt_stats.h"
#include "xe_gt_types.h"
#include "xe_hw_engine.h"
#include "xe_pagefault.h"
#include "xe_pagefault_types.h"
#include "xe_svm.h"
#include "xe_trace_bo.h"
#include "xe_vm.h"

/**
 * DOC: Xe page faults
 *
 * Xe page faults are handled in two layers. The producer layer interacts with
 * hardware or firmware to receive and parse faults into struct xe_pagefault,
 * then forwards them to the consumer. The consumer layer services the faults
 * (e.g., memory migration, page table updates) and acknowledges the result
 * back to the producer, which then forwards it to the hardware or firmware.
 * The consumer uses a page fault queue sized to absorb all potential faults
 * and a multi-threaded worker to process them. Multiple producers are
 * supported, with a single shared consumer.
 *
 * xe_pagefault.c implements the consumer layer.
 */
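
/*
 * A minimal sketch of the producer hand-off described in the DOC section
 * above, assuming a hypothetical producer (e.g. a firmware interrupt
 * handler). my_parse_fault() and my_producer_ops are illustrative names
 * only, standing in for whatever parses the raw fault message into
 * pf.consumer and for the producer's ops table; struct xe_pagefault and
 * xe_pagefault_handler() are the real interface consumed by this file:
 *
 *	struct xe_pagefault pf = {};
 *
 *	pf.gt = gt;
 *	pf.producer.ops = &my_producer_ops;
 *	my_parse_fault(hw_msg, &pf);
 *	err = xe_pagefault_handler(xe, &pf);
 *
 * pf.producer.ops must provide ack_fault(); once a worker has serviced the
 * fault, the consumer calls pf.producer.ops->ack_fault(&pf, err) so the
 * producer can forward the result to the hardware or firmware.
 */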

static int xe_pagefault_entry_size(void)
{
	/*
	 * Power of two alignment is not a hardware requirement, rather a
	 * software restriction which makes the math for page fault queue
	 * management simpler.
	 */
	return roundup_pow_of_two(sizeof(struct xe_pagefault));
}

static int xe_pagefault_begin(struct drm_exec *exec, struct xe_vma *vma,
			      struct xe_vram_region *vram, bool need_vram_move)
{
	struct xe_bo *bo = xe_vma_bo(vma);
	struct xe_vm *vm = xe_vma_vm(vma);
	int err;

	err = xe_vm_lock_vma(exec, vma);
	if (err)
		return err;

	if (!bo)
		return 0;

	return need_vram_move ? xe_bo_migrate(bo, vram->placement, NULL, exec) :
				xe_bo_validate(bo, vm, true, exec);
}

static int xe_pagefault_handle_vma(struct xe_gt *gt, struct xe_vma *vma,
				   bool atomic)
{
	struct xe_vm *vm = xe_vma_vm(vma);
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct dma_fence *fence;
	int err, needs_vram;

	lockdep_assert_held_write(&vm->lock);

	needs_vram = xe_vma_need_vram_for_atomic(vm->xe, vma, atomic);
	if (needs_vram < 0 || (needs_vram && xe_vma_is_userptr(vma)))
		return needs_vram < 0 ? needs_vram : -EACCES;

	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1);
	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_KB,
			 xe_vma_size(vma) / SZ_1K);

	trace_xe_vma_pagefault(vma);

	/* Check if VMA is valid, opportunistic check only */
	if (xe_vm_has_valid_gpu_mapping(tile, vma->tile_present,
					vma->tile_invalidated) && !atomic)
		return 0;

retry_userptr:
	if (xe_vma_is_userptr(vma) &&
	    xe_vma_userptr_check_repin(to_userptr_vma(vma))) {
		struct xe_userptr_vma *uvma = to_userptr_vma(vma);

		err = xe_vma_userptr_pin_pages(uvma);
		if (err)
			return err;
	}

	/* Lock VM and BOs dma-resv */
	xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
	drm_exec_init(&exec, 0, 0);
	drm_exec_until_all_locked(&exec) {
		err = xe_pagefault_begin(&exec, vma, tile->mem.vram,
					 needs_vram == 1);
		drm_exec_retry_on_contention(&exec);
		xe_validation_retry_on_oom(&ctx, &err);
		if (err)
			goto unlock_dma_resv;

		/* Bind VMA only to the GT that has faulted */
		trace_xe_vma_pf_bind(vma);
		xe_vm_set_validation_exec(vm, &exec);
		fence = xe_vma_rebind(vm, vma, BIT(tile->id));
		xe_vm_set_validation_exec(vm, NULL);
		if (IS_ERR(fence)) {
			err = PTR_ERR(fence);
			xe_validation_retry_on_oom(&ctx, &err);
			goto unlock_dma_resv;
		}
	}

	dma_fence_wait(fence, false);
	dma_fence_put(fence);

unlock_dma_resv:
	xe_validation_ctx_fini(&ctx);
	if (err == -EAGAIN)
		goto retry_userptr;

	return err;
}

static bool
xe_pagefault_access_is_atomic(enum xe_pagefault_access_type access_type)
{
	return access_type == XE_PAGEFAULT_ACCESS_TYPE_ATOMIC;
}

static struct xe_vm *xe_pagefault_asid_to_vm(struct xe_device *xe, u32 asid)
{
	struct xe_vm *vm;

	down_read(&xe->usm.lock);
	vm = xa_load(&xe->usm.asid_to_vm, asid);
	if (vm && xe_vm_in_fault_mode(vm))
		xe_vm_get(vm);
	else
		vm = ERR_PTR(-EINVAL);
	up_read(&xe->usm.lock);

	return vm;
}

static int xe_pagefault_service(struct xe_pagefault *pf)
{
	struct xe_gt *gt = pf->gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_vm *vm;
	struct xe_vma *vma = NULL;
	int err;
	bool atomic;

	/* Producer flagged this fault to be nacked */
	if (pf->consumer.fault_level == XE_PAGEFAULT_LEVEL_NACK)
		return -EFAULT;

	vm = xe_pagefault_asid_to_vm(xe, pf->consumer.asid);
	if (IS_ERR(vm))
		return PTR_ERR(vm);

	/*
	 * TODO: Change to read lock? Using write lock for simplicity.
	 */
	down_write(&vm->lock);

	if (xe_vm_is_closed(vm)) {
		err = -ENOENT;
		goto unlock_vm;
	}

	vma = xe_vm_find_vma_by_addr(vm, pf->consumer.page_addr);
	if (!vma) {
		err = -EINVAL;
		goto unlock_vm;
	}

	atomic = xe_pagefault_access_is_atomic(pf->consumer.access_type);

	if (xe_vma_is_cpu_addr_mirror(vma))
		err = xe_svm_handle_pagefault(vm, vma, gt,
					      pf->consumer.page_addr, atomic);
	else
		err = xe_pagefault_handle_vma(gt, vma, atomic);

unlock_vm:
	if (!err)
		vm->usm.last_fault_vma = vma;
	up_write(&vm->lock);
	xe_vm_put(vm);

	return err;
}
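
/*
 * A short note on the queue scheme used below, derived from the code rather
 * than any hardware contract: pf_queue->data is a flat buffer of
 * pf_queue->size bytes used as a circular buffer of fixed-size entries, each
 * xe_pagefault_entry_size() bytes (sizeof(struct xe_pagefault) rounded up to
 * a power of two). pf_queue->head and pf_queue->tail are byte offsets:
 * xe_pagefault_handler() copies a fault in at head, the worker copies one
 * out at tail, and both advance by one entry size modulo pf_queue->size.
 * Since the entry size and the queue size are both powers of two, the queue
 * size is a whole multiple of the entry size and an entry never straddles
 * the wrap point.
 */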

static bool xe_pagefault_queue_pop(struct xe_pagefault_queue *pf_queue,
				   struct xe_pagefault *pf)
{
	bool found_fault = false;

	spin_lock_irq(&pf_queue->lock);
	if (pf_queue->tail != pf_queue->head) {
		memcpy(pf, pf_queue->data + pf_queue->tail, sizeof(*pf));
		pf_queue->tail = (pf_queue->tail + xe_pagefault_entry_size()) %
			pf_queue->size;
		found_fault = true;
	}
	spin_unlock_irq(&pf_queue->lock);

	return found_fault;
}

static void xe_pagefault_print(struct xe_pagefault *pf)
{
	xe_gt_dbg(pf->gt, "\n\tASID: %d\n"
		  "\tFaulted Address: 0x%08x%08x\n"
		  "\tFaultType: %d\n"
		  "\tAccessType: %d\n"
		  "\tFaultLevel: %d\n"
		  "\tEngineClass: %d %s\n"
		  "\tEngineInstance: %d\n",
		  pf->consumer.asid,
		  upper_32_bits(pf->consumer.page_addr),
		  lower_32_bits(pf->consumer.page_addr),
		  pf->consumer.fault_type,
		  pf->consumer.access_type,
		  pf->consumer.fault_level,
		  pf->consumer.engine_class,
		  xe_hw_engine_class_to_str(pf->consumer.engine_class),
		  pf->consumer.engine_instance);
}

static void xe_pagefault_queue_work(struct work_struct *w)
{
	struct xe_pagefault_queue *pf_queue =
		container_of(w, typeof(*pf_queue), worker);
	struct xe_pagefault pf;
	unsigned long threshold;

#define USM_QUEUE_MAX_RUNTIME_MS	20
	threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);

	while (xe_pagefault_queue_pop(pf_queue, &pf)) {
		int err;

		if (!pf.gt)	/* Fault squashed during reset */
			continue;

		err = xe_pagefault_service(&pf);
		if (err) {
			xe_pagefault_print(&pf);
			xe_gt_dbg(pf.gt, "Fault response: Unsuccessful %pe\n",
				  ERR_PTR(err));
		}

		pf.producer.ops->ack_fault(&pf, err);

		if (time_after(jiffies, threshold)) {
			queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
			break;
		}
	}
#undef USM_QUEUE_MAX_RUNTIME_MS
}

static int xe_pagefault_queue_init(struct xe_device *xe,
				   struct xe_pagefault_queue *pf_queue)
{
	struct xe_gt *gt;
	int total_num_eus = 0;
	u8 id;

	for_each_gt(gt, xe, id) {
		xe_dss_mask_t all_dss;
		int num_dss, num_eus;

		bitmap_or(all_dss, gt->fuse_topo.g_dss_mask,
			  gt->fuse_topo.c_dss_mask, XE_MAX_DSS_FUSE_BITS);

		num_dss = bitmap_weight(all_dss, XE_MAX_DSS_FUSE_BITS);
		num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss,
					XE_MAX_EU_FUSE_BITS) * num_dss;

		total_num_eus += num_eus;
	}

	xe_assert(xe, total_num_eus);

	/*
	 * A user can issue separate page faults per EU and per CS.
	 *
	 * XXX: Multiplier required as compute UMDs are getting PF queue
	 * errors without it. Follow up on why this multiplier is required.
	 */
#define PF_MULTIPLIER	8
	pf_queue->size = (total_num_eus + XE_NUM_HW_ENGINES) *
		xe_pagefault_entry_size() * PF_MULTIPLIER;
	pf_queue->size = roundup_pow_of_two(pf_queue->size);
#undef PF_MULTIPLIER

	drm_dbg(&xe->drm, "xe_pagefault_entry_size=%d, total_num_eus=%d, pf_queue->size=%u",
		xe_pagefault_entry_size(), total_num_eus, pf_queue->size);

	spin_lock_init(&pf_queue->lock);
	INIT_WORK(&pf_queue->worker, xe_pagefault_queue_work);

	pf_queue->data = drmm_kzalloc(&xe->drm, pf_queue->size, GFP_KERNEL);
	if (!pf_queue->data)
		return -ENOMEM;

	return 0;
}
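
/*
 * Illustrative sizing example with made-up numbers (the real values come
 * from fusing and sizeof(struct xe_pagefault)): if xe_pagefault_entry_size()
 * were 64 bytes, total_num_eus were 1024 and XE_NUM_HW_ENGINES were 64,
 * xe_pagefault_queue_init() above would compute
 * (1024 + 64) * 64 * 8 = 557056 bytes and round it up to a 1 MiB queue.
 */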

static void xe_pagefault_fini(void *arg)
{
	struct xe_device *xe = arg;

	destroy_workqueue(xe->usm.pf_wq);
}

/**
 * xe_pagefault_init() - Page fault init
 * @xe: xe device instance
 *
 * Initialize Xe page fault state. Must be done after reading fuses.
 *
 * Return: 0 on success, errno on failure
 */
int xe_pagefault_init(struct xe_device *xe)
{
	int err, i;

	if (!xe->info.has_usm)
		return 0;

	xe->usm.pf_wq = alloc_workqueue("xe_page_fault_work_queue",
					WQ_UNBOUND | WQ_HIGHPRI,
					XE_PAGEFAULT_QUEUE_COUNT);
	if (!xe->usm.pf_wq)
		return -ENOMEM;

	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) {
		err = xe_pagefault_queue_init(xe, xe->usm.pf_queue + i);
		if (err)
			goto err_out;
	}

	return devm_add_action_or_reset(xe->drm.dev, xe_pagefault_fini, xe);

err_out:
	destroy_workqueue(xe->usm.pf_wq);
	return err;
}

static void xe_pagefault_queue_reset(struct xe_device *xe, struct xe_gt *gt,
				     struct xe_pagefault_queue *pf_queue)
{
	u32 i;

	/* Driver load failure guard / USM not enabled guard */
	if (!pf_queue->data)
		return;

	/* Squash all pending faults on the GT */

	spin_lock_irq(&pf_queue->lock);
	for (i = pf_queue->tail; i != pf_queue->head;
	     i = (i + xe_pagefault_entry_size()) % pf_queue->size) {
		struct xe_pagefault *pf = pf_queue->data + i;

		if (pf->gt == gt)
			pf->gt = NULL;
	}
	spin_unlock_irq(&pf_queue->lock);
}
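
/*
 * Note on the squash scheme above: a squashed fault is not removed from the
 * circular buffer. xe_pagefault_queue_reset() only clears pf->gt, and
 * xe_pagefault_queue_work() skips entries with a NULL GT when it drains the
 * queue. This keeps head/tail handling trivial during a GT reset, at the
 * cost of the worker still popping (and ignoring) the squashed entries.
 */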

/**
 * xe_pagefault_reset() - Page fault reset for a GT
 * @xe: xe device instance
 * @gt: GT being reset
 *
 * Reset the Xe page fault state for a GT; that is, squash any pending faults
 * on the GT.
 */
void xe_pagefault_reset(struct xe_device *xe, struct xe_gt *gt)
{
	int i;

	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i)
		xe_pagefault_queue_reset(xe, gt, xe->usm.pf_queue + i);
}

static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
{
	lockdep_assert_held(&pf_queue->lock);

	return CIRC_SPACE(pf_queue->head, pf_queue->tail, pf_queue->size) <=
		xe_pagefault_entry_size();
}

/**
 * xe_pagefault_handler() - Page fault handler
 * @xe: xe device instance
 * @pf: Page fault
 *
 * Sink the page fault to a queue (i.e., a memory buffer) and queue a worker
 * to service it. Safe to be called from IRQ or process context. Reclaim safe.
 *
 * Return: 0 on success, errno on failure
 */
int xe_pagefault_handler(struct xe_device *xe, struct xe_pagefault *pf)
{
	struct xe_pagefault_queue *pf_queue = xe->usm.pf_queue +
		(pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT);
	unsigned long flags;
	bool full;

	spin_lock_irqsave(&pf_queue->lock, flags);
	full = xe_pagefault_queue_full(pf_queue);
	if (!full) {
		memcpy(pf_queue->data + pf_queue->head, pf, sizeof(*pf));
		pf_queue->head = (pf_queue->head + xe_pagefault_entry_size()) %
			pf_queue->size;
		queue_work(xe->usm.pf_wq, &pf_queue->worker);
	} else {
		drm_warn(&xe->drm,
			 "PageFault Queue (%d) full, shouldn't be possible\n",
			 pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT);
	}
	spin_unlock_irqrestore(&pf_queue->lock, flags);

	return full ? -ENOSPC : 0;
}