// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include <linux/circ_buf.h>

#include <drm/drm_exec.h>
#include <drm/drm_managed.h>

#include "xe_bo.h"
#include "xe_device.h"
#include "xe_gt_printk.h"
#include "xe_gt_types.h"
#include "xe_gt_stats.h"
#include "xe_hw_engine.h"
#include "xe_pagefault.h"
#include "xe_pagefault_types.h"
#include "xe_svm.h"
#include "xe_trace_bo.h"
#include "xe_vm.h"

/**
 * DOC: Xe page faults
 *
 * Xe page faults are handled in two layers. The producer layer interacts with
 * hardware or firmware to receive and parse faults into struct xe_pagefault,
 * then forwards them to the consumer. The consumer layer services the faults
 * (e.g., memory migration, page table updates) and acknowledges the result
 * back to the producer, which forwards it to the hardware or firmware. The
 * consumer uses a page fault queue sized to absorb all potential faults and a
 * multi-threaded worker to process them. Multiple producers are supported,
 * with a single shared consumer.
 *
 * xe_pagefault.c implements the consumer layer.
 */
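
/*
 * Illustrative sketch only (not compiled, not part of this file's interface):
 * roughly how a producer layer might hand a fault to this consumer. Only
 * struct xe_pagefault, its producer/consumer members,
 * enum xe_pagefault_access_type and xe_pagefault_handler() are taken from
 * this file; the producer name "foo", the ops type name and the ack_fault()
 * signature (inferred from the call site in xe_pagefault_queue_work()) are
 * hypothetical placeholders.
 *
 *	static void foo_ack_fault(struct xe_pagefault *pf, int err)
 *	{
 *		// Forward the servicing result back to hardware/firmware
 *	}
 *
 *	static const struct foo_pagefault_ops foo_pf_ops = {
 *		.ack_fault = foo_ack_fault,
 *	};
 *
 *	int foo_report_fault(struct xe_gt *gt, u32 asid, u64 page_addr,
 *			     enum xe_pagefault_access_type access_type)
 *	{
 *		struct xe_pagefault pf = {
 *			.gt = gt,
 *			.consumer.asid = asid,
 *			.consumer.page_addr = page_addr,
 *			.consumer.access_type = access_type,
 *			.producer.ops = &foo_pf_ops,
 *		};
 *
 *		// Sink the fault into a queue and kick the worker; safe from
 *		// IRQ context, see the xe_pagefault_handler() kernel-doc.
 *		return xe_pagefault_handler(gt_to_xe(gt), &pf);
 *	}
 */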

static int xe_pagefault_entry_size(void)
{
	/*
	 * Power of two alignment is not a hardware requirement, rather a
	 * software restriction which makes the math for page fault queue
	 * management simpler.
	 */
	return roundup_pow_of_two(sizeof(struct xe_pagefault));
}

static int xe_pagefault_begin(struct drm_exec *exec, struct xe_vma *vma,
			      struct xe_vram_region *vram, bool need_vram_move)
{
	struct xe_bo *bo = xe_vma_bo(vma);
	struct xe_vm *vm = xe_vma_vm(vma);
	int err;

	err = xe_vm_lock_vma(exec, vma);
	if (err)
		return err;

	if (!bo)
		return 0;

	return need_vram_move ? xe_bo_migrate(bo, vram->placement, NULL, exec) :
		xe_bo_validate(bo, vm, true, exec);
}

static int xe_pagefault_handle_vma(struct xe_gt *gt, struct xe_vma *vma,
				   bool atomic)
{
	struct xe_vm *vm = xe_vma_vm(vma);
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct dma_fence *fence;
	int err, needs_vram;

	lockdep_assert_held_write(&vm->lock);

	needs_vram = xe_vma_need_vram_for_atomic(vm->xe, vma, atomic);
	if (needs_vram < 0 || (needs_vram && xe_vma_is_userptr(vma)))
		return needs_vram < 0 ? needs_vram : -EACCES;

	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1);
	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_KB,
			 xe_vma_size(vma) / SZ_1K);

	trace_xe_vma_pagefault(vma);

	/* Check if VMA is valid; opportunistic check only */
	if (xe_vm_has_valid_gpu_mapping(tile, vma->tile_present,
					vma->tile_invalidated) && !atomic)
		return 0;

retry_userptr:
	if (xe_vma_is_userptr(vma) &&
	    xe_vma_userptr_check_repin(to_userptr_vma(vma))) {
		struct xe_userptr_vma *uvma = to_userptr_vma(vma);

		err = xe_vma_userptr_pin_pages(uvma);
		if (err)
			return err;
	}

	/* Lock VM and BOs dma-resv */
	xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
	drm_exec_until_all_locked(&exec) {
		err = xe_pagefault_begin(&exec, vma, tile->mem.vram,
					 needs_vram == 1);
		drm_exec_retry_on_contention(&exec);
		xe_validation_retry_on_oom(&ctx, &err);
		if (err)
			goto unlock_dma_resv;

		/* Bind VMA only to the GT that has faulted */
		trace_xe_vma_pf_bind(vma);
		xe_vm_set_validation_exec(vm, &exec);
		fence = xe_vma_rebind(vm, vma, BIT(tile->id));
		xe_vm_set_validation_exec(vm, NULL);
		if (IS_ERR(fence)) {
			err = PTR_ERR(fence);
			xe_validation_retry_on_oom(&ctx, &err);
			goto unlock_dma_resv;
		}
	}

	dma_fence_wait(fence, false);
	dma_fence_put(fence);

unlock_dma_resv:
	xe_validation_ctx_fini(&ctx);
	if (err == -EAGAIN)
		goto retry_userptr;

	return err;
}

static bool
xe_pagefault_access_is_atomic(enum xe_pagefault_access_type access_type)
{
	return access_type == XE_PAGEFAULT_ACCESS_TYPE_ATOMIC;
}

static struct xe_vm *xe_pagefault_asid_to_vm(struct xe_device *xe, u32 asid)
{
	struct xe_vm *vm;

	down_read(&xe->usm.lock);
	vm = xa_load(&xe->usm.asid_to_vm, asid);
	if (vm && xe_vm_in_fault_mode(vm))
		xe_vm_get(vm);
	else
		vm = ERR_PTR(-EINVAL);
	up_read(&xe->usm.lock);

	return vm;
}

static int xe_pagefault_service(struct xe_pagefault *pf)
{
	struct xe_gt *gt = pf->gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_vm *vm;
	struct xe_vma *vma = NULL;
	int err;
	bool atomic;

	/* Producer flagged this fault to be nacked */
	if (pf->consumer.fault_level == XE_PAGEFAULT_LEVEL_NACK)
		return -EFAULT;

	vm = xe_pagefault_asid_to_vm(xe, pf->consumer.asid);
	if (IS_ERR(vm))
		return PTR_ERR(vm);

	/*
	 * TODO: Change to read lock? Using write lock for simplicity.
	 */
	down_write(&vm->lock);

	if (xe_vm_is_closed(vm)) {
		err = -ENOENT;
		goto unlock_vm;
	}

	vma = xe_vm_find_vma_by_addr(vm, pf->consumer.page_addr);
	if (!vma) {
		err = -EINVAL;
		goto unlock_vm;
	}

	atomic = xe_pagefault_access_is_atomic(pf->consumer.access_type);

	if (xe_vma_is_cpu_addr_mirror(vma))
		err = xe_svm_handle_pagefault(vm, vma, gt,
					      pf->consumer.page_addr, atomic);
	else
		err = xe_pagefault_handle_vma(gt, vma, atomic);

unlock_vm:
	if (!err)
		vm->usm.last_fault_vma = vma;
	up_write(&vm->lock);
	xe_vm_put(vm);

	return err;
}

static bool xe_pagefault_queue_pop(struct xe_pagefault_queue *pf_queue,
				   struct xe_pagefault *pf)
{
	bool found_fault = false;

	spin_lock_irq(&pf_queue->lock);
	if (pf_queue->tail != pf_queue->head) {
		memcpy(pf, pf_queue->data + pf_queue->tail, sizeof(*pf));
		pf_queue->tail = (pf_queue->tail + xe_pagefault_entry_size()) %
			pf_queue->size;
		found_fault = true;
	}
	spin_unlock_irq(&pf_queue->lock);

	return found_fault;
}

static void xe_pagefault_print(struct xe_pagefault *pf)
{
	xe_gt_dbg(pf->gt, "\n\tASID: %d\n"
		  "\tFaulted Address: 0x%08x%08x\n"
		  "\tFaultType: %d\n"
		  "\tAccessType: %d\n"
		  "\tFaultLevel: %d\n"
		  "\tEngineClass: %d %s\n"
		  "\tEngineInstance: %d\n",
		  pf->consumer.asid,
		  upper_32_bits(pf->consumer.page_addr),
		  lower_32_bits(pf->consumer.page_addr),
		  pf->consumer.fault_type,
		  pf->consumer.access_type,
		  pf->consumer.fault_level,
		  pf->consumer.engine_class,
		  xe_hw_engine_class_to_str(pf->consumer.engine_class),
		  pf->consumer.engine_instance);
}

static void xe_pagefault_queue_work(struct work_struct *w)
{
	struct xe_pagefault_queue *pf_queue =
		container_of(w, typeof(*pf_queue), worker);
	struct xe_pagefault pf;
	unsigned long threshold;

#define USM_QUEUE_MAX_RUNTIME_MS 20
	threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);

	while (xe_pagefault_queue_pop(pf_queue, &pf)) {
		int err;

		if (!pf.gt) /* Fault squashed during reset */
			continue;

		err = xe_pagefault_service(&pf);
		if (err) {
			xe_pagefault_print(&pf);
			xe_gt_dbg(pf.gt, "Fault response: Unsuccessful %pe\n",
				  ERR_PTR(err));
		}

		pf.producer.ops->ack_fault(&pf, err);

		if (time_after(jiffies, threshold)) {
			queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
			break;
		}
	}
#undef USM_QUEUE_MAX_RUNTIME_MS
}

static int xe_pagefault_queue_init(struct xe_device *xe,
				   struct xe_pagefault_queue *pf_queue)
{
	struct xe_gt *gt;
	int total_num_eus = 0;
	u8 id;

	for_each_gt(gt, xe, id) {
		xe_dss_mask_t all_dss;
		int num_dss, num_eus;

		bitmap_or(all_dss, gt->fuse_topo.g_dss_mask,
			  gt->fuse_topo.c_dss_mask, XE_MAX_DSS_FUSE_BITS);

		num_dss = bitmap_weight(all_dss, XE_MAX_DSS_FUSE_BITS);
		num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss,
					XE_MAX_EU_FUSE_BITS) * num_dss;

		total_num_eus += num_eus;
	}

	xe_assert(xe, total_num_eus);

	/*
	 * A user can issue separate page faults per EU and per CS.
	 *
	 * XXX: Multiplier required as compute UMDs are getting PF queue errors
	 * without it. Follow up on why this multiplier is required.
	 */
#define PF_MULTIPLIER 8
	pf_queue->size = (total_num_eus + XE_NUM_HW_ENGINES) *
		xe_pagefault_entry_size() * PF_MULTIPLIER;
	pf_queue->size = roundup_pow_of_two(pf_queue->size);
#undef PF_MULTIPLIER

	drm_dbg(&xe->drm, "xe_pagefault_entry_size=%d, total_num_eus=%d, pf_queue->size=%u",
		xe_pagefault_entry_size(), total_num_eus, pf_queue->size);

	spin_lock_init(&pf_queue->lock);
	INIT_WORK(&pf_queue->worker, xe_pagefault_queue_work);

	pf_queue->data = drmm_kzalloc(&xe->drm, pf_queue->size, GFP_KERNEL);
	if (!pf_queue->data)
		return -ENOMEM;

	return 0;
}

static void xe_pagefault_fini(void *arg)
{
	struct xe_device *xe = arg;

	destroy_workqueue(xe->usm.pf_wq);
}

/**
 * xe_pagefault_init() - Page fault init
 * @xe: xe device instance
 *
 * Initialize Xe page fault state. Must be done after reading fuses.
 *
 * Return: 0 on success, errno on failure
 */
int xe_pagefault_init(struct xe_device *xe)
{
	int err, i;

	if (!xe->info.has_usm)
		return 0;

	xe->usm.pf_wq = alloc_workqueue("xe_page_fault_work_queue",
					WQ_UNBOUND | WQ_HIGHPRI,
					XE_PAGEFAULT_QUEUE_COUNT);
	if (!xe->usm.pf_wq)
		return -ENOMEM;

	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) {
		err = xe_pagefault_queue_init(xe, xe->usm.pf_queue + i);
		if (err)
			goto err_out;
	}

	return devm_add_action_or_reset(xe->drm.dev, xe_pagefault_fini, xe);

err_out:
	destroy_workqueue(xe->usm.pf_wq);
	return err;
}

static void xe_pagefault_queue_reset(struct xe_device *xe, struct xe_gt *gt,
				     struct xe_pagefault_queue *pf_queue)
{
	u32 i;

	/* Driver load failure guard / USM not enabled guard */
	if (!pf_queue->data)
		return;

	/* Squash all pending faults on the GT */
	spin_lock_irq(&pf_queue->lock);
	for (i = pf_queue->tail; i != pf_queue->head;
	     i = (i + xe_pagefault_entry_size()) % pf_queue->size) {
		struct xe_pagefault *pf = pf_queue->data + i;

		if (pf->gt == gt)
			pf->gt = NULL;
	}
	spin_unlock_irq(&pf_queue->lock);
}

/**
 * xe_pagefault_reset() - Page fault reset for a GT
 * @xe: xe device instance
 * @gt: GT being reset
 *
 * Reset the Xe page fault state for a GT; that is, squash any pending faults
 * on the GT.
 */
void xe_pagefault_reset(struct xe_device *xe, struct xe_gt *gt)
{
	int i;

	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i)
		xe_pagefault_queue_reset(xe, gt, xe->usm.pf_queue + i);
}

static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
{
	lockdep_assert_held(&pf_queue->lock);

	return CIRC_SPACE(pf_queue->head, pf_queue->tail, pf_queue->size) <=
		xe_pagefault_entry_size();
}

/**
 * xe_pagefault_handler() - Page fault handler
 * @xe: xe device instance
 * @pf: Page fault
 *
 * Sink the page fault to a queue (i.e., a memory buffer) and queue a worker to
 * service it. Safe to be called from IRQ or process context. Reclaim safe.
 *
 * Return: 0 on success, errno on failure
 */
int xe_pagefault_handler(struct xe_device *xe, struct xe_pagefault *pf)
{
	struct xe_pagefault_queue *pf_queue = xe->usm.pf_queue +
		(pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT);
	unsigned long flags;
	bool full;

	spin_lock_irqsave(&pf_queue->lock, flags);
	full = xe_pagefault_queue_full(pf_queue);
	if (!full) {
		memcpy(pf_queue->data + pf_queue->head, pf, sizeof(*pf));
		pf_queue->head = (pf_queue->head + xe_pagefault_entry_size()) %
			pf_queue->size;
		queue_work(xe->usm.pf_wq, &pf_queue->worker);
	} else {
		drm_warn(&xe->drm,
			 "PageFault Queue (%d) full, shouldn't be possible\n",
			 pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT);
	}
	spin_unlock_irqrestore(&pf_queue->lock, flags);

	return full ? -ENOSPC : 0;
}