// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include <linux/circ_buf.h>

#include <drm/drm_exec.h>
#include <drm/drm_managed.h>

#include "xe_bo.h"
#include "xe_device.h"
#include "xe_gt_printk.h"
#include "xe_gt_stats.h"
#include "xe_gt_types.h"
#include "xe_hw_engine.h"
#include "xe_pagefault.h"
#include "xe_pagefault_types.h"
#include "xe_svm.h"
#include "xe_trace_bo.h"
#include "xe_vm.h"

/**
 * DOC: Xe page faults
 *
 * Xe page faults are handled in two layers. The producer layer interacts with
 * hardware or firmware to receive and parse faults into struct xe_pagefault,
 * then forwards them to the consumer. The consumer layer services the faults
 * (e.g., memory migration, page table updates) and acknowledges the result
 * back to the producer, which forwards it to the hardware or firmware. The
 * consumer uses a page fault queue sized to absorb all potential faults and a
 * multi-threaded worker to process them. Multiple producers are supported,
 * with a single shared consumer.
 *
 * xe_pagefault.c implements the consumer layer.
 */

static int xe_pagefault_entry_size(void)
{
	/*
	 * Power-of-two alignment is not a hardware requirement, rather a
	 * software restriction which makes the math for page fault queue
	 * management simpler.
	 */
	return roundup_pow_of_two(sizeof(struct xe_pagefault));
}

static int xe_pagefault_begin(struct drm_exec *exec, struct xe_vma *vma,
			      struct xe_vram_region *vram, bool need_vram_move)
{
	struct xe_bo *bo = xe_vma_bo(vma);
	struct xe_vm *vm = xe_vma_vm(vma);
	int err;

	err = xe_vm_lock_vma(exec, vma);
	if (err)
		return err;

	if (!bo)
		return 0;

	return need_vram_move ? xe_bo_migrate(bo, vram->placement, NULL, exec) :
				xe_bo_validate(bo, vm, true, exec);
}

static int xe_pagefault_handle_vma(struct xe_gt *gt, struct xe_vma *vma,
				   bool atomic)
{
	struct xe_vm *vm = xe_vma_vm(vma);
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct dma_fence *fence;
	int err, needs_vram;

	lockdep_assert_held_write(&vm->lock);

	needs_vram = xe_vma_need_vram_for_atomic(vm->xe, vma, atomic);
	if (needs_vram < 0 || (needs_vram && xe_vma_is_userptr(vma)))
		return needs_vram < 0 ? needs_vram : -EACCES;
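
	/*
	 * Account this fault against the GT's page fault statistics: the
	 * per-GT fault count and the total faulted VMA size in KiB.
	 */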
	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_COUNT, 1);
	xe_gt_stats_incr(gt, XE_GT_STATS_ID_VMA_PAGEFAULT_KB,
			 xe_vma_size(vma) / SZ_1K);

	trace_xe_vma_pagefault(vma);

	/* Check if the VMA is valid; opportunistic check only */
	if (xe_vm_has_valid_gpu_mapping(tile, vma->tile_present,
					vma->tile_invalidated) && !atomic)
		return 0;

retry_userptr:
	if (xe_vma_is_userptr(vma) &&
	    xe_vma_userptr_check_repin(to_userptr_vma(vma))) {
		struct xe_userptr_vma *uvma = to_userptr_vma(vma);

		err = xe_vma_userptr_pin_pages(uvma);
		if (err)
			return err;
	}

	/* Lock the VM's and the BO's dma-resv */
	xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
	drm_exec_until_all_locked(&exec) {
		err = xe_pagefault_begin(&exec, vma, tile->mem.vram,
					 needs_vram == 1);
		drm_exec_retry_on_contention(&exec);
		xe_validation_retry_on_oom(&ctx, &err);
		if (err)
			goto unlock_dma_resv;

		/* Bind the VMA only to the GT that has faulted */
		trace_xe_vma_pf_bind(vma);
		xe_vm_set_validation_exec(vm, &exec);
		fence = xe_vma_rebind(vm, vma, BIT(tile->id));
		xe_vm_set_validation_exec(vm, NULL);
		if (IS_ERR(fence)) {
			err = PTR_ERR(fence);
			xe_validation_retry_on_oom(&ctx, &err);
			goto unlock_dma_resv;
		}
	}

	dma_fence_wait(fence, false);
	dma_fence_put(fence);

unlock_dma_resv:
	xe_validation_ctx_fini(&ctx);
	if (err == -EAGAIN)
		goto retry_userptr;

	return err;
}

static bool
xe_pagefault_access_is_atomic(enum xe_pagefault_access_type access_type)
{
	return access_type == XE_PAGEFAULT_ACCESS_TYPE_ATOMIC;
}

static struct xe_vm *xe_pagefault_asid_to_vm(struct xe_device *xe, u32 asid)
{
	struct xe_vm *vm;

	down_read(&xe->usm.lock);
	vm = xa_load(&xe->usm.asid_to_vm, asid);
	if (vm && xe_vm_in_fault_mode(vm))
		xe_vm_get(vm);
	else
		vm = ERR_PTR(-EINVAL);
	up_read(&xe->usm.lock);

	return vm;
}

static int xe_pagefault_service(struct xe_pagefault *pf)
{
	struct xe_gt *gt = pf->gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_vm *vm;
	struct xe_vma *vma = NULL;
	int err;
	bool atomic;

	/* The producer flagged this fault to be nacked */
	if (pf->consumer.fault_level == XE_PAGEFAULT_LEVEL_NACK)
		return -EFAULT;

	vm = xe_pagefault_asid_to_vm(xe, pf->consumer.asid);
	if (IS_ERR(vm))
		return PTR_ERR(vm);

	/*
	 * TODO: Change to a read lock? Using a write lock for simplicity.
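	 *
	 * Holding the VM lock keeps the VMA lookup below stable and serializes
	 * this handler against the VM being closed.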
	 */
	down_write(&vm->lock);

	if (xe_vm_is_closed(vm)) {
		err = -ENOENT;
		goto unlock_vm;
	}

	vma = xe_vm_find_vma_by_addr(vm, pf->consumer.page_addr);
	if (!vma) {
		err = -EINVAL;
		goto unlock_vm;
	}

	if (xe_vma_read_only(vma) &&
	    pf->consumer.access_type != XE_PAGEFAULT_ACCESS_TYPE_READ) {
		err = -EPERM;
		goto unlock_vm;
	}

	atomic = xe_pagefault_access_is_atomic(pf->consumer.access_type);

	if (xe_vma_is_cpu_addr_mirror(vma))
		err = xe_svm_handle_pagefault(vm, vma, gt,
					      pf->consumer.page_addr, atomic);
	else
		err = xe_pagefault_handle_vma(gt, vma, atomic);

unlock_vm:
	if (!err)
		vm->usm.last_fault_vma = vma;
	up_write(&vm->lock);
	xe_vm_put(vm);

	return err;
}

static bool xe_pagefault_queue_pop(struct xe_pagefault_queue *pf_queue,
				   struct xe_pagefault *pf)
{
	bool found_fault = false;

	spin_lock_irq(&pf_queue->lock);
	if (pf_queue->tail != pf_queue->head) {
		memcpy(pf, pf_queue->data + pf_queue->tail, sizeof(*pf));
		pf_queue->tail = (pf_queue->tail + xe_pagefault_entry_size()) %
			pf_queue->size;
		found_fault = true;
	}
	spin_unlock_irq(&pf_queue->lock);

	return found_fault;
}

static void xe_pagefault_print(struct xe_pagefault *pf)
{
	xe_gt_info(pf->gt, "\n\tASID: %d\n"
		   "\tFaulted Address: 0x%08x%08x\n"
		   "\tFaultType: %d\n"
		   "\tAccessType: %d\n"
		   "\tFaultLevel: %d\n"
		   "\tEngineClass: %d %s\n"
		   "\tEngineInstance: %d\n",
		   pf->consumer.asid,
		   upper_32_bits(pf->consumer.page_addr),
		   lower_32_bits(pf->consumer.page_addr),
		   pf->consumer.fault_type,
		   pf->consumer.access_type,
		   pf->consumer.fault_level,
		   pf->consumer.engine_class,
		   xe_hw_engine_class_to_str(pf->consumer.engine_class),
		   pf->consumer.engine_instance);
}

static void xe_pagefault_queue_work(struct work_struct *w)
{
	struct xe_pagefault_queue *pf_queue =
		container_of(w, typeof(*pf_queue), worker);
	struct xe_pagefault pf;
	unsigned long threshold;

#define USM_QUEUE_MAX_RUNTIME_MS 20
	threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS);

	while (xe_pagefault_queue_pop(pf_queue, &pf)) {
		int err;

		if (!pf.gt)	/* Fault squashed during reset */
			continue;

		err = xe_pagefault_service(&pf);
		if (err) {
			xe_pagefault_print(&pf);
			xe_gt_info(pf.gt, "Fault response: Unsuccessful %pe\n",
				   ERR_PTR(err));
		}

		pf.producer.ops->ack_fault(&pf, err);

		if (time_after(jiffies, threshold)) {
			queue_work(gt_to_xe(pf.gt)->usm.pf_wq, w);
			break;
		}
	}
#undef USM_QUEUE_MAX_RUNTIME_MS
}

static int xe_pagefault_queue_init(struct xe_device *xe,
				   struct xe_pagefault_queue *pf_queue)
{
	struct xe_gt *gt;
	int total_num_eus = 0;
	u8 id;

	for_each_gt(gt, xe, id) {
		xe_dss_mask_t all_dss;
		int num_dss, num_eus;

		bitmap_or(all_dss, gt->fuse_topo.g_dss_mask,
			  gt->fuse_topo.c_dss_mask, XE_MAX_DSS_FUSE_BITS);

		num_dss = bitmap_weight(all_dss, XE_MAX_DSS_FUSE_BITS);
		num_eus = bitmap_weight(gt->fuse_topo.eu_mask_per_dss,
					XE_MAX_EU_FUSE_BITS) * num_dss;

		total_num_eus += num_eus;
	}

	xe_assert(xe, total_num_eus);

	/*
	 * Userspace can issue separate page faults per EU and per CS.
	 *
	 * XXX: Multiplier required as compute UMDs are getting PF queue errors
	 * without it. Follow up on why this multiplier is required.
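	 *
	 * Sizing example (hypothetical numbers): with 512 EUs in total and a
	 * 64-byte fault entry, the queue below works out to
	 * (512 + XE_NUM_HW_ENGINES) * 64 * 8 bytes, rounded up to the next
	 * power of two.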
	 */
#define PF_MULTIPLIER 8
	pf_queue->size = (total_num_eus + XE_NUM_HW_ENGINES) *
		xe_pagefault_entry_size() * PF_MULTIPLIER;
	pf_queue->size = roundup_pow_of_two(pf_queue->size);
#undef PF_MULTIPLIER

	drm_dbg(&xe->drm, "xe_pagefault_entry_size=%d, total_num_eus=%d, pf_queue->size=%u",
		xe_pagefault_entry_size(), total_num_eus, pf_queue->size);

	spin_lock_init(&pf_queue->lock);
	INIT_WORK(&pf_queue->worker, xe_pagefault_queue_work);

	pf_queue->data = drmm_kzalloc(&xe->drm, pf_queue->size, GFP_KERNEL);
	if (!pf_queue->data)
		return -ENOMEM;

	return 0;
}

static void xe_pagefault_fini(void *arg)
{
	struct xe_device *xe = arg;

	destroy_workqueue(xe->usm.pf_wq);
}

/**
 * xe_pagefault_init() - Page fault init
 * @xe: xe device instance
 *
 * Initialize Xe page fault state. Must be done after reading fuses.
 *
 * Return: 0 on success, errno on failure
 */
int xe_pagefault_init(struct xe_device *xe)
{
	int err, i;

	if (!xe->info.has_usm)
		return 0;

	xe->usm.pf_wq = alloc_workqueue("xe_page_fault_work_queue",
					WQ_UNBOUND | WQ_HIGHPRI,
					XE_PAGEFAULT_QUEUE_COUNT);
	if (!xe->usm.pf_wq)
		return -ENOMEM;

	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i) {
		err = xe_pagefault_queue_init(xe, xe->usm.pf_queue + i);
		if (err)
			goto err_out;
	}

	return devm_add_action_or_reset(xe->drm.dev, xe_pagefault_fini, xe);

err_out:
	destroy_workqueue(xe->usm.pf_wq);
	return err;
}

static void xe_pagefault_queue_reset(struct xe_device *xe, struct xe_gt *gt,
				     struct xe_pagefault_queue *pf_queue)
{
	u32 i;

	/* Driver load failure guard / USM not enabled guard */
	if (!pf_queue->data)
		return;

	/* Squash all pending faults on the GT */
	spin_lock_irq(&pf_queue->lock);
	for (i = pf_queue->tail; i != pf_queue->head;
	     i = (i + xe_pagefault_entry_size()) % pf_queue->size) {
		struct xe_pagefault *pf = pf_queue->data + i;

		if (pf->gt == gt)
			pf->gt = NULL;
	}
	spin_unlock_irq(&pf_queue->lock);
}

/**
 * xe_pagefault_reset() - Page fault reset for a GT
 * @xe: xe device instance
 * @gt: GT being reset
 *
 * Reset the Xe page fault state for a GT; that is, squash any pending faults
 * on the GT.
 */
void xe_pagefault_reset(struct xe_device *xe, struct xe_gt *gt)
{
	int i;

	for (i = 0; i < XE_PAGEFAULT_QUEUE_COUNT; ++i)
		xe_pagefault_queue_reset(xe, gt, xe->usm.pf_queue + i);
}

static bool xe_pagefault_queue_full(struct xe_pagefault_queue *pf_queue)
{
	lockdep_assert_held(&pf_queue->lock);

	return CIRC_SPACE(pf_queue->head, pf_queue->tail, pf_queue->size) <=
		xe_pagefault_entry_size();
}

/**
 * xe_pagefault_handler() - Page fault handler
 * @xe: xe device instance
 * @pf: Page fault
 *
 * Sink the page fault to a queue (i.e., a memory buffer) and queue a worker to
 * service it. Safe to be called from IRQ or process context. Reclaim safe.
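 *
 * Faults are distributed across the page fault queues by ASID, so faults from
 * the same address space land on the same queue and are serviced in order by
 * that queue's worker.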
 *
 * Return: 0 on success, errno on failure
 */
int xe_pagefault_handler(struct xe_device *xe, struct xe_pagefault *pf)
{
	struct xe_pagefault_queue *pf_queue = xe->usm.pf_queue +
		(pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT);
	unsigned long flags;
	bool full;

	spin_lock_irqsave(&pf_queue->lock, flags);
	full = xe_pagefault_queue_full(pf_queue);
	if (!full) {
		memcpy(pf_queue->data + pf_queue->head, pf, sizeof(*pf));
		pf_queue->head = (pf_queue->head + xe_pagefault_entry_size()) %
			pf_queue->size;
		queue_work(xe->usm.pf_wq, &pf_queue->worker);
	} else {
		drm_warn(&xe->drm,
			 "PageFault Queue (%d) full, shouldn't be possible\n",
			 pf->consumer.asid % XE_PAGEFAULT_QUEUE_COUNT);
	}
	spin_unlock_irqrestore(&pf_queue->lock, flags);

	return full ? -ENOSPC : 0;
}