1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2025 Intel Corporation 4 */ 5 6 #include "xe_vm_madvise.h" 7 8 #include <linux/nospec.h> 9 #include <drm/xe_drm.h> 10 11 #include "xe_bo.h" 12 #include "xe_pat.h" 13 #include "xe_pt.h" 14 #include "xe_svm.h" 15 #include "xe_tlb_inval.h" 16 17 struct xe_vmas_in_madvise_range { 18 u64 addr; 19 u64 range; 20 struct xe_vma **vmas; 21 int num_vmas; 22 bool has_bo_vmas; 23 bool has_svm_userptr_vmas; 24 }; 25 26 /** 27 * struct xe_madvise_details - Argument to madvise_funcs 28 * @dpagemap: Reference-counted pointer to a struct drm_pagemap. 29 * 30 * The madvise IOCTL handler may, in addition to the user-space 31 * args, have additional info to pass into the madvise_func that 32 * handles the madvise type. Use a struct_xe_madvise_details 33 * for that and extend the struct as necessary. 34 */ 35 struct xe_madvise_details { 36 struct drm_pagemap *dpagemap; 37 }; 38 39 static int get_vmas(struct xe_vm *vm, struct xe_vmas_in_madvise_range *madvise_range) 40 { 41 u64 addr = madvise_range->addr; 42 u64 range = madvise_range->range; 43 44 struct xe_vma **__vmas; 45 struct drm_gpuva *gpuva; 46 int max_vmas = 8; 47 48 lockdep_assert_held(&vm->lock); 49 50 madvise_range->num_vmas = 0; 51 madvise_range->vmas = kmalloc_objs(*madvise_range->vmas, max_vmas); 52 if (!madvise_range->vmas) 53 return -ENOMEM; 54 55 vm_dbg(&vm->xe->drm, "VMA's in range: start=0x%016llx, end=0x%016llx", addr, addr + range); 56 57 drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, addr, addr + range) { 58 struct xe_vma *vma = gpuva_to_vma(gpuva); 59 60 if (xe_vma_bo(vma)) 61 madvise_range->has_bo_vmas = true; 62 else if (xe_vma_is_cpu_addr_mirror(vma) || xe_vma_is_userptr(vma)) 63 madvise_range->has_svm_userptr_vmas = true; 64 65 if (madvise_range->num_vmas == max_vmas) { 66 max_vmas <<= 1; 67 __vmas = krealloc(madvise_range->vmas, 68 max_vmas * sizeof(*madvise_range->vmas), 69 GFP_KERNEL); 70 if (!__vmas) { 71 kfree(madvise_range->vmas); 72 return -ENOMEM; 73 } 74 madvise_range->vmas = __vmas; 75 } 76 77 madvise_range->vmas[madvise_range->num_vmas] = vma; 78 (madvise_range->num_vmas)++; 79 } 80 81 if (!madvise_range->num_vmas) 82 kfree(madvise_range->vmas); 83 84 vm_dbg(&vm->xe->drm, "madvise_range-num_vmas = %d\n", madvise_range->num_vmas); 85 86 return 0; 87 } 88 89 static void madvise_preferred_mem_loc(struct xe_device *xe, struct xe_vm *vm, 90 struct xe_vma **vmas, int num_vmas, 91 struct drm_xe_madvise *op, 92 struct xe_madvise_details *details) 93 { 94 int i; 95 96 xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC); 97 98 for (i = 0; i < num_vmas; i++) { 99 struct xe_vma *vma = vmas[i]; 100 struct xe_vma_preferred_loc *loc = &vma->attr.preferred_loc; 101 102 /*TODO: Extend attributes to bo based vmas */ 103 if ((loc->devmem_fd == op->preferred_mem_loc.devmem_fd && 104 loc->migration_policy == op->preferred_mem_loc.migration_policy) || 105 !xe_vma_is_cpu_addr_mirror(vma)) { 106 vma->skip_invalidation = true; 107 } else { 108 vma->skip_invalidation = false; 109 loc->devmem_fd = op->preferred_mem_loc.devmem_fd; 110 /* Till multi-device support is not added migration_policy 111 * is of no use and can be ignored. 112 */ 113 loc->migration_policy = op->preferred_mem_loc.migration_policy; 114 drm_pagemap_put(loc->dpagemap); 115 loc->dpagemap = NULL; 116 if (details->dpagemap) 117 loc->dpagemap = drm_pagemap_get(details->dpagemap); 118 } 119 } 120 } 121 122 static void madvise_atomic(struct xe_device *xe, struct xe_vm *vm, 123 struct xe_vma **vmas, int num_vmas, 124 struct drm_xe_madvise *op, 125 struct xe_madvise_details *details) 126 { 127 struct xe_bo *bo; 128 int i; 129 130 xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC); 131 xe_assert(vm->xe, op->atomic.val <= DRM_XE_ATOMIC_CPU); 132 133 for (i = 0; i < num_vmas; i++) { 134 if (xe_vma_is_userptr(vmas[i]) && 135 !(op->atomic.val == DRM_XE_ATOMIC_DEVICE && 136 xe->info.has_device_atomics_on_smem)) { 137 vmas[i]->skip_invalidation = true; 138 continue; 139 } 140 141 if (vmas[i]->attr.atomic_access == op->atomic.val) { 142 vmas[i]->skip_invalidation = true; 143 } else { 144 vmas[i]->skip_invalidation = false; 145 vmas[i]->attr.atomic_access = op->atomic.val; 146 } 147 148 bo = xe_vma_bo(vmas[i]); 149 if (!bo || bo->attr.atomic_access == op->atomic.val) 150 continue; 151 152 vmas[i]->skip_invalidation = false; 153 xe_bo_assert_held(bo); 154 bo->attr.atomic_access = op->atomic.val; 155 156 /* Invalidate cpu page table, so bo can migrate to smem in next access */ 157 if (xe_bo_is_vram(bo) && 158 (bo->attr.atomic_access == DRM_XE_ATOMIC_CPU || 159 bo->attr.atomic_access == DRM_XE_ATOMIC_GLOBAL)) 160 ttm_bo_unmap_virtual(&bo->ttm); 161 } 162 } 163 164 static void madvise_pat_index(struct xe_device *xe, struct xe_vm *vm, 165 struct xe_vma **vmas, int num_vmas, 166 struct drm_xe_madvise *op, 167 struct xe_madvise_details *details) 168 { 169 int i; 170 171 xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PAT); 172 173 for (i = 0; i < num_vmas; i++) { 174 if (vmas[i]->attr.pat_index == op->pat_index.val) { 175 vmas[i]->skip_invalidation = true; 176 } else { 177 vmas[i]->skip_invalidation = false; 178 vmas[i]->attr.pat_index = op->pat_index.val; 179 } 180 } 181 } 182 183 typedef void (*madvise_func)(struct xe_device *xe, struct xe_vm *vm, 184 struct xe_vma **vmas, int num_vmas, 185 struct drm_xe_madvise *op, 186 struct xe_madvise_details *details); 187 188 static const madvise_func madvise_funcs[] = { 189 [DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC] = madvise_preferred_mem_loc, 190 [DRM_XE_MEM_RANGE_ATTR_ATOMIC] = madvise_atomic, 191 [DRM_XE_MEM_RANGE_ATTR_PAT] = madvise_pat_index, 192 }; 193 194 static u8 xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end) 195 { 196 struct drm_gpuva *gpuva; 197 struct xe_tile *tile; 198 u8 id, tile_mask = 0; 199 200 lockdep_assert_held_write(&vm->lock); 201 202 /* Wait for pending binds */ 203 if (dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP, 204 false, MAX_SCHEDULE_TIMEOUT) <= 0) 205 XE_WARN_ON(1); 206 207 drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) { 208 struct xe_vma *vma = gpuva_to_vma(gpuva); 209 210 if (vma->skip_invalidation || xe_vma_is_null(vma)) 211 continue; 212 213 if (xe_vma_is_cpu_addr_mirror(vma)) { 214 tile_mask |= xe_svm_ranges_zap_ptes_in_range(vm, 215 xe_vma_start(vma), 216 xe_vma_end(vma)); 217 } else { 218 for_each_tile(tile, vm->xe, id) { 219 if (xe_pt_zap_ptes(tile, vma)) { 220 tile_mask |= BIT(id); 221 222 /* 223 * WRITE_ONCE pairs with READ_ONCE 224 * in xe_vm_has_valid_gpu_mapping() 225 */ 226 WRITE_ONCE(vma->tile_invalidated, 227 vma->tile_invalidated | BIT(id)); 228 } 229 } 230 } 231 } 232 233 return tile_mask; 234 } 235 236 static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end) 237 { 238 u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end); 239 struct xe_tlb_inval_batch batch; 240 int err; 241 242 if (!tile_mask) 243 return 0; 244 245 xe_device_wmb(vm->xe); 246 247 err = xe_tlb_inval_range_tilemask_submit(vm->xe, vm->usm.asid, start, end, 248 tile_mask, &batch); 249 if (!err) 250 xe_tlb_inval_batch_wait(&batch); 251 252 return err; 253 } 254 255 static bool madvise_args_are_sane(struct xe_device *xe, const struct drm_xe_madvise *args) 256 { 257 if (XE_IOCTL_DBG(xe, !args)) 258 return false; 259 260 if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->start, SZ_4K))) 261 return false; 262 263 if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->range, SZ_4K))) 264 return false; 265 266 if (XE_IOCTL_DBG(xe, args->range < SZ_4K)) 267 return false; 268 269 switch (args->type) { 270 case DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC: 271 { 272 s32 fd = (s32)args->preferred_mem_loc.devmem_fd; 273 274 if (XE_IOCTL_DBG(xe, fd < DRM_XE_PREFERRED_LOC_DEFAULT_SYSTEM)) 275 return false; 276 277 if (XE_IOCTL_DBG(xe, fd <= DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE && 278 args->preferred_mem_loc.region_instance != 0)) 279 return false; 280 281 if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.migration_policy > 282 DRM_XE_MIGRATE_ONLY_SYSTEM_PAGES)) 283 return false; 284 285 if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.reserved)) 286 return false; 287 break; 288 } 289 case DRM_XE_MEM_RANGE_ATTR_ATOMIC: 290 if (XE_IOCTL_DBG(xe, args->atomic.val > DRM_XE_ATOMIC_CPU)) 291 return false; 292 293 if (XE_IOCTL_DBG(xe, args->atomic.pad)) 294 return false; 295 296 if (XE_IOCTL_DBG(xe, args->atomic.reserved)) 297 return false; 298 299 break; 300 case DRM_XE_MEM_RANGE_ATTR_PAT: 301 { 302 u16 pat_index, coh_mode; 303 304 if (XE_IOCTL_DBG(xe, args->pat_index.val >= xe->pat.n_entries)) 305 return false; 306 307 pat_index = array_index_nospec(args->pat_index.val, xe->pat.n_entries); 308 coh_mode = xe_pat_index_get_coh_mode(xe, pat_index); 309 if (XE_IOCTL_DBG(xe, !coh_mode)) 310 return false; 311 312 if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) 313 return false; 314 315 if (XE_IOCTL_DBG(xe, args->pat_index.pad)) 316 return false; 317 318 if (XE_IOCTL_DBG(xe, args->pat_index.reserved)) 319 return false; 320 break; 321 } 322 default: 323 if (XE_IOCTL_DBG(xe, 1)) 324 return false; 325 } 326 327 if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) 328 return false; 329 330 return true; 331 } 332 333 static int xe_madvise_details_init(struct xe_vm *vm, const struct drm_xe_madvise *args, 334 struct xe_madvise_details *details) 335 { 336 struct xe_device *xe = vm->xe; 337 338 memset(details, 0, sizeof(*details)); 339 340 if (args->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC) { 341 int fd = args->preferred_mem_loc.devmem_fd; 342 struct drm_pagemap *dpagemap; 343 344 if (fd <= 0) 345 return 0; 346 347 dpagemap = xe_drm_pagemap_from_fd(args->preferred_mem_loc.devmem_fd, 348 args->preferred_mem_loc.region_instance); 349 if (XE_IOCTL_DBG(xe, IS_ERR(dpagemap))) 350 return PTR_ERR(dpagemap); 351 352 /* Don't allow a foreign placement without a fast interconnect! */ 353 if (XE_IOCTL_DBG(xe, dpagemap->pagemap->owner != vm->svm.peer.owner)) { 354 drm_pagemap_put(dpagemap); 355 return -ENOLINK; 356 } 357 details->dpagemap = dpagemap; 358 } 359 360 return 0; 361 } 362 363 static void xe_madvise_details_fini(struct xe_madvise_details *details) 364 { 365 drm_pagemap_put(details->dpagemap); 366 } 367 368 static bool check_bo_args_are_sane(struct xe_vm *vm, struct xe_vma **vmas, 369 int num_vmas, u32 atomic_val) 370 { 371 struct xe_device *xe = vm->xe; 372 struct xe_bo *bo; 373 int i; 374 375 for (i = 0; i < num_vmas; i++) { 376 bo = xe_vma_bo(vmas[i]); 377 if (!bo) 378 continue; 379 /* 380 * NOTE: The following atomic checks are platform-specific. For example, 381 * if a device supports CXL atomics, these may not be necessary or 382 * may behave differently. 383 */ 384 if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_CPU && 385 !(bo->flags & XE_BO_FLAG_SYSTEM))) 386 return false; 387 388 if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_DEVICE && 389 !(bo->flags & XE_BO_FLAG_VRAM0) && 390 !(bo->flags & XE_BO_FLAG_VRAM1) && 391 !(bo->flags & XE_BO_FLAG_SYSTEM && 392 xe->info.has_device_atomics_on_smem))) 393 return false; 394 395 if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_GLOBAL && 396 (!(bo->flags & XE_BO_FLAG_SYSTEM) || 397 (!(bo->flags & XE_BO_FLAG_VRAM0) && 398 !(bo->flags & XE_BO_FLAG_VRAM1))))) 399 return false; 400 } 401 return true; 402 } 403 /** 404 * xe_vm_madvise_ioctl - Handle MADVise ioctl for a VM 405 * @dev: DRM device pointer 406 * @data: Pointer to ioctl data (drm_xe_madvise*) 407 * @file: DRM file pointer 408 * 409 * Handles the MADVISE ioctl to provide memory advice for vma's within 410 * input range. 411 * 412 * Return: 0 on success or a negative error code on failure. 413 */ 414 int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file) 415 { 416 struct xe_device *xe = to_xe_device(dev); 417 struct xe_file *xef = to_xe_file(file); 418 struct drm_xe_madvise *args = data; 419 struct xe_vmas_in_madvise_range madvise_range = {.addr = args->start, 420 .range = args->range, }; 421 struct xe_madvise_details details; 422 struct xe_vm *vm; 423 struct drm_exec exec; 424 int err, attr_type; 425 426 vm = xe_vm_lookup(xef, args->vm_id); 427 if (XE_IOCTL_DBG(xe, !vm)) 428 return -EINVAL; 429 430 if (!madvise_args_are_sane(vm->xe, args)) { 431 err = -EINVAL; 432 goto put_vm; 433 } 434 435 xe_svm_flush(vm); 436 437 err = down_write_killable(&vm->lock); 438 if (err) 439 goto put_vm; 440 441 if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) { 442 err = -ENOENT; 443 goto unlock_vm; 444 } 445 446 err = xe_madvise_details_init(vm, args, &details); 447 if (err) 448 goto unlock_vm; 449 450 err = xe_vm_alloc_madvise_vma(vm, args->start, args->range); 451 if (err) 452 goto madv_fini; 453 454 err = get_vmas(vm, &madvise_range); 455 if (err || !madvise_range.num_vmas) 456 goto madv_fini; 457 458 if (madvise_range.has_bo_vmas) { 459 if (args->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC) { 460 if (!check_bo_args_are_sane(vm, madvise_range.vmas, 461 madvise_range.num_vmas, 462 args->atomic.val)) { 463 err = -EINVAL; 464 goto free_vmas; 465 } 466 } 467 468 drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES | DRM_EXEC_INTERRUPTIBLE_WAIT, 0); 469 drm_exec_until_all_locked(&exec) { 470 for (int i = 0; i < madvise_range.num_vmas; i++) { 471 struct xe_bo *bo = xe_vma_bo(madvise_range.vmas[i]); 472 473 if (!bo) 474 continue; 475 err = drm_exec_lock_obj(&exec, &bo->ttm.base); 476 drm_exec_retry_on_contention(&exec); 477 if (err) 478 goto err_fini; 479 } 480 } 481 } 482 483 if (madvise_range.has_svm_userptr_vmas) { 484 err = xe_svm_notifier_lock_interruptible(vm); 485 if (err) 486 goto err_fini; 487 } 488 489 attr_type = array_index_nospec(args->type, ARRAY_SIZE(madvise_funcs)); 490 madvise_funcs[attr_type](xe, vm, madvise_range.vmas, madvise_range.num_vmas, args, 491 &details); 492 493 err = xe_vm_invalidate_madvise_range(vm, args->start, args->start + args->range); 494 495 if (madvise_range.has_svm_userptr_vmas) 496 xe_svm_notifier_unlock(vm); 497 498 err_fini: 499 if (madvise_range.has_bo_vmas) 500 drm_exec_fini(&exec); 501 free_vmas: 502 kfree(madvise_range.vmas); 503 madvise_range.vmas = NULL; 504 madv_fini: 505 xe_madvise_details_fini(&details); 506 unlock_vm: 507 up_write(&vm->lock); 508 put_vm: 509 xe_vm_put(vm); 510 return err; 511 } 512