// SPDX-License-Identifier: MIT
/*
 * Copyright © 2025 Intel Corporation
 */

#include "xe_vm_madvise.h"

#include <linux/nospec.h>
#include <drm/xe_drm.h>

#include "xe_bo.h"
#include "xe_pat.h"
#include "xe_pt.h"
#include "xe_svm.h"

struct xe_vmas_in_madvise_range {
	u64 addr;
	u64 range;
	struct xe_vma **vmas;
	int num_vmas;
	bool has_bo_vmas;
	bool has_svm_userptr_vmas;
};

/**
 * struct xe_madvise_details - Argument to madvise_funcs
 * @dpagemap: Reference-counted pointer to a struct drm_pagemap.
 *
 * The madvise IOCTL handler may, in addition to the user-space
 * args, have additional info to pass into the madvise_func that
 * handles the madvise type. Use a struct xe_madvise_details
 * for that and extend the struct as necessary.
 */
struct xe_madvise_details {
	struct drm_pagemap *dpagemap;
};

static int get_vmas(struct xe_vm *vm, struct xe_vmas_in_madvise_range *madvise_range)
{
	u64 addr = madvise_range->addr;
	u64 range = madvise_range->range;

	struct xe_vma **__vmas;
	struct drm_gpuva *gpuva;
	int max_vmas = 8;

	lockdep_assert_held(&vm->lock);

	madvise_range->num_vmas = 0;
	madvise_range->vmas = kmalloc_array(max_vmas, sizeof(*madvise_range->vmas),
					    GFP_KERNEL);
	if (!madvise_range->vmas)
		return -ENOMEM;

	vm_dbg(&vm->xe->drm, "VMAs in range: start=0x%016llx, end=0x%016llx", addr, addr + range);

	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, addr, addr + range) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);

		if (xe_vma_bo(vma))
			madvise_range->has_bo_vmas = true;
		else if (xe_vma_is_cpu_addr_mirror(vma) || xe_vma_is_userptr(vma))
			madvise_range->has_svm_userptr_vmas = true;

		/* Grow the vma array by doubling once it is full */
		if (madvise_range->num_vmas == max_vmas) {
			max_vmas <<= 1;
			__vmas = krealloc(madvise_range->vmas,
					  max_vmas * sizeof(*madvise_range->vmas),
					  GFP_KERNEL);
			if (!__vmas) {
				kfree(madvise_range->vmas);
				return -ENOMEM;
			}
			madvise_range->vmas = __vmas;
		}

		madvise_range->vmas[madvise_range->num_vmas] = vma;
		(madvise_range->num_vmas)++;
	}

	if (!madvise_range->num_vmas)
		kfree(madvise_range->vmas);

	vm_dbg(&vm->xe->drm, "madvise_range->num_vmas = %d\n", madvise_range->num_vmas);

	return 0;
}

static void madvise_preferred_mem_loc(struct xe_device *xe, struct xe_vm *vm,
				      struct xe_vma **vmas, int num_vmas,
				      struct drm_xe_madvise *op,
				      struct xe_madvise_details *details)
{
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC);

	for (i = 0; i < num_vmas; i++) {
		struct xe_vma *vma = vmas[i];
		struct xe_vma_preferred_loc *loc = &vma->attr.preferred_loc;

		/* TODO: Extend attributes to bo-based vmas */
		if ((loc->devmem_fd == op->preferred_mem_loc.devmem_fd &&
		     loc->migration_policy == op->preferred_mem_loc.migration_policy) ||
		    !xe_vma_is_cpu_addr_mirror(vma)) {
			vma->skip_invalidation = true;
		} else {
			vma->skip_invalidation = false;
			loc->devmem_fd = op->preferred_mem_loc.devmem_fd;
			/*
			 * Until multi-device support is added, migration_policy
			 * is unused and can be ignored.
			 */
			loc->migration_policy = op->preferred_mem_loc.migration_policy;
			drm_pagemap_put(loc->dpagemap);
			loc->dpagemap = NULL;
			if (details->dpagemap)
				loc->dpagemap = drm_pagemap_get(details->dpagemap);
		}
	}
}

static void madvise_atomic(struct xe_device *xe, struct xe_vm *vm,
			   struct xe_vma **vmas, int num_vmas,
			   struct drm_xe_madvise *op,
			   struct xe_madvise_details *details)
{
	struct xe_bo *bo;
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC);
	xe_assert(vm->xe, op->atomic.val <= DRM_XE_ATOMIC_CPU);

	for (i = 0; i < num_vmas; i++) {
		if (xe_vma_is_userptr(vmas[i]) &&
		    !(op->atomic.val == DRM_XE_ATOMIC_DEVICE &&
		      xe->info.has_device_atomics_on_smem)) {
			vmas[i]->skip_invalidation = true;
			continue;
		}

		if (vmas[i]->attr.atomic_access == op->atomic.val) {
			vmas[i]->skip_invalidation = true;
		} else {
			vmas[i]->skip_invalidation = false;
			vmas[i]->attr.atomic_access = op->atomic.val;
		}

		bo = xe_vma_bo(vmas[i]);
		if (!bo || bo->attr.atomic_access == op->atomic.val)
			continue;

		vmas[i]->skip_invalidation = false;
		xe_bo_assert_held(bo);
		bo->attr.atomic_access = op->atomic.val;

		/* Invalidate cpu page table, so bo can migrate to smem in next access */
		if (xe_bo_is_vram(bo) &&
		    (bo->attr.atomic_access == DRM_XE_ATOMIC_CPU ||
		     bo->attr.atomic_access == DRM_XE_ATOMIC_GLOBAL))
			ttm_bo_unmap_virtual(&bo->ttm);
	}
}

static void madvise_pat_index(struct xe_device *xe, struct xe_vm *vm,
			      struct xe_vma **vmas, int num_vmas,
			      struct drm_xe_madvise *op,
			      struct xe_madvise_details *details)
{
	int i;

	xe_assert(vm->xe, op->type == DRM_XE_MEM_RANGE_ATTR_PAT);

	for (i = 0; i < num_vmas; i++) {
		if (vmas[i]->attr.pat_index == op->pat_index.val) {
			vmas[i]->skip_invalidation = true;
		} else {
			vmas[i]->skip_invalidation = false;
			vmas[i]->attr.pat_index = op->pat_index.val;
		}
	}
}

typedef void (*madvise_func)(struct xe_device *xe, struct xe_vm *vm,
			     struct xe_vma **vmas, int num_vmas,
			     struct drm_xe_madvise *op,
			     struct xe_madvise_details *details);

static const madvise_func madvise_funcs[] = {
	[DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC] = madvise_preferred_mem_loc,
	[DRM_XE_MEM_RANGE_ATTR_ATOMIC] = madvise_atomic,
	[DRM_XE_MEM_RANGE_ATTR_PAT] = madvise_pat_index,
};

static u8 xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end)
{
	struct drm_gpuva *gpuva;
	struct xe_tile *tile;
	u8 id, tile_mask = 0;

	lockdep_assert_held_write(&vm->lock);

	/* Wait for pending binds */
	if (dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
				  false, MAX_SCHEDULE_TIMEOUT) <= 0)
		XE_WARN_ON(1);

	drm_gpuvm_for_each_va_range(gpuva, &vm->gpuvm, start, end) {
		struct xe_vma *vma = gpuva_to_vma(gpuva);

		if (vma->skip_invalidation || xe_vma_is_null(vma))
			continue;

		if (xe_vma_is_cpu_addr_mirror(vma)) {
			tile_mask |= xe_svm_ranges_zap_ptes_in_range(vm,
								     xe_vma_start(vma),
								     xe_vma_end(vma));
		} else {
			for_each_tile(tile, vm->xe, id) {
				if (xe_pt_zap_ptes(tile, vma)) {
					tile_mask |= BIT(id);

					/*
					 * WRITE_ONCE pairs with READ_ONCE
					 * in xe_vm_has_valid_gpu_mapping()
					 */
					WRITE_ONCE(vma->tile_invalidated,
						   vma->tile_invalidated | BIT(id));
				}
			}
		}
	}

	return tile_mask;
}

static int
xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 end)
{
	u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end);

	if (!tile_mask)
		return 0;

	xe_device_wmb(vm->xe);

	return xe_vm_range_tilemask_tlb_inval(vm, start, end, tile_mask);
}

static bool madvise_args_are_sane(struct xe_device *xe, const struct drm_xe_madvise *args)
{
	if (XE_IOCTL_DBG(xe, !args))
		return false;

	if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->start, SZ_4K)))
		return false;

	if (XE_IOCTL_DBG(xe, !IS_ALIGNED(args->range, SZ_4K)))
		return false;

	if (XE_IOCTL_DBG(xe, args->range < SZ_4K))
		return false;

	switch (args->type) {
	case DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC:
	{
		s32 fd = (s32)args->preferred_mem_loc.devmem_fd;

		if (XE_IOCTL_DBG(xe, fd < DRM_XE_PREFERRED_LOC_DEFAULT_SYSTEM))
			return false;

		if (XE_IOCTL_DBG(xe, fd <= DRM_XE_PREFERRED_LOC_DEFAULT_DEVICE &&
				 args->preferred_mem_loc.region_instance != 0))
			return false;

		if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.migration_policy >
				 DRM_XE_MIGRATE_ONLY_SYSTEM_PAGES))
			return false;

		if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.reserved))
			return false;
		break;
	}
	case DRM_XE_MEM_RANGE_ATTR_ATOMIC:
		if (XE_IOCTL_DBG(xe, args->atomic.val > DRM_XE_ATOMIC_CPU))
			return false;

		if (XE_IOCTL_DBG(xe, args->atomic.pad))
			return false;

		if (XE_IOCTL_DBG(xe, args->atomic.reserved))
			return false;

		break;
	case DRM_XE_MEM_RANGE_ATTR_PAT:
	{
		u16 pat_index, coh_mode;

		if (XE_IOCTL_DBG(xe, args->pat_index.val >= xe->pat.n_entries))
			return false;

		pat_index = array_index_nospec(args->pat_index.val, xe->pat.n_entries);
		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
		if (XE_IOCTL_DBG(xe, !coh_mode))
			return false;

		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY))
			return false;

		if (XE_IOCTL_DBG(xe, args->pat_index.pad))
			return false;

		if (XE_IOCTL_DBG(xe, args->pat_index.reserved))
			return false;
		break;
	}
	default:
		if (XE_IOCTL_DBG(xe, 1))
			return false;
	}

	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
		return false;

	return true;
}

static int xe_madvise_details_init(struct xe_vm *vm, const struct drm_xe_madvise *args,
				   struct xe_madvise_details *details)
{
	struct xe_device *xe = vm->xe;

	memset(details, 0, sizeof(*details));

	if (args->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC) {
		int fd = args->preferred_mem_loc.devmem_fd;
		struct drm_pagemap *dpagemap;

		if (fd <= 0)
			return 0;

		dpagemap = xe_drm_pagemap_from_fd(args->preferred_mem_loc.devmem_fd,
						  args->preferred_mem_loc.region_instance);
		if (XE_IOCTL_DBG(xe, IS_ERR(dpagemap)))
			return PTR_ERR(dpagemap);

		/* Don't allow a foreign placement without a fast interconnect! */
		if (XE_IOCTL_DBG(xe, dpagemap->pagemap->owner != vm->svm.peer.owner)) {
			drm_pagemap_put(dpagemap);
			return -ENOLINK;
		}
		details->dpagemap = dpagemap;
	}

	return 0;
}

static void xe_madvise_details_fini(struct xe_madvise_details *details)
{
	drm_pagemap_put(details->dpagemap);
}

static bool check_bo_args_are_sane(struct xe_vm *vm, struct xe_vma **vmas,
				   int num_vmas, u32 atomic_val)
{
	struct xe_device *xe = vm->xe;
	struct xe_bo *bo;
	int i;

	for (i = 0; i < num_vmas; i++) {
		bo = xe_vma_bo(vmas[i]);
		if (!bo)
			continue;
		/*
		 * NOTE: The following atomic checks are platform-specific. For example,
		 * if a device supports CXL atomics, these may not be necessary or
		 * may behave differently.
		 */
		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_CPU &&
				 !(bo->flags & XE_BO_FLAG_SYSTEM)))
			return false;

		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_DEVICE &&
				 !(bo->flags & XE_BO_FLAG_VRAM0) &&
				 !(bo->flags & XE_BO_FLAG_VRAM1) &&
				 !(bo->flags & XE_BO_FLAG_SYSTEM &&
				   xe->info.has_device_atomics_on_smem)))
			return false;

		if (XE_IOCTL_DBG(xe, atomic_val == DRM_XE_ATOMIC_GLOBAL &&
				 (!(bo->flags & XE_BO_FLAG_SYSTEM) ||
				  (!(bo->flags & XE_BO_FLAG_VRAM0) &&
				   !(bo->flags & XE_BO_FLAG_VRAM1)))))
			return false;
	}
	return true;
}

/**
 * xe_vm_madvise_ioctl - Handle MADVISE ioctl for a VM
 * @dev: DRM device pointer
 * @data: Pointer to ioctl data (drm_xe_madvise*)
 * @file: DRM file pointer
 *
 * Handles the MADVISE ioctl to provide memory advice for VMAs within
 * the input range.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_madvise *args = data;
	struct xe_vmas_in_madvise_range madvise_range = {.addr = args->start,
							 .range = args->range, };
	struct xe_madvise_details details;
	struct xe_vm *vm;
	struct drm_exec exec;
	int err, attr_type;

	vm = xe_vm_lookup(xef, args->vm_id);
	if (XE_IOCTL_DBG(xe, !vm))
		return -EINVAL;

	if (!madvise_args_are_sane(vm->xe, args)) {
		err = -EINVAL;
		goto put_vm;
	}

	xe_svm_flush(vm);

	err = down_write_killable(&vm->lock);
	if (err)
		goto put_vm;

	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
		err = -ENOENT;
		goto unlock_vm;
	}

	err = xe_madvise_details_init(vm, args, &details);
	if (err)
		goto unlock_vm;

	err = xe_vm_alloc_madvise_vma(vm, args->start, args->range);
	if (err)
		goto madv_fini;

	err = get_vmas(vm, &madvise_range);
	if (err || !madvise_range.num_vmas)
		goto madv_fini;

	if (madvise_range.has_bo_vmas) {
		if (args->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC) {
			if (!check_bo_args_are_sane(vm, madvise_range.vmas,
						    madvise_range.num_vmas,
						    args->atomic.val)) {
				err = -EINVAL;
				goto madv_fini;
			}
		}

		/* Lock all bos backing vmas in the range before updating their attributes */
		drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES | DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
		drm_exec_until_all_locked(&exec) {
			for (int i = 0; i < madvise_range.num_vmas; i++) {
				struct xe_bo *bo = xe_vma_bo(madvise_range.vmas[i]);

				if (!bo)
					continue;
				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
				drm_exec_retry_on_contention(&exec);
				if (err)
					goto err_fini;
			}
		}
	}

	if (madvise_range.has_svm_userptr_vmas) {
		err = xe_svm_notifier_lock_interruptible(vm);
		if (err)
			goto err_fini;
	}

	attr_type = array_index_nospec(args->type, ARRAY_SIZE(madvise_funcs));
	madvise_funcs[attr_type](xe, vm, madvise_range.vmas, madvise_range.num_vmas, args,
				 &details);

	err = xe_vm_invalidate_madvise_range(vm, args->start, args->start + args->range);

	if (madvise_range.has_svm_userptr_vmas)
		xe_svm_notifier_unlock(vm);

err_fini:
	if (madvise_range.has_bo_vmas)
		drm_exec_fini(&exec);
	kfree(madvise_range.vmas);
	madvise_range.vmas = NULL;
madv_fini:
	xe_madvise_details_fini(&details);
unlock_vm:
	up_write(&vm->lock);
put_vm:
	xe_vm_put(vm);
	return err;
}