1 /* 2 * Copyright 2023 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23 #include "kfd_debug.h" 24 #include "kfd_device_queue_manager.h" 25 #include "kfd_topology.h" 26 #include <linux/file.h> 27 #include <uapi/linux/kfd_ioctl.h> 28 #include <uapi/linux/kfd_sysfs.h> 29 30 #define MAX_WATCH_ADDRESSES 4 31 32 int kfd_dbg_ev_query_debug_event(struct kfd_process *process, 33 unsigned int *queue_id, 34 unsigned int *gpu_id, 35 uint64_t exception_clear_mask, 36 uint64_t *event_status) 37 { 38 struct process_queue_manager *pqm; 39 struct process_queue_node *pqn; 40 int i; 41 42 if (!(process && process->debug_trap_enabled)) 43 return -ENODATA; 44 45 mutex_lock(&process->event_mutex); 46 *event_status = 0; 47 *queue_id = 0; 48 *gpu_id = 0; 49 50 /* find and report queue events */ 51 pqm = &process->pqm; 52 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 53 uint64_t tmp = process->exception_enable_mask; 54 55 if (!pqn->q) 56 continue; 57 58 tmp &= pqn->q->properties.exception_status; 59 60 if (!tmp) 61 continue; 62 63 *event_status = pqn->q->properties.exception_status; 64 *queue_id = pqn->q->properties.queue_id; 65 *gpu_id = pqn->q->device->id; 66 pqn->q->properties.exception_status &= ~exception_clear_mask; 67 goto out; 68 } 69 70 /* find and report device events */ 71 for (i = 0; i < process->n_pdds; i++) { 72 struct kfd_process_device *pdd = process->pdds[i]; 73 uint64_t tmp = process->exception_enable_mask 74 & pdd->exception_status; 75 76 if (!tmp) 77 continue; 78 79 *event_status = pdd->exception_status; 80 *gpu_id = pdd->dev->id; 81 pdd->exception_status &= ~exception_clear_mask; 82 goto out; 83 } 84 85 /* report process events */ 86 if (process->exception_enable_mask & process->exception_status) { 87 *event_status = process->exception_status; 88 process->exception_status &= ~exception_clear_mask; 89 } 90 91 out: 92 mutex_unlock(&process->event_mutex); 93 return *event_status ? 0 : -EAGAIN; 94 } 95 96 void debug_event_write_work_handler(struct work_struct *work) 97 { 98 struct kfd_process *process; 99 100 static const char write_data = '.'; 101 loff_t pos = 0; 102 103 process = container_of(work, 104 struct kfd_process, 105 debug_event_workarea); 106 107 if (process->debug_trap_enabled && process->dbg_ev_file) 108 kernel_write(process->dbg_ev_file, &write_data, 1, &pos); 109 } 110 111 /* update process/device/queue exception status, write to descriptor 112 * only if exception_status is enabled. 113 */ 114 bool kfd_dbg_ev_raise(uint64_t event_mask, 115 struct kfd_process *process, struct kfd_node *dev, 116 unsigned int source_id, bool use_worker, 117 void *exception_data, size_t exception_data_size) 118 { 119 struct process_queue_manager *pqm; 120 struct process_queue_node *pqn; 121 int i; 122 static const char write_data = '.'; 123 loff_t pos = 0; 124 bool is_subscribed = true; 125 126 if (!(process && process->debug_trap_enabled)) 127 return false; 128 129 mutex_lock(&process->event_mutex); 130 131 if (event_mask & KFD_EC_MASK_DEVICE) { 132 for (i = 0; i < process->n_pdds; i++) { 133 struct kfd_process_device *pdd = process->pdds[i]; 134 135 if (pdd->dev != dev) 136 continue; 137 138 pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE; 139 140 if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 141 if (!pdd->vm_fault_exc_data) { 142 pdd->vm_fault_exc_data = kmemdup( 143 exception_data, 144 exception_data_size, 145 GFP_KERNEL); 146 if (!pdd->vm_fault_exc_data) 147 pr_debug("Failed to allocate exception data memory"); 148 } else { 149 pr_debug("Debugger exception data not saved\n"); 150 print_hex_dump_bytes("exception data: ", 151 DUMP_PREFIX_OFFSET, 152 exception_data, 153 exception_data_size); 154 } 155 } 156 break; 157 } 158 } else if (event_mask & KFD_EC_MASK_PROCESS) { 159 process->exception_status |= event_mask & KFD_EC_MASK_PROCESS; 160 } else { 161 pqm = &process->pqm; 162 list_for_each_entry(pqn, &pqm->queues, 163 process_queue_list) { 164 int target_id; 165 166 if (!pqn->q) 167 continue; 168 169 target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ? 170 pqn->q->properties.queue_id : 171 pqn->q->doorbell_id; 172 173 if (pqn->q->device != dev || target_id != source_id) 174 continue; 175 176 pqn->q->properties.exception_status |= event_mask; 177 break; 178 } 179 } 180 181 if (process->exception_enable_mask & event_mask) { 182 if (use_worker) 183 schedule_work(&process->debug_event_workarea); 184 else 185 kernel_write(process->dbg_ev_file, 186 &write_data, 187 1, 188 &pos); 189 } else { 190 is_subscribed = false; 191 } 192 193 mutex_unlock(&process->event_mutex); 194 195 return is_subscribed; 196 } 197 198 /* set pending event queue entry from ring entry */ 199 bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev, 200 unsigned int pasid, 201 uint32_t doorbell_id, 202 uint64_t trap_mask, 203 void *exception_data, 204 size_t exception_data_size) 205 { 206 struct kfd_process *p; 207 struct kfd_process_device *pdd = NULL; 208 bool signaled_to_debugger_or_runtime = false; 209 210 p = kfd_lookup_process_by_pasid(pasid, &pdd); 211 212 if (!pdd) 213 return false; 214 215 if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true, 216 exception_data, exception_data_size)) { 217 struct process_queue_manager *pqm; 218 struct process_queue_node *pqn; 219 220 if (!!(trap_mask & KFD_EC_MASK_QUEUE) && 221 p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) { 222 mutex_lock(&p->mutex); 223 224 pqm = &p->pqm; 225 list_for_each_entry(pqn, &pqm->queues, 226 process_queue_list) { 227 228 if (!(pqn->q && pqn->q->device == dev && 229 pqn->q->doorbell_id == doorbell_id)) 230 continue; 231 232 kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id, 233 trap_mask); 234 235 signaled_to_debugger_or_runtime = true; 236 237 break; 238 } 239 240 mutex_unlock(&p->mutex); 241 } else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 242 kfd_evict_process_device(pdd); 243 kfd_signal_vm_fault_event(pdd, NULL, exception_data); 244 245 signaled_to_debugger_or_runtime = true; 246 } 247 } else { 248 signaled_to_debugger_or_runtime = true; 249 } 250 251 kfd_unref_process(p); 252 253 return signaled_to_debugger_or_runtime; 254 } 255 256 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p, 257 unsigned int dev_id, 258 unsigned int queue_id, 259 uint64_t error_reason) 260 { 261 if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { 262 struct kfd_process_device *pdd = NULL; 263 struct kfd_hsa_memory_exception_data *data; 264 int i; 265 266 for (i = 0; i < p->n_pdds; i++) { 267 if (p->pdds[i]->dev->id == dev_id) { 268 pdd = p->pdds[i]; 269 break; 270 } 271 } 272 273 if (!pdd) 274 return -ENODEV; 275 276 data = (struct kfd_hsa_memory_exception_data *) 277 pdd->vm_fault_exc_data; 278 279 kfd_evict_process_device(pdd); 280 kfd_signal_vm_fault_event(pdd, NULL, data); 281 error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION); 282 } 283 284 if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) { 285 /* 286 * block should only happen after the debugger receives runtime 287 * enable notice. 288 */ 289 up(&p->runtime_enable_sema); 290 error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME); 291 } 292 293 if (error_reason) 294 return kfd_send_exception_to_runtime(p, queue_id, error_reason); 295 296 return 0; 297 } 298 299 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable) 300 { 301 struct mqd_update_info minfo = {0}; 302 int err; 303 304 if (!q) 305 return 0; 306 307 if (!kfd_dbg_has_cwsr_workaround(q->device)) 308 return 0; 309 310 if (enable && q->properties.is_user_cu_masked) 311 return -EBUSY; 312 313 minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE; 314 315 q->properties.is_dbg_wa = enable; 316 err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo); 317 if (err) 318 q->properties.is_dbg_wa = false; 319 320 return err; 321 } 322 323 static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable) 324 { 325 struct process_queue_manager *pqm = &target->pqm; 326 struct process_queue_node *pqn; 327 int r = 0; 328 329 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 330 r = kfd_dbg_set_queue_workaround(pqn->q, enable); 331 if (enable && r) 332 goto unwind; 333 } 334 335 return 0; 336 337 unwind: 338 list_for_each_entry(pqn, &pqm->queues, process_queue_list) 339 kfd_dbg_set_queue_workaround(pqn->q, false); 340 341 if (enable) 342 target->runtime_info.runtime_state = r == -EBUSY ? 343 DEBUG_RUNTIME_STATE_ENABLED_BUSY : 344 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 345 346 return r; 347 } 348 349 int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en) 350 { 351 uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode; 352 uint32_t flags = pdd->process->dbg_flags; 353 struct amdgpu_device *adev = pdd->dev->adev; 354 int r; 355 356 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) 357 return 0; 358 359 if (!pdd->proc_ctx_cpu_ptr) { 360 r = amdgpu_amdkfd_alloc_kernel_mem(adev, 361 AMDGPU_MES_PROC_CTX_SIZE, 362 AMDGPU_GEM_DOMAIN_GTT, 363 &pdd->proc_ctx_bo, 364 &pdd->proc_ctx_gpu_addr, 365 &pdd->proc_ctx_cpu_ptr, 366 false); 367 if (r) { 368 dev_err(adev->dev, 369 "failed to allocate process context bo\n"); 370 return r; 371 } 372 memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); 373 } 374 375 return amdgpu_mes_set_shader_debugger(pdd->dev->adev, 376 pdd->proc_ctx_gpu_addr, spi_dbg_cntl, 377 pdd->watch_points, flags, sq_trap_en, 378 ffs(pdd->dev->xcc_mask) - 1); 379 } 380 381 #define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1 382 static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id) 383 { 384 int i; 385 386 *watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID; 387 388 spin_lock(&pdd->dev->watch_points_lock); 389 390 for (i = 0; i < MAX_WATCH_ADDRESSES; i++) { 391 /* device watchpoint in use so skip */ 392 if ((pdd->dev->alloc_watch_ids >> i) & 0x1) 393 continue; 394 395 pdd->alloc_watch_ids |= 0x1 << i; 396 pdd->dev->alloc_watch_ids |= 0x1 << i; 397 *watch_id = i; 398 spin_unlock(&pdd->dev->watch_points_lock); 399 return 0; 400 } 401 402 spin_unlock(&pdd->dev->watch_points_lock); 403 404 return -ENOMEM; 405 } 406 407 static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, u32 watch_id) 408 { 409 spin_lock(&pdd->dev->watch_points_lock); 410 411 /* process owns device watch point so safe to clear */ 412 if (pdd->alloc_watch_ids & BIT(watch_id)) { 413 pdd->alloc_watch_ids &= ~BIT(watch_id); 414 pdd->dev->alloc_watch_ids &= ~BIT(watch_id); 415 } 416 417 spin_unlock(&pdd->dev->watch_points_lock); 418 } 419 420 static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, u32 watch_id) 421 { 422 bool owns_watch_id = false; 423 424 spin_lock(&pdd->dev->watch_points_lock); 425 owns_watch_id = pdd->alloc_watch_ids & BIT(watch_id); 426 spin_unlock(&pdd->dev->watch_points_lock); 427 428 return owns_watch_id; 429 } 430 431 int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd, 432 uint32_t watch_id) 433 { 434 int r; 435 436 if (watch_id >= MAX_WATCH_ADDRESSES) 437 return -EINVAL; 438 439 if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id)) 440 return -EINVAL; 441 442 if (!pdd->dev->kfd->shared_resources.enable_mes) { 443 r = debug_lock_and_unmap(pdd->dev->dqm); 444 if (r) 445 return r; 446 } 447 448 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 449 pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch( 450 pdd->dev->adev, 451 watch_id); 452 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 453 454 if (!pdd->dev->kfd->shared_resources.enable_mes) 455 r = debug_map_and_unlock(pdd->dev->dqm); 456 else 457 r = kfd_dbg_set_mes_debug_mode(pdd, true); 458 459 kfd_dbg_clear_dev_watch_id(pdd, watch_id); 460 461 return r; 462 } 463 464 int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd, 465 uint64_t watch_address, 466 uint32_t watch_address_mask, 467 uint32_t *watch_id, 468 uint32_t watch_mode) 469 { 470 int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id); 471 uint32_t xcc_mask = pdd->dev->xcc_mask; 472 473 if (r) 474 return r; 475 476 if (*watch_id >= MAX_WATCH_ADDRESSES) 477 return -EINVAL; 478 479 if (!pdd->dev->kfd->shared_resources.enable_mes) { 480 r = debug_lock_and_unmap(pdd->dev->dqm); 481 if (r) { 482 kfd_dbg_clear_dev_watch_id(pdd, *watch_id); 483 return r; 484 } 485 } 486 487 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 488 for_each_inst(xcc_id, xcc_mask) 489 pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch( 490 pdd->dev->adev, 491 watch_address, 492 watch_address_mask, 493 *watch_id, 494 watch_mode, 495 pdd->dev->vm_info.last_vmid_kfd, 496 xcc_id); 497 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 498 499 if (!pdd->dev->kfd->shared_resources.enable_mes) 500 r = debug_map_and_unlock(pdd->dev->dqm); 501 else 502 r = kfd_dbg_set_mes_debug_mode(pdd, true); 503 504 /* HWS is broken so no point in HW rollback but release the watchpoint anyways */ 505 if (r) 506 kfd_dbg_clear_dev_watch_id(pdd, *watch_id); 507 508 return 0; 509 } 510 511 static void kfd_dbg_clear_process_address_watch(struct kfd_process *target) 512 { 513 int i, j; 514 515 for (i = 0; i < target->n_pdds; i++) 516 for (j = 0; j < MAX_WATCH_ADDRESSES; j++) 517 kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j); 518 } 519 520 int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) 521 { 522 uint32_t prev_flags = target->dbg_flags; 523 int i, r = 0, rewind_count = 0; 524 525 for (i = 0; i < target->n_pdds; i++) { 526 uint32_t caps; 527 uint32_t caps2; 528 struct kfd_topology_device *topo_dev = 529 kfd_topology_device_by_id(target->pdds[i]->dev->id); 530 if (!topo_dev) 531 return -EINVAL; 532 533 caps = topo_dev->node_props.capability; 534 caps2 = topo_dev->node_props.capability2; 535 536 if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) && 537 (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) { 538 *flags = prev_flags; 539 return -EACCES; 540 } 541 542 if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) && 543 (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) { 544 *flags = prev_flags; 545 return -EACCES; 546 } 547 548 if (!(caps2 & HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED) && 549 (*flags & KFD_DBG_TRAP_FLAG_LDS_OUT_OF_ADDR_RANGE)) { 550 *flags = prev_flags; 551 return -EACCES; 552 } 553 } 554 555 target->dbg_flags = *flags; 556 *flags = prev_flags; 557 for (i = 0; i < target->n_pdds; i++) { 558 struct kfd_process_device *pdd = target->pdds[i]; 559 560 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) 561 continue; 562 563 if (!pdd->dev->kfd->shared_resources.enable_mes) 564 r = debug_refresh_runlist(pdd->dev->dqm); 565 else 566 r = kfd_dbg_set_mes_debug_mode(pdd, true); 567 568 if (r) { 569 target->dbg_flags = prev_flags; 570 break; 571 } 572 573 rewind_count++; 574 } 575 576 /* Rewind flags */ 577 if (r) { 578 target->dbg_flags = prev_flags; 579 580 for (i = 0; i < rewind_count; i++) { 581 struct kfd_process_device *pdd = target->pdds[i]; 582 583 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) 584 continue; 585 586 if (!pdd->dev->kfd->shared_resources.enable_mes) 587 (void)debug_refresh_runlist(pdd->dev->dqm); 588 else 589 (void)kfd_dbg_set_mes_debug_mode(pdd, true); 590 } 591 } 592 593 return r; 594 } 595 596 /* kfd_dbg_trap_deactivate: 597 * target: target process 598 * unwind: If this is unwinding a failed kfd_dbg_trap_enable() 599 * unwind_count: 600 * If unwind == true, how far down the pdd list we need 601 * to unwind 602 * else: ignored 603 */ 604 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count) 605 { 606 int i; 607 608 if (!unwind) { 609 uint32_t flags = 0; 610 int resume_count = resume_queues(target, 0, NULL); 611 612 if (resume_count) 613 pr_debug("Resumed %d queues\n", resume_count); 614 615 cancel_work_sync(&target->debug_event_workarea); 616 kfd_dbg_clear_process_address_watch(target); 617 kfd_dbg_trap_set_wave_launch_mode(target, 0); 618 619 kfd_dbg_trap_set_flags(target, &flags); 620 } 621 622 for (i = 0; i < target->n_pdds; i++) { 623 struct kfd_process_device *pdd = target->pdds[i]; 624 625 /* If this is an unwind, and we have unwound the required 626 * enable calls on the pdd list, we need to stop now 627 * otherwise we may mess up another debugger session. 628 */ 629 if (unwind && i == unwind_count) 630 break; 631 632 kfd_process_set_trap_debug_flag(&pdd->qpd, false); 633 634 /* GFX off is already disabled by debug activate if not RLC restore supported. */ 635 if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) 636 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 637 pdd->spi_dbg_override = 638 pdd->dev->kfd2kgd->disable_debug_trap( 639 pdd->dev->adev, 640 target->runtime_info.ttmp_setup, 641 pdd->dev->vm_info.last_vmid_kfd); 642 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 643 644 if (!kfd_dbg_is_per_vmid_supported(pdd->dev) && 645 release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd)) 646 pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id); 647 648 if (!pdd->dev->kfd->shared_resources.enable_mes) 649 (void)debug_refresh_runlist(pdd->dev->dqm); 650 else 651 (void)kfd_dbg_set_mes_debug_mode(pdd, 652 !kfd_dbg_has_cwsr_workaround(pdd->dev)); 653 } 654 655 kfd_dbg_set_workaround(target, false); 656 } 657 658 static void kfd_dbg_clean_exception_status(struct kfd_process *target) 659 { 660 struct process_queue_manager *pqm; 661 struct process_queue_node *pqn; 662 int i; 663 664 for (i = 0; i < target->n_pdds; i++) { 665 struct kfd_process_device *pdd = target->pdds[i]; 666 667 kfd_process_drain_interrupts(pdd); 668 669 pdd->exception_status = 0; 670 } 671 672 pqm = &target->pqm; 673 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 674 if (!pqn->q) 675 continue; 676 677 pqn->q->properties.exception_status = 0; 678 } 679 680 target->exception_status = 0; 681 } 682 683 int kfd_dbg_trap_disable(struct kfd_process *target) 684 { 685 if (!target->debug_trap_enabled) 686 return 0; 687 688 /* 689 * Defer deactivation to runtime if runtime not enabled otherwise reset 690 * attached running target runtime state to enable for re-attach. 691 */ 692 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) 693 kfd_dbg_trap_deactivate(target, false, 0); 694 else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED) 695 target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED; 696 697 cancel_work_sync(&target->debug_event_workarea); 698 fput(target->dbg_ev_file); 699 target->dbg_ev_file = NULL; 700 701 if (target->debugger_process) { 702 atomic_dec(&target->debugger_process->debugged_process_count); 703 target->debugger_process = NULL; 704 } 705 706 target->debug_trap_enabled = false; 707 kfd_dbg_clean_exception_status(target); 708 kfd_unref_process(target); 709 710 return 0; 711 } 712 713 int kfd_dbg_trap_activate(struct kfd_process *target) 714 { 715 int i, r = 0; 716 717 r = kfd_dbg_set_workaround(target, true); 718 if (r) 719 return r; 720 721 for (i = 0; i < target->n_pdds; i++) { 722 struct kfd_process_device *pdd = target->pdds[i]; 723 724 if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) { 725 r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd); 726 727 if (r) { 728 target->runtime_info.runtime_state = (r == -EBUSY) ? 729 DEBUG_RUNTIME_STATE_ENABLED_BUSY : 730 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 731 732 goto unwind_err; 733 } 734 } 735 736 /* Disable GFX OFF to prevent garbage read/writes to debug registers. 737 * If RLC restore of debug registers is not supported and runtime enable 738 * hasn't done so already on ttmp setup request, restore the trap config registers. 739 * 740 * If RLC restore of debug registers is not supported, keep gfx off disabled for 741 * the debug session. 742 */ 743 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 744 if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) || 745 target->runtime_info.ttmp_setup)) 746 pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true, 747 pdd->dev->vm_info.last_vmid_kfd); 748 749 pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap( 750 pdd->dev->adev, 751 false, 752 pdd->dev->vm_info.last_vmid_kfd); 753 754 if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) 755 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 756 757 /* 758 * Setting the debug flag in the trap handler requires that the TMA has been 759 * allocated, which occurs during CWSR initialization. 760 * In the event that CWSR has not been initialized at this point, setting the 761 * flag will be called again during CWSR initialization if the target process 762 * is still debug enabled. 763 */ 764 kfd_process_set_trap_debug_flag(&pdd->qpd, true); 765 766 if (!pdd->dev->kfd->shared_resources.enable_mes) 767 r = debug_refresh_runlist(pdd->dev->dqm); 768 else 769 r = kfd_dbg_set_mes_debug_mode(pdd, true); 770 771 if (r) { 772 target->runtime_info.runtime_state = 773 DEBUG_RUNTIME_STATE_ENABLED_ERROR; 774 goto unwind_err; 775 } 776 } 777 778 return 0; 779 780 unwind_err: 781 /* Enabling debug failed, we need to disable on 782 * all GPUs so the enable is all or nothing. 783 */ 784 kfd_dbg_trap_deactivate(target, true, i); 785 return r; 786 } 787 788 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd, 789 void __user *runtime_info, uint32_t *runtime_size) 790 { 791 struct file *f; 792 uint32_t copy_size; 793 int i, r = 0; 794 795 if (target->debug_trap_enabled) 796 return -EALREADY; 797 798 /* Enable pre-checks */ 799 for (i = 0; i < target->n_pdds; i++) { 800 struct kfd_process_device *pdd = target->pdds[i]; 801 802 if (!KFD_IS_SOC15(pdd->dev)) 803 return -ENODEV; 804 805 if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) || 806 kfd_dbg_has_cwsr_workaround(pdd->dev))) 807 return -EBUSY; 808 } 809 810 copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info)); 811 812 f = fget(fd); 813 if (!f) { 814 pr_err("Failed to get file for (%i)\n", fd); 815 return -EBADF; 816 } 817 818 target->dbg_ev_file = f; 819 820 /* defer activation to runtime if not runtime enabled */ 821 if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) 822 kfd_dbg_trap_activate(target); 823 824 /* We already hold the process reference but hold another one for the 825 * debug session. 826 */ 827 kref_get(&target->ref); 828 target->debug_trap_enabled = true; 829 830 if (target->debugger_process) 831 atomic_inc(&target->debugger_process->debugged_process_count); 832 833 if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) { 834 kfd_dbg_trap_deactivate(target, false, 0); 835 fput(target->dbg_ev_file); 836 target->dbg_ev_file = NULL; 837 if (target->debugger_process) 838 atomic_dec(&target->debugger_process->debugged_process_count); 839 target->debug_trap_enabled = false; 840 kfd_unref_process(target); 841 r = -EFAULT; 842 } 843 844 *runtime_size = sizeof(target->runtime_info); 845 846 return r; 847 } 848 849 static int kfd_dbg_validate_trap_override_request(struct kfd_process *p, 850 uint32_t trap_override, 851 uint32_t trap_mask_request, 852 uint32_t *trap_mask_supported) 853 { 854 int i = 0; 855 856 *trap_mask_supported = 0xffffffff; 857 858 for (i = 0; i < p->n_pdds; i++) { 859 struct kfd_process_device *pdd = p->pdds[i]; 860 int err = pdd->dev->kfd2kgd->validate_trap_override_request( 861 pdd->dev->adev, 862 trap_override, 863 trap_mask_supported); 864 865 if (err) 866 return err; 867 } 868 869 if (trap_mask_request & ~*trap_mask_supported) 870 return -EACCES; 871 872 return 0; 873 } 874 875 int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target, 876 uint32_t trap_override, 877 uint32_t trap_mask_bits, 878 uint32_t trap_mask_request, 879 uint32_t *trap_mask_prev, 880 uint32_t *trap_mask_supported) 881 { 882 int r = 0, i; 883 884 r = kfd_dbg_validate_trap_override_request(target, 885 trap_override, 886 trap_mask_request, 887 trap_mask_supported); 888 889 if (r) 890 return r; 891 892 for (i = 0; i < target->n_pdds; i++) { 893 struct kfd_process_device *pdd = target->pdds[i]; 894 895 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 896 pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override( 897 pdd->dev->adev, 898 pdd->dev->vm_info.last_vmid_kfd, 899 trap_override, 900 trap_mask_bits, 901 trap_mask_request, 902 trap_mask_prev, 903 pdd->spi_dbg_override); 904 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 905 906 if (!pdd->dev->kfd->shared_resources.enable_mes) 907 r = debug_refresh_runlist(pdd->dev->dqm); 908 else 909 r = kfd_dbg_set_mes_debug_mode(pdd, true); 910 911 if (r) 912 break; 913 } 914 915 return r; 916 } 917 918 int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target, 919 uint8_t wave_launch_mode) 920 { 921 int r = 0, i; 922 923 if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL && 924 wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT && 925 wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG) 926 return -EINVAL; 927 928 for (i = 0; i < target->n_pdds; i++) { 929 struct kfd_process_device *pdd = target->pdds[i]; 930 931 amdgpu_gfx_off_ctrl(pdd->dev->adev, false); 932 pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode( 933 pdd->dev->adev, 934 wave_launch_mode, 935 pdd->dev->vm_info.last_vmid_kfd); 936 amdgpu_gfx_off_ctrl(pdd->dev->adev, true); 937 938 if (!pdd->dev->kfd->shared_resources.enable_mes) 939 r = debug_refresh_runlist(pdd->dev->dqm); 940 else 941 r = kfd_dbg_set_mes_debug_mode(pdd, true); 942 943 if (r) 944 break; 945 } 946 947 return r; 948 } 949 950 int kfd_dbg_trap_query_exception_info(struct kfd_process *target, 951 uint32_t source_id, 952 uint32_t exception_code, 953 bool clear_exception, 954 void __user *info, 955 uint32_t *info_size) 956 { 957 bool found = false; 958 int r = 0; 959 uint32_t copy_size, actual_info_size = 0; 960 uint64_t *exception_status_ptr = NULL; 961 962 if (!target) 963 return -EINVAL; 964 965 if (!info || !info_size) 966 return -EINVAL; 967 968 mutex_lock(&target->event_mutex); 969 970 if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) { 971 /* Per queue exceptions */ 972 struct queue *queue = NULL; 973 int i; 974 975 for (i = 0; i < target->n_pdds; i++) { 976 struct kfd_process_device *pdd = target->pdds[i]; 977 struct qcm_process_device *qpd = &pdd->qpd; 978 979 list_for_each_entry(queue, &qpd->queues_list, list) { 980 if (!found && queue->properties.queue_id == source_id) { 981 found = true; 982 break; 983 } 984 } 985 if (found) 986 break; 987 } 988 989 if (!found) { 990 r = -EINVAL; 991 goto out; 992 } 993 994 if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) { 995 r = -ENODATA; 996 goto out; 997 } 998 exception_status_ptr = &queue->properties.exception_status; 999 } else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) { 1000 /* Per device exceptions */ 1001 struct kfd_process_device *pdd = NULL; 1002 int i; 1003 1004 for (i = 0; i < target->n_pdds; i++) { 1005 pdd = target->pdds[i]; 1006 if (pdd->dev->id == source_id) { 1007 found = true; 1008 break; 1009 } 1010 } 1011 1012 if (!found) { 1013 r = -EINVAL; 1014 goto out; 1015 } 1016 1017 if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) { 1018 r = -ENODATA; 1019 goto out; 1020 } 1021 1022 if (exception_code == EC_DEVICE_MEMORY_VIOLATION) { 1023 copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size); 1024 1025 if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) { 1026 r = -EFAULT; 1027 goto out; 1028 } 1029 actual_info_size = pdd->vm_fault_exc_data_size; 1030 if (clear_exception) { 1031 kfree(pdd->vm_fault_exc_data); 1032 pdd->vm_fault_exc_data = NULL; 1033 pdd->vm_fault_exc_data_size = 0; 1034 } 1035 } 1036 exception_status_ptr = &pdd->exception_status; 1037 } else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) { 1038 /* Per process exceptions */ 1039 if (!(target->exception_status & KFD_EC_MASK(exception_code))) { 1040 r = -ENODATA; 1041 goto out; 1042 } 1043 1044 if (exception_code == EC_PROCESS_RUNTIME) { 1045 copy_size = min((size_t)(*info_size), sizeof(target->runtime_info)); 1046 1047 if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) { 1048 r = -EFAULT; 1049 goto out; 1050 } 1051 1052 actual_info_size = sizeof(target->runtime_info); 1053 } 1054 1055 exception_status_ptr = &target->exception_status; 1056 } else { 1057 pr_debug("Bad exception type [%i]\n", exception_code); 1058 r = -EINVAL; 1059 goto out; 1060 } 1061 1062 *info_size = actual_info_size; 1063 if (clear_exception) 1064 *exception_status_ptr &= ~KFD_EC_MASK(exception_code); 1065 out: 1066 mutex_unlock(&target->event_mutex); 1067 return r; 1068 } 1069 1070 int kfd_dbg_trap_device_snapshot(struct kfd_process *target, 1071 uint64_t exception_clear_mask, 1072 void __user *user_info, 1073 uint32_t *number_of_device_infos, 1074 uint32_t *entry_size) 1075 { 1076 struct kfd_dbg_device_info_entry device_info; 1077 uint32_t tmp_entry_size, tmp_num_devices; 1078 int i, r = 0; 1079 1080 if (!(target && user_info && number_of_device_infos && entry_size)) 1081 return -EINVAL; 1082 1083 tmp_entry_size = *entry_size; 1084 1085 tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds); 1086 *number_of_device_infos = target->n_pdds; 1087 *entry_size = min_t(size_t, *entry_size, sizeof(device_info)); 1088 1089 if (!tmp_num_devices) 1090 return 0; 1091 1092 memset(&device_info, 0, sizeof(device_info)); 1093 1094 mutex_lock(&target->event_mutex); 1095 1096 /* Run over all pdd of the process */ 1097 for (i = 0; i < tmp_num_devices; i++) { 1098 struct kfd_process_device *pdd = target->pdds[i]; 1099 struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id); 1100 if (!topo_dev) { 1101 r = -EINVAL; 1102 break; 1103 } 1104 1105 device_info.gpu_id = pdd->dev->id; 1106 device_info.exception_status = pdd->exception_status; 1107 device_info.lds_base = pdd->lds_base; 1108 device_info.lds_limit = pdd->lds_limit; 1109 device_info.scratch_base = pdd->scratch_base; 1110 device_info.scratch_limit = pdd->scratch_limit; 1111 device_info.gpuvm_base = pdd->gpuvm_base; 1112 device_info.gpuvm_limit = pdd->gpuvm_limit; 1113 device_info.location_id = topo_dev->node_props.location_id; 1114 device_info.vendor_id = topo_dev->node_props.vendor_id; 1115 device_info.device_id = topo_dev->node_props.device_id; 1116 device_info.revision_id = pdd->dev->adev->pdev->revision; 1117 device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor; 1118 device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device; 1119 device_info.fw_version = pdd->dev->kfd->mec_fw_version; 1120 device_info.gfx_target_version = 1121 topo_dev->node_props.gfx_target_version; 1122 device_info.simd_count = topo_dev->node_props.simd_count; 1123 device_info.max_waves_per_simd = 1124 topo_dev->node_props.max_waves_per_simd; 1125 device_info.array_count = topo_dev->node_props.array_count; 1126 device_info.simd_arrays_per_engine = 1127 topo_dev->node_props.simd_arrays_per_engine; 1128 device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask); 1129 device_info.capability = topo_dev->node_props.capability; 1130 device_info.debug_prop = topo_dev->node_props.debug_prop; 1131 device_info.capability2 = topo_dev->node_props.capability2; 1132 1133 if (exception_clear_mask) 1134 pdd->exception_status &= ~exception_clear_mask; 1135 1136 if (copy_to_user(user_info, &device_info, *entry_size)) { 1137 r = -EFAULT; 1138 break; 1139 } 1140 1141 user_info += tmp_entry_size; 1142 } 1143 1144 mutex_unlock(&target->event_mutex); 1145 1146 return r; 1147 } 1148 1149 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target, 1150 uint64_t exception_set_mask) 1151 { 1152 uint64_t found_mask = 0; 1153 struct process_queue_manager *pqm; 1154 struct process_queue_node *pqn; 1155 static const char write_data = '.'; 1156 loff_t pos = 0; 1157 int i; 1158 1159 mutex_lock(&target->event_mutex); 1160 1161 found_mask |= target->exception_status; 1162 1163 pqm = &target->pqm; 1164 list_for_each_entry(pqn, &pqm->queues, process_queue_list) { 1165 if (!pqn->q) 1166 continue; 1167 1168 found_mask |= pqn->q->properties.exception_status; 1169 } 1170 1171 for (i = 0; i < target->n_pdds; i++) { 1172 struct kfd_process_device *pdd = target->pdds[i]; 1173 1174 found_mask |= pdd->exception_status; 1175 } 1176 1177 if (exception_set_mask & found_mask) 1178 kernel_write(target->dbg_ev_file, &write_data, 1, &pos); 1179 1180 target->exception_enable_mask = exception_set_mask; 1181 1182 mutex_unlock(&target->event_mutex); 1183 } 1184