/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>
#include <uapi/linux/kfd_sysfs.h>

#define MAX_WATCH_ADDRESSES	4

int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
				 unsigned int *queue_id,
				 unsigned int *gpu_id,
				 uint64_t exception_clear_mask,
				 uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}

void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* update process/device/queue exception status, write to descriptor
 * only if exception_status is enabled.
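 *
 * Returns true if the process is debug enabled and has subscribed to at
 * least one of the raised exceptions, false otherwise.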
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
		      struct kfd_process *process, struct kfd_node *dev,
		      unsigned int source_id, bool use_worker,
		      void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		    p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
							process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
						  exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
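		/* kfd_dbg_ev_raise() notified the attached debugger directly */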
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}

int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
				      unsigned int dev_id,
				      unsigned int queue_id,
				      uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * block should only happen after the debugger receives runtime
		 * enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (!kfd_dbg_has_cwsr_workaround(q->device))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags, sq_trap_en);
}

#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->kfd->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->kfd->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->kfd->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->kfd->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return owns_watch_id;
}

int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					 uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
				       uint64_t watch_address,
				       uint32_t watch_address_mask,
				       uint32_t *watch_id,
				       uint32_t watch_mode)
{
	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
	uint32_t xcc_mask = pdd->dev->xcc_mask;

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	for_each_inst(xcc_id, xcc_mask)
		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd,
				xcc_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so no point in HW rollback but release the watchpoint anyway */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_topology_device *topo_dev =
				kfd_topology_device_by_id(target->pdds[i]->dev->id);
		uint32_t caps = topo_dev->node_props.capability;

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd, true);
		}
	}

	return r;
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *				to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if not RLC restore supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
}

static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime if runtime is not enabled; otherwise
	 * reset the attached running target's runtime state to enabled to
	 * allow re-attach.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
					DEBUG_RUNTIME_STATE_ENABLED_BUSY :
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
							     pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed; disable it on all GPUs so that
	 * the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
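	 * The debug session reference is dropped in kfd_dbg_trap_disable().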
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}

static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						   uint32_t trap_override,
						   uint32_t trap_mask_request,
						   uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
				pdd->dev->adev,
				trap_override,
				trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					  uint32_t trap_override,
					  uint32_t trap_mask_bits,
					  uint32_t trap_mask_request,
					  uint32_t *trap_mask_prev,
					  uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						   trap_override,
						   trap_mask_request,
						   trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
				      uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
				      uint32_t source_id,
				      uint32_t exception_code,
				      bool clear_exception,
				      void __user *info,
				      uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
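		/* source_id names a queue; search every device's queue list for it */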
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
				 uint64_t exception_clear_mask,
				 void __user *user_info,
				 uint32_t *number_of_device_infos,
				 uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_entry_size = *entry_size;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdd of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					      uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}