1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 #include "amdgpu_virt.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 100 static const struct drm_driver amdgpu_kms_driver; 101 102 const char *amdgpu_asic_name[] = { 103 "TAHITI", 104 "PITCAIRN", 105 "VERDE", 106 "OLAND", 107 "HAINAN", 108 "BONAIRE", 109 "KAVERI", 110 "KABINI", 111 
"HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 166 struct bin_attribute *attr, char *buf, 167 loff_t ppos, size_t count) 168 { 169 struct device *dev = kobj_to_dev(kobj); 170 struct drm_device *ddev = dev_get_drvdata(dev); 171 struct amdgpu_device *adev = drm_to_adev(ddev); 172 ssize_t bytes_read; 173 174 switch (ppos) { 175 case AMDGPU_SYS_REG_STATE_XGMI: 176 bytes_read = amdgpu_asic_get_reg_state( 177 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 178 break; 179 case AMDGPU_SYS_REG_STATE_WAFL: 180 bytes_read = amdgpu_asic_get_reg_state( 181 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 182 break; 183 case AMDGPU_SYS_REG_STATE_PCIE: 184 bytes_read = amdgpu_asic_get_reg_state( 185 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 186 break; 187 case AMDGPU_SYS_REG_STATE_USR: 188 bytes_read = amdgpu_asic_get_reg_state( 189 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 190 break; 191 case AMDGPU_SYS_REG_STATE_USR_1: 192 bytes_read = amdgpu_asic_get_reg_state( 193 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 194 break; 195 default: 196 return -EINVAL; 197 } 198 199 return bytes_read; 200 } 201 202 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 203 AMDGPU_SYS_REG_STATE_END); 204 205 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 206 { 207 int ret; 208 209 if (!amdgpu_asic_get_reg_state_supported(adev)) 210 return 0; 211 212 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 213 214 return ret; 215 } 216 217 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 218 { 219 if (!amdgpu_asic_get_reg_state_supported(adev)) 220 return; 221 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 222 } 223 224 /** 225 * DOC: board_info 226 * 227 * The amdgpu driver provides a sysfs API for giving board related information. 
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values:
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value is the number of bytes that have been transferred.
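 *
 * Illustrative sketch of how a caller can use the partial count (this simply
 * mirrors what amdgpu_device_vram_access() below does): fall back to
 * MM_INDEX/MM_DATA access for whatever the aperture did not cover:
 *
 *	bytes = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *	if (bytes < size)
 *		amdgpu_device_mm_access(adev, pos + bytes, buf + bytes,
 *					size - bytes, write);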
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe to the device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
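 *
 * Callers normally reach this through the register access macros rather than
 * directly. A purely illustrative call, assuming mmMM_INDEX as the dword
 * register offset and no special access flags:
 *
 *	tmp = amdgpu_device_rreg(adev, mmMM_INDEX, 0);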
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte offset helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with byte offset helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
629 */ 630 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 631 { 632 if (amdgpu_device_skip_hw_access(adev)) 633 return; 634 635 if (offset < adev->rmmio_size) 636 writeb(value, adev->rmmio + offset); 637 else 638 BUG(); 639 } 640 641 /** 642 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 643 * 644 * @adev: amdgpu_device pointer 645 * @reg: dword aligned register offset 646 * @v: 32 bit value to write to the register 647 * @acc_flags: access flags which require special behavior 648 * 649 * Writes the value specified to the offset specified. 650 */ 651 void amdgpu_device_wreg(struct amdgpu_device *adev, 652 uint32_t reg, uint32_t v, 653 uint32_t acc_flags) 654 { 655 if (amdgpu_device_skip_hw_access(adev)) 656 return; 657 658 if ((reg * 4) < adev->rmmio_size) { 659 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 660 amdgpu_sriov_runtime(adev) && 661 down_read_trylock(&adev->reset_domain->sem)) { 662 amdgpu_kiq_wreg(adev, reg, v, 0); 663 up_read(&adev->reset_domain->sem); 664 } else { 665 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 666 } 667 } else { 668 adev->pcie_wreg(adev, reg * 4, v); 669 } 670 671 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 672 } 673 674 /** 675 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 676 * 677 * @adev: amdgpu_device pointer 678 * @reg: mmio/rlc register 679 * @v: value to write 680 * @xcc_id: xcc accelerated compute core id 681 * 682 * this function is invoked only for the debugfs register access 683 */ 684 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 685 uint32_t reg, uint32_t v, 686 uint32_t xcc_id) 687 { 688 if (amdgpu_device_skip_hw_access(adev)) 689 return; 690 691 if (amdgpu_sriov_fullaccess(adev) && 692 adev->gfx.rlc.funcs && 693 adev->gfx.rlc.funcs->is_rlcg_access_range) { 694 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 695 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 696 } else if ((reg * 4) >= adev->rmmio_size) { 697 adev->pcie_wreg(adev, reg * 4, v); 698 } else { 699 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 700 } 701 } 702 703 /** 704 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 705 * 706 * @adev: amdgpu_device pointer 707 * @reg: dword aligned register offset 708 * @v: 32 bit value to write to the register 709 * @acc_flags: access flags which require special behavior 710 * @xcc_id: xcc accelerated compute core id 711 * 712 * Writes the value specified to the offset specified. 
713 */ 714 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 715 uint32_t reg, uint32_t v, 716 uint32_t acc_flags, uint32_t xcc_id) 717 { 718 uint32_t rlcg_flag; 719 720 if (amdgpu_device_skip_hw_access(adev)) 721 return; 722 723 if ((reg * 4) < adev->rmmio_size) { 724 if (amdgpu_sriov_vf(adev) && 725 !amdgpu_sriov_runtime(adev) && 726 adev->gfx.rlc.rlcg_reg_access_supported && 727 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 728 GC_HWIP, true, 729 &rlcg_flag)) { 730 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id); 731 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 732 amdgpu_sriov_runtime(adev) && 733 down_read_trylock(&adev->reset_domain->sem)) { 734 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 735 up_read(&adev->reset_domain->sem); 736 } else { 737 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 738 } 739 } else { 740 adev->pcie_wreg(adev, reg * 4, v); 741 } 742 } 743 744 /** 745 * amdgpu_device_indirect_rreg - read an indirect register 746 * 747 * @adev: amdgpu_device pointer 748 * @reg_addr: indirect register address to read from 749 * 750 * Returns the value of indirect register @reg_addr 751 */ 752 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 753 u32 reg_addr) 754 { 755 unsigned long flags, pcie_index, pcie_data; 756 void __iomem *pcie_index_offset; 757 void __iomem *pcie_data_offset; 758 u32 r; 759 760 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 761 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 762 763 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 764 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 765 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 766 767 writel(reg_addr, pcie_index_offset); 768 readl(pcie_index_offset); 769 r = readl(pcie_data_offset); 770 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 771 772 return r; 773 } 774 775 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 776 u64 reg_addr) 777 { 778 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 779 u32 r; 780 void __iomem *pcie_index_offset; 781 void __iomem *pcie_index_hi_offset; 782 void __iomem *pcie_data_offset; 783 784 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 785 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 786 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 787 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 788 else 789 pcie_index_hi = 0; 790 791 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 792 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 793 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 794 if (pcie_index_hi != 0) 795 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 796 pcie_index_hi * 4; 797 798 writel(reg_addr, pcie_index_offset); 799 readl(pcie_index_offset); 800 if (pcie_index_hi != 0) { 801 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 802 readl(pcie_index_hi_offset); 803 } 804 r = readl(pcie_data_offset); 805 806 /* clear the high bits */ 807 if (pcie_index_hi != 0) { 808 writel(0, pcie_index_hi_offset); 809 readl(pcie_index_hi_offset); 810 } 811 812 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 813 814 return r; 815 } 816 817 /** 818 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 819 * 820 * @adev: amdgpu_device pointer 821 * @reg_addr: indirect register address to read from 822 * 823 * Returns the value of indirect register @reg_addr 824 */ 825 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 826 u32 
reg_addr) 827 { 828 unsigned long flags, pcie_index, pcie_data; 829 void __iomem *pcie_index_offset; 830 void __iomem *pcie_data_offset; 831 u64 r; 832 833 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 834 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 835 836 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 837 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 838 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 839 840 /* read low 32 bits */ 841 writel(reg_addr, pcie_index_offset); 842 readl(pcie_index_offset); 843 r = readl(pcie_data_offset); 844 /* read high 32 bits */ 845 writel(reg_addr + 4, pcie_index_offset); 846 readl(pcie_index_offset); 847 r |= ((u64)readl(pcie_data_offset) << 32); 848 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 849 850 return r; 851 } 852 853 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 854 u64 reg_addr) 855 { 856 unsigned long flags, pcie_index, pcie_data; 857 unsigned long pcie_index_hi = 0; 858 void __iomem *pcie_index_offset; 859 void __iomem *pcie_index_hi_offset; 860 void __iomem *pcie_data_offset; 861 u64 r; 862 863 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 864 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 865 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 866 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 867 868 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 869 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 870 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 871 if (pcie_index_hi != 0) 872 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 873 pcie_index_hi * 4; 874 875 /* read low 32 bits */ 876 writel(reg_addr, pcie_index_offset); 877 readl(pcie_index_offset); 878 if (pcie_index_hi != 0) { 879 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 880 readl(pcie_index_hi_offset); 881 } 882 r = readl(pcie_data_offset); 883 /* read high 32 bits */ 884 writel(reg_addr + 4, pcie_index_offset); 885 readl(pcie_index_offset); 886 if (pcie_index_hi != 0) { 887 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 888 readl(pcie_index_hi_offset); 889 } 890 r |= ((u64)readl(pcie_data_offset) << 32); 891 892 /* clear the high bits */ 893 if (pcie_index_hi != 0) { 894 writel(0, pcie_index_hi_offset); 895 readl(pcie_index_hi_offset); 896 } 897 898 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 899 900 return r; 901 } 902 903 /** 904 * amdgpu_device_indirect_wreg - write an indirect register address 905 * 906 * @adev: amdgpu_device pointer 907 * @reg_addr: indirect register offset 908 * @reg_data: indirect register data 909 * 910 */ 911 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 912 u32 reg_addr, u32 reg_data) 913 { 914 unsigned long flags, pcie_index, pcie_data; 915 void __iomem *pcie_index_offset; 916 void __iomem *pcie_data_offset; 917 918 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 919 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 920 921 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 922 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 923 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 924 925 writel(reg_addr, pcie_index_offset); 926 readl(pcie_index_offset); 927 writel(reg_data, pcie_data_offset); 928 readl(pcie_data_offset); 929 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 930 } 931 932 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 933 u64 reg_addr, 
u32 reg_data) 934 { 935 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 936 void __iomem *pcie_index_offset; 937 void __iomem *pcie_index_hi_offset; 938 void __iomem *pcie_data_offset; 939 940 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 941 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 942 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 943 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 944 else 945 pcie_index_hi = 0; 946 947 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 948 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 949 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 950 if (pcie_index_hi != 0) 951 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 952 pcie_index_hi * 4; 953 954 writel(reg_addr, pcie_index_offset); 955 readl(pcie_index_offset); 956 if (pcie_index_hi != 0) { 957 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 958 readl(pcie_index_hi_offset); 959 } 960 writel(reg_data, pcie_data_offset); 961 readl(pcie_data_offset); 962 963 /* clear the high bits */ 964 if (pcie_index_hi != 0) { 965 writel(0, pcie_index_hi_offset); 966 readl(pcie_index_hi_offset); 967 } 968 969 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 970 } 971 972 /** 973 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 974 * 975 * @adev: amdgpu_device pointer 976 * @reg_addr: indirect register offset 977 * @reg_data: indirect register data 978 * 979 */ 980 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 981 u32 reg_addr, u64 reg_data) 982 { 983 unsigned long flags, pcie_index, pcie_data; 984 void __iomem *pcie_index_offset; 985 void __iomem *pcie_data_offset; 986 987 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 988 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 989 990 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 991 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 992 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 993 994 /* write low 32 bits */ 995 writel(reg_addr, pcie_index_offset); 996 readl(pcie_index_offset); 997 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 998 readl(pcie_data_offset); 999 /* write high 32 bits */ 1000 writel(reg_addr + 4, pcie_index_offset); 1001 readl(pcie_index_offset); 1002 writel((u32)(reg_data >> 32), pcie_data_offset); 1003 readl(pcie_data_offset); 1004 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1005 } 1006 1007 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1008 u64 reg_addr, u64 reg_data) 1009 { 1010 unsigned long flags, pcie_index, pcie_data; 1011 unsigned long pcie_index_hi = 0; 1012 void __iomem *pcie_index_offset; 1013 void __iomem *pcie_index_hi_offset; 1014 void __iomem *pcie_data_offset; 1015 1016 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1017 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1018 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1019 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1020 1021 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1022 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1023 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1024 if (pcie_index_hi != 0) 1025 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1026 pcie_index_hi * 4; 1027 1028 /* write low 32 bits */ 1029 writel(reg_addr, pcie_index_offset); 1030 readl(pcie_index_offset); 1031 if (pcie_index_hi != 0) 
{ 1032 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1033 readl(pcie_index_hi_offset); 1034 } 1035 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1036 readl(pcie_data_offset); 1037 /* write high 32 bits */ 1038 writel(reg_addr + 4, pcie_index_offset); 1039 readl(pcie_index_offset); 1040 if (pcie_index_hi != 0) { 1041 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1042 readl(pcie_index_hi_offset); 1043 } 1044 writel((u32)(reg_data >> 32), pcie_data_offset); 1045 readl(pcie_data_offset); 1046 1047 /* clear the high bits */ 1048 if (pcie_index_hi != 0) { 1049 writel(0, pcie_index_hi_offset); 1050 readl(pcie_index_hi_offset); 1051 } 1052 1053 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1054 } 1055 1056 /** 1057 * amdgpu_device_get_rev_id - query device rev_id 1058 * 1059 * @adev: amdgpu_device pointer 1060 * 1061 * Return device rev_id 1062 */ 1063 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1064 { 1065 return adev->nbio.funcs->get_rev_id(adev); 1066 } 1067 1068 /** 1069 * amdgpu_invalid_rreg - dummy reg read function 1070 * 1071 * @adev: amdgpu_device pointer 1072 * @reg: offset of register 1073 * 1074 * Dummy register read function. Used for register blocks 1075 * that certain asics don't have (all asics). 1076 * Returns the value in the register. 1077 */ 1078 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1079 { 1080 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1081 BUG(); 1082 return 0; 1083 } 1084 1085 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1086 { 1087 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1088 BUG(); 1089 return 0; 1090 } 1091 1092 /** 1093 * amdgpu_invalid_wreg - dummy reg write function 1094 * 1095 * @adev: amdgpu_device pointer 1096 * @reg: offset of register 1097 * @v: value to write to the register 1098 * 1099 * Dummy register read function. Used for register blocks 1100 * that certain asics don't have (all asics). 1101 */ 1102 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1103 { 1104 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1105 reg, v); 1106 BUG(); 1107 } 1108 1109 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1110 { 1111 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1112 reg, v); 1113 BUG(); 1114 } 1115 1116 /** 1117 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1118 * 1119 * @adev: amdgpu_device pointer 1120 * @reg: offset of register 1121 * 1122 * Dummy register read function. Used for register blocks 1123 * that certain asics don't have (all asics). 1124 * Returns the value in the register. 1125 */ 1126 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1127 { 1128 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1129 BUG(); 1130 return 0; 1131 } 1132 1133 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1134 { 1135 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1136 BUG(); 1137 return 0; 1138 } 1139 1140 /** 1141 * amdgpu_invalid_wreg64 - dummy reg write function 1142 * 1143 * @adev: amdgpu_device pointer 1144 * @reg: offset of register 1145 * @v: value to write to the register 1146 * 1147 * Dummy register read function. Used for register blocks 1148 * that certain asics don't have (all asics). 
1149 */ 1150 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1151 { 1152 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1153 reg, v); 1154 BUG(); 1155 } 1156 1157 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1158 { 1159 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1160 reg, v); 1161 BUG(); 1162 } 1163 1164 /** 1165 * amdgpu_block_invalid_rreg - dummy reg read function 1166 * 1167 * @adev: amdgpu_device pointer 1168 * @block: offset of instance 1169 * @reg: offset of register 1170 * 1171 * Dummy register read function. Used for register blocks 1172 * that certain asics don't have (all asics). 1173 * Returns the value in the register. 1174 */ 1175 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1176 uint32_t block, uint32_t reg) 1177 { 1178 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1179 reg, block); 1180 BUG(); 1181 return 0; 1182 } 1183 1184 /** 1185 * amdgpu_block_invalid_wreg - dummy reg write function 1186 * 1187 * @adev: amdgpu_device pointer 1188 * @block: offset of instance 1189 * @reg: offset of register 1190 * @v: value to write to the register 1191 * 1192 * Dummy register read function. Used for register blocks 1193 * that certain asics don't have (all asics). 1194 */ 1195 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1196 uint32_t block, 1197 uint32_t reg, uint32_t v) 1198 { 1199 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1200 reg, block, v); 1201 BUG(); 1202 } 1203 1204 /** 1205 * amdgpu_device_asic_init - Wrapper for atom asic_init 1206 * 1207 * @adev: amdgpu_device pointer 1208 * 1209 * Does any asic specific work and then calls atom asic init. 1210 */ 1211 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1212 { 1213 int ret; 1214 1215 amdgpu_asic_pre_asic_init(adev); 1216 1217 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1218 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1219 amdgpu_psp_wait_for_bootloader(adev); 1220 ret = amdgpu_atomfirmware_asic_init(adev, true); 1221 /* TODO: check the return val and stop device initialization if boot fails */ 1222 amdgpu_psp_query_boot_status(adev); 1223 return ret; 1224 } else { 1225 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1226 } 1227 1228 return 0; 1229 } 1230 1231 /** 1232 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1233 * 1234 * @adev: amdgpu_device pointer 1235 * 1236 * Allocates a scratch page of VRAM for use by various things in the 1237 * driver. 1238 */ 1239 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1240 { 1241 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1242 AMDGPU_GEM_DOMAIN_VRAM | 1243 AMDGPU_GEM_DOMAIN_GTT, 1244 &adev->mem_scratch.robj, 1245 &adev->mem_scratch.gpu_addr, 1246 (void **)&adev->mem_scratch.ptr); 1247 } 1248 1249 /** 1250 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1251 * 1252 * @adev: amdgpu_device pointer 1253 * 1254 * Frees the VRAM scratch page. 1255 */ 1256 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1257 { 1258 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1259 } 1260 1261 /** 1262 * amdgpu_device_program_register_sequence - program an array of registers. 
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
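 *
 * Illustrative pairing with amdgpu_device_wb_free(), assuming the usual
 * pattern of reading the CPU copy and deriving the GPU address of the slot
 * (a sketch, not taken from this file):
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		cpu_val  = adev->wb.wb[wb];
 *		gpu_addr = adev->wb.gpu_addr + wb * 4;
 *		...
 *		amdgpu_device_wb_free(adev, wb);
 *	}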
1390 * Returns 0 on success or -EINVAL on failure. 1391 */ 1392 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1393 { 1394 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1395 1396 if (offset < adev->wb.num_wb) { 1397 __set_bit(offset, adev->wb.used); 1398 *wb = offset << 3; /* convert to dw offset */ 1399 return 0; 1400 } else { 1401 return -EINVAL; 1402 } 1403 } 1404 1405 /** 1406 * amdgpu_device_wb_free - Free a wb entry 1407 * 1408 * @adev: amdgpu_device pointer 1409 * @wb: wb index 1410 * 1411 * Free a wb slot allocated for use by the driver (all asics) 1412 */ 1413 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1414 { 1415 wb >>= 3; 1416 if (wb < adev->wb.num_wb) 1417 __clear_bit(wb, adev->wb.used); 1418 } 1419 1420 /** 1421 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1422 * 1423 * @adev: amdgpu_device pointer 1424 * 1425 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1426 * to fail, but if any of the BARs is not accessible after the size we abort 1427 * driver loading by returning -ENODEV. 1428 */ 1429 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1430 { 1431 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1432 struct pci_bus *root; 1433 struct resource *res; 1434 unsigned int i; 1435 u16 cmd; 1436 int r; 1437 1438 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1439 return 0; 1440 1441 /* Bypass for VF */ 1442 if (amdgpu_sriov_vf(adev)) 1443 return 0; 1444 1445 /* skip if the bios has already enabled large BAR */ 1446 if (adev->gmc.real_vram_size && 1447 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1448 return 0; 1449 1450 /* Check if the root BUS has 64bit memory resources */ 1451 root = adev->pdev->bus; 1452 while (root->parent) 1453 root = root->parent; 1454 1455 pci_bus_for_each_resource(root, res, i) { 1456 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1457 res->start > 0x100000000ull) 1458 break; 1459 } 1460 1461 /* Trying to resize is pointless without a root hub window above 4GB */ 1462 if (!res) 1463 return 0; 1464 1465 /* Limit the BAR size to what is available */ 1466 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1467 rbar_size); 1468 1469 /* Disable memory decoding while we change the BAR addresses and size */ 1470 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1471 pci_write_config_word(adev->pdev, PCI_COMMAND, 1472 cmd & ~PCI_COMMAND_MEMORY); 1473 1474 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1475 amdgpu_doorbell_fini(adev); 1476 if (adev->asic_type >= CHIP_BONAIRE) 1477 pci_release_resource(adev->pdev, 2); 1478 1479 pci_release_resource(adev->pdev, 0); 1480 1481 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1482 if (r == -ENOSPC) 1483 DRM_INFO("Not enough PCI address space for a large BAR."); 1484 else if (r && r != -ENOTSUPP) 1485 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1486 1487 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1488 1489 /* When the doorbell or fb BAR isn't available we have no chance of 1490 * using the device. 
1491 */ 1492 r = amdgpu_doorbell_init(adev); 1493 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1494 return -ENODEV; 1495 1496 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1497 1498 return 0; 1499 } 1500 1501 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1502 { 1503 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1504 return false; 1505 1506 return true; 1507 } 1508 1509 /* 1510 * GPU helpers function. 1511 */ 1512 /** 1513 * amdgpu_device_need_post - check if the hw need post or not 1514 * 1515 * @adev: amdgpu_device pointer 1516 * 1517 * Check if the asic has been initialized (all asics) at driver startup 1518 * or post is needed if hw reset is performed. 1519 * Returns true if need or false if not. 1520 */ 1521 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1522 { 1523 uint32_t reg; 1524 1525 if (amdgpu_sriov_vf(adev)) 1526 return false; 1527 1528 if (!amdgpu_device_read_bios(adev)) 1529 return false; 1530 1531 if (amdgpu_passthrough(adev)) { 1532 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1533 * some old smc fw still need driver do vPost otherwise gpu hang, while 1534 * those smc fw version above 22.15 doesn't have this flaw, so we force 1535 * vpost executed for smc version below 22.15 1536 */ 1537 if (adev->asic_type == CHIP_FIJI) { 1538 int err; 1539 uint32_t fw_ver; 1540 1541 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1542 /* force vPost if error occured */ 1543 if (err) 1544 return true; 1545 1546 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1547 if (fw_ver < 0x00160e00) 1548 return true; 1549 } 1550 } 1551 1552 /* Don't post if we need to reset whole hive on init */ 1553 if (adev->gmc.xgmi.pending_reset) 1554 return false; 1555 1556 if (adev->has_hw_reset) { 1557 adev->has_hw_reset = false; 1558 return true; 1559 } 1560 1561 /* bios scratch used on CIK+ */ 1562 if (adev->asic_type >= CHIP_BONAIRE) 1563 return amdgpu_atombios_scratch_need_asic_init(adev); 1564 1565 /* check MEM_SIZE for older asics */ 1566 reg = amdgpu_asic_get_config_memsize(adev); 1567 1568 if ((reg != 0) && (reg != 0xffffffff)) 1569 return false; 1570 1571 return true; 1572 } 1573 1574 /* 1575 * Check whether seamless boot is supported. 1576 * 1577 * So far we only support seamless boot on DCE 3.0 or later. 1578 * If users report that it works on older ASICS as well, we may 1579 * loosen this. 1580 */ 1581 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1582 { 1583 switch (amdgpu_seamless) { 1584 case -1: 1585 break; 1586 case 1: 1587 return true; 1588 case 0: 1589 return false; 1590 default: 1591 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1592 amdgpu_seamless); 1593 return false; 1594 } 1595 1596 if (!(adev->flags & AMD_IS_APU)) 1597 return false; 1598 1599 if (adev->mman.keep_stolen_vga_memory) 1600 return false; 1601 1602 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1603 } 1604 1605 /* 1606 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1607 * don't support dynamic speed switching. Until we have confirmation from Intel 1608 * that a specific host supports it, it's safer that we keep it disabled for all. 
1609 * 1610 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1611 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1612 */ 1613 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1614 { 1615 #if IS_ENABLED(CONFIG_X86) 1616 struct cpuinfo_x86 *c = &cpu_data(0); 1617 1618 /* eGPU change speeds based on USB4 fabric conditions */ 1619 if (dev_is_removable(adev->dev)) 1620 return true; 1621 1622 if (c->x86_vendor == X86_VENDOR_INTEL) 1623 return false; 1624 #endif 1625 return true; 1626 } 1627 1628 /** 1629 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1630 * 1631 * @adev: amdgpu_device pointer 1632 * 1633 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1634 * be set for this device. 1635 * 1636 * Returns true if it should be used or false if not. 1637 */ 1638 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1639 { 1640 switch (amdgpu_aspm) { 1641 case -1: 1642 break; 1643 case 0: 1644 return false; 1645 case 1: 1646 return true; 1647 default: 1648 return false; 1649 } 1650 if (adev->flags & AMD_IS_APU) 1651 return false; 1652 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1653 return false; 1654 return pcie_aspm_enabled(adev->pdev); 1655 } 1656 1657 /* if we get transitioned to only one device, take VGA back */ 1658 /** 1659 * amdgpu_device_vga_set_decode - enable/disable vga decode 1660 * 1661 * @pdev: PCI device pointer 1662 * @state: enable/disable vga decode 1663 * 1664 * Enable/disable vga decode (all asics). 1665 * Returns VGA resource flags. 1666 */ 1667 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1668 bool state) 1669 { 1670 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1671 1672 amdgpu_asic_set_vga_state(adev, state); 1673 if (state) 1674 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1675 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1676 else 1677 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1678 } 1679 1680 /** 1681 * amdgpu_device_check_block_size - validate the vm block size 1682 * 1683 * @adev: amdgpu_device pointer 1684 * 1685 * Validates the vm block size specified via module parameter. 1686 * The vm block size defines number of bits in page table versus page directory, 1687 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1688 * page table and the remaining bits are in the page directory. 1689 */ 1690 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1691 { 1692 /* defines number of bits in page table versus page directory, 1693 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1694 * page table and the remaining bits are in the page directory 1695 */ 1696 if (amdgpu_vm_block_size == -1) 1697 return; 1698 1699 if (amdgpu_vm_block_size < 9) { 1700 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1701 amdgpu_vm_block_size); 1702 amdgpu_vm_block_size = -1; 1703 } 1704 } 1705 1706 /** 1707 * amdgpu_device_check_vm_size - validate the vm size 1708 * 1709 * @adev: amdgpu_device pointer 1710 * 1711 * Validates the vm size in GB specified via module parameter. 1712 * The VM size is the size of the GPU virtual memory space in GB. 
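 *
 * For example (an assumed module-parameter invocation, not taken from this
 * file), booting with amdgpu.vm_size=256 requests a 256GB per-VM address
 * space; values below the 1GB minimum are rejected here and the parameter is
 * reset to -1 (use the default).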
1713 */ 1714 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1715 { 1716 /* no need to check the default value */ 1717 if (amdgpu_vm_size == -1) 1718 return; 1719 1720 if (amdgpu_vm_size < 1) { 1721 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1722 amdgpu_vm_size); 1723 amdgpu_vm_size = -1; 1724 } 1725 } 1726 1727 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1728 { 1729 struct sysinfo si; 1730 bool is_os_64 = (sizeof(void *) == 8); 1731 uint64_t total_memory; 1732 uint64_t dram_size_seven_GB = 0x1B8000000; 1733 uint64_t dram_size_three_GB = 0xB8000000; 1734 1735 if (amdgpu_smu_memory_pool_size == 0) 1736 return; 1737 1738 if (!is_os_64) { 1739 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1740 goto def_value; 1741 } 1742 si_meminfo(&si); 1743 total_memory = (uint64_t)si.totalram * si.mem_unit; 1744 1745 if ((amdgpu_smu_memory_pool_size == 1) || 1746 (amdgpu_smu_memory_pool_size == 2)) { 1747 if (total_memory < dram_size_three_GB) 1748 goto def_value1; 1749 } else if ((amdgpu_smu_memory_pool_size == 4) || 1750 (amdgpu_smu_memory_pool_size == 8)) { 1751 if (total_memory < dram_size_seven_GB) 1752 goto def_value1; 1753 } else { 1754 DRM_WARN("Smu memory pool size not supported\n"); 1755 goto def_value; 1756 } 1757 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1758 1759 return; 1760 1761 def_value1: 1762 DRM_WARN("No enough system memory\n"); 1763 def_value: 1764 adev->pm.smu_prv_buffer_size = 0; 1765 } 1766 1767 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1768 { 1769 if (!(adev->flags & AMD_IS_APU) || 1770 adev->asic_type < CHIP_RAVEN) 1771 return 0; 1772 1773 switch (adev->asic_type) { 1774 case CHIP_RAVEN: 1775 if (adev->pdev->device == 0x15dd) 1776 adev->apu_flags |= AMD_APU_IS_RAVEN; 1777 if (adev->pdev->device == 0x15d8) 1778 adev->apu_flags |= AMD_APU_IS_PICASSO; 1779 break; 1780 case CHIP_RENOIR: 1781 if ((adev->pdev->device == 0x1636) || 1782 (adev->pdev->device == 0x164c)) 1783 adev->apu_flags |= AMD_APU_IS_RENOIR; 1784 else 1785 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1786 break; 1787 case CHIP_VANGOGH: 1788 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1789 break; 1790 case CHIP_YELLOW_CARP: 1791 break; 1792 case CHIP_CYAN_SKILLFISH: 1793 if ((adev->pdev->device == 0x13FE) || 1794 (adev->pdev->device == 0x143F)) 1795 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1796 break; 1797 default: 1798 break; 1799 } 1800 1801 return 0; 1802 } 1803 1804 /** 1805 * amdgpu_device_check_arguments - validate module params 1806 * 1807 * @adev: amdgpu_device pointer 1808 * 1809 * Validates certain module parameters and updates 1810 * the associated values used by the driver (all asics). 
1811 */ 1812 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1813 { 1814 if (amdgpu_sched_jobs < 4) { 1815 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1816 amdgpu_sched_jobs); 1817 amdgpu_sched_jobs = 4; 1818 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1819 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1820 amdgpu_sched_jobs); 1821 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1822 } 1823 1824 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1825 /* gart size must be greater or equal to 32M */ 1826 dev_warn(adev->dev, "gart size (%d) too small\n", 1827 amdgpu_gart_size); 1828 amdgpu_gart_size = -1; 1829 } 1830 1831 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1832 /* gtt size must be greater or equal to 32M */ 1833 dev_warn(adev->dev, "gtt size (%d) too small\n", 1834 amdgpu_gtt_size); 1835 amdgpu_gtt_size = -1; 1836 } 1837 1838 /* valid range is between 4 and 9 inclusive */ 1839 if (amdgpu_vm_fragment_size != -1 && 1840 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1841 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1842 amdgpu_vm_fragment_size = -1; 1843 } 1844 1845 if (amdgpu_sched_hw_submission < 2) { 1846 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1847 amdgpu_sched_hw_submission); 1848 amdgpu_sched_hw_submission = 2; 1849 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1850 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1851 amdgpu_sched_hw_submission); 1852 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1853 } 1854 1855 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1856 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1857 amdgpu_reset_method = -1; 1858 } 1859 1860 amdgpu_device_check_smu_prv_buffer_size(adev); 1861 1862 amdgpu_device_check_vm_size(adev); 1863 1864 amdgpu_device_check_block_size(adev); 1865 1866 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1867 1868 return 0; 1869 } 1870 1871 /** 1872 * amdgpu_switcheroo_set_state - set switcheroo state 1873 * 1874 * @pdev: pci dev pointer 1875 * @state: vga_switcheroo state 1876 * 1877 * Callback for the switcheroo driver. Suspends or resumes 1878 * the asics before or after it is powered up using ACPI methods. 
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Checks whether the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
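 *
 * Illustrative call, assuming the usual gate request for a VCN block
 * (a sketch, not taken from this file):
 *
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *					       AMD_PG_STATE_GATE);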
1985 * Returns the error code from the last instance. 1986 */ 1987 int amdgpu_device_ip_set_powergating_state(void *dev, 1988 enum amd_ip_block_type block_type, 1989 enum amd_powergating_state state) 1990 { 1991 struct amdgpu_device *adev = dev; 1992 int i, r = 0; 1993 1994 for (i = 0; i < adev->num_ip_blocks; i++) { 1995 if (!adev->ip_blocks[i].status.valid) 1996 continue; 1997 if (adev->ip_blocks[i].version->type != block_type) 1998 continue; 1999 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2000 continue; 2001 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2002 (void *)adev, state); 2003 if (r) 2004 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2005 adev->ip_blocks[i].version->funcs->name, r); 2006 } 2007 return r; 2008 } 2009 2010 /** 2011 * amdgpu_device_ip_get_clockgating_state - get the CG state 2012 * 2013 * @adev: amdgpu_device pointer 2014 * @flags: clockgating feature flags 2015 * 2016 * Walks the list of IPs on the device and updates the clockgating 2017 * flags for each IP. 2018 * Updates @flags with the feature flags for each hardware IP where 2019 * clockgating is enabled. 2020 */ 2021 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2022 u64 *flags) 2023 { 2024 int i; 2025 2026 for (i = 0; i < adev->num_ip_blocks; i++) { 2027 if (!adev->ip_blocks[i].status.valid) 2028 continue; 2029 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2030 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2031 } 2032 } 2033 2034 /** 2035 * amdgpu_device_ip_wait_for_idle - wait for idle 2036 * 2037 * @adev: amdgpu_device pointer 2038 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2039 * 2040 * Waits for the request hardware IP to be idle. 2041 * Returns 0 for success or a negative error code on failure. 2042 */ 2043 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2044 enum amd_ip_block_type block_type) 2045 { 2046 int i, r; 2047 2048 for (i = 0; i < adev->num_ip_blocks; i++) { 2049 if (!adev->ip_blocks[i].status.valid) 2050 continue; 2051 if (adev->ip_blocks[i].version->type == block_type) { 2052 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2053 if (r) 2054 return r; 2055 break; 2056 } 2057 } 2058 return 0; 2059 2060 } 2061 2062 /** 2063 * amdgpu_device_ip_is_idle - is the hardware IP idle 2064 * 2065 * @adev: amdgpu_device pointer 2066 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2067 * 2068 * Check if the hardware IP is idle or not. 2069 * Returns true if it the IP is idle, false if not. 2070 */ 2071 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2072 enum amd_ip_block_type block_type) 2073 { 2074 int i; 2075 2076 for (i = 0; i < adev->num_ip_blocks; i++) { 2077 if (!adev->ip_blocks[i].status.valid) 2078 continue; 2079 if (adev->ip_blocks[i].version->type == block_type) 2080 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2081 } 2082 return true; 2083 2084 } 2085 2086 /** 2087 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2088 * 2089 * @adev: amdgpu_device pointer 2090 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2091 * 2092 * Returns a pointer to the hardware IP block structure 2093 * if it exists for the asic, otherwise NULL. 
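 *
 * Illustrative usage (sketch only, not taken from this file):
 *   struct amdgpu_ip_block *ip =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC);
 *   if (ip)
 *           DRM_INFO("GMC v%d.%d\n", ip->version->major, ip->version->minor);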
2094 */ 2095 struct amdgpu_ip_block * 2096 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2097 enum amd_ip_block_type type) 2098 { 2099 int i; 2100 2101 for (i = 0; i < adev->num_ip_blocks; i++) 2102 if (adev->ip_blocks[i].version->type == type) 2103 return &adev->ip_blocks[i]; 2104 2105 return NULL; 2106 } 2107 2108 /** 2109 * amdgpu_device_ip_block_version_cmp 2110 * 2111 * @adev: amdgpu_device pointer 2112 * @type: enum amd_ip_block_type 2113 * @major: major version 2114 * @minor: minor version 2115 * 2116 * return 0 if equal or greater 2117 * return 1 if smaller or the ip_block doesn't exist 2118 */ 2119 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2120 enum amd_ip_block_type type, 2121 u32 major, u32 minor) 2122 { 2123 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2124 2125 if (ip_block && ((ip_block->version->major > major) || 2126 ((ip_block->version->major == major) && 2127 (ip_block->version->minor >= minor)))) 2128 return 0; 2129 2130 return 1; 2131 } 2132 2133 /** 2134 * amdgpu_device_ip_block_add 2135 * 2136 * @adev: amdgpu_device pointer 2137 * @ip_block_version: pointer to the IP to add 2138 * 2139 * Adds the IP block driver information to the collection of IPs 2140 * on the asic. 2141 */ 2142 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2143 const struct amdgpu_ip_block_version *ip_block_version) 2144 { 2145 if (!ip_block_version) 2146 return -EINVAL; 2147 2148 switch (ip_block_version->type) { 2149 case AMD_IP_BLOCK_TYPE_VCN: 2150 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2151 return 0; 2152 break; 2153 case AMD_IP_BLOCK_TYPE_JPEG: 2154 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2155 return 0; 2156 break; 2157 default: 2158 break; 2159 } 2160 2161 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2162 ip_block_version->funcs->name); 2163 2164 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2165 2166 return 0; 2167 } 2168 2169 /** 2170 * amdgpu_device_enable_virtual_display - enable virtual display feature 2171 * 2172 * @adev: amdgpu_device pointer 2173 * 2174 * Enabled the virtual display feature if the user has enabled it via 2175 * the module parameter virtual_display. This feature provides a virtual 2176 * display hardware on headless boards or in virtualized environments. 2177 * This function parses and validates the configuration string specified by 2178 * the user and configues the virtual display configuration (number of 2179 * virtual connectors, crtcs, etc.) specified. 
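 *
 * Illustrative parameter format (derived from the parsing below, shown
 * only as an example):
 *   amdgpu.virtual_display=0000:03:00.0,2   - two virtual crtcs on that device
 *   amdgpu.virtual_display=all,1            - one virtual crtc on every device
 * Entries are separated by ';' and the crtc count is clamped to the 1-6 range.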
2180 */ 2181 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2182 { 2183 adev->enable_virtual_display = false; 2184 2185 if (amdgpu_virtual_display) { 2186 const char *pci_address_name = pci_name(adev->pdev); 2187 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2188 2189 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2190 pciaddstr_tmp = pciaddstr; 2191 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2192 pciaddname = strsep(&pciaddname_tmp, ","); 2193 if (!strcmp("all", pciaddname) 2194 || !strcmp(pci_address_name, pciaddname)) { 2195 long num_crtc; 2196 int res = -1; 2197 2198 adev->enable_virtual_display = true; 2199 2200 if (pciaddname_tmp) 2201 res = kstrtol(pciaddname_tmp, 10, 2202 &num_crtc); 2203 2204 if (!res) { 2205 if (num_crtc < 1) 2206 num_crtc = 1; 2207 if (num_crtc > 6) 2208 num_crtc = 6; 2209 adev->mode_info.num_crtc = num_crtc; 2210 } else { 2211 adev->mode_info.num_crtc = 1; 2212 } 2213 break; 2214 } 2215 } 2216 2217 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2218 amdgpu_virtual_display, pci_address_name, 2219 adev->enable_virtual_display, adev->mode_info.num_crtc); 2220 2221 kfree(pciaddstr); 2222 } 2223 } 2224 2225 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2226 { 2227 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2228 adev->mode_info.num_crtc = 1; 2229 adev->enable_virtual_display = true; 2230 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2231 adev->enable_virtual_display, adev->mode_info.num_crtc); 2232 } 2233 } 2234 2235 /** 2236 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2237 * 2238 * @adev: amdgpu_device pointer 2239 * 2240 * Parses the asic configuration parameters specified in the gpu info 2241 * firmware and makes them available to the driver for use in configuring 2242 * the asic. 2243 * Returns 0 on success, -EINVAL on failure.
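 *
 * Note (added for clarity): on ASICs that provide an IP discovery table
 * (adev->mman.discovery_bin is set) the gpu_info firmware is skipped and
 * this function returns 0 early.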
2244 */ 2245 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2246 { 2247 const char *chip_name; 2248 char fw_name[40]; 2249 int err; 2250 const struct gpu_info_firmware_header_v1_0 *hdr; 2251 2252 adev->firmware.gpu_info_fw = NULL; 2253 2254 if (adev->mman.discovery_bin) 2255 return 0; 2256 2257 switch (adev->asic_type) { 2258 default: 2259 return 0; 2260 case CHIP_VEGA10: 2261 chip_name = "vega10"; 2262 break; 2263 case CHIP_VEGA12: 2264 chip_name = "vega12"; 2265 break; 2266 case CHIP_RAVEN: 2267 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2268 chip_name = "raven2"; 2269 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2270 chip_name = "picasso"; 2271 else 2272 chip_name = "raven"; 2273 break; 2274 case CHIP_ARCTURUS: 2275 chip_name = "arcturus"; 2276 break; 2277 case CHIP_NAVI12: 2278 chip_name = "navi12"; 2279 break; 2280 } 2281 2282 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2283 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2284 if (err) { 2285 dev_err(adev->dev, 2286 "Failed to get gpu_info firmware \"%s\"\n", 2287 fw_name); 2288 goto out; 2289 } 2290 2291 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2292 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2293 2294 switch (hdr->version_major) { 2295 case 1: 2296 { 2297 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2298 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2299 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2300 2301 /* 2302 * Should be droped when DAL no longer needs it. 2303 */ 2304 if (adev->asic_type == CHIP_NAVI12) 2305 goto parse_soc_bounding_box; 2306 2307 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2308 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2309 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2310 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2311 adev->gfx.config.max_texture_channel_caches = 2312 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2313 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2314 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2315 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2316 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2317 adev->gfx.config.double_offchip_lds_buf = 2318 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2319 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2320 adev->gfx.cu_info.max_waves_per_simd = 2321 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2322 adev->gfx.cu_info.max_scratch_slots_per_cu = 2323 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2324 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2325 if (hdr->version_minor >= 1) { 2326 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2327 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2328 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2329 adev->gfx.config.num_sc_per_sh = 2330 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2331 adev->gfx.config.num_packer_per_sc = 2332 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2333 } 2334 2335 parse_soc_bounding_box: 2336 /* 2337 * soc bounding box info is not integrated in disocovery table, 2338 * we always need to parse it from gpu info firmware if needed. 
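 * (Only version 1.2 of the gpu_info firmware carries the bounding box,
 * hence the version_minor == 2 check below.)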
2339 */ 2340 if (hdr->version_minor == 2) { 2341 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2342 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2343 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2344 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2345 } 2346 break; 2347 } 2348 default: 2349 dev_err(adev->dev, 2350 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2351 err = -EINVAL; 2352 goto out; 2353 } 2354 out: 2355 return err; 2356 } 2357 2358 /** 2359 * amdgpu_device_ip_early_init - run early init for hardware IPs 2360 * 2361 * @adev: amdgpu_device pointer 2362 * 2363 * Early initialization pass for hardware IPs. The hardware IPs that make 2364 * up each asic are discovered each IP's early_init callback is run. This 2365 * is the first stage in initializing the asic. 2366 * Returns 0 on success, negative error code on failure. 2367 */ 2368 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2369 { 2370 struct pci_dev *parent; 2371 int i, r; 2372 bool total; 2373 2374 amdgpu_device_enable_virtual_display(adev); 2375 2376 if (amdgpu_sriov_vf(adev)) { 2377 r = amdgpu_virt_request_full_gpu(adev, true); 2378 if (r) 2379 return r; 2380 } 2381 2382 switch (adev->asic_type) { 2383 #ifdef CONFIG_DRM_AMDGPU_SI 2384 case CHIP_VERDE: 2385 case CHIP_TAHITI: 2386 case CHIP_PITCAIRN: 2387 case CHIP_OLAND: 2388 case CHIP_HAINAN: 2389 adev->family = AMDGPU_FAMILY_SI; 2390 r = si_set_ip_blocks(adev); 2391 if (r) 2392 return r; 2393 break; 2394 #endif 2395 #ifdef CONFIG_DRM_AMDGPU_CIK 2396 case CHIP_BONAIRE: 2397 case CHIP_HAWAII: 2398 case CHIP_KAVERI: 2399 case CHIP_KABINI: 2400 case CHIP_MULLINS: 2401 if (adev->flags & AMD_IS_APU) 2402 adev->family = AMDGPU_FAMILY_KV; 2403 else 2404 adev->family = AMDGPU_FAMILY_CI; 2405 2406 r = cik_set_ip_blocks(adev); 2407 if (r) 2408 return r; 2409 break; 2410 #endif 2411 case CHIP_TOPAZ: 2412 case CHIP_TONGA: 2413 case CHIP_FIJI: 2414 case CHIP_POLARIS10: 2415 case CHIP_POLARIS11: 2416 case CHIP_POLARIS12: 2417 case CHIP_VEGAM: 2418 case CHIP_CARRIZO: 2419 case CHIP_STONEY: 2420 if (adev->flags & AMD_IS_APU) 2421 adev->family = AMDGPU_FAMILY_CZ; 2422 else 2423 adev->family = AMDGPU_FAMILY_VI; 2424 2425 r = vi_set_ip_blocks(adev); 2426 if (r) 2427 return r; 2428 break; 2429 default: 2430 r = amdgpu_discovery_set_ip_blocks(adev); 2431 if (r) 2432 return r; 2433 break; 2434 } 2435 2436 if (amdgpu_has_atpx() && 2437 (amdgpu_is_atpx_hybrid() || 2438 amdgpu_has_atpx_dgpu_power_cntl()) && 2439 ((adev->flags & AMD_IS_APU) == 0) && 2440 !dev_is_removable(&adev->pdev->dev)) 2441 adev->flags |= AMD_IS_PX; 2442 2443 if (!(adev->flags & AMD_IS_APU)) { 2444 parent = pcie_find_root_port(adev->pdev); 2445 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2446 } 2447 2448 2449 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2450 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2451 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2452 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2453 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2454 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2455 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2456 2457 total = true; 2458 for (i = 0; i < adev->num_ip_blocks; i++) { 2459 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2460 DRM_WARN("disabled ip block: %d <%s>\n", 2461 i, adev->ip_blocks[i].version->funcs->name); 2462 adev->ip_blocks[i].status.valid = false; 2463 } else { 2464 if (adev->ip_blocks[i].version->funcs->early_init) { 2465 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2466 if (r == -ENOENT) { 2467 adev->ip_blocks[i].status.valid = false; 2468 } else if (r) { 2469 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2470 adev->ip_blocks[i].version->funcs->name, r); 2471 total = false; 2472 } else { 2473 adev->ip_blocks[i].status.valid = true; 2474 } 2475 } else { 2476 adev->ip_blocks[i].status.valid = true; 2477 } 2478 } 2479 /* get the vbios after the asic_funcs are set up */ 2480 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2481 r = amdgpu_device_parse_gpu_info_fw(adev); 2482 if (r) 2483 return r; 2484 2485 /* Read BIOS */ 2486 if (amdgpu_device_read_bios(adev)) { 2487 if (!amdgpu_get_bios(adev)) 2488 return -EINVAL; 2489 2490 r = amdgpu_atombios_init(adev); 2491 if (r) { 2492 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2493 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2494 return r; 2495 } 2496 } 2497 2498 /*get pf2vf msg info at it's earliest time*/ 2499 if (amdgpu_sriov_vf(adev)) 2500 amdgpu_virt_init_data_exchange(adev); 2501 2502 } 2503 } 2504 if (!total) 2505 return -ENODEV; 2506 2507 amdgpu_amdkfd_device_probe(adev); 2508 adev->cg_flags &= amdgpu_cg_mask; 2509 adev->pg_flags &= amdgpu_pg_mask; 2510 2511 return 0; 2512 } 2513 2514 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2515 { 2516 int i, r; 2517 2518 for (i = 0; i < adev->num_ip_blocks; i++) { 2519 if (!adev->ip_blocks[i].status.sw) 2520 continue; 2521 if (adev->ip_blocks[i].status.hw) 2522 continue; 2523 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2524 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2525 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2526 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2527 if (r) { 2528 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2529 adev->ip_blocks[i].version->funcs->name, r); 2530 return r; 2531 } 2532 adev->ip_blocks[i].status.hw = true; 2533 } 2534 } 2535 2536 return 0; 2537 } 2538 2539 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2540 { 2541 int i, r; 2542 2543 for (i = 0; i < adev->num_ip_blocks; i++) { 2544 if (!adev->ip_blocks[i].status.sw) 2545 continue; 2546 if (adev->ip_blocks[i].status.hw) 2547 continue; 2548 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2549 if (r) { 2550 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2551 adev->ip_blocks[i].version->funcs->name, r); 2552 return r; 2553 } 2554 adev->ip_blocks[i].status.hw = true; 2555 } 2556 2557 return 0; 2558 } 2559 2560 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2561 { 2562 int r = 0; 2563 int i; 2564 uint32_t 
smu_version; 2565 2566 if (adev->asic_type >= CHIP_VEGA10) { 2567 for (i = 0; i < adev->num_ip_blocks; i++) { 2568 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2569 continue; 2570 2571 if (!adev->ip_blocks[i].status.sw) 2572 continue; 2573 2574 /* no need to do the fw loading again if already done*/ 2575 if (adev->ip_blocks[i].status.hw == true) 2576 break; 2577 2578 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2579 r = adev->ip_blocks[i].version->funcs->resume(adev); 2580 if (r) { 2581 DRM_ERROR("resume of IP block <%s> failed %d\n", 2582 adev->ip_blocks[i].version->funcs->name, r); 2583 return r; 2584 } 2585 } else { 2586 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2587 if (r) { 2588 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2589 adev->ip_blocks[i].version->funcs->name, r); 2590 return r; 2591 } 2592 } 2593 2594 adev->ip_blocks[i].status.hw = true; 2595 break; 2596 } 2597 } 2598 2599 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2600 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2601 2602 return r; 2603 } 2604 2605 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2606 { 2607 long timeout; 2608 int r, i; 2609 2610 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2611 struct amdgpu_ring *ring = adev->rings[i]; 2612 2613 /* No need to setup the GPU scheduler for rings that don't need it */ 2614 if (!ring || ring->no_scheduler) 2615 continue; 2616 2617 switch (ring->funcs->type) { 2618 case AMDGPU_RING_TYPE_GFX: 2619 timeout = adev->gfx_timeout; 2620 break; 2621 case AMDGPU_RING_TYPE_COMPUTE: 2622 timeout = adev->compute_timeout; 2623 break; 2624 case AMDGPU_RING_TYPE_SDMA: 2625 timeout = adev->sdma_timeout; 2626 break; 2627 default: 2628 timeout = adev->video_timeout; 2629 break; 2630 } 2631 2632 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2633 DRM_SCHED_PRIORITY_COUNT, 2634 ring->num_hw_submission, 0, 2635 timeout, adev->reset_domain->wq, 2636 ring->sched_score, ring->name, 2637 adev->dev); 2638 if (r) { 2639 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2640 ring->name); 2641 return r; 2642 } 2643 r = amdgpu_uvd_entity_init(adev, ring); 2644 if (r) { 2645 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2646 ring->name); 2647 return r; 2648 } 2649 r = amdgpu_vce_entity_init(adev, ring); 2650 if (r) { 2651 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2652 ring->name); 2653 return r; 2654 } 2655 } 2656 2657 amdgpu_xcp_update_partition_sched_list(adev); 2658 2659 return 0; 2660 } 2661 2662 2663 /** 2664 * amdgpu_device_ip_init - run init for hardware IPs 2665 * 2666 * @adev: amdgpu_device pointer 2667 * 2668 * Main initialization pass for hardware IPs. The list of all the hardware 2669 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2670 * are run. sw_init initializes the software state associated with each IP 2671 * and hw_init initializes the hardware associated with each IP. 2672 * Returns 0 on success, negative error code on failure. 
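 *
 * Summary of the ordering (added for clarity): sw_init runs for every valid
 * block, with COMMON and GMC brought up early so GPU memory can be
 * allocated, followed by the phase1 hw_init, firmware loading and phase2
 * hw_init helpers defined above.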
2673 */ 2674 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2675 { 2676 int i, r; 2677 2678 r = amdgpu_ras_init(adev); 2679 if (r) 2680 return r; 2681 2682 for (i = 0; i < adev->num_ip_blocks; i++) { 2683 if (!adev->ip_blocks[i].status.valid) 2684 continue; 2685 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2686 if (r) { 2687 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2688 adev->ip_blocks[i].version->funcs->name, r); 2689 goto init_failed; 2690 } 2691 adev->ip_blocks[i].status.sw = true; 2692 2693 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2694 /* need to do common hw init early so everything is set up for gmc */ 2695 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2696 if (r) { 2697 DRM_ERROR("hw_init %d failed %d\n", i, r); 2698 goto init_failed; 2699 } 2700 adev->ip_blocks[i].status.hw = true; 2701 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2702 /* need to do gmc hw init early so we can allocate gpu mem */ 2703 /* Try to reserve bad pages early */ 2704 if (amdgpu_sriov_vf(adev)) 2705 amdgpu_virt_exchange_data(adev); 2706 2707 r = amdgpu_device_mem_scratch_init(adev); 2708 if (r) { 2709 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2710 goto init_failed; 2711 } 2712 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2713 if (r) { 2714 DRM_ERROR("hw_init %d failed %d\n", i, r); 2715 goto init_failed; 2716 } 2717 r = amdgpu_device_wb_init(adev); 2718 if (r) { 2719 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2720 goto init_failed; 2721 } 2722 adev->ip_blocks[i].status.hw = true; 2723 2724 /* right after GMC hw init, we create CSA */ 2725 if (adev->gfx.mcbp) { 2726 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2727 AMDGPU_GEM_DOMAIN_VRAM | 2728 AMDGPU_GEM_DOMAIN_GTT, 2729 AMDGPU_CSA_SIZE); 2730 if (r) { 2731 DRM_ERROR("allocate CSA failed %d\n", r); 2732 goto init_failed; 2733 } 2734 } 2735 2736 r = amdgpu_seq64_init(adev); 2737 if (r) { 2738 DRM_ERROR("allocate seq64 failed %d\n", r); 2739 goto init_failed; 2740 } 2741 } 2742 } 2743 2744 if (amdgpu_sriov_vf(adev)) 2745 amdgpu_virt_init_data_exchange(adev); 2746 2747 r = amdgpu_ib_pool_init(adev); 2748 if (r) { 2749 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2750 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2751 goto init_failed; 2752 } 2753 2754 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2755 if (r) 2756 goto init_failed; 2757 2758 r = amdgpu_device_ip_hw_init_phase1(adev); 2759 if (r) 2760 goto init_failed; 2761 2762 r = amdgpu_device_fw_loading(adev); 2763 if (r) 2764 goto init_failed; 2765 2766 r = amdgpu_device_ip_hw_init_phase2(adev); 2767 if (r) 2768 goto init_failed; 2769 2770 /* 2771 * retired pages will be loaded from eeprom and reserved here, 2772 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2773 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2774 * for I2C communication which only true at this point. 2775 * 2776 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2777 * failure from bad gpu situation and stop amdgpu init process 2778 * accordingly. For other failed cases, it will still release all 2779 * the resource and print error message, rather than returning one 2780 * negative value to upper level. 
2781 * 2782 * Note: theoretically, this should be called before all vram allocations 2783 * to protect retired page from abusing 2784 */ 2785 r = amdgpu_ras_recovery_init(adev); 2786 if (r) 2787 goto init_failed; 2788 2789 /** 2790 * In case of XGMI grab extra reference for reset domain for this device 2791 */ 2792 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2793 if (amdgpu_xgmi_add_device(adev) == 0) { 2794 if (!amdgpu_sriov_vf(adev)) { 2795 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2796 2797 if (WARN_ON(!hive)) { 2798 r = -ENOENT; 2799 goto init_failed; 2800 } 2801 2802 if (!hive->reset_domain || 2803 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2804 r = -ENOENT; 2805 amdgpu_put_xgmi_hive(hive); 2806 goto init_failed; 2807 } 2808 2809 /* Drop the early temporary reset domain we created for device */ 2810 amdgpu_reset_put_reset_domain(adev->reset_domain); 2811 adev->reset_domain = hive->reset_domain; 2812 amdgpu_put_xgmi_hive(hive); 2813 } 2814 } 2815 } 2816 2817 r = amdgpu_device_init_schedulers(adev); 2818 if (r) 2819 goto init_failed; 2820 2821 if (adev->mman.buffer_funcs_ring->sched.ready) 2822 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2823 2824 /* Don't init kfd if whole hive need to be reset during init */ 2825 if (!adev->gmc.xgmi.pending_reset) { 2826 kgd2kfd_init_zone_device(adev); 2827 amdgpu_amdkfd_device_init(adev); 2828 } 2829 2830 amdgpu_fru_get_product_info(adev); 2831 2832 init_failed: 2833 2834 return r; 2835 } 2836 2837 /** 2838 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2839 * 2840 * @adev: amdgpu_device pointer 2841 * 2842 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2843 * this function before a GPU reset. If the value is retained after a 2844 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2845 */ 2846 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2847 { 2848 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2849 } 2850 2851 /** 2852 * amdgpu_device_check_vram_lost - check if vram is valid 2853 * 2854 * @adev: amdgpu_device pointer 2855 * 2856 * Checks the reset magic value written to the gart pointer in VRAM. 2857 * The driver calls this after a GPU reset to see if the contents of 2858 * VRAM is lost or now. 2859 * returns true if vram is lost, false if not. 2860 */ 2861 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2862 { 2863 if (memcmp(adev->gart.ptr, adev->reset_magic, 2864 AMDGPU_RESET_MAGIC_NUM)) 2865 return true; 2866 2867 if (!amdgpu_in_reset(adev)) 2868 return false; 2869 2870 /* 2871 * For all ASICs with baco/mode1 reset, the VRAM is 2872 * always assumed to be lost. 2873 */ 2874 switch (amdgpu_asic_reset_method(adev)) { 2875 case AMD_RESET_METHOD_BACO: 2876 case AMD_RESET_METHOD_MODE1: 2877 return true; 2878 default: 2879 return false; 2880 } 2881 } 2882 2883 /** 2884 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2885 * 2886 * @adev: amdgpu_device pointer 2887 * @state: clockgating state (gate or ungate) 2888 * 2889 * The list of all the hardware IPs that make up the asic is walked and the 2890 * set_clockgating_state callbacks are run. 2891 * Late initialization pass enabling clockgating for hardware IPs. 2892 * Fini or suspend, pass disabling clockgating for hardware IPs. 2893 * Returns 0 on success, negative error code on failure. 
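 *
 * Note (added for clarity): blocks are gated in list order and ungated in
 * reverse order, see the index calculation on 'i' below.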
2894 */ 2895 2896 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2897 enum amd_clockgating_state state) 2898 { 2899 int i, j, r; 2900 2901 if (amdgpu_emu_mode == 1) 2902 return 0; 2903 2904 for (j = 0; j < adev->num_ip_blocks; j++) { 2905 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2906 if (!adev->ip_blocks[i].status.late_initialized) 2907 continue; 2908 /* skip CG for GFX, SDMA on S0ix */ 2909 if (adev->in_s0ix && 2910 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2911 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2912 continue; 2913 /* skip CG for VCE/UVD, it's handled specially */ 2914 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2915 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2916 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2917 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2918 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2919 /* enable clockgating to save power */ 2920 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2921 state); 2922 if (r) { 2923 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2924 adev->ip_blocks[i].version->funcs->name, r); 2925 return r; 2926 } 2927 } 2928 } 2929 2930 return 0; 2931 } 2932 2933 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2934 enum amd_powergating_state state) 2935 { 2936 int i, j, r; 2937 2938 if (amdgpu_emu_mode == 1) 2939 return 0; 2940 2941 for (j = 0; j < adev->num_ip_blocks; j++) { 2942 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2943 if (!adev->ip_blocks[i].status.late_initialized) 2944 continue; 2945 /* skip PG for GFX, SDMA on S0ix */ 2946 if (adev->in_s0ix && 2947 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2948 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2949 continue; 2950 /* skip CG for VCE/UVD, it's handled specially */ 2951 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2952 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2953 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2954 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2955 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2956 /* enable powergating to save power */ 2957 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2958 state); 2959 if (r) { 2960 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2961 adev->ip_blocks[i].version->funcs->name, r); 2962 return r; 2963 } 2964 } 2965 } 2966 return 0; 2967 } 2968 2969 static int amdgpu_device_enable_mgpu_fan_boost(void) 2970 { 2971 struct amdgpu_gpu_instance *gpu_ins; 2972 struct amdgpu_device *adev; 2973 int i, ret = 0; 2974 2975 mutex_lock(&mgpu_info.mutex); 2976 2977 /* 2978 * MGPU fan boost feature should be enabled 2979 * only when there are two or more dGPUs in 2980 * the system 2981 */ 2982 if (mgpu_info.num_dgpu < 2) 2983 goto out; 2984 2985 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2986 gpu_ins = &(mgpu_info.gpu_ins[i]); 2987 adev = gpu_ins->adev; 2988 if (!(adev->flags & AMD_IS_APU) && 2989 !gpu_ins->mgpu_fan_enabled) { 2990 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2991 if (ret) 2992 break; 2993 2994 gpu_ins->mgpu_fan_enabled = 1; 2995 } 2996 } 2997 2998 out: 2999 mutex_unlock(&mgpu_info.mutex); 3000 3001 return ret; 3002 } 3003 3004 /** 3005 * amdgpu_device_ip_late_init - run late init for hardware IPs 3006 * 3007 * @adev: 
amdgpu_device pointer 3008 * 3009 * Late initialization pass for hardware IPs. The list of all the hardware 3010 * IPs that make up the asic is walked and the late_init callbacks are run. 3011 * late_init covers any special initialization that an IP requires 3012 * after all of the IPs have been initialized, or anything that needs to happen 3013 * late in the init process. 3014 * Returns 0 on success, negative error code on failure. 3015 */ 3016 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3017 { 3018 struct amdgpu_gpu_instance *gpu_instance; 3019 int i = 0, r; 3020 3021 for (i = 0; i < adev->num_ip_blocks; i++) { 3022 if (!adev->ip_blocks[i].status.hw) 3023 continue; 3024 if (adev->ip_blocks[i].version->funcs->late_init) { 3025 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 3026 if (r) { 3027 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3028 adev->ip_blocks[i].version->funcs->name, r); 3029 return r; 3030 } 3031 } 3032 adev->ip_blocks[i].status.late_initialized = true; 3033 } 3034 3035 r = amdgpu_ras_late_init(adev); 3036 if (r) { 3037 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3038 return r; 3039 } 3040 3041 amdgpu_ras_set_error_query_ready(adev, true); 3042 3043 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3044 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3045 3046 amdgpu_device_fill_reset_magic(adev); 3047 3048 r = amdgpu_device_enable_mgpu_fan_boost(); 3049 if (r) 3050 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3051 3052 /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */ 3053 if (amdgpu_passthrough(adev) && 3054 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3055 adev->asic_type == CHIP_ALDEBARAN)) 3056 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3057 3058 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3059 mutex_lock(&mgpu_info.mutex); 3060 3061 /* 3062 * Reset the device p-state to low, since it was booted with a high p-state. 3063 * 3064 * This should be performed only after all devices from the same 3065 * hive get initialized. 3066 * 3067 * However, the number of devices in the hive is not known in advance; 3068 * it is counted one by one as each device is initialized. 3069 * 3070 * So we wait until all XGMI-interlinked devices are initialized. 3071 * This may add some delay, as those devices may come from 3072 * different hives. But that should be OK.
3073 */ 3074 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3075 for (i = 0; i < mgpu_info.num_gpu; i++) { 3076 gpu_instance = &(mgpu_info.gpu_ins[i]); 3077 if (gpu_instance->adev->flags & AMD_IS_APU) 3078 continue; 3079 3080 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3081 AMDGPU_XGMI_PSTATE_MIN); 3082 if (r) { 3083 DRM_ERROR("pstate setting failed (%d).\n", r); 3084 break; 3085 } 3086 } 3087 } 3088 3089 mutex_unlock(&mgpu_info.mutex); 3090 } 3091 3092 return 0; 3093 } 3094 3095 /** 3096 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3097 * 3098 * @adev: amdgpu_device pointer 3099 * 3100 * For ASICs that need to disable the SMC first 3101 */ 3102 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3103 { 3104 int i, r; 3105 3106 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3107 return; 3108 3109 for (i = 0; i < adev->num_ip_blocks; i++) { 3110 if (!adev->ip_blocks[i].status.hw) 3111 continue; 3112 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3113 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3114 /* XXX handle errors */ 3115 if (r) { 3116 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3117 adev->ip_blocks[i].version->funcs->name, r); 3118 } 3119 adev->ip_blocks[i].status.hw = false; 3120 break; 3121 } 3122 } 3123 } 3124 3125 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3126 { 3127 int i, r; 3128 3129 for (i = 0; i < adev->num_ip_blocks; i++) { 3130 if (!adev->ip_blocks[i].version->funcs->early_fini) 3131 continue; 3132 3133 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3134 if (r) { 3135 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3136 adev->ip_blocks[i].version->funcs->name, r); 3137 } 3138 } 3139 3140 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3141 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3142 3143 amdgpu_amdkfd_suspend(adev, false); 3144 3145 /* Workaround for ASICs that need to disable the SMC first */ 3146 amdgpu_device_smu_fini_early(adev); 3147 3148 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3149 if (!adev->ip_blocks[i].status.hw) 3150 continue; 3151 3152 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3153 /* XXX handle errors */ 3154 if (r) { 3155 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3156 adev->ip_blocks[i].version->funcs->name, r); 3157 } 3158 3159 adev->ip_blocks[i].status.hw = false; 3160 } 3161 3162 if (amdgpu_sriov_vf(adev)) { 3163 if (amdgpu_virt_release_full_gpu(adev, false)) 3164 DRM_ERROR("failed to release exclusive mode on fini\n"); 3165 } 3166 3167 return 0; 3168 } 3169 3170 /** 3171 * amdgpu_device_ip_fini - run fini for hardware IPs 3172 * 3173 * @adev: amdgpu_device pointer 3174 * 3175 * Main teardown pass for hardware IPs. The list of all the hardware 3176 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3177 * are run. hw_fini tears down the hardware associated with each IP 3178 * and sw_fini tears down any software state associated with each IP. 3179 * Returns 0 on success, negative error code on failure.
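 *
 * Note (added for clarity): blocks are torn down in reverse order of
 * initialization, and the GMC-owned helpers (ucode BO, static CSA,
 * writeback, scratch memory, IB pool, seq64) are released when the walk
 * reaches the GMC block.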
3180 */ 3181 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3182 { 3183 int i, r; 3184 3185 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3186 amdgpu_virt_release_ras_err_handler_data(adev); 3187 3188 if (adev->gmc.xgmi.num_physical_nodes > 1) 3189 amdgpu_xgmi_remove_device(adev); 3190 3191 amdgpu_amdkfd_device_fini_sw(adev); 3192 3193 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3194 if (!adev->ip_blocks[i].status.sw) 3195 continue; 3196 3197 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3198 amdgpu_ucode_free_bo(adev); 3199 amdgpu_free_static_csa(&adev->virt.csa_obj); 3200 amdgpu_device_wb_fini(adev); 3201 amdgpu_device_mem_scratch_fini(adev); 3202 amdgpu_ib_pool_fini(adev); 3203 amdgpu_seq64_fini(adev); 3204 } 3205 3206 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3207 /* XXX handle errors */ 3208 if (r) { 3209 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3210 adev->ip_blocks[i].version->funcs->name, r); 3211 } 3212 adev->ip_blocks[i].status.sw = false; 3213 adev->ip_blocks[i].status.valid = false; 3214 } 3215 3216 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3217 if (!adev->ip_blocks[i].status.late_initialized) 3218 continue; 3219 if (adev->ip_blocks[i].version->funcs->late_fini) 3220 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3221 adev->ip_blocks[i].status.late_initialized = false; 3222 } 3223 3224 amdgpu_ras_fini(adev); 3225 3226 return 0; 3227 } 3228 3229 /** 3230 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3231 * 3232 * @work: work_struct. 3233 */ 3234 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3235 { 3236 struct amdgpu_device *adev = 3237 container_of(work, struct amdgpu_device, delayed_init_work.work); 3238 int r; 3239 3240 r = amdgpu_ib_ring_tests(adev); 3241 if (r) 3242 DRM_ERROR("ib ring test failed (%d).\n", r); 3243 } 3244 3245 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3246 { 3247 struct amdgpu_device *adev = 3248 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3249 3250 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3251 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3252 3253 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3254 adev->gfx.gfx_off_state = true; 3255 } 3256 3257 /** 3258 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3259 * 3260 * @adev: amdgpu_device pointer 3261 * 3262 * Main suspend function for hardware IPs. The list of all the hardware 3263 * IPs that make up the asic is walked, clockgating is disabled and the 3264 * suspend callbacks are run. suspend puts the hardware and software state 3265 * in each IP into a state suitable for suspend. 3266 * Returns 0 on success, negative error code on failure. 3267 */ 3268 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3269 { 3270 int i, r; 3271 3272 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3273 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3274 3275 /* 3276 * Per PMFW team's suggestion, driver needs to handle gfxoff 3277 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3278 * scenario. Add the missing df cstate disablement here. 
3279 */ 3280 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3281 dev_warn(adev->dev, "Failed to disallow df cstate"); 3282 3283 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3284 if (!adev->ip_blocks[i].status.valid) 3285 continue; 3286 3287 /* displays are handled separately */ 3288 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3289 continue; 3290 3291 /* XXX handle errors */ 3292 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3293 /* XXX handle errors */ 3294 if (r) { 3295 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3296 adev->ip_blocks[i].version->funcs->name, r); 3297 return r; 3298 } 3299 3300 adev->ip_blocks[i].status.hw = false; 3301 } 3302 3303 return 0; 3304 } 3305 3306 /** 3307 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3308 * 3309 * @adev: amdgpu_device pointer 3310 * 3311 * Main suspend function for hardware IPs. The list of all the hardware 3312 * IPs that make up the asic is walked, clockgating is disabled and the 3313 * suspend callbacks are run. suspend puts the hardware and software state 3314 * in each IP into a state suitable for suspend. 3315 * Returns 0 on success, negative error code on failure. 3316 */ 3317 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3318 { 3319 int i, r; 3320 3321 if (adev->in_s0ix) 3322 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3323 3324 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3325 if (!adev->ip_blocks[i].status.valid) 3326 continue; 3327 /* displays are handled in phase1 */ 3328 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3329 continue; 3330 /* PSP lost connection when err_event_athub occurs */ 3331 if (amdgpu_ras_intr_triggered() && 3332 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3333 adev->ip_blocks[i].status.hw = false; 3334 continue; 3335 } 3336 3337 /* skip unnecessary suspend if we do not initialize them yet */ 3338 if (adev->gmc.xgmi.pending_reset && 3339 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3340 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3341 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3342 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3343 adev->ip_blocks[i].status.hw = false; 3344 continue; 3345 } 3346 3347 /* skip suspend of gfx/mes and psp for S0ix 3348 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3349 * like at runtime. PSP is also part of the always on hardware 3350 * so no need to suspend it. 3351 */ 3352 if (adev->in_s0ix && 3353 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3354 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3355 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3356 continue; 3357 3358 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3359 if (adev->in_s0ix && 3360 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3361 IP_VERSION(5, 0, 0)) && 3362 (adev->ip_blocks[i].version->type == 3363 AMD_IP_BLOCK_TYPE_SDMA)) 3364 continue; 3365 3366 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3367 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3368 * from this location and RLC Autoload automatically also gets loaded 3369 * from here based on PMFW -> PSP message during re-init sequence. 3370 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3371 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3372 */ 3373 if (amdgpu_in_reset(adev) && 3374 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3375 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3376 continue; 3377 3378 /* XXX handle errors */ 3379 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3380 /* XXX handle errors */ 3381 if (r) { 3382 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3383 adev->ip_blocks[i].version->funcs->name, r); 3384 } 3385 adev->ip_blocks[i].status.hw = false; 3386 /* handle putting the SMC in the appropriate state */ 3387 if (!amdgpu_sriov_vf(adev)) { 3388 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3389 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3390 if (r) { 3391 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3392 adev->mp1_state, r); 3393 return r; 3394 } 3395 } 3396 } 3397 } 3398 3399 return 0; 3400 } 3401 3402 /** 3403 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3404 * 3405 * @adev: amdgpu_device pointer 3406 * 3407 * Main suspend function for hardware IPs. The list of all the hardware 3408 * IPs that make up the asic is walked, clockgating is disabled and the 3409 * suspend callbacks are run. suspend puts the hardware and software state 3410 * in each IP into a state suitable for suspend. 3411 * Returns 0 on success, negative error code on failure. 3412 */ 3413 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3414 { 3415 int r; 3416 3417 if (amdgpu_sriov_vf(adev)) { 3418 amdgpu_virt_fini_data_exchange(adev); 3419 amdgpu_virt_request_full_gpu(adev, false); 3420 } 3421 3422 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3423 3424 r = amdgpu_device_ip_suspend_phase1(adev); 3425 if (r) 3426 return r; 3427 r = amdgpu_device_ip_suspend_phase2(adev); 3428 3429 if (amdgpu_sriov_vf(adev)) 3430 amdgpu_virt_release_full_gpu(adev, false); 3431 3432 return r; 3433 } 3434 3435 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3436 { 3437 int i, r; 3438 3439 static enum amd_ip_block_type ip_order[] = { 3440 AMD_IP_BLOCK_TYPE_COMMON, 3441 AMD_IP_BLOCK_TYPE_GMC, 3442 AMD_IP_BLOCK_TYPE_PSP, 3443 AMD_IP_BLOCK_TYPE_IH, 3444 }; 3445 3446 for (i = 0; i < adev->num_ip_blocks; i++) { 3447 int j; 3448 struct amdgpu_ip_block *block; 3449 3450 block = &adev->ip_blocks[i]; 3451 block->status.hw = false; 3452 3453 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3454 3455 if (block->version->type != ip_order[j] || 3456 !block->status.valid) 3457 continue; 3458 3459 r = block->version->funcs->hw_init(adev); 3460 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3461 if (r) 3462 return r; 3463 block->status.hw = true; 3464 } 3465 } 3466 3467 return 0; 3468 } 3469 3470 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3471 { 3472 int i, r; 3473 3474 static enum amd_ip_block_type ip_order[] = { 3475 AMD_IP_BLOCK_TYPE_SMC, 3476 AMD_IP_BLOCK_TYPE_DCE, 3477 AMD_IP_BLOCK_TYPE_GFX, 3478 AMD_IP_BLOCK_TYPE_SDMA, 3479 AMD_IP_BLOCK_TYPE_MES, 3480 AMD_IP_BLOCK_TYPE_UVD, 3481 AMD_IP_BLOCK_TYPE_VCE, 3482 AMD_IP_BLOCK_TYPE_VCN, 3483 AMD_IP_BLOCK_TYPE_JPEG 3484 }; 3485 3486 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3487 int j; 3488 struct amdgpu_ip_block *block; 3489 3490 for (j = 0; j < adev->num_ip_blocks; j++) { 3491 block = &adev->ip_blocks[j]; 3492 3493 if (block->version->type != ip_order[i] || 3494 !block->status.valid || 3495 block->status.hw) 3496 continue; 3497 3498 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3499 r = block->version->funcs->resume(adev); 3500 else 
3501 r = block->version->funcs->hw_init(adev); 3502 3503 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3504 if (r) 3505 return r; 3506 block->status.hw = true; 3507 } 3508 } 3509 3510 return 0; 3511 } 3512 3513 /** 3514 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3515 * 3516 * @adev: amdgpu_device pointer 3517 * 3518 * First resume function for hardware IPs. The list of all the hardware 3519 * IPs that make up the asic is walked and the resume callbacks are run for 3520 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3521 * after a suspend and updates the software state as necessary. This 3522 * function is also used for restoring the GPU after a GPU reset. 3523 * Returns 0 on success, negative error code on failure. 3524 */ 3525 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3526 { 3527 int i, r; 3528 3529 for (i = 0; i < adev->num_ip_blocks; i++) { 3530 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3531 continue; 3532 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3533 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3534 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3535 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3536 3537 r = adev->ip_blocks[i].version->funcs->resume(adev); 3538 if (r) { 3539 DRM_ERROR("resume of IP block <%s> failed %d\n", 3540 adev->ip_blocks[i].version->funcs->name, r); 3541 return r; 3542 } 3543 adev->ip_blocks[i].status.hw = true; 3544 } 3545 } 3546 3547 return 0; 3548 } 3549 3550 /** 3551 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3552 * 3553 * @adev: amdgpu_device pointer 3554 * 3555 * First resume function for hardware IPs. The list of all the hardware 3556 * IPs that make up the asic is walked and the resume callbacks are run for 3557 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3558 * functional state after a suspend and updates the software state as 3559 * necessary. This function is also used for restoring the GPU after a GPU 3560 * reset. 3561 * Returns 0 on success, negative error code on failure. 3562 */ 3563 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3564 { 3565 int i, r; 3566 3567 for (i = 0; i < adev->num_ip_blocks; i++) { 3568 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3569 continue; 3570 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3571 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3572 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3573 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3574 continue; 3575 r = adev->ip_blocks[i].version->funcs->resume(adev); 3576 if (r) { 3577 DRM_ERROR("resume of IP block <%s> failed %d\n", 3578 adev->ip_blocks[i].version->funcs->name, r); 3579 return r; 3580 } 3581 adev->ip_blocks[i].status.hw = true; 3582 } 3583 3584 return 0; 3585 } 3586 3587 /** 3588 * amdgpu_device_ip_resume - run resume for hardware IPs 3589 * 3590 * @adev: amdgpu_device pointer 3591 * 3592 * Main resume function for hardware IPs. The hardware IPs 3593 * are split into two resume functions because they are 3594 * also used in recovering from a GPU reset and some additional 3595 * steps need to be take between them. In this case (S3/S4) they are 3596 * run sequentially. 3597 * Returns 0 on success, negative error code on failure. 
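 *
 * Summary of the split (added for clarity): phase1 resumes COMMON, GMC and
 * IH (plus PSP under SR-IOV), firmware is then reloaded, and phase2 resumes
 * the remaining blocks.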
3598 */ 3599 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3600 { 3601 int r; 3602 3603 r = amdgpu_device_ip_resume_phase1(adev); 3604 if (r) 3605 return r; 3606 3607 r = amdgpu_device_fw_loading(adev); 3608 if (r) 3609 return r; 3610 3611 r = amdgpu_device_ip_resume_phase2(adev); 3612 3613 if (adev->mman.buffer_funcs_ring->sched.ready) 3614 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3615 3616 return r; 3617 } 3618 3619 /** 3620 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3621 * 3622 * @adev: amdgpu_device pointer 3623 * 3624 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3625 */ 3626 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3627 { 3628 if (amdgpu_sriov_vf(adev)) { 3629 if (adev->is_atom_fw) { 3630 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3631 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3632 } else { 3633 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3634 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3635 } 3636 3637 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3638 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3639 } 3640 } 3641 3642 /** 3643 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3644 * 3645 * @asic_type: AMD asic type 3646 * 3647 * Check if there is DC (new modesetting infrastructre) support for an asic. 3648 * returns true if DC has support, false if not. 3649 */ 3650 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3651 { 3652 switch (asic_type) { 3653 #ifdef CONFIG_DRM_AMDGPU_SI 3654 case CHIP_HAINAN: 3655 #endif 3656 case CHIP_TOPAZ: 3657 /* chips with no display hardware */ 3658 return false; 3659 #if defined(CONFIG_DRM_AMD_DC) 3660 case CHIP_TAHITI: 3661 case CHIP_PITCAIRN: 3662 case CHIP_VERDE: 3663 case CHIP_OLAND: 3664 /* 3665 * We have systems in the wild with these ASICs that require 3666 * LVDS and VGA support which is not supported with DC. 3667 * 3668 * Fallback to the non-DC driver here by default so as not to 3669 * cause regressions. 3670 */ 3671 #if defined(CONFIG_DRM_AMD_DC_SI) 3672 return amdgpu_dc > 0; 3673 #else 3674 return false; 3675 #endif 3676 case CHIP_BONAIRE: 3677 case CHIP_KAVERI: 3678 case CHIP_KABINI: 3679 case CHIP_MULLINS: 3680 /* 3681 * We have systems in the wild with these ASICs that require 3682 * VGA support which is not supported with DC. 3683 * 3684 * Fallback to the non-DC driver here by default so as not to 3685 * cause regressions. 
3686 */ 3687 return amdgpu_dc > 0; 3688 default: 3689 return amdgpu_dc != 0; 3690 #else 3691 default: 3692 if (amdgpu_dc > 0) 3693 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3694 return false; 3695 #endif 3696 } 3697 } 3698 3699 /** 3700 * amdgpu_device_has_dc_support - check if dc is supported 3701 * 3702 * @adev: amdgpu_device pointer 3703 * 3704 * Returns true for supported, false for not supported 3705 */ 3706 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3707 { 3708 if (adev->enable_virtual_display || 3709 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3710 return false; 3711 3712 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3713 } 3714 3715 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3716 { 3717 struct amdgpu_device *adev = 3718 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3719 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3720 3721 /* It's a bug to not have a hive within this function */ 3722 if (WARN_ON(!hive)) 3723 return; 3724 3725 /* 3726 * Use task barrier to synchronize all xgmi reset works across the 3727 * hive. task_barrier_enter and task_barrier_exit will block 3728 * until all the threads running the xgmi reset works reach 3729 * those points. task_barrier_full will do both blocks. 3730 */ 3731 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3732 3733 task_barrier_enter(&hive->tb); 3734 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3735 3736 if (adev->asic_reset_res) 3737 goto fail; 3738 3739 task_barrier_exit(&hive->tb); 3740 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3741 3742 if (adev->asic_reset_res) 3743 goto fail; 3744 3745 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3746 } else { 3747 3748 task_barrier_full(&hive->tb); 3749 adev->asic_reset_res = amdgpu_asic_reset(adev); 3750 } 3751 3752 fail: 3753 if (adev->asic_reset_res) 3754 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3755 adev->asic_reset_res, adev_to_drm(adev)->unique); 3756 amdgpu_put_xgmi_hive(hive); 3757 } 3758 3759 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3760 { 3761 char *input = amdgpu_lockup_timeout; 3762 char *timeout_setting = NULL; 3763 int index = 0; 3764 long timeout; 3765 int ret = 0; 3766 3767 /* 3768 * By default timeout for non compute jobs is 10000 3769 * and 60000 for compute jobs. 3770 * In SR-IOV or passthrough mode, timeout for compute 3771 * jobs are 60000 by default. 3772 */ 3773 adev->gfx_timeout = msecs_to_jiffies(10000); 3774 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3775 if (amdgpu_sriov_vf(adev)) 3776 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3777 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3778 else 3779 adev->compute_timeout = msecs_to_jiffies(60000); 3780 3781 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3782 while ((timeout_setting = strsep(&input, ",")) && 3783 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3784 ret = kstrtol(timeout_setting, 0, &timeout); 3785 if (ret) 3786 return ret; 3787 3788 if (timeout == 0) { 3789 index++; 3790 continue; 3791 } else if (timeout < 0) { 3792 timeout = MAX_SCHEDULE_TIMEOUT; 3793 dev_warn(adev->dev, "lockup timeout disabled"); 3794 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3795 } else { 3796 timeout = msecs_to_jiffies(timeout); 3797 } 3798 3799 switch (index++) { 3800 case 0: 3801 adev->gfx_timeout = timeout; 3802 break; 3803 case 1: 3804 adev->compute_timeout = timeout; 3805 break; 3806 case 2: 3807 adev->sdma_timeout = timeout; 3808 break; 3809 case 3: 3810 adev->video_timeout = timeout; 3811 break; 3812 default: 3813 break; 3814 } 3815 } 3816 /* 3817 * There is only one value specified and 3818 * it should apply to all non-compute jobs. 3819 */ 3820 if (index == 1) { 3821 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3822 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3823 adev->compute_timeout = adev->gfx_timeout; 3824 } 3825 } 3826 3827 return ret; 3828 } 3829 3830 /** 3831 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3832 * 3833 * @adev: amdgpu_device pointer 3834 * 3835 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3836 */ 3837 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3838 { 3839 struct iommu_domain *domain; 3840 3841 domain = iommu_get_domain_for_dev(adev->dev); 3842 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3843 adev->ram_is_direct_mapped = true; 3844 } 3845 3846 static const struct attribute *amdgpu_dev_attributes[] = { 3847 &dev_attr_pcie_replay_count.attr, 3848 NULL 3849 }; 3850 3851 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3852 { 3853 if (amdgpu_mcbp == 1) 3854 adev->gfx.mcbp = true; 3855 else if (amdgpu_mcbp == 0) 3856 adev->gfx.mcbp = false; 3857 3858 if (amdgpu_sriov_vf(adev)) 3859 adev->gfx.mcbp = true; 3860 3861 if (adev->gfx.mcbp) 3862 DRM_INFO("MCBP is enabled\n"); 3863 } 3864 3865 /** 3866 * amdgpu_device_init - initialize the driver 3867 * 3868 * @adev: amdgpu_device pointer 3869 * @flags: driver flags 3870 * 3871 * Initializes the driver info and hw (all asics). 3872 * Returns 0 for success or an error on failure. 3873 * Called at driver startup. 
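 *
 * Note (added, assumption not taken from this section): this is invoked
 * from the driver load path (amdgpu_driver_load_kms()), with @flags taken
 * from the PCI ID table entry that matched the device.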
3874 */ 3875 int amdgpu_device_init(struct amdgpu_device *adev, 3876 uint32_t flags) 3877 { 3878 struct drm_device *ddev = adev_to_drm(adev); 3879 struct pci_dev *pdev = adev->pdev; 3880 int r, i; 3881 bool px = false; 3882 u32 max_MBps; 3883 int tmp; 3884 3885 adev->shutdown = false; 3886 adev->flags = flags; 3887 3888 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3889 adev->asic_type = amdgpu_force_asic_type; 3890 else 3891 adev->asic_type = flags & AMD_ASIC_MASK; 3892 3893 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3894 if (amdgpu_emu_mode == 1) 3895 adev->usec_timeout *= 10; 3896 adev->gmc.gart_size = 512 * 1024 * 1024; 3897 adev->accel_working = false; 3898 adev->num_rings = 0; 3899 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3900 adev->mman.buffer_funcs = NULL; 3901 adev->mman.buffer_funcs_ring = NULL; 3902 adev->vm_manager.vm_pte_funcs = NULL; 3903 adev->vm_manager.vm_pte_num_scheds = 0; 3904 adev->gmc.gmc_funcs = NULL; 3905 adev->harvest_ip_mask = 0x0; 3906 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3907 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3908 3909 adev->smc_rreg = &amdgpu_invalid_rreg; 3910 adev->smc_wreg = &amdgpu_invalid_wreg; 3911 adev->pcie_rreg = &amdgpu_invalid_rreg; 3912 adev->pcie_wreg = &amdgpu_invalid_wreg; 3913 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3914 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3915 adev->pciep_rreg = &amdgpu_invalid_rreg; 3916 adev->pciep_wreg = &amdgpu_invalid_wreg; 3917 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3918 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3919 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3920 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3921 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3922 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3923 adev->didt_rreg = &amdgpu_invalid_rreg; 3924 adev->didt_wreg = &amdgpu_invalid_wreg; 3925 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3926 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3927 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3928 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3929 3930 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3931 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3932 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3933 3934 /* mutex initialization are all done here so we 3935 * can recall function without having locking issues 3936 */ 3937 mutex_init(&adev->firmware.mutex); 3938 mutex_init(&adev->pm.mutex); 3939 mutex_init(&adev->gfx.gpu_clock_mutex); 3940 mutex_init(&adev->srbm_mutex); 3941 mutex_init(&adev->gfx.pipe_reserve_mutex); 3942 mutex_init(&adev->gfx.gfx_off_mutex); 3943 mutex_init(&adev->gfx.partition_mutex); 3944 mutex_init(&adev->grbm_idx_mutex); 3945 mutex_init(&adev->mn_lock); 3946 mutex_init(&adev->virt.vf_errors.lock); 3947 hash_init(adev->mn_hash); 3948 mutex_init(&adev->psp.mutex); 3949 mutex_init(&adev->notifier_lock); 3950 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3951 mutex_init(&adev->benchmark_mutex); 3952 3953 amdgpu_device_init_apu_flags(adev); 3954 3955 r = amdgpu_device_check_arguments(adev); 3956 if (r) 3957 return r; 3958 3959 spin_lock_init(&adev->mmio_idx_lock); 3960 spin_lock_init(&adev->smc_idx_lock); 3961 spin_lock_init(&adev->pcie_idx_lock); 3962 spin_lock_init(&adev->uvd_ctx_idx_lock); 3963 spin_lock_init(&adev->didt_idx_lock); 3964 spin_lock_init(&adev->gc_cac_idx_lock); 3965 spin_lock_init(&adev->se_cac_idx_lock); 
3966 spin_lock_init(&adev->audio_endpt_idx_lock); 3967 spin_lock_init(&adev->mm_stats.lock); 3968 3969 INIT_LIST_HEAD(&adev->shadow_list); 3970 mutex_init(&adev->shadow_list_lock); 3971 3972 INIT_LIST_HEAD(&adev->reset_list); 3973 3974 INIT_LIST_HEAD(&adev->ras_list); 3975 3976 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 3977 3978 INIT_DELAYED_WORK(&adev->delayed_init_work, 3979 amdgpu_device_delayed_init_work_handler); 3980 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3981 amdgpu_device_delay_enable_gfx_off); 3982 3983 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3984 3985 adev->gfx.gfx_off_req_count = 1; 3986 adev->gfx.gfx_off_residency = 0; 3987 adev->gfx.gfx_off_entrycount = 0; 3988 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3989 3990 atomic_set(&adev->throttling_logging_enabled, 1); 3991 /* 3992 * If throttling continues, logging will be performed every minute 3993 * to avoid log flooding. "-1" is subtracted since the thermal 3994 * throttling interrupt comes every second. Thus, the total logging 3995 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3996 * for throttling interrupt) = 60 seconds. 3997 */ 3998 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3999 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4000 4001 /* Registers mapping */ 4002 /* TODO: block userspace mapping of io register */ 4003 if (adev->asic_type >= CHIP_BONAIRE) { 4004 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4005 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4006 } else { 4007 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4008 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4009 } 4010 4011 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4012 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4013 4014 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4015 if (!adev->rmmio) 4016 return -ENOMEM; 4017 4018 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4019 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4020 4021 /* 4022 * Reset domain needs to be present early, before the XGMI hive is discovered 4023 * (if any) and initialized, to use the reset sem and in_gpu reset flag 4024 * early on during init and before calling RREG32.
4025 */ 4026 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4027 if (!adev->reset_domain) 4028 return -ENOMEM; 4029 4030 /* detect hw virtualization here */ 4031 amdgpu_detect_virtualization(adev); 4032 4033 amdgpu_device_get_pcie_info(adev); 4034 4035 r = amdgpu_device_get_job_timeout_settings(adev); 4036 if (r) { 4037 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4038 return r; 4039 } 4040 4041 /* early init functions */ 4042 r = amdgpu_device_ip_early_init(adev); 4043 if (r) 4044 return r; 4045 4046 amdgpu_device_set_mcbp(adev); 4047 4048 /* Get rid of things like offb */ 4049 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4050 if (r) 4051 return r; 4052 4053 /* Enable TMZ based on IP_VERSION */ 4054 amdgpu_gmc_tmz_set(adev); 4055 4056 amdgpu_gmc_noretry_set(adev); 4057 /* Need to get xgmi info early to decide the reset behavior */ 4058 if (adev->gmc.xgmi.supported) { 4059 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4060 if (r) 4061 return r; 4062 } 4063 4064 /* enable PCIE atomic ops */ 4065 if (amdgpu_sriov_vf(adev)) { 4066 if (adev->virt.fw_reserve.p_pf2vf) 4067 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4068 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4069 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4070 /* APUs with gfx9 onwards don't rely on PCIe atomics; their 4071 * internal path natively supports atomics, so set have_atomics_support to true. 4072 */ 4073 } else if ((adev->flags & AMD_IS_APU) && 4074 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4075 IP_VERSION(9, 0, 0))) { 4076 adev->have_atomics_support = true; 4077 } else { 4078 adev->have_atomics_support = 4079 !pci_enable_atomic_ops_to_root(adev->pdev, 4080 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4081 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4082 } 4083 4084 if (!adev->have_atomics_support) 4085 dev_info(adev->dev, "PCIE atomic ops are not supported\n"); 4086 4087 /* doorbell bar mapping and doorbell index init */ 4088 amdgpu_doorbell_init(adev); 4089 4090 if (amdgpu_emu_mode == 1) { 4091 /* post the asic on emulation mode */ 4092 emu_soc_asic_init(adev); 4093 goto fence_driver_init; 4094 } 4095 4096 amdgpu_reset_init(adev); 4097 4098 /* detect if we are running with an SRIOV vbios */ 4099 if (adev->bios) 4100 amdgpu_device_detect_sriov_bios(adev); 4101 4102 /* check if we need to reset the asic 4103 * E.g., driver was not cleanly unloaded previously, etc.
4104 */ 4105 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4106 if (adev->gmc.xgmi.num_physical_nodes) { 4107 dev_info(adev->dev, "Pending hive reset.\n"); 4108 adev->gmc.xgmi.pending_reset = true; 4109 /* Only need to init necessary block for SMU to handle the reset */ 4110 for (i = 0; i < adev->num_ip_blocks; i++) { 4111 if (!adev->ip_blocks[i].status.valid) 4112 continue; 4113 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4114 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4115 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4116 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4117 DRM_DEBUG("IP %s disabled for hw_init.\n", 4118 adev->ip_blocks[i].version->funcs->name); 4119 adev->ip_blocks[i].status.hw = true; 4120 } 4121 } 4122 } else { 4123 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 4124 case IP_VERSION(13, 0, 0): 4125 case IP_VERSION(13, 0, 7): 4126 case IP_VERSION(13, 0, 10): 4127 r = psp_gpu_reset(adev); 4128 break; 4129 default: 4130 tmp = amdgpu_reset_method; 4131 /* It should do a default reset when loading or reloading the driver, 4132 * regardless of the module parameter reset_method. 4133 */ 4134 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4135 r = amdgpu_asic_reset(adev); 4136 amdgpu_reset_method = tmp; 4137 break; 4138 } 4139 4140 if (r) { 4141 dev_err(adev->dev, "asic reset on init failed\n"); 4142 goto failed; 4143 } 4144 } 4145 } 4146 4147 /* Post card if necessary */ 4148 if (amdgpu_device_need_post(adev)) { 4149 if (!adev->bios) { 4150 dev_err(adev->dev, "no vBIOS found\n"); 4151 r = -EINVAL; 4152 goto failed; 4153 } 4154 DRM_INFO("GPU posting now...\n"); 4155 r = amdgpu_device_asic_init(adev); 4156 if (r) { 4157 dev_err(adev->dev, "gpu post error!\n"); 4158 goto failed; 4159 } 4160 } 4161 4162 if (adev->bios) { 4163 if (adev->is_atom_fw) { 4164 /* Initialize clocks */ 4165 r = amdgpu_atomfirmware_get_clock_info(adev); 4166 if (r) { 4167 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4168 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4169 goto failed; 4170 } 4171 } else { 4172 /* Initialize clocks */ 4173 r = amdgpu_atombios_get_clock_info(adev); 4174 if (r) { 4175 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4176 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4177 goto failed; 4178 } 4179 /* init i2c buses */ 4180 if (!amdgpu_device_has_dc_support(adev)) 4181 amdgpu_atombios_i2c_init(adev); 4182 } 4183 } 4184 4185 fence_driver_init: 4186 /* Fence driver */ 4187 r = amdgpu_fence_driver_sw_init(adev); 4188 if (r) { 4189 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4190 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4191 goto failed; 4192 } 4193 4194 /* init the mode config */ 4195 drm_mode_config_init(adev_to_drm(adev)); 4196 4197 r = amdgpu_device_ip_init(adev); 4198 if (r) { 4199 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4200 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4201 goto release_ras_con; 4202 } 4203 4204 amdgpu_fence_driver_hw_init(adev); 4205 4206 dev_info(adev->dev, 4207 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4208 adev->gfx.config.max_shader_engines, 4209 adev->gfx.config.max_sh_per_se, 4210 adev->gfx.config.max_cu_per_sh, 4211 adev->gfx.cu_info.number); 4212 4213 adev->accel_working = true; 4214 4215 amdgpu_vm_check_compute_bug(adev); 4216 4217 /* Initialize the buffer migration 
limit. */ 4218 if (amdgpu_moverate >= 0) 4219 max_MBps = amdgpu_moverate; 4220 else 4221 max_MBps = 8; /* Allow 8 MB/s. */ 4222 /* Get a log2 for easy divisions. */ 4223 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4224 4225 /* 4226 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4227 * Otherwise the mgpu fan boost feature will be skipped due to the 4228 * gpu instance is counted less. 4229 */ 4230 amdgpu_register_gpu_instance(adev); 4231 4232 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4233 * explicit gating rather than handling it automatically. 4234 */ 4235 if (!adev->gmc.xgmi.pending_reset) { 4236 r = amdgpu_device_ip_late_init(adev); 4237 if (r) { 4238 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4239 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4240 goto release_ras_con; 4241 } 4242 /* must succeed. */ 4243 amdgpu_ras_resume(adev); 4244 queue_delayed_work(system_wq, &adev->delayed_init_work, 4245 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4246 } 4247 4248 if (amdgpu_sriov_vf(adev)) { 4249 amdgpu_virt_release_full_gpu(adev, true); 4250 flush_delayed_work(&adev->delayed_init_work); 4251 } 4252 4253 /* 4254 * Place those sysfs registering after `late_init`. As some of those 4255 * operations performed in `late_init` might affect the sysfs 4256 * interfaces creating. 4257 */ 4258 r = amdgpu_atombios_sysfs_init(adev); 4259 if (r) 4260 drm_err(&adev->ddev, 4261 "registering atombios sysfs failed (%d).\n", r); 4262 4263 r = amdgpu_pm_sysfs_init(adev); 4264 if (r) 4265 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4266 4267 r = amdgpu_ucode_sysfs_init(adev); 4268 if (r) { 4269 adev->ucode_sysfs_en = false; 4270 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4271 } else 4272 adev->ucode_sysfs_en = true; 4273 4274 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4275 if (r) 4276 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4277 4278 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4279 if (r) 4280 dev_err(adev->dev, 4281 "Could not create amdgpu board attributes\n"); 4282 4283 amdgpu_fru_sysfs_init(adev); 4284 amdgpu_reg_state_sysfs_init(adev); 4285 4286 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4287 r = amdgpu_pmu_init(adev); 4288 if (r) 4289 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4290 4291 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4292 if (amdgpu_device_cache_pci_state(adev->pdev)) 4293 pci_restore_state(pdev); 4294 4295 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4296 /* this will fail for cards that aren't VGA class devices, just 4297 * ignore it 4298 */ 4299 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4300 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4301 4302 px = amdgpu_device_supports_px(ddev); 4303 4304 if (px || (!dev_is_removable(&adev->pdev->dev) && 4305 apple_gmux_detect(NULL, NULL))) 4306 vga_switcheroo_register_client(adev->pdev, 4307 &amdgpu_switcheroo_ops, px); 4308 4309 if (px) 4310 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4311 4312 if (adev->gmc.xgmi.pending_reset) 4313 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4314 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4315 4316 amdgpu_device_check_iommu_direct_map(adev); 4317 4318 return 0; 4319 4320 release_ras_con: 4321 if (amdgpu_sriov_vf(adev)) 4322 amdgpu_virt_release_full_gpu(adev, true); 4323 4324 /* failed in exclusive mode due to timeout */ 
*/
4325 if (amdgpu_sriov_vf(adev) && 4326 !amdgpu_sriov_runtime(adev) && 4327 amdgpu_virt_mmio_blocked(adev) && 4328 !amdgpu_virt_wait_reset(adev)) { 4329 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4330 /* Don't send request since VF is inactive. */ 4331 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4332 adev->virt.ops = NULL; 4333 r = -EAGAIN; 4334 } 4335 amdgpu_release_ras_context(adev); 4336 4337 failed: 4338 amdgpu_vf_error_trans_all(adev); 4339 4340 return r; 4341 } 4342 4343 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4344 { 4345 4346 /* Clear all CPU mappings pointing to this device */ 4347 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4348 4349 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4350 amdgpu_doorbell_fini(adev); 4351 4352 iounmap(adev->rmmio); 4353 adev->rmmio = NULL; 4354 if (adev->mman.aper_base_kaddr) 4355 iounmap(adev->mman.aper_base_kaddr); 4356 adev->mman.aper_base_kaddr = NULL; 4357 4358 /* Memory manager related */ 4359 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4360 arch_phys_wc_del(adev->gmc.vram_mtrr); 4361 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4362 } 4363 } 4364 4365 /** 4366 * amdgpu_device_fini_hw - tear down the driver 4367 * 4368 * @adev: amdgpu_device pointer 4369 * 4370 * Tear down the driver info (all asics). 4371 * Called at driver shutdown. 4372 */ 4373 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4374 { 4375 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4376 flush_delayed_work(&adev->delayed_init_work); 4377 adev->shutdown = true; 4378 4379 /* make sure IB test finished before entering exclusive mode 4380 * to avoid preemption on IB test 4381 */ 4382 if (amdgpu_sriov_vf(adev)) { 4383 amdgpu_virt_request_full_gpu(adev, false); 4384 amdgpu_virt_fini_data_exchange(adev); 4385 } 4386 4387 /* disable all interrupts */ 4388 amdgpu_irq_disable_all(adev); 4389 if (adev->mode_info.mode_config_initialized) { 4390 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4391 drm_helper_force_disable_all(adev_to_drm(adev)); 4392 else 4393 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4394 } 4395 amdgpu_fence_driver_hw_fini(adev); 4396 4397 if (adev->mman.initialized) 4398 drain_workqueue(adev->mman.bdev.wq); 4399 4400 if (adev->pm.sysfs_initialized) 4401 amdgpu_pm_sysfs_fini(adev); 4402 if (adev->ucode_sysfs_en) 4403 amdgpu_ucode_sysfs_fini(adev); 4404 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4405 amdgpu_fru_sysfs_fini(adev); 4406 4407 amdgpu_reg_state_sysfs_fini(adev); 4408 4409 /* disable ras feature must before hw fini */ 4410 amdgpu_ras_pre_fini(adev); 4411 4412 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4413 4414 amdgpu_device_ip_fini_early(adev); 4415 4416 amdgpu_irq_fini_hw(adev); 4417 4418 if (adev->mman.initialized) 4419 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4420 4421 amdgpu_gart_dummy_page_fini(adev); 4422 4423 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4424 amdgpu_device_unmap_mmio(adev); 4425 4426 } 4427 4428 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4429 { 4430 int idx; 4431 bool px; 4432 4433 amdgpu_fence_driver_sw_fini(adev); 4434 amdgpu_device_ip_fini(adev); 4435 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4436 adev->accel_working = false; 4437 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4438 4439 amdgpu_reset_fini(adev); 4440 4441 /* free i2c buses */ 4442 if (!amdgpu_device_has_dc_support(adev)) 4443 amdgpu_i2c_fini(adev); 4444 4445 if 
(amdgpu_emu_mode != 1) 4446 amdgpu_atombios_fini(adev); 4447 4448 kfree(adev->bios); 4449 adev->bios = NULL; 4450 4451 kfree(adev->fru_info); 4452 adev->fru_info = NULL; 4453 4454 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4455 4456 if (px || (!dev_is_removable(&adev->pdev->dev) && 4457 apple_gmux_detect(NULL, NULL))) 4458 vga_switcheroo_unregister_client(adev->pdev); 4459 4460 if (px) 4461 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4462 4463 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4464 vga_client_unregister(adev->pdev); 4465 4466 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4467 4468 iounmap(adev->rmmio); 4469 adev->rmmio = NULL; 4470 amdgpu_doorbell_fini(adev); 4471 drm_dev_exit(idx); 4472 } 4473 4474 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4475 amdgpu_pmu_fini(adev); 4476 if (adev->mman.discovery_bin) 4477 amdgpu_discovery_fini(adev); 4478 4479 amdgpu_reset_put_reset_domain(adev->reset_domain); 4480 adev->reset_domain = NULL; 4481 4482 kfree(adev->pci_state); 4483 4484 } 4485 4486 /** 4487 * amdgpu_device_evict_resources - evict device resources 4488 * @adev: amdgpu device object 4489 * 4490 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4491 * of the vram memory type. Mainly used for evicting device resources 4492 * at suspend time. 4493 * 4494 */ 4495 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4496 { 4497 int ret; 4498 4499 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4500 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4501 return 0; 4502 4503 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4504 if (ret) 4505 DRM_WARN("evicting device resources failed\n"); 4506 return ret; 4507 } 4508 4509 /* 4510 * Suspend & resume. 4511 */ 4512 /** 4513 * amdgpu_device_prepare - prepare for device suspend 4514 * 4515 * @dev: drm dev pointer 4516 * 4517 * Prepare to put the hw in the suspend state (all asics). 4518 * Returns 0 for success or an error on failure. 4519 * Called at driver suspend. 4520 */ 4521 int amdgpu_device_prepare(struct drm_device *dev) 4522 { 4523 struct amdgpu_device *adev = drm_to_adev(dev); 4524 int i, r; 4525 4526 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4527 return 0; 4528 4529 /* Evict the majority of BOs before starting suspend sequence */ 4530 r = amdgpu_device_evict_resources(adev); 4531 if (r) 4532 return r; 4533 4534 for (i = 0; i < adev->num_ip_blocks; i++) { 4535 if (!adev->ip_blocks[i].status.valid) 4536 continue; 4537 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4538 continue; 4539 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4540 if (r) 4541 return r; 4542 } 4543 4544 return 0; 4545 } 4546 4547 /** 4548 * amdgpu_device_suspend - initiate device suspend 4549 * 4550 * @dev: drm dev pointer 4551 * @fbcon : notify the fbdev of suspend 4552 * 4553 * Puts the hw in the suspend state (all asics). 4554 * Returns 0 for success or an error on failure. 4555 * Called at driver suspend. 
4556 */ 4557 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4558 { 4559 struct amdgpu_device *adev = drm_to_adev(dev); 4560 int r = 0; 4561 4562 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4563 return 0; 4564 4565 adev->in_suspend = true; 4566 4567 if (amdgpu_sriov_vf(adev)) { 4568 amdgpu_virt_fini_data_exchange(adev); 4569 r = amdgpu_virt_request_full_gpu(adev, false); 4570 if (r) 4571 return r; 4572 } 4573 4574 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4575 DRM_WARN("smart shift update failed\n"); 4576 4577 if (fbcon) 4578 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4579 4580 cancel_delayed_work_sync(&adev->delayed_init_work); 4581 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4582 4583 amdgpu_ras_suspend(adev); 4584 4585 amdgpu_device_ip_suspend_phase1(adev); 4586 4587 if (!adev->in_s0ix) 4588 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4589 4590 r = amdgpu_device_evict_resources(adev); 4591 if (r) 4592 return r; 4593 4594 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4595 4596 amdgpu_fence_driver_hw_fini(adev); 4597 4598 amdgpu_device_ip_suspend_phase2(adev); 4599 4600 if (amdgpu_sriov_vf(adev)) 4601 amdgpu_virt_release_full_gpu(adev, false); 4602 4603 r = amdgpu_dpm_notify_rlc_state(adev, false); 4604 if (r) 4605 return r; 4606 4607 return 0; 4608 } 4609 4610 /** 4611 * amdgpu_device_resume - initiate device resume 4612 * 4613 * @dev: drm dev pointer 4614 * @fbcon : notify the fbdev of resume 4615 * 4616 * Bring the hw back to operating state (all asics). 4617 * Returns 0 for success or an error on failure. 4618 * Called at driver resume. 4619 */ 4620 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4621 { 4622 struct amdgpu_device *adev = drm_to_adev(dev); 4623 int r = 0; 4624 4625 if (amdgpu_sriov_vf(adev)) { 4626 r = amdgpu_virt_request_full_gpu(adev, true); 4627 if (r) 4628 return r; 4629 } 4630 4631 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4632 return 0; 4633 4634 if (adev->in_s0ix) 4635 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4636 4637 /* post card */ 4638 if (amdgpu_device_need_post(adev)) { 4639 r = amdgpu_device_asic_init(adev); 4640 if (r) 4641 dev_err(adev->dev, "amdgpu asic init failed\n"); 4642 } 4643 4644 r = amdgpu_device_ip_resume(adev); 4645 4646 if (r) { 4647 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4648 goto exit; 4649 } 4650 amdgpu_fence_driver_hw_init(adev); 4651 4652 if (!adev->in_s0ix) { 4653 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4654 if (r) 4655 goto exit; 4656 } 4657 4658 r = amdgpu_device_ip_late_init(adev); 4659 if (r) 4660 goto exit; 4661 4662 queue_delayed_work(system_wq, &adev->delayed_init_work, 4663 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4664 exit: 4665 if (amdgpu_sriov_vf(adev)) { 4666 amdgpu_virt_init_data_exchange(adev); 4667 amdgpu_virt_release_full_gpu(adev, true); 4668 } 4669 4670 if (r) 4671 return r; 4672 4673 /* Make sure IB tests flushed */ 4674 flush_delayed_work(&adev->delayed_init_work); 4675 4676 if (fbcon) 4677 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4678 4679 amdgpu_ras_resume(adev); 4680 4681 if (adev->mode_info.num_crtc) { 4682 /* 4683 * Most of the connector probing functions try to acquire runtime pm 4684 * refs to ensure that the GPU is powered on when connector polling is 4685 * performed. Since we're calling this from a runtime PM callback, 4686 * trying to acquire rpm refs will cause us to deadlock. 
4687 * 4688 * Since we're guaranteed to be holding the rpm lock, it's safe to 4689 * temporarily disable the rpm helpers so this doesn't deadlock us. 4690 */ 4691 #ifdef CONFIG_PM 4692 dev->dev->power.disable_depth++; 4693 #endif 4694 if (!adev->dc_enabled) 4695 drm_helper_hpd_irq_event(dev); 4696 else 4697 drm_kms_helper_hotplug_event(dev); 4698 #ifdef CONFIG_PM 4699 dev->dev->power.disable_depth--; 4700 #endif 4701 } 4702 adev->in_suspend = false; 4703 4704 if (adev->enable_mes) 4705 amdgpu_mes_self_test(adev); 4706 4707 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4708 DRM_WARN("smart shift update failed\n"); 4709 4710 return 0; 4711 } 4712 4713 /** 4714 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4715 * 4716 * @adev: amdgpu_device pointer 4717 * 4718 * The list of all the hardware IPs that make up the asic is walked and 4719 * the check_soft_reset callbacks are run. check_soft_reset determines 4720 * if the asic is still hung or not. 4721 * Returns true if any of the IPs are still in a hung state, false if not. 4722 */ 4723 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4724 { 4725 int i; 4726 bool asic_hang = false; 4727 4728 if (amdgpu_sriov_vf(adev)) 4729 return true; 4730 4731 if (amdgpu_asic_need_full_reset(adev)) 4732 return true; 4733 4734 for (i = 0; i < adev->num_ip_blocks; i++) { 4735 if (!adev->ip_blocks[i].status.valid) 4736 continue; 4737 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4738 adev->ip_blocks[i].status.hang = 4739 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4740 if (adev->ip_blocks[i].status.hang) { 4741 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4742 asic_hang = true; 4743 } 4744 } 4745 return asic_hang; 4746 } 4747 4748 /** 4749 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4750 * 4751 * @adev: amdgpu_device pointer 4752 * 4753 * The list of all the hardware IPs that make up the asic is walked and the 4754 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4755 * handles any IP specific hardware or software state changes that are 4756 * necessary for a soft reset to succeed. 4757 * Returns 0 on success, negative error code on failure. 4758 */ 4759 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4760 { 4761 int i, r = 0; 4762 4763 for (i = 0; i < adev->num_ip_blocks; i++) { 4764 if (!adev->ip_blocks[i].status.valid) 4765 continue; 4766 if (adev->ip_blocks[i].status.hang && 4767 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4768 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4769 if (r) 4770 return r; 4771 } 4772 } 4773 4774 return 0; 4775 } 4776 4777 /** 4778 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4779 * 4780 * @adev: amdgpu_device pointer 4781 * 4782 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4783 * reset is necessary to recover. 4784 * Returns true if a full asic reset is required, false if not. 
4785 */ 4786 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4787 { 4788 int i; 4789 4790 if (amdgpu_asic_need_full_reset(adev)) 4791 return true; 4792 4793 for (i = 0; i < adev->num_ip_blocks; i++) { 4794 if (!adev->ip_blocks[i].status.valid) 4795 continue; 4796 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4797 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4798 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4799 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4800 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4801 if (adev->ip_blocks[i].status.hang) { 4802 dev_info(adev->dev, "Some block need full reset!\n"); 4803 return true; 4804 } 4805 } 4806 } 4807 return false; 4808 } 4809 4810 /** 4811 * amdgpu_device_ip_soft_reset - do a soft reset 4812 * 4813 * @adev: amdgpu_device pointer 4814 * 4815 * The list of all the hardware IPs that make up the asic is walked and the 4816 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4817 * IP specific hardware or software state changes that are necessary to soft 4818 * reset the IP. 4819 * Returns 0 on success, negative error code on failure. 4820 */ 4821 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4822 { 4823 int i, r = 0; 4824 4825 for (i = 0; i < adev->num_ip_blocks; i++) { 4826 if (!adev->ip_blocks[i].status.valid) 4827 continue; 4828 if (adev->ip_blocks[i].status.hang && 4829 adev->ip_blocks[i].version->funcs->soft_reset) { 4830 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4831 if (r) 4832 return r; 4833 } 4834 } 4835 4836 return 0; 4837 } 4838 4839 /** 4840 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4841 * 4842 * @adev: amdgpu_device pointer 4843 * 4844 * The list of all the hardware IPs that make up the asic is walked and the 4845 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4846 * handles any IP specific hardware or software state changes that are 4847 * necessary after the IP has been soft reset. 4848 * Returns 0 on success, negative error code on failure. 4849 */ 4850 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4851 { 4852 int i, r = 0; 4853 4854 for (i = 0; i < adev->num_ip_blocks; i++) { 4855 if (!adev->ip_blocks[i].status.valid) 4856 continue; 4857 if (adev->ip_blocks[i].status.hang && 4858 adev->ip_blocks[i].version->funcs->post_soft_reset) 4859 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4860 if (r) 4861 return r; 4862 } 4863 4864 return 0; 4865 } 4866 4867 /** 4868 * amdgpu_device_recover_vram - Recover some VRAM contents 4869 * 4870 * @adev: amdgpu_device pointer 4871 * 4872 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4873 * restore things like GPUVM page tables after a GPU reset where 4874 * the contents of VRAM might be lost. 4875 * 4876 * Returns: 4877 * 0 on success, negative error code on failure. 
4878 */ 4879 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4880 { 4881 struct dma_fence *fence = NULL, *next = NULL; 4882 struct amdgpu_bo *shadow; 4883 struct amdgpu_bo_vm *vmbo; 4884 long r = 1, tmo; 4885 4886 if (amdgpu_sriov_runtime(adev)) 4887 tmo = msecs_to_jiffies(8000); 4888 else 4889 tmo = msecs_to_jiffies(100); 4890 4891 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4892 mutex_lock(&adev->shadow_list_lock); 4893 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4894 /* If vm is compute context or adev is APU, shadow will be NULL */ 4895 if (!vmbo->shadow) 4896 continue; 4897 shadow = vmbo->shadow; 4898 4899 /* No need to recover an evicted BO */ 4900 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4901 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4902 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4903 continue; 4904 4905 r = amdgpu_bo_restore_shadow(shadow, &next); 4906 if (r) 4907 break; 4908 4909 if (fence) { 4910 tmo = dma_fence_wait_timeout(fence, false, tmo); 4911 dma_fence_put(fence); 4912 fence = next; 4913 if (tmo == 0) { 4914 r = -ETIMEDOUT; 4915 break; 4916 } else if (tmo < 0) { 4917 r = tmo; 4918 break; 4919 } 4920 } else { 4921 fence = next; 4922 } 4923 } 4924 mutex_unlock(&adev->shadow_list_lock); 4925 4926 if (fence) 4927 tmo = dma_fence_wait_timeout(fence, false, tmo); 4928 dma_fence_put(fence); 4929 4930 if (r < 0 || tmo <= 0) { 4931 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4932 return -EIO; 4933 } 4934 4935 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4936 return 0; 4937 } 4938 4939 4940 /** 4941 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4942 * 4943 * @adev: amdgpu_device pointer 4944 * @from_hypervisor: request from hypervisor 4945 * 4946 * do VF FLR and reinitialize Asic 4947 * return 0 means succeeded otherwise failed 4948 */ 4949 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4950 bool from_hypervisor) 4951 { 4952 int r; 4953 struct amdgpu_hive_info *hive = NULL; 4954 int retry_limit = 0; 4955 4956 retry: 4957 amdgpu_amdkfd_pre_reset(adev); 4958 4959 if (from_hypervisor) 4960 r = amdgpu_virt_request_full_gpu(adev, true); 4961 else 4962 r = amdgpu_virt_reset_gpu(adev); 4963 if (r) 4964 return r; 4965 amdgpu_irq_gpu_reset_resume_helper(adev); 4966 4967 /* some sw clean up VF needs to do before recover */ 4968 amdgpu_virt_post_reset(adev); 4969 4970 /* Resume IP prior to SMC */ 4971 r = amdgpu_device_ip_reinit_early_sriov(adev); 4972 if (r) 4973 goto error; 4974 4975 amdgpu_virt_init_data_exchange(adev); 4976 4977 r = amdgpu_device_fw_loading(adev); 4978 if (r) 4979 return r; 4980 4981 /* now we are okay to resume SMC/CP/SDMA */ 4982 r = amdgpu_device_ip_reinit_late_sriov(adev); 4983 if (r) 4984 goto error; 4985 4986 hive = amdgpu_get_xgmi_hive(adev); 4987 /* Update PSP FW topology after reset */ 4988 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4989 r = amdgpu_xgmi_update_topology(hive, adev); 4990 4991 if (hive) 4992 amdgpu_put_xgmi_hive(hive); 4993 4994 if (!r) { 4995 r = amdgpu_ib_ring_tests(adev); 4996 4997 amdgpu_amdkfd_post_reset(adev); 4998 } 4999 5000 error: 5001 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5002 amdgpu_inc_vram_lost(adev); 5003 r = amdgpu_device_recover_vram(adev); 5004 } 5005 amdgpu_virt_release_full_gpu(adev, true); 5006 5007 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 5008 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 5009 retry_limit++; 5010 goto 
retry; 5011 } else 5012 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 5013 } 5014 5015 return r; 5016 } 5017 5018 /** 5019 * amdgpu_device_has_job_running - check if there is any job in mirror list 5020 * 5021 * @adev: amdgpu_device pointer 5022 * 5023 * check if there is any job in mirror list 5024 */ 5025 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5026 { 5027 int i; 5028 struct drm_sched_job *job; 5029 5030 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5031 struct amdgpu_ring *ring = adev->rings[i]; 5032 5033 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5034 continue; 5035 5036 spin_lock(&ring->sched.job_list_lock); 5037 job = list_first_entry_or_null(&ring->sched.pending_list, 5038 struct drm_sched_job, list); 5039 spin_unlock(&ring->sched.job_list_lock); 5040 if (job) 5041 return true; 5042 } 5043 return false; 5044 } 5045 5046 /** 5047 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5048 * 5049 * @adev: amdgpu_device pointer 5050 * 5051 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5052 * a hung GPU. 5053 */ 5054 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5055 { 5056 5057 if (amdgpu_gpu_recovery == 0) 5058 goto disabled; 5059 5060 /* Skip soft reset check in fatal error mode */ 5061 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5062 return true; 5063 5064 if (amdgpu_sriov_vf(adev)) 5065 return true; 5066 5067 if (amdgpu_gpu_recovery == -1) { 5068 switch (adev->asic_type) { 5069 #ifdef CONFIG_DRM_AMDGPU_SI 5070 case CHIP_VERDE: 5071 case CHIP_TAHITI: 5072 case CHIP_PITCAIRN: 5073 case CHIP_OLAND: 5074 case CHIP_HAINAN: 5075 #endif 5076 #ifdef CONFIG_DRM_AMDGPU_CIK 5077 case CHIP_KAVERI: 5078 case CHIP_KABINI: 5079 case CHIP_MULLINS: 5080 #endif 5081 case CHIP_CARRIZO: 5082 case CHIP_STONEY: 5083 case CHIP_CYAN_SKILLFISH: 5084 goto disabled; 5085 default: 5086 break; 5087 } 5088 } 5089 5090 return true; 5091 5092 disabled: 5093 dev_info(adev->dev, "GPU recovery disabled.\n"); 5094 return false; 5095 } 5096 5097 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5098 { 5099 u32 i; 5100 int ret = 0; 5101 5102 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5103 5104 dev_info(adev->dev, "GPU mode1 reset\n"); 5105 5106 /* disable BM */ 5107 pci_clear_master(adev->pdev); 5108 5109 amdgpu_device_cache_pci_state(adev->pdev); 5110 5111 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5112 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5113 ret = amdgpu_dpm_mode1_reset(adev); 5114 } else { 5115 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5116 ret = psp_gpu_reset(adev); 5117 } 5118 5119 if (ret) 5120 goto mode1_reset_failed; 5121 5122 amdgpu_device_load_pci_state(adev->pdev); 5123 ret = amdgpu_psp_wait_for_bootloader(adev); 5124 if (ret) 5125 goto mode1_reset_failed; 5126 5127 /* wait for asic to come out of reset */ 5128 for (i = 0; i < adev->usec_timeout; i++) { 5129 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5130 5131 if (memsize != 0xffffffff) 5132 break; 5133 udelay(1); 5134 } 5135 5136 if (i >= adev->usec_timeout) { 5137 ret = -ETIMEDOUT; 5138 goto mode1_reset_failed; 5139 } 5140 5141 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5142 5143 return 0; 5144 5145 mode1_reset_failed: 5146 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5147 return ret; 5148 } 5149 5150 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5151 struct amdgpu_reset_context *reset_context) 5152 { 5153 int i, r = 0; 5154 struct amdgpu_job *job = NULL; 5155 bool 
need_full_reset = 5156 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5157 5158 if (reset_context->reset_req_dev == adev) 5159 job = reset_context->job; 5160 5161 if (amdgpu_sriov_vf(adev)) { 5162 /* stop the data exchange thread */ 5163 amdgpu_virt_fini_data_exchange(adev); 5164 } 5165 5166 amdgpu_fence_driver_isr_toggle(adev, true); 5167 5168 /* block all schedulers and reset given job's ring */ 5169 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5170 struct amdgpu_ring *ring = adev->rings[i]; 5171 5172 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5173 continue; 5174 5175 /* Clear job fence from fence drv to avoid force_completion 5176 * leave NULL and vm flush fence in fence drv 5177 */ 5178 amdgpu_fence_driver_clear_job_fences(ring); 5179 5180 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5181 amdgpu_fence_driver_force_completion(ring); 5182 } 5183 5184 amdgpu_fence_driver_isr_toggle(adev, false); 5185 5186 if (job && job->vm) 5187 drm_sched_increase_karma(&job->base); 5188 5189 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5190 /* If reset handler not implemented, continue; otherwise return */ 5191 if (r == -EOPNOTSUPP) 5192 r = 0; 5193 else 5194 return r; 5195 5196 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5197 if (!amdgpu_sriov_vf(adev)) { 5198 5199 if (!need_full_reset) 5200 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5201 5202 if (!need_full_reset && amdgpu_gpu_recovery && 5203 amdgpu_device_ip_check_soft_reset(adev)) { 5204 amdgpu_device_ip_pre_soft_reset(adev); 5205 r = amdgpu_device_ip_soft_reset(adev); 5206 amdgpu_device_ip_post_soft_reset(adev); 5207 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5208 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5209 need_full_reset = true; 5210 } 5211 } 5212 5213 if (need_full_reset) 5214 r = amdgpu_device_ip_suspend(adev); 5215 if (need_full_reset) 5216 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5217 else 5218 clear_bit(AMDGPU_NEED_FULL_RESET, 5219 &reset_context->flags); 5220 } 5221 5222 return r; 5223 } 5224 5225 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5226 { 5227 int i; 5228 5229 lockdep_assert_held(&adev->reset_domain->sem); 5230 5231 for (i = 0; i < adev->reset_info.num_regs; i++) { 5232 adev->reset_info.reset_dump_reg_value[i] = 5233 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5234 5235 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5236 adev->reset_info.reset_dump_reg_value[i]); 5237 } 5238 5239 return 0; 5240 } 5241 5242 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5243 struct amdgpu_reset_context *reset_context) 5244 { 5245 struct amdgpu_device *tmp_adev = NULL; 5246 bool need_full_reset, skip_hw_reset, vram_lost = false; 5247 int r = 0; 5248 bool gpu_reset_for_dev_remove = 0; 5249 5250 /* Try reset handler method first */ 5251 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5252 reset_list); 5253 amdgpu_reset_reg_dumps(tmp_adev); 5254 5255 reset_context->reset_device_list = device_list_handle; 5256 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5257 /* If reset handler not implemented, continue; otherwise return */ 5258 if (r == -EOPNOTSUPP) 5259 r = 0; 5260 else 5261 return r; 5262 5263 /* Reset handler not implemented, use the default method */ 5264 need_full_reset = 5265 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5266 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, 
&reset_context->flags); 5267 5268 gpu_reset_for_dev_remove = 5269 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5270 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5271 5272 /* 5273 * ASIC reset has to be done on all XGMI hive nodes ASAP 5274 * to allow proper links negotiation in FW (within 1 sec) 5275 */ 5276 if (!skip_hw_reset && need_full_reset) { 5277 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5278 /* For XGMI run all resets in parallel to speed up the process */ 5279 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5280 tmp_adev->gmc.xgmi.pending_reset = false; 5281 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5282 r = -EALREADY; 5283 } else 5284 r = amdgpu_asic_reset(tmp_adev); 5285 5286 if (r) { 5287 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5288 r, adev_to_drm(tmp_adev)->unique); 5289 goto out; 5290 } 5291 } 5292 5293 /* For XGMI wait for all resets to complete before proceed */ 5294 if (!r) { 5295 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5296 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5297 flush_work(&tmp_adev->xgmi_reset_work); 5298 r = tmp_adev->asic_reset_res; 5299 if (r) 5300 break; 5301 } 5302 } 5303 } 5304 } 5305 5306 if (!r && amdgpu_ras_intr_triggered()) { 5307 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5308 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5309 } 5310 5311 amdgpu_ras_intr_cleared(); 5312 } 5313 5314 /* Since the mode1 reset affects base ip blocks, the 5315 * phase1 ip blocks need to be resumed. Otherwise there 5316 * will be a BIOS signature error and the psp bootloader 5317 * can't load kdb on the next amdgpu install. 5318 */ 5319 if (gpu_reset_for_dev_remove) { 5320 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5321 amdgpu_device_ip_resume_phase1(tmp_adev); 5322 5323 goto end; 5324 } 5325 5326 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5327 if (need_full_reset) { 5328 /* post card */ 5329 r = amdgpu_device_asic_init(tmp_adev); 5330 if (r) { 5331 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5332 } else { 5333 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5334 5335 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5336 if (r) 5337 goto out; 5338 5339 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5340 5341 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5342 5343 if (vram_lost) { 5344 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5345 amdgpu_inc_vram_lost(tmp_adev); 5346 } 5347 5348 r = amdgpu_device_fw_loading(tmp_adev); 5349 if (r) 5350 return r; 5351 5352 r = amdgpu_xcp_restore_partition_mode( 5353 tmp_adev->xcp_mgr); 5354 if (r) 5355 goto out; 5356 5357 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5358 if (r) 5359 goto out; 5360 5361 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5362 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5363 5364 if (vram_lost) 5365 amdgpu_device_fill_reset_magic(tmp_adev); 5366 5367 /* 5368 * Add this ASIC as tracked as reset was already 5369 * complete successfully. 
5370 */ 5371 amdgpu_register_gpu_instance(tmp_adev); 5372 5373 if (!reset_context->hive && 5374 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5375 amdgpu_xgmi_add_device(tmp_adev); 5376 5377 r = amdgpu_device_ip_late_init(tmp_adev); 5378 if (r) 5379 goto out; 5380 5381 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5382 5383 /* 5384 * The GPU enters bad state once faulty pages 5385 * by ECC has reached the threshold, and ras 5386 * recovery is scheduled next. So add one check 5387 * here to break recovery if it indeed exceeds 5388 * bad page threshold, and remind user to 5389 * retire this GPU or setting one bigger 5390 * bad_page_threshold value to fix this once 5391 * probing driver again. 5392 */ 5393 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5394 /* must succeed. */ 5395 amdgpu_ras_resume(tmp_adev); 5396 } else { 5397 r = -EINVAL; 5398 goto out; 5399 } 5400 5401 /* Update PSP FW topology after reset */ 5402 if (reset_context->hive && 5403 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5404 r = amdgpu_xgmi_update_topology( 5405 reset_context->hive, tmp_adev); 5406 } 5407 } 5408 5409 out: 5410 if (!r) { 5411 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5412 r = amdgpu_ib_ring_tests(tmp_adev); 5413 if (r) { 5414 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5415 need_full_reset = true; 5416 r = -EAGAIN; 5417 goto end; 5418 } 5419 } 5420 5421 if (!r) 5422 r = amdgpu_device_recover_vram(tmp_adev); 5423 else 5424 tmp_adev->asic_reset_res = r; 5425 } 5426 5427 end: 5428 if (need_full_reset) 5429 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5430 else 5431 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5432 return r; 5433 } 5434 5435 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5436 { 5437 5438 switch (amdgpu_asic_reset_method(adev)) { 5439 case AMD_RESET_METHOD_MODE1: 5440 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5441 break; 5442 case AMD_RESET_METHOD_MODE2: 5443 adev->mp1_state = PP_MP1_STATE_RESET; 5444 break; 5445 default: 5446 adev->mp1_state = PP_MP1_STATE_NONE; 5447 break; 5448 } 5449 } 5450 5451 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5452 { 5453 amdgpu_vf_error_trans_all(adev); 5454 adev->mp1_state = PP_MP1_STATE_NONE; 5455 } 5456 5457 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5458 { 5459 struct pci_dev *p = NULL; 5460 5461 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5462 adev->pdev->bus->number, 1); 5463 if (p) { 5464 pm_runtime_enable(&(p->dev)); 5465 pm_runtime_resume(&(p->dev)); 5466 } 5467 5468 pci_dev_put(p); 5469 } 5470 5471 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5472 { 5473 enum amd_reset_method reset_method; 5474 struct pci_dev *p = NULL; 5475 u64 expires; 5476 5477 /* 5478 * For now, only BACO and mode1 reset are confirmed 5479 * to suffer the audio issue without proper suspended. 5480 */ 5481 reset_method = amdgpu_asic_reset_method(adev); 5482 if ((reset_method != AMD_RESET_METHOD_BACO) && 5483 (reset_method != AMD_RESET_METHOD_MODE1)) 5484 return -EINVAL; 5485 5486 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5487 adev->pdev->bus->number, 1); 5488 if (!p) 5489 return -ENODEV; 5490 5491 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5492 if (!expires) 5493 /* 5494 * If we cannot get the audio device autosuspend delay, 5495 * a fixed 4S interval will be used. Considering 3S is 5496 * the audio controller default autosuspend delay setting. 
5497 * 4S used here is guaranteed to cover that. 5498 */ 5499 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5500 5501 while (!pm_runtime_status_suspended(&(p->dev))) { 5502 if (!pm_runtime_suspend(&(p->dev))) 5503 break; 5504 5505 if (expires < ktime_get_mono_fast_ns()) { 5506 dev_warn(adev->dev, "failed to suspend display audio\n"); 5507 pci_dev_put(p); 5508 /* TODO: abort the succeeding gpu reset? */ 5509 return -ETIMEDOUT; 5510 } 5511 } 5512 5513 pm_runtime_disable(&(p->dev)); 5514 5515 pci_dev_put(p); 5516 return 0; 5517 } 5518 5519 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5520 { 5521 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5522 5523 #if defined(CONFIG_DEBUG_FS) 5524 if (!amdgpu_sriov_vf(adev)) 5525 cancel_work(&adev->reset_work); 5526 #endif 5527 5528 if (adev->kfd.dev) 5529 cancel_work(&adev->kfd.reset_work); 5530 5531 if (amdgpu_sriov_vf(adev)) 5532 cancel_work(&adev->virt.flr_work); 5533 5534 if (con && adev->ras_enabled) 5535 cancel_work(&con->recovery_work); 5536 5537 } 5538 5539 /** 5540 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5541 * 5542 * @adev: amdgpu_device pointer 5543 * @job: which job trigger hang 5544 * @reset_context: amdgpu reset context pointer 5545 * 5546 * Attempt to reset the GPU if it has hung (all asics). 5547 * Attempt to do soft-reset or full-reset and reinitialize Asic 5548 * Returns 0 for success or an error on failure. 5549 */ 5550 5551 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5552 struct amdgpu_job *job, 5553 struct amdgpu_reset_context *reset_context) 5554 { 5555 struct list_head device_list, *device_list_handle = NULL; 5556 bool job_signaled = false; 5557 struct amdgpu_hive_info *hive = NULL; 5558 struct amdgpu_device *tmp_adev = NULL; 5559 int i, r = 0; 5560 bool need_emergency_restart = false; 5561 bool audio_suspended = false; 5562 bool gpu_reset_for_dev_remove = false; 5563 5564 gpu_reset_for_dev_remove = 5565 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5566 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5567 5568 /* 5569 * Special case: RAS triggered and full reset isn't supported 5570 */ 5571 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5572 5573 /* 5574 * Flush RAM to disk so that after reboot 5575 * the user can read log and see why the system rebooted. 5576 */ 5577 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5578 amdgpu_ras_get_context(adev)->reboot) { 5579 DRM_WARN("Emergency reboot."); 5580 5581 ksys_sync_helper(); 5582 emergency_restart(); 5583 } 5584 5585 dev_info(adev->dev, "GPU %s begin!\n", 5586 need_emergency_restart ? "jobs stop":"reset"); 5587 5588 if (!amdgpu_sriov_vf(adev)) 5589 hive = amdgpu_get_xgmi_hive(adev); 5590 if (hive) 5591 mutex_lock(&hive->hive_lock); 5592 5593 reset_context->job = job; 5594 reset_context->hive = hive; 5595 /* 5596 * Build list of devices to reset. 5597 * In case we are in XGMI hive mode, resort the device list 5598 * to put adev in the 1st position. 
5599 */ 5600 INIT_LIST_HEAD(&device_list); 5601 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5602 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5603 list_add_tail(&tmp_adev->reset_list, &device_list); 5604 if (gpu_reset_for_dev_remove && adev->shutdown) 5605 tmp_adev->shutdown = true; 5606 } 5607 if (!list_is_first(&adev->reset_list, &device_list)) 5608 list_rotate_to_front(&adev->reset_list, &device_list); 5609 device_list_handle = &device_list; 5610 } else { 5611 list_add_tail(&adev->reset_list, &device_list); 5612 device_list_handle = &device_list; 5613 } 5614 5615 /* We need to lock reset domain only once both for XGMI and single device */ 5616 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5617 reset_list); 5618 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5619 5620 /* block all schedulers and reset given job's ring */ 5621 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5622 5623 amdgpu_device_set_mp1_state(tmp_adev); 5624 5625 /* 5626 * Try to put the audio codec into suspend state 5627 * before the gpu reset starts. 5628 * 5629 * The power domain of the graphics device is shared 5630 * with the AZ power domain. Without this, 5631 * we may change the audio hardware from behind 5632 * the audio driver's back and trigger 5633 * some audio codec errors. 5634 */ 5635 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5636 audio_suspended = true; 5637 5638 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5639 5640 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5641 5642 if (!amdgpu_sriov_vf(tmp_adev)) 5643 amdgpu_amdkfd_pre_reset(tmp_adev); 5644 5645 /* 5646 * Mark these ASICs to be reset as untracked first, 5647 * and add them back after the reset completes. 5648 */ 5649 amdgpu_unregister_gpu_instance(tmp_adev); 5650 5651 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5652 5653 /* disable ras on ALL IPs */ 5654 if (!need_emergency_restart && 5655 amdgpu_device_ip_need_full_reset(tmp_adev)) 5656 amdgpu_ras_suspend(tmp_adev); 5657 5658 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5659 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5660 5661 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5662 continue; 5663 5664 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5665 5666 if (need_emergency_restart) 5667 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5668 } 5669 atomic_inc(&tmp_adev->gpu_reset_counter); 5670 } 5671 5672 if (need_emergency_restart) 5673 goto skip_sched_resume; 5674 5675 /* 5676 * Must check guilty signal here since after this point all old 5677 * HW fences are force signaled. 5678 * 5679 * job->base holds a reference to parent fence 5680 */ 5681 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5682 job_signaled = true; 5683 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5684 goto skip_hw_reset; 5685 } 5686 5687 retry: /* Rest of adevs pre asic reset from XGMI hive.
*/ 5688 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5689 if (gpu_reset_for_dev_remove) { 5690 /* Workaround for ASICs that need to disable SMC first */ 5691 amdgpu_device_smu_fini_early(tmp_adev); 5692 } 5693 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5694 /* TODO: Should we stop? */ 5695 if (r) { 5696 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5697 r, adev_to_drm(tmp_adev)->unique); 5698 tmp_adev->asic_reset_res = r; 5699 } 5700 5701 /* 5702 * Drop all pending non-scheduler resets. Scheduler resets 5703 * were already dropped during drm_sched_stop. 5704 */ 5705 amdgpu_device_stop_pending_resets(tmp_adev); 5706 } 5707 5708 /* Actual ASIC resets if needed. */ 5709 /* Host driver will handle XGMI hive reset for SRIOV */ 5710 if (amdgpu_sriov_vf(adev)) { 5711 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5712 if (r) 5713 adev->asic_reset_res = r; 5714 5715 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so RAS needs to be resumed during reset */ 5716 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5717 IP_VERSION(9, 4, 2) || 5718 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5719 amdgpu_ras_resume(adev); 5720 } else { 5721 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5722 if (r && r == -EAGAIN) 5723 goto retry; 5724 5725 if (!r && gpu_reset_for_dev_remove) 5726 goto recover_end; 5727 } 5728 5729 skip_hw_reset: 5730 5731 /* Post ASIC reset for all devs. */ 5732 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5733 5734 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5735 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5736 5737 if (!ring || !drm_sched_wqueue_ready(&ring->sched)) 5738 continue; 5739 5740 drm_sched_start(&ring->sched, true); 5741 } 5742 5743 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5744 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5745 5746 if (tmp_adev->asic_reset_res) 5747 r = tmp_adev->asic_reset_res; 5748 5749 tmp_adev->asic_reset_res = 0; 5750 5751 if (r) { 5752 /* bad news, how do we tell it to userspace?
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it wasn't initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

recover_end:
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	while ((parent = pci_upstream_bridge(parent))) {
		/* skip upstream/downstream switches internal to the dGPU */
		if (parent->vendor == PCI_VENDOR_ID_ATI)
			continue;
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		break;
	}
}
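
/*
 * Illustrative sketch (not built): how a hypothetical debug helper could use
 * amdgpu_device_partner_bandwidth() to log the capabilities of the first
 * physical upstream partner. The helper name and the logging are assumptions
 * for illustration only; the driver itself only consumes these values in
 * amdgpu_device_get_pcie_info() below.
 */
#if 0
static void amdgpu_device_log_partner_bandwidth(struct amdgpu_device *adev)
{
	enum pci_bus_speed speed;
	enum pcie_link_width width;

	amdgpu_device_partner_bandwidth(adev, &speed, &width);

	if (speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN)
		dev_info(adev->dev, "PCIe partner capabilities unknown\n");
	else
		dev_info(adev->dev, "PCIe partner: speed enum %d, x%d link\n",
			 speed, width);
}
#endif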

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
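
/*
 * Illustrative sketch (not built): the masks cached above can later be
 * consulted by, e.g., power-management code.  A hypothetical check for
 * whether both the ASIC and the platform allow a PCIe Gen4 link could
 * look like this; the helper name is an assumption for illustration only.
 */
#if 0
static bool amdgpu_device_pcie_gen4_possible(struct amdgpu_device *adev)
{
	/* ASIC and platform capability bits are ORed into the same mask */
	return (adev->pm.pcie_gen_mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4) &&
	       (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
}
#endif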

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}
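
/*
 * Illustrative sketch (not built): a hypothetical caller deciding whether to
 * map a peer's VRAM directly would check both directions before relying on
 * PCIe P2P.  The helper name is an assumption for illustration only.
 */
#if 0
static bool amdgpu_device_peers_can_dma_both_ways(struct amdgpu_device *a,
						  struct amdgpu_device *b)
{
	return amdgpu_device_is_peer_accessible(a, b) &&
	       amdgpu_device_is_peer_accessible(b, a);
}
#endif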

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
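
/*
 * Illustrative sketch (not built): BACO entry and exit are expected to be
 * paired, e.g. around a runtime-suspend style power gate.  The helper name
 * and the surrounding flow are assumptions for illustration only.
 */
#if 0
static int amdgpu_device_baco_cycle_example(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;	/* device stays fully powered */

	/* ... bus stays active while the chip is powered off here ... */

	return amdgpu_device_baco_exit(dev);
}
#endif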

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to the GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !drm_sched_wqueue_ready(&ring->sched))
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when the PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}
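
/*
 * Illustrative sketch (not built): these callbacks are meant to be plugged
 * into a struct pci_error_handlers that the driver registers with the PCI
 * core elsewhere; the table below is an assumption-laden example of that
 * wiring, not the driver's actual definition.
 */
#if 0
static const struct pci_error_handlers amdgpu_pci_err_handler_example = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};
#endif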

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;


	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !drm_sched_wqueue_ready(&ring->sched))
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
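
/*
 * Illustrative sketch (not built): the HDP block caches CPU accesses to VRAM
 * made through the PCIe BAR, so a CPU write that the GPU will consume is
 * expected to be followed by a flush, and a CPU read of GPU-written data by
 * an invalidate.  The helper below is a hypothetical example of the write
 * side only; the buffer mapping and helper name are assumptions.
 */
#if 0
static void amdgpu_device_patch_vram_word(struct amdgpu_device *adev,
					  void __iomem *vram_cpu_addr,
					  u32 value)
{
	writel(value, vram_cpu_addr);		/* CPU write through the BAR */
	amdgpu_device_flush_hdp(adev, NULL);	/* make it visible to the GPU */
}
#endif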

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
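
/*
 * Illustrative sketch (not built): the two accessors above are index/data
 * style, so a read-modify-write of a PCIe port register is expected to look
 * like the hypothetical helper below.  The helper name, mask and usage are
 * assumptions for illustration only.
 */
#if 0
static void amdgpu_device_pcie_port_update_bits(struct amdgpu_device *adev,
						u32 reg, u32 mask, u32 bits)
{
	u32 tmp;

	tmp = amdgpu_device_pcie_port_rreg(adev, reg);
	tmp = (tmp & ~mask) | (bits & mask);
	amdgpu_device_pcie_port_wreg(adev, reg, tmp);
}
#endif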

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
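
/*
 * Illustrative sketch (not built): a hypothetical caller polling for a
 * "ready" bit with amdgpu_device_wait_on_rreg().  The register offset,
 * bit mask and names are assumptions for illustration only.
 */
#if 0
static uint32_t amdgpu_device_wait_block_ready_example(struct amdgpu_device *adev)
{
	const uint32_t hypothetical_status_reg = 0x1234;	/* made-up offset */
	const uint32_t ready_bit = 0x1;

	/* Spin until bit 0 reads back as 1, or time out after adev->usec_timeout. */
	return amdgpu_device_wait_on_rreg(adev, 0, hypothetical_status_reg,
					  "HYPOTHETICAL_STATUS", ready_bit, ready_bit);
}
#endif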