1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 #include "amdgpu_virt.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 100 static const struct drm_driver amdgpu_kms_driver; 101 102 const char *amdgpu_asic_name[] = { 103 "TAHITI", 104 "PITCAIRN", 105 "VERDE", 106 "OLAND", 107 "HAINAN", 108 "BONAIRE", 109 "KAVERI", 110 "KABINI", 111 
"HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 166 struct bin_attribute *attr, char *buf, 167 loff_t ppos, size_t count) 168 { 169 struct device *dev = kobj_to_dev(kobj); 170 struct drm_device *ddev = dev_get_drvdata(dev); 171 struct amdgpu_device *adev = drm_to_adev(ddev); 172 ssize_t bytes_read; 173 174 switch (ppos) { 175 case AMDGPU_SYS_REG_STATE_XGMI: 176 bytes_read = amdgpu_asic_get_reg_state( 177 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 178 break; 179 case AMDGPU_SYS_REG_STATE_WAFL: 180 bytes_read = amdgpu_asic_get_reg_state( 181 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 182 break; 183 case AMDGPU_SYS_REG_STATE_PCIE: 184 bytes_read = amdgpu_asic_get_reg_state( 185 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 186 break; 187 case AMDGPU_SYS_REG_STATE_USR: 188 bytes_read = amdgpu_asic_get_reg_state( 189 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 190 break; 191 case AMDGPU_SYS_REG_STATE_USR_1: 192 bytes_read = amdgpu_asic_get_reg_state( 193 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 194 break; 195 default: 196 return -EINVAL; 197 } 198 199 return bytes_read; 200 } 201 202 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 203 AMDGPU_SYS_REG_STATE_END); 204 205 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 206 { 207 int ret; 208 209 if (!amdgpu_asic_get_reg_state_supported(adev)) 210 return 0; 211 212 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 213 214 return ret; 215 } 216 217 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 218 { 219 if (!amdgpu_asic_get_reg_state_supported(adev)) 220 return; 221 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 222 } 223 224 /** 225 * DOC: board_info 226 * 227 * The amdgpu driver provides a sysfs API for giving board related information. 
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}
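
/*
 * Illustrative sketch (not part of the driver): the three helpers above are
 * typically consulted together when deciding how a dGPU can be powered down
 * at runtime.  The helper name and ordering below are hypothetical.
 *
 *	static const char *pick_runtime_pm_mode(struct drm_device *dev)
 *	{
 *		if (amdgpu_device_supports_px(dev))
 *			return "ATPX";		// legacy ATPX power control
 *		if (amdgpu_device_supports_boco(dev))
 *			return "BOCO";		// ACPI PR3 / hybrid D3cold
 *		if (amdgpu_device_supports_baco(dev))
 *			return "BACO";		// bus active, chip off
 *		return "none";
 *	}
 */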

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
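
/*
 * Usage sketch (illustrative only): reading four dwords back from a VRAM
 * offset through the MM_INDEX/MM_DATA window above.  "vram_pos" is a
 * hypothetical, 4-byte aligned VRAM offset; the size must also be a
 * multiple of 4.
 *
 *	u32 fence_vals[4];
 *
 *	amdgpu_device_mm_access(adev, vram_pos, fence_vals,
 *				sizeof(fence_vals), false);
 */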

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM index/data registers to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}
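
/*
 * Usage sketch (illustrative only): amdgpu_device_vram_access() above is the
 * preferred entry point; it copies through the CPU-visible aperture when it
 * can and transparently falls back to the MM_INDEX/MM_DATA path for the
 * remainder.  "vram_pos" is a hypothetical, 4-byte aligned VRAM offset.
 *
 *	u32 dump[64];
 *
 *	amdgpu_device_vram_access(adev, vram_pos, dump, sizeof(dump), false);
 */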

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to write to the register
 */

/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
629 */ 630 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 631 { 632 if (amdgpu_device_skip_hw_access(adev)) 633 return; 634 635 if (offset < adev->rmmio_size) 636 writeb(value, adev->rmmio + offset); 637 else 638 BUG(); 639 } 640 641 /** 642 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 643 * 644 * @adev: amdgpu_device pointer 645 * @reg: dword aligned register offset 646 * @v: 32 bit value to write to the register 647 * @acc_flags: access flags which require special behavior 648 * 649 * Writes the value specified to the offset specified. 650 */ 651 void amdgpu_device_wreg(struct amdgpu_device *adev, 652 uint32_t reg, uint32_t v, 653 uint32_t acc_flags) 654 { 655 if (amdgpu_device_skip_hw_access(adev)) 656 return; 657 658 if ((reg * 4) < adev->rmmio_size) { 659 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 660 amdgpu_sriov_runtime(adev) && 661 down_read_trylock(&adev->reset_domain->sem)) { 662 amdgpu_kiq_wreg(adev, reg, v, 0); 663 up_read(&adev->reset_domain->sem); 664 } else { 665 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 666 } 667 } else { 668 adev->pcie_wreg(adev, reg * 4, v); 669 } 670 671 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 672 } 673 674 /** 675 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 676 * 677 * @adev: amdgpu_device pointer 678 * @reg: mmio/rlc register 679 * @v: value to write 680 * @xcc_id: xcc accelerated compute core id 681 * 682 * this function is invoked only for the debugfs register access 683 */ 684 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 685 uint32_t reg, uint32_t v, 686 uint32_t xcc_id) 687 { 688 if (amdgpu_device_skip_hw_access(adev)) 689 return; 690 691 if (amdgpu_sriov_fullaccess(adev) && 692 adev->gfx.rlc.funcs && 693 adev->gfx.rlc.funcs->is_rlcg_access_range) { 694 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 695 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 696 } else if ((reg * 4) >= adev->rmmio_size) { 697 adev->pcie_wreg(adev, reg * 4, v); 698 } else { 699 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 700 } 701 } 702 703 /** 704 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 705 * 706 * @adev: amdgpu_device pointer 707 * @reg: dword aligned register offset 708 * @v: 32 bit value to write to the register 709 * @acc_flags: access flags which require special behavior 710 * @xcc_id: xcc accelerated compute core id 711 * 712 * Writes the value specified to the offset specified. 
713 */ 714 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 715 uint32_t reg, uint32_t v, 716 uint32_t acc_flags, uint32_t xcc_id) 717 { 718 uint32_t rlcg_flag; 719 720 if (amdgpu_device_skip_hw_access(adev)) 721 return; 722 723 if ((reg * 4) < adev->rmmio_size) { 724 if (amdgpu_sriov_vf(adev) && 725 !amdgpu_sriov_runtime(adev) && 726 adev->gfx.rlc.rlcg_reg_access_supported && 727 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 728 GC_HWIP, true, 729 &rlcg_flag)) { 730 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id); 731 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 732 amdgpu_sriov_runtime(adev) && 733 down_read_trylock(&adev->reset_domain->sem)) { 734 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 735 up_read(&adev->reset_domain->sem); 736 } else { 737 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 738 } 739 } else { 740 adev->pcie_wreg(adev, reg * 4, v); 741 } 742 } 743 744 /** 745 * amdgpu_device_indirect_rreg - read an indirect register 746 * 747 * @adev: amdgpu_device pointer 748 * @reg_addr: indirect register address to read from 749 * 750 * Returns the value of indirect register @reg_addr 751 */ 752 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 753 u32 reg_addr) 754 { 755 unsigned long flags, pcie_index, pcie_data; 756 void __iomem *pcie_index_offset; 757 void __iomem *pcie_data_offset; 758 u32 r; 759 760 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 761 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 762 763 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 764 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 765 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 766 767 writel(reg_addr, pcie_index_offset); 768 readl(pcie_index_offset); 769 r = readl(pcie_data_offset); 770 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 771 772 return r; 773 } 774 775 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 776 u64 reg_addr) 777 { 778 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 779 u32 r; 780 void __iomem *pcie_index_offset; 781 void __iomem *pcie_index_hi_offset; 782 void __iomem *pcie_data_offset; 783 784 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 785 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 786 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 787 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 788 else 789 pcie_index_hi = 0; 790 791 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 792 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 793 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 794 if (pcie_index_hi != 0) 795 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 796 pcie_index_hi * 4; 797 798 writel(reg_addr, pcie_index_offset); 799 readl(pcie_index_offset); 800 if (pcie_index_hi != 0) { 801 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 802 readl(pcie_index_hi_offset); 803 } 804 r = readl(pcie_data_offset); 805 806 /* clear the high bits */ 807 if (pcie_index_hi != 0) { 808 writel(0, pcie_index_hi_offset); 809 readl(pcie_index_hi_offset); 810 } 811 812 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 813 814 return r; 815 } 816 817 /** 818 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 819 * 820 * @adev: amdgpu_device pointer 821 * @reg_addr: indirect register address to read from 822 * 823 * Returns the value of indirect register @reg_addr 824 */ 825 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 826 u32 
reg_addr) 827 { 828 unsigned long flags, pcie_index, pcie_data; 829 void __iomem *pcie_index_offset; 830 void __iomem *pcie_data_offset; 831 u64 r; 832 833 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 834 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 835 836 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 837 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 838 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 839 840 /* read low 32 bits */ 841 writel(reg_addr, pcie_index_offset); 842 readl(pcie_index_offset); 843 r = readl(pcie_data_offset); 844 /* read high 32 bits */ 845 writel(reg_addr + 4, pcie_index_offset); 846 readl(pcie_index_offset); 847 r |= ((u64)readl(pcie_data_offset) << 32); 848 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 849 850 return r; 851 } 852 853 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 854 u64 reg_addr) 855 { 856 unsigned long flags, pcie_index, pcie_data; 857 unsigned long pcie_index_hi = 0; 858 void __iomem *pcie_index_offset; 859 void __iomem *pcie_index_hi_offset; 860 void __iomem *pcie_data_offset; 861 u64 r; 862 863 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 864 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 865 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 866 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 867 868 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 869 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 870 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 871 if (pcie_index_hi != 0) 872 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 873 pcie_index_hi * 4; 874 875 /* read low 32 bits */ 876 writel(reg_addr, pcie_index_offset); 877 readl(pcie_index_offset); 878 if (pcie_index_hi != 0) { 879 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 880 readl(pcie_index_hi_offset); 881 } 882 r = readl(pcie_data_offset); 883 /* read high 32 bits */ 884 writel(reg_addr + 4, pcie_index_offset); 885 readl(pcie_index_offset); 886 if (pcie_index_hi != 0) { 887 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 888 readl(pcie_index_hi_offset); 889 } 890 r |= ((u64)readl(pcie_data_offset) << 32); 891 892 /* clear the high bits */ 893 if (pcie_index_hi != 0) { 894 writel(0, pcie_index_hi_offset); 895 readl(pcie_index_hi_offset); 896 } 897 898 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 899 900 return r; 901 } 902 903 /** 904 * amdgpu_device_indirect_wreg - write an indirect register address 905 * 906 * @adev: amdgpu_device pointer 907 * @reg_addr: indirect register offset 908 * @reg_data: indirect register data 909 * 910 */ 911 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 912 u32 reg_addr, u32 reg_data) 913 { 914 unsigned long flags, pcie_index, pcie_data; 915 void __iomem *pcie_index_offset; 916 void __iomem *pcie_data_offset; 917 918 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 919 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 920 921 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 922 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 923 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 924 925 writel(reg_addr, pcie_index_offset); 926 readl(pcie_index_offset); 927 writel(reg_data, pcie_data_offset); 928 readl(pcie_data_offset); 929 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 930 } 931 932 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 933 u64 reg_addr, 
u32 reg_data) 934 { 935 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 936 void __iomem *pcie_index_offset; 937 void __iomem *pcie_index_hi_offset; 938 void __iomem *pcie_data_offset; 939 940 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 941 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 942 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 943 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 944 else 945 pcie_index_hi = 0; 946 947 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 948 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 949 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 950 if (pcie_index_hi != 0) 951 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 952 pcie_index_hi * 4; 953 954 writel(reg_addr, pcie_index_offset); 955 readl(pcie_index_offset); 956 if (pcie_index_hi != 0) { 957 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 958 readl(pcie_index_hi_offset); 959 } 960 writel(reg_data, pcie_data_offset); 961 readl(pcie_data_offset); 962 963 /* clear the high bits */ 964 if (pcie_index_hi != 0) { 965 writel(0, pcie_index_hi_offset); 966 readl(pcie_index_hi_offset); 967 } 968 969 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 970 } 971 972 /** 973 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 974 * 975 * @adev: amdgpu_device pointer 976 * @reg_addr: indirect register offset 977 * @reg_data: indirect register data 978 * 979 */ 980 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 981 u32 reg_addr, u64 reg_data) 982 { 983 unsigned long flags, pcie_index, pcie_data; 984 void __iomem *pcie_index_offset; 985 void __iomem *pcie_data_offset; 986 987 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 988 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 989 990 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 991 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 992 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 993 994 /* write low 32 bits */ 995 writel(reg_addr, pcie_index_offset); 996 readl(pcie_index_offset); 997 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 998 readl(pcie_data_offset); 999 /* write high 32 bits */ 1000 writel(reg_addr + 4, pcie_index_offset); 1001 readl(pcie_index_offset); 1002 writel((u32)(reg_data >> 32), pcie_data_offset); 1003 readl(pcie_data_offset); 1004 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1005 } 1006 1007 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1008 u64 reg_addr, u64 reg_data) 1009 { 1010 unsigned long flags, pcie_index, pcie_data; 1011 unsigned long pcie_index_hi = 0; 1012 void __iomem *pcie_index_offset; 1013 void __iomem *pcie_index_hi_offset; 1014 void __iomem *pcie_data_offset; 1015 1016 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1017 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1018 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1019 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1020 1021 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1022 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1023 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1024 if (pcie_index_hi != 0) 1025 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1026 pcie_index_hi * 4; 1027 1028 /* write low 32 bits */ 1029 writel(reg_addr, pcie_index_offset); 1030 readl(pcie_index_offset); 1031 if (pcie_index_hi != 0) 
{ 1032 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1033 readl(pcie_index_hi_offset); 1034 } 1035 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1036 readl(pcie_data_offset); 1037 /* write high 32 bits */ 1038 writel(reg_addr + 4, pcie_index_offset); 1039 readl(pcie_index_offset); 1040 if (pcie_index_hi != 0) { 1041 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1042 readl(pcie_index_hi_offset); 1043 } 1044 writel((u32)(reg_data >> 32), pcie_data_offset); 1045 readl(pcie_data_offset); 1046 1047 /* clear the high bits */ 1048 if (pcie_index_hi != 0) { 1049 writel(0, pcie_index_hi_offset); 1050 readl(pcie_index_hi_offset); 1051 } 1052 1053 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1054 } 1055 1056 /** 1057 * amdgpu_device_get_rev_id - query device rev_id 1058 * 1059 * @adev: amdgpu_device pointer 1060 * 1061 * Return device rev_id 1062 */ 1063 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1064 { 1065 return adev->nbio.funcs->get_rev_id(adev); 1066 } 1067 1068 /** 1069 * amdgpu_invalid_rreg - dummy reg read function 1070 * 1071 * @adev: amdgpu_device pointer 1072 * @reg: offset of register 1073 * 1074 * Dummy register read function. Used for register blocks 1075 * that certain asics don't have (all asics). 1076 * Returns the value in the register. 1077 */ 1078 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1079 { 1080 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1081 BUG(); 1082 return 0; 1083 } 1084 1085 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1086 { 1087 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1088 BUG(); 1089 return 0; 1090 } 1091 1092 /** 1093 * amdgpu_invalid_wreg - dummy reg write function 1094 * 1095 * @adev: amdgpu_device pointer 1096 * @reg: offset of register 1097 * @v: value to write to the register 1098 * 1099 * Dummy register read function. Used for register blocks 1100 * that certain asics don't have (all asics). 1101 */ 1102 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1103 { 1104 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1105 reg, v); 1106 BUG(); 1107 } 1108 1109 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1110 { 1111 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1112 reg, v); 1113 BUG(); 1114 } 1115 1116 /** 1117 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1118 * 1119 * @adev: amdgpu_device pointer 1120 * @reg: offset of register 1121 * 1122 * Dummy register read function. Used for register blocks 1123 * that certain asics don't have (all asics). 1124 * Returns the value in the register. 1125 */ 1126 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1127 { 1128 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1129 BUG(); 1130 return 0; 1131 } 1132 1133 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1134 { 1135 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1136 BUG(); 1137 return 0; 1138 } 1139 1140 /** 1141 * amdgpu_invalid_wreg64 - dummy reg write function 1142 * 1143 * @adev: amdgpu_device pointer 1144 * @reg: offset of register 1145 * @v: value to write to the register 1146 * 1147 * Dummy register read function. Used for register blocks 1148 * that certain asics don't have (all asics). 
1149 */ 1150 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1151 { 1152 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1153 reg, v); 1154 BUG(); 1155 } 1156 1157 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1158 { 1159 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1160 reg, v); 1161 BUG(); 1162 } 1163 1164 /** 1165 * amdgpu_block_invalid_rreg - dummy reg read function 1166 * 1167 * @adev: amdgpu_device pointer 1168 * @block: offset of instance 1169 * @reg: offset of register 1170 * 1171 * Dummy register read function. Used for register blocks 1172 * that certain asics don't have (all asics). 1173 * Returns the value in the register. 1174 */ 1175 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1176 uint32_t block, uint32_t reg) 1177 { 1178 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1179 reg, block); 1180 BUG(); 1181 return 0; 1182 } 1183 1184 /** 1185 * amdgpu_block_invalid_wreg - dummy reg write function 1186 * 1187 * @adev: amdgpu_device pointer 1188 * @block: offset of instance 1189 * @reg: offset of register 1190 * @v: value to write to the register 1191 * 1192 * Dummy register read function. Used for register blocks 1193 * that certain asics don't have (all asics). 1194 */ 1195 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1196 uint32_t block, 1197 uint32_t reg, uint32_t v) 1198 { 1199 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1200 reg, block, v); 1201 BUG(); 1202 } 1203 1204 /** 1205 * amdgpu_device_asic_init - Wrapper for atom asic_init 1206 * 1207 * @adev: amdgpu_device pointer 1208 * 1209 * Does any asic specific work and then calls atom asic init. 1210 */ 1211 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1212 { 1213 int ret; 1214 1215 amdgpu_asic_pre_asic_init(adev); 1216 1217 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1218 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1219 amdgpu_psp_wait_for_bootloader(adev); 1220 ret = amdgpu_atomfirmware_asic_init(adev, true); 1221 /* TODO: check the return val and stop device initialization if boot fails */ 1222 amdgpu_psp_query_boot_status(adev); 1223 return ret; 1224 } else { 1225 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1226 } 1227 1228 return 0; 1229 } 1230 1231 /** 1232 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1233 * 1234 * @adev: amdgpu_device pointer 1235 * 1236 * Allocates a scratch page of VRAM for use by various things in the 1237 * driver. 1238 */ 1239 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1240 { 1241 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1242 AMDGPU_GEM_DOMAIN_VRAM | 1243 AMDGPU_GEM_DOMAIN_GTT, 1244 &adev->mem_scratch.robj, 1245 &adev->mem_scratch.gpu_addr, 1246 (void **)&adev->mem_scratch.ptr); 1247 } 1248 1249 /** 1250 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1251 * 1252 * @adev: amdgpu_device pointer 1253 * 1254 * Frees the VRAM scratch page. 1255 */ 1256 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1257 { 1258 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1259 } 1260 1261 /** 1262 * amdgpu_device_program_register_sequence - program an array of registers. 
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
1390 * Returns 0 on success or -EINVAL on failure. 1391 */ 1392 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1393 { 1394 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1395 1396 if (offset < adev->wb.num_wb) { 1397 __set_bit(offset, adev->wb.used); 1398 *wb = offset << 3; /* convert to dw offset */ 1399 return 0; 1400 } else { 1401 return -EINVAL; 1402 } 1403 } 1404 1405 /** 1406 * amdgpu_device_wb_free - Free a wb entry 1407 * 1408 * @adev: amdgpu_device pointer 1409 * @wb: wb index 1410 * 1411 * Free a wb slot allocated for use by the driver (all asics) 1412 */ 1413 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1414 { 1415 wb >>= 3; 1416 if (wb < adev->wb.num_wb) 1417 __clear_bit(wb, adev->wb.used); 1418 } 1419 1420 /** 1421 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1422 * 1423 * @adev: amdgpu_device pointer 1424 * 1425 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1426 * to fail, but if any of the BARs is not accessible after the size we abort 1427 * driver loading by returning -ENODEV. 1428 */ 1429 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1430 { 1431 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1432 struct pci_bus *root; 1433 struct resource *res; 1434 unsigned int i; 1435 u16 cmd; 1436 int r; 1437 1438 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1439 return 0; 1440 1441 /* Bypass for VF */ 1442 if (amdgpu_sriov_vf(adev)) 1443 return 0; 1444 1445 /* skip if the bios has already enabled large BAR */ 1446 if (adev->gmc.real_vram_size && 1447 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1448 return 0; 1449 1450 /* Check if the root BUS has 64bit memory resources */ 1451 root = adev->pdev->bus; 1452 while (root->parent) 1453 root = root->parent; 1454 1455 pci_bus_for_each_resource(root, res, i) { 1456 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1457 res->start > 0x100000000ull) 1458 break; 1459 } 1460 1461 /* Trying to resize is pointless without a root hub window above 4GB */ 1462 if (!res) 1463 return 0; 1464 1465 /* Limit the BAR size to what is available */ 1466 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1467 rbar_size); 1468 1469 /* Disable memory decoding while we change the BAR addresses and size */ 1470 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1471 pci_write_config_word(adev->pdev, PCI_COMMAND, 1472 cmd & ~PCI_COMMAND_MEMORY); 1473 1474 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1475 amdgpu_doorbell_fini(adev); 1476 if (adev->asic_type >= CHIP_BONAIRE) 1477 pci_release_resource(adev->pdev, 2); 1478 1479 pci_release_resource(adev->pdev, 0); 1480 1481 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1482 if (r == -ENOSPC) 1483 DRM_INFO("Not enough PCI address space for a large BAR."); 1484 else if (r && r != -ENOTSUPP) 1485 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1486 1487 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1488 1489 /* When the doorbell or fb BAR isn't available we have no chance of 1490 * using the device. 
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if a post is needed because a hw reset was performed.
 * Returns true if post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old smc fw still needs the driver to do a vPost, otherwise
		 * the gpu hangs. smc fw versions above 22.15 don't have this flaw, so we
		 * force vPost for smc versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICs as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
1610 * 1611 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1612 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1613 */ 1614 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1615 { 1616 #if IS_ENABLED(CONFIG_X86) 1617 struct cpuinfo_x86 *c = &cpu_data(0); 1618 1619 /* eGPU change speeds based on USB4 fabric conditions */ 1620 if (dev_is_removable(adev->dev)) 1621 return true; 1622 1623 if (c->x86_vendor == X86_VENDOR_INTEL) 1624 return false; 1625 #endif 1626 return true; 1627 } 1628 1629 /** 1630 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1631 * 1632 * @adev: amdgpu_device pointer 1633 * 1634 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1635 * be set for this device. 1636 * 1637 * Returns true if it should be used or false if not. 1638 */ 1639 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1640 { 1641 switch (amdgpu_aspm) { 1642 case -1: 1643 break; 1644 case 0: 1645 return false; 1646 case 1: 1647 return true; 1648 default: 1649 return false; 1650 } 1651 if (adev->flags & AMD_IS_APU) 1652 return false; 1653 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1654 return false; 1655 return pcie_aspm_enabled(adev->pdev); 1656 } 1657 1658 /* if we get transitioned to only one device, take VGA back */ 1659 /** 1660 * amdgpu_device_vga_set_decode - enable/disable vga decode 1661 * 1662 * @pdev: PCI device pointer 1663 * @state: enable/disable vga decode 1664 * 1665 * Enable/disable vga decode (all asics). 1666 * Returns VGA resource flags. 1667 */ 1668 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1669 bool state) 1670 { 1671 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1672 1673 amdgpu_asic_set_vga_state(adev, state); 1674 if (state) 1675 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1676 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1677 else 1678 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1679 } 1680 1681 /** 1682 * amdgpu_device_check_block_size - validate the vm block size 1683 * 1684 * @adev: amdgpu_device pointer 1685 * 1686 * Validates the vm block size specified via module parameter. 1687 * The vm block size defines number of bits in page table versus page directory, 1688 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1689 * page table and the remaining bits are in the page directory. 1690 */ 1691 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1692 { 1693 /* defines number of bits in page table versus page directory, 1694 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1695 * page table and the remaining bits are in the page directory 1696 */ 1697 if (amdgpu_vm_block_size == -1) 1698 return; 1699 1700 if (amdgpu_vm_block_size < 9) { 1701 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1702 amdgpu_vm_block_size); 1703 amdgpu_vm_block_size = -1; 1704 } 1705 } 1706 1707 /** 1708 * amdgpu_device_check_vm_size - validate the vm size 1709 * 1710 * @adev: amdgpu_device pointer 1711 * 1712 * Validates the vm size in GB specified via module parameter. 1713 * The VM size is the size of the GPU virtual memory space in GB. 
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
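 *
 * Example (illustrative, not an exhaustive list of the checks below): booting
 * with "amdgpu.sched_jobs=3 amdgpu.gart_size=16" ends up with sched_jobs
 * raised to 4 and gart_size reverted to -1 (auto), each with a dev_warn()
 * in the kernel log.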
1812 */ 1813 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1814 { 1815 if (amdgpu_sched_jobs < 4) { 1816 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1817 amdgpu_sched_jobs); 1818 amdgpu_sched_jobs = 4; 1819 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1820 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1821 amdgpu_sched_jobs); 1822 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1823 } 1824 1825 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1826 /* gart size must be greater or equal to 32M */ 1827 dev_warn(adev->dev, "gart size (%d) too small\n", 1828 amdgpu_gart_size); 1829 amdgpu_gart_size = -1; 1830 } 1831 1832 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1833 /* gtt size must be greater or equal to 32M */ 1834 dev_warn(adev->dev, "gtt size (%d) too small\n", 1835 amdgpu_gtt_size); 1836 amdgpu_gtt_size = -1; 1837 } 1838 1839 /* valid range is between 4 and 9 inclusive */ 1840 if (amdgpu_vm_fragment_size != -1 && 1841 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1842 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1843 amdgpu_vm_fragment_size = -1; 1844 } 1845 1846 if (amdgpu_sched_hw_submission < 2) { 1847 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1848 amdgpu_sched_hw_submission); 1849 amdgpu_sched_hw_submission = 2; 1850 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1851 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1852 amdgpu_sched_hw_submission); 1853 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1854 } 1855 1856 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1857 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1858 amdgpu_reset_method = -1; 1859 } 1860 1861 amdgpu_device_check_smu_prv_buffer_size(adev); 1862 1863 amdgpu_device_check_vm_size(adev); 1864 1865 amdgpu_device_check_block_size(adev); 1866 1867 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1868 1869 return 0; 1870 } 1871 1872 /** 1873 * amdgpu_switcheroo_set_state - set switcheroo state 1874 * 1875 * @pdev: pci dev pointer 1876 * @state: vga_switcheroo state 1877 * 1878 * Callback for the switcheroo driver. Suspends or resumes 1879 * the asics before or after it is powered up using ACPI methods. 
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Checks whether the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
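
/*
 * Usage sketch (illustrative only): IP code usually toggles clockgating for
 * one block type at a time, e.g. gating GFX clocks:
 *
 *	r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *						   AMD_CG_STATE_GATE);
 *	if (r)
 *		dev_warn(adev->dev, "GFX clockgating failed (%d)\n", r);
 */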
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Checks if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
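 *
 * Purely illustrative sketch (not a call site in this file): a caller could
 * look up a block and inspect its version, e.g.:
 *
 *	struct amdgpu_ip_block *gfx =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (gfx)
 *		DRM_INFO("GFX IP v%u.%u\n", gfx->version->major,
 *			 gfx->version->minor);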
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block version is equal or greater.
 * Returns 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
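 *
 * Note: ASICs that provide an IP discovery table (adev->mman.discovery_bin)
 * skip this firmware entirely; only the legacy ASICs listed in the switch
 * below consume a gpu_info binary.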
2245 */ 2246 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2247 { 2248 const char *chip_name; 2249 char fw_name[40]; 2250 int err; 2251 const struct gpu_info_firmware_header_v1_0 *hdr; 2252 2253 adev->firmware.gpu_info_fw = NULL; 2254 2255 if (adev->mman.discovery_bin) 2256 return 0; 2257 2258 switch (adev->asic_type) { 2259 default: 2260 return 0; 2261 case CHIP_VEGA10: 2262 chip_name = "vega10"; 2263 break; 2264 case CHIP_VEGA12: 2265 chip_name = "vega12"; 2266 break; 2267 case CHIP_RAVEN: 2268 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2269 chip_name = "raven2"; 2270 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2271 chip_name = "picasso"; 2272 else 2273 chip_name = "raven"; 2274 break; 2275 case CHIP_ARCTURUS: 2276 chip_name = "arcturus"; 2277 break; 2278 case CHIP_NAVI12: 2279 chip_name = "navi12"; 2280 break; 2281 } 2282 2283 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2284 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2285 if (err) { 2286 dev_err(adev->dev, 2287 "Failed to get gpu_info firmware \"%s\"\n", 2288 fw_name); 2289 goto out; 2290 } 2291 2292 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2293 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2294 2295 switch (hdr->version_major) { 2296 case 1: 2297 { 2298 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2299 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2300 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2301 2302 /* 2303 * Should be droped when DAL no longer needs it. 2304 */ 2305 if (adev->asic_type == CHIP_NAVI12) 2306 goto parse_soc_bounding_box; 2307 2308 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2309 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2310 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2311 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2312 adev->gfx.config.max_texture_channel_caches = 2313 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2314 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2315 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2316 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2317 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2318 adev->gfx.config.double_offchip_lds_buf = 2319 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2320 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2321 adev->gfx.cu_info.max_waves_per_simd = 2322 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2323 adev->gfx.cu_info.max_scratch_slots_per_cu = 2324 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2325 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2326 if (hdr->version_minor >= 1) { 2327 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2328 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2329 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2330 adev->gfx.config.num_sc_per_sh = 2331 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2332 adev->gfx.config.num_packer_per_sc = 2333 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2334 } 2335 2336 parse_soc_bounding_box: 2337 /* 2338 * soc bounding box info is not integrated in disocovery table, 2339 * we always need to parse it from gpu info firmware if needed. 
2340 */ 2341 if (hdr->version_minor == 2) { 2342 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2343 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2344 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2345 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2346 } 2347 break; 2348 } 2349 default: 2350 dev_err(adev->dev, 2351 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2352 err = -EINVAL; 2353 goto out; 2354 } 2355 out: 2356 return err; 2357 } 2358 2359 /** 2360 * amdgpu_device_ip_early_init - run early init for hardware IPs 2361 * 2362 * @adev: amdgpu_device pointer 2363 * 2364 * Early initialization pass for hardware IPs. The hardware IPs that make 2365 * up each asic are discovered each IP's early_init callback is run. This 2366 * is the first stage in initializing the asic. 2367 * Returns 0 on success, negative error code on failure. 2368 */ 2369 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2370 { 2371 struct pci_dev *parent; 2372 int i, r; 2373 bool total; 2374 2375 amdgpu_device_enable_virtual_display(adev); 2376 2377 if (amdgpu_sriov_vf(adev)) { 2378 r = amdgpu_virt_request_full_gpu(adev, true); 2379 if (r) 2380 return r; 2381 } 2382 2383 switch (adev->asic_type) { 2384 #ifdef CONFIG_DRM_AMDGPU_SI 2385 case CHIP_VERDE: 2386 case CHIP_TAHITI: 2387 case CHIP_PITCAIRN: 2388 case CHIP_OLAND: 2389 case CHIP_HAINAN: 2390 adev->family = AMDGPU_FAMILY_SI; 2391 r = si_set_ip_blocks(adev); 2392 if (r) 2393 return r; 2394 break; 2395 #endif 2396 #ifdef CONFIG_DRM_AMDGPU_CIK 2397 case CHIP_BONAIRE: 2398 case CHIP_HAWAII: 2399 case CHIP_KAVERI: 2400 case CHIP_KABINI: 2401 case CHIP_MULLINS: 2402 if (adev->flags & AMD_IS_APU) 2403 adev->family = AMDGPU_FAMILY_KV; 2404 else 2405 adev->family = AMDGPU_FAMILY_CI; 2406 2407 r = cik_set_ip_blocks(adev); 2408 if (r) 2409 return r; 2410 break; 2411 #endif 2412 case CHIP_TOPAZ: 2413 case CHIP_TONGA: 2414 case CHIP_FIJI: 2415 case CHIP_POLARIS10: 2416 case CHIP_POLARIS11: 2417 case CHIP_POLARIS12: 2418 case CHIP_VEGAM: 2419 case CHIP_CARRIZO: 2420 case CHIP_STONEY: 2421 if (adev->flags & AMD_IS_APU) 2422 adev->family = AMDGPU_FAMILY_CZ; 2423 else 2424 adev->family = AMDGPU_FAMILY_VI; 2425 2426 r = vi_set_ip_blocks(adev); 2427 if (r) 2428 return r; 2429 break; 2430 default: 2431 r = amdgpu_discovery_set_ip_blocks(adev); 2432 if (r) 2433 return r; 2434 break; 2435 } 2436 2437 if (amdgpu_has_atpx() && 2438 (amdgpu_is_atpx_hybrid() || 2439 amdgpu_has_atpx_dgpu_power_cntl()) && 2440 ((adev->flags & AMD_IS_APU) == 0) && 2441 !dev_is_removable(&adev->pdev->dev)) 2442 adev->flags |= AMD_IS_PX; 2443 2444 if (!(adev->flags & AMD_IS_APU)) { 2445 parent = pcie_find_root_port(adev->pdev); 2446 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2447 } 2448 2449 2450 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2451 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2452 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2453 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2454 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2455 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2456 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2457 2458 total = true; 2459 for (i = 0; i < adev->num_ip_blocks; i++) { 2460 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2461 DRM_WARN("disabled ip block: %d <%s>\n", 2462 i, adev->ip_blocks[i].version->funcs->name); 2463 adev->ip_blocks[i].status.valid = false; 2464 } else { 2465 if (adev->ip_blocks[i].version->funcs->early_init) { 2466 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2467 if (r == -ENOENT) { 2468 adev->ip_blocks[i].status.valid = false; 2469 } else if (r) { 2470 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2471 adev->ip_blocks[i].version->funcs->name, r); 2472 total = false; 2473 } else { 2474 adev->ip_blocks[i].status.valid = true; 2475 } 2476 } else { 2477 adev->ip_blocks[i].status.valid = true; 2478 } 2479 } 2480 /* get the vbios after the asic_funcs are set up */ 2481 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2482 r = amdgpu_device_parse_gpu_info_fw(adev); 2483 if (r) 2484 return r; 2485 2486 /* Read BIOS */ 2487 if (amdgpu_device_read_bios(adev)) { 2488 if (!amdgpu_get_bios(adev)) 2489 return -EINVAL; 2490 2491 r = amdgpu_atombios_init(adev); 2492 if (r) { 2493 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2494 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2495 return r; 2496 } 2497 } 2498 2499 /*get pf2vf msg info at it's earliest time*/ 2500 if (amdgpu_sriov_vf(adev)) 2501 amdgpu_virt_init_data_exchange(adev); 2502 2503 } 2504 } 2505 if (!total) 2506 return -ENODEV; 2507 2508 amdgpu_amdkfd_device_probe(adev); 2509 adev->cg_flags &= amdgpu_cg_mask; 2510 adev->pg_flags &= amdgpu_pg_mask; 2511 2512 return 0; 2513 } 2514 2515 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2516 { 2517 int i, r; 2518 2519 for (i = 0; i < adev->num_ip_blocks; i++) { 2520 if (!adev->ip_blocks[i].status.sw) 2521 continue; 2522 if (adev->ip_blocks[i].status.hw) 2523 continue; 2524 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2525 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2526 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2527 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2528 if (r) { 2529 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2530 adev->ip_blocks[i].version->funcs->name, r); 2531 return r; 2532 } 2533 adev->ip_blocks[i].status.hw = true; 2534 } 2535 } 2536 2537 return 0; 2538 } 2539 2540 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2541 { 2542 int i, r; 2543 2544 for (i = 0; i < adev->num_ip_blocks; i++) { 2545 if (!adev->ip_blocks[i].status.sw) 2546 continue; 2547 if (adev->ip_blocks[i].status.hw) 2548 continue; 2549 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2550 if (r) { 2551 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2552 adev->ip_blocks[i].version->funcs->name, r); 2553 return r; 2554 } 2555 adev->ip_blocks[i].status.hw = true; 2556 } 2557 2558 return 0; 2559 } 2560 2561 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2562 { 2563 int r = 0; 2564 int i; 2565 uint32_t 
smu_version; 2566 2567 if (adev->asic_type >= CHIP_VEGA10) { 2568 for (i = 0; i < adev->num_ip_blocks; i++) { 2569 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2570 continue; 2571 2572 if (!adev->ip_blocks[i].status.sw) 2573 continue; 2574 2575 /* no need to do the fw loading again if already done*/ 2576 if (adev->ip_blocks[i].status.hw == true) 2577 break; 2578 2579 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2580 r = adev->ip_blocks[i].version->funcs->resume(adev); 2581 if (r) { 2582 DRM_ERROR("resume of IP block <%s> failed %d\n", 2583 adev->ip_blocks[i].version->funcs->name, r); 2584 return r; 2585 } 2586 } else { 2587 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2588 if (r) { 2589 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2590 adev->ip_blocks[i].version->funcs->name, r); 2591 return r; 2592 } 2593 } 2594 2595 adev->ip_blocks[i].status.hw = true; 2596 break; 2597 } 2598 } 2599 2600 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2601 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2602 2603 return r; 2604 } 2605 2606 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2607 { 2608 long timeout; 2609 int r, i; 2610 2611 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2612 struct amdgpu_ring *ring = adev->rings[i]; 2613 2614 /* No need to setup the GPU scheduler for rings that don't need it */ 2615 if (!ring || ring->no_scheduler) 2616 continue; 2617 2618 switch (ring->funcs->type) { 2619 case AMDGPU_RING_TYPE_GFX: 2620 timeout = adev->gfx_timeout; 2621 break; 2622 case AMDGPU_RING_TYPE_COMPUTE: 2623 timeout = adev->compute_timeout; 2624 break; 2625 case AMDGPU_RING_TYPE_SDMA: 2626 timeout = adev->sdma_timeout; 2627 break; 2628 default: 2629 timeout = adev->video_timeout; 2630 break; 2631 } 2632 2633 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2634 DRM_SCHED_PRIORITY_COUNT, 2635 ring->num_hw_submission, 0, 2636 timeout, adev->reset_domain->wq, 2637 ring->sched_score, ring->name, 2638 adev->dev); 2639 if (r) { 2640 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2641 ring->name); 2642 return r; 2643 } 2644 r = amdgpu_uvd_entity_init(adev, ring); 2645 if (r) { 2646 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2647 ring->name); 2648 return r; 2649 } 2650 r = amdgpu_vce_entity_init(adev, ring); 2651 if (r) { 2652 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2653 ring->name); 2654 return r; 2655 } 2656 } 2657 2658 amdgpu_xcp_update_partition_sched_list(adev); 2659 2660 return 0; 2661 } 2662 2663 2664 /** 2665 * amdgpu_device_ip_init - run init for hardware IPs 2666 * 2667 * @adev: amdgpu_device pointer 2668 * 2669 * Main initialization pass for hardware IPs. The list of all the hardware 2670 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2671 * are run. sw_init initializes the software state associated with each IP 2672 * and hw_init initializes the hardware associated with each IP. 2673 * Returns 0 on success, negative error code on failure. 
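 *
 * COMMON and GMC additionally get an early hw_init here so that scratch,
 * write-back and GART memory can be set up before the remaining blocks are
 * brought up through the phase1/phase2 hw_init helpers.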
2674 */ 2675 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2676 { 2677 int i, r; 2678 2679 r = amdgpu_ras_init(adev); 2680 if (r) 2681 return r; 2682 2683 for (i = 0; i < adev->num_ip_blocks; i++) { 2684 if (!adev->ip_blocks[i].status.valid) 2685 continue; 2686 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2687 if (r) { 2688 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2689 adev->ip_blocks[i].version->funcs->name, r); 2690 goto init_failed; 2691 } 2692 adev->ip_blocks[i].status.sw = true; 2693 2694 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2695 /* need to do common hw init early so everything is set up for gmc */ 2696 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2697 if (r) { 2698 DRM_ERROR("hw_init %d failed %d\n", i, r); 2699 goto init_failed; 2700 } 2701 adev->ip_blocks[i].status.hw = true; 2702 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2703 /* need to do gmc hw init early so we can allocate gpu mem */ 2704 /* Try to reserve bad pages early */ 2705 if (amdgpu_sriov_vf(adev)) 2706 amdgpu_virt_exchange_data(adev); 2707 2708 r = amdgpu_device_mem_scratch_init(adev); 2709 if (r) { 2710 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2711 goto init_failed; 2712 } 2713 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2714 if (r) { 2715 DRM_ERROR("hw_init %d failed %d\n", i, r); 2716 goto init_failed; 2717 } 2718 r = amdgpu_device_wb_init(adev); 2719 if (r) { 2720 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2721 goto init_failed; 2722 } 2723 adev->ip_blocks[i].status.hw = true; 2724 2725 /* right after GMC hw init, we create CSA */ 2726 if (adev->gfx.mcbp) { 2727 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2728 AMDGPU_GEM_DOMAIN_VRAM | 2729 AMDGPU_GEM_DOMAIN_GTT, 2730 AMDGPU_CSA_SIZE); 2731 if (r) { 2732 DRM_ERROR("allocate CSA failed %d\n", r); 2733 goto init_failed; 2734 } 2735 } 2736 2737 r = amdgpu_seq64_init(adev); 2738 if (r) { 2739 DRM_ERROR("allocate seq64 failed %d\n", r); 2740 goto init_failed; 2741 } 2742 } 2743 } 2744 2745 if (amdgpu_sriov_vf(adev)) 2746 amdgpu_virt_init_data_exchange(adev); 2747 2748 r = amdgpu_ib_pool_init(adev); 2749 if (r) { 2750 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2751 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2752 goto init_failed; 2753 } 2754 2755 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2756 if (r) 2757 goto init_failed; 2758 2759 r = amdgpu_device_ip_hw_init_phase1(adev); 2760 if (r) 2761 goto init_failed; 2762 2763 r = amdgpu_device_fw_loading(adev); 2764 if (r) 2765 goto init_failed; 2766 2767 r = amdgpu_device_ip_hw_init_phase2(adev); 2768 if (r) 2769 goto init_failed; 2770 2771 /* 2772 * retired pages will be loaded from eeprom and reserved here, 2773 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2774 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2775 * for I2C communication which only true at this point. 2776 * 2777 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2778 * failure from bad gpu situation and stop amdgpu init process 2779 * accordingly. For other failed cases, it will still release all 2780 * the resource and print error message, rather than returning one 2781 * negative value to upper level. 
2782 * 2783 * Note: theoretically, this should be called before all vram allocations 2784 * to protect retired page from abusing 2785 */ 2786 r = amdgpu_ras_recovery_init(adev); 2787 if (r) 2788 goto init_failed; 2789 2790 /** 2791 * In case of XGMI grab extra reference for reset domain for this device 2792 */ 2793 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2794 if (amdgpu_xgmi_add_device(adev) == 0) { 2795 if (!amdgpu_sriov_vf(adev)) { 2796 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2797 2798 if (WARN_ON(!hive)) { 2799 r = -ENOENT; 2800 goto init_failed; 2801 } 2802 2803 if (!hive->reset_domain || 2804 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2805 r = -ENOENT; 2806 amdgpu_put_xgmi_hive(hive); 2807 goto init_failed; 2808 } 2809 2810 /* Drop the early temporary reset domain we created for device */ 2811 amdgpu_reset_put_reset_domain(adev->reset_domain); 2812 adev->reset_domain = hive->reset_domain; 2813 amdgpu_put_xgmi_hive(hive); 2814 } 2815 } 2816 } 2817 2818 r = amdgpu_device_init_schedulers(adev); 2819 if (r) 2820 goto init_failed; 2821 2822 if (adev->mman.buffer_funcs_ring->sched.ready) 2823 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2824 2825 /* Don't init kfd if whole hive need to be reset during init */ 2826 if (!adev->gmc.xgmi.pending_reset) { 2827 kgd2kfd_init_zone_device(adev); 2828 amdgpu_amdkfd_device_init(adev); 2829 } 2830 2831 amdgpu_fru_get_product_info(adev); 2832 2833 init_failed: 2834 2835 return r; 2836 } 2837 2838 /** 2839 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2840 * 2841 * @adev: amdgpu_device pointer 2842 * 2843 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2844 * this function before a GPU reset. If the value is retained after a 2845 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2846 */ 2847 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2848 { 2849 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2850 } 2851 2852 /** 2853 * amdgpu_device_check_vram_lost - check if vram is valid 2854 * 2855 * @adev: amdgpu_device pointer 2856 * 2857 * Checks the reset magic value written to the gart pointer in VRAM. 2858 * The driver calls this after a GPU reset to see if the contents of 2859 * VRAM is lost or now. 2860 * returns true if vram is lost, false if not. 2861 */ 2862 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2863 { 2864 if (memcmp(adev->gart.ptr, adev->reset_magic, 2865 AMDGPU_RESET_MAGIC_NUM)) 2866 return true; 2867 2868 if (!amdgpu_in_reset(adev)) 2869 return false; 2870 2871 /* 2872 * For all ASICs with baco/mode1 reset, the VRAM is 2873 * always assumed to be lost. 2874 */ 2875 switch (amdgpu_asic_reset_method(adev)) { 2876 case AMD_RESET_METHOD_BACO: 2877 case AMD_RESET_METHOD_MODE1: 2878 return true; 2879 default: 2880 return false; 2881 } 2882 } 2883 2884 /** 2885 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2886 * 2887 * @adev: amdgpu_device pointer 2888 * @state: clockgating state (gate or ungate) 2889 * 2890 * The list of all the hardware IPs that make up the asic is walked and the 2891 * set_clockgating_state callbacks are run. 2892 * Late initialization pass enabling clockgating for hardware IPs. 2893 * Fini or suspend, pass disabling clockgating for hardware IPs. 2894 * Returns 0 on success, negative error code on failure. 
2895 */ 2896 2897 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2898 enum amd_clockgating_state state) 2899 { 2900 int i, j, r; 2901 2902 if (amdgpu_emu_mode == 1) 2903 return 0; 2904 2905 for (j = 0; j < adev->num_ip_blocks; j++) { 2906 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2907 if (!adev->ip_blocks[i].status.late_initialized) 2908 continue; 2909 /* skip CG for GFX, SDMA on S0ix */ 2910 if (adev->in_s0ix && 2911 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2912 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2913 continue; 2914 /* skip CG for VCE/UVD, it's handled specially */ 2915 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2916 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2917 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2918 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2919 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2920 /* enable clockgating to save power */ 2921 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2922 state); 2923 if (r) { 2924 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2925 adev->ip_blocks[i].version->funcs->name, r); 2926 return r; 2927 } 2928 } 2929 } 2930 2931 return 0; 2932 } 2933 2934 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2935 enum amd_powergating_state state) 2936 { 2937 int i, j, r; 2938 2939 if (amdgpu_emu_mode == 1) 2940 return 0; 2941 2942 for (j = 0; j < adev->num_ip_blocks; j++) { 2943 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2944 if (!adev->ip_blocks[i].status.late_initialized) 2945 continue; 2946 /* skip PG for GFX, SDMA on S0ix */ 2947 if (adev->in_s0ix && 2948 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2949 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2950 continue; 2951 /* skip CG for VCE/UVD, it's handled specially */ 2952 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2953 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2954 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2955 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2956 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2957 /* enable powergating to save power */ 2958 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2959 state); 2960 if (r) { 2961 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2962 adev->ip_blocks[i].version->funcs->name, r); 2963 return r; 2964 } 2965 } 2966 } 2967 return 0; 2968 } 2969 2970 static int amdgpu_device_enable_mgpu_fan_boost(void) 2971 { 2972 struct amdgpu_gpu_instance *gpu_ins; 2973 struct amdgpu_device *adev; 2974 int i, ret = 0; 2975 2976 mutex_lock(&mgpu_info.mutex); 2977 2978 /* 2979 * MGPU fan boost feature should be enabled 2980 * only when there are two or more dGPUs in 2981 * the system 2982 */ 2983 if (mgpu_info.num_dgpu < 2) 2984 goto out; 2985 2986 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2987 gpu_ins = &(mgpu_info.gpu_ins[i]); 2988 adev = gpu_ins->adev; 2989 if (!(adev->flags & AMD_IS_APU) && 2990 !gpu_ins->mgpu_fan_enabled) { 2991 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2992 if (ret) 2993 break; 2994 2995 gpu_ins->mgpu_fan_enabled = 1; 2996 } 2997 } 2998 2999 out: 3000 mutex_unlock(&mgpu_info.mutex); 3001 3002 return ret; 3003 } 3004 3005 /** 3006 * amdgpu_device_ip_late_init - run late init for hardware IPs 3007 * 3008 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configurations on arcturus and aldebaran, enable special handling for SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is counted one device at a time as devices
		 * initialize.
		 *
		 * So we wait for all XGMI interlinked devices to be initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
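		 *
		 * The check below treats the hive as fully initialized once
		 * mgpu_info.num_dgpu matches this device's
		 * adev->gmc.xgmi.num_physical_nodes.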
3074 */ 3075 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3076 for (i = 0; i < mgpu_info.num_gpu; i++) { 3077 gpu_instance = &(mgpu_info.gpu_ins[i]); 3078 if (gpu_instance->adev->flags & AMD_IS_APU) 3079 continue; 3080 3081 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3082 AMDGPU_XGMI_PSTATE_MIN); 3083 if (r) { 3084 DRM_ERROR("pstate setting failed (%d).\n", r); 3085 break; 3086 } 3087 } 3088 } 3089 3090 mutex_unlock(&mgpu_info.mutex); 3091 } 3092 3093 return 0; 3094 } 3095 3096 /** 3097 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3098 * 3099 * @adev: amdgpu_device pointer 3100 * 3101 * For ASICs need to disable SMC first 3102 */ 3103 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3104 { 3105 int i, r; 3106 3107 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3108 return; 3109 3110 for (i = 0; i < adev->num_ip_blocks; i++) { 3111 if (!adev->ip_blocks[i].status.hw) 3112 continue; 3113 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3114 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3115 /* XXX handle errors */ 3116 if (r) { 3117 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3118 adev->ip_blocks[i].version->funcs->name, r); 3119 } 3120 adev->ip_blocks[i].status.hw = false; 3121 break; 3122 } 3123 } 3124 } 3125 3126 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3127 { 3128 int i, r; 3129 3130 for (i = 0; i < adev->num_ip_blocks; i++) { 3131 if (!adev->ip_blocks[i].version->funcs->early_fini) 3132 continue; 3133 3134 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3135 if (r) { 3136 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3137 adev->ip_blocks[i].version->funcs->name, r); 3138 } 3139 } 3140 3141 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3142 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3143 3144 amdgpu_amdkfd_suspend(adev, false); 3145 3146 /* Workaroud for ASICs need to disable SMC first */ 3147 amdgpu_device_smu_fini_early(adev); 3148 3149 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3150 if (!adev->ip_blocks[i].status.hw) 3151 continue; 3152 3153 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3154 /* XXX handle errors */ 3155 if (r) { 3156 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3157 adev->ip_blocks[i].version->funcs->name, r); 3158 } 3159 3160 adev->ip_blocks[i].status.hw = false; 3161 } 3162 3163 if (amdgpu_sriov_vf(adev)) { 3164 if (amdgpu_virt_release_full_gpu(adev, false)) 3165 DRM_ERROR("failed to release exclusive mode on fini\n"); 3166 } 3167 3168 return 0; 3169 } 3170 3171 /** 3172 * amdgpu_device_ip_fini - run fini for hardware IPs 3173 * 3174 * @adev: amdgpu_device pointer 3175 * 3176 * Main teardown pass for hardware IPs. The list of all the hardware 3177 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3178 * are run. hw_fini tears down the hardware associated with each IP 3179 * and sw_fini tears down any software state associated with each IP. 3180 * Returns 0 on success, negative error code on failure. 
3181 */ 3182 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3183 { 3184 int i, r; 3185 3186 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3187 amdgpu_virt_release_ras_err_handler_data(adev); 3188 3189 if (adev->gmc.xgmi.num_physical_nodes > 1) 3190 amdgpu_xgmi_remove_device(adev); 3191 3192 amdgpu_amdkfd_device_fini_sw(adev); 3193 3194 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3195 if (!adev->ip_blocks[i].status.sw) 3196 continue; 3197 3198 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3199 amdgpu_ucode_free_bo(adev); 3200 amdgpu_free_static_csa(&adev->virt.csa_obj); 3201 amdgpu_device_wb_fini(adev); 3202 amdgpu_device_mem_scratch_fini(adev); 3203 amdgpu_ib_pool_fini(adev); 3204 amdgpu_seq64_fini(adev); 3205 } 3206 3207 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3208 /* XXX handle errors */ 3209 if (r) { 3210 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3211 adev->ip_blocks[i].version->funcs->name, r); 3212 } 3213 adev->ip_blocks[i].status.sw = false; 3214 adev->ip_blocks[i].status.valid = false; 3215 } 3216 3217 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3218 if (!adev->ip_blocks[i].status.late_initialized) 3219 continue; 3220 if (adev->ip_blocks[i].version->funcs->late_fini) 3221 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3222 adev->ip_blocks[i].status.late_initialized = false; 3223 } 3224 3225 amdgpu_ras_fini(adev); 3226 3227 return 0; 3228 } 3229 3230 /** 3231 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3232 * 3233 * @work: work_struct. 3234 */ 3235 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3236 { 3237 struct amdgpu_device *adev = 3238 container_of(work, struct amdgpu_device, delayed_init_work.work); 3239 int r; 3240 3241 r = amdgpu_ib_ring_tests(adev); 3242 if (r) 3243 DRM_ERROR("ib ring test failed (%d).\n", r); 3244 } 3245 3246 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3247 { 3248 struct amdgpu_device *adev = 3249 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3250 3251 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3252 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3253 3254 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3255 adev->gfx.gfx_off_state = true; 3256 } 3257 3258 /** 3259 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3260 * 3261 * @adev: amdgpu_device pointer 3262 * 3263 * Main suspend function for hardware IPs. The list of all the hardware 3264 * IPs that make up the asic is walked, clockgating is disabled and the 3265 * suspend callbacks are run. suspend puts the hardware and software state 3266 * in each IP into a state suitable for suspend. 3267 * Returns 0 on success, negative error code on failure. 3268 */ 3269 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3270 { 3271 int i, r; 3272 3273 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3274 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3275 3276 /* 3277 * Per PMFW team's suggestion, driver needs to handle gfxoff 3278 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3279 * scenario. Add the missing df cstate disablement here. 
3280 */ 3281 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3282 dev_warn(adev->dev, "Failed to disallow df cstate"); 3283 3284 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3285 if (!adev->ip_blocks[i].status.valid) 3286 continue; 3287 3288 /* displays are handled separately */ 3289 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3290 continue; 3291 3292 /* XXX handle errors */ 3293 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3294 /* XXX handle errors */ 3295 if (r) { 3296 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3297 adev->ip_blocks[i].version->funcs->name, r); 3298 return r; 3299 } 3300 3301 adev->ip_blocks[i].status.hw = false; 3302 } 3303 3304 return 0; 3305 } 3306 3307 /** 3308 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3309 * 3310 * @adev: amdgpu_device pointer 3311 * 3312 * Main suspend function for hardware IPs. The list of all the hardware 3313 * IPs that make up the asic is walked, clockgating is disabled and the 3314 * suspend callbacks are run. suspend puts the hardware and software state 3315 * in each IP into a state suitable for suspend. 3316 * Returns 0 on success, negative error code on failure. 3317 */ 3318 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3319 { 3320 int i, r; 3321 3322 if (adev->in_s0ix) 3323 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3324 3325 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3326 if (!adev->ip_blocks[i].status.valid) 3327 continue; 3328 /* displays are handled in phase1 */ 3329 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3330 continue; 3331 /* PSP lost connection when err_event_athub occurs */ 3332 if (amdgpu_ras_intr_triggered() && 3333 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3334 adev->ip_blocks[i].status.hw = false; 3335 continue; 3336 } 3337 3338 /* skip unnecessary suspend if we do not initialize them yet */ 3339 if (adev->gmc.xgmi.pending_reset && 3340 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3341 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3342 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3343 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3344 adev->ip_blocks[i].status.hw = false; 3345 continue; 3346 } 3347 3348 /* skip suspend of gfx/mes and psp for S0ix 3349 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3350 * like at runtime. PSP is also part of the always on hardware 3351 * so no need to suspend it. 3352 */ 3353 if (adev->in_s0ix && 3354 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3355 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3356 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3357 continue; 3358 3359 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3360 if (adev->in_s0ix && 3361 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3362 IP_VERSION(5, 0, 0)) && 3363 (adev->ip_blocks[i].version->type == 3364 AMD_IP_BLOCK_TYPE_SDMA)) 3365 continue; 3366 3367 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3368 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3369 * from this location and RLC Autoload automatically also gets loaded 3370 * from here based on PMFW -> PSP message during re-init sequence. 3371 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3372 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3373 */ 3374 if (amdgpu_in_reset(adev) && 3375 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3376 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3377 continue; 3378 3379 /* XXX handle errors */ 3380 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3381 /* XXX handle errors */ 3382 if (r) { 3383 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3384 adev->ip_blocks[i].version->funcs->name, r); 3385 } 3386 adev->ip_blocks[i].status.hw = false; 3387 /* handle putting the SMC in the appropriate state */ 3388 if (!amdgpu_sriov_vf(adev)) { 3389 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3390 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3391 if (r) { 3392 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3393 adev->mp1_state, r); 3394 return r; 3395 } 3396 } 3397 } 3398 } 3399 3400 return 0; 3401 } 3402 3403 /** 3404 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3405 * 3406 * @adev: amdgpu_device pointer 3407 * 3408 * Main suspend function for hardware IPs. The list of all the hardware 3409 * IPs that make up the asic is walked, clockgating is disabled and the 3410 * suspend callbacks are run. suspend puts the hardware and software state 3411 * in each IP into a state suitable for suspend. 3412 * Returns 0 on success, negative error code on failure. 3413 */ 3414 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3415 { 3416 int r; 3417 3418 if (amdgpu_sriov_vf(adev)) { 3419 amdgpu_virt_fini_data_exchange(adev); 3420 amdgpu_virt_request_full_gpu(adev, false); 3421 } 3422 3423 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3424 3425 r = amdgpu_device_ip_suspend_phase1(adev); 3426 if (r) 3427 return r; 3428 r = amdgpu_device_ip_suspend_phase2(adev); 3429 3430 if (amdgpu_sriov_vf(adev)) 3431 amdgpu_virt_release_full_gpu(adev, false); 3432 3433 return r; 3434 } 3435 3436 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3437 { 3438 int i, r; 3439 3440 static enum amd_ip_block_type ip_order[] = { 3441 AMD_IP_BLOCK_TYPE_COMMON, 3442 AMD_IP_BLOCK_TYPE_GMC, 3443 AMD_IP_BLOCK_TYPE_PSP, 3444 AMD_IP_BLOCK_TYPE_IH, 3445 }; 3446 3447 for (i = 0; i < adev->num_ip_blocks; i++) { 3448 int j; 3449 struct amdgpu_ip_block *block; 3450 3451 block = &adev->ip_blocks[i]; 3452 block->status.hw = false; 3453 3454 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3455 3456 if (block->version->type != ip_order[j] || 3457 !block->status.valid) 3458 continue; 3459 3460 r = block->version->funcs->hw_init(adev); 3461 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3462 if (r) 3463 return r; 3464 block->status.hw = true; 3465 } 3466 } 3467 3468 return 0; 3469 } 3470 3471 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3472 { 3473 int i, r; 3474 3475 static enum amd_ip_block_type ip_order[] = { 3476 AMD_IP_BLOCK_TYPE_SMC, 3477 AMD_IP_BLOCK_TYPE_DCE, 3478 AMD_IP_BLOCK_TYPE_GFX, 3479 AMD_IP_BLOCK_TYPE_SDMA, 3480 AMD_IP_BLOCK_TYPE_MES, 3481 AMD_IP_BLOCK_TYPE_UVD, 3482 AMD_IP_BLOCK_TYPE_VCE, 3483 AMD_IP_BLOCK_TYPE_VCN, 3484 AMD_IP_BLOCK_TYPE_JPEG 3485 }; 3486 3487 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3488 int j; 3489 struct amdgpu_ip_block *block; 3490 3491 for (j = 0; j < adev->num_ip_blocks; j++) { 3492 block = &adev->ip_blocks[j]; 3493 3494 if (block->version->type != ip_order[i] || 3495 !block->status.valid || 3496 block->status.hw) 3497 continue; 3498 3499 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3500 r = block->version->funcs->resume(adev); 3501 else 
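				/* non-SMC blocks are re-initialized with a full hw_init rather than resumed */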
				r = block->version->funcs->hw_init(adev);

			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * COMMON, GMC, and IH. resume puts the hardware into a functional state
 * after a suspend and updates the software state as necessary. This
 * function is also used for restoring the GPU after a GPU reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {

			r = adev->ip_blocks[i].version->funcs->resume(adev);
			if (r) {
				DRM_ERROR("resume of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Second resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
 * functional state after a suspend and updates the software state as
 * necessary. This function is also used for restoring the GPU after a GPU
 * reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = adev->ip_blocks[i].version->funcs->resume(adev);
		if (r) {
			DRM_ERROR("resume of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into two resume functions because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
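 * (amdgpu_device_fw_loading() is called between the two phases so that the
 * SMU/PSP microcode is in place before the remaining blocks resume.)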
3599 */ 3600 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3601 { 3602 int r; 3603 3604 r = amdgpu_device_ip_resume_phase1(adev); 3605 if (r) 3606 return r; 3607 3608 r = amdgpu_device_fw_loading(adev); 3609 if (r) 3610 return r; 3611 3612 r = amdgpu_device_ip_resume_phase2(adev); 3613 3614 if (adev->mman.buffer_funcs_ring->sched.ready) 3615 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3616 3617 return r; 3618 } 3619 3620 /** 3621 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3622 * 3623 * @adev: amdgpu_device pointer 3624 * 3625 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3626 */ 3627 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3628 { 3629 if (amdgpu_sriov_vf(adev)) { 3630 if (adev->is_atom_fw) { 3631 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3632 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3633 } else { 3634 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3635 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3636 } 3637 3638 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3639 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3640 } 3641 } 3642 3643 /** 3644 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3645 * 3646 * @asic_type: AMD asic type 3647 * 3648 * Check if there is DC (new modesetting infrastructre) support for an asic. 3649 * returns true if DC has support, false if not. 3650 */ 3651 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3652 { 3653 switch (asic_type) { 3654 #ifdef CONFIG_DRM_AMDGPU_SI 3655 case CHIP_HAINAN: 3656 #endif 3657 case CHIP_TOPAZ: 3658 /* chips with no display hardware */ 3659 return false; 3660 #if defined(CONFIG_DRM_AMD_DC) 3661 case CHIP_TAHITI: 3662 case CHIP_PITCAIRN: 3663 case CHIP_VERDE: 3664 case CHIP_OLAND: 3665 /* 3666 * We have systems in the wild with these ASICs that require 3667 * LVDS and VGA support which is not supported with DC. 3668 * 3669 * Fallback to the non-DC driver here by default so as not to 3670 * cause regressions. 3671 */ 3672 #if defined(CONFIG_DRM_AMD_DC_SI) 3673 return amdgpu_dc > 0; 3674 #else 3675 return false; 3676 #endif 3677 case CHIP_BONAIRE: 3678 case CHIP_KAVERI: 3679 case CHIP_KABINI: 3680 case CHIP_MULLINS: 3681 /* 3682 * We have systems in the wild with these ASICs that require 3683 * VGA support which is not supported with DC. 3684 * 3685 * Fallback to the non-DC driver here by default so as not to 3686 * cause regressions. 
3687 */ 3688 return amdgpu_dc > 0; 3689 default: 3690 return amdgpu_dc != 0; 3691 #else 3692 default: 3693 if (amdgpu_dc > 0) 3694 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3695 return false; 3696 #endif 3697 } 3698 } 3699 3700 /** 3701 * amdgpu_device_has_dc_support - check if dc is supported 3702 * 3703 * @adev: amdgpu_device pointer 3704 * 3705 * Returns true for supported, false for not supported 3706 */ 3707 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3708 { 3709 if (adev->enable_virtual_display || 3710 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3711 return false; 3712 3713 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3714 } 3715 3716 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3717 { 3718 struct amdgpu_device *adev = 3719 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3720 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3721 3722 /* It's a bug to not have a hive within this function */ 3723 if (WARN_ON(!hive)) 3724 return; 3725 3726 /* 3727 * Use task barrier to synchronize all xgmi reset works across the 3728 * hive. task_barrier_enter and task_barrier_exit will block 3729 * until all the threads running the xgmi reset works reach 3730 * those points. task_barrier_full will do both blocks. 3731 */ 3732 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3733 3734 task_barrier_enter(&hive->tb); 3735 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3736 3737 if (adev->asic_reset_res) 3738 goto fail; 3739 3740 task_barrier_exit(&hive->tb); 3741 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3742 3743 if (adev->asic_reset_res) 3744 goto fail; 3745 3746 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3747 } else { 3748 3749 task_barrier_full(&hive->tb); 3750 adev->asic_reset_res = amdgpu_asic_reset(adev); 3751 } 3752 3753 fail: 3754 if (adev->asic_reset_res) 3755 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3756 adev->asic_reset_res, adev_to_drm(adev)->unique); 3757 amdgpu_put_xgmi_hive(hive); 3758 } 3759 3760 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3761 { 3762 char *input = amdgpu_lockup_timeout; 3763 char *timeout_setting = NULL; 3764 int index = 0; 3765 long timeout; 3766 int ret = 0; 3767 3768 /* 3769 * By default timeout for non compute jobs is 10000 3770 * and 60000 for compute jobs. 3771 * In SR-IOV or passthrough mode, timeout for compute 3772 * jobs are 60000 by default. 3773 */ 3774 adev->gfx_timeout = msecs_to_jiffies(10000); 3775 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3776 if (amdgpu_sriov_vf(adev)) 3777 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3778 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3779 else 3780 adev->compute_timeout = msecs_to_jiffies(60000); 3781 3782 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3783 while ((timeout_setting = strsep(&input, ",")) && 3784 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3785 ret = kstrtol(timeout_setting, 0, &timeout); 3786 if (ret) 3787 return ret; 3788 3789 if (timeout == 0) { 3790 index++; 3791 continue; 3792 } else if (timeout < 0) { 3793 timeout = MAX_SCHEDULE_TIMEOUT; 3794 dev_warn(adev->dev, "lockup timeout disabled"); 3795 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3796 } else { 3797 timeout = msecs_to_jiffies(timeout); 3798 } 3799 3800 switch (index++) { 3801 case 0: 3802 adev->gfx_timeout = timeout; 3803 break; 3804 case 1: 3805 adev->compute_timeout = timeout; 3806 break; 3807 case 2: 3808 adev->sdma_timeout = timeout; 3809 break; 3810 case 3: 3811 adev->video_timeout = timeout; 3812 break; 3813 default: 3814 break; 3815 } 3816 } 3817 /* 3818 * There is only one value specified and 3819 * it should apply to all non-compute jobs. 3820 */ 3821 if (index == 1) { 3822 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3823 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3824 adev->compute_timeout = adev->gfx_timeout; 3825 } 3826 } 3827 3828 return ret; 3829 } 3830 3831 /** 3832 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3833 * 3834 * @adev: amdgpu_device pointer 3835 * 3836 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3837 */ 3838 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3839 { 3840 struct iommu_domain *domain; 3841 3842 domain = iommu_get_domain_for_dev(adev->dev); 3843 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3844 adev->ram_is_direct_mapped = true; 3845 } 3846 3847 static const struct attribute *amdgpu_dev_attributes[] = { 3848 &dev_attr_pcie_replay_count.attr, 3849 NULL 3850 }; 3851 3852 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3853 { 3854 if (amdgpu_mcbp == 1) 3855 adev->gfx.mcbp = true; 3856 else if (amdgpu_mcbp == 0) 3857 adev->gfx.mcbp = false; 3858 3859 if (amdgpu_sriov_vf(adev)) 3860 adev->gfx.mcbp = true; 3861 3862 if (adev->gfx.mcbp) 3863 DRM_INFO("MCBP is enabled\n"); 3864 } 3865 3866 /** 3867 * amdgpu_device_init - initialize the driver 3868 * 3869 * @adev: amdgpu_device pointer 3870 * @flags: driver flags 3871 * 3872 * Initializes the driver info and hw (all asics). 3873 * Returns 0 for success or an error on failure. 3874 * Called at driver startup. 
3875 */ 3876 int amdgpu_device_init(struct amdgpu_device *adev, 3877 uint32_t flags) 3878 { 3879 struct drm_device *ddev = adev_to_drm(adev); 3880 struct pci_dev *pdev = adev->pdev; 3881 int r, i; 3882 bool px = false; 3883 u32 max_MBps; 3884 int tmp; 3885 3886 adev->shutdown = false; 3887 adev->flags = flags; 3888 3889 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3890 adev->asic_type = amdgpu_force_asic_type; 3891 else 3892 adev->asic_type = flags & AMD_ASIC_MASK; 3893 3894 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3895 if (amdgpu_emu_mode == 1) 3896 adev->usec_timeout *= 10; 3897 adev->gmc.gart_size = 512 * 1024 * 1024; 3898 adev->accel_working = false; 3899 adev->num_rings = 0; 3900 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3901 adev->mman.buffer_funcs = NULL; 3902 adev->mman.buffer_funcs_ring = NULL; 3903 adev->vm_manager.vm_pte_funcs = NULL; 3904 adev->vm_manager.vm_pte_num_scheds = 0; 3905 adev->gmc.gmc_funcs = NULL; 3906 adev->harvest_ip_mask = 0x0; 3907 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3908 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3909 3910 adev->smc_rreg = &amdgpu_invalid_rreg; 3911 adev->smc_wreg = &amdgpu_invalid_wreg; 3912 adev->pcie_rreg = &amdgpu_invalid_rreg; 3913 adev->pcie_wreg = &amdgpu_invalid_wreg; 3914 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3915 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3916 adev->pciep_rreg = &amdgpu_invalid_rreg; 3917 adev->pciep_wreg = &amdgpu_invalid_wreg; 3918 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3919 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3920 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3921 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3922 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3923 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3924 adev->didt_rreg = &amdgpu_invalid_rreg; 3925 adev->didt_wreg = &amdgpu_invalid_wreg; 3926 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3927 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3928 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3929 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3930 3931 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3932 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3933 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3934 3935 /* mutex initialization are all done here so we 3936 * can recall function without having locking issues 3937 */ 3938 mutex_init(&adev->firmware.mutex); 3939 mutex_init(&adev->pm.mutex); 3940 mutex_init(&adev->gfx.gpu_clock_mutex); 3941 mutex_init(&adev->srbm_mutex); 3942 mutex_init(&adev->gfx.pipe_reserve_mutex); 3943 mutex_init(&adev->gfx.gfx_off_mutex); 3944 mutex_init(&adev->gfx.partition_mutex); 3945 mutex_init(&adev->grbm_idx_mutex); 3946 mutex_init(&adev->mn_lock); 3947 mutex_init(&adev->virt.vf_errors.lock); 3948 hash_init(adev->mn_hash); 3949 mutex_init(&adev->psp.mutex); 3950 mutex_init(&adev->notifier_lock); 3951 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3952 mutex_init(&adev->benchmark_mutex); 3953 3954 amdgpu_device_init_apu_flags(adev); 3955 3956 r = amdgpu_device_check_arguments(adev); 3957 if (r) 3958 return r; 3959 3960 spin_lock_init(&adev->mmio_idx_lock); 3961 spin_lock_init(&adev->smc_idx_lock); 3962 spin_lock_init(&adev->pcie_idx_lock); 3963 spin_lock_init(&adev->uvd_ctx_idx_lock); 3964 spin_lock_init(&adev->didt_idx_lock); 3965 spin_lock_init(&adev->gc_cac_idx_lock); 3966 spin_lock_init(&adev->se_cac_idx_lock); 
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);

	INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before the XGMI hive
	 * (if any) is discovered and initialized, so that the reset semaphore
	 * and in_gpu_reset flag can be used early during init and before
	 * calling RREG32.
4026 */ 4027 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4028 if (!adev->reset_domain) 4029 return -ENOMEM; 4030 4031 /* detect hw virtualization here */ 4032 amdgpu_detect_virtualization(adev); 4033 4034 amdgpu_device_get_pcie_info(adev); 4035 4036 r = amdgpu_device_get_job_timeout_settings(adev); 4037 if (r) { 4038 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4039 return r; 4040 } 4041 4042 /* early init functions */ 4043 r = amdgpu_device_ip_early_init(adev); 4044 if (r) 4045 return r; 4046 4047 amdgpu_device_set_mcbp(adev); 4048 4049 /* Get rid of things like offb */ 4050 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4051 if (r) 4052 return r; 4053 4054 /* Enable TMZ based on IP_VERSION */ 4055 amdgpu_gmc_tmz_set(adev); 4056 4057 amdgpu_gmc_noretry_set(adev); 4058 /* Need to get xgmi info early to decide the reset behavior*/ 4059 if (adev->gmc.xgmi.supported) { 4060 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4061 if (r) 4062 return r; 4063 } 4064 4065 /* enable PCIE atomic ops */ 4066 if (amdgpu_sriov_vf(adev)) { 4067 if (adev->virt.fw_reserve.p_pf2vf) 4068 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4069 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4070 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4071 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4072 * internal path natively support atomics, set have_atomics_support to true. 4073 */ 4074 } else if ((adev->flags & AMD_IS_APU) && 4075 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4076 IP_VERSION(9, 0, 0))) { 4077 adev->have_atomics_support = true; 4078 } else { 4079 adev->have_atomics_support = 4080 !pci_enable_atomic_ops_to_root(adev->pdev, 4081 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4082 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4083 } 4084 4085 if (!adev->have_atomics_support) 4086 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4087 4088 /* doorbell bar mapping and doorbell index init*/ 4089 amdgpu_doorbell_init(adev); 4090 4091 if (amdgpu_emu_mode == 1) { 4092 /* post the asic on emulation mode */ 4093 emu_soc_asic_init(adev); 4094 goto fence_driver_init; 4095 } 4096 4097 amdgpu_reset_init(adev); 4098 4099 /* detect if we are with an SRIOV vbios */ 4100 if (adev->bios) 4101 amdgpu_device_detect_sriov_bios(adev); 4102 4103 /* check if we need to reset the asic 4104 * E.g., driver was not cleanly unloaded previously, etc. 4105 */ 4106 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4107 if (adev->gmc.xgmi.num_physical_nodes) { 4108 dev_info(adev->dev, "Pending hive reset.\n"); 4109 adev->gmc.xgmi.pending_reset = true; 4110 /* Only need to init necessary block for SMU to handle the reset */ 4111 for (i = 0; i < adev->num_ip_blocks; i++) { 4112 if (!adev->ip_blocks[i].status.valid) 4113 continue; 4114 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4115 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4116 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4117 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4118 DRM_DEBUG("IP %s disabled for hw_init.\n", 4119 adev->ip_blocks[i].version->funcs->name); 4120 adev->ip_blocks[i].status.hw = true; 4121 } 4122 } 4123 } else { 4124 tmp = amdgpu_reset_method; 4125 /* It should do a default reset when loading or reloading the driver, 4126 * regardless of the module parameter reset_method. 
4127 */ 4128 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4129 r = amdgpu_asic_reset(adev); 4130 amdgpu_reset_method = tmp; 4131 if (r) { 4132 dev_err(adev->dev, "asic reset on init failed\n"); 4133 goto failed; 4134 } 4135 } 4136 } 4137 4138 /* Post card if necessary */ 4139 if (amdgpu_device_need_post(adev)) { 4140 if (!adev->bios) { 4141 dev_err(adev->dev, "no vBIOS found\n"); 4142 r = -EINVAL; 4143 goto failed; 4144 } 4145 DRM_INFO("GPU posting now...\n"); 4146 r = amdgpu_device_asic_init(adev); 4147 if (r) { 4148 dev_err(adev->dev, "gpu post error!\n"); 4149 goto failed; 4150 } 4151 } 4152 4153 if (adev->bios) { 4154 if (adev->is_atom_fw) { 4155 /* Initialize clocks */ 4156 r = amdgpu_atomfirmware_get_clock_info(adev); 4157 if (r) { 4158 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4159 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4160 goto failed; 4161 } 4162 } else { 4163 /* Initialize clocks */ 4164 r = amdgpu_atombios_get_clock_info(adev); 4165 if (r) { 4166 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4167 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4168 goto failed; 4169 } 4170 /* init i2c buses */ 4171 if (!amdgpu_device_has_dc_support(adev)) 4172 amdgpu_atombios_i2c_init(adev); 4173 } 4174 } 4175 4176 fence_driver_init: 4177 /* Fence driver */ 4178 r = amdgpu_fence_driver_sw_init(adev); 4179 if (r) { 4180 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4181 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4182 goto failed; 4183 } 4184 4185 /* init the mode config */ 4186 drm_mode_config_init(adev_to_drm(adev)); 4187 4188 r = amdgpu_device_ip_init(adev); 4189 if (r) { 4190 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4191 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4192 goto release_ras_con; 4193 } 4194 4195 amdgpu_fence_driver_hw_init(adev); 4196 4197 dev_info(adev->dev, 4198 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4199 adev->gfx.config.max_shader_engines, 4200 adev->gfx.config.max_sh_per_se, 4201 adev->gfx.config.max_cu_per_sh, 4202 adev->gfx.cu_info.number); 4203 4204 adev->accel_working = true; 4205 4206 amdgpu_vm_check_compute_bug(adev); 4207 4208 /* Initialize the buffer migration limit. */ 4209 if (amdgpu_moverate >= 0) 4210 max_MBps = amdgpu_moverate; 4211 else 4212 max_MBps = 8; /* Allow 8 MB/s. */ 4213 /* Get a log2 for easy divisions. */ 4214 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4215 4216 /* 4217 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4218 * Otherwise the mgpu fan boost feature will be skipped due to the 4219 * gpu instance is counted less. 4220 */ 4221 amdgpu_register_gpu_instance(adev); 4222 4223 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4224 * explicit gating rather than handling it automatically. 4225 */ 4226 if (!adev->gmc.xgmi.pending_reset) { 4227 r = amdgpu_device_ip_late_init(adev); 4228 if (r) { 4229 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4230 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4231 goto release_ras_con; 4232 } 4233 /* must succeed. 
 */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	/*
	 * Register these sysfs interfaces after `late_init`, since some of
	 * the operations performed in `late_init` affect how the sysfs
	 * interfaces are created.
	 */
	r = amdgpu_atombios_sysfs_init(adev);
	if (r)
		drm_err(&adev->ddev,
			"registering atombios sysfs failed (%d).\n", r);

	r = amdgpu_pm_sysfs_init(adev);
	if (r)
		DRM_ERROR("registering pm sysfs failed (%d).\n", r);

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
	if (r)
		dev_err(adev->dev,
			"Could not create amdgpu board attributes\n");

	amdgpu_fru_sysfs_init(adev);
	amdgpu_reg_state_sysfs_init(adev);

	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
		r = amdgpu_pmu_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}

	/* Keep a cached copy of the PCI config space at hand so it can be
	 * restored after a sudden PCI error.
	 */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* If we have more than one VGA card, disable the amdgpu VGA resources.
	 * This will fail for cards that aren't VGA class devices; just
	 * ignore it.
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(ddev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	if (adev->gmc.xgmi.pending_reset)
		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));

	amdgpu_device_check_iommu_direct_map(adev);

	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if (amdgpu_sriov_vf(adev) &&
	    !amdgpu_sriov_runtime(adev) &&
	    amdgpu_virt_mmio_blocked(adev) &&
	    !amdgpu_virt_wait_reset(adev)) {
		dev_err(adev->dev, "VF exclusive mode timeout\n");
		/* Don't send a request since the VF is inactive.
*/ 4322 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4323 adev->virt.ops = NULL; 4324 r = -EAGAIN; 4325 } 4326 amdgpu_release_ras_context(adev); 4327 4328 failed: 4329 amdgpu_vf_error_trans_all(adev); 4330 4331 return r; 4332 } 4333 4334 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4335 { 4336 4337 /* Clear all CPU mappings pointing to this device */ 4338 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4339 4340 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4341 amdgpu_doorbell_fini(adev); 4342 4343 iounmap(adev->rmmio); 4344 adev->rmmio = NULL; 4345 if (adev->mman.aper_base_kaddr) 4346 iounmap(adev->mman.aper_base_kaddr); 4347 adev->mman.aper_base_kaddr = NULL; 4348 4349 /* Memory manager related */ 4350 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4351 arch_phys_wc_del(adev->gmc.vram_mtrr); 4352 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4353 } 4354 } 4355 4356 /** 4357 * amdgpu_device_fini_hw - tear down the driver 4358 * 4359 * @adev: amdgpu_device pointer 4360 * 4361 * Tear down the driver info (all asics). 4362 * Called at driver shutdown. 4363 */ 4364 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4365 { 4366 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4367 flush_delayed_work(&adev->delayed_init_work); 4368 adev->shutdown = true; 4369 4370 /* make sure IB test finished before entering exclusive mode 4371 * to avoid preemption on IB test 4372 */ 4373 if (amdgpu_sriov_vf(adev)) { 4374 amdgpu_virt_request_full_gpu(adev, false); 4375 amdgpu_virt_fini_data_exchange(adev); 4376 } 4377 4378 /* disable all interrupts */ 4379 amdgpu_irq_disable_all(adev); 4380 if (adev->mode_info.mode_config_initialized) { 4381 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4382 drm_helper_force_disable_all(adev_to_drm(adev)); 4383 else 4384 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4385 } 4386 amdgpu_fence_driver_hw_fini(adev); 4387 4388 if (adev->mman.initialized) 4389 drain_workqueue(adev->mman.bdev.wq); 4390 4391 if (adev->pm.sysfs_initialized) 4392 amdgpu_pm_sysfs_fini(adev); 4393 if (adev->ucode_sysfs_en) 4394 amdgpu_ucode_sysfs_fini(adev); 4395 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4396 amdgpu_fru_sysfs_fini(adev); 4397 4398 amdgpu_reg_state_sysfs_fini(adev); 4399 4400 /* disable ras feature must before hw fini */ 4401 amdgpu_ras_pre_fini(adev); 4402 4403 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4404 4405 amdgpu_device_ip_fini_early(adev); 4406 4407 amdgpu_irq_fini_hw(adev); 4408 4409 if (adev->mman.initialized) 4410 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4411 4412 amdgpu_gart_dummy_page_fini(adev); 4413 4414 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4415 amdgpu_device_unmap_mmio(adev); 4416 4417 } 4418 4419 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4420 { 4421 int idx; 4422 bool px; 4423 4424 amdgpu_fence_driver_sw_fini(adev); 4425 amdgpu_device_ip_fini(adev); 4426 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4427 adev->accel_working = false; 4428 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4429 4430 amdgpu_reset_fini(adev); 4431 4432 /* free i2c buses */ 4433 if (!amdgpu_device_has_dc_support(adev)) 4434 amdgpu_i2c_fini(adev); 4435 4436 if (amdgpu_emu_mode != 1) 4437 amdgpu_atombios_fini(adev); 4438 4439 kfree(adev->bios); 4440 adev->bios = NULL; 4441 4442 kfree(adev->fru_info); 4443 adev->fru_info = NULL; 4444 4445 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4446 4447 if (px || 
(!dev_is_removable(&adev->pdev->dev) && 4448 apple_gmux_detect(NULL, NULL))) 4449 vga_switcheroo_unregister_client(adev->pdev); 4450 4451 if (px) 4452 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4453 4454 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4455 vga_client_unregister(adev->pdev); 4456 4457 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4458 4459 iounmap(adev->rmmio); 4460 adev->rmmio = NULL; 4461 amdgpu_doorbell_fini(adev); 4462 drm_dev_exit(idx); 4463 } 4464 4465 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4466 amdgpu_pmu_fini(adev); 4467 if (adev->mman.discovery_bin) 4468 amdgpu_discovery_fini(adev); 4469 4470 amdgpu_reset_put_reset_domain(adev->reset_domain); 4471 adev->reset_domain = NULL; 4472 4473 kfree(adev->pci_state); 4474 4475 } 4476 4477 /** 4478 * amdgpu_device_evict_resources - evict device resources 4479 * @adev: amdgpu device object 4480 * 4481 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4482 * of the vram memory type. Mainly used for evicting device resources 4483 * at suspend time. 4484 * 4485 */ 4486 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4487 { 4488 int ret; 4489 4490 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4491 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4492 return 0; 4493 4494 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4495 if (ret) 4496 DRM_WARN("evicting device resources failed\n"); 4497 return ret; 4498 } 4499 4500 /* 4501 * Suspend & resume. 4502 */ 4503 /** 4504 * amdgpu_device_prepare - prepare for device suspend 4505 * 4506 * @dev: drm dev pointer 4507 * 4508 * Prepare to put the hw in the suspend state (all asics). 4509 * Returns 0 for success or an error on failure. 4510 * Called at driver suspend. 4511 */ 4512 int amdgpu_device_prepare(struct drm_device *dev) 4513 { 4514 struct amdgpu_device *adev = drm_to_adev(dev); 4515 int i, r; 4516 4517 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4518 return 0; 4519 4520 /* Evict the majority of BOs before starting suspend sequence */ 4521 r = amdgpu_device_evict_resources(adev); 4522 if (r) 4523 return r; 4524 4525 for (i = 0; i < adev->num_ip_blocks; i++) { 4526 if (!adev->ip_blocks[i].status.valid) 4527 continue; 4528 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4529 continue; 4530 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4531 if (r) 4532 return r; 4533 } 4534 4535 return 0; 4536 } 4537 4538 /** 4539 * amdgpu_device_suspend - initiate device suspend 4540 * 4541 * @dev: drm dev pointer 4542 * @fbcon : notify the fbdev of suspend 4543 * 4544 * Puts the hw in the suspend state (all asics). 4545 * Returns 0 for success or an error on failure. 4546 * Called at driver suspend. 
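 *
 * A minimal sketch of the expected pairing from the PM callbacks
 * (simplified; the actual entry points are the amdgpu_pmops_* handlers
 * in amdgpu_drv.c):
 *
 *   r = amdgpu_device_suspend(drm_dev, true);
 *   ...
 *   r = amdgpu_device_resume(drm_dev, true);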
4547 */ 4548 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4549 { 4550 struct amdgpu_device *adev = drm_to_adev(dev); 4551 int r = 0; 4552 4553 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4554 return 0; 4555 4556 adev->in_suspend = true; 4557 4558 if (amdgpu_sriov_vf(adev)) { 4559 amdgpu_virt_fini_data_exchange(adev); 4560 r = amdgpu_virt_request_full_gpu(adev, false); 4561 if (r) 4562 return r; 4563 } 4564 4565 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4566 DRM_WARN("smart shift update failed\n"); 4567 4568 if (fbcon) 4569 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4570 4571 cancel_delayed_work_sync(&adev->delayed_init_work); 4572 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4573 4574 amdgpu_ras_suspend(adev); 4575 4576 amdgpu_device_ip_suspend_phase1(adev); 4577 4578 if (!adev->in_s0ix) 4579 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4580 4581 r = amdgpu_device_evict_resources(adev); 4582 if (r) 4583 return r; 4584 4585 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4586 4587 amdgpu_fence_driver_hw_fini(adev); 4588 4589 amdgpu_device_ip_suspend_phase2(adev); 4590 4591 if (amdgpu_sriov_vf(adev)) 4592 amdgpu_virt_release_full_gpu(adev, false); 4593 4594 r = amdgpu_dpm_notify_rlc_state(adev, false); 4595 if (r) 4596 return r; 4597 4598 return 0; 4599 } 4600 4601 /** 4602 * amdgpu_device_resume - initiate device resume 4603 * 4604 * @dev: drm dev pointer 4605 * @fbcon : notify the fbdev of resume 4606 * 4607 * Bring the hw back to operating state (all asics). 4608 * Returns 0 for success or an error on failure. 4609 * Called at driver resume. 4610 */ 4611 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4612 { 4613 struct amdgpu_device *adev = drm_to_adev(dev); 4614 int r = 0; 4615 4616 if (amdgpu_sriov_vf(adev)) { 4617 r = amdgpu_virt_request_full_gpu(adev, true); 4618 if (r) 4619 return r; 4620 } 4621 4622 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4623 return 0; 4624 4625 if (adev->in_s0ix) 4626 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4627 4628 /* post card */ 4629 if (amdgpu_device_need_post(adev)) { 4630 r = amdgpu_device_asic_init(adev); 4631 if (r) 4632 dev_err(adev->dev, "amdgpu asic init failed\n"); 4633 } 4634 4635 r = amdgpu_device_ip_resume(adev); 4636 4637 if (r) { 4638 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4639 goto exit; 4640 } 4641 amdgpu_fence_driver_hw_init(adev); 4642 4643 if (!adev->in_s0ix) { 4644 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4645 if (r) 4646 goto exit; 4647 } 4648 4649 r = amdgpu_device_ip_late_init(adev); 4650 if (r) 4651 goto exit; 4652 4653 queue_delayed_work(system_wq, &adev->delayed_init_work, 4654 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4655 exit: 4656 if (amdgpu_sriov_vf(adev)) { 4657 amdgpu_virt_init_data_exchange(adev); 4658 amdgpu_virt_release_full_gpu(adev, true); 4659 } 4660 4661 if (r) 4662 return r; 4663 4664 /* Make sure IB tests flushed */ 4665 flush_delayed_work(&adev->delayed_init_work); 4666 4667 if (fbcon) 4668 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4669 4670 amdgpu_ras_resume(adev); 4671 4672 if (adev->mode_info.num_crtc) { 4673 /* 4674 * Most of the connector probing functions try to acquire runtime pm 4675 * refs to ensure that the GPU is powered on when connector polling is 4676 * performed. Since we're calling this from a runtime PM callback, 4677 * trying to acquire rpm refs will cause us to deadlock. 
4678 * 4679 * Since we're guaranteed to be holding the rpm lock, it's safe to 4680 * temporarily disable the rpm helpers so this doesn't deadlock us. 4681 */ 4682 #ifdef CONFIG_PM 4683 dev->dev->power.disable_depth++; 4684 #endif 4685 if (!adev->dc_enabled) 4686 drm_helper_hpd_irq_event(dev); 4687 else 4688 drm_kms_helper_hotplug_event(dev); 4689 #ifdef CONFIG_PM 4690 dev->dev->power.disable_depth--; 4691 #endif 4692 } 4693 adev->in_suspend = false; 4694 4695 if (adev->enable_mes) 4696 amdgpu_mes_self_test(adev); 4697 4698 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4699 DRM_WARN("smart shift update failed\n"); 4700 4701 return 0; 4702 } 4703 4704 /** 4705 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4706 * 4707 * @adev: amdgpu_device pointer 4708 * 4709 * The list of all the hardware IPs that make up the asic is walked and 4710 * the check_soft_reset callbacks are run. check_soft_reset determines 4711 * if the asic is still hung or not. 4712 * Returns true if any of the IPs are still in a hung state, false if not. 4713 */ 4714 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4715 { 4716 int i; 4717 bool asic_hang = false; 4718 4719 if (amdgpu_sriov_vf(adev)) 4720 return true; 4721 4722 if (amdgpu_asic_need_full_reset(adev)) 4723 return true; 4724 4725 for (i = 0; i < adev->num_ip_blocks; i++) { 4726 if (!adev->ip_blocks[i].status.valid) 4727 continue; 4728 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4729 adev->ip_blocks[i].status.hang = 4730 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4731 if (adev->ip_blocks[i].status.hang) { 4732 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4733 asic_hang = true; 4734 } 4735 } 4736 return asic_hang; 4737 } 4738 4739 /** 4740 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4741 * 4742 * @adev: amdgpu_device pointer 4743 * 4744 * The list of all the hardware IPs that make up the asic is walked and the 4745 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4746 * handles any IP specific hardware or software state changes that are 4747 * necessary for a soft reset to succeed. 4748 * Returns 0 on success, negative error code on failure. 4749 */ 4750 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4751 { 4752 int i, r = 0; 4753 4754 for (i = 0; i < adev->num_ip_blocks; i++) { 4755 if (!adev->ip_blocks[i].status.valid) 4756 continue; 4757 if (adev->ip_blocks[i].status.hang && 4758 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4759 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4760 if (r) 4761 return r; 4762 } 4763 } 4764 4765 return 0; 4766 } 4767 4768 /** 4769 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4770 * 4771 * @adev: amdgpu_device pointer 4772 * 4773 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4774 * reset is necessary to recover. 4775 * Returns true if a full asic reset is required, false if not. 
4776 */ 4777 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4778 { 4779 int i; 4780 4781 if (amdgpu_asic_need_full_reset(adev)) 4782 return true; 4783 4784 for (i = 0; i < adev->num_ip_blocks; i++) { 4785 if (!adev->ip_blocks[i].status.valid) 4786 continue; 4787 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4788 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4789 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4790 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4791 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4792 if (adev->ip_blocks[i].status.hang) { 4793 dev_info(adev->dev, "Some block need full reset!\n"); 4794 return true; 4795 } 4796 } 4797 } 4798 return false; 4799 } 4800 4801 /** 4802 * amdgpu_device_ip_soft_reset - do a soft reset 4803 * 4804 * @adev: amdgpu_device pointer 4805 * 4806 * The list of all the hardware IPs that make up the asic is walked and the 4807 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4808 * IP specific hardware or software state changes that are necessary to soft 4809 * reset the IP. 4810 * Returns 0 on success, negative error code on failure. 4811 */ 4812 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4813 { 4814 int i, r = 0; 4815 4816 for (i = 0; i < adev->num_ip_blocks; i++) { 4817 if (!adev->ip_blocks[i].status.valid) 4818 continue; 4819 if (adev->ip_blocks[i].status.hang && 4820 adev->ip_blocks[i].version->funcs->soft_reset) { 4821 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4822 if (r) 4823 return r; 4824 } 4825 } 4826 4827 return 0; 4828 } 4829 4830 /** 4831 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4832 * 4833 * @adev: amdgpu_device pointer 4834 * 4835 * The list of all the hardware IPs that make up the asic is walked and the 4836 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4837 * handles any IP specific hardware or software state changes that are 4838 * necessary after the IP has been soft reset. 4839 * Returns 0 on success, negative error code on failure. 4840 */ 4841 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4842 { 4843 int i, r = 0; 4844 4845 for (i = 0; i < adev->num_ip_blocks; i++) { 4846 if (!adev->ip_blocks[i].status.valid) 4847 continue; 4848 if (adev->ip_blocks[i].status.hang && 4849 adev->ip_blocks[i].version->funcs->post_soft_reset) 4850 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4851 if (r) 4852 return r; 4853 } 4854 4855 return 0; 4856 } 4857 4858 /** 4859 * amdgpu_device_recover_vram - Recover some VRAM contents 4860 * 4861 * @adev: amdgpu_device pointer 4862 * 4863 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4864 * restore things like GPUVM page tables after a GPU reset where 4865 * the contents of VRAM might be lost. 4866 * 4867 * Returns: 4868 * 0 on success, negative error code on failure. 
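 *
 * Note that only BOs on adev->shadow_list whose shadow is still resident
 * in GTT and whose parent BO sits in VRAM are restored; evicted shadows
 * are skipped.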
4869 */ 4870 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4871 { 4872 struct dma_fence *fence = NULL, *next = NULL; 4873 struct amdgpu_bo *shadow; 4874 struct amdgpu_bo_vm *vmbo; 4875 long r = 1, tmo; 4876 4877 if (amdgpu_sriov_runtime(adev)) 4878 tmo = msecs_to_jiffies(8000); 4879 else 4880 tmo = msecs_to_jiffies(100); 4881 4882 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4883 mutex_lock(&adev->shadow_list_lock); 4884 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4885 /* If vm is compute context or adev is APU, shadow will be NULL */ 4886 if (!vmbo->shadow) 4887 continue; 4888 shadow = vmbo->shadow; 4889 4890 /* No need to recover an evicted BO */ 4891 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4892 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4893 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4894 continue; 4895 4896 r = amdgpu_bo_restore_shadow(shadow, &next); 4897 if (r) 4898 break; 4899 4900 if (fence) { 4901 tmo = dma_fence_wait_timeout(fence, false, tmo); 4902 dma_fence_put(fence); 4903 fence = next; 4904 if (tmo == 0) { 4905 r = -ETIMEDOUT; 4906 break; 4907 } else if (tmo < 0) { 4908 r = tmo; 4909 break; 4910 } 4911 } else { 4912 fence = next; 4913 } 4914 } 4915 mutex_unlock(&adev->shadow_list_lock); 4916 4917 if (fence) 4918 tmo = dma_fence_wait_timeout(fence, false, tmo); 4919 dma_fence_put(fence); 4920 4921 if (r < 0 || tmo <= 0) { 4922 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4923 return -EIO; 4924 } 4925 4926 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4927 return 0; 4928 } 4929 4930 4931 /** 4932 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4933 * 4934 * @adev: amdgpu_device pointer 4935 * @from_hypervisor: request from hypervisor 4936 * 4937 * do VF FLR and reinitialize Asic 4938 * return 0 means succeeded otherwise failed 4939 */ 4940 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4941 bool from_hypervisor) 4942 { 4943 int r; 4944 struct amdgpu_hive_info *hive = NULL; 4945 int retry_limit = 0; 4946 4947 retry: 4948 amdgpu_amdkfd_pre_reset(adev); 4949 4950 if (from_hypervisor) 4951 r = amdgpu_virt_request_full_gpu(adev, true); 4952 else 4953 r = amdgpu_virt_reset_gpu(adev); 4954 if (r) 4955 return r; 4956 amdgpu_irq_gpu_reset_resume_helper(adev); 4957 4958 /* some sw clean up VF needs to do before recover */ 4959 amdgpu_virt_post_reset(adev); 4960 4961 /* Resume IP prior to SMC */ 4962 r = amdgpu_device_ip_reinit_early_sriov(adev); 4963 if (r) 4964 goto error; 4965 4966 amdgpu_virt_init_data_exchange(adev); 4967 4968 r = amdgpu_device_fw_loading(adev); 4969 if (r) 4970 return r; 4971 4972 /* now we are okay to resume SMC/CP/SDMA */ 4973 r = amdgpu_device_ip_reinit_late_sriov(adev); 4974 if (r) 4975 goto error; 4976 4977 hive = amdgpu_get_xgmi_hive(adev); 4978 /* Update PSP FW topology after reset */ 4979 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4980 r = amdgpu_xgmi_update_topology(hive, adev); 4981 4982 if (hive) 4983 amdgpu_put_xgmi_hive(hive); 4984 4985 if (!r) { 4986 r = amdgpu_ib_ring_tests(adev); 4987 4988 amdgpu_amdkfd_post_reset(adev); 4989 } 4990 4991 error: 4992 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4993 amdgpu_inc_vram_lost(adev); 4994 r = amdgpu_device_recover_vram(adev); 4995 } 4996 amdgpu_virt_release_full_gpu(adev, true); 4997 4998 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4999 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 5000 retry_limit++; 5001 goto 
retry; 5002 } else 5003 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 5004 } 5005 5006 return r; 5007 } 5008 5009 /** 5010 * amdgpu_device_has_job_running - check if there is any job in mirror list 5011 * 5012 * @adev: amdgpu_device pointer 5013 * 5014 * check if there is any job in mirror list 5015 */ 5016 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5017 { 5018 int i; 5019 struct drm_sched_job *job; 5020 5021 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5022 struct amdgpu_ring *ring = adev->rings[i]; 5023 5024 if (!amdgpu_ring_sched_ready(ring)) 5025 continue; 5026 5027 spin_lock(&ring->sched.job_list_lock); 5028 job = list_first_entry_or_null(&ring->sched.pending_list, 5029 struct drm_sched_job, list); 5030 spin_unlock(&ring->sched.job_list_lock); 5031 if (job) 5032 return true; 5033 } 5034 return false; 5035 } 5036 5037 /** 5038 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5039 * 5040 * @adev: amdgpu_device pointer 5041 * 5042 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5043 * a hung GPU. 5044 */ 5045 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5046 { 5047 5048 if (amdgpu_gpu_recovery == 0) 5049 goto disabled; 5050 5051 /* Skip soft reset check in fatal error mode */ 5052 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5053 return true; 5054 5055 if (amdgpu_sriov_vf(adev)) 5056 return true; 5057 5058 if (amdgpu_gpu_recovery == -1) { 5059 switch (adev->asic_type) { 5060 #ifdef CONFIG_DRM_AMDGPU_SI 5061 case CHIP_VERDE: 5062 case CHIP_TAHITI: 5063 case CHIP_PITCAIRN: 5064 case CHIP_OLAND: 5065 case CHIP_HAINAN: 5066 #endif 5067 #ifdef CONFIG_DRM_AMDGPU_CIK 5068 case CHIP_KAVERI: 5069 case CHIP_KABINI: 5070 case CHIP_MULLINS: 5071 #endif 5072 case CHIP_CARRIZO: 5073 case CHIP_STONEY: 5074 case CHIP_CYAN_SKILLFISH: 5075 goto disabled; 5076 default: 5077 break; 5078 } 5079 } 5080 5081 return true; 5082 5083 disabled: 5084 dev_info(adev->dev, "GPU recovery disabled.\n"); 5085 return false; 5086 } 5087 5088 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5089 { 5090 u32 i; 5091 int ret = 0; 5092 5093 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5094 5095 dev_info(adev->dev, "GPU mode1 reset\n"); 5096 5097 /* disable BM */ 5098 pci_clear_master(adev->pdev); 5099 5100 amdgpu_device_cache_pci_state(adev->pdev); 5101 5102 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5103 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5104 ret = amdgpu_dpm_mode1_reset(adev); 5105 } else { 5106 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5107 ret = psp_gpu_reset(adev); 5108 } 5109 5110 if (ret) 5111 goto mode1_reset_failed; 5112 5113 amdgpu_device_load_pci_state(adev->pdev); 5114 ret = amdgpu_psp_wait_for_bootloader(adev); 5115 if (ret) 5116 goto mode1_reset_failed; 5117 5118 /* wait for asic to come out of reset */ 5119 for (i = 0; i < adev->usec_timeout; i++) { 5120 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5121 5122 if (memsize != 0xffffffff) 5123 break; 5124 udelay(1); 5125 } 5126 5127 if (i >= adev->usec_timeout) { 5128 ret = -ETIMEDOUT; 5129 goto mode1_reset_failed; 5130 } 5131 5132 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5133 5134 return 0; 5135 5136 mode1_reset_failed: 5137 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5138 return ret; 5139 } 5140 5141 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5142 struct amdgpu_reset_context *reset_context) 5143 { 5144 int i, r = 0; 5145 struct amdgpu_job *job = NULL; 5146 bool need_full_reset = 
5147 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5148 5149 if (reset_context->reset_req_dev == adev) 5150 job = reset_context->job; 5151 5152 if (amdgpu_sriov_vf(adev)) { 5153 /* stop the data exchange thread */ 5154 amdgpu_virt_fini_data_exchange(adev); 5155 } 5156 5157 amdgpu_fence_driver_isr_toggle(adev, true); 5158 5159 /* block all schedulers and reset given job's ring */ 5160 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5161 struct amdgpu_ring *ring = adev->rings[i]; 5162 5163 if (!amdgpu_ring_sched_ready(ring)) 5164 continue; 5165 5166 /* Clear job fence from fence drv to avoid force_completion 5167 * leave NULL and vm flush fence in fence drv 5168 */ 5169 amdgpu_fence_driver_clear_job_fences(ring); 5170 5171 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5172 amdgpu_fence_driver_force_completion(ring); 5173 } 5174 5175 amdgpu_fence_driver_isr_toggle(adev, false); 5176 5177 if (job && job->vm) 5178 drm_sched_increase_karma(&job->base); 5179 5180 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5181 /* If reset handler not implemented, continue; otherwise return */ 5182 if (r == -EOPNOTSUPP) 5183 r = 0; 5184 else 5185 return r; 5186 5187 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5188 if (!amdgpu_sriov_vf(adev)) { 5189 5190 if (!need_full_reset) 5191 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5192 5193 if (!need_full_reset && amdgpu_gpu_recovery && 5194 amdgpu_device_ip_check_soft_reset(adev)) { 5195 amdgpu_device_ip_pre_soft_reset(adev); 5196 r = amdgpu_device_ip_soft_reset(adev); 5197 amdgpu_device_ip_post_soft_reset(adev); 5198 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5199 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5200 need_full_reset = true; 5201 } 5202 } 5203 5204 if (need_full_reset) 5205 r = amdgpu_device_ip_suspend(adev); 5206 if (need_full_reset) 5207 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5208 else 5209 clear_bit(AMDGPU_NEED_FULL_RESET, 5210 &reset_context->flags); 5211 } 5212 5213 return r; 5214 } 5215 5216 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5217 { 5218 int i; 5219 5220 lockdep_assert_held(&adev->reset_domain->sem); 5221 5222 for (i = 0; i < adev->reset_info.num_regs; i++) { 5223 adev->reset_info.reset_dump_reg_value[i] = 5224 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5225 5226 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5227 adev->reset_info.reset_dump_reg_value[i]); 5228 } 5229 5230 return 0; 5231 } 5232 5233 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5234 struct amdgpu_reset_context *reset_context) 5235 { 5236 struct amdgpu_device *tmp_adev = NULL; 5237 bool need_full_reset, skip_hw_reset, vram_lost = false; 5238 int r = 0; 5239 5240 /* Try reset handler method first */ 5241 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5242 reset_list); 5243 amdgpu_reset_reg_dumps(tmp_adev); 5244 5245 reset_context->reset_device_list = device_list_handle; 5246 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5247 /* If reset handler not implemented, continue; otherwise return */ 5248 if (r == -EOPNOTSUPP) 5249 r = 0; 5250 else 5251 return r; 5252 5253 /* Reset handler not implemented, use the default method */ 5254 need_full_reset = 5255 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5256 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5257 5258 /* 5259 * ASIC reset has to be done on all 
XGMI hive nodes ASAP 5260 * to allow proper links negotiation in FW (within 1 sec) 5261 */ 5262 if (!skip_hw_reset && need_full_reset) { 5263 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5264 /* For XGMI run all resets in parallel to speed up the process */ 5265 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5266 tmp_adev->gmc.xgmi.pending_reset = false; 5267 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5268 r = -EALREADY; 5269 } else 5270 r = amdgpu_asic_reset(tmp_adev); 5271 5272 if (r) { 5273 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5274 r, adev_to_drm(tmp_adev)->unique); 5275 goto out; 5276 } 5277 } 5278 5279 /* For XGMI wait for all resets to complete before proceed */ 5280 if (!r) { 5281 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5282 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5283 flush_work(&tmp_adev->xgmi_reset_work); 5284 r = tmp_adev->asic_reset_res; 5285 if (r) 5286 break; 5287 } 5288 } 5289 } 5290 } 5291 5292 if (!r && amdgpu_ras_intr_triggered()) { 5293 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5294 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5295 } 5296 5297 amdgpu_ras_intr_cleared(); 5298 } 5299 5300 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5301 if (need_full_reset) { 5302 /* post card */ 5303 r = amdgpu_device_asic_init(tmp_adev); 5304 if (r) { 5305 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5306 } else { 5307 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5308 5309 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5310 if (r) 5311 goto out; 5312 5313 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5314 5315 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5316 5317 if (vram_lost) { 5318 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5319 amdgpu_inc_vram_lost(tmp_adev); 5320 } 5321 5322 r = amdgpu_device_fw_loading(tmp_adev); 5323 if (r) 5324 return r; 5325 5326 r = amdgpu_xcp_restore_partition_mode( 5327 tmp_adev->xcp_mgr); 5328 if (r) 5329 goto out; 5330 5331 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5332 if (r) 5333 goto out; 5334 5335 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5336 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5337 5338 if (vram_lost) 5339 amdgpu_device_fill_reset_magic(tmp_adev); 5340 5341 /* 5342 * Add this ASIC as tracked as reset was already 5343 * complete successfully. 5344 */ 5345 amdgpu_register_gpu_instance(tmp_adev); 5346 5347 if (!reset_context->hive && 5348 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5349 amdgpu_xgmi_add_device(tmp_adev); 5350 5351 r = amdgpu_device_ip_late_init(tmp_adev); 5352 if (r) 5353 goto out; 5354 5355 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5356 5357 /* 5358 * The GPU enters bad state once faulty pages 5359 * by ECC has reached the threshold, and ras 5360 * recovery is scheduled next. So add one check 5361 * here to break recovery if it indeed exceeds 5362 * bad page threshold, and remind user to 5363 * retire this GPU or setting one bigger 5364 * bad_page_threshold value to fix this once 5365 * probing driver again. 5366 */ 5367 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5368 /* must succeed. 
*/ 5369 amdgpu_ras_resume(tmp_adev); 5370 } else { 5371 r = -EINVAL; 5372 goto out; 5373 } 5374 5375 /* Update PSP FW topology after reset */ 5376 if (reset_context->hive && 5377 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5378 r = amdgpu_xgmi_update_topology( 5379 reset_context->hive, tmp_adev); 5380 } 5381 } 5382 5383 out: 5384 if (!r) { 5385 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5386 r = amdgpu_ib_ring_tests(tmp_adev); 5387 if (r) { 5388 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5389 need_full_reset = true; 5390 r = -EAGAIN; 5391 goto end; 5392 } 5393 } 5394 5395 if (!r) 5396 r = amdgpu_device_recover_vram(tmp_adev); 5397 else 5398 tmp_adev->asic_reset_res = r; 5399 } 5400 5401 end: 5402 if (need_full_reset) 5403 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5404 else 5405 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5406 return r; 5407 } 5408 5409 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5410 { 5411 5412 switch (amdgpu_asic_reset_method(adev)) { 5413 case AMD_RESET_METHOD_MODE1: 5414 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5415 break; 5416 case AMD_RESET_METHOD_MODE2: 5417 adev->mp1_state = PP_MP1_STATE_RESET; 5418 break; 5419 default: 5420 adev->mp1_state = PP_MP1_STATE_NONE; 5421 break; 5422 } 5423 } 5424 5425 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5426 { 5427 amdgpu_vf_error_trans_all(adev); 5428 adev->mp1_state = PP_MP1_STATE_NONE; 5429 } 5430 5431 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5432 { 5433 struct pci_dev *p = NULL; 5434 5435 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5436 adev->pdev->bus->number, 1); 5437 if (p) { 5438 pm_runtime_enable(&(p->dev)); 5439 pm_runtime_resume(&(p->dev)); 5440 } 5441 5442 pci_dev_put(p); 5443 } 5444 5445 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5446 { 5447 enum amd_reset_method reset_method; 5448 struct pci_dev *p = NULL; 5449 u64 expires; 5450 5451 /* 5452 * For now, only BACO and mode1 reset are confirmed 5453 * to suffer the audio issue without proper suspended. 5454 */ 5455 reset_method = amdgpu_asic_reset_method(adev); 5456 if ((reset_method != AMD_RESET_METHOD_BACO) && 5457 (reset_method != AMD_RESET_METHOD_MODE1)) 5458 return -EINVAL; 5459 5460 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5461 adev->pdev->bus->number, 1); 5462 if (!p) 5463 return -ENODEV; 5464 5465 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5466 if (!expires) 5467 /* 5468 * If we cannot get the audio device autosuspend delay, 5469 * a fixed 4S interval will be used. Considering 3S is 5470 * the audio controller default autosuspend delay setting. 5471 * 4S used here is guaranteed to cover that. 5472 */ 5473 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5474 5475 while (!pm_runtime_status_suspended(&(p->dev))) { 5476 if (!pm_runtime_suspend(&(p->dev))) 5477 break; 5478 5479 if (expires < ktime_get_mono_fast_ns()) { 5480 dev_warn(adev->dev, "failed to suspend display audio\n"); 5481 pci_dev_put(p); 5482 /* TODO: abort the succeeding gpu reset? 
*/ 5483 return -ETIMEDOUT; 5484 } 5485 } 5486 5487 pm_runtime_disable(&(p->dev)); 5488 5489 pci_dev_put(p); 5490 return 0; 5491 } 5492 5493 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5494 { 5495 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5496 5497 #if defined(CONFIG_DEBUG_FS) 5498 if (!amdgpu_sriov_vf(adev)) 5499 cancel_work(&adev->reset_work); 5500 #endif 5501 5502 if (adev->kfd.dev) 5503 cancel_work(&adev->kfd.reset_work); 5504 5505 if (amdgpu_sriov_vf(adev)) 5506 cancel_work(&adev->virt.flr_work); 5507 5508 if (con && adev->ras_enabled) 5509 cancel_work(&con->recovery_work); 5510 5511 } 5512 5513 /** 5514 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5515 * 5516 * @adev: amdgpu_device pointer 5517 * @job: which job trigger hang 5518 * @reset_context: amdgpu reset context pointer 5519 * 5520 * Attempt to reset the GPU if it has hung (all asics). 5521 * Attempt to do soft-reset or full-reset and reinitialize Asic 5522 * Returns 0 for success or an error on failure. 5523 */ 5524 5525 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5526 struct amdgpu_job *job, 5527 struct amdgpu_reset_context *reset_context) 5528 { 5529 struct list_head device_list, *device_list_handle = NULL; 5530 bool job_signaled = false; 5531 struct amdgpu_hive_info *hive = NULL; 5532 struct amdgpu_device *tmp_adev = NULL; 5533 int i, r = 0; 5534 bool need_emergency_restart = false; 5535 bool audio_suspended = false; 5536 5537 /* 5538 * Special case: RAS triggered and full reset isn't supported 5539 */ 5540 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5541 5542 /* 5543 * Flush RAM to disk so that after reboot 5544 * the user can read log and see why the system rebooted. 5545 */ 5546 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5547 amdgpu_ras_get_context(adev)->reboot) { 5548 DRM_WARN("Emergency reboot."); 5549 5550 ksys_sync_helper(); 5551 emergency_restart(); 5552 } 5553 5554 dev_info(adev->dev, "GPU %s begin!\n", 5555 need_emergency_restart ? "jobs stop":"reset"); 5556 5557 if (!amdgpu_sriov_vf(adev)) 5558 hive = amdgpu_get_xgmi_hive(adev); 5559 if (hive) 5560 mutex_lock(&hive->hive_lock); 5561 5562 reset_context->job = job; 5563 reset_context->hive = hive; 5564 /* 5565 * Build list of devices to reset. 5566 * In case we are in XGMI hive mode, resort the device list 5567 * to put adev in the 1st position. 5568 */ 5569 INIT_LIST_HEAD(&device_list); 5570 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5571 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5572 list_add_tail(&tmp_adev->reset_list, &device_list); 5573 if (adev->shutdown) 5574 tmp_adev->shutdown = true; 5575 } 5576 if (!list_is_first(&adev->reset_list, &device_list)) 5577 list_rotate_to_front(&adev->reset_list, &device_list); 5578 device_list_handle = &device_list; 5579 } else { 5580 list_add_tail(&adev->reset_list, &device_list); 5581 device_list_handle = &device_list; 5582 } 5583 5584 /* We need to lock reset domain only once both for XGMI and single device */ 5585 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5586 reset_list); 5587 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5588 5589 /* block all schedulers and reset given job's ring */ 5590 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5591 5592 amdgpu_device_set_mp1_state(tmp_adev); 5593 5594 /* 5595 * Try to put the audio codec into suspend state 5596 * before gpu reset started. 
5597 * 5598 * Due to the power domain of the graphics device 5599 * is shared with AZ power domain. Without this, 5600 * we may change the audio hardware from behind 5601 * the audio driver's back. That will trigger 5602 * some audio codec errors. 5603 */ 5604 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5605 audio_suspended = true; 5606 5607 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5608 5609 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5610 5611 if (!amdgpu_sriov_vf(tmp_adev)) 5612 amdgpu_amdkfd_pre_reset(tmp_adev); 5613 5614 /* 5615 * Mark these ASICs to be reseted as untracked first 5616 * And add them back after reset completed 5617 */ 5618 amdgpu_unregister_gpu_instance(tmp_adev); 5619 5620 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5621 5622 /* disable ras on ALL IPs */ 5623 if (!need_emergency_restart && 5624 amdgpu_device_ip_need_full_reset(tmp_adev)) 5625 amdgpu_ras_suspend(tmp_adev); 5626 5627 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5628 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5629 5630 if (!amdgpu_ring_sched_ready(ring)) 5631 continue; 5632 5633 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5634 5635 if (need_emergency_restart) 5636 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5637 } 5638 atomic_inc(&tmp_adev->gpu_reset_counter); 5639 } 5640 5641 if (need_emergency_restart) 5642 goto skip_sched_resume; 5643 5644 /* 5645 * Must check guilty signal here since after this point all old 5646 * HW fences are force signaled. 5647 * 5648 * job->base holds a reference to parent fence 5649 */ 5650 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5651 job_signaled = true; 5652 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5653 goto skip_hw_reset; 5654 } 5655 5656 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5657 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5658 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5659 /*TODO Should we stop ?*/ 5660 if (r) { 5661 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5662 r, adev_to_drm(tmp_adev)->unique); 5663 tmp_adev->asic_reset_res = r; 5664 } 5665 5666 /* 5667 * Drop all pending non scheduler resets. Scheduler resets 5668 * were already dropped during drm_sched_stop 5669 */ 5670 amdgpu_device_stop_pending_resets(tmp_adev); 5671 } 5672 5673 /* Actual ASIC resets if needed.*/ 5674 /* Host driver will handle XGMI hive reset for SRIOV */ 5675 if (amdgpu_sriov_vf(adev)) { 5676 r = amdgpu_device_reset_sriov(adev, job ? 
false : true); 5677 if (r) 5678 adev->asic_reset_res = r; 5679 5680 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5681 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5682 IP_VERSION(9, 4, 2) || 5683 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5684 amdgpu_ras_resume(adev); 5685 } else { 5686 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5687 if (r && r == -EAGAIN) 5688 goto retry; 5689 } 5690 5691 skip_hw_reset: 5692 5693 /* Post ASIC reset for all devs .*/ 5694 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5695 5696 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5697 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5698 5699 if (!amdgpu_ring_sched_ready(ring)) 5700 continue; 5701 5702 drm_sched_start(&ring->sched, true); 5703 } 5704 5705 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5706 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5707 5708 if (tmp_adev->asic_reset_res) 5709 r = tmp_adev->asic_reset_res; 5710 5711 tmp_adev->asic_reset_res = 0; 5712 5713 if (r) { 5714 /* bad news, how to tell it to userspace ? */ 5715 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5716 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5717 } else { 5718 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5719 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5720 DRM_WARN("smart shift update failed\n"); 5721 } 5722 } 5723 5724 skip_sched_resume: 5725 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5726 /* unlock kfd: SRIOV would do it separately */ 5727 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5728 amdgpu_amdkfd_post_reset(tmp_adev); 5729 5730 /* kfd_post_reset will do nothing if kfd device is not initialized, 5731 * need to bring up kfd here if it's not be initialized before 5732 */ 5733 if (!adev->kfd.init_complete) 5734 amdgpu_amdkfd_device_init(adev); 5735 5736 if (audio_suspended) 5737 amdgpu_device_resume_display_audio(tmp_adev); 5738 5739 amdgpu_device_unset_mp1_state(tmp_adev); 5740 5741 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5742 } 5743 5744 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5745 reset_list); 5746 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5747 5748 if (hive) { 5749 mutex_unlock(&hive->hive_lock); 5750 amdgpu_put_xgmi_hive(hive); 5751 } 5752 5753 if (r) 5754 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5755 5756 atomic_set(&adev->reset_domain->reset_res, r); 5757 return r; 5758 } 5759 5760 /** 5761 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 5762 * 5763 * @adev: amdgpu_device pointer 5764 * @speed: pointer to the speed of the link 5765 * @width: pointer to the width of the link 5766 * 5767 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5768 * first physical partner to an AMD dGPU. 5769 * This will exclude any virtual switches and links. 
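 *
 * Upstream bridges whose vendor ID is PCI_VENDOR_ID_ATI are treated as
 * switches internal to the dGPU package and are skipped during the walk.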
/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	while ((parent = pci_upstream_bridge(parent))) {
		/* skip upstream/downstream switches internal to dGPU */
		if (parent->vendor == PCI_VENDOR_ID_ATI)
			continue;
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		break;
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
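/*
 * A minimal sketch (not from this file) of how the pcie_gen_mask computed
 * above can be consumed: pick the highest platform-supported link speed.
 * The helper name is illustrative only.
 *
 *	static unsigned int example_max_pcie_gen(struct amdgpu_device *adev)
 *	{
 *		u32 mask = adev->pm.pcie_gen_mask;
 *
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5)
 *			return 5;
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
 *			return 4;
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
 *			return 3;
 *		if (mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2)
 *			return 2;
 *		return 1;
 *	}
 */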
/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}
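/*
 * A rough sketch (not from this file) of how the BACO helpers above are
 * typically paired by a runtime-PM style caller; error handling is trimmed
 * and the surrounding suspend/resume work is omitted.
 *
 *	r = amdgpu_device_baco_enter(drm_dev);
 *	if (r)
 *		return r;
 *	// ... device sits in BACO while idle ...
 *	r = amdgpu_device_baco_exit(drm_dev);
 */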
/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called by the PCI subsystem when an error is reported for this
 * device. Depending on the channel state, either allows recovery to continue,
 * prepares for a slot reset, or requests device removal.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}
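/*
 * A sketch of how the callbacks above, together with the slot_reset and
 * resume callbacks below, are wired into the PCI core via
 * struct pci_error_handlers.  This mirrors the usual AER recovery plumbing
 * and is illustrative rather than a copy of this driver's tables.
 *
 *	static const struct pci_error_handlers example_pci_err_handlers = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 *	// referenced from the pci_driver as: .err_handler = &example_pci_err_handlers
 */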
/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to
 * resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
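/*
 * A minimal sketch (not from this file) of the intended use of the HDP
 * helpers above: after the CPU writes GPU-visible memory through the BAR,
 * flush the HDP cache so a following GPU consumer observes the data, and
 * invalidate it before the CPU reads back data the GPU has produced.
 *
 *	// CPU wrote data at a BAR-mapped location the GPU will read:
 *	amdgpu_device_flush_hdp(adev, ring);	// ring may be NULL for an MMIO flush
 *
 *	// GPU produced data the CPU is about to read back:
 *	amdgpu_device_invalidate_hdp(adev, ring);
 */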
/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It will help to maintain error context when an error occurs.
 * Compared to a simple hang, the system will keep stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
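/*
 * A short sketch (not from this file) of a read-modify-write through the
 * indirect PCIe port accessors above; the register offset and bit name are
 * placeholders, not real definitions.
 *
 *	u32 tmp;
 *
 *	tmp = amdgpu_device_pcie_port_rreg(adev, example_reg_offset);
 *	tmp |= EXAMPLE_ENABLE_BIT;	// hypothetical bit
 *	amdgpu_device_pcie_port_wreg(adev, example_reg_offset, tmp);
 */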
/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
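/*
 * A minimal usage sketch (not from this file) for the polling helper above;
 * the register offset, name and bits are placeholders.
 *
 *	u32 r;
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, example_status_reg_offset,
 *				       "EXAMPLE_STATUS",
 *				       EXAMPLE_READY_BIT,	// expected value
 *				       EXAMPLE_READY_BIT);	// mask
 *	if (r)
 *		dev_err(adev->dev, "example block did not become ready\n");
 */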