1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/devcoredump.h> 36 #include <generated/utsrelease.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_aperture.h> 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_fb_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 78 #include <linux/suspend.h> 79 #include <drm/task_barrier.h> 80 #include <linux/pm_runtime.h> 81 82 #include <drm/drm_drv.h> 83 84 #if IS_ENABLED(CONFIG_X86) 85 #include <asm/intel-family.h> 86 #endif 87 88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 95 96 #define AMDGPU_RESUME_MS 2000 97 #define AMDGPU_MAX_RETRY_LIMIT 2 98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 99 100 static const struct drm_driver amdgpu_kms_driver; 101 102 const char *amdgpu_asic_name[] = { 103 "TAHITI", 104 "PITCAIRN", 105 "VERDE", 106 "OLAND", 107 "HAINAN", 108 "BONAIRE", 109 "KAVERI", 110 
"KABINI", 111 "HAWAII", 112 "MULLINS", 113 "TOPAZ", 114 "TONGA", 115 "FIJI", 116 "CARRIZO", 117 "STONEY", 118 "POLARIS10", 119 "POLARIS11", 120 "POLARIS12", 121 "VEGAM", 122 "VEGA10", 123 "VEGA12", 124 "VEGA20", 125 "RAVEN", 126 "ARCTURUS", 127 "RENOIR", 128 "ALDEBARAN", 129 "NAVI10", 130 "CYAN_SKILLFISH", 131 "NAVI14", 132 "NAVI12", 133 "SIENNA_CICHLID", 134 "NAVY_FLOUNDER", 135 "VANGOGH", 136 "DIMGREY_CAVEFISH", 137 "BEIGE_GOBY", 138 "YELLOW_CARP", 139 "IP DISCOVERY", 140 "LAST", 141 }; 142 143 /** 144 * DOC: pcie_replay_count 145 * 146 * The amdgpu driver provides a sysfs API for reporting the total number 147 * of PCIe replays (NAKs) 148 * The file pcie_replay_count is used for this and returns the total 149 * number of replays as a sum of the NAKs generated and NAKs received 150 */ 151 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 153 struct device_attribute *attr, char *buf) 154 { 155 struct drm_device *ddev = dev_get_drvdata(dev); 156 struct amdgpu_device *adev = drm_to_adev(ddev); 157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 158 159 return sysfs_emit(buf, "%llu\n", cnt); 160 } 161 162 static DEVICE_ATTR(pcie_replay_count, 0444, 163 amdgpu_device_get_pcie_replay_count, NULL); 164 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 166 167 168 /** 169 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 170 * 171 * @dev: drm_device pointer 172 * 173 * Returns true if the device is a dGPU with ATPX power control, 174 * otherwise return false. 175 */ 176 bool amdgpu_device_supports_px(struct drm_device *dev) 177 { 178 struct amdgpu_device *adev = drm_to_adev(dev); 179 180 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 181 return true; 182 return false; 183 } 184 185 /** 186 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 187 * 188 * @dev: drm_device pointer 189 * 190 * Returns true if the device is a dGPU with ACPI power control, 191 * otherwise return false. 192 */ 193 bool amdgpu_device_supports_boco(struct drm_device *dev) 194 { 195 struct amdgpu_device *adev = drm_to_adev(dev); 196 197 if (adev->has_pr3 || 198 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 199 return true; 200 return false; 201 } 202 203 /** 204 * amdgpu_device_supports_baco - Does the device support BACO 205 * 206 * @dev: drm_device pointer 207 * 208 * Returns true if the device supporte BACO, 209 * otherwise return false. 210 */ 211 bool amdgpu_device_supports_baco(struct drm_device *dev) 212 { 213 struct amdgpu_device *adev = drm_to_adev(dev); 214 215 return amdgpu_asic_supports_baco(adev); 216 } 217 218 /** 219 * amdgpu_device_supports_smart_shift - Is the device dGPU with 220 * smart shift support 221 * 222 * @dev: drm_device pointer 223 * 224 * Returns true if the device is a dGPU with Smart Shift support, 225 * otherwise returns false. 
226 */ 227 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 228 { 229 return (amdgpu_device_supports_boco(dev) && 230 amdgpu_acpi_is_power_shift_control_supported()); 231 } 232 233 /* 234 * VRAM access helper functions 235 */ 236 237 /** 238 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 239 * 240 * @adev: amdgpu_device pointer 241 * @pos: offset of the buffer in vram 242 * @buf: virtual address of the buffer in system memory 243 * @size: read/write size, sizeof(@buf) must > @size 244 * @write: true - write to vram, otherwise - read from vram 245 */ 246 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 247 void *buf, size_t size, bool write) 248 { 249 unsigned long flags; 250 uint32_t hi = ~0, tmp = 0; 251 uint32_t *data = buf; 252 uint64_t last; 253 int idx; 254 255 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 256 return; 257 258 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 259 260 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 261 for (last = pos + size; pos < last; pos += 4) { 262 tmp = pos >> 31; 263 264 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 265 if (tmp != hi) { 266 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 267 hi = tmp; 268 } 269 if (write) 270 WREG32_NO_KIQ(mmMM_DATA, *data++); 271 else 272 *data++ = RREG32_NO_KIQ(mmMM_DATA); 273 } 274 275 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 276 drm_dev_exit(idx); 277 } 278 279 /** 280 * amdgpu_device_aper_access - access vram by vram aperature 281 * 282 * @adev: amdgpu_device pointer 283 * @pos: offset of the buffer in vram 284 * @buf: virtual address of the buffer in system memory 285 * @size: read/write size, sizeof(@buf) must > @size 286 * @write: true - write to vram, otherwise - read from vram 287 * 288 * The return value means how many bytes have been transferred. 
289 */ 290 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 291 void *buf, size_t size, bool write) 292 { 293 #ifdef CONFIG_64BIT 294 void __iomem *addr; 295 size_t count = 0; 296 uint64_t last; 297 298 if (!adev->mman.aper_base_kaddr) 299 return 0; 300 301 last = min(pos + size, adev->gmc.visible_vram_size); 302 if (last > pos) { 303 addr = adev->mman.aper_base_kaddr + pos; 304 count = last - pos; 305 306 if (write) { 307 memcpy_toio(addr, buf, count); 308 /* Make sure HDP write cache flush happens without any reordering 309 * after the system memory contents are sent over PCIe device 310 */ 311 mb(); 312 amdgpu_device_flush_hdp(adev, NULL); 313 } else { 314 amdgpu_device_invalidate_hdp(adev, NULL); 315 /* Make sure HDP read cache is invalidated before issuing a read 316 * to the PCIe device 317 */ 318 mb(); 319 memcpy_fromio(buf, addr, count); 320 } 321 322 } 323 324 return count; 325 #else 326 return 0; 327 #endif 328 } 329 330 /** 331 * amdgpu_device_vram_access - read/write a buffer in vram 332 * 333 * @adev: amdgpu_device pointer 334 * @pos: offset of the buffer in vram 335 * @buf: virtual address of the buffer in system memory 336 * @size: read/write size, sizeof(@buf) must > @size 337 * @write: true - write to vram, otherwise - read from vram 338 */ 339 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 340 void *buf, size_t size, bool write) 341 { 342 size_t count; 343 344 /* try to using vram apreature to access vram first */ 345 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 346 size -= count; 347 if (size) { 348 /* using MM to access rest vram */ 349 pos += count; 350 buf += count; 351 amdgpu_device_mm_access(adev, pos, buf, size, write); 352 } 353 } 354 355 /* 356 * register access helper functions. 357 */ 358 359 /* Check if hw access should be skipped because of hotplug or device error */ 360 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 361 { 362 if (adev->no_hw_access) 363 return true; 364 365 #ifdef CONFIG_LOCKDEP 366 /* 367 * This is a bit complicated to understand, so worth a comment. What we assert 368 * here is that the GPU reset is not running on another thread in parallel. 369 * 370 * For this we trylock the read side of the reset semaphore, if that succeeds 371 * we know that the reset is not running in paralell. 372 * 373 * If the trylock fails we assert that we are either already holding the read 374 * side of the lock or are the reset thread itself and hold the write side of 375 * the lock. 376 */ 377 if (in_task()) { 378 if (down_read_trylock(&adev->reset_domain->sem)) 379 up_read(&adev->reset_domain->sem); 380 else 381 lockdep_assert_held(&adev->reset_domain->sem); 382 } 383 #endif 384 return false; 385 } 386 387 /** 388 * amdgpu_device_rreg - read a memory mapped IO or indirect register 389 * 390 * @adev: amdgpu_device pointer 391 * @reg: dword aligned register offset 392 * @acc_flags: access flags which require special behavior 393 * 394 * Returns the 32 bit value from the offset specified. 
395 */ 396 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 397 uint32_t reg, uint32_t acc_flags) 398 { 399 uint32_t ret; 400 401 if (amdgpu_device_skip_hw_access(adev)) 402 return 0; 403 404 if ((reg * 4) < adev->rmmio_size) { 405 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 406 amdgpu_sriov_runtime(adev) && 407 down_read_trylock(&adev->reset_domain->sem)) { 408 ret = amdgpu_kiq_rreg(adev, reg); 409 up_read(&adev->reset_domain->sem); 410 } else { 411 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 412 } 413 } else { 414 ret = adev->pcie_rreg(adev, reg * 4); 415 } 416 417 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 418 419 return ret; 420 } 421 422 /* 423 * MMIO register read with bytes helper functions 424 * @offset:bytes offset from MMIO start 425 */ 426 427 /** 428 * amdgpu_mm_rreg8 - read a memory mapped IO register 429 * 430 * @adev: amdgpu_device pointer 431 * @offset: byte aligned register offset 432 * 433 * Returns the 8 bit value from the offset specified. 434 */ 435 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 436 { 437 if (amdgpu_device_skip_hw_access(adev)) 438 return 0; 439 440 if (offset < adev->rmmio_size) 441 return (readb(adev->rmmio + offset)); 442 BUG(); 443 } 444 445 /* 446 * MMIO register write with bytes helper functions 447 * @offset:bytes offset from MMIO start 448 * @value: the value want to be written to the register 449 */ 450 451 /** 452 * amdgpu_mm_wreg8 - read a memory mapped IO register 453 * 454 * @adev: amdgpu_device pointer 455 * @offset: byte aligned register offset 456 * @value: 8 bit value to write 457 * 458 * Writes the value specified to the offset specified. 459 */ 460 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 461 { 462 if (amdgpu_device_skip_hw_access(adev)) 463 return; 464 465 if (offset < adev->rmmio_size) 466 writeb(value, adev->rmmio + offset); 467 else 468 BUG(); 469 } 470 471 /** 472 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 473 * 474 * @adev: amdgpu_device pointer 475 * @reg: dword aligned register offset 476 * @v: 32 bit value to write to the register 477 * @acc_flags: access flags which require special behavior 478 * 479 * Writes the value specified to the offset specified. 
480 */ 481 void amdgpu_device_wreg(struct amdgpu_device *adev, 482 uint32_t reg, uint32_t v, 483 uint32_t acc_flags) 484 { 485 if (amdgpu_device_skip_hw_access(adev)) 486 return; 487 488 if ((reg * 4) < adev->rmmio_size) { 489 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 490 amdgpu_sriov_runtime(adev) && 491 down_read_trylock(&adev->reset_domain->sem)) { 492 amdgpu_kiq_wreg(adev, reg, v); 493 up_read(&adev->reset_domain->sem); 494 } else { 495 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 496 } 497 } else { 498 adev->pcie_wreg(adev, reg * 4, v); 499 } 500 501 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 502 } 503 504 /** 505 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 506 * 507 * @adev: amdgpu_device pointer 508 * @reg: mmio/rlc register 509 * @v: value to write 510 * @xcc_id: xcc accelerated compute core id 511 * 512 * this function is invoked only for the debugfs register access 513 */ 514 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 515 uint32_t reg, uint32_t v, 516 uint32_t xcc_id) 517 { 518 if (amdgpu_device_skip_hw_access(adev)) 519 return; 520 521 if (amdgpu_sriov_fullaccess(adev) && 522 adev->gfx.rlc.funcs && 523 adev->gfx.rlc.funcs->is_rlcg_access_range) { 524 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 525 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 526 } else if ((reg * 4) >= adev->rmmio_size) { 527 adev->pcie_wreg(adev, reg * 4, v); 528 } else { 529 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 530 } 531 } 532 533 /** 534 * amdgpu_device_indirect_rreg - read an indirect register 535 * 536 * @adev: amdgpu_device pointer 537 * @reg_addr: indirect register address to read from 538 * 539 * Returns the value of indirect register @reg_addr 540 */ 541 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 542 u32 reg_addr) 543 { 544 unsigned long flags, pcie_index, pcie_data; 545 void __iomem *pcie_index_offset; 546 void __iomem *pcie_data_offset; 547 u32 r; 548 549 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 550 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 551 552 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 553 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 554 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 555 556 writel(reg_addr, pcie_index_offset); 557 readl(pcie_index_offset); 558 r = readl(pcie_data_offset); 559 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 560 561 return r; 562 } 563 564 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 565 u64 reg_addr) 566 { 567 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 568 u32 r; 569 void __iomem *pcie_index_offset; 570 void __iomem *pcie_index_hi_offset; 571 void __iomem *pcie_data_offset; 572 573 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 574 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 575 if (adev->nbio.funcs->get_pcie_index_hi_offset) 576 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 577 else 578 pcie_index_hi = 0; 579 580 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 581 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 582 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 583 if (pcie_index_hi != 0) 584 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 585 pcie_index_hi * 4; 586 587 writel(reg_addr, pcie_index_offset); 588 readl(pcie_index_offset); 589 if (pcie_index_hi != 0) { 590 writel((reg_addr >> 32) & 0xff, 
pcie_index_hi_offset); 591 readl(pcie_index_hi_offset); 592 } 593 r = readl(pcie_data_offset); 594 595 /* clear the high bits */ 596 if (pcie_index_hi != 0) { 597 writel(0, pcie_index_hi_offset); 598 readl(pcie_index_hi_offset); 599 } 600 601 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 602 603 return r; 604 } 605 606 /** 607 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 608 * 609 * @adev: amdgpu_device pointer 610 * @reg_addr: indirect register address to read from 611 * 612 * Returns the value of indirect register @reg_addr 613 */ 614 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 615 u32 reg_addr) 616 { 617 unsigned long flags, pcie_index, pcie_data; 618 void __iomem *pcie_index_offset; 619 void __iomem *pcie_data_offset; 620 u64 r; 621 622 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 623 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 624 625 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 626 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 627 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 628 629 /* read low 32 bits */ 630 writel(reg_addr, pcie_index_offset); 631 readl(pcie_index_offset); 632 r = readl(pcie_data_offset); 633 /* read high 32 bits */ 634 writel(reg_addr + 4, pcie_index_offset); 635 readl(pcie_index_offset); 636 r |= ((u64)readl(pcie_data_offset) << 32); 637 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 638 639 return r; 640 } 641 642 /** 643 * amdgpu_device_indirect_wreg - write an indirect register address 644 * 645 * @adev: amdgpu_device pointer 646 * @reg_addr: indirect register offset 647 * @reg_data: indirect register data 648 * 649 */ 650 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 651 u32 reg_addr, u32 reg_data) 652 { 653 unsigned long flags, pcie_index, pcie_data; 654 void __iomem *pcie_index_offset; 655 void __iomem *pcie_data_offset; 656 657 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 658 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 659 660 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 661 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 662 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 663 664 writel(reg_addr, pcie_index_offset); 665 readl(pcie_index_offset); 666 writel(reg_data, pcie_data_offset); 667 readl(pcie_data_offset); 668 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 669 } 670 671 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 672 u64 reg_addr, u32 reg_data) 673 { 674 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 675 void __iomem *pcie_index_offset; 676 void __iomem *pcie_index_hi_offset; 677 void __iomem *pcie_data_offset; 678 679 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 680 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 681 if (adev->nbio.funcs->get_pcie_index_hi_offset) 682 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 683 else 684 pcie_index_hi = 0; 685 686 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 687 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 688 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 689 if (pcie_index_hi != 0) 690 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 691 pcie_index_hi * 4; 692 693 writel(reg_addr, pcie_index_offset); 694 readl(pcie_index_offset); 695 if (pcie_index_hi != 0) { 696 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 697 readl(pcie_index_hi_offset); 698 } 699 writel(reg_data, 
pcie_data_offset); 700 readl(pcie_data_offset); 701 702 /* clear the high bits */ 703 if (pcie_index_hi != 0) { 704 writel(0, pcie_index_hi_offset); 705 readl(pcie_index_hi_offset); 706 } 707 708 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 709 } 710 711 /** 712 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 713 * 714 * @adev: amdgpu_device pointer 715 * @reg_addr: indirect register offset 716 * @reg_data: indirect register data 717 * 718 */ 719 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 720 u32 reg_addr, u64 reg_data) 721 { 722 unsigned long flags, pcie_index, pcie_data; 723 void __iomem *pcie_index_offset; 724 void __iomem *pcie_data_offset; 725 726 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 727 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 728 729 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 730 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 731 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 732 733 /* write low 32 bits */ 734 writel(reg_addr, pcie_index_offset); 735 readl(pcie_index_offset); 736 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 737 readl(pcie_data_offset); 738 /* write high 32 bits */ 739 writel(reg_addr + 4, pcie_index_offset); 740 readl(pcie_index_offset); 741 writel((u32)(reg_data >> 32), pcie_data_offset); 742 readl(pcie_data_offset); 743 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 744 } 745 746 /** 747 * amdgpu_device_get_rev_id - query device rev_id 748 * 749 * @adev: amdgpu_device pointer 750 * 751 * Return device rev_id 752 */ 753 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 754 { 755 return adev->nbio.funcs->get_rev_id(adev); 756 } 757 758 /** 759 * amdgpu_invalid_rreg - dummy reg read function 760 * 761 * @adev: amdgpu_device pointer 762 * @reg: offset of register 763 * 764 * Dummy register read function. Used for register blocks 765 * that certain asics don't have (all asics). 766 * Returns the value in the register. 767 */ 768 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 769 { 770 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 771 BUG(); 772 return 0; 773 } 774 775 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 776 { 777 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 778 BUG(); 779 return 0; 780 } 781 782 /** 783 * amdgpu_invalid_wreg - dummy reg write function 784 * 785 * @adev: amdgpu_device pointer 786 * @reg: offset of register 787 * @v: value to write to the register 788 * 789 * Dummy register read function. Used for register blocks 790 * that certain asics don't have (all asics). 791 */ 792 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 793 { 794 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 795 reg, v); 796 BUG(); 797 } 798 799 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 800 { 801 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 802 reg, v); 803 BUG(); 804 } 805 806 /** 807 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 808 * 809 * @adev: amdgpu_device pointer 810 * @reg: offset of register 811 * 812 * Dummy register read function. Used for register blocks 813 * that certain asics don't have (all asics). 814 * Returns the value in the register. 
815 */ 816 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 817 { 818 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 819 BUG(); 820 return 0; 821 } 822 823 /** 824 * amdgpu_invalid_wreg64 - dummy reg write function 825 * 826 * @adev: amdgpu_device pointer 827 * @reg: offset of register 828 * @v: value to write to the register 829 * 830 * Dummy register read function. Used for register blocks 831 * that certain asics don't have (all asics). 832 */ 833 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 834 { 835 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 836 reg, v); 837 BUG(); 838 } 839 840 /** 841 * amdgpu_block_invalid_rreg - dummy reg read function 842 * 843 * @adev: amdgpu_device pointer 844 * @block: offset of instance 845 * @reg: offset of register 846 * 847 * Dummy register read function. Used for register blocks 848 * that certain asics don't have (all asics). 849 * Returns the value in the register. 850 */ 851 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 852 uint32_t block, uint32_t reg) 853 { 854 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 855 reg, block); 856 BUG(); 857 return 0; 858 } 859 860 /** 861 * amdgpu_block_invalid_wreg - dummy reg write function 862 * 863 * @adev: amdgpu_device pointer 864 * @block: offset of instance 865 * @reg: offset of register 866 * @v: value to write to the register 867 * 868 * Dummy register read function. Used for register blocks 869 * that certain asics don't have (all asics). 870 */ 871 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 872 uint32_t block, 873 uint32_t reg, uint32_t v) 874 { 875 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 876 reg, block, v); 877 BUG(); 878 } 879 880 /** 881 * amdgpu_device_asic_init - Wrapper for atom asic_init 882 * 883 * @adev: amdgpu_device pointer 884 * 885 * Does any asic specific work and then calls atom asic init. 886 */ 887 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 888 { 889 int ret; 890 891 amdgpu_asic_pre_asic_init(adev); 892 893 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || 894 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) { 895 amdgpu_psp_wait_for_bootloader(adev); 896 ret = amdgpu_atomfirmware_asic_init(adev, true); 897 return ret; 898 } else { 899 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 900 } 901 902 return 0; 903 } 904 905 /** 906 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 907 * 908 * @adev: amdgpu_device pointer 909 * 910 * Allocates a scratch page of VRAM for use by various things in the 911 * driver. 912 */ 913 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 914 { 915 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 916 AMDGPU_GEM_DOMAIN_VRAM | 917 AMDGPU_GEM_DOMAIN_GTT, 918 &adev->mem_scratch.robj, 919 &adev->mem_scratch.gpu_addr, 920 (void **)&adev->mem_scratch.ptr); 921 } 922 923 /** 924 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 925 * 926 * @adev: amdgpu_device pointer 927 * 928 * Frees the VRAM scratch page. 929 */ 930 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 931 { 932 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 933 } 934 935 /** 936 * amdgpu_device_program_register_sequence - program an array of registers. 
937 * 938 * @adev: amdgpu_device pointer 939 * @registers: pointer to the register array 940 * @array_size: size of the register array 941 * 942 * Programs an array or registers with and or masks. 943 * This is a helper for setting golden registers. 944 */ 945 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 946 const u32 *registers, 947 const u32 array_size) 948 { 949 u32 tmp, reg, and_mask, or_mask; 950 int i; 951 952 if (array_size % 3) 953 return; 954 955 for (i = 0; i < array_size; i += 3) { 956 reg = registers[i + 0]; 957 and_mask = registers[i + 1]; 958 or_mask = registers[i + 2]; 959 960 if (and_mask == 0xffffffff) { 961 tmp = or_mask; 962 } else { 963 tmp = RREG32(reg); 964 tmp &= ~and_mask; 965 if (adev->family >= AMDGPU_FAMILY_AI) 966 tmp |= (or_mask & and_mask); 967 else 968 tmp |= or_mask; 969 } 970 WREG32(reg, tmp); 971 } 972 } 973 974 /** 975 * amdgpu_device_pci_config_reset - reset the GPU 976 * 977 * @adev: amdgpu_device pointer 978 * 979 * Resets the GPU using the pci config reset sequence. 980 * Only applicable to asics prior to vega10. 981 */ 982 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 983 { 984 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 985 } 986 987 /** 988 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 989 * 990 * @adev: amdgpu_device pointer 991 * 992 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 993 */ 994 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 995 { 996 return pci_reset_function(adev->pdev); 997 } 998 999 /* 1000 * amdgpu_device_wb_*() 1001 * Writeback is the method by which the GPU updates special pages in memory 1002 * with the status of certain GPU events (fences, ring pointers,etc.). 1003 */ 1004 1005 /** 1006 * amdgpu_device_wb_fini - Disable Writeback and free memory 1007 * 1008 * @adev: amdgpu_device pointer 1009 * 1010 * Disables Writeback and frees the Writeback memory (all asics). 1011 * Used at driver shutdown. 1012 */ 1013 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1014 { 1015 if (adev->wb.wb_obj) { 1016 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1017 &adev->wb.gpu_addr, 1018 (void **)&adev->wb.wb); 1019 adev->wb.wb_obj = NULL; 1020 } 1021 } 1022 1023 /** 1024 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1025 * 1026 * @adev: amdgpu_device pointer 1027 * 1028 * Initializes writeback and allocates writeback memory (all asics). 1029 * Used at driver startup. 1030 * Returns 0 on success or an -error on failure. 1031 */ 1032 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1033 { 1034 int r; 1035 1036 if (adev->wb.wb_obj == NULL) { 1037 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1038 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1039 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1040 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1041 (void **)&adev->wb.wb); 1042 if (r) { 1043 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1044 return r; 1045 } 1046 1047 adev->wb.num_wb = AMDGPU_MAX_WB; 1048 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1049 1050 /* clear wb memory */ 1051 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1052 } 1053 1054 return 0; 1055 } 1056 1057 /** 1058 * amdgpu_device_wb_get - Allocate a wb entry 1059 * 1060 * @adev: amdgpu_device pointer 1061 * @wb: wb index 1062 * 1063 * Allocate a wb slot for use by the driver (all asics). 1064 * Returns 0 on success or -EINVAL on failure. 
1065 */ 1066 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1067 { 1068 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1069 1070 if (offset < adev->wb.num_wb) { 1071 __set_bit(offset, adev->wb.used); 1072 *wb = offset << 3; /* convert to dw offset */ 1073 return 0; 1074 } else { 1075 return -EINVAL; 1076 } 1077 } 1078 1079 /** 1080 * amdgpu_device_wb_free - Free a wb entry 1081 * 1082 * @adev: amdgpu_device pointer 1083 * @wb: wb index 1084 * 1085 * Free a wb slot allocated for use by the driver (all asics) 1086 */ 1087 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1088 { 1089 wb >>= 3; 1090 if (wb < adev->wb.num_wb) 1091 __clear_bit(wb, adev->wb.used); 1092 } 1093 1094 /** 1095 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1096 * 1097 * @adev: amdgpu_device pointer 1098 * 1099 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1100 * to fail, but if any of the BARs is not accessible after the size we abort 1101 * driver loading by returning -ENODEV. 1102 */ 1103 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1104 { 1105 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1106 struct pci_bus *root; 1107 struct resource *res; 1108 unsigned int i; 1109 u16 cmd; 1110 int r; 1111 1112 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1113 return 0; 1114 1115 /* Bypass for VF */ 1116 if (amdgpu_sriov_vf(adev)) 1117 return 0; 1118 1119 /* skip if the bios has already enabled large BAR */ 1120 if (adev->gmc.real_vram_size && 1121 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1122 return 0; 1123 1124 /* Check if the root BUS has 64bit memory resources */ 1125 root = adev->pdev->bus; 1126 while (root->parent) 1127 root = root->parent; 1128 1129 pci_bus_for_each_resource(root, res, i) { 1130 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1131 res->start > 0x100000000ull) 1132 break; 1133 } 1134 1135 /* Trying to resize is pointless without a root hub window above 4GB */ 1136 if (!res) 1137 return 0; 1138 1139 /* Limit the BAR size to what is available */ 1140 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1141 rbar_size); 1142 1143 /* Disable memory decoding while we change the BAR addresses and size */ 1144 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1145 pci_write_config_word(adev->pdev, PCI_COMMAND, 1146 cmd & ~PCI_COMMAND_MEMORY); 1147 1148 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1149 amdgpu_doorbell_fini(adev); 1150 if (adev->asic_type >= CHIP_BONAIRE) 1151 pci_release_resource(adev->pdev, 2); 1152 1153 pci_release_resource(adev->pdev, 0); 1154 1155 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1156 if (r == -ENOSPC) 1157 DRM_INFO("Not enough PCI address space for a large BAR."); 1158 else if (r && r != -ENOTSUPP) 1159 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1160 1161 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1162 1163 /* When the doorbell or fb BAR isn't available we have no chance of 1164 * using the device. 
1165 */ 1166 r = amdgpu_doorbell_init(adev); 1167 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1168 return -ENODEV; 1169 1170 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1171 1172 return 0; 1173 } 1174 1175 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1176 { 1177 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1178 return false; 1179 1180 return true; 1181 } 1182 1183 /* 1184 * GPU helpers function. 1185 */ 1186 /** 1187 * amdgpu_device_need_post - check if the hw need post or not 1188 * 1189 * @adev: amdgpu_device pointer 1190 * 1191 * Check if the asic has been initialized (all asics) at driver startup 1192 * or post is needed if hw reset is performed. 1193 * Returns true if need or false if not. 1194 */ 1195 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1196 { 1197 uint32_t reg; 1198 1199 if (amdgpu_sriov_vf(adev)) 1200 return false; 1201 1202 if (!amdgpu_device_read_bios(adev)) 1203 return false; 1204 1205 if (amdgpu_passthrough(adev)) { 1206 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1207 * some old smc fw still need driver do vPost otherwise gpu hang, while 1208 * those smc fw version above 22.15 doesn't have this flaw, so we force 1209 * vpost executed for smc version below 22.15 1210 */ 1211 if (adev->asic_type == CHIP_FIJI) { 1212 int err; 1213 uint32_t fw_ver; 1214 1215 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1216 /* force vPost if error occured */ 1217 if (err) 1218 return true; 1219 1220 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1221 if (fw_ver < 0x00160e00) 1222 return true; 1223 } 1224 } 1225 1226 /* Don't post if we need to reset whole hive on init */ 1227 if (adev->gmc.xgmi.pending_reset) 1228 return false; 1229 1230 if (adev->has_hw_reset) { 1231 adev->has_hw_reset = false; 1232 return true; 1233 } 1234 1235 /* bios scratch used on CIK+ */ 1236 if (adev->asic_type >= CHIP_BONAIRE) 1237 return amdgpu_atombios_scratch_need_asic_init(adev); 1238 1239 /* check MEM_SIZE for older asics */ 1240 reg = amdgpu_asic_get_config_memsize(adev); 1241 1242 if ((reg != 0) && (reg != 0xffffffff)) 1243 return false; 1244 1245 return true; 1246 } 1247 1248 /* 1249 * On APUs with >= 64GB white flickering has been observed w/ SG enabled. 1250 * Disable S/G on such systems until we have a proper fix. 1251 * https://gitlab.freedesktop.org/drm/amd/-/issues/2354 1252 * https://gitlab.freedesktop.org/drm/amd/-/issues/2735 1253 */ 1254 bool amdgpu_sg_display_supported(struct amdgpu_device *adev) 1255 { 1256 switch (amdgpu_sg_display) { 1257 case -1: 1258 break; 1259 case 0: 1260 return false; 1261 case 1: 1262 return true; 1263 default: 1264 return false; 1265 } 1266 if ((totalram_pages() << (PAGE_SHIFT - 10)) + 1267 (adev->gmc.real_vram_size / 1024) >= 64000000) { 1268 DRM_WARN("Disabling S/G due to >=64GB RAM\n"); 1269 return false; 1270 } 1271 return true; 1272 } 1273 1274 /* 1275 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic 1276 * speed switching. Until we have confirmation from Intel that a specific host 1277 * supports it, it's safer that we keep it disabled for all. 
1278 * 1279 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1280 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1281 */ 1282 bool amdgpu_device_pcie_dynamic_switching_supported(void) 1283 { 1284 #if IS_ENABLED(CONFIG_X86) 1285 struct cpuinfo_x86 *c = &cpu_data(0); 1286 1287 if (c->x86_vendor == X86_VENDOR_INTEL) 1288 return false; 1289 #endif 1290 return true; 1291 } 1292 1293 /** 1294 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1295 * 1296 * @adev: amdgpu_device pointer 1297 * 1298 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1299 * be set for this device. 1300 * 1301 * Returns true if it should be used or false if not. 1302 */ 1303 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1304 { 1305 switch (amdgpu_aspm) { 1306 case -1: 1307 break; 1308 case 0: 1309 return false; 1310 case 1: 1311 return true; 1312 default: 1313 return false; 1314 } 1315 return pcie_aspm_enabled(adev->pdev); 1316 } 1317 1318 bool amdgpu_device_aspm_support_quirk(void) 1319 { 1320 #if IS_ENABLED(CONFIG_X86) 1321 struct cpuinfo_x86 *c = &cpu_data(0); 1322 1323 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE); 1324 #else 1325 return true; 1326 #endif 1327 } 1328 1329 /* if we get transitioned to only one device, take VGA back */ 1330 /** 1331 * amdgpu_device_vga_set_decode - enable/disable vga decode 1332 * 1333 * @pdev: PCI device pointer 1334 * @state: enable/disable vga decode 1335 * 1336 * Enable/disable vga decode (all asics). 1337 * Returns VGA resource flags. 1338 */ 1339 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1340 bool state) 1341 { 1342 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1343 1344 amdgpu_asic_set_vga_state(adev, state); 1345 if (state) 1346 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1347 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1348 else 1349 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1350 } 1351 1352 /** 1353 * amdgpu_device_check_block_size - validate the vm block size 1354 * 1355 * @adev: amdgpu_device pointer 1356 * 1357 * Validates the vm block size specified via module parameter. 1358 * The vm block size defines number of bits in page table versus page directory, 1359 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1360 * page table and the remaining bits are in the page directory. 1361 */ 1362 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1363 { 1364 /* defines number of bits in page table versus page directory, 1365 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1366 * page table and the remaining bits are in the page directory 1367 */ 1368 if (amdgpu_vm_block_size == -1) 1369 return; 1370 1371 if (amdgpu_vm_block_size < 9) { 1372 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1373 amdgpu_vm_block_size); 1374 amdgpu_vm_block_size = -1; 1375 } 1376 } 1377 1378 /** 1379 * amdgpu_device_check_vm_size - validate the vm size 1380 * 1381 * @adev: amdgpu_device pointer 1382 * 1383 * Validates the vm size in GB specified via module parameter. 1384 * The VM size is the size of the GPU virtual memory space in GB. 
1385 */ 1386 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1387 { 1388 /* no need to check the default value */ 1389 if (amdgpu_vm_size == -1) 1390 return; 1391 1392 if (amdgpu_vm_size < 1) { 1393 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1394 amdgpu_vm_size); 1395 amdgpu_vm_size = -1; 1396 } 1397 } 1398 1399 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1400 { 1401 struct sysinfo si; 1402 bool is_os_64 = (sizeof(void *) == 8); 1403 uint64_t total_memory; 1404 uint64_t dram_size_seven_GB = 0x1B8000000; 1405 uint64_t dram_size_three_GB = 0xB8000000; 1406 1407 if (amdgpu_smu_memory_pool_size == 0) 1408 return; 1409 1410 if (!is_os_64) { 1411 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1412 goto def_value; 1413 } 1414 si_meminfo(&si); 1415 total_memory = (uint64_t)si.totalram * si.mem_unit; 1416 1417 if ((amdgpu_smu_memory_pool_size == 1) || 1418 (amdgpu_smu_memory_pool_size == 2)) { 1419 if (total_memory < dram_size_three_GB) 1420 goto def_value1; 1421 } else if ((amdgpu_smu_memory_pool_size == 4) || 1422 (amdgpu_smu_memory_pool_size == 8)) { 1423 if (total_memory < dram_size_seven_GB) 1424 goto def_value1; 1425 } else { 1426 DRM_WARN("Smu memory pool size not supported\n"); 1427 goto def_value; 1428 } 1429 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1430 1431 return; 1432 1433 def_value1: 1434 DRM_WARN("No enough system memory\n"); 1435 def_value: 1436 adev->pm.smu_prv_buffer_size = 0; 1437 } 1438 1439 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1440 { 1441 if (!(adev->flags & AMD_IS_APU) || 1442 adev->asic_type < CHIP_RAVEN) 1443 return 0; 1444 1445 switch (adev->asic_type) { 1446 case CHIP_RAVEN: 1447 if (adev->pdev->device == 0x15dd) 1448 adev->apu_flags |= AMD_APU_IS_RAVEN; 1449 if (adev->pdev->device == 0x15d8) 1450 adev->apu_flags |= AMD_APU_IS_PICASSO; 1451 break; 1452 case CHIP_RENOIR: 1453 if ((adev->pdev->device == 0x1636) || 1454 (adev->pdev->device == 0x164c)) 1455 adev->apu_flags |= AMD_APU_IS_RENOIR; 1456 else 1457 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1458 break; 1459 case CHIP_VANGOGH: 1460 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1461 break; 1462 case CHIP_YELLOW_CARP: 1463 break; 1464 case CHIP_CYAN_SKILLFISH: 1465 if ((adev->pdev->device == 0x13FE) || 1466 (adev->pdev->device == 0x143F)) 1467 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1468 break; 1469 default: 1470 break; 1471 } 1472 1473 return 0; 1474 } 1475 1476 /** 1477 * amdgpu_device_check_arguments - validate module params 1478 * 1479 * @adev: amdgpu_device pointer 1480 * 1481 * Validates certain module parameters and updates 1482 * the associated values used by the driver (all asics). 
1483 */ 1484 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1485 { 1486 if (amdgpu_sched_jobs < 4) { 1487 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1488 amdgpu_sched_jobs); 1489 amdgpu_sched_jobs = 4; 1490 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1491 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1492 amdgpu_sched_jobs); 1493 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1494 } 1495 1496 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1497 /* gart size must be greater or equal to 32M */ 1498 dev_warn(adev->dev, "gart size (%d) too small\n", 1499 amdgpu_gart_size); 1500 amdgpu_gart_size = -1; 1501 } 1502 1503 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1504 /* gtt size must be greater or equal to 32M */ 1505 dev_warn(adev->dev, "gtt size (%d) too small\n", 1506 amdgpu_gtt_size); 1507 amdgpu_gtt_size = -1; 1508 } 1509 1510 /* valid range is between 4 and 9 inclusive */ 1511 if (amdgpu_vm_fragment_size != -1 && 1512 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1513 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1514 amdgpu_vm_fragment_size = -1; 1515 } 1516 1517 if (amdgpu_sched_hw_submission < 2) { 1518 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1519 amdgpu_sched_hw_submission); 1520 amdgpu_sched_hw_submission = 2; 1521 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1522 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1523 amdgpu_sched_hw_submission); 1524 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1525 } 1526 1527 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1528 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1529 amdgpu_reset_method = -1; 1530 } 1531 1532 amdgpu_device_check_smu_prv_buffer_size(adev); 1533 1534 amdgpu_device_check_vm_size(adev); 1535 1536 amdgpu_device_check_block_size(adev); 1537 1538 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1539 1540 return 0; 1541 } 1542 1543 /** 1544 * amdgpu_switcheroo_set_state - set switcheroo state 1545 * 1546 * @pdev: pci dev pointer 1547 * @state: vga_switcheroo state 1548 * 1549 * Callback for the switcheroo driver. Suspends or resumes 1550 * the asics before or after it is powered up using ACPI methods. 
1551 */ 1552 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1553 enum vga_switcheroo_state state) 1554 { 1555 struct drm_device *dev = pci_get_drvdata(pdev); 1556 int r; 1557 1558 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1559 return; 1560 1561 if (state == VGA_SWITCHEROO_ON) { 1562 pr_info("switched on\n"); 1563 /* don't suspend or resume card normally */ 1564 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1565 1566 pci_set_power_state(pdev, PCI_D0); 1567 amdgpu_device_load_pci_state(pdev); 1568 r = pci_enable_device(pdev); 1569 if (r) 1570 DRM_WARN("pci_enable_device failed (%d)\n", r); 1571 amdgpu_device_resume(dev, true); 1572 1573 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1574 } else { 1575 pr_info("switched off\n"); 1576 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1577 amdgpu_device_suspend(dev, true); 1578 amdgpu_device_cache_pci_state(pdev); 1579 /* Shut down the device */ 1580 pci_disable_device(pdev); 1581 pci_set_power_state(pdev, PCI_D3cold); 1582 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1583 } 1584 } 1585 1586 /** 1587 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1588 * 1589 * @pdev: pci dev pointer 1590 * 1591 * Callback for the switcheroo driver. Check of the switcheroo 1592 * state can be changed. 1593 * Returns true if the state can be changed, false if not. 1594 */ 1595 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1596 { 1597 struct drm_device *dev = pci_get_drvdata(pdev); 1598 1599 /* 1600 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1601 * locking inversion with the driver load path. And the access here is 1602 * completely racy anyway. So don't bother with locking for now. 1603 */ 1604 return atomic_read(&dev->open_count) == 0; 1605 } 1606 1607 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1608 .set_gpu_state = amdgpu_switcheroo_set_state, 1609 .reprobe = NULL, 1610 .can_switch = amdgpu_switcheroo_can_switch, 1611 }; 1612 1613 /** 1614 * amdgpu_device_ip_set_clockgating_state - set the CG state 1615 * 1616 * @dev: amdgpu_device pointer 1617 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1618 * @state: clockgating state (gate or ungate) 1619 * 1620 * Sets the requested clockgating state for all instances of 1621 * the hardware IP specified. 1622 * Returns the error code from the last instance. 1623 */ 1624 int amdgpu_device_ip_set_clockgating_state(void *dev, 1625 enum amd_ip_block_type block_type, 1626 enum amd_clockgating_state state) 1627 { 1628 struct amdgpu_device *adev = dev; 1629 int i, r = 0; 1630 1631 for (i = 0; i < adev->num_ip_blocks; i++) { 1632 if (!adev->ip_blocks[i].status.valid) 1633 continue; 1634 if (adev->ip_blocks[i].version->type != block_type) 1635 continue; 1636 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1637 continue; 1638 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1639 (void *)adev, state); 1640 if (r) 1641 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1642 adev->ip_blocks[i].version->funcs->name, r); 1643 } 1644 return r; 1645 } 1646 1647 /** 1648 * amdgpu_device_ip_set_powergating_state - set the PG state 1649 * 1650 * @dev: amdgpu_device pointer 1651 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1652 * @state: powergating state (gate or ungate) 1653 * 1654 * Sets the requested powergating state for all instances of 1655 * the hardware IP specified. 1656 * Returns the error code from the last instance. 
1657 */ 1658 int amdgpu_device_ip_set_powergating_state(void *dev, 1659 enum amd_ip_block_type block_type, 1660 enum amd_powergating_state state) 1661 { 1662 struct amdgpu_device *adev = dev; 1663 int i, r = 0; 1664 1665 for (i = 0; i < adev->num_ip_blocks; i++) { 1666 if (!adev->ip_blocks[i].status.valid) 1667 continue; 1668 if (adev->ip_blocks[i].version->type != block_type) 1669 continue; 1670 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1671 continue; 1672 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1673 (void *)adev, state); 1674 if (r) 1675 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1676 adev->ip_blocks[i].version->funcs->name, r); 1677 } 1678 return r; 1679 } 1680 1681 /** 1682 * amdgpu_device_ip_get_clockgating_state - get the CG state 1683 * 1684 * @adev: amdgpu_device pointer 1685 * @flags: clockgating feature flags 1686 * 1687 * Walks the list of IPs on the device and updates the clockgating 1688 * flags for each IP. 1689 * Updates @flags with the feature flags for each hardware IP where 1690 * clockgating is enabled. 1691 */ 1692 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1693 u64 *flags) 1694 { 1695 int i; 1696 1697 for (i = 0; i < adev->num_ip_blocks; i++) { 1698 if (!adev->ip_blocks[i].status.valid) 1699 continue; 1700 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1701 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1702 } 1703 } 1704 1705 /** 1706 * amdgpu_device_ip_wait_for_idle - wait for idle 1707 * 1708 * @adev: amdgpu_device pointer 1709 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1710 * 1711 * Waits for the request hardware IP to be idle. 1712 * Returns 0 for success or a negative error code on failure. 1713 */ 1714 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1715 enum amd_ip_block_type block_type) 1716 { 1717 int i, r; 1718 1719 for (i = 0; i < adev->num_ip_blocks; i++) { 1720 if (!adev->ip_blocks[i].status.valid) 1721 continue; 1722 if (adev->ip_blocks[i].version->type == block_type) { 1723 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1724 if (r) 1725 return r; 1726 break; 1727 } 1728 } 1729 return 0; 1730 1731 } 1732 1733 /** 1734 * amdgpu_device_ip_is_idle - is the hardware IP idle 1735 * 1736 * @adev: amdgpu_device pointer 1737 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1738 * 1739 * Check if the hardware IP is idle or not. 1740 * Returns true if it the IP is idle, false if not. 1741 */ 1742 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1743 enum amd_ip_block_type block_type) 1744 { 1745 int i; 1746 1747 for (i = 0; i < adev->num_ip_blocks; i++) { 1748 if (!adev->ip_blocks[i].status.valid) 1749 continue; 1750 if (adev->ip_blocks[i].version->type == block_type) 1751 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1752 } 1753 return true; 1754 1755 } 1756 1757 /** 1758 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1759 * 1760 * @adev: amdgpu_device pointer 1761 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1762 * 1763 * Returns a pointer to the hardware IP block structure 1764 * if it exists for the asic, otherwise NULL. 
1765 */ 1766 struct amdgpu_ip_block * 1767 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1768 enum amd_ip_block_type type) 1769 { 1770 int i; 1771 1772 for (i = 0; i < adev->num_ip_blocks; i++) 1773 if (adev->ip_blocks[i].version->type == type) 1774 return &adev->ip_blocks[i]; 1775 1776 return NULL; 1777 } 1778 1779 /** 1780 * amdgpu_device_ip_block_version_cmp 1781 * 1782 * @adev: amdgpu_device pointer 1783 * @type: enum amd_ip_block_type 1784 * @major: major version 1785 * @minor: minor version 1786 * 1787 * return 0 if equal or greater 1788 * return 1 if smaller or the ip_block doesn't exist 1789 */ 1790 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1791 enum amd_ip_block_type type, 1792 u32 major, u32 minor) 1793 { 1794 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1795 1796 if (ip_block && ((ip_block->version->major > major) || 1797 ((ip_block->version->major == major) && 1798 (ip_block->version->minor >= minor)))) 1799 return 0; 1800 1801 return 1; 1802 } 1803 1804 /** 1805 * amdgpu_device_ip_block_add 1806 * 1807 * @adev: amdgpu_device pointer 1808 * @ip_block_version: pointer to the IP to add 1809 * 1810 * Adds the IP block driver information to the collection of IPs 1811 * on the asic. 1812 */ 1813 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1814 const struct amdgpu_ip_block_version *ip_block_version) 1815 { 1816 if (!ip_block_version) 1817 return -EINVAL; 1818 1819 switch (ip_block_version->type) { 1820 case AMD_IP_BLOCK_TYPE_VCN: 1821 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1822 return 0; 1823 break; 1824 case AMD_IP_BLOCK_TYPE_JPEG: 1825 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1826 return 0; 1827 break; 1828 default: 1829 break; 1830 } 1831 1832 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1833 ip_block_version->funcs->name); 1834 1835 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1836 1837 return 0; 1838 } 1839 1840 /** 1841 * amdgpu_device_enable_virtual_display - enable virtual display feature 1842 * 1843 * @adev: amdgpu_device pointer 1844 * 1845 * Enabled the virtual display feature if the user has enabled it via 1846 * the module parameter virtual_display. This feature provides a virtual 1847 * display hardware on headless boards or in virtualized environments. 1848 * This function parses and validates the configuration string specified by 1849 * the user and configues the virtual display configuration (number of 1850 * virtual connectors, crtcs, etc.) specified. 
1851 */ 1852 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1853 { 1854 adev->enable_virtual_display = false; 1855 1856 if (amdgpu_virtual_display) { 1857 const char *pci_address_name = pci_name(adev->pdev); 1858 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1859 1860 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1861 pciaddstr_tmp = pciaddstr; 1862 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1863 pciaddname = strsep(&pciaddname_tmp, ","); 1864 if (!strcmp("all", pciaddname) 1865 || !strcmp(pci_address_name, pciaddname)) { 1866 long num_crtc; 1867 int res = -1; 1868 1869 adev->enable_virtual_display = true; 1870 1871 if (pciaddname_tmp) 1872 res = kstrtol(pciaddname_tmp, 10, 1873 &num_crtc); 1874 1875 if (!res) { 1876 if (num_crtc < 1) 1877 num_crtc = 1; 1878 if (num_crtc > 6) 1879 num_crtc = 6; 1880 adev->mode_info.num_crtc = num_crtc; 1881 } else { 1882 adev->mode_info.num_crtc = 1; 1883 } 1884 break; 1885 } 1886 } 1887 1888 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1889 amdgpu_virtual_display, pci_address_name, 1890 adev->enable_virtual_display, adev->mode_info.num_crtc); 1891 1892 kfree(pciaddstr); 1893 } 1894 } 1895 1896 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 1897 { 1898 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 1899 adev->mode_info.num_crtc = 1; 1900 adev->enable_virtual_display = true; 1901 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 1902 adev->enable_virtual_display, adev->mode_info.num_crtc); 1903 } 1904 } 1905 1906 /** 1907 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1908 * 1909 * @adev: amdgpu_device pointer 1910 * 1911 * Parses the asic configuration parameters specified in the gpu info 1912 * firmware and makes them availale to the driver for use in configuring 1913 * the asic. 1914 * Returns 0 on success, -EINVAL on failure. 1915 */ 1916 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1917 { 1918 const char *chip_name; 1919 char fw_name[40]; 1920 int err; 1921 const struct gpu_info_firmware_header_v1_0 *hdr; 1922 1923 adev->firmware.gpu_info_fw = NULL; 1924 1925 if (adev->mman.discovery_bin) { 1926 /* 1927 * FIXME: The bounding box is still needed by Navi12, so 1928 * temporarily read it from gpu_info firmware. Should be dropped 1929 * when DAL no longer needs it. 
1930 */ 1931 if (adev->asic_type != CHIP_NAVI12) 1932 return 0; 1933 } 1934 1935 switch (adev->asic_type) { 1936 default: 1937 return 0; 1938 case CHIP_VEGA10: 1939 chip_name = "vega10"; 1940 break; 1941 case CHIP_VEGA12: 1942 chip_name = "vega12"; 1943 break; 1944 case CHIP_RAVEN: 1945 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1946 chip_name = "raven2"; 1947 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1948 chip_name = "picasso"; 1949 else 1950 chip_name = "raven"; 1951 break; 1952 case CHIP_ARCTURUS: 1953 chip_name = "arcturus"; 1954 break; 1955 case CHIP_NAVI12: 1956 chip_name = "navi12"; 1957 break; 1958 } 1959 1960 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1961 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 1962 if (err) { 1963 dev_err(adev->dev, 1964 "Failed to get gpu_info firmware \"%s\"\n", 1965 fw_name); 1966 goto out; 1967 } 1968 1969 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1970 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1971 1972 switch (hdr->version_major) { 1973 case 1: 1974 { 1975 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1976 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1977 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1978 1979 /* 1980 * Should be droped when DAL no longer needs it. 1981 */ 1982 if (adev->asic_type == CHIP_NAVI12) 1983 goto parse_soc_bounding_box; 1984 1985 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1986 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1987 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1988 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1989 adev->gfx.config.max_texture_channel_caches = 1990 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1991 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1992 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1993 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1994 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1995 adev->gfx.config.double_offchip_lds_buf = 1996 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1997 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1998 adev->gfx.cu_info.max_waves_per_simd = 1999 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2000 adev->gfx.cu_info.max_scratch_slots_per_cu = 2001 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2002 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2003 if (hdr->version_minor >= 1) { 2004 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2005 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2006 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2007 adev->gfx.config.num_sc_per_sh = 2008 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2009 adev->gfx.config.num_packer_per_sc = 2010 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2011 } 2012 2013 parse_soc_bounding_box: 2014 /* 2015 * soc bounding box info is not integrated in disocovery table, 2016 * we always need to parse it from gpu info firmware if needed. 
2017 */ 2018 if (hdr->version_minor == 2) { 2019 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2020 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2021 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2022 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2023 } 2024 break; 2025 } 2026 default: 2027 dev_err(adev->dev, 2028 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2029 err = -EINVAL; 2030 goto out; 2031 } 2032 out: 2033 return err; 2034 } 2035 2036 /** 2037 * amdgpu_device_ip_early_init - run early init for hardware IPs 2038 * 2039 * @adev: amdgpu_device pointer 2040 * 2041 * Early initialization pass for hardware IPs. The hardware IPs that make 2042 * up each asic are discovered each IP's early_init callback is run. This 2043 * is the first stage in initializing the asic. 2044 * Returns 0 on success, negative error code on failure. 2045 */ 2046 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2047 { 2048 struct drm_device *dev = adev_to_drm(adev); 2049 struct pci_dev *parent; 2050 int i, r; 2051 bool total; 2052 2053 amdgpu_device_enable_virtual_display(adev); 2054 2055 if (amdgpu_sriov_vf(adev)) { 2056 r = amdgpu_virt_request_full_gpu(adev, true); 2057 if (r) 2058 return r; 2059 } 2060 2061 switch (adev->asic_type) { 2062 #ifdef CONFIG_DRM_AMDGPU_SI 2063 case CHIP_VERDE: 2064 case CHIP_TAHITI: 2065 case CHIP_PITCAIRN: 2066 case CHIP_OLAND: 2067 case CHIP_HAINAN: 2068 adev->family = AMDGPU_FAMILY_SI; 2069 r = si_set_ip_blocks(adev); 2070 if (r) 2071 return r; 2072 break; 2073 #endif 2074 #ifdef CONFIG_DRM_AMDGPU_CIK 2075 case CHIP_BONAIRE: 2076 case CHIP_HAWAII: 2077 case CHIP_KAVERI: 2078 case CHIP_KABINI: 2079 case CHIP_MULLINS: 2080 if (adev->flags & AMD_IS_APU) 2081 adev->family = AMDGPU_FAMILY_KV; 2082 else 2083 adev->family = AMDGPU_FAMILY_CI; 2084 2085 r = cik_set_ip_blocks(adev); 2086 if (r) 2087 return r; 2088 break; 2089 #endif 2090 case CHIP_TOPAZ: 2091 case CHIP_TONGA: 2092 case CHIP_FIJI: 2093 case CHIP_POLARIS10: 2094 case CHIP_POLARIS11: 2095 case CHIP_POLARIS12: 2096 case CHIP_VEGAM: 2097 case CHIP_CARRIZO: 2098 case CHIP_STONEY: 2099 if (adev->flags & AMD_IS_APU) 2100 adev->family = AMDGPU_FAMILY_CZ; 2101 else 2102 adev->family = AMDGPU_FAMILY_VI; 2103 2104 r = vi_set_ip_blocks(adev); 2105 if (r) 2106 return r; 2107 break; 2108 default: 2109 r = amdgpu_discovery_set_ip_blocks(adev); 2110 if (r) 2111 return r; 2112 break; 2113 } 2114 2115 if (amdgpu_has_atpx() && 2116 (amdgpu_is_atpx_hybrid() || 2117 amdgpu_has_atpx_dgpu_power_cntl()) && 2118 ((adev->flags & AMD_IS_APU) == 0) && 2119 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2120 adev->flags |= AMD_IS_PX; 2121 2122 if (!(adev->flags & AMD_IS_APU)) { 2123 parent = pci_upstream_bridge(adev->pdev); 2124 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2125 } 2126 2127 2128 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2129 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2130 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2131 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2132 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2133 2134 total = true; 2135 for (i = 0; i < adev->num_ip_blocks; i++) { 2136 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2137 DRM_WARN("disabled ip block: %d <%s>\n", 2138 i, adev->ip_blocks[i].version->funcs->name); 2139 adev->ip_blocks[i].status.valid = false; 2140 } else { 2141 if (adev->ip_blocks[i].version->funcs->early_init) { 2142 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2143 if (r == -ENOENT) { 2144 adev->ip_blocks[i].status.valid = false; 2145 } else if (r) { 2146 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2147 adev->ip_blocks[i].version->funcs->name, r); 2148 total = false; 2149 } else { 2150 adev->ip_blocks[i].status.valid = true; 2151 } 2152 } else { 2153 adev->ip_blocks[i].status.valid = true; 2154 } 2155 } 2156 /* get the vbios after the asic_funcs are set up */ 2157 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2158 r = amdgpu_device_parse_gpu_info_fw(adev); 2159 if (r) 2160 return r; 2161 2162 /* Read BIOS */ 2163 if (amdgpu_device_read_bios(adev)) { 2164 if (!amdgpu_get_bios(adev)) 2165 return -EINVAL; 2166 2167 r = amdgpu_atombios_init(adev); 2168 if (r) { 2169 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2170 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2171 return r; 2172 } 2173 } 2174 2175 /*get pf2vf msg info at it's earliest time*/ 2176 if (amdgpu_sriov_vf(adev)) 2177 amdgpu_virt_init_data_exchange(adev); 2178 2179 } 2180 } 2181 if (!total) 2182 return -ENODEV; 2183 2184 amdgpu_amdkfd_device_probe(adev); 2185 adev->cg_flags &= amdgpu_cg_mask; 2186 adev->pg_flags &= amdgpu_pg_mask; 2187 2188 return 0; 2189 } 2190 2191 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2192 { 2193 int i, r; 2194 2195 for (i = 0; i < adev->num_ip_blocks; i++) { 2196 if (!adev->ip_blocks[i].status.sw) 2197 continue; 2198 if (adev->ip_blocks[i].status.hw) 2199 continue; 2200 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2201 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2202 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2203 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2204 if (r) { 2205 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2206 adev->ip_blocks[i].version->funcs->name, r); 2207 return r; 2208 } 2209 adev->ip_blocks[i].status.hw = true; 2210 } 2211 } 2212 2213 return 0; 2214 } 2215 2216 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2217 { 2218 int i, r; 2219 2220 for (i = 0; i < adev->num_ip_blocks; i++) { 2221 if (!adev->ip_blocks[i].status.sw) 2222 continue; 2223 if (adev->ip_blocks[i].status.hw) 2224 continue; 2225 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2226 if (r) { 2227 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2228 adev->ip_blocks[i].version->funcs->name, r); 2229 return r; 2230 } 2231 adev->ip_blocks[i].status.hw = true; 2232 } 2233 2234 return 0; 2235 } 2236 2237 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2238 { 2239 int r = 0; 2240 int i; 2241 uint32_t smu_version; 2242 2243 if (adev->asic_type >= CHIP_VEGA10) { 2244 for (i = 0; i < adev->num_ip_blocks; i++) { 2245 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2246 continue; 2247 2248 if (!adev->ip_blocks[i].status.sw) 2249 continue; 2250 2251 /* no need to do the fw loading again if already done*/ 2252 if (adev->ip_blocks[i].status.hw == true) 2253 break; 2254 2255 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2256 r = adev->ip_blocks[i].version->funcs->resume(adev); 2257 if (r) { 2258 DRM_ERROR("resume of IP block <%s> failed %d\n", 2259 adev->ip_blocks[i].version->funcs->name, r); 2260 return r; 2261 } 2262 } else { 2263 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2264 if (r) { 2265 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2266 adev->ip_blocks[i].version->funcs->name, r); 2267 return r; 2268 } 2269 } 2270 2271 adev->ip_blocks[i].status.hw = true; 2272 break; 2273 } 2274 } 2275 2276 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2277 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2278 2279 return r; 2280 } 2281 2282 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2283 { 2284 long timeout; 2285 int r, i; 2286 2287 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2288 struct amdgpu_ring *ring = adev->rings[i]; 2289 2290 /* No need to setup the GPU scheduler for rings that don't need it */ 2291 if (!ring || ring->no_scheduler) 2292 continue; 2293 2294 switch (ring->funcs->type) { 2295 case AMDGPU_RING_TYPE_GFX: 2296 timeout = adev->gfx_timeout; 2297 break; 2298 case AMDGPU_RING_TYPE_COMPUTE: 2299 timeout = adev->compute_timeout; 2300 break; 2301 case AMDGPU_RING_TYPE_SDMA: 2302 timeout = adev->sdma_timeout; 2303 break; 2304 default: 2305 timeout = adev->video_timeout; 2306 break; 2307 } 2308 2309 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2310 ring->num_hw_submission, 0, 2311 timeout, adev->reset_domain->wq, 2312 ring->sched_score, ring->name, 2313 adev->dev); 2314 if (r) { 2315 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2316 ring->name); 2317 return r; 2318 } 2319 } 2320 2321 amdgpu_xcp_update_partition_sched_list(adev); 2322 2323 return 0; 2324 } 2325 2326 2327 /** 2328 * amdgpu_device_ip_init - run init for hardware IPs 2329 * 2330 * @adev: amdgpu_device pointer 2331 * 2332 * Main initialization pass for hardware IPs. The list of all the hardware 2333 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2334 * are run. sw_init initializes the software state associated with each IP 2335 * and hw_init initializes the hardware associated with each IP. 2336 * Returns 0 on success, negative error code on failure. 
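 *
 * Roughly, the order below is: amdgpu_ras_init(), then sw_init for every
 * valid IP (with an early hw_init for the COMMON and GMC blocks so that GPU
 * memory can be allocated), then the IB pool and ucode BO, then hw_init
 * phase 1 (COMMON/IH, plus PSP under SR-IOV), firmware loading, hw_init
 * phase 2 for the remaining blocks, and finally RAS recovery, the ring
 * schedulers and KFD.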
2337 */ 2338 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2339 { 2340 int i, r; 2341 2342 r = amdgpu_ras_init(adev); 2343 if (r) 2344 return r; 2345 2346 for (i = 0; i < adev->num_ip_blocks; i++) { 2347 if (!adev->ip_blocks[i].status.valid) 2348 continue; 2349 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2350 if (r) { 2351 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2352 adev->ip_blocks[i].version->funcs->name, r); 2353 goto init_failed; 2354 } 2355 adev->ip_blocks[i].status.sw = true; 2356 2357 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2358 /* need to do common hw init early so everything is set up for gmc */ 2359 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2360 if (r) { 2361 DRM_ERROR("hw_init %d failed %d\n", i, r); 2362 goto init_failed; 2363 } 2364 adev->ip_blocks[i].status.hw = true; 2365 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2366 /* need to do gmc hw init early so we can allocate gpu mem */ 2367 /* Try to reserve bad pages early */ 2368 if (amdgpu_sriov_vf(adev)) 2369 amdgpu_virt_exchange_data(adev); 2370 2371 r = amdgpu_device_mem_scratch_init(adev); 2372 if (r) { 2373 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2374 goto init_failed; 2375 } 2376 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2377 if (r) { 2378 DRM_ERROR("hw_init %d failed %d\n", i, r); 2379 goto init_failed; 2380 } 2381 r = amdgpu_device_wb_init(adev); 2382 if (r) { 2383 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2384 goto init_failed; 2385 } 2386 adev->ip_blocks[i].status.hw = true; 2387 2388 /* right after GMC hw init, we create CSA */ 2389 if (adev->gfx.mcbp) { 2390 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2391 AMDGPU_GEM_DOMAIN_VRAM | 2392 AMDGPU_GEM_DOMAIN_GTT, 2393 AMDGPU_CSA_SIZE); 2394 if (r) { 2395 DRM_ERROR("allocate CSA failed %d\n", r); 2396 goto init_failed; 2397 } 2398 } 2399 } 2400 } 2401 2402 if (amdgpu_sriov_vf(adev)) 2403 amdgpu_virt_init_data_exchange(adev); 2404 2405 r = amdgpu_ib_pool_init(adev); 2406 if (r) { 2407 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2408 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2409 goto init_failed; 2410 } 2411 2412 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2413 if (r) 2414 goto init_failed; 2415 2416 r = amdgpu_device_ip_hw_init_phase1(adev); 2417 if (r) 2418 goto init_failed; 2419 2420 r = amdgpu_device_fw_loading(adev); 2421 if (r) 2422 goto init_failed; 2423 2424 r = amdgpu_device_ip_hw_init_phase2(adev); 2425 if (r) 2426 goto init_failed; 2427 2428 /* 2429 * retired pages will be loaded from eeprom and reserved here, 2430 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2431 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2432 * for I2C communication which only true at this point. 2433 * 2434 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2435 * failure from bad gpu situation and stop amdgpu init process 2436 * accordingly. For other failed cases, it will still release all 2437 * the resource and print error message, rather than returning one 2438 * negative value to upper level. 
2439 * 2440 * Note: theoretically, this should be called before all vram allocations 2441 * to protect retired page from abusing 2442 */ 2443 r = amdgpu_ras_recovery_init(adev); 2444 if (r) 2445 goto init_failed; 2446 2447 /** 2448 * In case of XGMI grab extra reference for reset domain for this device 2449 */ 2450 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2451 if (amdgpu_xgmi_add_device(adev) == 0) { 2452 if (!amdgpu_sriov_vf(adev)) { 2453 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2454 2455 if (WARN_ON(!hive)) { 2456 r = -ENOENT; 2457 goto init_failed; 2458 } 2459 2460 if (!hive->reset_domain || 2461 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2462 r = -ENOENT; 2463 amdgpu_put_xgmi_hive(hive); 2464 goto init_failed; 2465 } 2466 2467 /* Drop the early temporary reset domain we created for device */ 2468 amdgpu_reset_put_reset_domain(adev->reset_domain); 2469 adev->reset_domain = hive->reset_domain; 2470 amdgpu_put_xgmi_hive(hive); 2471 } 2472 } 2473 } 2474 2475 r = amdgpu_device_init_schedulers(adev); 2476 if (r) 2477 goto init_failed; 2478 2479 /* Don't init kfd if whole hive need to be reset during init */ 2480 if (!adev->gmc.xgmi.pending_reset) { 2481 kgd2kfd_init_zone_device(adev); 2482 amdgpu_amdkfd_device_init(adev); 2483 } 2484 2485 amdgpu_fru_get_product_info(adev); 2486 2487 init_failed: 2488 2489 return r; 2490 } 2491 2492 /** 2493 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2494 * 2495 * @adev: amdgpu_device pointer 2496 * 2497 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2498 * this function before a GPU reset. If the value is retained after a 2499 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2500 */ 2501 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2502 { 2503 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2504 } 2505 2506 /** 2507 * amdgpu_device_check_vram_lost - check if vram is valid 2508 * 2509 * @adev: amdgpu_device pointer 2510 * 2511 * Checks the reset magic value written to the gart pointer in VRAM. 2512 * The driver calls this after a GPU reset to see if the contents of 2513 * VRAM is lost or now. 2514 * returns true if vram is lost, false if not. 2515 */ 2516 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2517 { 2518 if (memcmp(adev->gart.ptr, adev->reset_magic, 2519 AMDGPU_RESET_MAGIC_NUM)) 2520 return true; 2521 2522 if (!amdgpu_in_reset(adev)) 2523 return false; 2524 2525 /* 2526 * For all ASICs with baco/mode1 reset, the VRAM is 2527 * always assumed to be lost. 2528 */ 2529 switch (amdgpu_asic_reset_method(adev)) { 2530 case AMD_RESET_METHOD_BACO: 2531 case AMD_RESET_METHOD_MODE1: 2532 return true; 2533 default: 2534 return false; 2535 } 2536 } 2537 2538 /** 2539 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2540 * 2541 * @adev: amdgpu_device pointer 2542 * @state: clockgating state (gate or ungate) 2543 * 2544 * The list of all the hardware IPs that make up the asic is walked and the 2545 * set_clockgating_state callbacks are run. 2546 * Late initialization pass enabling clockgating for hardware IPs. 2547 * Fini or suspend, pass disabling clockgating for hardware IPs. 2548 * Returns 0 on success, negative error code on failure. 
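 *
 * For example, the late init path gates clocks with
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 * while the fini and suspend paths call it with AMD_CG_STATE_UNGATE before
 * touching the hardware.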
2549 */ 2550 2551 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2552 enum amd_clockgating_state state) 2553 { 2554 int i, j, r; 2555 2556 if (amdgpu_emu_mode == 1) 2557 return 0; 2558 2559 for (j = 0; j < adev->num_ip_blocks; j++) { 2560 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2561 if (!adev->ip_blocks[i].status.late_initialized) 2562 continue; 2563 /* skip CG for GFX, SDMA on S0ix */ 2564 if (adev->in_s0ix && 2565 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2566 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2567 continue; 2568 /* skip CG for VCE/UVD, it's handled specially */ 2569 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2570 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2571 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2572 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2573 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2574 /* enable clockgating to save power */ 2575 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2576 state); 2577 if (r) { 2578 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2579 adev->ip_blocks[i].version->funcs->name, r); 2580 return r; 2581 } 2582 } 2583 } 2584 2585 return 0; 2586 } 2587 2588 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2589 enum amd_powergating_state state) 2590 { 2591 int i, j, r; 2592 2593 if (amdgpu_emu_mode == 1) 2594 return 0; 2595 2596 for (j = 0; j < adev->num_ip_blocks; j++) { 2597 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2598 if (!adev->ip_blocks[i].status.late_initialized) 2599 continue; 2600 /* skip PG for GFX, SDMA on S0ix */ 2601 if (adev->in_s0ix && 2602 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2603 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2604 continue; 2605 /* skip CG for VCE/UVD, it's handled specially */ 2606 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2607 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2608 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2609 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2610 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2611 /* enable powergating to save power */ 2612 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2613 state); 2614 if (r) { 2615 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2616 adev->ip_blocks[i].version->funcs->name, r); 2617 return r; 2618 } 2619 } 2620 } 2621 return 0; 2622 } 2623 2624 static int amdgpu_device_enable_mgpu_fan_boost(void) 2625 { 2626 struct amdgpu_gpu_instance *gpu_ins; 2627 struct amdgpu_device *adev; 2628 int i, ret = 0; 2629 2630 mutex_lock(&mgpu_info.mutex); 2631 2632 /* 2633 * MGPU fan boost feature should be enabled 2634 * only when there are two or more dGPUs in 2635 * the system 2636 */ 2637 if (mgpu_info.num_dgpu < 2) 2638 goto out; 2639 2640 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2641 gpu_ins = &(mgpu_info.gpu_ins[i]); 2642 adev = gpu_ins->adev; 2643 if (!(adev->flags & AMD_IS_APU) && 2644 !gpu_ins->mgpu_fan_enabled) { 2645 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2646 if (ret) 2647 break; 2648 2649 gpu_ins->mgpu_fan_enabled = 1; 2650 } 2651 } 2652 2653 out: 2654 mutex_unlock(&mgpu_info.mutex); 2655 2656 return ret; 2657 } 2658 2659 /** 2660 * amdgpu_device_ip_late_init - run late init for hardware IPs 2661 * 2662 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized, or anything that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is counted one by one as each device initializes.
		 *
		 * So, we wait for all XGMI interlinked devices to be initialized.
		 * This may bring some delays as those devices may come from
		 * different hives. But that should be OK.
2728 */ 2729 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2730 for (i = 0; i < mgpu_info.num_gpu; i++) { 2731 gpu_instance = &(mgpu_info.gpu_ins[i]); 2732 if (gpu_instance->adev->flags & AMD_IS_APU) 2733 continue; 2734 2735 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2736 AMDGPU_XGMI_PSTATE_MIN); 2737 if (r) { 2738 DRM_ERROR("pstate setting failed (%d).\n", r); 2739 break; 2740 } 2741 } 2742 } 2743 2744 mutex_unlock(&mgpu_info.mutex); 2745 } 2746 2747 return 0; 2748 } 2749 2750 /** 2751 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2752 * 2753 * @adev: amdgpu_device pointer 2754 * 2755 * For ASICs need to disable SMC first 2756 */ 2757 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2758 { 2759 int i, r; 2760 2761 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2762 return; 2763 2764 for (i = 0; i < adev->num_ip_blocks; i++) { 2765 if (!adev->ip_blocks[i].status.hw) 2766 continue; 2767 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2768 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2769 /* XXX handle errors */ 2770 if (r) { 2771 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2772 adev->ip_blocks[i].version->funcs->name, r); 2773 } 2774 adev->ip_blocks[i].status.hw = false; 2775 break; 2776 } 2777 } 2778 } 2779 2780 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2781 { 2782 int i, r; 2783 2784 for (i = 0; i < adev->num_ip_blocks; i++) { 2785 if (!adev->ip_blocks[i].version->funcs->early_fini) 2786 continue; 2787 2788 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2789 if (r) { 2790 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2791 adev->ip_blocks[i].version->funcs->name, r); 2792 } 2793 } 2794 2795 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2796 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2797 2798 amdgpu_amdkfd_suspend(adev, false); 2799 2800 /* Workaroud for ASICs need to disable SMC first */ 2801 amdgpu_device_smu_fini_early(adev); 2802 2803 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2804 if (!adev->ip_blocks[i].status.hw) 2805 continue; 2806 2807 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2808 /* XXX handle errors */ 2809 if (r) { 2810 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2811 adev->ip_blocks[i].version->funcs->name, r); 2812 } 2813 2814 adev->ip_blocks[i].status.hw = false; 2815 } 2816 2817 if (amdgpu_sriov_vf(adev)) { 2818 if (amdgpu_virt_release_full_gpu(adev, false)) 2819 DRM_ERROR("failed to release exclusive mode on fini\n"); 2820 } 2821 2822 return 0; 2823 } 2824 2825 /** 2826 * amdgpu_device_ip_fini - run fini for hardware IPs 2827 * 2828 * @adev: amdgpu_device pointer 2829 * 2830 * Main teardown pass for hardware IPs. The list of all the hardware 2831 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2832 * are run. hw_fini tears down the hardware associated with each IP 2833 * and sw_fini tears down any software state associated with each IP. 2834 * Returns 0 on success, negative error code on failure. 
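 *
 * Note that hw_fini is handled separately by amdgpu_device_ip_fini_early()
 * above (with the SMC block taken down first where required); the loops
 * below only run sw_fini and late_fini, again in reverse IP order.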
2835 */ 2836 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2837 { 2838 int i, r; 2839 2840 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2841 amdgpu_virt_release_ras_err_handler_data(adev); 2842 2843 if (adev->gmc.xgmi.num_physical_nodes > 1) 2844 amdgpu_xgmi_remove_device(adev); 2845 2846 amdgpu_amdkfd_device_fini_sw(adev); 2847 2848 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2849 if (!adev->ip_blocks[i].status.sw) 2850 continue; 2851 2852 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2853 amdgpu_ucode_free_bo(adev); 2854 amdgpu_free_static_csa(&adev->virt.csa_obj); 2855 amdgpu_device_wb_fini(adev); 2856 amdgpu_device_mem_scratch_fini(adev); 2857 amdgpu_ib_pool_fini(adev); 2858 } 2859 2860 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2861 /* XXX handle errors */ 2862 if (r) { 2863 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2864 adev->ip_blocks[i].version->funcs->name, r); 2865 } 2866 adev->ip_blocks[i].status.sw = false; 2867 adev->ip_blocks[i].status.valid = false; 2868 } 2869 2870 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2871 if (!adev->ip_blocks[i].status.late_initialized) 2872 continue; 2873 if (adev->ip_blocks[i].version->funcs->late_fini) 2874 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2875 adev->ip_blocks[i].status.late_initialized = false; 2876 } 2877 2878 amdgpu_ras_fini(adev); 2879 2880 return 0; 2881 } 2882 2883 /** 2884 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2885 * 2886 * @work: work_struct. 2887 */ 2888 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2889 { 2890 struct amdgpu_device *adev = 2891 container_of(work, struct amdgpu_device, delayed_init_work.work); 2892 int r; 2893 2894 r = amdgpu_ib_ring_tests(adev); 2895 if (r) 2896 DRM_ERROR("ib ring test failed (%d).\n", r); 2897 } 2898 2899 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2900 { 2901 struct amdgpu_device *adev = 2902 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2903 2904 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2905 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2906 2907 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2908 adev->gfx.gfx_off_state = true; 2909 } 2910 2911 /** 2912 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2913 * 2914 * @adev: amdgpu_device pointer 2915 * 2916 * Main suspend function for hardware IPs. The list of all the hardware 2917 * IPs that make up the asic is walked, clockgating is disabled and the 2918 * suspend callbacks are run. suspend puts the hardware and software state 2919 * in each IP into a state suitable for suspend. 2920 * Returns 0 on success, negative error code on failure. 2921 */ 2922 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2923 { 2924 int i, r; 2925 2926 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2927 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2928 2929 /* 2930 * Per PMFW team's suggestion, driver needs to handle gfxoff 2931 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2932 * scenario. Add the missing df cstate disablement here. 
2933 */ 2934 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2935 dev_warn(adev->dev, "Failed to disallow df cstate"); 2936 2937 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2938 if (!adev->ip_blocks[i].status.valid) 2939 continue; 2940 2941 /* displays are handled separately */ 2942 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2943 continue; 2944 2945 /* XXX handle errors */ 2946 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2947 /* XXX handle errors */ 2948 if (r) { 2949 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2950 adev->ip_blocks[i].version->funcs->name, r); 2951 return r; 2952 } 2953 2954 adev->ip_blocks[i].status.hw = false; 2955 } 2956 2957 return 0; 2958 } 2959 2960 /** 2961 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2962 * 2963 * @adev: amdgpu_device pointer 2964 * 2965 * Main suspend function for hardware IPs. The list of all the hardware 2966 * IPs that make up the asic is walked, clockgating is disabled and the 2967 * suspend callbacks are run. suspend puts the hardware and software state 2968 * in each IP into a state suitable for suspend. 2969 * Returns 0 on success, negative error code on failure. 2970 */ 2971 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2972 { 2973 int i, r; 2974 2975 if (adev->in_s0ix) 2976 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2977 2978 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2979 if (!adev->ip_blocks[i].status.valid) 2980 continue; 2981 /* displays are handled in phase1 */ 2982 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2983 continue; 2984 /* PSP lost connection when err_event_athub occurs */ 2985 if (amdgpu_ras_intr_triggered() && 2986 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2987 adev->ip_blocks[i].status.hw = false; 2988 continue; 2989 } 2990 2991 /* skip unnecessary suspend if we do not initialize them yet */ 2992 if (adev->gmc.xgmi.pending_reset && 2993 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2995 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2996 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2997 adev->ip_blocks[i].status.hw = false; 2998 continue; 2999 } 3000 3001 /* skip suspend of gfx/mes and psp for S0ix 3002 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3003 * like at runtime. PSP is also part of the always on hardware 3004 * so no need to suspend it. 3005 */ 3006 if (adev->in_s0ix && 3007 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3008 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3009 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3010 continue; 3011 3012 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3013 if (adev->in_s0ix && 3014 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3015 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3016 continue; 3017 3018 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3019 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3020 * from this location and RLC Autoload automatically also gets loaded 3021 * from here based on PMFW -> PSP message during re-init sequence. 3022 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3023 * the TMR and reload FWs again for IMU enabled APU ASICs. 
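 * In short: on IMU-enabled APUs the firmware images already live in the TMR
 * and are reloaded from there by PSP-TOS and RLC autoload across a reset, so
 * suspending PSP here would only destroy the TMR and force a full reload.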
3024 */ 3025 if (amdgpu_in_reset(adev) && 3026 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3027 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3028 continue; 3029 3030 /* XXX handle errors */ 3031 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3032 /* XXX handle errors */ 3033 if (r) { 3034 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3035 adev->ip_blocks[i].version->funcs->name, r); 3036 } 3037 adev->ip_blocks[i].status.hw = false; 3038 /* handle putting the SMC in the appropriate state */ 3039 if (!amdgpu_sriov_vf(adev)) { 3040 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3041 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3042 if (r) { 3043 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3044 adev->mp1_state, r); 3045 return r; 3046 } 3047 } 3048 } 3049 } 3050 3051 return 0; 3052 } 3053 3054 /** 3055 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3056 * 3057 * @adev: amdgpu_device pointer 3058 * 3059 * Main suspend function for hardware IPs. The list of all the hardware 3060 * IPs that make up the asic is walked, clockgating is disabled and the 3061 * suspend callbacks are run. suspend puts the hardware and software state 3062 * in each IP into a state suitable for suspend. 3063 * Returns 0 on success, negative error code on failure. 3064 */ 3065 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3066 { 3067 int r; 3068 3069 if (amdgpu_sriov_vf(adev)) { 3070 amdgpu_virt_fini_data_exchange(adev); 3071 amdgpu_virt_request_full_gpu(adev, false); 3072 } 3073 3074 r = amdgpu_device_ip_suspend_phase1(adev); 3075 if (r) 3076 return r; 3077 r = amdgpu_device_ip_suspend_phase2(adev); 3078 3079 if (amdgpu_sriov_vf(adev)) 3080 amdgpu_virt_release_full_gpu(adev, false); 3081 3082 return r; 3083 } 3084 3085 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3086 { 3087 int i, r; 3088 3089 static enum amd_ip_block_type ip_order[] = { 3090 AMD_IP_BLOCK_TYPE_COMMON, 3091 AMD_IP_BLOCK_TYPE_GMC, 3092 AMD_IP_BLOCK_TYPE_PSP, 3093 AMD_IP_BLOCK_TYPE_IH, 3094 }; 3095 3096 for (i = 0; i < adev->num_ip_blocks; i++) { 3097 int j; 3098 struct amdgpu_ip_block *block; 3099 3100 block = &adev->ip_blocks[i]; 3101 block->status.hw = false; 3102 3103 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3104 3105 if (block->version->type != ip_order[j] || 3106 !block->status.valid) 3107 continue; 3108 3109 r = block->version->funcs->hw_init(adev); 3110 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3111 if (r) 3112 return r; 3113 block->status.hw = true; 3114 } 3115 } 3116 3117 return 0; 3118 } 3119 3120 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3121 { 3122 int i, r; 3123 3124 static enum amd_ip_block_type ip_order[] = { 3125 AMD_IP_BLOCK_TYPE_SMC, 3126 AMD_IP_BLOCK_TYPE_DCE, 3127 AMD_IP_BLOCK_TYPE_GFX, 3128 AMD_IP_BLOCK_TYPE_SDMA, 3129 AMD_IP_BLOCK_TYPE_MES, 3130 AMD_IP_BLOCK_TYPE_UVD, 3131 AMD_IP_BLOCK_TYPE_VCE, 3132 AMD_IP_BLOCK_TYPE_VCN, 3133 AMD_IP_BLOCK_TYPE_JPEG 3134 }; 3135 3136 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3137 int j; 3138 struct amdgpu_ip_block *block; 3139 3140 for (j = 0; j < adev->num_ip_blocks; j++) { 3141 block = &adev->ip_blocks[j]; 3142 3143 if (block->version->type != ip_order[i] || 3144 !block->status.valid || 3145 block->status.hw) 3146 continue; 3147 3148 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3149 r = block->version->funcs->resume(adev); 3150 else 3151 r = block->version->funcs->hw_init(adev); 3152 3153 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3154 if (r) 3155 return r; 3156 block->status.hw = true; 3157 } 3158 } 3159 3160 return 0; 3161 } 3162 3163 /** 3164 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3165 * 3166 * @adev: amdgpu_device pointer 3167 * 3168 * First resume function for hardware IPs. The list of all the hardware 3169 * IPs that make up the asic is walked and the resume callbacks are run for 3170 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3171 * after a suspend and updates the software state as necessary. This 3172 * function is also used for restoring the GPU after a GPU reset. 3173 * Returns 0 on success, negative error code on failure. 3174 */ 3175 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3176 { 3177 int i, r; 3178 3179 for (i = 0; i < adev->num_ip_blocks; i++) { 3180 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3181 continue; 3182 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3183 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3184 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3185 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3186 3187 r = adev->ip_blocks[i].version->funcs->resume(adev); 3188 if (r) { 3189 DRM_ERROR("resume of IP block <%s> failed %d\n", 3190 adev->ip_blocks[i].version->funcs->name, r); 3191 return r; 3192 } 3193 adev->ip_blocks[i].status.hw = true; 3194 } 3195 } 3196 3197 return 0; 3198 } 3199 3200 /** 3201 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3202 * 3203 * @adev: amdgpu_device pointer 3204 * 3205 * First resume function for hardware IPs. The list of all the hardware 3206 * IPs that make up the asic is walked and the resume callbacks are run for 3207 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3208 * functional state after a suspend and updates the software state as 3209 * necessary. This function is also used for restoring the GPU after a GPU 3210 * reset. 3211 * Returns 0 on success, negative error code on failure. 3212 */ 3213 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3214 { 3215 int i, r; 3216 3217 for (i = 0; i < adev->num_ip_blocks; i++) { 3218 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3219 continue; 3220 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3221 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3222 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3223 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3224 continue; 3225 r = adev->ip_blocks[i].version->funcs->resume(adev); 3226 if (r) { 3227 DRM_ERROR("resume of IP block <%s> failed %d\n", 3228 adev->ip_blocks[i].version->funcs->name, r); 3229 return r; 3230 } 3231 adev->ip_blocks[i].status.hw = true; 3232 } 3233 3234 return 0; 3235 } 3236 3237 /** 3238 * amdgpu_device_ip_resume - run resume for hardware IPs 3239 * 3240 * @adev: amdgpu_device pointer 3241 * 3242 * Main resume function for hardware IPs. The hardware IPs 3243 * are split into two resume functions because they are 3244 * also used in recovering from a GPU reset and some additional 3245 * steps need to be take between them. In this case (S3/S4) they are 3246 * run sequentially. 3247 * Returns 0 on success, negative error code on failure. 
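 *
 * Concretely, resume phase 1 brings back COMMON, GMC and IH (plus PSP under
 * SR-IOV), amdgpu_device_fw_loading() then reloads firmware, and resume
 * phase 2 handles the remaining blocks.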
3248 */ 3249 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3250 { 3251 int r; 3252 3253 r = amdgpu_device_ip_resume_phase1(adev); 3254 if (r) 3255 return r; 3256 3257 r = amdgpu_device_fw_loading(adev); 3258 if (r) 3259 return r; 3260 3261 r = amdgpu_device_ip_resume_phase2(adev); 3262 3263 return r; 3264 } 3265 3266 /** 3267 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3268 * 3269 * @adev: amdgpu_device pointer 3270 * 3271 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3272 */ 3273 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3274 { 3275 if (amdgpu_sriov_vf(adev)) { 3276 if (adev->is_atom_fw) { 3277 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3278 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3279 } else { 3280 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3281 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3282 } 3283 3284 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3285 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3286 } 3287 } 3288 3289 /** 3290 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3291 * 3292 * @asic_type: AMD asic type 3293 * 3294 * Check if there is DC (new modesetting infrastructre) support for an asic. 3295 * returns true if DC has support, false if not. 3296 */ 3297 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3298 { 3299 switch (asic_type) { 3300 #ifdef CONFIG_DRM_AMDGPU_SI 3301 case CHIP_HAINAN: 3302 #endif 3303 case CHIP_TOPAZ: 3304 /* chips with no display hardware */ 3305 return false; 3306 #if defined(CONFIG_DRM_AMD_DC) 3307 case CHIP_TAHITI: 3308 case CHIP_PITCAIRN: 3309 case CHIP_VERDE: 3310 case CHIP_OLAND: 3311 /* 3312 * We have systems in the wild with these ASICs that require 3313 * LVDS and VGA support which is not supported with DC. 3314 * 3315 * Fallback to the non-DC driver here by default so as not to 3316 * cause regressions. 3317 */ 3318 #if defined(CONFIG_DRM_AMD_DC_SI) 3319 return amdgpu_dc > 0; 3320 #else 3321 return false; 3322 #endif 3323 case CHIP_BONAIRE: 3324 case CHIP_KAVERI: 3325 case CHIP_KABINI: 3326 case CHIP_MULLINS: 3327 /* 3328 * We have systems in the wild with these ASICs that require 3329 * VGA support which is not supported with DC. 3330 * 3331 * Fallback to the non-DC driver here by default so as not to 3332 * cause regressions. 
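 *
 * (Setting the amdgpu.dc module parameter to 1 opts these ASICs back in
 * to DC.)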
3333 */ 3334 return amdgpu_dc > 0; 3335 default: 3336 return amdgpu_dc != 0; 3337 #else 3338 default: 3339 if (amdgpu_dc > 0) 3340 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3341 return false; 3342 #endif 3343 } 3344 } 3345 3346 /** 3347 * amdgpu_device_has_dc_support - check if dc is supported 3348 * 3349 * @adev: amdgpu_device pointer 3350 * 3351 * Returns true for supported, false for not supported 3352 */ 3353 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3354 { 3355 if (adev->enable_virtual_display || 3356 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3357 return false; 3358 3359 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3360 } 3361 3362 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3363 { 3364 struct amdgpu_device *adev = 3365 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3366 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3367 3368 /* It's a bug to not have a hive within this function */ 3369 if (WARN_ON(!hive)) 3370 return; 3371 3372 /* 3373 * Use task barrier to synchronize all xgmi reset works across the 3374 * hive. task_barrier_enter and task_barrier_exit will block 3375 * until all the threads running the xgmi reset works reach 3376 * those points. task_barrier_full will do both blocks. 3377 */ 3378 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3379 3380 task_barrier_enter(&hive->tb); 3381 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3382 3383 if (adev->asic_reset_res) 3384 goto fail; 3385 3386 task_barrier_exit(&hive->tb); 3387 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3388 3389 if (adev->asic_reset_res) 3390 goto fail; 3391 3392 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3393 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3394 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3395 } else { 3396 3397 task_barrier_full(&hive->tb); 3398 adev->asic_reset_res = amdgpu_asic_reset(adev); 3399 } 3400 3401 fail: 3402 if (adev->asic_reset_res) 3403 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3404 adev->asic_reset_res, adev_to_drm(adev)->unique); 3405 amdgpu_put_xgmi_hive(hive); 3406 } 3407 3408 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3409 { 3410 char *input = amdgpu_lockup_timeout; 3411 char *timeout_setting = NULL; 3412 int index = 0; 3413 long timeout; 3414 int ret = 0; 3415 3416 /* 3417 * By default timeout for non compute jobs is 10000 3418 * and 60000 for compute jobs. 3419 * In SR-IOV or passthrough mode, timeout for compute 3420 * jobs are 60000 by default. 3421 */ 3422 adev->gfx_timeout = msecs_to_jiffies(10000); 3423 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3424 if (amdgpu_sriov_vf(adev)) 3425 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3426 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3427 else 3428 adev->compute_timeout = msecs_to_jiffies(60000); 3429 3430 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3431 while ((timeout_setting = strsep(&input, ",")) && 3432 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3433 ret = kstrtol(timeout_setting, 0, &timeout); 3434 if (ret) 3435 return ret; 3436 3437 if (timeout == 0) { 3438 index++; 3439 continue; 3440 } else if (timeout < 0) { 3441 timeout = MAX_SCHEDULE_TIMEOUT; 3442 dev_warn(adev->dev, "lockup timeout disabled"); 3443 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3444 } else { 3445 timeout = msecs_to_jiffies(timeout); 3446 } 3447 3448 switch (index++) { 3449 case 0: 3450 adev->gfx_timeout = timeout; 3451 break; 3452 case 1: 3453 adev->compute_timeout = timeout; 3454 break; 3455 case 2: 3456 adev->sdma_timeout = timeout; 3457 break; 3458 case 3: 3459 adev->video_timeout = timeout; 3460 break; 3461 default: 3462 break; 3463 } 3464 } 3465 /* 3466 * There is only one value specified and 3467 * it should apply to all non-compute jobs. 3468 */ 3469 if (index == 1) { 3470 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3471 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3472 adev->compute_timeout = adev->gfx_timeout; 3473 } 3474 } 3475 3476 return ret; 3477 } 3478 3479 /** 3480 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3481 * 3482 * @adev: amdgpu_device pointer 3483 * 3484 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3485 */ 3486 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3487 { 3488 struct iommu_domain *domain; 3489 3490 domain = iommu_get_domain_for_dev(adev->dev); 3491 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3492 adev->ram_is_direct_mapped = true; 3493 } 3494 3495 static const struct attribute *amdgpu_dev_attributes[] = { 3496 &dev_attr_pcie_replay_count.attr, 3497 NULL 3498 }; 3499 3500 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3501 { 3502 if (amdgpu_mcbp == 1) 3503 adev->gfx.mcbp = true; 3504 else if (amdgpu_mcbp == 0) 3505 adev->gfx.mcbp = false; 3506 else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && 3507 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && 3508 adev->gfx.num_gfx_rings) 3509 adev->gfx.mcbp = true; 3510 3511 if (amdgpu_sriov_vf(adev)) 3512 adev->gfx.mcbp = true; 3513 3514 if (adev->gfx.mcbp) 3515 DRM_INFO("MCBP is enabled\n"); 3516 } 3517 3518 /** 3519 * amdgpu_device_init - initialize the driver 3520 * 3521 * @adev: amdgpu_device pointer 3522 * @flags: driver flags 3523 * 3524 * Initializes the driver info and hw (all asics). 3525 * Returns 0 for success or an error on failure. 3526 * Called at driver startup. 
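 *
 * This sets up the adev bookkeeping (locks, lists, deferred work), maps the
 * register BAR, runs the IP init stages described above, and finally
 * registers the sysfs attributes and VGA/switcheroo clients for the device.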
3527 */ 3528 int amdgpu_device_init(struct amdgpu_device *adev, 3529 uint32_t flags) 3530 { 3531 struct drm_device *ddev = adev_to_drm(adev); 3532 struct pci_dev *pdev = adev->pdev; 3533 int r, i; 3534 bool px = false; 3535 u32 max_MBps; 3536 int tmp; 3537 3538 adev->shutdown = false; 3539 adev->flags = flags; 3540 3541 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3542 adev->asic_type = amdgpu_force_asic_type; 3543 else 3544 adev->asic_type = flags & AMD_ASIC_MASK; 3545 3546 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3547 if (amdgpu_emu_mode == 1) 3548 adev->usec_timeout *= 10; 3549 adev->gmc.gart_size = 512 * 1024 * 1024; 3550 adev->accel_working = false; 3551 adev->num_rings = 0; 3552 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3553 adev->mman.buffer_funcs = NULL; 3554 adev->mman.buffer_funcs_ring = NULL; 3555 adev->vm_manager.vm_pte_funcs = NULL; 3556 adev->vm_manager.vm_pte_num_scheds = 0; 3557 adev->gmc.gmc_funcs = NULL; 3558 adev->harvest_ip_mask = 0x0; 3559 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3560 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3561 3562 adev->smc_rreg = &amdgpu_invalid_rreg; 3563 adev->smc_wreg = &amdgpu_invalid_wreg; 3564 adev->pcie_rreg = &amdgpu_invalid_rreg; 3565 adev->pcie_wreg = &amdgpu_invalid_wreg; 3566 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3567 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3568 adev->pciep_rreg = &amdgpu_invalid_rreg; 3569 adev->pciep_wreg = &amdgpu_invalid_wreg; 3570 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3571 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3572 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3573 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3574 adev->didt_rreg = &amdgpu_invalid_rreg; 3575 adev->didt_wreg = &amdgpu_invalid_wreg; 3576 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3577 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3578 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3579 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3580 3581 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3582 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3583 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3584 3585 /* mutex initialization are all done here so we 3586 * can recall function without having locking issues 3587 */ 3588 mutex_init(&adev->firmware.mutex); 3589 mutex_init(&adev->pm.mutex); 3590 mutex_init(&adev->gfx.gpu_clock_mutex); 3591 mutex_init(&adev->srbm_mutex); 3592 mutex_init(&adev->gfx.pipe_reserve_mutex); 3593 mutex_init(&adev->gfx.gfx_off_mutex); 3594 mutex_init(&adev->gfx.partition_mutex); 3595 mutex_init(&adev->grbm_idx_mutex); 3596 mutex_init(&adev->mn_lock); 3597 mutex_init(&adev->virt.vf_errors.lock); 3598 hash_init(adev->mn_hash); 3599 mutex_init(&adev->psp.mutex); 3600 mutex_init(&adev->notifier_lock); 3601 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3602 mutex_init(&adev->benchmark_mutex); 3603 3604 amdgpu_device_init_apu_flags(adev); 3605 3606 r = amdgpu_device_check_arguments(adev); 3607 if (r) 3608 return r; 3609 3610 spin_lock_init(&adev->mmio_idx_lock); 3611 spin_lock_init(&adev->smc_idx_lock); 3612 spin_lock_init(&adev->pcie_idx_lock); 3613 spin_lock_init(&adev->uvd_ctx_idx_lock); 3614 spin_lock_init(&adev->didt_idx_lock); 3615 spin_lock_init(&adev->gc_cac_idx_lock); 3616 spin_lock_init(&adev->se_cac_idx_lock); 3617 spin_lock_init(&adev->audio_endpt_idx_lock); 3618 spin_lock_init(&adev->mm_stats.lock); 3619 3620 
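	/* Object lists and deferred work items used later during init, reset and fini */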
INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (rate-limited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Register mapping */
	/* TODO: block userspace mapping of io registers */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before the XGMI hive is
	 * discovered (if any) and initialized, so that the reset semaphore and
	 * in-GPU-reset flag can be used early during init and before calling RREG32.
3674 */ 3675 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3676 if (!adev->reset_domain) 3677 return -ENOMEM; 3678 3679 /* detect hw virtualization here */ 3680 amdgpu_detect_virtualization(adev); 3681 3682 amdgpu_device_get_pcie_info(adev); 3683 3684 r = amdgpu_device_get_job_timeout_settings(adev); 3685 if (r) { 3686 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3687 return r; 3688 } 3689 3690 /* early init functions */ 3691 r = amdgpu_device_ip_early_init(adev); 3692 if (r) 3693 return r; 3694 3695 amdgpu_device_set_mcbp(adev); 3696 3697 /* Get rid of things like offb */ 3698 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3699 if (r) 3700 return r; 3701 3702 /* Enable TMZ based on IP_VERSION */ 3703 amdgpu_gmc_tmz_set(adev); 3704 3705 amdgpu_gmc_noretry_set(adev); 3706 /* Need to get xgmi info early to decide the reset behavior*/ 3707 if (adev->gmc.xgmi.supported) { 3708 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3709 if (r) 3710 return r; 3711 } 3712 3713 /* enable PCIE atomic ops */ 3714 if (amdgpu_sriov_vf(adev)) { 3715 if (adev->virt.fw_reserve.p_pf2vf) 3716 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3717 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3718 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3719 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3720 * internal path natively support atomics, set have_atomics_support to true. 3721 */ 3722 } else if ((adev->flags & AMD_IS_APU) && 3723 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { 3724 adev->have_atomics_support = true; 3725 } else { 3726 adev->have_atomics_support = 3727 !pci_enable_atomic_ops_to_root(adev->pdev, 3728 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3729 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3730 } 3731 3732 if (!adev->have_atomics_support) 3733 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3734 3735 /* doorbell bar mapping and doorbell index init*/ 3736 amdgpu_doorbell_init(adev); 3737 3738 if (amdgpu_emu_mode == 1) { 3739 /* post the asic on emulation mode */ 3740 emu_soc_asic_init(adev); 3741 goto fence_driver_init; 3742 } 3743 3744 amdgpu_reset_init(adev); 3745 3746 /* detect if we are with an SRIOV vbios */ 3747 if (adev->bios) 3748 amdgpu_device_detect_sriov_bios(adev); 3749 3750 /* check if we need to reset the asic 3751 * E.g., driver was not cleanly unloaded previously, etc. 3752 */ 3753 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3754 if (adev->gmc.xgmi.num_physical_nodes) { 3755 dev_info(adev->dev, "Pending hive reset.\n"); 3756 adev->gmc.xgmi.pending_reset = true; 3757 /* Only need to init necessary block for SMU to handle the reset */ 3758 for (i = 0; i < adev->num_ip_blocks; i++) { 3759 if (!adev->ip_blocks[i].status.valid) 3760 continue; 3761 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3762 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3763 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3764 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3765 DRM_DEBUG("IP %s disabled for hw_init.\n", 3766 adev->ip_blocks[i].version->funcs->name); 3767 adev->ip_blocks[i].status.hw = true; 3768 } 3769 } 3770 } else { 3771 tmp = amdgpu_reset_method; 3772 /* It should do a default reset when loading or reloading the driver, 3773 * regardless of the module parameter reset_method. 
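 * The current value is saved in tmp above and restored right after the
 * reset call below.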
3774 */ 3775 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3776 r = amdgpu_asic_reset(adev); 3777 amdgpu_reset_method = tmp; 3778 if (r) { 3779 dev_err(adev->dev, "asic reset on init failed\n"); 3780 goto failed; 3781 } 3782 } 3783 } 3784 3785 /* Post card if necessary */ 3786 if (amdgpu_device_need_post(adev)) { 3787 if (!adev->bios) { 3788 dev_err(adev->dev, "no vBIOS found\n"); 3789 r = -EINVAL; 3790 goto failed; 3791 } 3792 DRM_INFO("GPU posting now...\n"); 3793 r = amdgpu_device_asic_init(adev); 3794 if (r) { 3795 dev_err(adev->dev, "gpu post error!\n"); 3796 goto failed; 3797 } 3798 } 3799 3800 if (adev->bios) { 3801 if (adev->is_atom_fw) { 3802 /* Initialize clocks */ 3803 r = amdgpu_atomfirmware_get_clock_info(adev); 3804 if (r) { 3805 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3806 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3807 goto failed; 3808 } 3809 } else { 3810 /* Initialize clocks */ 3811 r = amdgpu_atombios_get_clock_info(adev); 3812 if (r) { 3813 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3814 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3815 goto failed; 3816 } 3817 /* init i2c buses */ 3818 if (!amdgpu_device_has_dc_support(adev)) 3819 amdgpu_atombios_i2c_init(adev); 3820 } 3821 } 3822 3823 fence_driver_init: 3824 /* Fence driver */ 3825 r = amdgpu_fence_driver_sw_init(adev); 3826 if (r) { 3827 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3828 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3829 goto failed; 3830 } 3831 3832 /* init the mode config */ 3833 drm_mode_config_init(adev_to_drm(adev)); 3834 3835 r = amdgpu_device_ip_init(adev); 3836 if (r) { 3837 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3838 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3839 goto release_ras_con; 3840 } 3841 3842 amdgpu_fence_driver_hw_init(adev); 3843 3844 dev_info(adev->dev, 3845 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3846 adev->gfx.config.max_shader_engines, 3847 adev->gfx.config.max_sh_per_se, 3848 adev->gfx.config.max_cu_per_sh, 3849 adev->gfx.cu_info.number); 3850 3851 adev->accel_working = true; 3852 3853 amdgpu_vm_check_compute_bug(adev); 3854 3855 /* Initialize the buffer migration limit. */ 3856 if (amdgpu_moverate >= 0) 3857 max_MBps = amdgpu_moverate; 3858 else 3859 max_MBps = 8; /* Allow 8 MB/s. */ 3860 /* Get a log2 for easy divisions. */ 3861 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3862 3863 r = amdgpu_atombios_sysfs_init(adev); 3864 if (r) 3865 drm_err(&adev->ddev, 3866 "registering atombios sysfs failed (%d).\n", r); 3867 3868 r = amdgpu_pm_sysfs_init(adev); 3869 if (r) 3870 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3871 3872 r = amdgpu_ucode_sysfs_init(adev); 3873 if (r) { 3874 adev->ucode_sysfs_en = false; 3875 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3876 } else 3877 adev->ucode_sysfs_en = true; 3878 3879 /* 3880 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3881 * Otherwise the mgpu fan boost feature will be skipped due to the 3882 * gpu instance is counted less. 3883 */ 3884 amdgpu_register_gpu_instance(adev); 3885 3886 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3887 * explicit gating rather than handling it automatically. 
3888 */ 3889 if (!adev->gmc.xgmi.pending_reset) { 3890 r = amdgpu_device_ip_late_init(adev); 3891 if (r) { 3892 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3893 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3894 goto release_ras_con; 3895 } 3896 /* must succeed. */ 3897 amdgpu_ras_resume(adev); 3898 queue_delayed_work(system_wq, &adev->delayed_init_work, 3899 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3900 } 3901 3902 if (amdgpu_sriov_vf(adev)) { 3903 amdgpu_virt_release_full_gpu(adev, true); 3904 flush_delayed_work(&adev->delayed_init_work); 3905 } 3906 3907 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3908 if (r) 3909 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3910 3911 amdgpu_fru_sysfs_init(adev); 3912 3913 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3914 r = amdgpu_pmu_init(adev); 3915 if (r) 3916 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3917 3918 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3919 if (amdgpu_device_cache_pci_state(adev->pdev)) 3920 pci_restore_state(pdev); 3921 3922 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3923 /* this will fail for cards that aren't VGA class devices, just 3924 * ignore it 3925 */ 3926 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3927 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3928 3929 px = amdgpu_device_supports_px(ddev); 3930 3931 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3932 apple_gmux_detect(NULL, NULL))) 3933 vga_switcheroo_register_client(adev->pdev, 3934 &amdgpu_switcheroo_ops, px); 3935 3936 if (px) 3937 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3938 3939 if (adev->gmc.xgmi.pending_reset) 3940 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3941 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3942 3943 amdgpu_device_check_iommu_direct_map(adev); 3944 3945 return 0; 3946 3947 release_ras_con: 3948 if (amdgpu_sriov_vf(adev)) 3949 amdgpu_virt_release_full_gpu(adev, true); 3950 3951 /* failed in exclusive mode due to timeout */ 3952 if (amdgpu_sriov_vf(adev) && 3953 !amdgpu_sriov_runtime(adev) && 3954 amdgpu_virt_mmio_blocked(adev) && 3955 !amdgpu_virt_wait_reset(adev)) { 3956 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3957 /* Don't send request since VF is inactive. */ 3958 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3959 adev->virt.ops = NULL; 3960 r = -EAGAIN; 3961 } 3962 amdgpu_release_ras_context(adev); 3963 3964 failed: 3965 amdgpu_vf_error_trans_all(adev); 3966 3967 return r; 3968 } 3969 3970 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3971 { 3972 3973 /* Clear all CPU mappings pointing to this device */ 3974 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3975 3976 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3977 amdgpu_doorbell_fini(adev); 3978 3979 iounmap(adev->rmmio); 3980 adev->rmmio = NULL; 3981 if (adev->mman.aper_base_kaddr) 3982 iounmap(adev->mman.aper_base_kaddr); 3983 adev->mman.aper_base_kaddr = NULL; 3984 3985 /* Memory manager related */ 3986 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 3987 arch_phys_wc_del(adev->gmc.vram_mtrr); 3988 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3989 } 3990 } 3991 3992 /** 3993 * amdgpu_device_fini_hw - tear down the driver 3994 * 3995 * @adev: amdgpu_device pointer 3996 * 3997 * Tear down the driver info (all asics). 3998 * Called at driver shutdown. 
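* Only the hardware-facing state is torn down here; the remaining software state is released later by amdgpu_device_fini_sw().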
3999 */ 4000 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4001 { 4002 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4003 flush_delayed_work(&adev->delayed_init_work); 4004 adev->shutdown = true; 4005 4006 /* make sure IB test finished before entering exclusive mode 4007 * to avoid preemption on IB test 4008 */ 4009 if (amdgpu_sriov_vf(adev)) { 4010 amdgpu_virt_request_full_gpu(adev, false); 4011 amdgpu_virt_fini_data_exchange(adev); 4012 } 4013 4014 /* disable all interrupts */ 4015 amdgpu_irq_disable_all(adev); 4016 if (adev->mode_info.mode_config_initialized) { 4017 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4018 drm_helper_force_disable_all(adev_to_drm(adev)); 4019 else 4020 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4021 } 4022 amdgpu_fence_driver_hw_fini(adev); 4023 4024 if (adev->mman.initialized) 4025 drain_workqueue(adev->mman.bdev.wq); 4026 4027 if (adev->pm.sysfs_initialized) 4028 amdgpu_pm_sysfs_fini(adev); 4029 if (adev->ucode_sysfs_en) 4030 amdgpu_ucode_sysfs_fini(adev); 4031 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4032 amdgpu_fru_sysfs_fini(adev); 4033 4034 /* disable ras feature must before hw fini */ 4035 amdgpu_ras_pre_fini(adev); 4036 4037 amdgpu_device_ip_fini_early(adev); 4038 4039 amdgpu_irq_fini_hw(adev); 4040 4041 if (adev->mman.initialized) 4042 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4043 4044 amdgpu_gart_dummy_page_fini(adev); 4045 4046 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4047 amdgpu_device_unmap_mmio(adev); 4048 4049 } 4050 4051 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4052 { 4053 int idx; 4054 bool px; 4055 4056 amdgpu_fence_driver_sw_fini(adev); 4057 amdgpu_device_ip_fini(adev); 4058 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4059 adev->accel_working = false; 4060 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4061 4062 amdgpu_reset_fini(adev); 4063 4064 /* free i2c buses */ 4065 if (!amdgpu_device_has_dc_support(adev)) 4066 amdgpu_i2c_fini(adev); 4067 4068 if (amdgpu_emu_mode != 1) 4069 amdgpu_atombios_fini(adev); 4070 4071 kfree(adev->bios); 4072 adev->bios = NULL; 4073 4074 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4075 4076 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4077 apple_gmux_detect(NULL, NULL))) 4078 vga_switcheroo_unregister_client(adev->pdev); 4079 4080 if (px) 4081 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4082 4083 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4084 vga_client_unregister(adev->pdev); 4085 4086 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4087 4088 iounmap(adev->rmmio); 4089 adev->rmmio = NULL; 4090 amdgpu_doorbell_fini(adev); 4091 drm_dev_exit(idx); 4092 } 4093 4094 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4095 amdgpu_pmu_fini(adev); 4096 if (adev->mman.discovery_bin) 4097 amdgpu_discovery_fini(adev); 4098 4099 amdgpu_reset_put_reset_domain(adev->reset_domain); 4100 adev->reset_domain = NULL; 4101 4102 kfree(adev->pci_state); 4103 4104 } 4105 4106 /** 4107 * amdgpu_device_evict_resources - evict device resources 4108 * @adev: amdgpu device object 4109 * 4110 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4111 * of the vram memory type. Mainly used for evicting device resources 4112 * at suspend time. 
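* On APUs the eviction is skipped for S3/s2idle, as their VRAM is carved out of system memory.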
4113 * 4114 */ 4115 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4116 { 4117 int ret; 4118 4119 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4120 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4121 return 0; 4122 4123 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4124 if (ret) 4125 DRM_WARN("evicting device resources failed\n"); 4126 return ret; 4127 } 4128 4129 /* 4130 * Suspend & resume. 4131 */ 4132 /** 4133 * amdgpu_device_suspend - initiate device suspend 4134 * 4135 * @dev: drm dev pointer 4136 * @fbcon : notify the fbdev of suspend 4137 * 4138 * Puts the hw in the suspend state (all asics). 4139 * Returns 0 for success or an error on failure. 4140 * Called at driver suspend. 4141 */ 4142 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4143 { 4144 struct amdgpu_device *adev = drm_to_adev(dev); 4145 int r = 0; 4146 4147 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4148 return 0; 4149 4150 adev->in_suspend = true; 4151 4152 /* Evict the majority of BOs before grabbing the full access */ 4153 r = amdgpu_device_evict_resources(adev); 4154 if (r) 4155 return r; 4156 4157 if (amdgpu_sriov_vf(adev)) { 4158 amdgpu_virt_fini_data_exchange(adev); 4159 r = amdgpu_virt_request_full_gpu(adev, false); 4160 if (r) 4161 return r; 4162 } 4163 4164 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4165 DRM_WARN("smart shift update failed\n"); 4166 4167 if (fbcon) 4168 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4169 4170 cancel_delayed_work_sync(&adev->delayed_init_work); 4171 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4172 4173 amdgpu_ras_suspend(adev); 4174 4175 amdgpu_device_ip_suspend_phase1(adev); 4176 4177 if (!adev->in_s0ix) 4178 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4179 4180 r = amdgpu_device_evict_resources(adev); 4181 if (r) 4182 return r; 4183 4184 amdgpu_fence_driver_hw_fini(adev); 4185 4186 amdgpu_device_ip_suspend_phase2(adev); 4187 4188 if (amdgpu_sriov_vf(adev)) 4189 amdgpu_virt_release_full_gpu(adev, false); 4190 4191 return 0; 4192 } 4193 4194 /** 4195 * amdgpu_device_resume - initiate device resume 4196 * 4197 * @dev: drm dev pointer 4198 * @fbcon : notify the fbdev of resume 4199 * 4200 * Bring the hw back to operating state (all asics). 4201 * Returns 0 for success or an error on failure. 4202 * Called at driver resume. 
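* Under SR-IOV, full GPU access is requested from the host before any hardware is touched and released again once resume has completed.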
4203 */ 4204 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4205 { 4206 struct amdgpu_device *adev = drm_to_adev(dev); 4207 int r = 0; 4208 4209 if (amdgpu_sriov_vf(adev)) { 4210 r = amdgpu_virt_request_full_gpu(adev, true); 4211 if (r) 4212 return r; 4213 } 4214 4215 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4216 return 0; 4217 4218 if (adev->in_s0ix) 4219 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4220 4221 /* post card */ 4222 if (amdgpu_device_need_post(adev)) { 4223 r = amdgpu_device_asic_init(adev); 4224 if (r) 4225 dev_err(adev->dev, "amdgpu asic init failed\n"); 4226 } 4227 4228 r = amdgpu_device_ip_resume(adev); 4229 4230 if (r) { 4231 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4232 goto exit; 4233 } 4234 amdgpu_fence_driver_hw_init(adev); 4235 4236 r = amdgpu_device_ip_late_init(adev); 4237 if (r) 4238 goto exit; 4239 4240 queue_delayed_work(system_wq, &adev->delayed_init_work, 4241 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4242 4243 if (!adev->in_s0ix) { 4244 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4245 if (r) 4246 goto exit; 4247 } 4248 4249 exit: 4250 if (amdgpu_sriov_vf(adev)) { 4251 amdgpu_virt_init_data_exchange(adev); 4252 amdgpu_virt_release_full_gpu(adev, true); 4253 } 4254 4255 if (r) 4256 return r; 4257 4258 /* Make sure IB tests flushed */ 4259 flush_delayed_work(&adev->delayed_init_work); 4260 4261 if (fbcon) 4262 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4263 4264 amdgpu_ras_resume(adev); 4265 4266 if (adev->mode_info.num_crtc) { 4267 /* 4268 * Most of the connector probing functions try to acquire runtime pm 4269 * refs to ensure that the GPU is powered on when connector polling is 4270 * performed. Since we're calling this from a runtime PM callback, 4271 * trying to acquire rpm refs will cause us to deadlock. 4272 * 4273 * Since we're guaranteed to be holding the rpm lock, it's safe to 4274 * temporarily disable the rpm helpers so this doesn't deadlock us. 4275 */ 4276 #ifdef CONFIG_PM 4277 dev->dev->power.disable_depth++; 4278 #endif 4279 if (!adev->dc_enabled) 4280 drm_helper_hpd_irq_event(dev); 4281 else 4282 drm_kms_helper_hotplug_event(dev); 4283 #ifdef CONFIG_PM 4284 dev->dev->power.disable_depth--; 4285 #endif 4286 } 4287 adev->in_suspend = false; 4288 4289 if (adev->enable_mes) 4290 amdgpu_mes_self_test(adev); 4291 4292 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4293 DRM_WARN("smart shift update failed\n"); 4294 4295 return 0; 4296 } 4297 4298 /** 4299 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4300 * 4301 * @adev: amdgpu_device pointer 4302 * 4303 * The list of all the hardware IPs that make up the asic is walked and 4304 * the check_soft_reset callbacks are run. check_soft_reset determines 4305 * if the asic is still hung or not. 4306 * Returns true if any of the IPs are still in a hung state, false if not. 
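* SR-IOV VFs and ASICs that require a full reset are always reported as hung.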
4307 */ 4308 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4309 { 4310 int i; 4311 bool asic_hang = false; 4312 4313 if (amdgpu_sriov_vf(adev)) 4314 return true; 4315 4316 if (amdgpu_asic_need_full_reset(adev)) 4317 return true; 4318 4319 for (i = 0; i < adev->num_ip_blocks; i++) { 4320 if (!adev->ip_blocks[i].status.valid) 4321 continue; 4322 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4323 adev->ip_blocks[i].status.hang = 4324 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4325 if (adev->ip_blocks[i].status.hang) { 4326 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4327 asic_hang = true; 4328 } 4329 } 4330 return asic_hang; 4331 } 4332 4333 /** 4334 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4335 * 4336 * @adev: amdgpu_device pointer 4337 * 4338 * The list of all the hardware IPs that make up the asic is walked and the 4339 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4340 * handles any IP specific hardware or software state changes that are 4341 * necessary for a soft reset to succeed. 4342 * Returns 0 on success, negative error code on failure. 4343 */ 4344 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4345 { 4346 int i, r = 0; 4347 4348 for (i = 0; i < adev->num_ip_blocks; i++) { 4349 if (!adev->ip_blocks[i].status.valid) 4350 continue; 4351 if (adev->ip_blocks[i].status.hang && 4352 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4353 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4354 if (r) 4355 return r; 4356 } 4357 } 4358 4359 return 0; 4360 } 4361 4362 /** 4363 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4364 * 4365 * @adev: amdgpu_device pointer 4366 * 4367 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4368 * reset is necessary to recover. 4369 * Returns true if a full asic reset is required, false if not. 4370 */ 4371 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4372 { 4373 int i; 4374 4375 if (amdgpu_asic_need_full_reset(adev)) 4376 return true; 4377 4378 for (i = 0; i < adev->num_ip_blocks; i++) { 4379 if (!adev->ip_blocks[i].status.valid) 4380 continue; 4381 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4382 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4383 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4384 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4385 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4386 if (adev->ip_blocks[i].status.hang) { 4387 dev_info(adev->dev, "Some block need full reset!\n"); 4388 return true; 4389 } 4390 } 4391 } 4392 return false; 4393 } 4394 4395 /** 4396 * amdgpu_device_ip_soft_reset - do a soft reset 4397 * 4398 * @adev: amdgpu_device pointer 4399 * 4400 * The list of all the hardware IPs that make up the asic is walked and the 4401 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4402 * IP specific hardware or software state changes that are necessary to soft 4403 * reset the IP. 4404 * Returns 0 on success, negative error code on failure. 
4405 */ 4406 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4407 { 4408 int i, r = 0; 4409 4410 for (i = 0; i < adev->num_ip_blocks; i++) { 4411 if (!adev->ip_blocks[i].status.valid) 4412 continue; 4413 if (adev->ip_blocks[i].status.hang && 4414 adev->ip_blocks[i].version->funcs->soft_reset) { 4415 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4416 if (r) 4417 return r; 4418 } 4419 } 4420 4421 return 0; 4422 } 4423 4424 /** 4425 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4426 * 4427 * @adev: amdgpu_device pointer 4428 * 4429 * The list of all the hardware IPs that make up the asic is walked and the 4430 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4431 * handles any IP specific hardware or software state changes that are 4432 * necessary after the IP has been soft reset. 4433 * Returns 0 on success, negative error code on failure. 4434 */ 4435 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4436 { 4437 int i, r = 0; 4438 4439 for (i = 0; i < adev->num_ip_blocks; i++) { 4440 if (!adev->ip_blocks[i].status.valid) 4441 continue; 4442 if (adev->ip_blocks[i].status.hang && 4443 adev->ip_blocks[i].version->funcs->post_soft_reset) 4444 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4445 if (r) 4446 return r; 4447 } 4448 4449 return 0; 4450 } 4451 4452 /** 4453 * amdgpu_device_recover_vram - Recover some VRAM contents 4454 * 4455 * @adev: amdgpu_device pointer 4456 * 4457 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4458 * restore things like GPUVM page tables after a GPU reset where 4459 * the contents of VRAM might be lost. 4460 * 4461 * Returns: 4462 * 0 on success, negative error code on failure. 4463 */ 4464 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4465 { 4466 struct dma_fence *fence = NULL, *next = NULL; 4467 struct amdgpu_bo *shadow; 4468 struct amdgpu_bo_vm *vmbo; 4469 long r = 1, tmo; 4470 4471 if (amdgpu_sriov_runtime(adev)) 4472 tmo = msecs_to_jiffies(8000); 4473 else 4474 tmo = msecs_to_jiffies(100); 4475 4476 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4477 mutex_lock(&adev->shadow_list_lock); 4478 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4479 /* If vm is compute context or adev is APU, shadow will be NULL */ 4480 if (!vmbo->shadow) 4481 continue; 4482 shadow = vmbo->shadow; 4483 4484 /* No need to recover an evicted BO */ 4485 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4486 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4487 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4488 continue; 4489 4490 r = amdgpu_bo_restore_shadow(shadow, &next); 4491 if (r) 4492 break; 4493 4494 if (fence) { 4495 tmo = dma_fence_wait_timeout(fence, false, tmo); 4496 dma_fence_put(fence); 4497 fence = next; 4498 if (tmo == 0) { 4499 r = -ETIMEDOUT; 4500 break; 4501 } else if (tmo < 0) { 4502 r = tmo; 4503 break; 4504 } 4505 } else { 4506 fence = next; 4507 } 4508 } 4509 mutex_unlock(&adev->shadow_list_lock); 4510 4511 if (fence) 4512 tmo = dma_fence_wait_timeout(fence, false, tmo); 4513 dma_fence_put(fence); 4514 4515 if (r < 0 || tmo <= 0) { 4516 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4517 return -EIO; 4518 } 4519 4520 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4521 return 0; 4522 } 4523 4524 4525 /** 4526 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4527 * 4528 * @adev: amdgpu_device pointer 4529 * 
@from_hypervisor: request from hypervisor 4530 * 4531 * Do a VF FLR and reinitialize the ASIC. 4532 * Returns 0 on success, an error code otherwise. 4533 */ 4534 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4535 bool from_hypervisor) 4536 { 4537 int r; 4538 struct amdgpu_hive_info *hive = NULL; 4539 int retry_limit = 0; 4540 4541 retry: 4542 amdgpu_amdkfd_pre_reset(adev); 4543 4544 if (from_hypervisor) 4545 r = amdgpu_virt_request_full_gpu(adev, true); 4546 else 4547 r = amdgpu_virt_reset_gpu(adev); 4548 if (r) 4549 return r; 4550 amdgpu_irq_gpu_reset_resume_helper(adev); 4551 4552 /* some SW cleanup the VF needs to do before recovery */ 4553 amdgpu_virt_post_reset(adev); 4554 4555 /* Resume IP prior to SMC */ 4556 r = amdgpu_device_ip_reinit_early_sriov(adev); 4557 if (r) 4558 goto error; 4559 4560 amdgpu_virt_init_data_exchange(adev); 4561 4562 r = amdgpu_device_fw_loading(adev); 4563 if (r) 4564 return r; 4565 4566 /* now we are okay to resume SMC/CP/SDMA */ 4567 r = amdgpu_device_ip_reinit_late_sriov(adev); 4568 if (r) 4569 goto error; 4570 4571 hive = amdgpu_get_xgmi_hive(adev); 4572 /* Update PSP FW topology after reset */ 4573 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4574 r = amdgpu_xgmi_update_topology(hive, adev); 4575 4576 if (hive) 4577 amdgpu_put_xgmi_hive(hive); 4578 4579 if (!r) { 4580 r = amdgpu_ib_ring_tests(adev); 4581 4582 amdgpu_amdkfd_post_reset(adev); 4583 } 4584 4585 error: 4586 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4587 amdgpu_inc_vram_lost(adev); 4588 r = amdgpu_device_recover_vram(adev); 4589 } 4590 amdgpu_virt_release_full_gpu(adev, true); 4591 4592 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4593 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4594 retry_limit++; 4595 goto retry; 4596 } else 4597 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4598 } 4599 4600 return r; 4601 } 4602 4603 /** 4604 * amdgpu_device_has_job_running - check if there is any job in the mirror list 4605 * 4606 * @adev: amdgpu_device pointer 4607 * 4608 * Check whether any ring still has a job in its pending (mirror) list. 4609 */ 4610 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4611 { 4612 int i; 4613 struct drm_sched_job *job; 4614 4615 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4616 struct amdgpu_ring *ring = adev->rings[i]; 4617 4618 if (!ring || !ring->sched.thread) 4619 continue; 4620 4621 spin_lock(&ring->sched.job_list_lock); 4622 job = list_first_entry_or_null(&ring->sched.pending_list, 4623 struct drm_sched_job, list); 4624 spin_unlock(&ring->sched.job_list_lock); 4625 if (job) 4626 return true; 4627 } 4628 return false; 4629 } 4630 4631 /** 4632 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4633 * 4634 * @adev: amdgpu_device pointer 4635 * 4636 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4637 * a hung GPU.
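* Returns true if recovery should be attempted, false if it is disabled.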
4638 */ 4639 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4640 { 4641 4642 if (amdgpu_gpu_recovery == 0) 4643 goto disabled; 4644 4645 /* Skip soft reset check in fatal error mode */ 4646 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4647 return true; 4648 4649 if (amdgpu_sriov_vf(adev)) 4650 return true; 4651 4652 if (amdgpu_gpu_recovery == -1) { 4653 switch (adev->asic_type) { 4654 #ifdef CONFIG_DRM_AMDGPU_SI 4655 case CHIP_VERDE: 4656 case CHIP_TAHITI: 4657 case CHIP_PITCAIRN: 4658 case CHIP_OLAND: 4659 case CHIP_HAINAN: 4660 #endif 4661 #ifdef CONFIG_DRM_AMDGPU_CIK 4662 case CHIP_KAVERI: 4663 case CHIP_KABINI: 4664 case CHIP_MULLINS: 4665 #endif 4666 case CHIP_CARRIZO: 4667 case CHIP_STONEY: 4668 case CHIP_CYAN_SKILLFISH: 4669 goto disabled; 4670 default: 4671 break; 4672 } 4673 } 4674 4675 return true; 4676 4677 disabled: 4678 dev_info(adev->dev, "GPU recovery disabled.\n"); 4679 return false; 4680 } 4681 4682 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4683 { 4684 u32 i; 4685 int ret = 0; 4686 4687 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4688 4689 dev_info(adev->dev, "GPU mode1 reset\n"); 4690 4691 /* disable BM */ 4692 pci_clear_master(adev->pdev); 4693 4694 amdgpu_device_cache_pci_state(adev->pdev); 4695 4696 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4697 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4698 ret = amdgpu_dpm_mode1_reset(adev); 4699 } else { 4700 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4701 ret = psp_gpu_reset(adev); 4702 } 4703 4704 if (ret) 4705 goto mode1_reset_failed; 4706 4707 amdgpu_device_load_pci_state(adev->pdev); 4708 ret = amdgpu_psp_wait_for_bootloader(adev); 4709 if (ret) 4710 goto mode1_reset_failed; 4711 4712 /* wait for asic to come out of reset */ 4713 for (i = 0; i < adev->usec_timeout; i++) { 4714 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4715 4716 if (memsize != 0xffffffff) 4717 break; 4718 udelay(1); 4719 } 4720 4721 if (i >= adev->usec_timeout) { 4722 ret = -ETIMEDOUT; 4723 goto mode1_reset_failed; 4724 } 4725 4726 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4727 4728 return 0; 4729 4730 mode1_reset_failed: 4731 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4732 return ret; 4733 } 4734 4735 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4736 struct amdgpu_reset_context *reset_context) 4737 { 4738 int i, r = 0; 4739 struct amdgpu_job *job = NULL; 4740 bool need_full_reset = 4741 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4742 4743 if (reset_context->reset_req_dev == adev) 4744 job = reset_context->job; 4745 4746 if (amdgpu_sriov_vf(adev)) { 4747 /* stop the data exchange thread */ 4748 amdgpu_virt_fini_data_exchange(adev); 4749 } 4750 4751 amdgpu_fence_driver_isr_toggle(adev, true); 4752 4753 /* block all schedulers and reset given job's ring */ 4754 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4755 struct amdgpu_ring *ring = adev->rings[i]; 4756 4757 if (!ring || !ring->sched.thread) 4758 continue; 4759 4760 /* Clear job fence from fence drv to avoid force_completion 4761 * leave NULL and vm flush fence in fence drv 4762 */ 4763 amdgpu_fence_driver_clear_job_fences(ring); 4764 4765 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4766 amdgpu_fence_driver_force_completion(ring); 4767 } 4768 4769 amdgpu_fence_driver_isr_toggle(adev, false); 4770 4771 if (job && job->vm) 4772 drm_sched_increase_karma(&job->base); 4773 4774 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4775 /* If reset handler not 
implemented, continue; otherwise return */ 4776 if (r == -EOPNOTSUPP) 4777 r = 0; 4778 else 4779 return r; 4780 4781 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4782 if (!amdgpu_sriov_vf(adev)) { 4783 4784 if (!need_full_reset) 4785 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4786 4787 if (!need_full_reset && amdgpu_gpu_recovery && 4788 amdgpu_device_ip_check_soft_reset(adev)) { 4789 amdgpu_device_ip_pre_soft_reset(adev); 4790 r = amdgpu_device_ip_soft_reset(adev); 4791 amdgpu_device_ip_post_soft_reset(adev); 4792 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4793 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4794 need_full_reset = true; 4795 } 4796 } 4797 4798 if (need_full_reset) 4799 r = amdgpu_device_ip_suspend(adev); 4800 if (need_full_reset) 4801 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4802 else 4803 clear_bit(AMDGPU_NEED_FULL_RESET, 4804 &reset_context->flags); 4805 } 4806 4807 return r; 4808 } 4809 4810 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4811 { 4812 int i; 4813 4814 lockdep_assert_held(&adev->reset_domain->sem); 4815 4816 for (i = 0; i < adev->num_regs; i++) { 4817 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4818 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4819 adev->reset_dump_reg_value[i]); 4820 } 4821 4822 return 0; 4823 } 4824 4825 #ifdef CONFIG_DEV_COREDUMP 4826 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4827 size_t count, void *data, size_t datalen) 4828 { 4829 struct drm_printer p; 4830 struct amdgpu_device *adev = data; 4831 struct drm_print_iterator iter; 4832 int i; 4833 4834 iter.data = buffer; 4835 iter.offset = 0; 4836 iter.start = offset; 4837 iter.remain = count; 4838 4839 p = drm_coredump_printer(&iter); 4840 4841 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4842 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4843 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4844 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4845 if (adev->reset_task_info.pid) 4846 drm_printf(&p, "process_name: %s PID: %d\n", 4847 adev->reset_task_info.process_name, 4848 adev->reset_task_info.pid); 4849 4850 if (adev->reset_vram_lost) 4851 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4852 if (adev->num_regs) { 4853 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4854 4855 for (i = 0; i < adev->num_regs; i++) 4856 drm_printf(&p, "0x%08x: 0x%08x\n", 4857 adev->reset_dump_reg_list[i], 4858 adev->reset_dump_reg_value[i]); 4859 } 4860 4861 return count - iter.remain; 4862 } 4863 4864 static void amdgpu_devcoredump_free(void *data) 4865 { 4866 } 4867 4868 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4869 { 4870 struct drm_device *dev = adev_to_drm(adev); 4871 4872 ktime_get_ts64(&adev->reset_time); 4873 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4874 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4875 } 4876 #endif 4877 4878 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4879 struct amdgpu_reset_context *reset_context) 4880 { 4881 struct amdgpu_device *tmp_adev = NULL; 4882 bool need_full_reset, skip_hw_reset, vram_lost = false; 4883 int r = 0; 4884 bool gpu_reset_for_dev_remove = 0; 4885 4886 /* Try reset handler method first */ 4887 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4888 reset_list); 4889 amdgpu_reset_reg_dumps(tmp_adev); 4890 4891 
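/* Hand the complete device list to the reset handler so hive-aware handlers can walk every node */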
reset_context->reset_device_list = device_list_handle; 4892 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4893 /* If reset handler not implemented, continue; otherwise return */ 4894 if (r == -EOPNOTSUPP) 4895 r = 0; 4896 else 4897 return r; 4898 4899 /* Reset handler not implemented, use the default method */ 4900 need_full_reset = 4901 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4902 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4903 4904 gpu_reset_for_dev_remove = 4905 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4906 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4907 4908 /* 4909 * ASIC reset has to be done on all XGMI hive nodes ASAP 4910 * to allow proper links negotiation in FW (within 1 sec) 4911 */ 4912 if (!skip_hw_reset && need_full_reset) { 4913 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4914 /* For XGMI run all resets in parallel to speed up the process */ 4915 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4916 tmp_adev->gmc.xgmi.pending_reset = false; 4917 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4918 r = -EALREADY; 4919 } else 4920 r = amdgpu_asic_reset(tmp_adev); 4921 4922 if (r) { 4923 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4924 r, adev_to_drm(tmp_adev)->unique); 4925 break; 4926 } 4927 } 4928 4929 /* For XGMI wait for all resets to complete before proceed */ 4930 if (!r) { 4931 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4932 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4933 flush_work(&tmp_adev->xgmi_reset_work); 4934 r = tmp_adev->asic_reset_res; 4935 if (r) 4936 break; 4937 } 4938 } 4939 } 4940 } 4941 4942 if (!r && amdgpu_ras_intr_triggered()) { 4943 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4944 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4945 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4946 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4947 } 4948 4949 amdgpu_ras_intr_cleared(); 4950 } 4951 4952 /* Since the mode1 reset affects base ip blocks, the 4953 * phase1 ip blocks need to be resumed. Otherwise there 4954 * will be a BIOS signature error and the psp bootloader 4955 * can't load kdb on the next amdgpu install. 
4956 */ 4957 if (gpu_reset_for_dev_remove) { 4958 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4959 amdgpu_device_ip_resume_phase1(tmp_adev); 4960 4961 goto end; 4962 } 4963 4964 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4965 if (need_full_reset) { 4966 /* post card */ 4967 r = amdgpu_device_asic_init(tmp_adev); 4968 if (r) { 4969 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4970 } else { 4971 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4972 4973 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4974 if (r) 4975 goto out; 4976 4977 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4978 #ifdef CONFIG_DEV_COREDUMP 4979 tmp_adev->reset_vram_lost = vram_lost; 4980 memset(&tmp_adev->reset_task_info, 0, 4981 sizeof(tmp_adev->reset_task_info)); 4982 if (reset_context->job && reset_context->job->vm) 4983 tmp_adev->reset_task_info = 4984 reset_context->job->vm->task_info; 4985 amdgpu_reset_capture_coredumpm(tmp_adev); 4986 #endif 4987 if (vram_lost) { 4988 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4989 amdgpu_inc_vram_lost(tmp_adev); 4990 } 4991 4992 r = amdgpu_device_fw_loading(tmp_adev); 4993 if (r) 4994 return r; 4995 4996 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4997 if (r) 4998 goto out; 4999 5000 if (vram_lost) 5001 amdgpu_device_fill_reset_magic(tmp_adev); 5002 5003 /* 5004 * Add this ASIC back to the tracked list, since the 5005 * reset completed successfully. 5006 */ 5007 amdgpu_register_gpu_instance(tmp_adev); 5008 5009 if (!reset_context->hive && 5010 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5011 amdgpu_xgmi_add_device(tmp_adev); 5012 5013 r = amdgpu_device_ip_late_init(tmp_adev); 5014 if (r) 5015 goto out; 5016 5017 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5018 5019 /* 5020 * The GPU enters a bad state once the number of 5021 * faulty pages flagged by ECC reaches the threshold, 5022 * and RAS recovery is scheduled next. So add a check 5023 * here to abort recovery if the bad page threshold 5024 * has indeed been exceeded, and remind the user to 5025 * either retire this GPU or set a bigger 5026 * bad_page_threshold value the next time the driver 5027 * is probed. 5028 */ 5029 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5030 /* must succeed.
*/ 5031 amdgpu_ras_resume(tmp_adev); 5032 } else { 5033 r = -EINVAL; 5034 goto out; 5035 } 5036 5037 /* Update PSP FW topology after reset */ 5038 if (reset_context->hive && 5039 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5040 r = amdgpu_xgmi_update_topology( 5041 reset_context->hive, tmp_adev); 5042 } 5043 } 5044 5045 out: 5046 if (!r) { 5047 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5048 r = amdgpu_ib_ring_tests(tmp_adev); 5049 if (r) { 5050 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5051 need_full_reset = true; 5052 r = -EAGAIN; 5053 goto end; 5054 } 5055 } 5056 5057 if (!r) 5058 r = amdgpu_device_recover_vram(tmp_adev); 5059 else 5060 tmp_adev->asic_reset_res = r; 5061 } 5062 5063 end: 5064 if (need_full_reset) 5065 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5066 else 5067 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5068 return r; 5069 } 5070 5071 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5072 { 5073 5074 switch (amdgpu_asic_reset_method(adev)) { 5075 case AMD_RESET_METHOD_MODE1: 5076 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5077 break; 5078 case AMD_RESET_METHOD_MODE2: 5079 adev->mp1_state = PP_MP1_STATE_RESET; 5080 break; 5081 default: 5082 adev->mp1_state = PP_MP1_STATE_NONE; 5083 break; 5084 } 5085 } 5086 5087 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5088 { 5089 amdgpu_vf_error_trans_all(adev); 5090 adev->mp1_state = PP_MP1_STATE_NONE; 5091 } 5092 5093 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5094 { 5095 struct pci_dev *p = NULL; 5096 5097 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5098 adev->pdev->bus->number, 1); 5099 if (p) { 5100 pm_runtime_enable(&(p->dev)); 5101 pm_runtime_resume(&(p->dev)); 5102 } 5103 5104 pci_dev_put(p); 5105 } 5106 5107 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5108 { 5109 enum amd_reset_method reset_method; 5110 struct pci_dev *p = NULL; 5111 u64 expires; 5112 5113 /* 5114 * For now, only BACO and mode1 reset are confirmed 5115 * to suffer the audio issue without proper suspended. 5116 */ 5117 reset_method = amdgpu_asic_reset_method(adev); 5118 if ((reset_method != AMD_RESET_METHOD_BACO) && 5119 (reset_method != AMD_RESET_METHOD_MODE1)) 5120 return -EINVAL; 5121 5122 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5123 adev->pdev->bus->number, 1); 5124 if (!p) 5125 return -ENODEV; 5126 5127 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5128 if (!expires) 5129 /* 5130 * If we cannot get the audio device autosuspend delay, 5131 * a fixed 4S interval will be used. Considering 3S is 5132 * the audio controller default autosuspend delay setting. 5133 * 4S used here is guaranteed to cover that. 5134 */ 5135 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5136 5137 while (!pm_runtime_status_suspended(&(p->dev))) { 5138 if (!pm_runtime_suspend(&(p->dev))) 5139 break; 5140 5141 if (expires < ktime_get_mono_fast_ns()) { 5142 dev_warn(adev->dev, "failed to suspend display audio\n"); 5143 pci_dev_put(p); 5144 /* TODO: abort the succeeding gpu reset? 
*/ 5145 return -ETIMEDOUT; 5146 } 5147 } 5148 5149 pm_runtime_disable(&(p->dev)); 5150 5151 pci_dev_put(p); 5152 return 0; 5153 } 5154 5155 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5156 { 5157 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5158 5159 #if defined(CONFIG_DEBUG_FS) 5160 if (!amdgpu_sriov_vf(adev)) 5161 cancel_work(&adev->reset_work); 5162 #endif 5163 5164 if (adev->kfd.dev) 5165 cancel_work(&adev->kfd.reset_work); 5166 5167 if (amdgpu_sriov_vf(adev)) 5168 cancel_work(&adev->virt.flr_work); 5169 5170 if (con && adev->ras_enabled) 5171 cancel_work(&con->recovery_work); 5172 5173 } 5174 5175 /** 5176 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5177 * 5178 * @adev: amdgpu_device pointer 5179 * @job: which job trigger hang 5180 * @reset_context: amdgpu reset context pointer 5181 * 5182 * Attempt to reset the GPU if it has hung (all asics). 5183 * Attempt to do soft-reset or full-reset and reinitialize Asic 5184 * Returns 0 for success or an error on failure. 5185 */ 5186 5187 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5188 struct amdgpu_job *job, 5189 struct amdgpu_reset_context *reset_context) 5190 { 5191 struct list_head device_list, *device_list_handle = NULL; 5192 bool job_signaled = false; 5193 struct amdgpu_hive_info *hive = NULL; 5194 struct amdgpu_device *tmp_adev = NULL; 5195 int i, r = 0; 5196 bool need_emergency_restart = false; 5197 bool audio_suspended = false; 5198 bool gpu_reset_for_dev_remove = false; 5199 5200 gpu_reset_for_dev_remove = 5201 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5202 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5203 5204 /* 5205 * Special case: RAS triggered and full reset isn't supported 5206 */ 5207 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5208 5209 /* 5210 * Flush RAM to disk so that after reboot 5211 * the user can read log and see why the system rebooted. 5212 */ 5213 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5214 DRM_WARN("Emergency reboot."); 5215 5216 ksys_sync_helper(); 5217 emergency_restart(); 5218 } 5219 5220 dev_info(adev->dev, "GPU %s begin!\n", 5221 need_emergency_restart ? "jobs stop":"reset"); 5222 5223 if (!amdgpu_sriov_vf(adev)) 5224 hive = amdgpu_get_xgmi_hive(adev); 5225 if (hive) 5226 mutex_lock(&hive->hive_lock); 5227 5228 reset_context->job = job; 5229 reset_context->hive = hive; 5230 /* 5231 * Build list of devices to reset. 5232 * In case we are in XGMI hive mode, resort the device list 5233 * to put adev in the 1st position. 
5234 */ 5235 INIT_LIST_HEAD(&device_list); 5236 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5237 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5238 list_add_tail(&tmp_adev->reset_list, &device_list); 5239 if (gpu_reset_for_dev_remove && adev->shutdown) 5240 tmp_adev->shutdown = true; 5241 } 5242 if (!list_is_first(&adev->reset_list, &device_list)) 5243 list_rotate_to_front(&adev->reset_list, &device_list); 5244 device_list_handle = &device_list; 5245 } else { 5246 list_add_tail(&adev->reset_list, &device_list); 5247 device_list_handle = &device_list; 5248 } 5249 5250 /* We need to lock reset domain only once both for XGMI and single device */ 5251 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5252 reset_list); 5253 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5254 5255 /* block all schedulers and reset given job's ring */ 5256 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5257 5258 amdgpu_device_set_mp1_state(tmp_adev); 5259 5260 /* 5261 * Try to put the audio codec into suspend state 5262 * before gpu reset started. 5263 * 5264 * Due to the power domain of the graphics device 5265 * is shared with AZ power domain. Without this, 5266 * we may change the audio hardware from behind 5267 * the audio driver's back. That will trigger 5268 * some audio codec errors. 5269 */ 5270 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5271 audio_suspended = true; 5272 5273 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5274 5275 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5276 5277 if (!amdgpu_sriov_vf(tmp_adev)) 5278 amdgpu_amdkfd_pre_reset(tmp_adev); 5279 5280 /* 5281 * Mark these ASICs to be reseted as untracked first 5282 * And add them back after reset completed 5283 */ 5284 amdgpu_unregister_gpu_instance(tmp_adev); 5285 5286 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5287 5288 /* disable ras on ALL IPs */ 5289 if (!need_emergency_restart && 5290 amdgpu_device_ip_need_full_reset(tmp_adev)) 5291 amdgpu_ras_suspend(tmp_adev); 5292 5293 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5294 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5295 5296 if (!ring || !ring->sched.thread) 5297 continue; 5298 5299 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5300 5301 if (need_emergency_restart) 5302 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5303 } 5304 atomic_inc(&tmp_adev->gpu_reset_counter); 5305 } 5306 5307 if (need_emergency_restart) 5308 goto skip_sched_resume; 5309 5310 /* 5311 * Must check guilty signal here since after this point all old 5312 * HW fences are force signaled. 5313 * 5314 * job->base holds a reference to parent fence 5315 */ 5316 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5317 job_signaled = true; 5318 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5319 goto skip_hw_reset; 5320 } 5321 5322 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5323 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5324 if (gpu_reset_for_dev_remove) { 5325 /* Workaroud for ASICs need to disable SMC first */ 5326 amdgpu_device_smu_fini_early(tmp_adev); 5327 } 5328 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5329 /*TODO Should we stop ?*/ 5330 if (r) { 5331 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5332 r, adev_to_drm(tmp_adev)->unique); 5333 tmp_adev->asic_reset_res = r; 5334 } 5335 5336 /* 5337 * Drop all pending non scheduler resets. 
Scheduler resets 5338 * were already dropped during drm_sched_stop 5339 */ 5340 amdgpu_device_stop_pending_resets(tmp_adev); 5341 } 5342 5343 /* Actual ASIC resets if needed.*/ 5344 /* Host driver will handle XGMI hive reset for SRIOV */ 5345 if (amdgpu_sriov_vf(adev)) { 5346 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5347 if (r) 5348 adev->asic_reset_res = r; 5349 5350 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5351 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || 5352 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) 5353 amdgpu_ras_resume(adev); 5354 } else { 5355 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5356 if (r && r == -EAGAIN) 5357 goto retry; 5358 5359 if (!r && gpu_reset_for_dev_remove) 5360 goto recover_end; 5361 } 5362 5363 skip_hw_reset: 5364 5365 /* Post ASIC reset for all devs .*/ 5366 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5367 5368 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5369 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5370 5371 if (!ring || !ring->sched.thread) 5372 continue; 5373 5374 drm_sched_start(&ring->sched, true); 5375 } 5376 5377 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5378 amdgpu_mes_self_test(tmp_adev); 5379 5380 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5381 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5382 5383 if (tmp_adev->asic_reset_res) 5384 r = tmp_adev->asic_reset_res; 5385 5386 tmp_adev->asic_reset_res = 0; 5387 5388 if (r) { 5389 /* bad news, how to tell it to userspace ? */ 5390 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5391 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5392 } else { 5393 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5394 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5395 DRM_WARN("smart shift update failed\n"); 5396 } 5397 } 5398 5399 skip_sched_resume: 5400 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5401 /* unlock kfd: SRIOV would do it separately */ 5402 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5403 amdgpu_amdkfd_post_reset(tmp_adev); 5404 5405 /* kfd_post_reset will do nothing if kfd device is not initialized, 5406 * need to bring up kfd here if it's not be initialized before 5407 */ 5408 if (!adev->kfd.init_complete) 5409 amdgpu_amdkfd_device_init(adev); 5410 5411 if (audio_suspended) 5412 amdgpu_device_resume_display_audio(tmp_adev); 5413 5414 amdgpu_device_unset_mp1_state(tmp_adev); 5415 5416 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5417 } 5418 5419 recover_end: 5420 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5421 reset_list); 5422 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5423 5424 if (hive) { 5425 mutex_unlock(&hive->hive_lock); 5426 amdgpu_put_xgmi_hive(hive); 5427 } 5428 5429 if (r) 5430 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5431 5432 atomic_set(&adev->reset_domain->reset_res, r); 5433 return r; 5434 } 5435 5436 /** 5437 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5438 * 5439 * @adev: amdgpu_device pointer 5440 * 5441 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5442 * and lanes) of the slot the device is in. Handles APUs and 5443 * virtualized environments where PCIE config space may not be available. 
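* The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters override the values detected here.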
5444 */ 5445 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5446 { 5447 struct pci_dev *pdev; 5448 enum pci_bus_speed speed_cap, platform_speed_cap; 5449 enum pcie_link_width platform_link_width; 5450 5451 if (amdgpu_pcie_gen_cap) 5452 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5453 5454 if (amdgpu_pcie_lane_cap) 5455 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5456 5457 /* covers APUs as well */ 5458 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5459 if (adev->pm.pcie_gen_mask == 0) 5460 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5461 if (adev->pm.pcie_mlw_mask == 0) 5462 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5463 return; 5464 } 5465 5466 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5467 return; 5468 5469 pcie_bandwidth_available(adev->pdev, NULL, 5470 &platform_speed_cap, &platform_link_width); 5471 5472 if (adev->pm.pcie_gen_mask == 0) { 5473 /* asic caps */ 5474 pdev = adev->pdev; 5475 speed_cap = pcie_get_speed_cap(pdev); 5476 if (speed_cap == PCI_SPEED_UNKNOWN) { 5477 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5478 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5479 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5480 } else { 5481 if (speed_cap == PCIE_SPEED_32_0GT) 5482 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5483 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5484 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5485 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5486 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5487 else if (speed_cap == PCIE_SPEED_16_0GT) 5488 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5489 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5490 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5491 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5492 else if (speed_cap == PCIE_SPEED_8_0GT) 5493 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5494 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5495 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5496 else if (speed_cap == PCIE_SPEED_5_0GT) 5497 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5498 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5499 else 5500 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5501 } 5502 /* platform caps */ 5503 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5504 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5505 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5506 } else { 5507 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5508 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5509 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5510 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5511 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5512 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5513 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5514 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5515 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5516 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5517 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5518 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5519 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5520 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5521 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5522 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5523 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5524 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5525 else 5526 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5527 5528 } 5529 } 5530 if (adev->pm.pcie_mlw_mask == 0) { 5531 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) 
{ 5532 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5533 } else { 5534 switch (platform_link_width) { 5535 case PCIE_LNK_X32: 5536 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5537 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5538 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5540 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5541 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5542 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5543 break; 5544 case PCIE_LNK_X16: 5545 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5547 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5548 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5549 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5550 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5551 break; 5552 case PCIE_LNK_X12: 5553 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5554 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5555 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5556 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5557 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5558 break; 5559 case PCIE_LNK_X8: 5560 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5561 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5562 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5563 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5564 break; 5565 case PCIE_LNK_X4: 5566 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5567 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5568 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5569 break; 5570 case PCIE_LNK_X2: 5571 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5572 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5573 break; 5574 case PCIE_LNK_X1: 5575 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5576 break; 5577 default: 5578 break; 5579 } 5580 } 5581 } 5582 } 5583 5584 /** 5585 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5586 * 5587 * @adev: amdgpu_device pointer 5588 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5589 * 5590 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5591 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5592 * @peer_adev. 5593 */ 5594 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5595 struct amdgpu_device *peer_adev) 5596 { 5597 #ifdef CONFIG_HSA_AMD_P2P 5598 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5599 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5600 resource_size_t aper_limit = 5601 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5602 bool p2p_access = 5603 !adev->gmc.xgmi.connected_to_cpu && 5604 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5605 5606 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5607 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5608 !(adev->gmc.aper_base & address_mask || 5609 aper_limit & address_mask)); 5610 #else 5611 return false; 5612 #endif 5613 } 5614 5615 int amdgpu_device_baco_enter(struct drm_device *dev) 5616 { 5617 struct amdgpu_device *adev = drm_to_adev(dev); 5618 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5619 5620 if (!amdgpu_device_supports_baco(dev)) 5621 return -ENOTSUPP; 5622 5623 if (ras && adev->ras_enabled && 5624 adev->nbio.funcs->enable_doorbell_interrupt) 5625 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5626 5627 return amdgpu_dpm_baco_enter(adev); 5628 } 5629 5630 int amdgpu_device_baco_exit(struct drm_device *dev) 5631 { 5632 struct amdgpu_device *adev = drm_to_adev(dev); 5633 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5634 int ret = 0; 5635 5636 if (!amdgpu_device_supports_baco(dev)) 5637 return -ENOTSUPP; 5638 5639 ret = amdgpu_dpm_baco_exit(adev); 5640 if (ret) 5641 return ret; 5642 5643 if (ras && adev->ras_enabled && 5644 adev->nbio.funcs->enable_doorbell_interrupt) 5645 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5646 5647 if (amdgpu_passthrough(adev) && 5648 adev->nbio.funcs->clear_doorbell_interrupt) 5649 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5650 5651 return 0; 5652 } 5653 5654 /** 5655 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5656 * @pdev: PCI device struct 5657 * @state: PCI channel state 5658 * 5659 * Description: Called when a PCI error is detected. 5660 * 5661 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
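* PCI_ERS_RESULT_CAN_RECOVER may also be returned while the channel is still usable (pci_channel_io_normal).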

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to the GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, so there is no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}
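
/*
 * Illustrative sketch (not part of the driver) of the kind of debug dump the
 * TODO above refers to: once MMIO access is known to work again, readable
 * state can be sampled and logged before deciding on further recovery. The
 * helper name is made up for the example; it only reuses calls that already
 * appear elsewhere in this file.
 */
static inline void amdgpu_example_dump_post_error_state(struct amdgpu_device *adev)
{
	/* a readback other than 0xffffffff indicates register reads still work */
	u32 memsize = amdgpu_asic_get_config_memsize(adev);

	DRM_INFO("post-error config memsize readback: 0x%08x\n", memsize);
}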

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore the PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
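
/*
 * Illustrative sketch (not part of this file): the four callbacks above are
 * the usual set plugged into a struct pci_error_handlers instance by the
 * driver's PCI registration code; the PCI error recovery core then invokes
 * them in the order error_detected -> mmio_enabled (only if recoverable) ->
 * slot_reset -> resume. The variable name below is made up for the example.
 */
static const struct pci_error_handlers amdgpu_example_pci_err_handlers __maybe_unused = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};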

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}
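
/*
 * Illustrative sketch (not part of the driver): the usual pairing of the two
 * HDP helpers above. HDP caches CPU accesses to VRAM that go through the
 * PCIe BAR, so CPU writes are flushed before the GPU consumes them, and the
 * HDP read cache is invalidated before the CPU inspects GPU-written data.
 * The function and parameter names are made up for the example.
 */
static inline void amdgpu_example_hdp_sync(struct amdgpu_device *adev,
					   struct amdgpu_ring *ring,
					   bool cpu_wrote_vram)
{
	if (cpu_wrote_vram)
		/* make CPU writes through the BAR visible to the GPU */
		amdgpu_device_flush_hdp(adev, ring);
	else
		/* drop stale HDP read-cache contents before the CPU reads VRAM */
		amdgpu_device_invalidate_hdp(adev, ring);
}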

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. This helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system will remain stable at least for SSH
 * access, so it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device, and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
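
/*
 * Illustrative sketch (not part of the driver): the index/data pair above is
 * typically used for read-modify-write sequences on PCIe port registers.
 * The helper name is made up for the example; the register offset is left to
 * the caller so no real register name is assumed here.
 */
static inline void amdgpu_example_pcie_port_update(struct amdgpu_device *adev,
						   u32 reg, u32 clear, u32 set)
{
	u32 v = amdgpu_device_pcie_port_rreg(adev, reg);

	v = (v & ~clear) | set;
	amdgpu_device_pcie_port_wreg(adev, reg, v);
}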

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
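
/*
 * Illustrative sketch (not part of the driver): a typical caller of
 * amdgpu_device_wait_on_rreg() polls a status register until a "ready" bit
 * reads back set, giving up after the adev->usec_timeout based timeout. The
 * wrapper name and parameters are made up for the example; no real register
 * offset is assumed.
 */
static inline int amdgpu_example_wait_ready(struct amdgpu_device *adev,
					    uint32_t status_reg, uint32_t ready_bit)
{
	/* expected value equals the mask: wait until the bit is set */
	return amdgpu_device_wait_on_rreg(adev, 0, status_reg, "STATUS",
					  ready_bit, ready_bit);
}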