/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
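
/*
 * Usage sketch (illustrative only, not called anywhere in the driver):
 * reading a dword back from a VRAM offset with the helper above.
 * amdgpu_device_vram_access() first tries the CPU-visible aperture and falls
 * back to the MM_INDEX/MM_DATA window for the part that is not CPU
 * accessible:
 *
 *	uint32_t val;
 *
 *	amdgpu_device_vram_access(adev, vram_offset, &val, sizeof(val), false);
 *
 * Both @pos and @size must stay dword aligned for the MM_INDEX/MM_DATA path.
 */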

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
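
/*
 * Note on the pattern used by the indirect accessors above and below: the
 * register address is written to the PCIE index register, then the data
 * register is read or written. Each writel() is followed by a readl() of the
 * same offset purely to flush the posted write before touching the next
 * register; the _ext variants additionally program (and afterwards clear) the
 * high index register when the address does not fit in 32 bits. A caller
 * simply does, for example (illustrative address):
 *
 *	u32 v = amdgpu_device_indirect_rreg(adev, reg_addr);
 */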

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
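
/*
 * Usage sketch (illustrative only): a ring or IP block that needs a writeback
 * slot typically does something like the following. @wb comes back as a dword
 * offset into the writeback page because every slot is 256 bits (8 dwords)
 * wide:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *
 *		// hand gpu_addr to the engine, poll *cpu_ptr from the CPU ...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */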

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return adev->ip_versions[DCE_HWIP][0] >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
 * speed switching. Until we have confirmation from Intel that a specific host
 * supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
bool amdgpu_device_pcie_dynamic_switching_supported(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

bool amdgpu_device_aspm_support_quirk(void)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
#else
	return true;
#endif
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB, so we have 12 bits of offset, a minimum of
 * 9 bits in the page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("No enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
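
/*
 * Usage sketch (illustrative only): gating the clocks of every GFX IP
 * instance, e.g. from power management code:
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *
 * amdgpu_device_ip_set_powergating_state() below follows the same pattern
 * with AMD_PG_STATE_GATE / AMD_PG_STATE_UNGATE.
 */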

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
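
/*
 * Note on the virtual_display module parameter parsed below: it is a
 * semicolon separated list of PCI addresses, each optionally followed by a
 * comma and the number of crtcs to expose (clamped to 1..6), or the keyword
 * "all". For example (illustrative values):
 *
 *	amdgpu.virtual_display=0000:04:00.0,2;0000:05:00.0
 *	amdgpu.virtual_display=all
 */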
1966 */ 1967 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1968 { 1969 adev->enable_virtual_display = false; 1970 1971 if (amdgpu_virtual_display) { 1972 const char *pci_address_name = pci_name(adev->pdev); 1973 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1974 1975 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1976 pciaddstr_tmp = pciaddstr; 1977 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1978 pciaddname = strsep(&pciaddname_tmp, ","); 1979 if (!strcmp("all", pciaddname) 1980 || !strcmp(pci_address_name, pciaddname)) { 1981 long num_crtc; 1982 int res = -1; 1983 1984 adev->enable_virtual_display = true; 1985 1986 if (pciaddname_tmp) 1987 res = kstrtol(pciaddname_tmp, 10, 1988 &num_crtc); 1989 1990 if (!res) { 1991 if (num_crtc < 1) 1992 num_crtc = 1; 1993 if (num_crtc > 6) 1994 num_crtc = 6; 1995 adev->mode_info.num_crtc = num_crtc; 1996 } else { 1997 adev->mode_info.num_crtc = 1; 1998 } 1999 break; 2000 } 2001 } 2002 2003 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2004 amdgpu_virtual_display, pci_address_name, 2005 adev->enable_virtual_display, adev->mode_info.num_crtc); 2006 2007 kfree(pciaddstr); 2008 } 2009 } 2010 2011 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2012 { 2013 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2014 adev->mode_info.num_crtc = 1; 2015 adev->enable_virtual_display = true; 2016 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2017 adev->enable_virtual_display, adev->mode_info.num_crtc); 2018 } 2019 } 2020 2021 /** 2022 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2023 * 2024 * @adev: amdgpu_device pointer 2025 * 2026 * Parses the asic configuration parameters specified in the gpu info 2027 * firmware and makes them availale to the driver for use in configuring 2028 * the asic. 2029 * Returns 0 on success, -EINVAL on failure. 2030 */ 2031 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2032 { 2033 const char *chip_name; 2034 char fw_name[40]; 2035 int err; 2036 const struct gpu_info_firmware_header_v1_0 *hdr; 2037 2038 adev->firmware.gpu_info_fw = NULL; 2039 2040 if (adev->mman.discovery_bin) { 2041 /* 2042 * FIXME: The bounding box is still needed by Navi12, so 2043 * temporarily read it from gpu_info firmware. Should be dropped 2044 * when DAL no longer needs it. 
2045 */ 2046 if (adev->asic_type != CHIP_NAVI12) 2047 return 0; 2048 } 2049 2050 switch (adev->asic_type) { 2051 default: 2052 return 0; 2053 case CHIP_VEGA10: 2054 chip_name = "vega10"; 2055 break; 2056 case CHIP_VEGA12: 2057 chip_name = "vega12"; 2058 break; 2059 case CHIP_RAVEN: 2060 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2061 chip_name = "raven2"; 2062 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2063 chip_name = "picasso"; 2064 else 2065 chip_name = "raven"; 2066 break; 2067 case CHIP_ARCTURUS: 2068 chip_name = "arcturus"; 2069 break; 2070 case CHIP_NAVI12: 2071 chip_name = "navi12"; 2072 break; 2073 } 2074 2075 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2076 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2077 if (err) { 2078 dev_err(adev->dev, 2079 "Failed to get gpu_info firmware \"%s\"\n", 2080 fw_name); 2081 goto out; 2082 } 2083 2084 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2085 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2086 2087 switch (hdr->version_major) { 2088 case 1: 2089 { 2090 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2091 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2092 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2093 2094 /* 2095 * Should be droped when DAL no longer needs it. 2096 */ 2097 if (adev->asic_type == CHIP_NAVI12) 2098 goto parse_soc_bounding_box; 2099 2100 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2101 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2102 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2103 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2104 adev->gfx.config.max_texture_channel_caches = 2105 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2106 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2107 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2108 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2109 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2110 adev->gfx.config.double_offchip_lds_buf = 2111 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2112 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2113 adev->gfx.cu_info.max_waves_per_simd = 2114 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2115 adev->gfx.cu_info.max_scratch_slots_per_cu = 2116 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2117 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2118 if (hdr->version_minor >= 1) { 2119 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2120 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2121 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2122 adev->gfx.config.num_sc_per_sh = 2123 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2124 adev->gfx.config.num_packer_per_sc = 2125 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2126 } 2127 2128 parse_soc_bounding_box: 2129 /* 2130 * soc bounding box info is not integrated in disocovery table, 2131 * we always need to parse it from gpu info firmware if needed. 
2132 */ 2133 if (hdr->version_minor == 2) { 2134 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2135 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2136 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2137 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2138 } 2139 break; 2140 } 2141 default: 2142 dev_err(adev->dev, 2143 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2144 err = -EINVAL; 2145 goto out; 2146 } 2147 out: 2148 return err; 2149 } 2150 2151 /** 2152 * amdgpu_device_ip_early_init - run early init for hardware IPs 2153 * 2154 * @adev: amdgpu_device pointer 2155 * 2156 * Early initialization pass for hardware IPs. The hardware IPs that make 2157 * up each asic are discovered each IP's early_init callback is run. This 2158 * is the first stage in initializing the asic. 2159 * Returns 0 on success, negative error code on failure. 2160 */ 2161 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2162 { 2163 struct drm_device *dev = adev_to_drm(adev); 2164 struct pci_dev *parent; 2165 int i, r; 2166 bool total; 2167 2168 amdgpu_device_enable_virtual_display(adev); 2169 2170 if (amdgpu_sriov_vf(adev)) { 2171 r = amdgpu_virt_request_full_gpu(adev, true); 2172 if (r) 2173 return r; 2174 } 2175 2176 switch (adev->asic_type) { 2177 #ifdef CONFIG_DRM_AMDGPU_SI 2178 case CHIP_VERDE: 2179 case CHIP_TAHITI: 2180 case CHIP_PITCAIRN: 2181 case CHIP_OLAND: 2182 case CHIP_HAINAN: 2183 adev->family = AMDGPU_FAMILY_SI; 2184 r = si_set_ip_blocks(adev); 2185 if (r) 2186 return r; 2187 break; 2188 #endif 2189 #ifdef CONFIG_DRM_AMDGPU_CIK 2190 case CHIP_BONAIRE: 2191 case CHIP_HAWAII: 2192 case CHIP_KAVERI: 2193 case CHIP_KABINI: 2194 case CHIP_MULLINS: 2195 if (adev->flags & AMD_IS_APU) 2196 adev->family = AMDGPU_FAMILY_KV; 2197 else 2198 adev->family = AMDGPU_FAMILY_CI; 2199 2200 r = cik_set_ip_blocks(adev); 2201 if (r) 2202 return r; 2203 break; 2204 #endif 2205 case CHIP_TOPAZ: 2206 case CHIP_TONGA: 2207 case CHIP_FIJI: 2208 case CHIP_POLARIS10: 2209 case CHIP_POLARIS11: 2210 case CHIP_POLARIS12: 2211 case CHIP_VEGAM: 2212 case CHIP_CARRIZO: 2213 case CHIP_STONEY: 2214 if (adev->flags & AMD_IS_APU) 2215 adev->family = AMDGPU_FAMILY_CZ; 2216 else 2217 adev->family = AMDGPU_FAMILY_VI; 2218 2219 r = vi_set_ip_blocks(adev); 2220 if (r) 2221 return r; 2222 break; 2223 default: 2224 r = amdgpu_discovery_set_ip_blocks(adev); 2225 if (r) 2226 return r; 2227 break; 2228 } 2229 2230 if (amdgpu_has_atpx() && 2231 (amdgpu_is_atpx_hybrid() || 2232 amdgpu_has_atpx_dgpu_power_cntl()) && 2233 ((adev->flags & AMD_IS_APU) == 0) && 2234 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2235 adev->flags |= AMD_IS_PX; 2236 2237 if (!(adev->flags & AMD_IS_APU)) { 2238 parent = pci_upstream_bridge(adev->pdev); 2239 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2240 } 2241 2242 2243 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2244 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2245 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2246 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2247 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2248 2249 total = true; 2250 for (i = 0; i < adev->num_ip_blocks; i++) { 2251 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2252 DRM_WARN("disabled ip block: %d <%s>\n", 2253 i, adev->ip_blocks[i].version->funcs->name); 2254 adev->ip_blocks[i].status.valid = false; 2255 } else { 2256 if (adev->ip_blocks[i].version->funcs->early_init) { 2257 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2258 if (r == -ENOENT) { 2259 adev->ip_blocks[i].status.valid = false; 2260 } else if (r) { 2261 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2262 adev->ip_blocks[i].version->funcs->name, r); 2263 total = false; 2264 } else { 2265 adev->ip_blocks[i].status.valid = true; 2266 } 2267 } else { 2268 adev->ip_blocks[i].status.valid = true; 2269 } 2270 } 2271 /* get the vbios after the asic_funcs are set up */ 2272 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2273 r = amdgpu_device_parse_gpu_info_fw(adev); 2274 if (r) 2275 return r; 2276 2277 /* Read BIOS */ 2278 if (amdgpu_device_read_bios(adev)) { 2279 if (!amdgpu_get_bios(adev)) 2280 return -EINVAL; 2281 2282 r = amdgpu_atombios_init(adev); 2283 if (r) { 2284 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2285 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2286 return r; 2287 } 2288 } 2289 2290 /*get pf2vf msg info at it's earliest time*/ 2291 if (amdgpu_sriov_vf(adev)) 2292 amdgpu_virt_init_data_exchange(adev); 2293 2294 } 2295 } 2296 if (!total) 2297 return -ENODEV; 2298 2299 amdgpu_amdkfd_device_probe(adev); 2300 adev->cg_flags &= amdgpu_cg_mask; 2301 adev->pg_flags &= amdgpu_pg_mask; 2302 2303 return 0; 2304 } 2305 2306 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2307 { 2308 int i, r; 2309 2310 for (i = 0; i < adev->num_ip_blocks; i++) { 2311 if (!adev->ip_blocks[i].status.sw) 2312 continue; 2313 if (adev->ip_blocks[i].status.hw) 2314 continue; 2315 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2316 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2317 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2318 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2319 if (r) { 2320 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2321 adev->ip_blocks[i].version->funcs->name, r); 2322 return r; 2323 } 2324 adev->ip_blocks[i].status.hw = true; 2325 } 2326 } 2327 2328 return 0; 2329 } 2330 2331 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2332 { 2333 int i, r; 2334 2335 for (i = 0; i < adev->num_ip_blocks; i++) { 2336 if (!adev->ip_blocks[i].status.sw) 2337 continue; 2338 if (adev->ip_blocks[i].status.hw) 2339 continue; 2340 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2341 if (r) { 2342 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2343 adev->ip_blocks[i].version->funcs->name, r); 2344 return r; 2345 } 2346 adev->ip_blocks[i].status.hw = true; 2347 } 2348 2349 return 0; 2350 } 2351 2352 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2353 { 2354 int r = 0; 2355 int i; 2356 uint32_t smu_version; 2357 2358 if (adev->asic_type >= CHIP_VEGA10) { 2359 for (i = 0; i < adev->num_ip_blocks; i++) { 2360 if 
(adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2361 continue; 2362 2363 if (!adev->ip_blocks[i].status.sw) 2364 continue; 2365 2366 /* no need to do the fw loading again if already done*/ 2367 if (adev->ip_blocks[i].status.hw == true) 2368 break; 2369 2370 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2371 r = adev->ip_blocks[i].version->funcs->resume(adev); 2372 if (r) { 2373 DRM_ERROR("resume of IP block <%s> failed %d\n", 2374 adev->ip_blocks[i].version->funcs->name, r); 2375 return r; 2376 } 2377 } else { 2378 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2379 if (r) { 2380 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2381 adev->ip_blocks[i].version->funcs->name, r); 2382 return r; 2383 } 2384 } 2385 2386 adev->ip_blocks[i].status.hw = true; 2387 break; 2388 } 2389 } 2390 2391 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2392 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2393 2394 return r; 2395 } 2396 2397 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2398 { 2399 long timeout; 2400 int r, i; 2401 2402 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2403 struct amdgpu_ring *ring = adev->rings[i]; 2404 2405 /* No need to setup the GPU scheduler for rings that don't need it */ 2406 if (!ring || ring->no_scheduler) 2407 continue; 2408 2409 switch (ring->funcs->type) { 2410 case AMDGPU_RING_TYPE_GFX: 2411 timeout = adev->gfx_timeout; 2412 break; 2413 case AMDGPU_RING_TYPE_COMPUTE: 2414 timeout = adev->compute_timeout; 2415 break; 2416 case AMDGPU_RING_TYPE_SDMA: 2417 timeout = adev->sdma_timeout; 2418 break; 2419 default: 2420 timeout = adev->video_timeout; 2421 break; 2422 } 2423 2424 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2425 ring->num_hw_submission, 0, 2426 timeout, adev->reset_domain->wq, 2427 ring->sched_score, ring->name, 2428 adev->dev); 2429 if (r) { 2430 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2431 ring->name); 2432 return r; 2433 } 2434 } 2435 2436 amdgpu_xcp_update_partition_sched_list(adev); 2437 2438 return 0; 2439 } 2440 2441 2442 /** 2443 * amdgpu_device_ip_init - run init for hardware IPs 2444 * 2445 * @adev: amdgpu_device pointer 2446 * 2447 * Main initialization pass for hardware IPs. The list of all the hardware 2448 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2449 * are run. sw_init initializes the software state associated with each IP 2450 * and hw_init initializes the hardware associated with each IP. 2451 * Returns 0 on success, negative error code on failure. 
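 *
 * Note that hw_init does not run in a single pass here: the COMMON and GMC
 * blocks are brought up early (so GPU memory can be allocated), and the
 * remaining blocks are initialized in two phases with firmware loading in
 * between (see amdgpu_device_ip_hw_init_phase1/phase2 and
 * amdgpu_device_fw_loading).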
2452 */ 2453 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2454 { 2455 int i, r; 2456 2457 r = amdgpu_ras_init(adev); 2458 if (r) 2459 return r; 2460 2461 for (i = 0; i < adev->num_ip_blocks; i++) { 2462 if (!adev->ip_blocks[i].status.valid) 2463 continue; 2464 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2465 if (r) { 2466 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2467 adev->ip_blocks[i].version->funcs->name, r); 2468 goto init_failed; 2469 } 2470 adev->ip_blocks[i].status.sw = true; 2471 2472 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2473 /* need to do common hw init early so everything is set up for gmc */ 2474 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2475 if (r) { 2476 DRM_ERROR("hw_init %d failed %d\n", i, r); 2477 goto init_failed; 2478 } 2479 adev->ip_blocks[i].status.hw = true; 2480 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2481 /* need to do gmc hw init early so we can allocate gpu mem */ 2482 /* Try to reserve bad pages early */ 2483 if (amdgpu_sriov_vf(adev)) 2484 amdgpu_virt_exchange_data(adev); 2485 2486 r = amdgpu_device_mem_scratch_init(adev); 2487 if (r) { 2488 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2489 goto init_failed; 2490 } 2491 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2492 if (r) { 2493 DRM_ERROR("hw_init %d failed %d\n", i, r); 2494 goto init_failed; 2495 } 2496 r = amdgpu_device_wb_init(adev); 2497 if (r) { 2498 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2499 goto init_failed; 2500 } 2501 adev->ip_blocks[i].status.hw = true; 2502 2503 /* right after GMC hw init, we create CSA */ 2504 if (adev->gfx.mcbp) { 2505 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2506 AMDGPU_GEM_DOMAIN_VRAM | 2507 AMDGPU_GEM_DOMAIN_GTT, 2508 AMDGPU_CSA_SIZE); 2509 if (r) { 2510 DRM_ERROR("allocate CSA failed %d\n", r); 2511 goto init_failed; 2512 } 2513 } 2514 } 2515 } 2516 2517 if (amdgpu_sriov_vf(adev)) 2518 amdgpu_virt_init_data_exchange(adev); 2519 2520 r = amdgpu_ib_pool_init(adev); 2521 if (r) { 2522 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2523 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2524 goto init_failed; 2525 } 2526 2527 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2528 if (r) 2529 goto init_failed; 2530 2531 r = amdgpu_device_ip_hw_init_phase1(adev); 2532 if (r) 2533 goto init_failed; 2534 2535 r = amdgpu_device_fw_loading(adev); 2536 if (r) 2537 goto init_failed; 2538 2539 r = amdgpu_device_ip_hw_init_phase2(adev); 2540 if (r) 2541 goto init_failed; 2542 2543 /* 2544 * retired pages will be loaded from eeprom and reserved here, 2545 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2546 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2547 * for I2C communication which only true at this point. 2548 * 2549 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2550 * failure from bad gpu situation and stop amdgpu init process 2551 * accordingly. For other failed cases, it will still release all 2552 * the resource and print error message, rather than returning one 2553 * negative value to upper level. 
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired pages from being abused.
	 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	/*
	 * In case of XGMI, grab an extra reference on the reset domain for
	 * this device.
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	/* Don't init kfd if the whole hive needs to be reset during init */
	if (!adev->gmc.xgmi.pending_reset) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM were lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
			AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run.
 * On late init this pass enables clockgating for hardware IPs; on fini or
 * suspend it disables clockgating for hardware IPs.
 * Returns 0 on success, negative error code on failure.
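 *
 * The IP list is walked front to back when gating and back to front when
 * ungating, so clockgating is torn down in the reverse of the order in
 * which it was enabled.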
2664 */ 2665 2666 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2667 enum amd_clockgating_state state) 2668 { 2669 int i, j, r; 2670 2671 if (amdgpu_emu_mode == 1) 2672 return 0; 2673 2674 for (j = 0; j < adev->num_ip_blocks; j++) { 2675 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2676 if (!adev->ip_blocks[i].status.late_initialized) 2677 continue; 2678 /* skip CG for GFX, SDMA on S0ix */ 2679 if (adev->in_s0ix && 2680 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2681 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2682 continue; 2683 /* skip CG for VCE/UVD, it's handled specially */ 2684 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2685 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2686 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2687 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2688 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2689 /* enable clockgating to save power */ 2690 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2691 state); 2692 if (r) { 2693 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2694 adev->ip_blocks[i].version->funcs->name, r); 2695 return r; 2696 } 2697 } 2698 } 2699 2700 return 0; 2701 } 2702 2703 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2704 enum amd_powergating_state state) 2705 { 2706 int i, j, r; 2707 2708 if (amdgpu_emu_mode == 1) 2709 return 0; 2710 2711 for (j = 0; j < adev->num_ip_blocks; j++) { 2712 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2713 if (!adev->ip_blocks[i].status.late_initialized) 2714 continue; 2715 /* skip PG for GFX, SDMA on S0ix */ 2716 if (adev->in_s0ix && 2717 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2718 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2719 continue; 2720 /* skip CG for VCE/UVD, it's handled specially */ 2721 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2722 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2723 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2724 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2725 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2726 /* enable powergating to save power */ 2727 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2728 state); 2729 if (r) { 2730 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2731 adev->ip_blocks[i].version->funcs->name, r); 2732 return r; 2733 } 2734 } 2735 } 2736 return 0; 2737 } 2738 2739 static int amdgpu_device_enable_mgpu_fan_boost(void) 2740 { 2741 struct amdgpu_gpu_instance *gpu_ins; 2742 struct amdgpu_device *adev; 2743 int i, ret = 0; 2744 2745 mutex_lock(&mgpu_info.mutex); 2746 2747 /* 2748 * MGPU fan boost feature should be enabled 2749 * only when there are two or more dGPUs in 2750 * the system 2751 */ 2752 if (mgpu_info.num_dgpu < 2) 2753 goto out; 2754 2755 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2756 gpu_ins = &(mgpu_info.gpu_ins[i]); 2757 adev = gpu_ins->adev; 2758 if (!(adev->flags & AMD_IS_APU) && 2759 !gpu_ins->mgpu_fan_enabled) { 2760 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2761 if (ret) 2762 break; 2763 2764 gpu_ins->mgpu_fan_enabled = 1; 2765 } 2766 } 2767 2768 out: 2769 mutex_unlock(&mgpu_info.mutex); 2770 2771 return ret; 2772 } 2773 2774 /** 2775 * amdgpu_device_ip_late_init - run late init for hardware IPs 2776 * 2777 * @adev: 
amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configurations on Arcturus and Aldebaran, enable special SBR handling */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset the device p-state to low, as it was booted with the
		 * p-state high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance; it is counted one by one as the devices initialize.
		 *
		 * So we wait until all XGMI interlinked devices are initialized.
		 * This may add some delay, as those devices may come from
		 * different hives, but that should be OK.
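		 *
		 * The check below therefore only fires once mgpu_info has
		 * registered as many dGPUs as this device reports physical
		 * nodes in its hive.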
2843 */ 2844 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2845 for (i = 0; i < mgpu_info.num_gpu; i++) { 2846 gpu_instance = &(mgpu_info.gpu_ins[i]); 2847 if (gpu_instance->adev->flags & AMD_IS_APU) 2848 continue; 2849 2850 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2851 AMDGPU_XGMI_PSTATE_MIN); 2852 if (r) { 2853 DRM_ERROR("pstate setting failed (%d).\n", r); 2854 break; 2855 } 2856 } 2857 } 2858 2859 mutex_unlock(&mgpu_info.mutex); 2860 } 2861 2862 return 0; 2863 } 2864 2865 /** 2866 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2867 * 2868 * @adev: amdgpu_device pointer 2869 * 2870 * For ASICs need to disable SMC first 2871 */ 2872 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2873 { 2874 int i, r; 2875 2876 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 2877 return; 2878 2879 for (i = 0; i < adev->num_ip_blocks; i++) { 2880 if (!adev->ip_blocks[i].status.hw) 2881 continue; 2882 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2883 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2884 /* XXX handle errors */ 2885 if (r) { 2886 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2887 adev->ip_blocks[i].version->funcs->name, r); 2888 } 2889 adev->ip_blocks[i].status.hw = false; 2890 break; 2891 } 2892 } 2893 } 2894 2895 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2896 { 2897 int i, r; 2898 2899 for (i = 0; i < adev->num_ip_blocks; i++) { 2900 if (!adev->ip_blocks[i].version->funcs->early_fini) 2901 continue; 2902 2903 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2904 if (r) { 2905 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2906 adev->ip_blocks[i].version->funcs->name, r); 2907 } 2908 } 2909 2910 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2911 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2912 2913 amdgpu_amdkfd_suspend(adev, false); 2914 2915 /* Workaroud for ASICs need to disable SMC first */ 2916 amdgpu_device_smu_fini_early(adev); 2917 2918 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2919 if (!adev->ip_blocks[i].status.hw) 2920 continue; 2921 2922 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2923 /* XXX handle errors */ 2924 if (r) { 2925 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2926 adev->ip_blocks[i].version->funcs->name, r); 2927 } 2928 2929 adev->ip_blocks[i].status.hw = false; 2930 } 2931 2932 if (amdgpu_sriov_vf(adev)) { 2933 if (amdgpu_virt_release_full_gpu(adev, false)) 2934 DRM_ERROR("failed to release exclusive mode on fini\n"); 2935 } 2936 2937 return 0; 2938 } 2939 2940 /** 2941 * amdgpu_device_ip_fini - run fini for hardware IPs 2942 * 2943 * @adev: amdgpu_device pointer 2944 * 2945 * Main teardown pass for hardware IPs. The list of all the hardware 2946 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2947 * are run. hw_fini tears down the hardware associated with each IP 2948 * and sw_fini tears down any software state associated with each IP. 2949 * Returns 0 on success, negative error code on failure. 
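 *
 * Teardown runs in the reverse of init order: sw_fini is called from the
 * last IP block back to the first, and late_fini is then run in the same
 * reverse order for every block that completed late init.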
2950 */ 2951 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2952 { 2953 int i, r; 2954 2955 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2956 amdgpu_virt_release_ras_err_handler_data(adev); 2957 2958 if (adev->gmc.xgmi.num_physical_nodes > 1) 2959 amdgpu_xgmi_remove_device(adev); 2960 2961 amdgpu_amdkfd_device_fini_sw(adev); 2962 2963 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2964 if (!adev->ip_blocks[i].status.sw) 2965 continue; 2966 2967 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2968 amdgpu_ucode_free_bo(adev); 2969 amdgpu_free_static_csa(&adev->virt.csa_obj); 2970 amdgpu_device_wb_fini(adev); 2971 amdgpu_device_mem_scratch_fini(adev); 2972 amdgpu_ib_pool_fini(adev); 2973 } 2974 2975 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2976 /* XXX handle errors */ 2977 if (r) { 2978 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2979 adev->ip_blocks[i].version->funcs->name, r); 2980 } 2981 adev->ip_blocks[i].status.sw = false; 2982 adev->ip_blocks[i].status.valid = false; 2983 } 2984 2985 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2986 if (!adev->ip_blocks[i].status.late_initialized) 2987 continue; 2988 if (adev->ip_blocks[i].version->funcs->late_fini) 2989 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2990 adev->ip_blocks[i].status.late_initialized = false; 2991 } 2992 2993 amdgpu_ras_fini(adev); 2994 2995 return 0; 2996 } 2997 2998 /** 2999 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3000 * 3001 * @work: work_struct. 3002 */ 3003 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3004 { 3005 struct amdgpu_device *adev = 3006 container_of(work, struct amdgpu_device, delayed_init_work.work); 3007 int r; 3008 3009 r = amdgpu_ib_ring_tests(adev); 3010 if (r) 3011 DRM_ERROR("ib ring test failed (%d).\n", r); 3012 } 3013 3014 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3015 { 3016 struct amdgpu_device *adev = 3017 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3018 3019 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3020 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3021 3022 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3023 adev->gfx.gfx_off_state = true; 3024 } 3025 3026 /** 3027 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3028 * 3029 * @adev: amdgpu_device pointer 3030 * 3031 * Main suspend function for hardware IPs. The list of all the hardware 3032 * IPs that make up the asic is walked, clockgating is disabled and the 3033 * suspend callbacks are run. suspend puts the hardware and software state 3034 * in each IP into a state suitable for suspend. 3035 * Returns 0 on success, negative error code on failure. 3036 */ 3037 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3038 { 3039 int i, r; 3040 3041 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3042 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3043 3044 /* 3045 * Per PMFW team's suggestion, driver needs to handle gfxoff 3046 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3047 * scenario. Add the missing df cstate disablement here. 
3048 */ 3049 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3050 dev_warn(adev->dev, "Failed to disallow df cstate"); 3051 3052 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3053 if (!adev->ip_blocks[i].status.valid) 3054 continue; 3055 3056 /* displays are handled separately */ 3057 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3058 continue; 3059 3060 /* XXX handle errors */ 3061 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3062 /* XXX handle errors */ 3063 if (r) { 3064 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3065 adev->ip_blocks[i].version->funcs->name, r); 3066 return r; 3067 } 3068 3069 adev->ip_blocks[i].status.hw = false; 3070 } 3071 3072 return 0; 3073 } 3074 3075 /** 3076 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3077 * 3078 * @adev: amdgpu_device pointer 3079 * 3080 * Main suspend function for hardware IPs. The list of all the hardware 3081 * IPs that make up the asic is walked, clockgating is disabled and the 3082 * suspend callbacks are run. suspend puts the hardware and software state 3083 * in each IP into a state suitable for suspend. 3084 * Returns 0 on success, negative error code on failure. 3085 */ 3086 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3087 { 3088 int i, r; 3089 3090 if (adev->in_s0ix) 3091 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3092 3093 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3094 if (!adev->ip_blocks[i].status.valid) 3095 continue; 3096 /* displays are handled in phase1 */ 3097 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3098 continue; 3099 /* PSP lost connection when err_event_athub occurs */ 3100 if (amdgpu_ras_intr_triggered() && 3101 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3102 adev->ip_blocks[i].status.hw = false; 3103 continue; 3104 } 3105 3106 /* skip unnecessary suspend if we do not initialize them yet */ 3107 if (adev->gmc.xgmi.pending_reset && 3108 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3109 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3110 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3111 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3112 adev->ip_blocks[i].status.hw = false; 3113 continue; 3114 } 3115 3116 /* skip suspend of gfx/mes and psp for S0ix 3117 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3118 * like at runtime. PSP is also part of the always on hardware 3119 * so no need to suspend it. 3120 */ 3121 if (adev->in_s0ix && 3122 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3123 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3124 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3125 continue; 3126 3127 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3128 if (adev->in_s0ix && 3129 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3130 IP_VERSION(5, 0, 0)) && 3131 (adev->ip_blocks[i].version->type == 3132 AMD_IP_BLOCK_TYPE_SDMA)) 3133 continue; 3134 3135 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3136 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3137 * from this location and RLC Autoload automatically also gets loaded 3138 * from here based on PMFW -> PSP message during re-init sequence. 3139 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3140 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3141 */ 3142 if (amdgpu_in_reset(adev) && 3143 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3144 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3145 continue; 3146 3147 /* XXX handle errors */ 3148 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3149 /* XXX handle errors */ 3150 if (r) { 3151 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3152 adev->ip_blocks[i].version->funcs->name, r); 3153 } 3154 adev->ip_blocks[i].status.hw = false; 3155 /* handle putting the SMC in the appropriate state */ 3156 if (!amdgpu_sriov_vf(adev)) { 3157 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3158 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3159 if (r) { 3160 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3161 adev->mp1_state, r); 3162 return r; 3163 } 3164 } 3165 } 3166 } 3167 3168 return 0; 3169 } 3170 3171 /** 3172 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3173 * 3174 * @adev: amdgpu_device pointer 3175 * 3176 * Main suspend function for hardware IPs. The list of all the hardware 3177 * IPs that make up the asic is walked, clockgating is disabled and the 3178 * suspend callbacks are run. suspend puts the hardware and software state 3179 * in each IP into a state suitable for suspend. 3180 * Returns 0 on success, negative error code on failure. 3181 */ 3182 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3183 { 3184 int r; 3185 3186 if (amdgpu_sriov_vf(adev)) { 3187 amdgpu_virt_fini_data_exchange(adev); 3188 amdgpu_virt_request_full_gpu(adev, false); 3189 } 3190 3191 r = amdgpu_device_ip_suspend_phase1(adev); 3192 if (r) 3193 return r; 3194 r = amdgpu_device_ip_suspend_phase2(adev); 3195 3196 if (amdgpu_sriov_vf(adev)) 3197 amdgpu_virt_release_full_gpu(adev, false); 3198 3199 return r; 3200 } 3201 3202 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3203 { 3204 int i, r; 3205 3206 static enum amd_ip_block_type ip_order[] = { 3207 AMD_IP_BLOCK_TYPE_COMMON, 3208 AMD_IP_BLOCK_TYPE_GMC, 3209 AMD_IP_BLOCK_TYPE_PSP, 3210 AMD_IP_BLOCK_TYPE_IH, 3211 }; 3212 3213 for (i = 0; i < adev->num_ip_blocks; i++) { 3214 int j; 3215 struct amdgpu_ip_block *block; 3216 3217 block = &adev->ip_blocks[i]; 3218 block->status.hw = false; 3219 3220 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3221 3222 if (block->version->type != ip_order[j] || 3223 !block->status.valid) 3224 continue; 3225 3226 r = block->version->funcs->hw_init(adev); 3227 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3228 if (r) 3229 return r; 3230 block->status.hw = true; 3231 } 3232 } 3233 3234 return 0; 3235 } 3236 3237 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3238 { 3239 int i, r; 3240 3241 static enum amd_ip_block_type ip_order[] = { 3242 AMD_IP_BLOCK_TYPE_SMC, 3243 AMD_IP_BLOCK_TYPE_DCE, 3244 AMD_IP_BLOCK_TYPE_GFX, 3245 AMD_IP_BLOCK_TYPE_SDMA, 3246 AMD_IP_BLOCK_TYPE_MES, 3247 AMD_IP_BLOCK_TYPE_UVD, 3248 AMD_IP_BLOCK_TYPE_VCE, 3249 AMD_IP_BLOCK_TYPE_VCN, 3250 AMD_IP_BLOCK_TYPE_JPEG 3251 }; 3252 3253 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3254 int j; 3255 struct amdgpu_ip_block *block; 3256 3257 for (j = 0; j < adev->num_ip_blocks; j++) { 3258 block = &adev->ip_blocks[j]; 3259 3260 if (block->version->type != ip_order[i] || 3261 !block->status.valid || 3262 block->status.hw) 3263 continue; 3264 3265 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3266 r = block->version->funcs->resume(adev); 3267 else 3268 r = block->version->funcs->hw_init(adev); 3269 3270 
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3271 if (r) 3272 return r; 3273 block->status.hw = true; 3274 } 3275 } 3276 3277 return 0; 3278 } 3279 3280 /** 3281 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3282 * 3283 * @adev: amdgpu_device pointer 3284 * 3285 * First resume function for hardware IPs. The list of all the hardware 3286 * IPs that make up the asic is walked and the resume callbacks are run for 3287 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3288 * after a suspend and updates the software state as necessary. This 3289 * function is also used for restoring the GPU after a GPU reset. 3290 * Returns 0 on success, negative error code on failure. 3291 */ 3292 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3293 { 3294 int i, r; 3295 3296 for (i = 0; i < adev->num_ip_blocks; i++) { 3297 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3298 continue; 3299 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3300 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3301 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3302 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3303 3304 r = adev->ip_blocks[i].version->funcs->resume(adev); 3305 if (r) { 3306 DRM_ERROR("resume of IP block <%s> failed %d\n", 3307 adev->ip_blocks[i].version->funcs->name, r); 3308 return r; 3309 } 3310 adev->ip_blocks[i].status.hw = true; 3311 } 3312 } 3313 3314 return 0; 3315 } 3316 3317 /** 3318 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3319 * 3320 * @adev: amdgpu_device pointer 3321 * 3322 * First resume function for hardware IPs. The list of all the hardware 3323 * IPs that make up the asic is walked and the resume callbacks are run for 3324 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3325 * functional state after a suspend and updates the software state as 3326 * necessary. This function is also used for restoring the GPU after a GPU 3327 * reset. 3328 * Returns 0 on success, negative error code on failure. 3329 */ 3330 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3331 { 3332 int i, r; 3333 3334 for (i = 0; i < adev->num_ip_blocks; i++) { 3335 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3336 continue; 3337 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3338 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3339 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3340 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3341 continue; 3342 r = adev->ip_blocks[i].version->funcs->resume(adev); 3343 if (r) { 3344 DRM_ERROR("resume of IP block <%s> failed %d\n", 3345 adev->ip_blocks[i].version->funcs->name, r); 3346 return r; 3347 } 3348 adev->ip_blocks[i].status.hw = true; 3349 } 3350 3351 return 0; 3352 } 3353 3354 /** 3355 * amdgpu_device_ip_resume - run resume for hardware IPs 3356 * 3357 * @adev: amdgpu_device pointer 3358 * 3359 * Main resume function for hardware IPs. The hardware IPs 3360 * are split into two resume functions because they are 3361 * also used in recovering from a GPU reset and some additional 3362 * steps need to be take between them. In this case (S3/S4) they are 3363 * run sequentially. 3364 * Returns 0 on success, negative error code on failure. 
3365 */ 3366 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3367 { 3368 int r; 3369 3370 r = amdgpu_device_ip_resume_phase1(adev); 3371 if (r) 3372 return r; 3373 3374 r = amdgpu_device_fw_loading(adev); 3375 if (r) 3376 return r; 3377 3378 r = amdgpu_device_ip_resume_phase2(adev); 3379 3380 return r; 3381 } 3382 3383 /** 3384 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3385 * 3386 * @adev: amdgpu_device pointer 3387 * 3388 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3389 */ 3390 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3391 { 3392 if (amdgpu_sriov_vf(adev)) { 3393 if (adev->is_atom_fw) { 3394 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3395 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3396 } else { 3397 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3398 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3399 } 3400 3401 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3402 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3403 } 3404 } 3405 3406 /** 3407 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3408 * 3409 * @asic_type: AMD asic type 3410 * 3411 * Check if there is DC (new modesetting infrastructre) support for an asic. 3412 * returns true if DC has support, false if not. 3413 */ 3414 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3415 { 3416 switch (asic_type) { 3417 #ifdef CONFIG_DRM_AMDGPU_SI 3418 case CHIP_HAINAN: 3419 #endif 3420 case CHIP_TOPAZ: 3421 /* chips with no display hardware */ 3422 return false; 3423 #if defined(CONFIG_DRM_AMD_DC) 3424 case CHIP_TAHITI: 3425 case CHIP_PITCAIRN: 3426 case CHIP_VERDE: 3427 case CHIP_OLAND: 3428 /* 3429 * We have systems in the wild with these ASICs that require 3430 * LVDS and VGA support which is not supported with DC. 3431 * 3432 * Fallback to the non-DC driver here by default so as not to 3433 * cause regressions. 3434 */ 3435 #if defined(CONFIG_DRM_AMD_DC_SI) 3436 return amdgpu_dc > 0; 3437 #else 3438 return false; 3439 #endif 3440 case CHIP_BONAIRE: 3441 case CHIP_KAVERI: 3442 case CHIP_KABINI: 3443 case CHIP_MULLINS: 3444 /* 3445 * We have systems in the wild with these ASICs that require 3446 * VGA support which is not supported with DC. 3447 * 3448 * Fallback to the non-DC driver here by default so as not to 3449 * cause regressions. 
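	 *
	 * "amdgpu_dc > 0" below means DC is only used on these chips when
	 * the user explicitly opts in (e.g. via the amdgpu.dc=1 module
	 * parameter).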
3450 */ 3451 return amdgpu_dc > 0; 3452 default: 3453 return amdgpu_dc != 0; 3454 #else 3455 default: 3456 if (amdgpu_dc > 0) 3457 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3458 return false; 3459 #endif 3460 } 3461 } 3462 3463 /** 3464 * amdgpu_device_has_dc_support - check if dc is supported 3465 * 3466 * @adev: amdgpu_device pointer 3467 * 3468 * Returns true for supported, false for not supported 3469 */ 3470 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3471 { 3472 if (adev->enable_virtual_display || 3473 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3474 return false; 3475 3476 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3477 } 3478 3479 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3480 { 3481 struct amdgpu_device *adev = 3482 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3483 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3484 3485 /* It's a bug to not have a hive within this function */ 3486 if (WARN_ON(!hive)) 3487 return; 3488 3489 /* 3490 * Use task barrier to synchronize all xgmi reset works across the 3491 * hive. task_barrier_enter and task_barrier_exit will block 3492 * until all the threads running the xgmi reset works reach 3493 * those points. task_barrier_full will do both blocks. 3494 */ 3495 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3496 3497 task_barrier_enter(&hive->tb); 3498 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3499 3500 if (adev->asic_reset_res) 3501 goto fail; 3502 3503 task_barrier_exit(&hive->tb); 3504 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3505 3506 if (adev->asic_reset_res) 3507 goto fail; 3508 3509 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3510 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3511 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3512 } else { 3513 3514 task_barrier_full(&hive->tb); 3515 adev->asic_reset_res = amdgpu_asic_reset(adev); 3516 } 3517 3518 fail: 3519 if (adev->asic_reset_res) 3520 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3521 adev->asic_reset_res, adev_to_drm(adev)->unique); 3522 amdgpu_put_xgmi_hive(hive); 3523 } 3524 3525 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3526 { 3527 char *input = amdgpu_lockup_timeout; 3528 char *timeout_setting = NULL; 3529 int index = 0; 3530 long timeout; 3531 int ret = 0; 3532 3533 /* 3534 * By default timeout for non compute jobs is 10000 3535 * and 60000 for compute jobs. 3536 * In SR-IOV or passthrough mode, timeout for compute 3537 * jobs are 60000 by default. 3538 */ 3539 adev->gfx_timeout = msecs_to_jiffies(10000); 3540 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3541 if (amdgpu_sriov_vf(adev)) 3542 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3543 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3544 else 3545 adev->compute_timeout = msecs_to_jiffies(60000); 3546 3547 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3548 while ((timeout_setting = strsep(&input, ",")) && 3549 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3550 ret = kstrtol(timeout_setting, 0, &timeout); 3551 if (ret) 3552 return ret; 3553 3554 if (timeout == 0) { 3555 index++; 3556 continue; 3557 } else if (timeout < 0) { 3558 timeout = MAX_SCHEDULE_TIMEOUT; 3559 dev_warn(adev->dev, "lockup timeout disabled"); 3560 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3561 } else { 3562 timeout = msecs_to_jiffies(timeout); 3563 } 3564 3565 switch (index++) { 3566 case 0: 3567 adev->gfx_timeout = timeout; 3568 break; 3569 case 1: 3570 adev->compute_timeout = timeout; 3571 break; 3572 case 2: 3573 adev->sdma_timeout = timeout; 3574 break; 3575 case 3: 3576 adev->video_timeout = timeout; 3577 break; 3578 default: 3579 break; 3580 } 3581 } 3582 /* 3583 * There is only one value specified and 3584 * it should apply to all non-compute jobs. 3585 */ 3586 if (index == 1) { 3587 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3588 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3589 adev->compute_timeout = adev->gfx_timeout; 3590 } 3591 } 3592 3593 return ret; 3594 } 3595 3596 /** 3597 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3598 * 3599 * @adev: amdgpu_device pointer 3600 * 3601 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3602 */ 3603 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3604 { 3605 struct iommu_domain *domain; 3606 3607 domain = iommu_get_domain_for_dev(adev->dev); 3608 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3609 adev->ram_is_direct_mapped = true; 3610 } 3611 3612 static const struct attribute *amdgpu_dev_attributes[] = { 3613 &dev_attr_pcie_replay_count.attr, 3614 NULL 3615 }; 3616 3617 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3618 { 3619 if (amdgpu_mcbp == 1) 3620 adev->gfx.mcbp = true; 3621 else if (amdgpu_mcbp == 0) 3622 adev->gfx.mcbp = false; 3623 else if ((amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 0, 0)) && 3624 (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(10, 0, 0)) && 3625 adev->gfx.num_gfx_rings) 3626 adev->gfx.mcbp = true; 3627 3628 if (amdgpu_sriov_vf(adev)) 3629 adev->gfx.mcbp = true; 3630 3631 if (adev->gfx.mcbp) 3632 DRM_INFO("MCBP is enabled\n"); 3633 } 3634 3635 /** 3636 * amdgpu_device_init - initialize the driver 3637 * 3638 * @adev: amdgpu_device pointer 3639 * @flags: driver flags 3640 * 3641 * Initializes the driver info and hw (all asics). 3642 * Returns 0 for success or an error on failure. 3643 * Called at driver startup. 
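 *
 * Broadly, this maps the MMIO registers, creates the initial reset domain,
 * runs early init for the IP blocks, posts and parses the vBIOS as needed,
 * and then drives amdgpu_device_ip_init() and amdgpu_device_ip_late_init().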
3644 */ 3645 int amdgpu_device_init(struct amdgpu_device *adev, 3646 uint32_t flags) 3647 { 3648 struct drm_device *ddev = adev_to_drm(adev); 3649 struct pci_dev *pdev = adev->pdev; 3650 int r, i; 3651 bool px = false; 3652 u32 max_MBps; 3653 int tmp; 3654 3655 adev->shutdown = false; 3656 adev->flags = flags; 3657 3658 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3659 adev->asic_type = amdgpu_force_asic_type; 3660 else 3661 adev->asic_type = flags & AMD_ASIC_MASK; 3662 3663 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3664 if (amdgpu_emu_mode == 1) 3665 adev->usec_timeout *= 10; 3666 adev->gmc.gart_size = 512 * 1024 * 1024; 3667 adev->accel_working = false; 3668 adev->num_rings = 0; 3669 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3670 adev->mman.buffer_funcs = NULL; 3671 adev->mman.buffer_funcs_ring = NULL; 3672 adev->vm_manager.vm_pte_funcs = NULL; 3673 adev->vm_manager.vm_pte_num_scheds = 0; 3674 adev->gmc.gmc_funcs = NULL; 3675 adev->harvest_ip_mask = 0x0; 3676 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3677 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3678 3679 adev->smc_rreg = &amdgpu_invalid_rreg; 3680 adev->smc_wreg = &amdgpu_invalid_wreg; 3681 adev->pcie_rreg = &amdgpu_invalid_rreg; 3682 adev->pcie_wreg = &amdgpu_invalid_wreg; 3683 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3684 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3685 adev->pciep_rreg = &amdgpu_invalid_rreg; 3686 adev->pciep_wreg = &amdgpu_invalid_wreg; 3687 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3688 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3689 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3690 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3691 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3692 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3693 adev->didt_rreg = &amdgpu_invalid_rreg; 3694 adev->didt_wreg = &amdgpu_invalid_wreg; 3695 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3696 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3697 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3698 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3699 3700 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3701 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3702 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3703 3704 /* mutex initialization are all done here so we 3705 * can recall function without having locking issues 3706 */ 3707 mutex_init(&adev->firmware.mutex); 3708 mutex_init(&adev->pm.mutex); 3709 mutex_init(&adev->gfx.gpu_clock_mutex); 3710 mutex_init(&adev->srbm_mutex); 3711 mutex_init(&adev->gfx.pipe_reserve_mutex); 3712 mutex_init(&adev->gfx.gfx_off_mutex); 3713 mutex_init(&adev->gfx.partition_mutex); 3714 mutex_init(&adev->grbm_idx_mutex); 3715 mutex_init(&adev->mn_lock); 3716 mutex_init(&adev->virt.vf_errors.lock); 3717 hash_init(adev->mn_hash); 3718 mutex_init(&adev->psp.mutex); 3719 mutex_init(&adev->notifier_lock); 3720 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3721 mutex_init(&adev->benchmark_mutex); 3722 3723 amdgpu_device_init_apu_flags(adev); 3724 3725 r = amdgpu_device_check_arguments(adev); 3726 if (r) 3727 return r; 3728 3729 spin_lock_init(&adev->mmio_idx_lock); 3730 spin_lock_init(&adev->smc_idx_lock); 3731 spin_lock_init(&adev->pcie_idx_lock); 3732 spin_lock_init(&adev->uvd_ctx_idx_lock); 3733 spin_lock_init(&adev->didt_idx_lock); 3734 spin_lock_init(&adev->gc_cac_idx_lock); 3735 spin_lock_init(&adev->se_cac_idx_lock); 
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);

	INIT_LIST_HEAD(&adev->shadow_list);
	mutex_init(&adev->shadow_list_lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before the XGMI hive
	 * (if any) is discovered and initialized, so the reset sem and
	 * in_gpu reset flag can be used early on during init and before
	 * calling RREG32.
3795 */ 3796 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3797 if (!adev->reset_domain) 3798 return -ENOMEM; 3799 3800 /* detect hw virtualization here */ 3801 amdgpu_detect_virtualization(adev); 3802 3803 amdgpu_device_get_pcie_info(adev); 3804 3805 r = amdgpu_device_get_job_timeout_settings(adev); 3806 if (r) { 3807 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3808 return r; 3809 } 3810 3811 /* early init functions */ 3812 r = amdgpu_device_ip_early_init(adev); 3813 if (r) 3814 return r; 3815 3816 amdgpu_device_set_mcbp(adev); 3817 3818 /* Get rid of things like offb */ 3819 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3820 if (r) 3821 return r; 3822 3823 /* Enable TMZ based on IP_VERSION */ 3824 amdgpu_gmc_tmz_set(adev); 3825 3826 amdgpu_gmc_noretry_set(adev); 3827 /* Need to get xgmi info early to decide the reset behavior */ 3828 if (adev->gmc.xgmi.supported) { 3829 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3830 if (r) 3831 return r; 3832 } 3833 3834 /* enable PCIE atomic ops */ 3835 if (amdgpu_sriov_vf(adev)) { 3836 if (adev->virt.fw_reserve.p_pf2vf) 3837 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3838 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3839 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3840 /* APUs with gfx9 onwards don't rely on PCIe atomics; their internal 3841 * path natively supports atomics, so set have_atomics_support to true. 3842 */ 3843 } else if ((adev->flags & AMD_IS_APU) && 3844 (amdgpu_ip_version(adev, GC_HWIP, 0) > 3845 IP_VERSION(9, 0, 0))) { 3846 adev->have_atomics_support = true; 3847 } else { 3848 adev->have_atomics_support = 3849 !pci_enable_atomic_ops_to_root(adev->pdev, 3850 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3851 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3852 } 3853 3854 if (!adev->have_atomics_support) 3855 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3856 3857 /* doorbell bar mapping and doorbell index init */ 3858 amdgpu_doorbell_init(adev); 3859 3860 if (amdgpu_emu_mode == 1) { 3861 /* post the asic on emulation mode */ 3862 emu_soc_asic_init(adev); 3863 goto fence_driver_init; 3864 } 3865 3866 amdgpu_reset_init(adev); 3867 3868 /* detect if we are with an SRIOV vbios */ 3869 if (adev->bios) 3870 amdgpu_device_detect_sriov_bios(adev); 3871 3872 /* check if we need to reset the asic 3873 * E.g., driver was not cleanly unloaded previously, etc. 3874 */ 3875 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3876 if (adev->gmc.xgmi.num_physical_nodes) { 3877 dev_info(adev->dev, "Pending hive reset.\n"); 3878 adev->gmc.xgmi.pending_reset = true; 3879 /* Only need to init the necessary blocks for the SMU to handle the reset */ 3880 for (i = 0; i < adev->num_ip_blocks; i++) { 3881 if (!adev->ip_blocks[i].status.valid) 3882 continue; 3883 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3884 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3885 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3886 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3887 DRM_DEBUG("IP %s disabled for hw_init.\n", 3888 adev->ip_blocks[i].version->funcs->name); 3889 adev->ip_blocks[i].status.hw = true; 3890 } 3891 } 3892 } else { 3893 tmp = amdgpu_reset_method; 3894 /* It should do a default reset when loading or reloading the driver, 3895 * regardless of the module parameter reset_method.
3896 */ 3897 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3898 r = amdgpu_asic_reset(adev); 3899 amdgpu_reset_method = tmp; 3900 if (r) { 3901 dev_err(adev->dev, "asic reset on init failed\n"); 3902 goto failed; 3903 } 3904 } 3905 } 3906 3907 /* Post card if necessary */ 3908 if (amdgpu_device_need_post(adev)) { 3909 if (!adev->bios) { 3910 dev_err(adev->dev, "no vBIOS found\n"); 3911 r = -EINVAL; 3912 goto failed; 3913 } 3914 DRM_INFO("GPU posting now...\n"); 3915 r = amdgpu_device_asic_init(adev); 3916 if (r) { 3917 dev_err(adev->dev, "gpu post error!\n"); 3918 goto failed; 3919 } 3920 } 3921 3922 if (adev->bios) { 3923 if (adev->is_atom_fw) { 3924 /* Initialize clocks */ 3925 r = amdgpu_atomfirmware_get_clock_info(adev); 3926 if (r) { 3927 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3928 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3929 goto failed; 3930 } 3931 } else { 3932 /* Initialize clocks */ 3933 r = amdgpu_atombios_get_clock_info(adev); 3934 if (r) { 3935 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3936 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3937 goto failed; 3938 } 3939 /* init i2c buses */ 3940 if (!amdgpu_device_has_dc_support(adev)) 3941 amdgpu_atombios_i2c_init(adev); 3942 } 3943 } 3944 3945 fence_driver_init: 3946 /* Fence driver */ 3947 r = amdgpu_fence_driver_sw_init(adev); 3948 if (r) { 3949 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3950 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3951 goto failed; 3952 } 3953 3954 /* init the mode config */ 3955 drm_mode_config_init(adev_to_drm(adev)); 3956 3957 r = amdgpu_device_ip_init(adev); 3958 if (r) { 3959 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3960 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3961 goto release_ras_con; 3962 } 3963 3964 amdgpu_fence_driver_hw_init(adev); 3965 3966 dev_info(adev->dev, 3967 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3968 adev->gfx.config.max_shader_engines, 3969 adev->gfx.config.max_sh_per_se, 3970 adev->gfx.config.max_cu_per_sh, 3971 adev->gfx.cu_info.number); 3972 3973 adev->accel_working = true; 3974 3975 amdgpu_vm_check_compute_bug(adev); 3976 3977 /* Initialize the buffer migration limit. */ 3978 if (amdgpu_moverate >= 0) 3979 max_MBps = amdgpu_moverate; 3980 else 3981 max_MBps = 8; /* Allow 8 MB/s. */ 3982 /* Get a log2 for easy divisions. */ 3983 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3984 3985 /* 3986 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3987 * Otherwise the mgpu fan boost feature will be skipped due to the 3988 * gpu instance is counted less. 3989 */ 3990 amdgpu_register_gpu_instance(adev); 3991 3992 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3993 * explicit gating rather than handling it automatically. 3994 */ 3995 if (!adev->gmc.xgmi.pending_reset) { 3996 r = amdgpu_device_ip_late_init(adev); 3997 if (r) { 3998 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3999 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4000 goto release_ras_con; 4001 } 4002 /* must succeed. 
*/ 4003 amdgpu_ras_resume(adev); 4004 queue_delayed_work(system_wq, &adev->delayed_init_work, 4005 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4006 } 4007 4008 if (amdgpu_sriov_vf(adev)) { 4009 amdgpu_virt_release_full_gpu(adev, true); 4010 flush_delayed_work(&adev->delayed_init_work); 4011 } 4012 4013 /* 4014 * Register these sysfs interfaces after `late_init`, since some of the 4015 * operations performed in `late_init` can affect how the sysfs 4016 * interfaces are created. 4017 */ 4018 r = amdgpu_atombios_sysfs_init(adev); 4019 if (r) 4020 drm_err(&adev->ddev, 4021 "registering atombios sysfs failed (%d).\n", r); 4022 4023 r = amdgpu_pm_sysfs_init(adev); 4024 if (r) 4025 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4026 4027 r = amdgpu_ucode_sysfs_init(adev); 4028 if (r) { 4029 adev->ucode_sysfs_en = false; 4030 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4031 } else 4032 adev->ucode_sysfs_en = true; 4033 4034 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4035 if (r) 4036 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4037 4038 amdgpu_fru_sysfs_init(adev); 4039 4040 if (IS_ENABLED(CONFIG_PERF_EVENTS)) { 4041 r = amdgpu_pmu_init(adev); 4042 if (r) 4043 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4044 } 4045 /* Keep the stored PCI config space at hand for restore on a sudden PCI error */ 4046 if (amdgpu_device_cache_pci_state(adev->pdev)) 4047 pci_restore_state(pdev); 4048 4049 /* if we have more than one VGA card, then disable the amdgpu VGA resources */ 4050 /* this will fail for cards that aren't VGA class devices, just 4051 * ignore it 4052 */ 4053 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4054 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4055 4056 px = amdgpu_device_supports_px(ddev); 4057 4058 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4059 apple_gmux_detect(NULL, NULL))) 4060 vga_switcheroo_register_client(adev->pdev, 4061 &amdgpu_switcheroo_ops, px); 4062 4063 if (px) 4064 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4065 4066 if (adev->gmc.xgmi.pending_reset) 4067 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4068 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4069 4070 amdgpu_device_check_iommu_direct_map(adev); 4071 4072 return 0; 4073 4074 release_ras_con: 4075 if (amdgpu_sriov_vf(adev)) 4076 amdgpu_virt_release_full_gpu(adev, true); 4077 4078 /* failed in exclusive mode due to timeout */ 4079 if (amdgpu_sriov_vf(adev) && 4080 !amdgpu_sriov_runtime(adev) && 4081 amdgpu_virt_mmio_blocked(adev) && 4082 !amdgpu_virt_wait_reset(adev)) { 4083 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4084 /* Don't send request since VF is inactive.
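 * Clearing the runtime cap and virt.ops below prevents any further requests to the inactive host; -EAGAIN is returned to the caller.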
*/ 4085 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4086 adev->virt.ops = NULL; 4087 r = -EAGAIN; 4088 } 4089 amdgpu_release_ras_context(adev); 4090 4091 failed: 4092 amdgpu_vf_error_trans_all(adev); 4093 4094 return r; 4095 } 4096 4097 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4098 { 4099 4100 /* Clear all CPU mappings pointing to this device */ 4101 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4102 4103 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4104 amdgpu_doorbell_fini(adev); 4105 4106 iounmap(adev->rmmio); 4107 adev->rmmio = NULL; 4108 if (adev->mman.aper_base_kaddr) 4109 iounmap(adev->mman.aper_base_kaddr); 4110 adev->mman.aper_base_kaddr = NULL; 4111 4112 /* Memory manager related */ 4113 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4114 arch_phys_wc_del(adev->gmc.vram_mtrr); 4115 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4116 } 4117 } 4118 4119 /** 4120 * amdgpu_device_fini_hw - tear down the driver 4121 * 4122 * @adev: amdgpu_device pointer 4123 * 4124 * Tear down the driver info (all asics). 4125 * Called at driver shutdown. 4126 */ 4127 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4128 { 4129 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4130 flush_delayed_work(&adev->delayed_init_work); 4131 adev->shutdown = true; 4132 4133 /* make sure IB test finished before entering exclusive mode 4134 * to avoid preemption on IB test 4135 */ 4136 if (amdgpu_sriov_vf(adev)) { 4137 amdgpu_virt_request_full_gpu(adev, false); 4138 amdgpu_virt_fini_data_exchange(adev); 4139 } 4140 4141 /* disable all interrupts */ 4142 amdgpu_irq_disable_all(adev); 4143 if (adev->mode_info.mode_config_initialized) { 4144 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4145 drm_helper_force_disable_all(adev_to_drm(adev)); 4146 else 4147 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4148 } 4149 amdgpu_fence_driver_hw_fini(adev); 4150 4151 if (adev->mman.initialized) 4152 drain_workqueue(adev->mman.bdev.wq); 4153 4154 if (adev->pm.sysfs_initialized) 4155 amdgpu_pm_sysfs_fini(adev); 4156 if (adev->ucode_sysfs_en) 4157 amdgpu_ucode_sysfs_fini(adev); 4158 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4159 amdgpu_fru_sysfs_fini(adev); 4160 4161 /* disable ras feature must before hw fini */ 4162 amdgpu_ras_pre_fini(adev); 4163 4164 amdgpu_device_ip_fini_early(adev); 4165 4166 amdgpu_irq_fini_hw(adev); 4167 4168 if (adev->mman.initialized) 4169 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4170 4171 amdgpu_gart_dummy_page_fini(adev); 4172 4173 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4174 amdgpu_device_unmap_mmio(adev); 4175 4176 } 4177 4178 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4179 { 4180 int idx; 4181 bool px; 4182 4183 amdgpu_fence_driver_sw_fini(adev); 4184 amdgpu_device_ip_fini(adev); 4185 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4186 adev->accel_working = false; 4187 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4188 4189 amdgpu_reset_fini(adev); 4190 4191 /* free i2c buses */ 4192 if (!amdgpu_device_has_dc_support(adev)) 4193 amdgpu_i2c_fini(adev); 4194 4195 if (amdgpu_emu_mode != 1) 4196 amdgpu_atombios_fini(adev); 4197 4198 kfree(adev->bios); 4199 adev->bios = NULL; 4200 4201 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4202 4203 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4204 apple_gmux_detect(NULL, NULL))) 4205 vga_switcheroo_unregister_client(adev->pdev); 4206 4207 if (px) 4208 
vga_switcheroo_fini_domain_pm_ops(adev->dev); 4209 4210 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4211 vga_client_unregister(adev->pdev); 4212 4213 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4214 4215 iounmap(adev->rmmio); 4216 adev->rmmio = NULL; 4217 amdgpu_doorbell_fini(adev); 4218 drm_dev_exit(idx); 4219 } 4220 4221 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4222 amdgpu_pmu_fini(adev); 4223 if (adev->mman.discovery_bin) 4224 amdgpu_discovery_fini(adev); 4225 4226 amdgpu_reset_put_reset_domain(adev->reset_domain); 4227 adev->reset_domain = NULL; 4228 4229 kfree(adev->pci_state); 4230 4231 } 4232 4233 /** 4234 * amdgpu_device_evict_resources - evict device resources 4235 * @adev: amdgpu device object 4236 * 4237 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4238 * of the vram memory type. Mainly used for evicting device resources 4239 * at suspend time. 4240 * 4241 */ 4242 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4243 { 4244 int ret; 4245 4246 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4247 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4248 return 0; 4249 4250 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4251 if (ret) 4252 DRM_WARN("evicting device resources failed\n"); 4253 return ret; 4254 } 4255 4256 /* 4257 * Suspend & resume. 4258 */ 4259 /** 4260 * amdgpu_device_suspend - initiate device suspend 4261 * 4262 * @dev: drm dev pointer 4263 * @fbcon : notify the fbdev of suspend 4264 * 4265 * Puts the hw in the suspend state (all asics). 4266 * Returns 0 for success or an error on failure. 4267 * Called at driver suspend. 4268 */ 4269 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4270 { 4271 struct amdgpu_device *adev = drm_to_adev(dev); 4272 int r = 0; 4273 4274 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4275 return 0; 4276 4277 adev->in_suspend = true; 4278 4279 /* Evict the majority of BOs before grabbing the full access */ 4280 r = amdgpu_device_evict_resources(adev); 4281 if (r) 4282 return r; 4283 4284 if (amdgpu_sriov_vf(adev)) { 4285 amdgpu_virt_fini_data_exchange(adev); 4286 r = amdgpu_virt_request_full_gpu(adev, false); 4287 if (r) 4288 return r; 4289 } 4290 4291 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4292 DRM_WARN("smart shift update failed\n"); 4293 4294 if (fbcon) 4295 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4296 4297 cancel_delayed_work_sync(&adev->delayed_init_work); 4298 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4299 4300 amdgpu_ras_suspend(adev); 4301 4302 amdgpu_device_ip_suspend_phase1(adev); 4303 4304 if (!adev->in_s0ix) 4305 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4306 4307 r = amdgpu_device_evict_resources(adev); 4308 if (r) 4309 return r; 4310 4311 amdgpu_fence_driver_hw_fini(adev); 4312 4313 amdgpu_device_ip_suspend_phase2(adev); 4314 4315 if (amdgpu_sriov_vf(adev)) 4316 amdgpu_virt_release_full_gpu(adev, false); 4317 4318 return 0; 4319 } 4320 4321 /** 4322 * amdgpu_device_resume - initiate device resume 4323 * 4324 * @dev: drm dev pointer 4325 * @fbcon : notify the fbdev of resume 4326 * 4327 * Bring the hw back to operating state (all asics). 4328 * Returns 0 for success or an error on failure. 4329 * Called at driver resume. 
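 * For SR-IOV VFs, full GPU access is requested from the host before the hardware is re-initialized.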
4330 */ 4331 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4332 { 4333 struct amdgpu_device *adev = drm_to_adev(dev); 4334 int r = 0; 4335 4336 if (amdgpu_sriov_vf(adev)) { 4337 r = amdgpu_virt_request_full_gpu(adev, true); 4338 if (r) 4339 return r; 4340 } 4341 4342 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4343 return 0; 4344 4345 if (adev->in_s0ix) 4346 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4347 4348 /* post card */ 4349 if (amdgpu_device_need_post(adev)) { 4350 r = amdgpu_device_asic_init(adev); 4351 if (r) 4352 dev_err(adev->dev, "amdgpu asic init failed\n"); 4353 } 4354 4355 r = amdgpu_device_ip_resume(adev); 4356 4357 if (r) { 4358 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4359 goto exit; 4360 } 4361 amdgpu_fence_driver_hw_init(adev); 4362 4363 r = amdgpu_device_ip_late_init(adev); 4364 if (r) 4365 goto exit; 4366 4367 queue_delayed_work(system_wq, &adev->delayed_init_work, 4368 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4369 4370 if (!adev->in_s0ix) { 4371 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4372 if (r) 4373 goto exit; 4374 } 4375 4376 exit: 4377 if (amdgpu_sriov_vf(adev)) { 4378 amdgpu_virt_init_data_exchange(adev); 4379 amdgpu_virt_release_full_gpu(adev, true); 4380 } 4381 4382 if (r) 4383 return r; 4384 4385 /* Make sure IB tests flushed */ 4386 flush_delayed_work(&adev->delayed_init_work); 4387 4388 if (fbcon) 4389 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4390 4391 amdgpu_ras_resume(adev); 4392 4393 if (adev->mode_info.num_crtc) { 4394 /* 4395 * Most of the connector probing functions try to acquire runtime pm 4396 * refs to ensure that the GPU is powered on when connector polling is 4397 * performed. Since we're calling this from a runtime PM callback, 4398 * trying to acquire rpm refs will cause us to deadlock. 4399 * 4400 * Since we're guaranteed to be holding the rpm lock, it's safe to 4401 * temporarily disable the rpm helpers so this doesn't deadlock us. 4402 */ 4403 #ifdef CONFIG_PM 4404 dev->dev->power.disable_depth++; 4405 #endif 4406 if (!adev->dc_enabled) 4407 drm_helper_hpd_irq_event(dev); 4408 else 4409 drm_kms_helper_hotplug_event(dev); 4410 #ifdef CONFIG_PM 4411 dev->dev->power.disable_depth--; 4412 #endif 4413 } 4414 adev->in_suspend = false; 4415 4416 if (adev->enable_mes) 4417 amdgpu_mes_self_test(adev); 4418 4419 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4420 DRM_WARN("smart shift update failed\n"); 4421 4422 return 0; 4423 } 4424 4425 /** 4426 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4427 * 4428 * @adev: amdgpu_device pointer 4429 * 4430 * The list of all the hardware IPs that make up the asic is walked and 4431 * the check_soft_reset callbacks are run. check_soft_reset determines 4432 * if the asic is still hung or not. 4433 * Returns true if any of the IPs are still in a hung state, false if not. 
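 * SR-IOV VFs and ASICs that report amdgpu_asic_need_full_reset() are treated as hung unconditionally.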
4434 */ 4435 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4436 { 4437 int i; 4438 bool asic_hang = false; 4439 4440 if (amdgpu_sriov_vf(adev)) 4441 return true; 4442 4443 if (amdgpu_asic_need_full_reset(adev)) 4444 return true; 4445 4446 for (i = 0; i < adev->num_ip_blocks; i++) { 4447 if (!adev->ip_blocks[i].status.valid) 4448 continue; 4449 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4450 adev->ip_blocks[i].status.hang = 4451 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4452 if (adev->ip_blocks[i].status.hang) { 4453 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4454 asic_hang = true; 4455 } 4456 } 4457 return asic_hang; 4458 } 4459 4460 /** 4461 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4462 * 4463 * @adev: amdgpu_device pointer 4464 * 4465 * The list of all the hardware IPs that make up the asic is walked and the 4466 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4467 * handles any IP specific hardware or software state changes that are 4468 * necessary for a soft reset to succeed. 4469 * Returns 0 on success, negative error code on failure. 4470 */ 4471 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4472 { 4473 int i, r = 0; 4474 4475 for (i = 0; i < adev->num_ip_blocks; i++) { 4476 if (!adev->ip_blocks[i].status.valid) 4477 continue; 4478 if (adev->ip_blocks[i].status.hang && 4479 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4480 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4481 if (r) 4482 return r; 4483 } 4484 } 4485 4486 return 0; 4487 } 4488 4489 /** 4490 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4491 * 4492 * @adev: amdgpu_device pointer 4493 * 4494 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4495 * reset is necessary to recover. 4496 * Returns true if a full asic reset is required, false if not. 4497 */ 4498 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4499 { 4500 int i; 4501 4502 if (amdgpu_asic_need_full_reset(adev)) 4503 return true; 4504 4505 for (i = 0; i < adev->num_ip_blocks; i++) { 4506 if (!adev->ip_blocks[i].status.valid) 4507 continue; 4508 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4509 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4510 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4511 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4512 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4513 if (adev->ip_blocks[i].status.hang) { 4514 dev_info(adev->dev, "Some block need full reset!\n"); 4515 return true; 4516 } 4517 } 4518 } 4519 return false; 4520 } 4521 4522 /** 4523 * amdgpu_device_ip_soft_reset - do a soft reset 4524 * 4525 * @adev: amdgpu_device pointer 4526 * 4527 * The list of all the hardware IPs that make up the asic is walked and the 4528 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4529 * IP specific hardware or software state changes that are necessary to soft 4530 * reset the IP. 4531 * Returns 0 on success, negative error code on failure. 
4532 */ 4533 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4534 { 4535 int i, r = 0; 4536 4537 for (i = 0; i < adev->num_ip_blocks; i++) { 4538 if (!adev->ip_blocks[i].status.valid) 4539 continue; 4540 if (adev->ip_blocks[i].status.hang && 4541 adev->ip_blocks[i].version->funcs->soft_reset) { 4542 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4543 if (r) 4544 return r; 4545 } 4546 } 4547 4548 return 0; 4549 } 4550 4551 /** 4552 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4553 * 4554 * @adev: amdgpu_device pointer 4555 * 4556 * The list of all the hardware IPs that make up the asic is walked and the 4557 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4558 * handles any IP specific hardware or software state changes that are 4559 * necessary after the IP has been soft reset. 4560 * Returns 0 on success, negative error code on failure. 4561 */ 4562 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4563 { 4564 int i, r = 0; 4565 4566 for (i = 0; i < adev->num_ip_blocks; i++) { 4567 if (!adev->ip_blocks[i].status.valid) 4568 continue; 4569 if (adev->ip_blocks[i].status.hang && 4570 adev->ip_blocks[i].version->funcs->post_soft_reset) 4571 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4572 if (r) 4573 return r; 4574 } 4575 4576 return 0; 4577 } 4578 4579 /** 4580 * amdgpu_device_recover_vram - Recover some VRAM contents 4581 * 4582 * @adev: amdgpu_device pointer 4583 * 4584 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4585 * restore things like GPUVM page tables after a GPU reset where 4586 * the contents of VRAM might be lost. 4587 * 4588 * Returns: 4589 * 0 on success, negative error code on failure. 4590 */ 4591 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4592 { 4593 struct dma_fence *fence = NULL, *next = NULL; 4594 struct amdgpu_bo *shadow; 4595 struct amdgpu_bo_vm *vmbo; 4596 long r = 1, tmo; 4597 4598 if (amdgpu_sriov_runtime(adev)) 4599 tmo = msecs_to_jiffies(8000); 4600 else 4601 tmo = msecs_to_jiffies(100); 4602 4603 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4604 mutex_lock(&adev->shadow_list_lock); 4605 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4606 /* If vm is compute context or adev is APU, shadow will be NULL */ 4607 if (!vmbo->shadow) 4608 continue; 4609 shadow = vmbo->shadow; 4610 4611 /* No need to recover an evicted BO */ 4612 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4613 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4614 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4615 continue; 4616 4617 r = amdgpu_bo_restore_shadow(shadow, &next); 4618 if (r) 4619 break; 4620 4621 if (fence) { 4622 tmo = dma_fence_wait_timeout(fence, false, tmo); 4623 dma_fence_put(fence); 4624 fence = next; 4625 if (tmo == 0) { 4626 r = -ETIMEDOUT; 4627 break; 4628 } else if (tmo < 0) { 4629 r = tmo; 4630 break; 4631 } 4632 } else { 4633 fence = next; 4634 } 4635 } 4636 mutex_unlock(&adev->shadow_list_lock); 4637 4638 if (fence) 4639 tmo = dma_fence_wait_timeout(fence, false, tmo); 4640 dma_fence_put(fence); 4641 4642 if (r < 0 || tmo <= 0) { 4643 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4644 return -EIO; 4645 } 4646 4647 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4648 return 0; 4649 } 4650 4651 4652 /** 4653 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4654 * 4655 * @adev: amdgpu_device pointer 4656 * 
@from_hypervisor: request from hypervisor 4657 * 4658 * do VF FLR and reinitialize Asic 4659 * return 0 means succeeded otherwise failed 4660 */ 4661 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4662 bool from_hypervisor) 4663 { 4664 int r; 4665 struct amdgpu_hive_info *hive = NULL; 4666 int retry_limit = 0; 4667 4668 retry: 4669 amdgpu_amdkfd_pre_reset(adev); 4670 4671 if (from_hypervisor) 4672 r = amdgpu_virt_request_full_gpu(adev, true); 4673 else 4674 r = amdgpu_virt_reset_gpu(adev); 4675 if (r) 4676 return r; 4677 amdgpu_irq_gpu_reset_resume_helper(adev); 4678 4679 /* some sw clean up VF needs to do before recover */ 4680 amdgpu_virt_post_reset(adev); 4681 4682 /* Resume IP prior to SMC */ 4683 r = amdgpu_device_ip_reinit_early_sriov(adev); 4684 if (r) 4685 goto error; 4686 4687 amdgpu_virt_init_data_exchange(adev); 4688 4689 r = amdgpu_device_fw_loading(adev); 4690 if (r) 4691 return r; 4692 4693 /* now we are okay to resume SMC/CP/SDMA */ 4694 r = amdgpu_device_ip_reinit_late_sriov(adev); 4695 if (r) 4696 goto error; 4697 4698 hive = amdgpu_get_xgmi_hive(adev); 4699 /* Update PSP FW topology after reset */ 4700 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4701 r = amdgpu_xgmi_update_topology(hive, adev); 4702 4703 if (hive) 4704 amdgpu_put_xgmi_hive(hive); 4705 4706 if (!r) { 4707 r = amdgpu_ib_ring_tests(adev); 4708 4709 amdgpu_amdkfd_post_reset(adev); 4710 } 4711 4712 error: 4713 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4714 amdgpu_inc_vram_lost(adev); 4715 r = amdgpu_device_recover_vram(adev); 4716 } 4717 amdgpu_virt_release_full_gpu(adev, true); 4718 4719 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4720 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4721 retry_limit++; 4722 goto retry; 4723 } else 4724 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4725 } 4726 4727 return r; 4728 } 4729 4730 /** 4731 * amdgpu_device_has_job_running - check if there is any job in mirror list 4732 * 4733 * @adev: amdgpu_device pointer 4734 * 4735 * check if there is any job in mirror list 4736 */ 4737 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4738 { 4739 int i; 4740 struct drm_sched_job *job; 4741 4742 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4743 struct amdgpu_ring *ring = adev->rings[i]; 4744 4745 if (!ring || !ring->sched.thread) 4746 continue; 4747 4748 spin_lock(&ring->sched.job_list_lock); 4749 job = list_first_entry_or_null(&ring->sched.pending_list, 4750 struct drm_sched_job, list); 4751 spin_unlock(&ring->sched.job_list_lock); 4752 if (job) 4753 return true; 4754 } 4755 return false; 4756 } 4757 4758 /** 4759 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4760 * 4761 * @adev: amdgpu_device pointer 4762 * 4763 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4764 * a hung GPU. 
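 * amdgpu_gpu_recovery: 0 disables recovery, a positive value enables it, and -1 (auto) enables it except on the legacy ASICs listed in the switch below.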
4765 */ 4766 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4767 { 4768 4769 if (amdgpu_gpu_recovery == 0) 4770 goto disabled; 4771 4772 /* Skip soft reset check in fatal error mode */ 4773 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4774 return true; 4775 4776 if (amdgpu_sriov_vf(adev)) 4777 return true; 4778 4779 if (amdgpu_gpu_recovery == -1) { 4780 switch (adev->asic_type) { 4781 #ifdef CONFIG_DRM_AMDGPU_SI 4782 case CHIP_VERDE: 4783 case CHIP_TAHITI: 4784 case CHIP_PITCAIRN: 4785 case CHIP_OLAND: 4786 case CHIP_HAINAN: 4787 #endif 4788 #ifdef CONFIG_DRM_AMDGPU_CIK 4789 case CHIP_KAVERI: 4790 case CHIP_KABINI: 4791 case CHIP_MULLINS: 4792 #endif 4793 case CHIP_CARRIZO: 4794 case CHIP_STONEY: 4795 case CHIP_CYAN_SKILLFISH: 4796 goto disabled; 4797 default: 4798 break; 4799 } 4800 } 4801 4802 return true; 4803 4804 disabled: 4805 dev_info(adev->dev, "GPU recovery disabled.\n"); 4806 return false; 4807 } 4808 4809 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4810 { 4811 u32 i; 4812 int ret = 0; 4813 4814 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4815 4816 dev_info(adev->dev, "GPU mode1 reset\n"); 4817 4818 /* disable BM */ 4819 pci_clear_master(adev->pdev); 4820 4821 amdgpu_device_cache_pci_state(adev->pdev); 4822 4823 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4824 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4825 ret = amdgpu_dpm_mode1_reset(adev); 4826 } else { 4827 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4828 ret = psp_gpu_reset(adev); 4829 } 4830 4831 if (ret) 4832 goto mode1_reset_failed; 4833 4834 amdgpu_device_load_pci_state(adev->pdev); 4835 ret = amdgpu_psp_wait_for_bootloader(adev); 4836 if (ret) 4837 goto mode1_reset_failed; 4838 4839 /* wait for asic to come out of reset */ 4840 for (i = 0; i < adev->usec_timeout; i++) { 4841 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4842 4843 if (memsize != 0xffffffff) 4844 break; 4845 udelay(1); 4846 } 4847 4848 if (i >= adev->usec_timeout) { 4849 ret = -ETIMEDOUT; 4850 goto mode1_reset_failed; 4851 } 4852 4853 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4854 4855 return 0; 4856 4857 mode1_reset_failed: 4858 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4859 return ret; 4860 } 4861 4862 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4863 struct amdgpu_reset_context *reset_context) 4864 { 4865 int i, r = 0; 4866 struct amdgpu_job *job = NULL; 4867 bool need_full_reset = 4868 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4869 4870 if (reset_context->reset_req_dev == adev) 4871 job = reset_context->job; 4872 4873 if (amdgpu_sriov_vf(adev)) { 4874 /* stop the data exchange thread */ 4875 amdgpu_virt_fini_data_exchange(adev); 4876 } 4877 4878 amdgpu_fence_driver_isr_toggle(adev, true); 4879 4880 /* block all schedulers and reset given job's ring */ 4881 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4882 struct amdgpu_ring *ring = adev->rings[i]; 4883 4884 if (!ring || !ring->sched.thread) 4885 continue; 4886 4887 /* Clear job fence from fence drv to avoid force_completion 4888 * leave NULL and vm flush fence in fence drv 4889 */ 4890 amdgpu_fence_driver_clear_job_fences(ring); 4891 4892 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4893 amdgpu_fence_driver_force_completion(ring); 4894 } 4895 4896 amdgpu_fence_driver_isr_toggle(adev, false); 4897 4898 if (job && job->vm) 4899 drm_sched_increase_karma(&job->base); 4900 4901 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4902 /* If reset handler not 
implemented, continue; otherwise return */ 4903 if (r == -EOPNOTSUPP) 4904 r = 0; 4905 else 4906 return r; 4907 4908 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4909 if (!amdgpu_sriov_vf(adev)) { 4910 4911 if (!need_full_reset) 4912 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4913 4914 if (!need_full_reset && amdgpu_gpu_recovery && 4915 amdgpu_device_ip_check_soft_reset(adev)) { 4916 amdgpu_device_ip_pre_soft_reset(adev); 4917 r = amdgpu_device_ip_soft_reset(adev); 4918 amdgpu_device_ip_post_soft_reset(adev); 4919 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4920 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4921 need_full_reset = true; 4922 } 4923 } 4924 4925 if (need_full_reset) 4926 r = amdgpu_device_ip_suspend(adev); 4927 if (need_full_reset) 4928 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4929 else 4930 clear_bit(AMDGPU_NEED_FULL_RESET, 4931 &reset_context->flags); 4932 } 4933 4934 return r; 4935 } 4936 4937 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4938 { 4939 int i; 4940 4941 lockdep_assert_held(&adev->reset_domain->sem); 4942 4943 for (i = 0; i < adev->num_regs; i++) { 4944 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4945 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4946 adev->reset_dump_reg_value[i]); 4947 } 4948 4949 return 0; 4950 } 4951 4952 #ifdef CONFIG_DEV_COREDUMP 4953 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4954 size_t count, void *data, size_t datalen) 4955 { 4956 struct drm_printer p; 4957 struct amdgpu_device *adev = data; 4958 struct drm_print_iterator iter; 4959 int i; 4960 4961 iter.data = buffer; 4962 iter.offset = 0; 4963 iter.start = offset; 4964 iter.remain = count; 4965 4966 p = drm_coredump_printer(&iter); 4967 4968 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4969 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4970 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4971 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4972 if (adev->reset_task_info.pid) 4973 drm_printf(&p, "process_name: %s PID: %d\n", 4974 adev->reset_task_info.process_name, 4975 adev->reset_task_info.pid); 4976 4977 if (adev->reset_vram_lost) 4978 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4979 if (adev->num_regs) { 4980 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4981 4982 for (i = 0; i < adev->num_regs; i++) 4983 drm_printf(&p, "0x%08x: 0x%08x\n", 4984 adev->reset_dump_reg_list[i], 4985 adev->reset_dump_reg_value[i]); 4986 } 4987 4988 return count - iter.remain; 4989 } 4990 4991 static void amdgpu_devcoredump_free(void *data) 4992 { 4993 } 4994 4995 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4996 { 4997 struct drm_device *dev = adev_to_drm(adev); 4998 4999 ktime_get_ts64(&adev->reset_time); 5000 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT, 5001 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 5002 } 5003 #endif 5004 5005 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5006 struct amdgpu_reset_context *reset_context) 5007 { 5008 struct amdgpu_device *tmp_adev = NULL; 5009 bool need_full_reset, skip_hw_reset, vram_lost = false; 5010 int r = 0; 5011 bool gpu_reset_for_dev_remove = 0; 5012 5013 /* Try reset handler method first */ 5014 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5015 reset_list); 5016 amdgpu_reset_reg_dumps(tmp_adev); 5017 5018 
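/* Pass the full device list to the reset handler; ASIC-specific handlers may act on every device in the list. */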
reset_context->reset_device_list = device_list_handle; 5019 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5020 /* If reset handler not implemented, continue; otherwise return */ 5021 if (r == -EOPNOTSUPP) 5022 r = 0; 5023 else 5024 return r; 5025 5026 /* Reset handler not implemented, use the default method */ 5027 need_full_reset = 5028 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5029 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5030 5031 gpu_reset_for_dev_remove = 5032 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5033 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5034 5035 /* 5036 * ASIC reset has to be done on all XGMI hive nodes ASAP 5037 * to allow proper links negotiation in FW (within 1 sec) 5038 */ 5039 if (!skip_hw_reset && need_full_reset) { 5040 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5041 /* For XGMI run all resets in parallel to speed up the process */ 5042 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5043 tmp_adev->gmc.xgmi.pending_reset = false; 5044 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5045 r = -EALREADY; 5046 } else 5047 r = amdgpu_asic_reset(tmp_adev); 5048 5049 if (r) { 5050 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5051 r, adev_to_drm(tmp_adev)->unique); 5052 break; 5053 } 5054 } 5055 5056 /* For XGMI wait for all resets to complete before proceed */ 5057 if (!r) { 5058 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5059 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5060 flush_work(&tmp_adev->xgmi_reset_work); 5061 r = tmp_adev->asic_reset_res; 5062 if (r) 5063 break; 5064 } 5065 } 5066 } 5067 } 5068 5069 if (!r && amdgpu_ras_intr_triggered()) { 5070 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5071 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 5072 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 5073 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 5074 } 5075 5076 amdgpu_ras_intr_cleared(); 5077 } 5078 5079 /* Since the mode1 reset affects base ip blocks, the 5080 * phase1 ip blocks need to be resumed. Otherwise there 5081 * will be a BIOS signature error and the psp bootloader 5082 * can't load kdb on the next amdgpu install. 
5083 */ 5084 if (gpu_reset_for_dev_remove) { 5085 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 5086 amdgpu_device_ip_resume_phase1(tmp_adev); 5087 5088 goto end; 5089 } 5090 5091 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5092 if (need_full_reset) { 5093 /* post card */ 5094 r = amdgpu_device_asic_init(tmp_adev); 5095 if (r) { 5096 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5097 } else { 5098 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5099 5100 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5101 if (r) 5102 goto out; 5103 5104 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5105 #ifdef CONFIG_DEV_COREDUMP 5106 tmp_adev->reset_vram_lost = vram_lost; 5107 memset(&tmp_adev->reset_task_info, 0, 5108 sizeof(tmp_adev->reset_task_info)); 5109 if (reset_context->job && reset_context->job->vm) 5110 tmp_adev->reset_task_info = 5111 reset_context->job->vm->task_info; 5112 amdgpu_reset_capture_coredumpm(tmp_adev); 5113 #endif 5114 if (vram_lost) { 5115 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5116 amdgpu_inc_vram_lost(tmp_adev); 5117 } 5118 5119 r = amdgpu_device_fw_loading(tmp_adev); 5120 if (r) 5121 return r; 5122 5123 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5124 if (r) 5125 goto out; 5126 5127 if (vram_lost) 5128 amdgpu_device_fill_reset_magic(tmp_adev); 5129 5130 /* 5131 * Add this ASIC as tracked as reset was already 5132 * complete successfully. 5133 */ 5134 amdgpu_register_gpu_instance(tmp_adev); 5135 5136 if (!reset_context->hive && 5137 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5138 amdgpu_xgmi_add_device(tmp_adev); 5139 5140 r = amdgpu_device_ip_late_init(tmp_adev); 5141 if (r) 5142 goto out; 5143 5144 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5145 5146 /* 5147 * The GPU enters bad state once faulty pages 5148 * by ECC has reached the threshold, and ras 5149 * recovery is scheduled next. So add one check 5150 * here to break recovery if it indeed exceeds 5151 * bad page threshold, and remind user to 5152 * retire this GPU or setting one bigger 5153 * bad_page_threshold value to fix this once 5154 * probing driver again. 5155 */ 5156 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5157 /* must succeed. 
*/ 5158 amdgpu_ras_resume(tmp_adev); 5159 } else { 5160 r = -EINVAL; 5161 goto out; 5162 } 5163 5164 /* Update PSP FW topology after reset */ 5165 if (reset_context->hive && 5166 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5167 r = amdgpu_xgmi_update_topology( 5168 reset_context->hive, tmp_adev); 5169 } 5170 } 5171 5172 out: 5173 if (!r) { 5174 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5175 r = amdgpu_ib_ring_tests(tmp_adev); 5176 if (r) { 5177 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5178 need_full_reset = true; 5179 r = -EAGAIN; 5180 goto end; 5181 } 5182 } 5183 5184 if (!r) 5185 r = amdgpu_device_recover_vram(tmp_adev); 5186 else 5187 tmp_adev->asic_reset_res = r; 5188 } 5189 5190 end: 5191 if (need_full_reset) 5192 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5193 else 5194 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5195 return r; 5196 } 5197 5198 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5199 { 5200 5201 switch (amdgpu_asic_reset_method(adev)) { 5202 case AMD_RESET_METHOD_MODE1: 5203 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5204 break; 5205 case AMD_RESET_METHOD_MODE2: 5206 adev->mp1_state = PP_MP1_STATE_RESET; 5207 break; 5208 default: 5209 adev->mp1_state = PP_MP1_STATE_NONE; 5210 break; 5211 } 5212 } 5213 5214 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5215 { 5216 amdgpu_vf_error_trans_all(adev); 5217 adev->mp1_state = PP_MP1_STATE_NONE; 5218 } 5219 5220 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5221 { 5222 struct pci_dev *p = NULL; 5223 5224 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5225 adev->pdev->bus->number, 1); 5226 if (p) { 5227 pm_runtime_enable(&(p->dev)); 5228 pm_runtime_resume(&(p->dev)); 5229 } 5230 5231 pci_dev_put(p); 5232 } 5233 5234 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5235 { 5236 enum amd_reset_method reset_method; 5237 struct pci_dev *p = NULL; 5238 u64 expires; 5239 5240 /* 5241 * For now, only BACO and mode1 reset are confirmed 5242 * to suffer the audio issue without proper suspended. 5243 */ 5244 reset_method = amdgpu_asic_reset_method(adev); 5245 if ((reset_method != AMD_RESET_METHOD_BACO) && 5246 (reset_method != AMD_RESET_METHOD_MODE1)) 5247 return -EINVAL; 5248 5249 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5250 adev->pdev->bus->number, 1); 5251 if (!p) 5252 return -ENODEV; 5253 5254 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5255 if (!expires) 5256 /* 5257 * If we cannot get the audio device autosuspend delay, 5258 * a fixed 4S interval will be used. Considering 3S is 5259 * the audio controller default autosuspend delay setting. 5260 * 4S used here is guaranteed to cover that. 5261 */ 5262 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5263 5264 while (!pm_runtime_status_suspended(&(p->dev))) { 5265 if (!pm_runtime_suspend(&(p->dev))) 5266 break; 5267 5268 if (expires < ktime_get_mono_fast_ns()) { 5269 dev_warn(adev->dev, "failed to suspend display audio\n"); 5270 pci_dev_put(p); 5271 /* TODO: abort the succeeding gpu reset? 
*/ 5272 return -ETIMEDOUT; 5273 } 5274 } 5275 5276 pm_runtime_disable(&(p->dev)); 5277 5278 pci_dev_put(p); 5279 return 0; 5280 } 5281 5282 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5283 { 5284 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5285 5286 #if defined(CONFIG_DEBUG_FS) 5287 if (!amdgpu_sriov_vf(adev)) 5288 cancel_work(&adev->reset_work); 5289 #endif 5290 5291 if (adev->kfd.dev) 5292 cancel_work(&adev->kfd.reset_work); 5293 5294 if (amdgpu_sriov_vf(adev)) 5295 cancel_work(&adev->virt.flr_work); 5296 5297 if (con && adev->ras_enabled) 5298 cancel_work(&con->recovery_work); 5299 5300 } 5301 5302 /** 5303 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5304 * 5305 * @adev: amdgpu_device pointer 5306 * @job: which job trigger hang 5307 * @reset_context: amdgpu reset context pointer 5308 * 5309 * Attempt to reset the GPU if it has hung (all asics). 5310 * Attempt to do soft-reset or full-reset and reinitialize Asic 5311 * Returns 0 for success or an error on failure. 5312 */ 5313 5314 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5315 struct amdgpu_job *job, 5316 struct amdgpu_reset_context *reset_context) 5317 { 5318 struct list_head device_list, *device_list_handle = NULL; 5319 bool job_signaled = false; 5320 struct amdgpu_hive_info *hive = NULL; 5321 struct amdgpu_device *tmp_adev = NULL; 5322 int i, r = 0; 5323 bool need_emergency_restart = false; 5324 bool audio_suspended = false; 5325 bool gpu_reset_for_dev_remove = false; 5326 5327 gpu_reset_for_dev_remove = 5328 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5329 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5330 5331 /* 5332 * Special case: RAS triggered and full reset isn't supported 5333 */ 5334 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5335 5336 /* 5337 * Flush RAM to disk so that after reboot 5338 * the user can read log and see why the system rebooted. 5339 */ 5340 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5341 DRM_WARN("Emergency reboot."); 5342 5343 ksys_sync_helper(); 5344 emergency_restart(); 5345 } 5346 5347 dev_info(adev->dev, "GPU %s begin!\n", 5348 need_emergency_restart ? "jobs stop":"reset"); 5349 5350 if (!amdgpu_sriov_vf(adev)) 5351 hive = amdgpu_get_xgmi_hive(adev); 5352 if (hive) 5353 mutex_lock(&hive->hive_lock); 5354 5355 reset_context->job = job; 5356 reset_context->hive = hive; 5357 /* 5358 * Build list of devices to reset. 5359 * In case we are in XGMI hive mode, resort the device list 5360 * to put adev in the 1st position. 
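 * The first entry in the resulting list is used to lock the shared reset domain below.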
5361 */ 5362 INIT_LIST_HEAD(&device_list); 5363 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5364 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5365 list_add_tail(&tmp_adev->reset_list, &device_list); 5366 if (gpu_reset_for_dev_remove && adev->shutdown) 5367 tmp_adev->shutdown = true; 5368 } 5369 if (!list_is_first(&adev->reset_list, &device_list)) 5370 list_rotate_to_front(&adev->reset_list, &device_list); 5371 device_list_handle = &device_list; 5372 } else { 5373 list_add_tail(&adev->reset_list, &device_list); 5374 device_list_handle = &device_list; 5375 } 5376 5377 /* We need to lock the reset domain only once, both for XGMI and single device */ 5378 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5379 reset_list); 5380 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5381 5382 /* block all schedulers and reset given job's ring */ 5383 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5384 5385 amdgpu_device_set_mp1_state(tmp_adev); 5386 5387 /* 5388 * Try to put the audio codec into suspend state 5389 * before the gpu reset starts. 5390 * 5391 * The power domain of the graphics device is shared 5392 * with the AZ (audio) power domain. Without this, 5393 * we may change the audio hardware from behind 5394 * the audio driver's back and trigger 5395 * audio codec errors. 5396 */ 5397 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5398 audio_suspended = true; 5399 5400 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5401 5402 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5403 5404 if (!amdgpu_sriov_vf(tmp_adev)) 5405 amdgpu_amdkfd_pre_reset(tmp_adev); 5406 5407 /* 5408 * Mark the ASICs to be reset as untracked first, 5409 * and add them back after the reset completes. 5410 */ 5411 amdgpu_unregister_gpu_instance(tmp_adev); 5412 5413 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5414 5415 /* disable ras on ALL IPs */ 5416 if (!need_emergency_restart && 5417 amdgpu_device_ip_need_full_reset(tmp_adev)) 5418 amdgpu_ras_suspend(tmp_adev); 5419 5420 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5421 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5422 5423 if (!ring || !ring->sched.thread) 5424 continue; 5425 5426 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5427 5428 if (need_emergency_restart) 5429 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5430 } 5431 atomic_inc(&tmp_adev->gpu_reset_counter); 5432 } 5433 5434 if (need_emergency_restart) 5435 goto skip_sched_resume; 5436 5437 /* 5438 * Must check guilty signal here since after this point all old 5439 * HW fences are force signaled. 5440 * 5441 * job->base holds a reference to parent fence 5442 */ 5443 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5444 job_signaled = true; 5445 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5446 goto skip_hw_reset; 5447 } 5448 5449 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5450 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5451 if (gpu_reset_for_dev_remove) { 5452 /* Workaround for ASICs that need to disable the SMC first */ 5453 amdgpu_device_smu_fini_early(tmp_adev); 5454 } 5455 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5456 /* TODO: Should we stop? */ 5457 if (r) { 5458 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5459 r, adev_to_drm(tmp_adev)->unique); 5460 tmp_adev->asic_reset_res = r; 5461 } 5462 5463 /* 5464 * Drop all pending non scheduler resets.
Scheduler resets 5465 * were already dropped during drm_sched_stop 5466 */ 5467 amdgpu_device_stop_pending_resets(tmp_adev); 5468 } 5469 5470 /* Actual ASIC resets if needed.*/ 5471 /* Host driver will handle XGMI hive reset for SRIOV */ 5472 if (amdgpu_sriov_vf(adev)) { 5473 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5474 if (r) 5475 adev->asic_reset_res = r; 5476 5477 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5478 if (amdgpu_ip_version(adev, GC_HWIP, 0) == 5479 IP_VERSION(9, 4, 2) || 5480 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5481 amdgpu_ras_resume(adev); 5482 } else { 5483 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5484 if (r && r == -EAGAIN) 5485 goto retry; 5486 5487 if (!r && gpu_reset_for_dev_remove) 5488 goto recover_end; 5489 } 5490 5491 skip_hw_reset: 5492 5493 /* Post ASIC reset for all devs .*/ 5494 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5495 5496 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5497 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5498 5499 if (!ring || !ring->sched.thread) 5500 continue; 5501 5502 drm_sched_start(&ring->sched, true); 5503 } 5504 5505 if (adev->enable_mes && 5506 amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(11, 0, 3)) 5507 amdgpu_mes_self_test(tmp_adev); 5508 5509 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5510 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5511 5512 if (tmp_adev->asic_reset_res) 5513 r = tmp_adev->asic_reset_res; 5514 5515 tmp_adev->asic_reset_res = 0; 5516 5517 if (r) { 5518 /* bad news, how to tell it to userspace ? */ 5519 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5520 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5521 } else { 5522 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5523 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5524 DRM_WARN("smart shift update failed\n"); 5525 } 5526 } 5527 5528 skip_sched_resume: 5529 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5530 /* unlock kfd: SRIOV would do it separately */ 5531 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5532 amdgpu_amdkfd_post_reset(tmp_adev); 5533 5534 /* kfd_post_reset will do nothing if kfd device is not initialized, 5535 * need to bring up kfd here if it's not be initialized before 5536 */ 5537 if (!adev->kfd.init_complete) 5538 amdgpu_amdkfd_device_init(adev); 5539 5540 if (audio_suspended) 5541 amdgpu_device_resume_display_audio(tmp_adev); 5542 5543 amdgpu_device_unset_mp1_state(tmp_adev); 5544 5545 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5546 } 5547 5548 recover_end: 5549 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5550 reset_list); 5551 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5552 5553 if (hive) { 5554 mutex_unlock(&hive->hive_lock); 5555 amdgpu_put_xgmi_hive(hive); 5556 } 5557 5558 if (r) 5559 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5560 5561 atomic_set(&adev->reset_domain->reset_res, r); 5562 return r; 5563 } 5564 5565 /** 5566 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5567 * 5568 * @adev: amdgpu_device pointer 5569 * 5570 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5571 * and lanes) of the slot the device is in. 
Handles APUs and 5572 * virtualized environments where PCIE config space may not be available. 5573 */ 5574 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5575 { 5576 struct pci_dev *pdev; 5577 enum pci_bus_speed speed_cap, platform_speed_cap; 5578 enum pcie_link_width platform_link_width; 5579 5580 if (amdgpu_pcie_gen_cap) 5581 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5582 5583 if (amdgpu_pcie_lane_cap) 5584 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5585 5586 /* covers APUs as well */ 5587 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5588 if (adev->pm.pcie_gen_mask == 0) 5589 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5590 if (adev->pm.pcie_mlw_mask == 0) 5591 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5592 return; 5593 } 5594 5595 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5596 return; 5597 5598 pcie_bandwidth_available(adev->pdev, NULL, 5599 &platform_speed_cap, &platform_link_width); 5600 5601 if (adev->pm.pcie_gen_mask == 0) { 5602 /* asic caps */ 5603 pdev = adev->pdev; 5604 speed_cap = pcie_get_speed_cap(pdev); 5605 if (speed_cap == PCI_SPEED_UNKNOWN) { 5606 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5607 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5608 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5609 } else { 5610 if (speed_cap == PCIE_SPEED_32_0GT) 5611 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5612 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5613 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5614 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5615 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5616 else if (speed_cap == PCIE_SPEED_16_0GT) 5617 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5618 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5619 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5620 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5621 else if (speed_cap == PCIE_SPEED_8_0GT) 5622 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5623 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5624 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5625 else if (speed_cap == PCIE_SPEED_5_0GT) 5626 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5627 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5628 else 5629 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5630 } 5631 /* platform caps */ 5632 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5633 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5634 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5635 } else { 5636 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5637 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5638 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5639 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5640 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5641 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5642 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5643 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5644 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5645 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5646 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5647 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5648 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5649 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5650 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5651 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5652 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5653 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5654 else 5655 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5656 5657 } 5658 } 
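/* Example: an ASIC with a 16 GT/s speed cap sitting in a platform slot limited to 8 GT/s and x8 width ends up with the ASIC GEN1-GEN4 and platform GEN1-GEN3 bits set above, and X8|X4|X2|X1 from the switch below. */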
int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

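/**
 * amdgpu_device_baco_exit - exit BACO (Bus Active, Chip Off) state
 *
 * @dev: drm_device pointer
 *
 * Requests BACO exit through the DPM layer, re-enables the RAS doorbell
 * interrupt if applicable and clears any pending doorbell interrupt when
 * running in passthrough mode.
 *
 * Return: 0 on success, -ENOTSUPP if the device does not support BACO,
 * or a negative error code from amdgpu_dpm_baco_exit().
 */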
int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

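/*
 * PCI error recovery (AER) support.  The four amdgpu_pci_* callbacks below
 * are hooked into the PCI core roughly as sketched here (illustrative only;
 * the actual struct pci_error_handlers registration lives in amdgpu_drv.c):
 *
 *	.error_detected	= amdgpu_pci_error_detected,
 *	.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *	.slot_reset	= amdgpu_pci_slot_reset,
 *	.resume		= amdgpu_pci_resume,
 */
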
/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the PCI error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

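/**
 * amdgpu_device_cache_pci_state - cache the PCI config space of the device
 *
 * @pdev: PCI device struct
 *
 * Saves the PCI configuration space and stores a kernel-allocated copy of it
 * in adev->pci_state so that it can be restored after a reset.
 *
 * Return: true on success, false if saving or storing the state failed.
 */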
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state\n");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

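/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Loads the configuration space previously cached by
 * amdgpu_device_cache_pci_state() and writes it back to the device.
 *
 * Return: true on success, false if no state was cached or loading failed.
 */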
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

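/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: register dword offset (converted to a byte offset internally)
 *
 * Reads a PCIe port register through the NBIO index/data pair while
 * holding pcie_idx_lock.
 *
 * Return: the 32-bit register value.
 */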
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

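/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: register dword offset (converted to a byte offset internally)
 * @v: 32-bit value to write
 *
 * Writes @v to a PCIe port register through the NBIO index/data pair while
 * holding pcie_idx_lock, then reads the data register back to post the write.
 */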
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
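
/*
 * Usage sketch for amdgpu_device_wait_on_rreg() (illustrative only; the
 * register offset, name and mask below are hypothetical):
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, reg_offset, "FOO_STATUS",
 *				       0x1, 0x1);
 *	if (r)
 *		return r;
 *
 * This polls reg_offset until bit 0 reads back as 1, restarting the timeout
 * whenever the register value changes, and warns and returns -ETIMEDOUT if
 * the expected value is never reached.
 */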