/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
"CARRIZO", 113 "STONEY", 114 "POLARIS10", 115 "POLARIS11", 116 "POLARIS12", 117 "VEGAM", 118 "VEGA10", 119 "VEGA12", 120 "VEGA20", 121 "RAVEN", 122 "ARCTURUS", 123 "RENOIR", 124 "ALDEBARAN", 125 "NAVI10", 126 "CYAN_SKILLFISH", 127 "NAVI14", 128 "NAVI12", 129 "SIENNA_CICHLID", 130 "NAVY_FLOUNDER", 131 "VANGOGH", 132 "DIMGREY_CAVEFISH", 133 "BEIGE_GOBY", 134 "YELLOW_CARP", 135 "IP DISCOVERY", 136 "LAST", 137 }; 138 139 /** 140 * DOC: pcie_replay_count 141 * 142 * The amdgpu driver provides a sysfs API for reporting the total number 143 * of PCIe replays (NAKs) 144 * The file pcie_replay_count is used for this and returns the total 145 * number of replays as a sum of the NAKs generated and NAKs received 146 */ 147 148 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 149 struct device_attribute *attr, char *buf) 150 { 151 struct drm_device *ddev = dev_get_drvdata(dev); 152 struct amdgpu_device *adev = drm_to_adev(ddev); 153 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 154 155 return sysfs_emit(buf, "%llu\n", cnt); 156 } 157 158 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 159 amdgpu_device_get_pcie_replay_count, NULL); 160 161 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 162 163 /** 164 * DOC: product_name 165 * 166 * The amdgpu driver provides a sysfs API for reporting the product name 167 * for the device 168 * The file product_name is used for this and returns the product name 169 * as returned from the FRU. 170 * NOTE: This is only available for certain server cards 171 */ 172 173 static ssize_t amdgpu_device_get_product_name(struct device *dev, 174 struct device_attribute *attr, char *buf) 175 { 176 struct drm_device *ddev = dev_get_drvdata(dev); 177 struct amdgpu_device *adev = drm_to_adev(ddev); 178 179 return sysfs_emit(buf, "%s\n", adev->product_name); 180 } 181 182 static DEVICE_ATTR(product_name, S_IRUGO, 183 amdgpu_device_get_product_name, NULL); 184 185 /** 186 * DOC: product_number 187 * 188 * The amdgpu driver provides a sysfs API for reporting the part number 189 * for the device 190 * The file product_number is used for this and returns the part number 191 * as returned from the FRU. 192 * NOTE: This is only available for certain server cards 193 */ 194 195 static ssize_t amdgpu_device_get_product_number(struct device *dev, 196 struct device_attribute *attr, char *buf) 197 { 198 struct drm_device *ddev = dev_get_drvdata(dev); 199 struct amdgpu_device *adev = drm_to_adev(ddev); 200 201 return sysfs_emit(buf, "%s\n", adev->product_number); 202 } 203 204 static DEVICE_ATTR(product_number, S_IRUGO, 205 amdgpu_device_get_product_number, NULL); 206 207 /** 208 * DOC: serial_number 209 * 210 * The amdgpu driver provides a sysfs API for reporting the serial number 211 * for the device 212 * The file serial_number is used for this and returns the serial number 213 * as returned from the FRU. 
/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);
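/*
 * Illustrative sketch (not part of the driver): userspace reads the
 * attributes above from the device's sysfs directory. The card index
 * below is an assumption and depends on the system, e.g.:
 *
 *	cat /sys/class/drm/card0/device/pcie_replay_count
 *	cat /sys/class/drm/card0/device/product_name
 *	cat /sys/class/drm/card0/device/product_number
 *	cat /sys/class/drm/card0/device/serial_number
 *
 * The FRU-backed files only return meaningful data on boards that expose
 * a FRU EEPROM (certain server cards).
 */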
/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
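/*
 * Illustrative usage sketch (not part of this file): reading a few dwords
 * out of VRAM through the helper above; vram_offset is a placeholder.
 * Offsets and sizes must be dword-aligned for the MM_INDEX/MM_DATA
 * fallback path:
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, vram_offset, data, sizeof(data), false);
 *
 * The helper prefers the CPU-visible aperture and falls back to the indexed
 * MMIO window for VRAM beyond the visible range.
 */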
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to write to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}
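/*
 * Note (informational; assumes the register access macros defined in
 * amdgpu.h): most callers do not invoke amdgpu_device_rreg()/
 * amdgpu_device_wreg() directly but go through wrappers such as
 * RREG32()/WREG32(), e.g.:
 *
 *	tmp = RREG32(mmSOME_REGISTER);
 *	tmp |= SOME_FIELD_MASK;
 *	WREG32(mmSOME_REGISTER, tmp);
 *
 * mmSOME_REGISTER and SOME_FIELD_MASK are placeholders; the *_NO_KIQ
 * variants bypass the KIQ path used for SR-IOV runtime access.
 */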
/**
 * amdgpu_mm_wreg_mmio_rlc - write a register either with direct/indirect mmio or with the RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
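/*
 * Illustrative sketch (not part of this file): golden register tables are
 * flat arrays of { register, and_mask, or_mask } triplets. The register
 * names below are placeholders, not real registers:
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00000100,
 *		mmEXAMPLE_REG_B, 0x0000000f, 0x00000002,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 *
 * An and_mask of 0xffffffff means "replace the whole register with or_mask".
 */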
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	if (adev->enable_mes) {
		adev->doorbell.num_doorbells =
			adev->doorbell.size / sizeof(u32);
	} else {
		adev->doorbell.num_doorbells =
			min_t(u32, adev->doorbell.size / sizeof(u32),
			      adev->doorbell_index.max_assignment+1);
		if (adev->doorbell.num_doorbells == 0)
			return -EINVAL;

		/* For Vega, reserve and map two pages on doorbell BAR since SDMA
		 * paging queue doorbell use the second page. The
		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
		 * doorbells are in the first page. So with paging queue enabled,
		 * the max num_doorbells should be increased by one page (0x400 dwords).
		 */
		if (adev->asic_type >= CHIP_VEGA10)
			adev->doorbell.num_doorbells += 0x400;
	}

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
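/*
 * Illustrative usage sketch (not part of this file): rings and fences
 * typically grab a writeback slot at init time and release it at teardown.
 * The GPU address of a slot is adev->wb.gpu_addr plus the dword offset
 * times four:
 *
 *	u32 wb;
 *	int r;
 *
 *	r = amdgpu_device_wb_get(adev, &wb);
 *	if (r)
 *		return r;
 *	... hardware writes status to adev->wb.gpu_addr + (wb * 4),
 *	    the CPU reads it back from adev->wb.wb[wb] ...
 *	amdgpu_device_wb_free(adev, wb);
 */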
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: In the whole-GPU pass-through virtualization case, after
		 * VM reboot some old SMC firmware still needs the driver to do vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't
		 * have this flaw, so we force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have 12 bits of offset, a minimum of
 * 9 bits in the page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
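/*
 * Illustrative usage sketch (not part of this file): gating the GFX block's
 * clocks, assuming the usual amd_clockgating_state enum values:
 *
 *	r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *						   AMD_CG_STATE_GATE);
 *
 * amdgpu_device_ip_set_powergating_state() below follows the same pattern
 * with the powergating (AMD_PG_STATE_*) states.
 */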
/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block version is equal or greater,
 * 1 if it is smaller or the ip_block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}
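/*
 * Illustrative usage sketch (not part of this file): IP-specific code can
 * look up a block or gate behavior on its version, e.g.:
 *
 *	struct amdgpu_ip_block *ip_block;
 *
 *	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (ip_block && amdgpu_device_ip_block_version_cmp(adev,
 *			AMD_IP_BLOCK_TYPE_GFX, 9, 0) == 0)
 *		... a GFX block of version 9.0 or newer is present ...
 */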
/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}
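/*
 * Illustrative note (derived from the parsing above; the authoritative
 * option description lives with the module parameter in amdgpu_drv.c):
 * virtual_display is a semicolon-separated list of
 * "<pci address>,<crtc count>" entries, or "all", e.g.:
 *
 *	modprobe amdgpu virtual_display=0000:03:00.0,2
 *	modprobe amdgpu virtual_display=all
 *
 * The PCI address above is a placeholder; the crtc count is clamped to 1-6.
 */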
1975 */ 1976 if (adev->asic_type != CHIP_NAVI12) 1977 return 0; 1978 } 1979 1980 switch (adev->asic_type) { 1981 default: 1982 return 0; 1983 case CHIP_VEGA10: 1984 chip_name = "vega10"; 1985 break; 1986 case CHIP_VEGA12: 1987 chip_name = "vega12"; 1988 break; 1989 case CHIP_RAVEN: 1990 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1991 chip_name = "raven2"; 1992 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1993 chip_name = "picasso"; 1994 else 1995 chip_name = "raven"; 1996 break; 1997 case CHIP_ARCTURUS: 1998 chip_name = "arcturus"; 1999 break; 2000 case CHIP_NAVI12: 2001 chip_name = "navi12"; 2002 break; 2003 } 2004 2005 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2006 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2007 if (err) { 2008 dev_err(adev->dev, 2009 "Failed to get gpu_info firmware \"%s\"\n", 2010 fw_name); 2011 goto out; 2012 } 2013 2014 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2015 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2016 2017 switch (hdr->version_major) { 2018 case 1: 2019 { 2020 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2021 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2022 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2023 2024 /* 2025 * Should be droped when DAL no longer needs it. 2026 */ 2027 if (adev->asic_type == CHIP_NAVI12) 2028 goto parse_soc_bounding_box; 2029 2030 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2031 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2032 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2033 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2034 adev->gfx.config.max_texture_channel_caches = 2035 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2036 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2037 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2038 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2039 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2040 adev->gfx.config.double_offchip_lds_buf = 2041 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2042 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2043 adev->gfx.cu_info.max_waves_per_simd = 2044 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2045 adev->gfx.cu_info.max_scratch_slots_per_cu = 2046 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2047 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2048 if (hdr->version_minor >= 1) { 2049 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2050 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2051 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2052 adev->gfx.config.num_sc_per_sh = 2053 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2054 adev->gfx.config.num_packer_per_sc = 2055 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2056 } 2057 2058 parse_soc_bounding_box: 2059 /* 2060 * soc bounding box info is not integrated in disocovery table, 2061 * we always need to parse it from gpu info firmware if needed. 
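 * Only the v1_2 firmware layout handled just below carries the bounding
 * box; older minor versions simply leave adev->dm.soc_bounding_box unset.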
2062 */ 2063 if (hdr->version_minor == 2) { 2064 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2065 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2066 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2067 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2068 } 2069 break; 2070 } 2071 default: 2072 dev_err(adev->dev, 2073 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2074 err = -EINVAL; 2075 goto out; 2076 } 2077 out: 2078 return err; 2079 } 2080 2081 /** 2082 * amdgpu_device_ip_early_init - run early init for hardware IPs 2083 * 2084 * @adev: amdgpu_device pointer 2085 * 2086 * Early initialization pass for hardware IPs. The hardware IPs that make 2087 * up each asic are discovered each IP's early_init callback is run. This 2088 * is the first stage in initializing the asic. 2089 * Returns 0 on success, negative error code on failure. 2090 */ 2091 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2092 { 2093 struct drm_device *dev = adev_to_drm(adev); 2094 struct pci_dev *parent; 2095 int i, r; 2096 bool total; 2097 2098 amdgpu_device_enable_virtual_display(adev); 2099 2100 if (amdgpu_sriov_vf(adev)) { 2101 r = amdgpu_virt_request_full_gpu(adev, true); 2102 if (r) 2103 return r; 2104 } 2105 2106 switch (adev->asic_type) { 2107 #ifdef CONFIG_DRM_AMDGPU_SI 2108 case CHIP_VERDE: 2109 case CHIP_TAHITI: 2110 case CHIP_PITCAIRN: 2111 case CHIP_OLAND: 2112 case CHIP_HAINAN: 2113 adev->family = AMDGPU_FAMILY_SI; 2114 r = si_set_ip_blocks(adev); 2115 if (r) 2116 return r; 2117 break; 2118 #endif 2119 #ifdef CONFIG_DRM_AMDGPU_CIK 2120 case CHIP_BONAIRE: 2121 case CHIP_HAWAII: 2122 case CHIP_KAVERI: 2123 case CHIP_KABINI: 2124 case CHIP_MULLINS: 2125 if (adev->flags & AMD_IS_APU) 2126 adev->family = AMDGPU_FAMILY_KV; 2127 else 2128 adev->family = AMDGPU_FAMILY_CI; 2129 2130 r = cik_set_ip_blocks(adev); 2131 if (r) 2132 return r; 2133 break; 2134 #endif 2135 case CHIP_TOPAZ: 2136 case CHIP_TONGA: 2137 case CHIP_FIJI: 2138 case CHIP_POLARIS10: 2139 case CHIP_POLARIS11: 2140 case CHIP_POLARIS12: 2141 case CHIP_VEGAM: 2142 case CHIP_CARRIZO: 2143 case CHIP_STONEY: 2144 if (adev->flags & AMD_IS_APU) 2145 adev->family = AMDGPU_FAMILY_CZ; 2146 else 2147 adev->family = AMDGPU_FAMILY_VI; 2148 2149 r = vi_set_ip_blocks(adev); 2150 if (r) 2151 return r; 2152 break; 2153 default: 2154 r = amdgpu_discovery_set_ip_blocks(adev); 2155 if (r) 2156 return r; 2157 break; 2158 } 2159 2160 if (amdgpu_has_atpx() && 2161 (amdgpu_is_atpx_hybrid() || 2162 amdgpu_has_atpx_dgpu_power_cntl()) && 2163 ((adev->flags & AMD_IS_APU) == 0) && 2164 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2165 adev->flags |= AMD_IS_PX; 2166 2167 if (!(adev->flags & AMD_IS_APU)) { 2168 parent = pci_upstream_bridge(adev->pdev); 2169 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2170 } 2171 2172 amdgpu_amdkfd_device_probe(adev); 2173 2174 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2175 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2176 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2177 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2178 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2179 2180 total = true; 2181 for (i = 0; i < adev->num_ip_blocks; i++) { 2182 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2183 DRM_ERROR("disabled ip block: %d <%s>\n", 2184 i, adev->ip_blocks[i].version->funcs->name); 2185 adev->ip_blocks[i].status.valid = false; 2186 } else { 2187 if (adev->ip_blocks[i].version->funcs->early_init) { 2188 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2189 if (r == -ENOENT) { 2190 adev->ip_blocks[i].status.valid = false; 2191 } else if (r) { 2192 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2193 adev->ip_blocks[i].version->funcs->name, r); 2194 total = false; 2195 } else { 2196 adev->ip_blocks[i].status.valid = true; 2197 } 2198 } else { 2199 adev->ip_blocks[i].status.valid = true; 2200 } 2201 } 2202 /* get the vbios after the asic_funcs are set up */ 2203 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2204 r = amdgpu_device_parse_gpu_info_fw(adev); 2205 if (r) 2206 return r; 2207 2208 /* Read BIOS */ 2209 if (!amdgpu_get_bios(adev)) 2210 return -EINVAL; 2211 2212 r = amdgpu_atombios_init(adev); 2213 if (r) { 2214 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2215 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2216 return r; 2217 } 2218 2219 /*get pf2vf msg info at it's earliest time*/ 2220 if (amdgpu_sriov_vf(adev)) 2221 amdgpu_virt_init_data_exchange(adev); 2222 2223 } 2224 } 2225 if (!total) 2226 return -ENODEV; 2227 2228 adev->cg_flags &= amdgpu_cg_mask; 2229 adev->pg_flags &= amdgpu_pg_mask; 2230 2231 return 0; 2232 } 2233 2234 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2235 { 2236 int i, r; 2237 2238 for (i = 0; i < adev->num_ip_blocks; i++) { 2239 if (!adev->ip_blocks[i].status.sw) 2240 continue; 2241 if (adev->ip_blocks[i].status.hw) 2242 continue; 2243 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2244 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2245 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2246 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2247 if (r) { 2248 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2249 adev->ip_blocks[i].version->funcs->name, r); 2250 return r; 2251 } 2252 adev->ip_blocks[i].status.hw = true; 2253 } 2254 } 2255 2256 return 0; 2257 } 2258 2259 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2260 { 2261 int i, r; 2262 2263 for (i = 0; i < adev->num_ip_blocks; i++) { 2264 if (!adev->ip_blocks[i].status.sw) 2265 continue; 2266 if (adev->ip_blocks[i].status.hw) 2267 continue; 2268 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2269 if (r) { 2270 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2271 adev->ip_blocks[i].version->funcs->name, r); 2272 return r; 2273 } 2274 adev->ip_blocks[i].status.hw = true; 2275 } 2276 2277 return 0; 2278 } 2279 2280 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2281 { 2282 int r = 0; 2283 int i; 2284 uint32_t smu_version; 2285 2286 if (adev->asic_type >= CHIP_VEGA10) { 2287 for (i = 0; i < adev->num_ip_blocks; i++) { 2288 if (adev->ip_blocks[i].version->type != 
AMD_IP_BLOCK_TYPE_PSP) 2289 continue; 2290 2291 if (!adev->ip_blocks[i].status.sw) 2292 continue; 2293 2294 /* no need to do the fw loading again if already done*/ 2295 if (adev->ip_blocks[i].status.hw == true) 2296 break; 2297 2298 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2299 r = adev->ip_blocks[i].version->funcs->resume(adev); 2300 if (r) { 2301 DRM_ERROR("resume of IP block <%s> failed %d\n", 2302 adev->ip_blocks[i].version->funcs->name, r); 2303 return r; 2304 } 2305 } else { 2306 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2307 if (r) { 2308 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2309 adev->ip_blocks[i].version->funcs->name, r); 2310 return r; 2311 } 2312 } 2313 2314 adev->ip_blocks[i].status.hw = true; 2315 break; 2316 } 2317 } 2318 2319 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2320 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2321 2322 return r; 2323 } 2324 2325 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2326 { 2327 long timeout; 2328 int r, i; 2329 2330 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2331 struct amdgpu_ring *ring = adev->rings[i]; 2332 2333 /* No need to setup the GPU scheduler for rings that don't need it */ 2334 if (!ring || ring->no_scheduler) 2335 continue; 2336 2337 switch (ring->funcs->type) { 2338 case AMDGPU_RING_TYPE_GFX: 2339 timeout = adev->gfx_timeout; 2340 break; 2341 case AMDGPU_RING_TYPE_COMPUTE: 2342 timeout = adev->compute_timeout; 2343 break; 2344 case AMDGPU_RING_TYPE_SDMA: 2345 timeout = adev->sdma_timeout; 2346 break; 2347 default: 2348 timeout = adev->video_timeout; 2349 break; 2350 } 2351 2352 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2353 ring->num_hw_submission, amdgpu_job_hang_limit, 2354 timeout, adev->reset_domain->wq, 2355 ring->sched_score, ring->name, 2356 adev->dev); 2357 if (r) { 2358 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2359 ring->name); 2360 return r; 2361 } 2362 } 2363 2364 return 0; 2365 } 2366 2367 2368 /** 2369 * amdgpu_device_ip_init - run init for hardware IPs 2370 * 2371 * @adev: amdgpu_device pointer 2372 * 2373 * Main initialization pass for hardware IPs. The list of all the hardware 2374 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2375 * are run. sw_init initializes the software state associated with each IP 2376 * and hw_init initializes the hardware associated with each IP. 2377 * Returns 0 on success, negative error code on failure. 
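 *
 * As a rough sketch of the body below (not additional behaviour): sw_init
 * runs for every valid block, with COMMON and GMC also getting an early
 * hw_init (plus scratch/writeback/CSA setup for GMC); then the IB pool and
 * ucode BO are created, hw_init phase 1 (COMMON, IH, and PSP under SR-IOV)
 * runs, firmware is loaded, hw_init phase 2 covers the remaining blocks,
 * and finally RAS recovery, the ring schedulers and KFD are initialized.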
2378 */ 2379 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2380 { 2381 int i, r; 2382 2383 r = amdgpu_ras_init(adev); 2384 if (r) 2385 return r; 2386 2387 for (i = 0; i < adev->num_ip_blocks; i++) { 2388 if (!adev->ip_blocks[i].status.valid) 2389 continue; 2390 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2391 if (r) { 2392 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2393 adev->ip_blocks[i].version->funcs->name, r); 2394 goto init_failed; 2395 } 2396 adev->ip_blocks[i].status.sw = true; 2397 2398 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2399 /* need to do common hw init early so everything is set up for gmc */ 2400 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2401 if (r) { 2402 DRM_ERROR("hw_init %d failed %d\n", i, r); 2403 goto init_failed; 2404 } 2405 adev->ip_blocks[i].status.hw = true; 2406 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2407 /* need to do gmc hw init early so we can allocate gpu mem */ 2408 /* Try to reserve bad pages early */ 2409 if (amdgpu_sriov_vf(adev)) 2410 amdgpu_virt_exchange_data(adev); 2411 2412 r = amdgpu_device_mem_scratch_init(adev); 2413 if (r) { 2414 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2415 goto init_failed; 2416 } 2417 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2418 if (r) { 2419 DRM_ERROR("hw_init %d failed %d\n", i, r); 2420 goto init_failed; 2421 } 2422 r = amdgpu_device_wb_init(adev); 2423 if (r) { 2424 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2425 goto init_failed; 2426 } 2427 adev->ip_blocks[i].status.hw = true; 2428 2429 /* right after GMC hw init, we create CSA */ 2430 if (amdgpu_mcbp) { 2431 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2432 AMDGPU_GEM_DOMAIN_VRAM | 2433 AMDGPU_GEM_DOMAIN_GTT, 2434 AMDGPU_CSA_SIZE); 2435 if (r) { 2436 DRM_ERROR("allocate CSA failed %d\n", r); 2437 goto init_failed; 2438 } 2439 } 2440 } 2441 } 2442 2443 if (amdgpu_sriov_vf(adev)) 2444 amdgpu_virt_init_data_exchange(adev); 2445 2446 r = amdgpu_ib_pool_init(adev); 2447 if (r) { 2448 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2449 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2450 goto init_failed; 2451 } 2452 2453 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2454 if (r) 2455 goto init_failed; 2456 2457 r = amdgpu_device_ip_hw_init_phase1(adev); 2458 if (r) 2459 goto init_failed; 2460 2461 r = amdgpu_device_fw_loading(adev); 2462 if (r) 2463 goto init_failed; 2464 2465 r = amdgpu_device_ip_hw_init_phase2(adev); 2466 if (r) 2467 goto init_failed; 2468 2469 /* 2470 * retired pages will be loaded from eeprom and reserved here, 2471 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2472 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2473 * for I2C communication which only true at this point. 2474 * 2475 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2476 * failure from bad gpu situation and stop amdgpu init process 2477 * accordingly. For other failed cases, it will still release all 2478 * the resource and print error message, rather than returning one 2479 * negative value to upper level. 
2480 * 2481 * Note: theoretically, this should be called before all vram allocations 2482 * to protect retired page from abusing 2483 */ 2484 r = amdgpu_ras_recovery_init(adev); 2485 if (r) 2486 goto init_failed; 2487 2488 /** 2489 * In case of XGMI grab extra reference for reset domain for this device 2490 */ 2491 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2492 if (amdgpu_xgmi_add_device(adev) == 0) { 2493 if (!amdgpu_sriov_vf(adev)) { 2494 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2495 2496 if (WARN_ON(!hive)) { 2497 r = -ENOENT; 2498 goto init_failed; 2499 } 2500 2501 if (!hive->reset_domain || 2502 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2503 r = -ENOENT; 2504 amdgpu_put_xgmi_hive(hive); 2505 goto init_failed; 2506 } 2507 2508 /* Drop the early temporary reset domain we created for device */ 2509 amdgpu_reset_put_reset_domain(adev->reset_domain); 2510 adev->reset_domain = hive->reset_domain; 2511 amdgpu_put_xgmi_hive(hive); 2512 } 2513 } 2514 } 2515 2516 r = amdgpu_device_init_schedulers(adev); 2517 if (r) 2518 goto init_failed; 2519 2520 /* Don't init kfd if whole hive need to be reset during init */ 2521 if (!adev->gmc.xgmi.pending_reset) 2522 amdgpu_amdkfd_device_init(adev); 2523 2524 amdgpu_fru_get_product_info(adev); 2525 2526 init_failed: 2527 if (amdgpu_sriov_vf(adev)) 2528 amdgpu_virt_release_full_gpu(adev, true); 2529 2530 return r; 2531 } 2532 2533 /** 2534 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2535 * 2536 * @adev: amdgpu_device pointer 2537 * 2538 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2539 * this function before a GPU reset. If the value is retained after a 2540 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2541 */ 2542 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2543 { 2544 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2545 } 2546 2547 /** 2548 * amdgpu_device_check_vram_lost - check if vram is valid 2549 * 2550 * @adev: amdgpu_device pointer 2551 * 2552 * Checks the reset magic value written to the gart pointer in VRAM. 2553 * The driver calls this after a GPU reset to see if the contents of 2554 * VRAM is lost or now. 2555 * returns true if vram is lost, false if not. 2556 */ 2557 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2558 { 2559 if (memcmp(adev->gart.ptr, adev->reset_magic, 2560 AMDGPU_RESET_MAGIC_NUM)) 2561 return true; 2562 2563 if (!amdgpu_in_reset(adev)) 2564 return false; 2565 2566 /* 2567 * For all ASICs with baco/mode1 reset, the VRAM is 2568 * always assumed to be lost. 2569 */ 2570 switch (amdgpu_asic_reset_method(adev)) { 2571 case AMD_RESET_METHOD_BACO: 2572 case AMD_RESET_METHOD_MODE1: 2573 return true; 2574 default: 2575 return false; 2576 } 2577 } 2578 2579 /** 2580 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2581 * 2582 * @adev: amdgpu_device pointer 2583 * @state: clockgating state (gate or ungate) 2584 * 2585 * The list of all the hardware IPs that make up the asic is walked and the 2586 * set_clockgating_state callbacks are run. 2587 * Late initialization pass enabling clockgating for hardware IPs. 2588 * Fini or suspend, pass disabling clockgating for hardware IPs. 2589 * Returns 0 on success, negative error code on failure. 
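 *
 * Illustrative usage, mirroring the callers later in this file (not a new
 * code path): late init calls
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 * while early fini and suspend reverse this with the UNGATE states.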
2590 */ 2591 2592 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2593 enum amd_clockgating_state state) 2594 { 2595 int i, j, r; 2596 2597 if (amdgpu_emu_mode == 1) 2598 return 0; 2599 2600 for (j = 0; j < adev->num_ip_blocks; j++) { 2601 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2602 if (!adev->ip_blocks[i].status.late_initialized) 2603 continue; 2604 /* skip CG for GFX, SDMA on S0ix */ 2605 if (adev->in_s0ix && 2606 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2607 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2608 continue; 2609 /* skip CG for VCE/UVD, it's handled specially */ 2610 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2611 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2612 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2613 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2614 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2615 /* enable clockgating to save power */ 2616 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2617 state); 2618 if (r) { 2619 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2620 adev->ip_blocks[i].version->funcs->name, r); 2621 return r; 2622 } 2623 } 2624 } 2625 2626 return 0; 2627 } 2628 2629 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2630 enum amd_powergating_state state) 2631 { 2632 int i, j, r; 2633 2634 if (amdgpu_emu_mode == 1) 2635 return 0; 2636 2637 for (j = 0; j < adev->num_ip_blocks; j++) { 2638 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2639 if (!adev->ip_blocks[i].status.late_initialized) 2640 continue; 2641 /* skip PG for GFX, SDMA on S0ix */ 2642 if (adev->in_s0ix && 2643 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2644 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2645 continue; 2646 /* skip CG for VCE/UVD, it's handled specially */ 2647 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2648 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2649 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2650 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2651 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2652 /* enable powergating to save power */ 2653 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2654 state); 2655 if (r) { 2656 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2657 adev->ip_blocks[i].version->funcs->name, r); 2658 return r; 2659 } 2660 } 2661 } 2662 return 0; 2663 } 2664 2665 static int amdgpu_device_enable_mgpu_fan_boost(void) 2666 { 2667 struct amdgpu_gpu_instance *gpu_ins; 2668 struct amdgpu_device *adev; 2669 int i, ret = 0; 2670 2671 mutex_lock(&mgpu_info.mutex); 2672 2673 /* 2674 * MGPU fan boost feature should be enabled 2675 * only when there are two or more dGPUs in 2676 * the system 2677 */ 2678 if (mgpu_info.num_dgpu < 2) 2679 goto out; 2680 2681 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2682 gpu_ins = &(mgpu_info.gpu_ins[i]); 2683 adev = gpu_ins->adev; 2684 if (!(adev->flags & AMD_IS_APU) && 2685 !gpu_ins->mgpu_fan_enabled) { 2686 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2687 if (ret) 2688 break; 2689 2690 gpu_ins->mgpu_fan_enabled = 1; 2691 } 2692 } 2693 2694 out: 2695 mutex_unlock(&mgpu_info.mutex); 2696 2697 return ret; 2698 } 2699 2700 /** 2701 * amdgpu_device_ip_late_init - run late init for hardware IPs 2702 * 2703 * @adev: 
amdgpu_device pointer
2704 *
2705 * Late initialization pass for hardware IPs. The list of all the hardware
2706 * IPs that make up the asic is walked and the late_init callbacks are run.
2707 * late_init covers any special initialization that an IP requires
2708 * after all of them have been initialized or something that needs to happen
2709 * late in the init process.
2710 * Returns 0 on success, negative error code on failure.
2711 */
2712 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2713 {
2714 struct amdgpu_gpu_instance *gpu_instance;
2715 int i = 0, r;
2716
2717 for (i = 0; i < adev->num_ip_blocks; i++) {
2718 if (!adev->ip_blocks[i].status.hw)
2719 continue;
2720 if (adev->ip_blocks[i].version->funcs->late_init) {
2721 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2722 if (r) {
2723 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2724 adev->ip_blocks[i].version->funcs->name, r);
2725 return r;
2726 }
2727 }
2728 adev->ip_blocks[i].status.late_initialized = true;
2729 }
2730
2731 r = amdgpu_ras_late_init(adev);
2732 if (r) {
2733 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2734 return r;
2735 }
2736
2737 amdgpu_ras_set_error_query_ready(adev, true);
2738
2739 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2740 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2741
2742 amdgpu_device_fill_reset_magic(adev);
2743
2744 r = amdgpu_device_enable_mgpu_fan_boost();
2745 if (r)
2746 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2747
2748 /* For passthrough configurations on arcturus and aldebaran, enable special handling of SBR */
2749 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2750 adev->asic_type == CHIP_ALDEBARAN))
2751 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2752
2753 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2754 mutex_lock(&mgpu_info.mutex);
2755
2756 /*
2757 * Reset the device p-state to low, as this was booted with high.
2758 *
2759 * This should be performed only after all devices from the same
2760 * hive have been initialized.
2761 *
2762 * However, the number of devices in the hive is not known in
2763 * advance; it is counted one by one as the devices initialize.
2764 *
2765 * So we wait until all XGMI interlinked devices have initialized.
2766 * This may bring some delay, as those devices may come from
2767 * different hives. But that should be OK.
2768 */ 2769 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2770 for (i = 0; i < mgpu_info.num_gpu; i++) { 2771 gpu_instance = &(mgpu_info.gpu_ins[i]); 2772 if (gpu_instance->adev->flags & AMD_IS_APU) 2773 continue; 2774 2775 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2776 AMDGPU_XGMI_PSTATE_MIN); 2777 if (r) { 2778 DRM_ERROR("pstate setting failed (%d).\n", r); 2779 break; 2780 } 2781 } 2782 } 2783 2784 mutex_unlock(&mgpu_info.mutex); 2785 } 2786 2787 return 0; 2788 } 2789 2790 /** 2791 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2792 * 2793 * @adev: amdgpu_device pointer 2794 * 2795 * For ASICs need to disable SMC first 2796 */ 2797 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2798 { 2799 int i, r; 2800 2801 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2802 return; 2803 2804 for (i = 0; i < adev->num_ip_blocks; i++) { 2805 if (!adev->ip_blocks[i].status.hw) 2806 continue; 2807 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2808 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2809 /* XXX handle errors */ 2810 if (r) { 2811 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2812 adev->ip_blocks[i].version->funcs->name, r); 2813 } 2814 adev->ip_blocks[i].status.hw = false; 2815 break; 2816 } 2817 } 2818 } 2819 2820 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2821 { 2822 int i, r; 2823 2824 for (i = 0; i < adev->num_ip_blocks; i++) { 2825 if (!adev->ip_blocks[i].version->funcs->early_fini) 2826 continue; 2827 2828 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2829 if (r) { 2830 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2831 adev->ip_blocks[i].version->funcs->name, r); 2832 } 2833 } 2834 2835 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2836 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2837 2838 amdgpu_amdkfd_suspend(adev, false); 2839 2840 /* Workaroud for ASICs need to disable SMC first */ 2841 amdgpu_device_smu_fini_early(adev); 2842 2843 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2844 if (!adev->ip_blocks[i].status.hw) 2845 continue; 2846 2847 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2848 /* XXX handle errors */ 2849 if (r) { 2850 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2851 adev->ip_blocks[i].version->funcs->name, r); 2852 } 2853 2854 adev->ip_blocks[i].status.hw = false; 2855 } 2856 2857 if (amdgpu_sriov_vf(adev)) { 2858 if (amdgpu_virt_release_full_gpu(adev, false)) 2859 DRM_ERROR("failed to release exclusive mode on fini\n"); 2860 } 2861 2862 return 0; 2863 } 2864 2865 /** 2866 * amdgpu_device_ip_fini - run fini for hardware IPs 2867 * 2868 * @adev: amdgpu_device pointer 2869 * 2870 * Main teardown pass for hardware IPs. The list of all the hardware 2871 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2872 * are run. hw_fini tears down the hardware associated with each IP 2873 * and sw_fini tears down any software state associated with each IP. 2874 * Returns 0 on success, negative error code on failure. 
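 *
 * Note that teardown walks the IP blocks in reverse order: sw_fini runs
 * from the last block to the first, followed by a reverse-order late_fini
 * pass, before the RAS state is finally torn down.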
2875 */ 2876 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2877 { 2878 int i, r; 2879 2880 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2881 amdgpu_virt_release_ras_err_handler_data(adev); 2882 2883 if (adev->gmc.xgmi.num_physical_nodes > 1) 2884 amdgpu_xgmi_remove_device(adev); 2885 2886 amdgpu_amdkfd_device_fini_sw(adev); 2887 2888 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2889 if (!adev->ip_blocks[i].status.sw) 2890 continue; 2891 2892 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2893 amdgpu_ucode_free_bo(adev); 2894 amdgpu_free_static_csa(&adev->virt.csa_obj); 2895 amdgpu_device_wb_fini(adev); 2896 amdgpu_device_mem_scratch_fini(adev); 2897 amdgpu_ib_pool_fini(adev); 2898 } 2899 2900 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2901 /* XXX handle errors */ 2902 if (r) { 2903 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2904 adev->ip_blocks[i].version->funcs->name, r); 2905 } 2906 adev->ip_blocks[i].status.sw = false; 2907 adev->ip_blocks[i].status.valid = false; 2908 } 2909 2910 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2911 if (!adev->ip_blocks[i].status.late_initialized) 2912 continue; 2913 if (adev->ip_blocks[i].version->funcs->late_fini) 2914 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2915 adev->ip_blocks[i].status.late_initialized = false; 2916 } 2917 2918 amdgpu_ras_fini(adev); 2919 2920 return 0; 2921 } 2922 2923 /** 2924 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2925 * 2926 * @work: work_struct. 2927 */ 2928 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2929 { 2930 struct amdgpu_device *adev = 2931 container_of(work, struct amdgpu_device, delayed_init_work.work); 2932 int r; 2933 2934 r = amdgpu_ib_ring_tests(adev); 2935 if (r) 2936 DRM_ERROR("ib ring test failed (%d).\n", r); 2937 } 2938 2939 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2940 { 2941 struct amdgpu_device *adev = 2942 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2943 2944 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2945 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2946 2947 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2948 adev->gfx.gfx_off_state = true; 2949 } 2950 2951 /** 2952 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2953 * 2954 * @adev: amdgpu_device pointer 2955 * 2956 * Main suspend function for hardware IPs. The list of all the hardware 2957 * IPs that make up the asic is walked, clockgating is disabled and the 2958 * suspend callbacks are run. suspend puts the hardware and software state 2959 * in each IP into a state suitable for suspend. 2960 * Returns 0 on success, negative error code on failure. 2961 */ 2962 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2963 { 2964 int i, r; 2965 2966 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2967 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2968 2969 /* 2970 * Per PMFW team's suggestion, driver needs to handle gfxoff 2971 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2972 * scenario. Add the missing df cstate disablement here. 
2973 */ 2974 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2975 dev_warn(adev->dev, "Failed to disallow df cstate"); 2976 2977 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2978 if (!adev->ip_blocks[i].status.valid) 2979 continue; 2980 2981 /* displays are handled separately */ 2982 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2983 continue; 2984 2985 /* XXX handle errors */ 2986 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2987 /* XXX handle errors */ 2988 if (r) { 2989 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2990 adev->ip_blocks[i].version->funcs->name, r); 2991 return r; 2992 } 2993 2994 adev->ip_blocks[i].status.hw = false; 2995 } 2996 2997 return 0; 2998 } 2999 3000 /** 3001 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3002 * 3003 * @adev: amdgpu_device pointer 3004 * 3005 * Main suspend function for hardware IPs. The list of all the hardware 3006 * IPs that make up the asic is walked, clockgating is disabled and the 3007 * suspend callbacks are run. suspend puts the hardware and software state 3008 * in each IP into a state suitable for suspend. 3009 * Returns 0 on success, negative error code on failure. 3010 */ 3011 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3012 { 3013 int i, r; 3014 3015 if (adev->in_s0ix) 3016 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3017 3018 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3019 if (!adev->ip_blocks[i].status.valid) 3020 continue; 3021 /* displays are handled in phase1 */ 3022 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3023 continue; 3024 /* PSP lost connection when err_event_athub occurs */ 3025 if (amdgpu_ras_intr_triggered() && 3026 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3027 adev->ip_blocks[i].status.hw = false; 3028 continue; 3029 } 3030 3031 /* skip unnecessary suspend if we do not initialize them yet */ 3032 if (adev->gmc.xgmi.pending_reset && 3033 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3034 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3035 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3036 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3037 adev->ip_blocks[i].status.hw = false; 3038 continue; 3039 } 3040 3041 /* skip suspend of gfx/mes and psp for S0ix 3042 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3043 * like at runtime. PSP is also part of the always on hardware 3044 * so no need to suspend it. 3045 */ 3046 if (adev->in_s0ix && 3047 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3048 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3049 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3050 continue; 3051 3052 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3053 if (adev->in_s0ix && 3054 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3055 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3056 continue; 3057 3058 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3059 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3060 * from this location and RLC Autoload automatically also gets loaded 3061 * from here based on PMFW -> PSP message during re-init sequence. 3062 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3063 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3064 */ 3065 if (amdgpu_in_reset(adev) && 3066 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3067 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3068 continue; 3069 3070 /* XXX handle errors */ 3071 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3072 /* XXX handle errors */ 3073 if (r) { 3074 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3075 adev->ip_blocks[i].version->funcs->name, r); 3076 } 3077 adev->ip_blocks[i].status.hw = false; 3078 /* handle putting the SMC in the appropriate state */ 3079 if(!amdgpu_sriov_vf(adev)){ 3080 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3081 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3082 if (r) { 3083 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3084 adev->mp1_state, r); 3085 return r; 3086 } 3087 } 3088 } 3089 } 3090 3091 return 0; 3092 } 3093 3094 /** 3095 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3096 * 3097 * @adev: amdgpu_device pointer 3098 * 3099 * Main suspend function for hardware IPs. The list of all the hardware 3100 * IPs that make up the asic is walked, clockgating is disabled and the 3101 * suspend callbacks are run. suspend puts the hardware and software state 3102 * in each IP into a state suitable for suspend. 3103 * Returns 0 on success, negative error code on failure. 3104 */ 3105 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3106 { 3107 int r; 3108 3109 if (amdgpu_sriov_vf(adev)) { 3110 amdgpu_virt_fini_data_exchange(adev); 3111 amdgpu_virt_request_full_gpu(adev, false); 3112 } 3113 3114 r = amdgpu_device_ip_suspend_phase1(adev); 3115 if (r) 3116 return r; 3117 r = amdgpu_device_ip_suspend_phase2(adev); 3118 3119 if (amdgpu_sriov_vf(adev)) 3120 amdgpu_virt_release_full_gpu(adev, false); 3121 3122 return r; 3123 } 3124 3125 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3126 { 3127 int i, r; 3128 3129 static enum amd_ip_block_type ip_order[] = { 3130 AMD_IP_BLOCK_TYPE_COMMON, 3131 AMD_IP_BLOCK_TYPE_GMC, 3132 AMD_IP_BLOCK_TYPE_PSP, 3133 AMD_IP_BLOCK_TYPE_IH, 3134 }; 3135 3136 for (i = 0; i < adev->num_ip_blocks; i++) { 3137 int j; 3138 struct amdgpu_ip_block *block; 3139 3140 block = &adev->ip_blocks[i]; 3141 block->status.hw = false; 3142 3143 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3144 3145 if (block->version->type != ip_order[j] || 3146 !block->status.valid) 3147 continue; 3148 3149 r = block->version->funcs->hw_init(adev); 3150 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3151 if (r) 3152 return r; 3153 block->status.hw = true; 3154 } 3155 } 3156 3157 return 0; 3158 } 3159 3160 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3161 { 3162 int i, r; 3163 3164 static enum amd_ip_block_type ip_order[] = { 3165 AMD_IP_BLOCK_TYPE_SMC, 3166 AMD_IP_BLOCK_TYPE_DCE, 3167 AMD_IP_BLOCK_TYPE_GFX, 3168 AMD_IP_BLOCK_TYPE_SDMA, 3169 AMD_IP_BLOCK_TYPE_UVD, 3170 AMD_IP_BLOCK_TYPE_VCE, 3171 AMD_IP_BLOCK_TYPE_VCN 3172 }; 3173 3174 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3175 int j; 3176 struct amdgpu_ip_block *block; 3177 3178 for (j = 0; j < adev->num_ip_blocks; j++) { 3179 block = &adev->ip_blocks[j]; 3180 3181 if (block->version->type != ip_order[i] || 3182 !block->status.valid || 3183 block->status.hw) 3184 continue; 3185 3186 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3187 r = block->version->funcs->resume(adev); 3188 else 3189 r = block->version->funcs->hw_init(adev); 3190 3191 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r ? "failed" : "succeeded");
3192 if (r)
3193 return r;
3194 block->status.hw = true;
3195 }
3196 }
3197
3198 return 0;
3199 }
3200
3201 /**
3202 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3203 *
3204 * @adev: amdgpu_device pointer
3205 *
3206 * First resume function for hardware IPs. The list of all the hardware
3207 * IPs that make up the asic is walked and the resume callbacks are run for
3208 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3209 * after a suspend and updates the software state as necessary. This
3210 * function is also used for restoring the GPU after a GPU reset.
3211 * Returns 0 on success, negative error code on failure.
3212 */
3213 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3214 {
3215 int i, r;
3216
3217 for (i = 0; i < adev->num_ip_blocks; i++) {
3218 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3219 continue;
3220 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3221 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3222 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3223 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3224
3225 r = adev->ip_blocks[i].version->funcs->resume(adev);
3226 if (r) {
3227 DRM_ERROR("resume of IP block <%s> failed %d\n",
3228 adev->ip_blocks[i].version->funcs->name, r);
3229 return r;
3230 }
3231 adev->ip_blocks[i].status.hw = true;
3232 }
3233 }
3234
3235 return 0;
3236 }
3237
3238 /**
3239 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3240 *
3241 * @adev: amdgpu_device pointer
3242 *
3243 * Second resume function for hardware IPs. The list of all the hardware
3244 * IPs that make up the asic is walked and the resume callbacks are run for
3245 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3246 * functional state after a suspend and updates the software state as
3247 * necessary. This function is also used for restoring the GPU after a GPU
3248 * reset.
3249 * Returns 0 on success, negative error code on failure.
3250 */
3251 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3252 {
3253 int i, r;
3254
3255 for (i = 0; i < adev->num_ip_blocks; i++) {
3256 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3257 continue;
3258 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3259 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3260 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3261 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3262 continue;
3263 r = adev->ip_blocks[i].version->funcs->resume(adev);
3264 if (r) {
3265 DRM_ERROR("resume of IP block <%s> failed %d\n",
3266 adev->ip_blocks[i].version->funcs->name, r);
3267 return r;
3268 }
3269 adev->ip_blocks[i].status.hw = true;
3270 }
3271
3272 return 0;
3273 }
3274
3275 /**
3276 * amdgpu_device_ip_resume - run resume for hardware IPs
3277 *
3278 * @adev: amdgpu_device pointer
3279 *
3280 * Main resume function for hardware IPs. The hardware IPs
3281 * are split into two resume functions because they are
3282 * also used in recovering from a GPU reset and some additional
3283 * steps need to be taken between them. In this case (S3/S4) they are
3284 * run sequentially.
3285 * Returns 0 on success, negative error code on failure.
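 *
 * Concretely, the body below resumes the KFD IOMMU, runs resume phase 1
 * (COMMON, GMC, IH and, under SR-IOV, PSP), reloads firmware via
 * amdgpu_device_fw_loading(), and then runs resume phase 2 for the
 * remaining blocks.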
3286 */ 3287 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3288 { 3289 int r; 3290 3291 r = amdgpu_amdkfd_resume_iommu(adev); 3292 if (r) 3293 return r; 3294 3295 r = amdgpu_device_ip_resume_phase1(adev); 3296 if (r) 3297 return r; 3298 3299 r = amdgpu_device_fw_loading(adev); 3300 if (r) 3301 return r; 3302 3303 r = amdgpu_device_ip_resume_phase2(adev); 3304 3305 return r; 3306 } 3307 3308 /** 3309 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3310 * 3311 * @adev: amdgpu_device pointer 3312 * 3313 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3314 */ 3315 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3316 { 3317 if (amdgpu_sriov_vf(adev)) { 3318 if (adev->is_atom_fw) { 3319 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3320 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3321 } else { 3322 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3323 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3324 } 3325 3326 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3327 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3328 } 3329 } 3330 3331 /** 3332 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3333 * 3334 * @asic_type: AMD asic type 3335 * 3336 * Check if there is DC (new modesetting infrastructre) support for an asic. 3337 * returns true if DC has support, false if not. 3338 */ 3339 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3340 { 3341 switch (asic_type) { 3342 #ifdef CONFIG_DRM_AMDGPU_SI 3343 case CHIP_HAINAN: 3344 #endif 3345 case CHIP_TOPAZ: 3346 /* chips with no display hardware */ 3347 return false; 3348 #if defined(CONFIG_DRM_AMD_DC) 3349 case CHIP_TAHITI: 3350 case CHIP_PITCAIRN: 3351 case CHIP_VERDE: 3352 case CHIP_OLAND: 3353 /* 3354 * We have systems in the wild with these ASICs that require 3355 * LVDS and VGA support which is not supported with DC. 3356 * 3357 * Fallback to the non-DC driver here by default so as not to 3358 * cause regressions. 3359 */ 3360 #if defined(CONFIG_DRM_AMD_DC_SI) 3361 return amdgpu_dc > 0; 3362 #else 3363 return false; 3364 #endif 3365 case CHIP_BONAIRE: 3366 case CHIP_KAVERI: 3367 case CHIP_KABINI: 3368 case CHIP_MULLINS: 3369 /* 3370 * We have systems in the wild with these ASICs that require 3371 * VGA support which is not supported with DC. 3372 * 3373 * Fallback to the non-DC driver here by default so as not to 3374 * cause regressions. 
3375 */ 3376 return amdgpu_dc > 0; 3377 default: 3378 return amdgpu_dc != 0; 3379 #else 3380 default: 3381 if (amdgpu_dc > 0) 3382 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3383 "but isn't supported by ASIC, ignoring\n"); 3384 return false; 3385 #endif 3386 } 3387 } 3388 3389 /** 3390 * amdgpu_device_has_dc_support - check if dc is supported 3391 * 3392 * @adev: amdgpu_device pointer 3393 * 3394 * Returns true for supported, false for not supported 3395 */ 3396 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3397 { 3398 if (adev->enable_virtual_display || 3399 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3400 return false; 3401 3402 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3403 } 3404 3405 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3406 { 3407 struct amdgpu_device *adev = 3408 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3409 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3410 3411 /* It's a bug to not have a hive within this function */ 3412 if (WARN_ON(!hive)) 3413 return; 3414 3415 /* 3416 * Use task barrier to synchronize all xgmi reset works across the 3417 * hive. task_barrier_enter and task_barrier_exit will block 3418 * until all the threads running the xgmi reset works reach 3419 * those points. task_barrier_full will do both blocks. 3420 */ 3421 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3422 3423 task_barrier_enter(&hive->tb); 3424 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3425 3426 if (adev->asic_reset_res) 3427 goto fail; 3428 3429 task_barrier_exit(&hive->tb); 3430 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3431 3432 if (adev->asic_reset_res) 3433 goto fail; 3434 3435 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3436 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3437 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3438 } else { 3439 3440 task_barrier_full(&hive->tb); 3441 adev->asic_reset_res = amdgpu_asic_reset(adev); 3442 } 3443 3444 fail: 3445 if (adev->asic_reset_res) 3446 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3447 adev->asic_reset_res, adev_to_drm(adev)->unique); 3448 amdgpu_put_xgmi_hive(hive); 3449 } 3450 3451 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3452 { 3453 char *input = amdgpu_lockup_timeout; 3454 char *timeout_setting = NULL; 3455 int index = 0; 3456 long timeout; 3457 int ret = 0; 3458 3459 /* 3460 * By default timeout for non compute jobs is 10000 3461 * and 60000 for compute jobs. 3462 * In SR-IOV or passthrough mode, timeout for compute 3463 * jobs are 60000 by default. 3464 */ 3465 adev->gfx_timeout = msecs_to_jiffies(10000); 3466 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3467 if (amdgpu_sriov_vf(adev)) 3468 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3469 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3470 else 3471 adev->compute_timeout = msecs_to_jiffies(60000); 3472 3473 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3474 while ((timeout_setting = strsep(&input, ",")) && 3475 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3476 ret = kstrtol(timeout_setting, 0, &timeout); 3477 if (ret) 3478 return ret; 3479 3480 if (timeout == 0) { 3481 index++; 3482 continue; 3483 } else if (timeout < 0) { 3484 timeout = MAX_SCHEDULE_TIMEOUT; 3485 dev_warn(adev->dev, "lockup timeout disabled"); 3486 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3487 } else { 3488 timeout = msecs_to_jiffies(timeout); 3489 } 3490 3491 switch (index++) { 3492 case 0: 3493 adev->gfx_timeout = timeout; 3494 break; 3495 case 1: 3496 adev->compute_timeout = timeout; 3497 break; 3498 case 2: 3499 adev->sdma_timeout = timeout; 3500 break; 3501 case 3: 3502 adev->video_timeout = timeout; 3503 break; 3504 default: 3505 break; 3506 } 3507 } 3508 /* 3509 * There is only one value specified and 3510 * it should apply to all non-compute jobs. 3511 */ 3512 if (index == 1) { 3513 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3514 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3515 adev->compute_timeout = adev->gfx_timeout; 3516 } 3517 } 3518 3519 return ret; 3520 } 3521 3522 /** 3523 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3524 * 3525 * @adev: amdgpu_device pointer 3526 * 3527 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3528 */ 3529 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3530 { 3531 struct iommu_domain *domain; 3532 3533 domain = iommu_get_domain_for_dev(adev->dev); 3534 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3535 adev->ram_is_direct_mapped = true; 3536 } 3537 3538 static const struct attribute *amdgpu_dev_attributes[] = { 3539 &dev_attr_product_name.attr, 3540 &dev_attr_product_number.attr, 3541 &dev_attr_serial_number.attr, 3542 &dev_attr_pcie_replay_count.attr, 3543 NULL 3544 }; 3545 3546 /** 3547 * amdgpu_device_init - initialize the driver 3548 * 3549 * @adev: amdgpu_device pointer 3550 * @flags: driver flags 3551 * 3552 * Initializes the driver info and hw (all asics). 3553 * Returns 0 for success or an error on failure. 3554 * Called at driver startup. 
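 *
 * At a high level, the body below sets up locks and work items, maps the
 * MMIO registers, creates the initial reset domain, runs early IP init,
 * posts the card and initializes clocks if necessary, brings up the fence
 * driver and all IP blocks, and finally registers the sysfs, PMU and
 * vga_switcheroo interfaces and schedules the delayed init work.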
3555 */ 3556 int amdgpu_device_init(struct amdgpu_device *adev, 3557 uint32_t flags) 3558 { 3559 struct drm_device *ddev = adev_to_drm(adev); 3560 struct pci_dev *pdev = adev->pdev; 3561 int r, i; 3562 bool px = false; 3563 u32 max_MBps; 3564 3565 adev->shutdown = false; 3566 adev->flags = flags; 3567 3568 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3569 adev->asic_type = amdgpu_force_asic_type; 3570 else 3571 adev->asic_type = flags & AMD_ASIC_MASK; 3572 3573 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3574 if (amdgpu_emu_mode == 1) 3575 adev->usec_timeout *= 10; 3576 adev->gmc.gart_size = 512 * 1024 * 1024; 3577 adev->accel_working = false; 3578 adev->num_rings = 0; 3579 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3580 adev->mman.buffer_funcs = NULL; 3581 adev->mman.buffer_funcs_ring = NULL; 3582 adev->vm_manager.vm_pte_funcs = NULL; 3583 adev->vm_manager.vm_pte_num_scheds = 0; 3584 adev->gmc.gmc_funcs = NULL; 3585 adev->harvest_ip_mask = 0x0; 3586 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3587 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3588 3589 adev->smc_rreg = &amdgpu_invalid_rreg; 3590 adev->smc_wreg = &amdgpu_invalid_wreg; 3591 adev->pcie_rreg = &amdgpu_invalid_rreg; 3592 adev->pcie_wreg = &amdgpu_invalid_wreg; 3593 adev->pciep_rreg = &amdgpu_invalid_rreg; 3594 adev->pciep_wreg = &amdgpu_invalid_wreg; 3595 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3596 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3597 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3598 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3599 adev->didt_rreg = &amdgpu_invalid_rreg; 3600 adev->didt_wreg = &amdgpu_invalid_wreg; 3601 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3602 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3603 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3604 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3605 3606 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3607 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3608 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3609 3610 /* mutex initialization are all done here so we 3611 * can recall function without having locking issues */ 3612 mutex_init(&adev->firmware.mutex); 3613 mutex_init(&adev->pm.mutex); 3614 mutex_init(&adev->gfx.gpu_clock_mutex); 3615 mutex_init(&adev->srbm_mutex); 3616 mutex_init(&adev->gfx.pipe_reserve_mutex); 3617 mutex_init(&adev->gfx.gfx_off_mutex); 3618 mutex_init(&adev->grbm_idx_mutex); 3619 mutex_init(&adev->mn_lock); 3620 mutex_init(&adev->virt.vf_errors.lock); 3621 hash_init(adev->mn_hash); 3622 mutex_init(&adev->psp.mutex); 3623 mutex_init(&adev->notifier_lock); 3624 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3625 mutex_init(&adev->benchmark_mutex); 3626 3627 amdgpu_device_init_apu_flags(adev); 3628 3629 r = amdgpu_device_check_arguments(adev); 3630 if (r) 3631 return r; 3632 3633 spin_lock_init(&adev->mmio_idx_lock); 3634 spin_lock_init(&adev->smc_idx_lock); 3635 spin_lock_init(&adev->pcie_idx_lock); 3636 spin_lock_init(&adev->uvd_ctx_idx_lock); 3637 spin_lock_init(&adev->didt_idx_lock); 3638 spin_lock_init(&adev->gc_cac_idx_lock); 3639 spin_lock_init(&adev->se_cac_idx_lock); 3640 spin_lock_init(&adev->audio_endpt_idx_lock); 3641 spin_lock_init(&adev->mm_stats.lock); 3642 3643 INIT_LIST_HEAD(&adev->shadow_list); 3644 mutex_init(&adev->shadow_list_lock); 3645 3646 INIT_LIST_HEAD(&adev->reset_list); 3647 3648 INIT_LIST_HEAD(&adev->ras_list); 3649 3650 
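/*
 * Deferred work set up below: delayed_init_work runs the IB ring tests
 * once bring-up has settled, gfx_off_delay_work asks the SMU to actually
 * enter GFXOFF, and xgmi_reset_work performs the per-device part of a
 * hive-wide reset.
 */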
INIT_DELAYED_WORK(&adev->delayed_init_work,
3651 amdgpu_device_delayed_init_work_handler);
3652 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3653 amdgpu_device_delay_enable_gfx_off);
3654
3655 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3656
3657 adev->gfx.gfx_off_req_count = 1;
3658 adev->gfx.gfx_off_residency = 0;
3659 adev->gfx.gfx_off_entrycount = 0;
3660 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3661
3662 atomic_set(&adev->throttling_logging_enabled, 1);
3663 /*
3664 * If throttling continues, logging will be performed every minute
3665 * to avoid log flooding. "-1" is subtracted since the thermal
3666 * throttling interrupt comes every second. Thus, the total logging
3667 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3668 * for the throttling interrupt) = 60 seconds.
3669 */
3670 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3671 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3672
3673 /* Registers mapping */
3674 /* TODO: block userspace mapping of io register */
3675 if (adev->asic_type >= CHIP_BONAIRE) {
3676 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3677 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3678 } else {
3679 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3680 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3681 }
3682
3683 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3684 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3685
3686 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3687 if (adev->rmmio == NULL) {
3688 return -ENOMEM;
3689 }
3690 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3691 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3692
3693 amdgpu_device_get_pcie_info(adev);
3694
3695 if (amdgpu_mcbp)
3696 DRM_INFO("MCBP is enabled\n");
3697
3698 /*
3699 * The reset domain needs to be present early, before any XGMI hive is
3700 * discovered and initialized, so that the reset semaphore and in-GPU-reset
3701 * flag can be used early during init and before calling RREG32.
3702 */ 3703 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3704 if (!adev->reset_domain) 3705 return -ENOMEM; 3706 3707 /* detect hw virtualization here */ 3708 amdgpu_detect_virtualization(adev); 3709 3710 r = amdgpu_device_get_job_timeout_settings(adev); 3711 if (r) { 3712 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3713 return r; 3714 } 3715 3716 /* early init functions */ 3717 r = amdgpu_device_ip_early_init(adev); 3718 if (r) 3719 return r; 3720 3721 /* Get rid of things like offb */ 3722 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 3723 if (r) 3724 return r; 3725 3726 /* Enable TMZ based on IP_VERSION */ 3727 amdgpu_gmc_tmz_set(adev); 3728 3729 amdgpu_gmc_noretry_set(adev); 3730 /* Need to get xgmi info early to decide the reset behavior*/ 3731 if (adev->gmc.xgmi.supported) { 3732 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3733 if (r) 3734 return r; 3735 } 3736 3737 /* enable PCIE atomic ops */ 3738 if (amdgpu_sriov_vf(adev)) 3739 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3740 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3741 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3742 else 3743 adev->have_atomics_support = 3744 !pci_enable_atomic_ops_to_root(adev->pdev, 3745 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3746 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3747 if (!adev->have_atomics_support) 3748 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3749 3750 /* doorbell bar mapping and doorbell index init*/ 3751 amdgpu_device_doorbell_init(adev); 3752 3753 if (amdgpu_emu_mode == 1) { 3754 /* post the asic on emulation mode */ 3755 emu_soc_asic_init(adev); 3756 goto fence_driver_init; 3757 } 3758 3759 amdgpu_reset_init(adev); 3760 3761 /* detect if we are with an SRIOV vbios */ 3762 amdgpu_device_detect_sriov_bios(adev); 3763 3764 /* check if we need to reset the asic 3765 * E.g., driver was not cleanly unloaded previously, etc. 
3766 */ 3767 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3768 if (adev->gmc.xgmi.num_physical_nodes) { 3769 dev_info(adev->dev, "Pending hive reset.\n"); 3770 adev->gmc.xgmi.pending_reset = true; 3771 /* Only need to init necessary block for SMU to handle the reset */ 3772 for (i = 0; i < adev->num_ip_blocks; i++) { 3773 if (!adev->ip_blocks[i].status.valid) 3774 continue; 3775 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3776 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3777 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3778 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3779 DRM_DEBUG("IP %s disabled for hw_init.\n", 3780 adev->ip_blocks[i].version->funcs->name); 3781 adev->ip_blocks[i].status.hw = true; 3782 } 3783 } 3784 } else { 3785 r = amdgpu_asic_reset(adev); 3786 if (r) { 3787 dev_err(adev->dev, "asic reset on init failed\n"); 3788 goto failed; 3789 } 3790 } 3791 } 3792 3793 /* Post card if necessary */ 3794 if (amdgpu_device_need_post(adev)) { 3795 if (!adev->bios) { 3796 dev_err(adev->dev, "no vBIOS found\n"); 3797 r = -EINVAL; 3798 goto failed; 3799 } 3800 DRM_INFO("GPU posting now...\n"); 3801 r = amdgpu_device_asic_init(adev); 3802 if (r) { 3803 dev_err(adev->dev, "gpu post error!\n"); 3804 goto failed; 3805 } 3806 } 3807 3808 if (adev->is_atom_fw) { 3809 /* Initialize clocks */ 3810 r = amdgpu_atomfirmware_get_clock_info(adev); 3811 if (r) { 3812 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3813 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3814 goto failed; 3815 } 3816 } else { 3817 /* Initialize clocks */ 3818 r = amdgpu_atombios_get_clock_info(adev); 3819 if (r) { 3820 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3821 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3822 goto failed; 3823 } 3824 /* init i2c buses */ 3825 if (!amdgpu_device_has_dc_support(adev)) 3826 amdgpu_atombios_i2c_init(adev); 3827 } 3828 3829 fence_driver_init: 3830 /* Fence driver */ 3831 r = amdgpu_fence_driver_sw_init(adev); 3832 if (r) { 3833 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3834 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3835 goto failed; 3836 } 3837 3838 /* init the mode config */ 3839 drm_mode_config_init(adev_to_drm(adev)); 3840 3841 r = amdgpu_device_ip_init(adev); 3842 if (r) { 3843 /* failed in exclusive mode due to timeout */ 3844 if (amdgpu_sriov_vf(adev) && 3845 !amdgpu_sriov_runtime(adev) && 3846 amdgpu_virt_mmio_blocked(adev) && 3847 !amdgpu_virt_wait_reset(adev)) { 3848 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3849 /* Don't send request since VF is inactive. */ 3850 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3851 adev->virt.ops = NULL; 3852 r = -EAGAIN; 3853 goto release_ras_con; 3854 } 3855 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3856 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3857 goto release_ras_con; 3858 } 3859 3860 amdgpu_fence_driver_hw_init(adev); 3861 3862 dev_info(adev->dev, 3863 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3864 adev->gfx.config.max_shader_engines, 3865 adev->gfx.config.max_sh_per_se, 3866 adev->gfx.config.max_cu_per_sh, 3867 adev->gfx.cu_info.number); 3868 3869 adev->accel_working = true; 3870 3871 amdgpu_vm_check_compute_bug(adev); 3872 3873 /* Initialize the buffer migration limit. 
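 * The limit is stored as a log2 so that the throttling math elsewhere in the
 * driver can use shifts rather than multiplies and divides. For example, with
 * the default of 8 MB/s chosen below, log2_max_MBps = ilog2(8) = 3, and scaling
 * a byte count by the rate is simply "<< 3" while dividing by it is ">> 3"
 * (illustrative note; the consumers of log2_max_MBps live in the CS
 * move-throttling code, not in this function).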
*/ 3874 if (amdgpu_moverate >= 0) 3875 max_MBps = amdgpu_moverate; 3876 else 3877 max_MBps = 8; /* Allow 8 MB/s. */ 3878 /* Get a log2 for easy divisions. */ 3879 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3880 3881 r = amdgpu_pm_sysfs_init(adev); 3882 if (r) 3883 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3884 3885 r = amdgpu_ucode_sysfs_init(adev); 3886 if (r) { 3887 adev->ucode_sysfs_en = false; 3888 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3889 } else 3890 adev->ucode_sysfs_en = true; 3891 3892 r = amdgpu_psp_sysfs_init(adev); 3893 if (r) { 3894 adev->psp_sysfs_en = false; 3895 if (!amdgpu_sriov_vf(adev)) 3896 DRM_ERROR("Creating psp sysfs failed\n"); 3897 } else 3898 adev->psp_sysfs_en = true; 3899 3900 /* 3901 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3902 * Otherwise the mgpu fan boost feature will be skipped due to the 3903 * gpu instance is counted less. 3904 */ 3905 amdgpu_register_gpu_instance(adev); 3906 3907 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3908 * explicit gating rather than handling it automatically. 3909 */ 3910 if (!adev->gmc.xgmi.pending_reset) { 3911 r = amdgpu_device_ip_late_init(adev); 3912 if (r) { 3913 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3914 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3915 goto release_ras_con; 3916 } 3917 /* must succeed. */ 3918 amdgpu_ras_resume(adev); 3919 queue_delayed_work(system_wq, &adev->delayed_init_work, 3920 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3921 } 3922 3923 if (amdgpu_sriov_vf(adev)) 3924 flush_delayed_work(&adev->delayed_init_work); 3925 3926 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3927 if (r) 3928 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3929 3930 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3931 r = amdgpu_pmu_init(adev); 3932 if (r) 3933 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3934 3935 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3936 if (amdgpu_device_cache_pci_state(adev->pdev)) 3937 pci_restore_state(pdev); 3938 3939 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3940 /* this will fail for cards that aren't VGA class devices, just 3941 * ignore it */ 3942 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3943 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3944 3945 px = amdgpu_device_supports_px(ddev); 3946 3947 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 3948 apple_gmux_detect(NULL, NULL))) 3949 vga_switcheroo_register_client(adev->pdev, 3950 &amdgpu_switcheroo_ops, px); 3951 3952 if (px) 3953 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3954 3955 if (adev->gmc.xgmi.pending_reset) 3956 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3957 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3958 3959 amdgpu_device_check_iommu_direct_map(adev); 3960 3961 return 0; 3962 3963 release_ras_con: 3964 amdgpu_release_ras_context(adev); 3965 3966 failed: 3967 amdgpu_vf_error_trans_all(adev); 3968 3969 return r; 3970 } 3971 3972 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3973 { 3974 3975 /* Clear all CPU mappings pointing to this device */ 3976 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3977 3978 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3979 amdgpu_device_doorbell_fini(adev); 3980 3981 iounmap(adev->rmmio); 3982 adev->rmmio = NULL; 3983 if (adev->mman.aper_base_kaddr) 3984 
iounmap(adev->mman.aper_base_kaddr); 3985 adev->mman.aper_base_kaddr = NULL; 3986 3987 /* Memory manager related */ 3988 if (!adev->gmc.xgmi.connected_to_cpu) { 3989 arch_phys_wc_del(adev->gmc.vram_mtrr); 3990 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3991 } 3992 } 3993 3994 /** 3995 * amdgpu_device_fini_hw - tear down the driver 3996 * 3997 * @adev: amdgpu_device pointer 3998 * 3999 * Tear down the driver info (all asics). 4000 * Called at driver shutdown. 4001 */ 4002 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4003 { 4004 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4005 flush_delayed_work(&adev->delayed_init_work); 4006 adev->shutdown = true; 4007 4008 /* make sure IB test finished before entering exclusive mode 4009 * to avoid preemption on IB test 4010 * */ 4011 if (amdgpu_sriov_vf(adev)) { 4012 amdgpu_virt_request_full_gpu(adev, false); 4013 amdgpu_virt_fini_data_exchange(adev); 4014 } 4015 4016 /* disable all interrupts */ 4017 amdgpu_irq_disable_all(adev); 4018 if (adev->mode_info.mode_config_initialized){ 4019 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4020 drm_helper_force_disable_all(adev_to_drm(adev)); 4021 else 4022 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4023 } 4024 amdgpu_fence_driver_hw_fini(adev); 4025 4026 if (adev->mman.initialized) 4027 drain_workqueue(adev->mman.bdev.wq); 4028 4029 if (adev->pm.sysfs_initialized) 4030 amdgpu_pm_sysfs_fini(adev); 4031 if (adev->ucode_sysfs_en) 4032 amdgpu_ucode_sysfs_fini(adev); 4033 if (adev->psp_sysfs_en) 4034 amdgpu_psp_sysfs_fini(adev); 4035 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4036 4037 /* disable ras feature must before hw fini */ 4038 amdgpu_ras_pre_fini(adev); 4039 4040 amdgpu_device_ip_fini_early(adev); 4041 4042 amdgpu_irq_fini_hw(adev); 4043 4044 if (adev->mman.initialized) 4045 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4046 4047 amdgpu_gart_dummy_page_fini(adev); 4048 4049 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4050 amdgpu_device_unmap_mmio(adev); 4051 4052 } 4053 4054 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4055 { 4056 int idx; 4057 bool px; 4058 4059 amdgpu_fence_driver_sw_fini(adev); 4060 amdgpu_device_ip_fini(adev); 4061 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4062 adev->accel_working = false; 4063 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4064 4065 amdgpu_reset_fini(adev); 4066 4067 /* free i2c buses */ 4068 if (!amdgpu_device_has_dc_support(adev)) 4069 amdgpu_i2c_fini(adev); 4070 4071 if (amdgpu_emu_mode != 1) 4072 amdgpu_atombios_fini(adev); 4073 4074 kfree(adev->bios); 4075 adev->bios = NULL; 4076 4077 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4078 4079 if (px || (!pci_is_thunderbolt_attached(adev->pdev) && 4080 apple_gmux_detect(NULL, NULL))) 4081 vga_switcheroo_unregister_client(adev->pdev); 4082 4083 if (px) 4084 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4085 4086 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4087 vga_client_unregister(adev->pdev); 4088 4089 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4090 4091 iounmap(adev->rmmio); 4092 adev->rmmio = NULL; 4093 amdgpu_device_doorbell_fini(adev); 4094 drm_dev_exit(idx); 4095 } 4096 4097 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4098 amdgpu_pmu_fini(adev); 4099 if (adev->mman.discovery_bin) 4100 amdgpu_discovery_fini(adev); 4101 4102 amdgpu_reset_put_reset_domain(adev->reset_domain); 4103 adev->reset_domain = NULL; 4104 4105 kfree(adev->pci_state); 4106 4107 } 4108 4109 /** 4110 * 
amdgpu_device_evict_resources - evict device resources 4111 * @adev: amdgpu device object 4112 * 4113 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4114 * of the vram memory type. Mainly used for evicting device resources 4115 * at suspend time. 4116 * 4117 */ 4118 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4119 { 4120 int ret; 4121 4122 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4123 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4124 return 0; 4125 4126 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4127 if (ret) 4128 DRM_WARN("evicting device resources failed\n"); 4129 return ret; 4130 } 4131 4132 /* 4133 * Suspend & resume. 4134 */ 4135 /** 4136 * amdgpu_device_suspend - initiate device suspend 4137 * 4138 * @dev: drm dev pointer 4139 * @fbcon : notify the fbdev of suspend 4140 * 4141 * Puts the hw in the suspend state (all asics). 4142 * Returns 0 for success or an error on failure. 4143 * Called at driver suspend. 4144 */ 4145 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4146 { 4147 struct amdgpu_device *adev = drm_to_adev(dev); 4148 int r = 0; 4149 4150 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4151 return 0; 4152 4153 adev->in_suspend = true; 4154 4155 /* Evict the majority of BOs before grabbing the full access */ 4156 r = amdgpu_device_evict_resources(adev); 4157 if (r) 4158 return r; 4159 4160 if (amdgpu_sriov_vf(adev)) { 4161 amdgpu_virt_fini_data_exchange(adev); 4162 r = amdgpu_virt_request_full_gpu(adev, false); 4163 if (r) 4164 return r; 4165 } 4166 4167 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4168 DRM_WARN("smart shift update failed\n"); 4169 4170 if (fbcon) 4171 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4172 4173 cancel_delayed_work_sync(&adev->delayed_init_work); 4174 4175 amdgpu_ras_suspend(adev); 4176 4177 amdgpu_device_ip_suspend_phase1(adev); 4178 4179 if (!adev->in_s0ix) 4180 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4181 4182 r = amdgpu_device_evict_resources(adev); 4183 if (r) 4184 return r; 4185 4186 amdgpu_fence_driver_hw_fini(adev); 4187 4188 amdgpu_device_ip_suspend_phase2(adev); 4189 4190 if (amdgpu_sriov_vf(adev)) 4191 amdgpu_virt_release_full_gpu(adev, false); 4192 4193 return 0; 4194 } 4195 4196 /** 4197 * amdgpu_device_resume - initiate device resume 4198 * 4199 * @dev: drm dev pointer 4200 * @fbcon : notify the fbdev of resume 4201 * 4202 * Bring the hw back to operating state (all asics). 4203 * Returns 0 for success or an error on failure. 4204 * Called at driver resume. 
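 *
 * A minimal sketch of how system PM callbacks might drive this pair of entry
 * points, loosely modeled on the handlers in amdgpu_drv.c (names below are
 * illustrative, not the real callbacks; flag handling such as in_s3 omitted):
 *
 *   static int example_pmops_suspend(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_suspend(drm_dev, true);
 *   }
 *
 *   static int example_pmops_resume(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_resume(drm_dev, true);
 *   }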
4205 */ 4206 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4207 { 4208 struct amdgpu_device *adev = drm_to_adev(dev); 4209 int r = 0; 4210 4211 if (amdgpu_sriov_vf(adev)) { 4212 r = amdgpu_virt_request_full_gpu(adev, true); 4213 if (r) 4214 return r; 4215 } 4216 4217 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4218 return 0; 4219 4220 if (adev->in_s0ix) 4221 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4222 4223 /* post card */ 4224 if (amdgpu_device_need_post(adev)) { 4225 r = amdgpu_device_asic_init(adev); 4226 if (r) 4227 dev_err(adev->dev, "amdgpu asic init failed\n"); 4228 } 4229 4230 r = amdgpu_device_ip_resume(adev); 4231 4232 if (r) { 4233 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4234 goto exit; 4235 } 4236 amdgpu_fence_driver_hw_init(adev); 4237 4238 r = amdgpu_device_ip_late_init(adev); 4239 if (r) 4240 goto exit; 4241 4242 queue_delayed_work(system_wq, &adev->delayed_init_work, 4243 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4244 4245 if (!adev->in_s0ix) { 4246 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4247 if (r) 4248 goto exit; 4249 } 4250 4251 exit: 4252 if (amdgpu_sriov_vf(adev)) { 4253 amdgpu_virt_init_data_exchange(adev); 4254 amdgpu_virt_release_full_gpu(adev, true); 4255 } 4256 4257 if (r) 4258 return r; 4259 4260 /* Make sure IB tests flushed */ 4261 flush_delayed_work(&adev->delayed_init_work); 4262 4263 if (fbcon) 4264 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4265 4266 amdgpu_ras_resume(adev); 4267 4268 if (adev->mode_info.num_crtc) { 4269 /* 4270 * Most of the connector probing functions try to acquire runtime pm 4271 * refs to ensure that the GPU is powered on when connector polling is 4272 * performed. Since we're calling this from a runtime PM callback, 4273 * trying to acquire rpm refs will cause us to deadlock. 4274 * 4275 * Since we're guaranteed to be holding the rpm lock, it's safe to 4276 * temporarily disable the rpm helpers so this doesn't deadlock us. 4277 */ 4278 #ifdef CONFIG_PM 4279 dev->dev->power.disable_depth++; 4280 #endif 4281 if (!adev->dc_enabled) 4282 drm_helper_hpd_irq_event(dev); 4283 else 4284 drm_kms_helper_hotplug_event(dev); 4285 #ifdef CONFIG_PM 4286 dev->dev->power.disable_depth--; 4287 #endif 4288 } 4289 adev->in_suspend = false; 4290 4291 if (adev->enable_mes) 4292 amdgpu_mes_self_test(adev); 4293 4294 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4295 DRM_WARN("smart shift update failed\n"); 4296 4297 return 0; 4298 } 4299 4300 /** 4301 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4302 * 4303 * @adev: amdgpu_device pointer 4304 * 4305 * The list of all the hardware IPs that make up the asic is walked and 4306 * the check_soft_reset callbacks are run. check_soft_reset determines 4307 * if the asic is still hung or not. 4308 * Returns true if any of the IPs are still in a hung state, false if not. 
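 *
 * An IP block opts in by providing a check_soft_reset callback in its
 * amd_ip_funcs. A hypothetical implementation could look roughly like the
 * sketch below (the register and field names are made up for illustration;
 * only the callback shape matters):
 *
 *   static bool myip_check_soft_reset(void *handle)
 *   {
 *           struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 *           u32 status = RREG32(mmMYIP_STATUS);
 *
 *           return REG_GET_FIELD(status, MYIP_STATUS, BUSY) != 0;
 *   }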
4309 */ 4310 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4311 { 4312 int i; 4313 bool asic_hang = false; 4314 4315 if (amdgpu_sriov_vf(adev)) 4316 return true; 4317 4318 if (amdgpu_asic_need_full_reset(adev)) 4319 return true; 4320 4321 for (i = 0; i < adev->num_ip_blocks; i++) { 4322 if (!adev->ip_blocks[i].status.valid) 4323 continue; 4324 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4325 adev->ip_blocks[i].status.hang = 4326 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4327 if (adev->ip_blocks[i].status.hang) { 4328 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4329 asic_hang = true; 4330 } 4331 } 4332 return asic_hang; 4333 } 4334 4335 /** 4336 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4337 * 4338 * @adev: amdgpu_device pointer 4339 * 4340 * The list of all the hardware IPs that make up the asic is walked and the 4341 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4342 * handles any IP specific hardware or software state changes that are 4343 * necessary for a soft reset to succeed. 4344 * Returns 0 on success, negative error code on failure. 4345 */ 4346 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4347 { 4348 int i, r = 0; 4349 4350 for (i = 0; i < adev->num_ip_blocks; i++) { 4351 if (!adev->ip_blocks[i].status.valid) 4352 continue; 4353 if (adev->ip_blocks[i].status.hang && 4354 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4355 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4356 if (r) 4357 return r; 4358 } 4359 } 4360 4361 return 0; 4362 } 4363 4364 /** 4365 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4366 * 4367 * @adev: amdgpu_device pointer 4368 * 4369 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4370 * reset is necessary to recover. 4371 * Returns true if a full asic reset is required, false if not. 4372 */ 4373 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4374 { 4375 int i; 4376 4377 if (amdgpu_asic_need_full_reset(adev)) 4378 return true; 4379 4380 for (i = 0; i < adev->num_ip_blocks; i++) { 4381 if (!adev->ip_blocks[i].status.valid) 4382 continue; 4383 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4384 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4385 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4386 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4387 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4388 if (adev->ip_blocks[i].status.hang) { 4389 dev_info(adev->dev, "Some block need full reset!\n"); 4390 return true; 4391 } 4392 } 4393 } 4394 return false; 4395 } 4396 4397 /** 4398 * amdgpu_device_ip_soft_reset - do a soft reset 4399 * 4400 * @adev: amdgpu_device pointer 4401 * 4402 * The list of all the hardware IPs that make up the asic is walked and the 4403 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4404 * IP specific hardware or software state changes that are necessary to soft 4405 * reset the IP. 4406 * Returns 0 on success, negative error code on failure. 
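 *
 * Taken together with the pre/post helpers around it, the soft-reset sequence
 * driven later from amdgpu_device_pre_asic_reset() is roughly:
 *
 *   if (amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           r = amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *           if (r || amdgpu_device_ip_check_soft_reset(adev))
 *                   need_full_reset = true;
 *   }
 *
 * i.e. a failed or ineffective soft reset falls back to a full ASIC reset.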
4407 */ 4408 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4409 { 4410 int i, r = 0; 4411 4412 for (i = 0; i < adev->num_ip_blocks; i++) { 4413 if (!adev->ip_blocks[i].status.valid) 4414 continue; 4415 if (adev->ip_blocks[i].status.hang && 4416 adev->ip_blocks[i].version->funcs->soft_reset) { 4417 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4418 if (r) 4419 return r; 4420 } 4421 } 4422 4423 return 0; 4424 } 4425 4426 /** 4427 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4428 * 4429 * @adev: amdgpu_device pointer 4430 * 4431 * The list of all the hardware IPs that make up the asic is walked and the 4432 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4433 * handles any IP specific hardware or software state changes that are 4434 * necessary after the IP has been soft reset. 4435 * Returns 0 on success, negative error code on failure. 4436 */ 4437 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4438 { 4439 int i, r = 0; 4440 4441 for (i = 0; i < adev->num_ip_blocks; i++) { 4442 if (!adev->ip_blocks[i].status.valid) 4443 continue; 4444 if (adev->ip_blocks[i].status.hang && 4445 adev->ip_blocks[i].version->funcs->post_soft_reset) 4446 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4447 if (r) 4448 return r; 4449 } 4450 4451 return 0; 4452 } 4453 4454 /** 4455 * amdgpu_device_recover_vram - Recover some VRAM contents 4456 * 4457 * @adev: amdgpu_device pointer 4458 * 4459 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4460 * restore things like GPUVM page tables after a GPU reset where 4461 * the contents of VRAM might be lost. 4462 * 4463 * Returns: 4464 * 0 on success, negative error code on failure. 4465 */ 4466 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4467 { 4468 struct dma_fence *fence = NULL, *next = NULL; 4469 struct amdgpu_bo *shadow; 4470 struct amdgpu_bo_vm *vmbo; 4471 long r = 1, tmo; 4472 4473 if (amdgpu_sriov_runtime(adev)) 4474 tmo = msecs_to_jiffies(8000); 4475 else 4476 tmo = msecs_to_jiffies(100); 4477 4478 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4479 mutex_lock(&adev->shadow_list_lock); 4480 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4481 shadow = &vmbo->bo; 4482 /* No need to recover an evicted BO */ 4483 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4484 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4485 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4486 continue; 4487 4488 r = amdgpu_bo_restore_shadow(shadow, &next); 4489 if (r) 4490 break; 4491 4492 if (fence) { 4493 tmo = dma_fence_wait_timeout(fence, false, tmo); 4494 dma_fence_put(fence); 4495 fence = next; 4496 if (tmo == 0) { 4497 r = -ETIMEDOUT; 4498 break; 4499 } else if (tmo < 0) { 4500 r = tmo; 4501 break; 4502 } 4503 } else { 4504 fence = next; 4505 } 4506 } 4507 mutex_unlock(&adev->shadow_list_lock); 4508 4509 if (fence) 4510 tmo = dma_fence_wait_timeout(fence, false, tmo); 4511 dma_fence_put(fence); 4512 4513 if (r < 0 || tmo <= 0) { 4514 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4515 return -EIO; 4516 } 4517 4518 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4519 return 0; 4520 } 4521 4522 4523 /** 4524 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4525 * 4526 * @adev: amdgpu_device pointer 4527 * @from_hypervisor: request from hypervisor 4528 * 4529 * do VF FLR and reinitialize Asic 4530 * return 0 means succeeded 
otherwise failed 4531 */ 4532 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4533 bool from_hypervisor) 4534 { 4535 int r; 4536 struct amdgpu_hive_info *hive = NULL; 4537 int retry_limit = 0; 4538 4539 retry: 4540 amdgpu_amdkfd_pre_reset(adev); 4541 4542 if (from_hypervisor) 4543 r = amdgpu_virt_request_full_gpu(adev, true); 4544 else 4545 r = amdgpu_virt_reset_gpu(adev); 4546 if (r) 4547 return r; 4548 4549 /* Resume IP prior to SMC */ 4550 r = amdgpu_device_ip_reinit_early_sriov(adev); 4551 if (r) 4552 goto error; 4553 4554 amdgpu_virt_init_data_exchange(adev); 4555 4556 r = amdgpu_device_fw_loading(adev); 4557 if (r) 4558 return r; 4559 4560 /* now we are okay to resume SMC/CP/SDMA */ 4561 r = amdgpu_device_ip_reinit_late_sriov(adev); 4562 if (r) 4563 goto error; 4564 4565 hive = amdgpu_get_xgmi_hive(adev); 4566 /* Update PSP FW topology after reset */ 4567 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4568 r = amdgpu_xgmi_update_topology(hive, adev); 4569 4570 if (hive) 4571 amdgpu_put_xgmi_hive(hive); 4572 4573 if (!r) { 4574 amdgpu_irq_gpu_reset_resume_helper(adev); 4575 r = amdgpu_ib_ring_tests(adev); 4576 4577 amdgpu_amdkfd_post_reset(adev); 4578 } 4579 4580 error: 4581 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4582 amdgpu_inc_vram_lost(adev); 4583 r = amdgpu_device_recover_vram(adev); 4584 } 4585 amdgpu_virt_release_full_gpu(adev, true); 4586 4587 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4588 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4589 retry_limit++; 4590 goto retry; 4591 } else 4592 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4593 } 4594 4595 return r; 4596 } 4597 4598 /** 4599 * amdgpu_device_has_job_running - check if there is any job in mirror list 4600 * 4601 * @adev: amdgpu_device pointer 4602 * 4603 * check if there is any job in mirror list 4604 */ 4605 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4606 { 4607 int i; 4608 struct drm_sched_job *job; 4609 4610 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4611 struct amdgpu_ring *ring = adev->rings[i]; 4612 4613 if (!ring || !ring->sched.thread) 4614 continue; 4615 4616 spin_lock(&ring->sched.job_list_lock); 4617 job = list_first_entry_or_null(&ring->sched.pending_list, 4618 struct drm_sched_job, list); 4619 spin_unlock(&ring->sched.job_list_lock); 4620 if (job) 4621 return true; 4622 } 4623 return false; 4624 } 4625 4626 /** 4627 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4628 * 4629 * @adev: amdgpu_device pointer 4630 * 4631 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4632 * a hung GPU. 
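 *
 * A hang-handling path typically gates recovery on this helper before calling
 * amdgpu_device_gpu_recover(), e.g. (illustrative only, with reset_context
 * assumed to be an already-populated struct amdgpu_reset_context):
 *
 *   if (amdgpu_device_should_recover_gpu(adev))
 *           r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 *   else
 *           dev_dbg(adev->dev, "GPU recovery disabled, skipping reset\n");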
4633 */ 4634 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4635 { 4636 4637 if (amdgpu_gpu_recovery == 0) 4638 goto disabled; 4639 4640 /* Skip soft reset check in fatal error mode */ 4641 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4642 return true; 4643 4644 if (amdgpu_sriov_vf(adev)) 4645 return true; 4646 4647 if (amdgpu_gpu_recovery == -1) { 4648 switch (adev->asic_type) { 4649 #ifdef CONFIG_DRM_AMDGPU_SI 4650 case CHIP_VERDE: 4651 case CHIP_TAHITI: 4652 case CHIP_PITCAIRN: 4653 case CHIP_OLAND: 4654 case CHIP_HAINAN: 4655 #endif 4656 #ifdef CONFIG_DRM_AMDGPU_CIK 4657 case CHIP_KAVERI: 4658 case CHIP_KABINI: 4659 case CHIP_MULLINS: 4660 #endif 4661 case CHIP_CARRIZO: 4662 case CHIP_STONEY: 4663 case CHIP_CYAN_SKILLFISH: 4664 goto disabled; 4665 default: 4666 break; 4667 } 4668 } 4669 4670 return true; 4671 4672 disabled: 4673 dev_info(adev->dev, "GPU recovery disabled.\n"); 4674 return false; 4675 } 4676 4677 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4678 { 4679 u32 i; 4680 int ret = 0; 4681 4682 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4683 4684 dev_info(adev->dev, "GPU mode1 reset\n"); 4685 4686 /* disable BM */ 4687 pci_clear_master(adev->pdev); 4688 4689 amdgpu_device_cache_pci_state(adev->pdev); 4690 4691 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4692 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4693 ret = amdgpu_dpm_mode1_reset(adev); 4694 } else { 4695 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4696 ret = psp_gpu_reset(adev); 4697 } 4698 4699 if (ret) 4700 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4701 4702 amdgpu_device_load_pci_state(adev->pdev); 4703 4704 /* wait for asic to come out of reset */ 4705 for (i = 0; i < adev->usec_timeout; i++) { 4706 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4707 4708 if (memsize != 0xffffffff) 4709 break; 4710 udelay(1); 4711 } 4712 4713 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4714 return ret; 4715 } 4716 4717 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4718 struct amdgpu_reset_context *reset_context) 4719 { 4720 int i, r = 0; 4721 struct amdgpu_job *job = NULL; 4722 bool need_full_reset = 4723 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4724 4725 if (reset_context->reset_req_dev == adev) 4726 job = reset_context->job; 4727 4728 if (amdgpu_sriov_vf(adev)) { 4729 /* stop the data exchange thread */ 4730 amdgpu_virt_fini_data_exchange(adev); 4731 } 4732 4733 amdgpu_fence_driver_isr_toggle(adev, true); 4734 4735 /* block all schedulers and reset given job's ring */ 4736 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4737 struct amdgpu_ring *ring = adev->rings[i]; 4738 4739 if (!ring || !ring->sched.thread) 4740 continue; 4741 4742 /*clear job fence from fence drv to avoid force_completion 4743 *leave NULL and vm flush fence in fence drv */ 4744 amdgpu_fence_driver_clear_job_fences(ring); 4745 4746 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4747 amdgpu_fence_driver_force_completion(ring); 4748 } 4749 4750 amdgpu_fence_driver_isr_toggle(adev, false); 4751 4752 if (job && job->vm) 4753 drm_sched_increase_karma(&job->base); 4754 4755 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4756 /* If reset handler not implemented, continue; otherwise return */ 4757 if (r == -ENOSYS) 4758 r = 0; 4759 else 4760 return r; 4761 4762 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4763 if (!amdgpu_sriov_vf(adev)) { 4764 4765 if (!need_full_reset) 4766 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4767 4768 if (!need_full_reset && amdgpu_gpu_recovery && 4769 amdgpu_device_ip_check_soft_reset(adev)) { 4770 amdgpu_device_ip_pre_soft_reset(adev); 4771 r = amdgpu_device_ip_soft_reset(adev); 4772 amdgpu_device_ip_post_soft_reset(adev); 4773 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4774 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4775 need_full_reset = true; 4776 } 4777 } 4778 4779 if (need_full_reset) 4780 r = amdgpu_device_ip_suspend(adev); 4781 if (need_full_reset) 4782 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4783 else 4784 clear_bit(AMDGPU_NEED_FULL_RESET, 4785 &reset_context->flags); 4786 } 4787 4788 return r; 4789 } 4790 4791 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4792 { 4793 int i; 4794 4795 lockdep_assert_held(&adev->reset_domain->sem); 4796 4797 for (i = 0; i < adev->num_regs; i++) { 4798 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4799 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4800 adev->reset_dump_reg_value[i]); 4801 } 4802 4803 return 0; 4804 } 4805 4806 #ifdef CONFIG_DEV_COREDUMP 4807 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4808 size_t count, void *data, size_t datalen) 4809 { 4810 struct drm_printer p; 4811 struct amdgpu_device *adev = data; 4812 struct drm_print_iterator iter; 4813 int i; 4814 4815 iter.data = buffer; 4816 iter.offset = 0; 4817 iter.start = offset; 4818 iter.remain = count; 4819 4820 p = drm_coredump_printer(&iter); 4821 4822 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4823 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4824 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4825 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4826 if (adev->reset_task_info.pid) 4827 drm_printf(&p, "process_name: %s PID: %d\n", 4828 adev->reset_task_info.process_name, 4829 adev->reset_task_info.pid); 4830 4831 if (adev->reset_vram_lost) 4832 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4833 if (adev->num_regs) { 4834 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4835 4836 for (i = 0; i < adev->num_regs; i++) 4837 drm_printf(&p, "0x%08x: 0x%08x\n", 4838 adev->reset_dump_reg_list[i], 4839 adev->reset_dump_reg_value[i]); 4840 } 4841 4842 return count - iter.remain; 4843 } 4844 4845 static void amdgpu_devcoredump_free(void *data) 4846 { 4847 } 4848 4849 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4850 { 4851 struct drm_device *dev = adev_to_drm(adev); 4852 4853 ktime_get_ts64(&adev->reset_time); 4854 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4855 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4856 } 4857 #endif 4858 4859 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4860 struct amdgpu_reset_context *reset_context) 4861 { 4862 struct amdgpu_device *tmp_adev = NULL; 4863 bool need_full_reset, skip_hw_reset, vram_lost = false; 4864 int r = 0; 4865 bool gpu_reset_for_dev_remove = 0; 4866 4867 /* Try reset handler method first */ 4868 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4869 reset_list); 4870 amdgpu_reset_reg_dumps(tmp_adev); 4871 4872 reset_context->reset_device_list = device_list_handle; 4873 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4874 /* If reset handler not implemented, continue; otherwise return */ 4875 if (r == -ENOSYS) 4876 r = 0; 4877 else 4878 return r; 4879 4880 /* Reset handler not implemented, use the 
default method */ 4881 need_full_reset = 4882 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4883 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4884 4885 gpu_reset_for_dev_remove = 4886 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4887 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4888 4889 /* 4890 * ASIC reset has to be done on all XGMI hive nodes ASAP 4891 * to allow proper links negotiation in FW (within 1 sec) 4892 */ 4893 if (!skip_hw_reset && need_full_reset) { 4894 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4895 /* For XGMI run all resets in parallel to speed up the process */ 4896 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4897 tmp_adev->gmc.xgmi.pending_reset = false; 4898 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4899 r = -EALREADY; 4900 } else 4901 r = amdgpu_asic_reset(tmp_adev); 4902 4903 if (r) { 4904 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4905 r, adev_to_drm(tmp_adev)->unique); 4906 break; 4907 } 4908 } 4909 4910 /* For XGMI wait for all resets to complete before proceed */ 4911 if (!r) { 4912 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4913 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4914 flush_work(&tmp_adev->xgmi_reset_work); 4915 r = tmp_adev->asic_reset_res; 4916 if (r) 4917 break; 4918 } 4919 } 4920 } 4921 } 4922 4923 if (!r && amdgpu_ras_intr_triggered()) { 4924 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4925 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4926 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4927 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4928 } 4929 4930 amdgpu_ras_intr_cleared(); 4931 } 4932 4933 /* Since the mode1 reset affects base ip blocks, the 4934 * phase1 ip blocks need to be resumed. Otherwise there 4935 * will be a BIOS signature error and the psp bootloader 4936 * can't load kdb on the next amdgpu install. 
4937 */ 4938 if (gpu_reset_for_dev_remove) { 4939 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4940 amdgpu_device_ip_resume_phase1(tmp_adev); 4941 4942 goto end; 4943 } 4944 4945 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4946 if (need_full_reset) { 4947 /* post card */ 4948 r = amdgpu_device_asic_init(tmp_adev); 4949 if (r) { 4950 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4951 } else { 4952 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4953 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4954 if (r) 4955 goto out; 4956 4957 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4958 if (r) 4959 goto out; 4960 4961 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4962 #ifdef CONFIG_DEV_COREDUMP 4963 tmp_adev->reset_vram_lost = vram_lost; 4964 memset(&tmp_adev->reset_task_info, 0, 4965 sizeof(tmp_adev->reset_task_info)); 4966 if (reset_context->job && reset_context->job->vm) 4967 tmp_adev->reset_task_info = 4968 reset_context->job->vm->task_info; 4969 amdgpu_reset_capture_coredumpm(tmp_adev); 4970 #endif 4971 if (vram_lost) { 4972 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4973 amdgpu_inc_vram_lost(tmp_adev); 4974 } 4975 4976 r = amdgpu_device_fw_loading(tmp_adev); 4977 if (r) 4978 return r; 4979 4980 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4981 if (r) 4982 goto out; 4983 4984 if (vram_lost) 4985 amdgpu_device_fill_reset_magic(tmp_adev); 4986 4987 /* 4988 * Add this ASIC as tracked as reset was already 4989 * complete successfully. 4990 */ 4991 amdgpu_register_gpu_instance(tmp_adev); 4992 4993 if (!reset_context->hive && 4994 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4995 amdgpu_xgmi_add_device(tmp_adev); 4996 4997 r = amdgpu_device_ip_late_init(tmp_adev); 4998 if (r) 4999 goto out; 5000 5001 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5002 5003 /* 5004 * The GPU enters bad state once faulty pages 5005 * by ECC has reached the threshold, and ras 5006 * recovery is scheduled next. So add one check 5007 * here to break recovery if it indeed exceeds 5008 * bad page threshold, and remind user to 5009 * retire this GPU or setting one bigger 5010 * bad_page_threshold value to fix this once 5011 * probing driver again. 5012 */ 5013 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5014 /* must succeed. 
*/ 5015 amdgpu_ras_resume(tmp_adev); 5016 } else { 5017 r = -EINVAL; 5018 goto out; 5019 } 5020 5021 /* Update PSP FW topology after reset */ 5022 if (reset_context->hive && 5023 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5024 r = amdgpu_xgmi_update_topology( 5025 reset_context->hive, tmp_adev); 5026 } 5027 } 5028 5029 out: 5030 if (!r) { 5031 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5032 r = amdgpu_ib_ring_tests(tmp_adev); 5033 if (r) { 5034 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5035 need_full_reset = true; 5036 r = -EAGAIN; 5037 goto end; 5038 } 5039 } 5040 5041 if (!r) 5042 r = amdgpu_device_recover_vram(tmp_adev); 5043 else 5044 tmp_adev->asic_reset_res = r; 5045 } 5046 5047 end: 5048 if (need_full_reset) 5049 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5050 else 5051 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5052 return r; 5053 } 5054 5055 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5056 { 5057 5058 switch (amdgpu_asic_reset_method(adev)) { 5059 case AMD_RESET_METHOD_MODE1: 5060 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5061 break; 5062 case AMD_RESET_METHOD_MODE2: 5063 adev->mp1_state = PP_MP1_STATE_RESET; 5064 break; 5065 default: 5066 adev->mp1_state = PP_MP1_STATE_NONE; 5067 break; 5068 } 5069 } 5070 5071 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5072 { 5073 amdgpu_vf_error_trans_all(adev); 5074 adev->mp1_state = PP_MP1_STATE_NONE; 5075 } 5076 5077 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5078 { 5079 struct pci_dev *p = NULL; 5080 5081 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5082 adev->pdev->bus->number, 1); 5083 if (p) { 5084 pm_runtime_enable(&(p->dev)); 5085 pm_runtime_resume(&(p->dev)); 5086 } 5087 5088 pci_dev_put(p); 5089 } 5090 5091 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5092 { 5093 enum amd_reset_method reset_method; 5094 struct pci_dev *p = NULL; 5095 u64 expires; 5096 5097 /* 5098 * For now, only BACO and mode1 reset are confirmed 5099 * to suffer the audio issue without proper suspended. 5100 */ 5101 reset_method = amdgpu_asic_reset_method(adev); 5102 if ((reset_method != AMD_RESET_METHOD_BACO) && 5103 (reset_method != AMD_RESET_METHOD_MODE1)) 5104 return -EINVAL; 5105 5106 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5107 adev->pdev->bus->number, 1); 5108 if (!p) 5109 return -ENODEV; 5110 5111 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5112 if (!expires) 5113 /* 5114 * If we cannot get the audio device autosuspend delay, 5115 * a fixed 4S interval will be used. Considering 3S is 5116 * the audio controller default autosuspend delay setting. 5117 * 4S used here is guaranteed to cover that. 5118 */ 5119 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5120 5121 while (!pm_runtime_status_suspended(&(p->dev))) { 5122 if (!pm_runtime_suspend(&(p->dev))) 5123 break; 5124 5125 if (expires < ktime_get_mono_fast_ns()) { 5126 dev_warn(adev->dev, "failed to suspend display audio\n"); 5127 pci_dev_put(p); 5128 /* TODO: abort the succeeding gpu reset? 
*/ 5129 return -ETIMEDOUT; 5130 } 5131 } 5132 5133 pm_runtime_disable(&(p->dev)); 5134 5135 pci_dev_put(p); 5136 return 0; 5137 } 5138 5139 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5140 { 5141 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5142 5143 #if defined(CONFIG_DEBUG_FS) 5144 if (!amdgpu_sriov_vf(adev)) 5145 cancel_work(&adev->reset_work); 5146 #endif 5147 5148 if (adev->kfd.dev) 5149 cancel_work(&adev->kfd.reset_work); 5150 5151 if (amdgpu_sriov_vf(adev)) 5152 cancel_work(&adev->virt.flr_work); 5153 5154 if (con && adev->ras_enabled) 5155 cancel_work(&con->recovery_work); 5156 5157 } 5158 5159 /** 5160 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5161 * 5162 * @adev: amdgpu_device pointer 5163 * @job: which job trigger hang 5164 * 5165 * Attempt to reset the GPU if it has hung (all asics). 5166 * Attempt to do soft-reset or full-reset and reinitialize Asic 5167 * Returns 0 for success or an error on failure. 5168 */ 5169 5170 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5171 struct amdgpu_job *job, 5172 struct amdgpu_reset_context *reset_context) 5173 { 5174 struct list_head device_list, *device_list_handle = NULL; 5175 bool job_signaled = false; 5176 struct amdgpu_hive_info *hive = NULL; 5177 struct amdgpu_device *tmp_adev = NULL; 5178 int i, r = 0; 5179 bool need_emergency_restart = false; 5180 bool audio_suspended = false; 5181 bool gpu_reset_for_dev_remove = false; 5182 5183 gpu_reset_for_dev_remove = 5184 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5185 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5186 5187 /* 5188 * Special case: RAS triggered and full reset isn't supported 5189 */ 5190 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5191 5192 /* 5193 * Flush RAM to disk so that after reboot 5194 * the user can read log and see why the system rebooted. 5195 */ 5196 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5197 DRM_WARN("Emergency reboot."); 5198 5199 ksys_sync_helper(); 5200 emergency_restart(); 5201 } 5202 5203 dev_info(adev->dev, "GPU %s begin!\n", 5204 need_emergency_restart ? "jobs stop":"reset"); 5205 5206 if (!amdgpu_sriov_vf(adev)) 5207 hive = amdgpu_get_xgmi_hive(adev); 5208 if (hive) 5209 mutex_lock(&hive->hive_lock); 5210 5211 reset_context->job = job; 5212 reset_context->hive = hive; 5213 /* 5214 * Build list of devices to reset. 5215 * In case we are in XGMI hive mode, resort the device list 5216 * to put adev in the 1st position. 
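 *
 * list_rotate_to_front() keeps the cyclic order of the hive list and only
 * changes which node comes first. For example, if the hive yields the devices
 * in the order B, A, C and A is the device being recovered, the reset list
 * built below ends up ordered A, C, B.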
5217 */ 5218 INIT_LIST_HEAD(&device_list); 5219 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5220 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5221 list_add_tail(&tmp_adev->reset_list, &device_list); 5222 if (gpu_reset_for_dev_remove && adev->shutdown) 5223 tmp_adev->shutdown = true; 5224 } 5225 if (!list_is_first(&adev->reset_list, &device_list)) 5226 list_rotate_to_front(&adev->reset_list, &device_list); 5227 device_list_handle = &device_list; 5228 } else { 5229 list_add_tail(&adev->reset_list, &device_list); 5230 device_list_handle = &device_list; 5231 } 5232 5233 /* We need to lock reset domain only once both for XGMI and single device */ 5234 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5235 reset_list); 5236 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5237 5238 /* block all schedulers and reset given job's ring */ 5239 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5240 5241 amdgpu_device_set_mp1_state(tmp_adev); 5242 5243 /* 5244 * Try to put the audio codec into suspend state 5245 * before gpu reset started. 5246 * 5247 * Due to the power domain of the graphics device 5248 * is shared with AZ power domain. Without this, 5249 * we may change the audio hardware from behind 5250 * the audio driver's back. That will trigger 5251 * some audio codec errors. 5252 */ 5253 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5254 audio_suspended = true; 5255 5256 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5257 5258 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5259 5260 if (!amdgpu_sriov_vf(tmp_adev)) 5261 amdgpu_amdkfd_pre_reset(tmp_adev); 5262 5263 /* 5264 * Mark these ASICs to be reseted as untracked first 5265 * And add them back after reset completed 5266 */ 5267 amdgpu_unregister_gpu_instance(tmp_adev); 5268 5269 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5270 5271 /* disable ras on ALL IPs */ 5272 if (!need_emergency_restart && 5273 amdgpu_device_ip_need_full_reset(tmp_adev)) 5274 amdgpu_ras_suspend(tmp_adev); 5275 5276 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5277 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5278 5279 if (!ring || !ring->sched.thread) 5280 continue; 5281 5282 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5283 5284 if (need_emergency_restart) 5285 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5286 } 5287 atomic_inc(&tmp_adev->gpu_reset_counter); 5288 } 5289 5290 if (need_emergency_restart) 5291 goto skip_sched_resume; 5292 5293 /* 5294 * Must check guilty signal here since after this point all old 5295 * HW fences are force signaled. 5296 * 5297 * job->base holds a reference to parent fence 5298 */ 5299 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5300 job_signaled = true; 5301 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5302 goto skip_hw_reset; 5303 } 5304 5305 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5306 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5307 if (gpu_reset_for_dev_remove) { 5308 /* Workaroud for ASICs need to disable SMC first */ 5309 amdgpu_device_smu_fini_early(tmp_adev); 5310 } 5311 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5312 /*TODO Should we stop ?*/ 5313 if (r) { 5314 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5315 r, adev_to_drm(tmp_adev)->unique); 5316 tmp_adev->asic_reset_res = r; 5317 } 5318 5319 /* 5320 * Drop all pending non scheduler resets. 
Scheduler resets 5321 * were already dropped during drm_sched_stop 5322 */ 5323 amdgpu_device_stop_pending_resets(tmp_adev); 5324 } 5325 5326 /* Actual ASIC resets if needed.*/ 5327 /* Host driver will handle XGMI hive reset for SRIOV */ 5328 if (amdgpu_sriov_vf(adev)) { 5329 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5330 if (r) 5331 adev->asic_reset_res = r; 5332 5333 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5334 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5335 amdgpu_ras_resume(adev); 5336 } else { 5337 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5338 if (r && r == -EAGAIN) 5339 goto retry; 5340 5341 if (!r && gpu_reset_for_dev_remove) 5342 goto recover_end; 5343 } 5344 5345 skip_hw_reset: 5346 5347 /* Post ASIC reset for all devs .*/ 5348 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5349 5350 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5351 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5352 5353 if (!ring || !ring->sched.thread) 5354 continue; 5355 5356 drm_sched_start(&ring->sched, true); 5357 } 5358 5359 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5360 amdgpu_mes_self_test(tmp_adev); 5361 5362 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5363 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5364 } 5365 5366 if (tmp_adev->asic_reset_res) 5367 r = tmp_adev->asic_reset_res; 5368 5369 tmp_adev->asic_reset_res = 0; 5370 5371 if (r) { 5372 /* bad news, how to tell it to userspace ? */ 5373 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5374 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5375 } else { 5376 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5377 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5378 DRM_WARN("smart shift update failed\n"); 5379 } 5380 } 5381 5382 skip_sched_resume: 5383 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5384 /* unlock kfd: SRIOV would do it separately */ 5385 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5386 amdgpu_amdkfd_post_reset(tmp_adev); 5387 5388 /* kfd_post_reset will do nothing if kfd device is not initialized, 5389 * need to bring up kfd here if it's not be initialized before 5390 */ 5391 if (!adev->kfd.init_complete) 5392 amdgpu_amdkfd_device_init(adev); 5393 5394 if (audio_suspended) 5395 amdgpu_device_resume_display_audio(tmp_adev); 5396 5397 amdgpu_device_unset_mp1_state(tmp_adev); 5398 5399 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5400 } 5401 5402 recover_end: 5403 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5404 reset_list); 5405 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5406 5407 if (hive) { 5408 mutex_unlock(&hive->hive_lock); 5409 amdgpu_put_xgmi_hive(hive); 5410 } 5411 5412 if (r) 5413 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5414 5415 atomic_set(&adev->reset_domain->reset_res, r); 5416 return r; 5417 } 5418 5419 /** 5420 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5421 * 5422 * @adev: amdgpu_device pointer 5423 * 5424 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5425 * and lanes) of the slot the device is in. Handles APUs and 5426 * virtualized environments where PCIE config space may not be available. 
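 *
 * As a worked example of the logic below: a discrete GPU behind a root port
 * that reports an 8.0 GT/s, x8 link ends up with the GEN1|GEN2|GEN3 platform
 * speed bits set in pcie_gen_mask and the X8|X4|X2|X1 width bits set in
 * pcie_mlw_mask. Both masks can also be forced via the amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap overrides honored at the top of the function, in which
 * case the probed values never overwrite them.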
5427 */ 5428 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5429 { 5430 struct pci_dev *pdev; 5431 enum pci_bus_speed speed_cap, platform_speed_cap; 5432 enum pcie_link_width platform_link_width; 5433 5434 if (amdgpu_pcie_gen_cap) 5435 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5436 5437 if (amdgpu_pcie_lane_cap) 5438 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5439 5440 /* covers APUs as well */ 5441 if (pci_is_root_bus(adev->pdev->bus)) { 5442 if (adev->pm.pcie_gen_mask == 0) 5443 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5444 if (adev->pm.pcie_mlw_mask == 0) 5445 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5446 return; 5447 } 5448 5449 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5450 return; 5451 5452 pcie_bandwidth_available(adev->pdev, NULL, 5453 &platform_speed_cap, &platform_link_width); 5454 5455 if (adev->pm.pcie_gen_mask == 0) { 5456 /* asic caps */ 5457 pdev = adev->pdev; 5458 speed_cap = pcie_get_speed_cap(pdev); 5459 if (speed_cap == PCI_SPEED_UNKNOWN) { 5460 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5461 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5462 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5463 } else { 5464 if (speed_cap == PCIE_SPEED_32_0GT) 5465 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5466 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5467 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5468 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5469 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5470 else if (speed_cap == PCIE_SPEED_16_0GT) 5471 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5472 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5473 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5474 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5475 else if (speed_cap == PCIE_SPEED_8_0GT) 5476 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5477 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5478 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5479 else if (speed_cap == PCIE_SPEED_5_0GT) 5480 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5481 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5482 else 5483 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5484 } 5485 /* platform caps */ 5486 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5487 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5488 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5489 } else { 5490 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5491 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5492 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5493 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5494 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5495 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5496 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5497 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5498 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5499 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5500 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5501 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5502 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5503 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5504 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5505 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5506 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5507 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5508 else 5509 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5510 5511 } 5512 } 5513 if (adev->pm.pcie_mlw_mask == 0) { 5514 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5515 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5516 } else { 5517 switch (platform_link_width) { 5518 case PCIE_LNK_X32: 5519 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5520 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5521 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5522 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5523 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5524 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5525 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5526 break; 5527 case PCIE_LNK_X16: 5528 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5529 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5530 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5531 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5532 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5533 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5534 break; 5535 case PCIE_LNK_X12: 5536 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5537 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5538 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5539 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5540 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5541 break; 5542 case PCIE_LNK_X8: 5543 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5544 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5545 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5547 break; 5548 case PCIE_LNK_X4: 5549 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5550 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5551 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5552 break; 5553 case PCIE_LNK_X2: 5554 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5555 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5556 break; 5557 case PCIE_LNK_X1: 5558 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5559 break; 5560 default: 5561 break; 5562 } 5563 } 5564 } 5565 } 5566 5567 /** 5568 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5569 * 5570 * @adev: amdgpu_device pointer 5571 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5572 * 5573 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5574 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5575 * @peer_adev. 5576 */ 5577 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5578 struct amdgpu_device *peer_adev) 5579 { 5580 #ifdef CONFIG_HSA_AMD_P2P 5581 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5582 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5583 resource_size_t aper_limit = 5584 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5585 bool p2p_access = 5586 !adev->gmc.xgmi.connected_to_cpu && 5587 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5588 5589 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5590 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5591 !(adev->gmc.aper_base & address_mask || 5592 aper_limit & address_mask)); 5593 #else 5594 return false; 5595 #endif 5596 } 5597 5598 int amdgpu_device_baco_enter(struct drm_device *dev) 5599 { 5600 struct amdgpu_device *adev = drm_to_adev(dev); 5601 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5602 5603 if (!amdgpu_device_supports_baco(dev)) 5604 return -ENOTSUPP; 5605 5606 if (ras && adev->ras_enabled && 5607 adev->nbio.funcs->enable_doorbell_interrupt) 5608 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5609 5610 return amdgpu_dpm_baco_enter(adev); 5611 } 5612 5613 int amdgpu_device_baco_exit(struct drm_device *dev) 5614 { 5615 struct amdgpu_device *adev = drm_to_adev(dev); 5616 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5617 int ret = 0; 5618 5619 if (!amdgpu_device_supports_baco(dev)) 5620 return -ENOTSUPP; 5621 5622 ret = amdgpu_dpm_baco_exit(adev); 5623 if (ret) 5624 return ret; 5625 5626 if (ras && adev->ras_enabled && 5627 adev->nbio.funcs->enable_doorbell_interrupt) 5628 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5629 5630 if (amdgpu_passthrough(adev) && 5631 adev->nbio.funcs->clear_doorbell_interrupt) 5632 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5633 5634 return 0; 5635 } 5636 5637 /** 5638 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5639 * @pdev: PCI device struct 5640 * @state: PCI channel state 5641 * 5642 * Description: Called when a PCI error is detected. 5643 * 5644 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
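 *
 * This callback, together with amdgpu_pci_mmio_enabled(), amdgpu_pci_slot_reset()
 * and amdgpu_pci_resume() below, is hooked into the PCI core roughly as in the
 * sketch below (the real definition lives in amdgpu_drv.c; the struct name here
 * is illustrative):
 *
 *   static const struct pci_error_handlers example_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };
 *
 * and is referenced from the driver's struct pci_driver via its .err_handler
 * field.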

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or
 * PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's OK to resume
 * normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
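
/*
 * amdgpu_device_cache_pci_state() / amdgpu_device_load_pci_state() below
 * snapshot and restore the GPU's PCI config space (via the PCI core's saved
 * state helpers) so it can be re-applied around GPU/PCI resets, e.g. in the
 * slot_reset path above.
 */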

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}
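
/*
 * amdgpu_in_reset() below is non-zero while this device's reset domain is
 * marked as being in GPU reset, i.e. while a reset is in progress.
 */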

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring the hardware to some kind of halt state so that no one can touch it
 * any more. This helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH
 * access, so it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}