/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
118 "RENOIR", 119 "ALDEBARAN", 120 "NAVI10", 121 "CYAN_SKILLFISH", 122 "NAVI14", 123 "NAVI12", 124 "SIENNA_CICHLID", 125 "NAVY_FLOUNDER", 126 "VANGOGH", 127 "DIMGREY_CAVEFISH", 128 "BEIGE_GOBY", 129 "YELLOW_CARP", 130 "IP DISCOVERY", 131 "LAST", 132 }; 133 134 /** 135 * DOC: pcie_replay_count 136 * 137 * The amdgpu driver provides a sysfs API for reporting the total number 138 * of PCIe replays (NAKs) 139 * The file pcie_replay_count is used for this and returns the total 140 * number of replays as a sum of the NAKs generated and NAKs received 141 */ 142 143 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 144 struct device_attribute *attr, char *buf) 145 { 146 struct drm_device *ddev = dev_get_drvdata(dev); 147 struct amdgpu_device *adev = drm_to_adev(ddev); 148 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 149 150 return sysfs_emit(buf, "%llu\n", cnt); 151 } 152 153 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 154 amdgpu_device_get_pcie_replay_count, NULL); 155 156 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 157 158 /** 159 * DOC: product_name 160 * 161 * The amdgpu driver provides a sysfs API for reporting the product name 162 * for the device 163 * The file serial_number is used for this and returns the product name 164 * as returned from the FRU. 165 * NOTE: This is only available for certain server cards 166 */ 167 168 static ssize_t amdgpu_device_get_product_name(struct device *dev, 169 struct device_attribute *attr, char *buf) 170 { 171 struct drm_device *ddev = dev_get_drvdata(dev); 172 struct amdgpu_device *adev = drm_to_adev(ddev); 173 174 return sysfs_emit(buf, "%s\n", adev->product_name); 175 } 176 177 static DEVICE_ATTR(product_name, S_IRUGO, 178 amdgpu_device_get_product_name, NULL); 179 180 /** 181 * DOC: product_number 182 * 183 * The amdgpu driver provides a sysfs API for reporting the part number 184 * for the device 185 * The file serial_number is used for this and returns the part number 186 * as returned from the FRU. 187 * NOTE: This is only available for certain server cards 188 */ 189 190 static ssize_t amdgpu_device_get_product_number(struct device *dev, 191 struct device_attribute *attr, char *buf) 192 { 193 struct drm_device *ddev = dev_get_drvdata(dev); 194 struct amdgpu_device *adev = drm_to_adev(ddev); 195 196 return sysfs_emit(buf, "%s\n", adev->product_number); 197 } 198 199 static DEVICE_ATTR(product_number, S_IRUGO, 200 amdgpu_device_get_product_number, NULL); 201 202 /** 203 * DOC: serial_number 204 * 205 * The amdgpu driver provides a sysfs API for reporting the serial number 206 * for the device 207 * The file serial_number is used for this and returns the serial number 208 * as returned from the FRU. 209 * NOTE: This is only available for certain server cards 210 */ 211 212 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 213 struct device_attribute *attr, char *buf) 214 { 215 struct drm_device *ddev = dev_get_drvdata(dev); 216 struct amdgpu_device *adev = drm_to_adev(ddev); 217 218 return sysfs_emit(buf, "%s\n", adev->serial); 219 } 220 221 static DEVICE_ATTR(serial_number, S_IRUGO, 222 amdgpu_device_get_serial_number, NULL); 223 224 /** 225 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 226 * 227 * @dev: drm_device pointer 228 * 229 * Returns true if the device is a dGPU with ATPX power control, 230 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
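 * Only the CPU-visible part of VRAM is reachable through the aperture, so the
 * returned count may be smaller than @size; amdgpu_device_vram_access() falls
 * back to MM_INDEX/MM_DATA access for any remainder.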
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
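 * Offsets inside the MMIO BAR are read directly (or via the KIQ when running
 * as an SR-IOV VF at runtime); offsets beyond the BAR go through the indirect
 * PCIE register interface (adev->pcie_rreg).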
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 *
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write a register either with direct/indirect mmio or with the RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
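 * The doorbell is written with atomic64_set() so both dwords of the Qword
 * are updated together rather than as two separate 32-bit writes.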
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64-bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64-bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
		return amdgpu_atomfirmware_asic_init(adev, true);
	else
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	if (adev->enable_mes) {
		adev->doorbell.num_doorbells =
			adev->doorbell.size / sizeof(u32);
	} else {
		adev->doorbell.num_doorbells =
			min_t(u32, adev->doorbell.size / sizeof(u32),
			      adev->doorbell_index.max_assignment + 1);
		if (adev->doorbell.num_doorbells == 0)
			return -EINVAL;

		/* For Vega, reserve and map two pages on doorbell BAR since SDMA
		 * paging queue doorbells use the second page. The
		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
		 * doorbells are in the first page. So with paging queue enabled,
		 * the max num_doorbells should be increased by one page (0x400 in dwords).
		 */
		if (adev->asic_type >= CHIP_VEGA10)
			adev->doorbell.num_doorbells += 0x400;
	}

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}



/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
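 * The sequence is: disable memory decoding, release BAR0 (and the doorbell
 * BAR on CIK+), resize BAR0 to cover VRAM (limited to the sizes the device
 * supports), let the PCI core reassign bus resources, then re-init the
 * doorbell mapping and restore the PCI command register.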
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if a post is needed because a hw reset was performed.
 * Returns true if a post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old SMC firmware still needs the driver to do a vPost, otherwise
		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw,
		 * so we force a vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB so we have a 12-bit offset, a minimum of 9 bits
 * in the page table, and the remaining bits are in the page directory.
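 * For example, the minimum block size of 9 gives 2^9 = 512 PTEs per page
 * table; together with the 12-bit page offset that covers 2MB of address
 * space per page table.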
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater than or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater than or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
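 * The string is a semicolon-separated list of "pci_address,num_crtc" entries,
 * where the PCI address may be "all" to match every device, e.g.
 * amdgpu.virtual_display=0000:03:00.0,2. num_crtc is clamped to the range
 * 1..6 and defaults to 1 when omitted.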
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display, adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
1954 */ 1955 if (adev->asic_type != CHIP_NAVI12) 1956 return 0; 1957 } 1958 1959 switch (adev->asic_type) { 1960 default: 1961 return 0; 1962 case CHIP_VEGA10: 1963 chip_name = "vega10"; 1964 break; 1965 case CHIP_VEGA12: 1966 chip_name = "vega12"; 1967 break; 1968 case CHIP_RAVEN: 1969 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1970 chip_name = "raven2"; 1971 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1972 chip_name = "picasso"; 1973 else 1974 chip_name = "raven"; 1975 break; 1976 case CHIP_ARCTURUS: 1977 chip_name = "arcturus"; 1978 break; 1979 case CHIP_NAVI12: 1980 chip_name = "navi12"; 1981 break; 1982 } 1983 1984 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1985 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1986 if (err) { 1987 dev_err(adev->dev, 1988 "Failed to load gpu_info firmware \"%s\"\n", 1989 fw_name); 1990 goto out; 1991 } 1992 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1993 if (err) { 1994 dev_err(adev->dev, 1995 "Failed to validate gpu_info firmware \"%s\"\n", 1996 fw_name); 1997 goto out; 1998 } 1999 2000 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2001 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2002 2003 switch (hdr->version_major) { 2004 case 1: 2005 { 2006 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2007 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2008 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2009 2010 /* 2011 * Should be dropped when DAL no longer needs it. 2012 */ 2013 if (adev->asic_type == CHIP_NAVI12) 2014 goto parse_soc_bounding_box; 2015 2016 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2017 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2018 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2019 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2020 adev->gfx.config.max_texture_channel_caches = 2021 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2022 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2023 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2024 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2025 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2026 adev->gfx.config.double_offchip_lds_buf = 2027 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2028 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2029 adev->gfx.cu_info.max_waves_per_simd = 2030 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2031 adev->gfx.cu_info.max_scratch_slots_per_cu = 2032 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2033 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2034 if (hdr->version_minor >= 1) { 2035 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2036 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2037 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2038 adev->gfx.config.num_sc_per_sh = 2039 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2040 adev->gfx.config.num_packer_per_sc = 2041 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2042 } 2043 2044 parse_soc_bounding_box: 2045 /* 2046 * soc bounding box info is not integrated in discovery table, 2047 * we always need to parse it from gpu info firmware if needed.
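 * (Summary of the version handling in this parser: a v1.0 header fills in
 *  the base gfx config and CU info fields above, a v1.1 header additionally
 *  provides num_sc_per_sh and num_packer_per_sc, and a v1.2 header also
 *  carries the SoC bounding box consumed by DAL on Navi12 below.)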
2048 */ 2049 if (hdr->version_minor == 2) { 2050 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2051 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2052 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2053 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2054 } 2055 break; 2056 } 2057 default: 2058 dev_err(adev->dev, 2059 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2060 err = -EINVAL; 2061 goto out; 2062 } 2063 out: 2064 return err; 2065 } 2066 2067 /** 2068 * amdgpu_device_ip_early_init - run early init for hardware IPs 2069 * 2070 * @adev: amdgpu_device pointer 2071 * 2072 * Early initialization pass for hardware IPs. The hardware IPs that make 2073 * up each asic are discovered each IP's early_init callback is run. This 2074 * is the first stage in initializing the asic. 2075 * Returns 0 on success, negative error code on failure. 2076 */ 2077 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2078 { 2079 struct drm_device *dev = adev_to_drm(adev); 2080 struct pci_dev *parent; 2081 int i, r; 2082 2083 amdgpu_device_enable_virtual_display(adev); 2084 2085 if (amdgpu_sriov_vf(adev)) { 2086 r = amdgpu_virt_request_full_gpu(adev, true); 2087 if (r) 2088 return r; 2089 } 2090 2091 switch (adev->asic_type) { 2092 #ifdef CONFIG_DRM_AMDGPU_SI 2093 case CHIP_VERDE: 2094 case CHIP_TAHITI: 2095 case CHIP_PITCAIRN: 2096 case CHIP_OLAND: 2097 case CHIP_HAINAN: 2098 adev->family = AMDGPU_FAMILY_SI; 2099 r = si_set_ip_blocks(adev); 2100 if (r) 2101 return r; 2102 break; 2103 #endif 2104 #ifdef CONFIG_DRM_AMDGPU_CIK 2105 case CHIP_BONAIRE: 2106 case CHIP_HAWAII: 2107 case CHIP_KAVERI: 2108 case CHIP_KABINI: 2109 case CHIP_MULLINS: 2110 if (adev->flags & AMD_IS_APU) 2111 adev->family = AMDGPU_FAMILY_KV; 2112 else 2113 adev->family = AMDGPU_FAMILY_CI; 2114 2115 r = cik_set_ip_blocks(adev); 2116 if (r) 2117 return r; 2118 break; 2119 #endif 2120 case CHIP_TOPAZ: 2121 case CHIP_TONGA: 2122 case CHIP_FIJI: 2123 case CHIP_POLARIS10: 2124 case CHIP_POLARIS11: 2125 case CHIP_POLARIS12: 2126 case CHIP_VEGAM: 2127 case CHIP_CARRIZO: 2128 case CHIP_STONEY: 2129 if (adev->flags & AMD_IS_APU) 2130 adev->family = AMDGPU_FAMILY_CZ; 2131 else 2132 adev->family = AMDGPU_FAMILY_VI; 2133 2134 r = vi_set_ip_blocks(adev); 2135 if (r) 2136 return r; 2137 break; 2138 default: 2139 r = amdgpu_discovery_set_ip_blocks(adev); 2140 if (r) 2141 return r; 2142 break; 2143 } 2144 2145 if (amdgpu_has_atpx() && 2146 (amdgpu_is_atpx_hybrid() || 2147 amdgpu_has_atpx_dgpu_power_cntl()) && 2148 ((adev->flags & AMD_IS_APU) == 0) && 2149 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2150 adev->flags |= AMD_IS_PX; 2151 2152 if (!(adev->flags & AMD_IS_APU)) { 2153 parent = pci_upstream_bridge(adev->pdev); 2154 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2155 } 2156 2157 amdgpu_amdkfd_device_probe(adev); 2158 2159 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2160 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2161 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2162 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2163 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2164 2165 for (i = 0; i < adev->num_ip_blocks; i++) { 2166 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2167 DRM_ERROR("disabled ip block: %d <%s>\n", 2168 i, adev->ip_blocks[i].version->funcs->name); 2169 adev->ip_blocks[i].status.valid = false; 2170 } else { 2171 if (adev->ip_blocks[i].version->funcs->early_init) { 2172 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2173 if (r == -ENOENT) { 2174 adev->ip_blocks[i].status.valid = false; 2175 } else if (r) { 2176 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2177 adev->ip_blocks[i].version->funcs->name, r); 2178 return r; 2179 } else { 2180 adev->ip_blocks[i].status.valid = true; 2181 } 2182 } else { 2183 adev->ip_blocks[i].status.valid = true; 2184 } 2185 } 2186 /* get the vbios after the asic_funcs are set up */ 2187 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2188 r = amdgpu_device_parse_gpu_info_fw(adev); 2189 if (r) 2190 return r; 2191 2192 /* Read BIOS */ 2193 if (!amdgpu_get_bios(adev)) 2194 return -EINVAL; 2195 2196 r = amdgpu_atombios_init(adev); 2197 if (r) { 2198 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2199 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2200 return r; 2201 } 2202 2203 /*get pf2vf msg info at it's earliest time*/ 2204 if (amdgpu_sriov_vf(adev)) 2205 amdgpu_virt_init_data_exchange(adev); 2206 2207 } 2208 } 2209 2210 adev->cg_flags &= amdgpu_cg_mask; 2211 adev->pg_flags &= amdgpu_pg_mask; 2212 2213 return 0; 2214 } 2215 2216 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2217 { 2218 int i, r; 2219 2220 for (i = 0; i < adev->num_ip_blocks; i++) { 2221 if (!adev->ip_blocks[i].status.sw) 2222 continue; 2223 if (adev->ip_blocks[i].status.hw) 2224 continue; 2225 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2226 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2227 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2228 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2229 if (r) { 2230 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2231 adev->ip_blocks[i].version->funcs->name, r); 2232 return r; 2233 } 2234 adev->ip_blocks[i].status.hw = true; 2235 } 2236 } 2237 2238 return 0; 2239 } 2240 2241 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2242 { 2243 int i, r; 2244 2245 for (i = 0; i < adev->num_ip_blocks; i++) { 2246 if (!adev->ip_blocks[i].status.sw) 2247 continue; 2248 if (adev->ip_blocks[i].status.hw) 2249 continue; 2250 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2251 if (r) { 2252 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2253 adev->ip_blocks[i].version->funcs->name, r); 2254 return r; 2255 } 2256 adev->ip_blocks[i].status.hw = true; 2257 } 2258 2259 return 0; 2260 } 2261 2262 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2263 { 2264 int r = 0; 2265 int i; 2266 uint32_t smu_version; 2267 2268 if (adev->asic_type >= CHIP_VEGA10) { 2269 for (i = 0; i < adev->num_ip_blocks; i++) { 2270 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2271 continue; 2272 2273 if 
(!adev->ip_blocks[i].status.sw) 2274 continue; 2275 2276 /* no need to do the fw loading again if already done*/ 2277 if (adev->ip_blocks[i].status.hw == true) 2278 break; 2279 2280 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2281 r = adev->ip_blocks[i].version->funcs->resume(adev); 2282 if (r) { 2283 DRM_ERROR("resume of IP block <%s> failed %d\n", 2284 adev->ip_blocks[i].version->funcs->name, r); 2285 return r; 2286 } 2287 } else { 2288 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2289 if (r) { 2290 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2291 adev->ip_blocks[i].version->funcs->name, r); 2292 return r; 2293 } 2294 } 2295 2296 adev->ip_blocks[i].status.hw = true; 2297 break; 2298 } 2299 } 2300 2301 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2302 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2303 2304 return r; 2305 } 2306 2307 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2308 { 2309 long timeout; 2310 int r, i; 2311 2312 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2313 struct amdgpu_ring *ring = adev->rings[i]; 2314 2315 /* No need to setup the GPU scheduler for rings that don't need it */ 2316 if (!ring || ring->no_scheduler) 2317 continue; 2318 2319 switch (ring->funcs->type) { 2320 case AMDGPU_RING_TYPE_GFX: 2321 timeout = adev->gfx_timeout; 2322 break; 2323 case AMDGPU_RING_TYPE_COMPUTE: 2324 timeout = adev->compute_timeout; 2325 break; 2326 case AMDGPU_RING_TYPE_SDMA: 2327 timeout = adev->sdma_timeout; 2328 break; 2329 default: 2330 timeout = adev->video_timeout; 2331 break; 2332 } 2333 2334 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, 2335 ring->num_hw_submission, amdgpu_job_hang_limit, 2336 timeout, adev->reset_domain->wq, 2337 ring->sched_score, ring->name, 2338 adev->dev); 2339 if (r) { 2340 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2341 ring->name); 2342 return r; 2343 } 2344 } 2345 2346 return 0; 2347 } 2348 2349 2350 /** 2351 * amdgpu_device_ip_init - run init for hardware IPs 2352 * 2353 * @adev: amdgpu_device pointer 2354 * 2355 * Main initialization pass for hardware IPs. The list of all the hardware 2356 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2357 * are run. sw_init initializes the software state associated with each IP 2358 * and hw_init initializes the hardware associated with each IP. 2359 * Returns 0 on success, negative error code on failure. 
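 *
 * Rough ordering, as implemented below (informal sketch, not a contract):
 *
 *   amdgpu_ras_init()
 *   for each valid block: sw_init(); COMMON and GMC also hw_init() early
 *   amdgpu_ib_pool_init(); amdgpu_ucode_create_bo()
 *   amdgpu_device_ip_hw_init_phase1()   -- COMMON, IH, PSP on SR-IOV
 *   amdgpu_device_fw_loading()          -- PSP/SMU firmware
 *   amdgpu_device_ip_hw_init_phase2()   -- all remaining blocks
 *   amdgpu_ras_recovery_init(); scheduler, KFD and FRU init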
2360 */ 2361 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2362 { 2363 int i, r; 2364 2365 r = amdgpu_ras_init(adev); 2366 if (r) 2367 return r; 2368 2369 for (i = 0; i < adev->num_ip_blocks; i++) { 2370 if (!adev->ip_blocks[i].status.valid) 2371 continue; 2372 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2373 if (r) { 2374 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2375 adev->ip_blocks[i].version->funcs->name, r); 2376 goto init_failed; 2377 } 2378 adev->ip_blocks[i].status.sw = true; 2379 2380 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2381 /* need to do common hw init early so everything is set up for gmc */ 2382 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2383 if (r) { 2384 DRM_ERROR("hw_init %d failed %d\n", i, r); 2385 goto init_failed; 2386 } 2387 adev->ip_blocks[i].status.hw = true; 2388 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2389 /* need to do gmc hw init early so we can allocate gpu mem */ 2390 /* Try to reserve bad pages early */ 2391 if (amdgpu_sriov_vf(adev)) 2392 amdgpu_virt_exchange_data(adev); 2393 2394 r = amdgpu_device_mem_scratch_init(adev); 2395 if (r) { 2396 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2397 goto init_failed; 2398 } 2399 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2400 if (r) { 2401 DRM_ERROR("hw_init %d failed %d\n", i, r); 2402 goto init_failed; 2403 } 2404 r = amdgpu_device_wb_init(adev); 2405 if (r) { 2406 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2407 goto init_failed; 2408 } 2409 adev->ip_blocks[i].status.hw = true; 2410 2411 /* right after GMC hw init, we create CSA */ 2412 if (amdgpu_mcbp) { 2413 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2414 AMDGPU_GEM_DOMAIN_VRAM | 2415 AMDGPU_GEM_DOMAIN_GTT, 2416 AMDGPU_CSA_SIZE); 2417 if (r) { 2418 DRM_ERROR("allocate CSA failed %d\n", r); 2419 goto init_failed; 2420 } 2421 } 2422 } 2423 } 2424 2425 if (amdgpu_sriov_vf(adev)) 2426 amdgpu_virt_init_data_exchange(adev); 2427 2428 r = amdgpu_ib_pool_init(adev); 2429 if (r) { 2430 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2431 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2432 goto init_failed; 2433 } 2434 2435 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2436 if (r) 2437 goto init_failed; 2438 2439 r = amdgpu_device_ip_hw_init_phase1(adev); 2440 if (r) 2441 goto init_failed; 2442 2443 r = amdgpu_device_fw_loading(adev); 2444 if (r) 2445 goto init_failed; 2446 2447 r = amdgpu_device_ip_hw_init_phase2(adev); 2448 if (r) 2449 goto init_failed; 2450 2451 /* 2452 * retired pages will be loaded from eeprom and reserved here, 2453 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2454 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2455 * for I2C communication which only true at this point. 2456 * 2457 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2458 * failure from bad gpu situation and stop amdgpu init process 2459 * accordingly. For other failed cases, it will still release all 2460 * the resource and print error message, rather than returning one 2461 * negative value to upper level. 
2462 * 2463 * Note: theoretically, this should be called before all vram allocations 2464 * to protect retired pages from being used again 2465 */ 2466 r = amdgpu_ras_recovery_init(adev); 2467 if (r) 2468 goto init_failed; 2469 2470 /** 2471 * In case of XGMI grab extra reference for reset domain for this device 2472 */ 2473 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2474 if (amdgpu_xgmi_add_device(adev) == 0) { 2475 if (!amdgpu_sriov_vf(adev)) { 2476 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2477 2478 if (WARN_ON(!hive)) { 2479 r = -ENOENT; 2480 goto init_failed; 2481 } 2482 2483 if (!hive->reset_domain || 2484 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2485 r = -ENOENT; 2486 amdgpu_put_xgmi_hive(hive); 2487 goto init_failed; 2488 } 2489 2490 /* Drop the early temporary reset domain we created for device */ 2491 amdgpu_reset_put_reset_domain(adev->reset_domain); 2492 adev->reset_domain = hive->reset_domain; 2493 amdgpu_put_xgmi_hive(hive); 2494 } 2495 } 2496 } 2497 2498 r = amdgpu_device_init_schedulers(adev); 2499 if (r) 2500 goto init_failed; 2501 2502 /* Don't init kfd if whole hive need to be reset during init */ 2503 if (!adev->gmc.xgmi.pending_reset) 2504 amdgpu_amdkfd_device_init(adev); 2505 2506 amdgpu_fru_get_product_info(adev); 2507 2508 init_failed: 2509 if (amdgpu_sriov_vf(adev)) 2510 amdgpu_virt_release_full_gpu(adev, true); 2511 2512 return r; 2513 } 2514 2515 /** 2516 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2517 * 2518 * @adev: amdgpu_device pointer 2519 * 2520 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2521 * this function before a GPU reset. If the value is retained after a 2522 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2523 */ 2524 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2525 { 2526 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2527 } 2528 2529 /** 2530 * amdgpu_device_check_vram_lost - check if vram is valid 2531 * 2532 * @adev: amdgpu_device pointer 2533 * 2534 * Checks the reset magic value written to the gart pointer in VRAM. 2535 * The driver calls this after a GPU reset to see if the contents of 2536 * VRAM have been lost or not. 2537 * Returns true if vram is lost, false if not. 2538 */ 2539 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2540 { 2541 if (memcmp(adev->gart.ptr, adev->reset_magic, 2542 AMDGPU_RESET_MAGIC_NUM)) 2543 return true; 2544 2545 if (!amdgpu_in_reset(adev)) 2546 return false; 2547 2548 /* 2549 * For all ASICs with baco/mode1 reset, the VRAM is 2550 * always assumed to be lost. 2551 */ 2552 switch (amdgpu_asic_reset_method(adev)) { 2553 case AMD_RESET_METHOD_BACO: 2554 case AMD_RESET_METHOD_MODE1: 2555 return true; 2556 default: 2557 return false; 2558 } 2559 } 2560 2561 /** 2562 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2563 * 2564 * @adev: amdgpu_device pointer 2565 * @state: clockgating state (gate or ungate) 2566 * 2567 * The list of all the hardware IPs that make up the asic is walked and the 2568 * set_clockgating_state callbacks are run. 2569 * Late initialization pass enabling clockgating for hardware IPs. 2570 * Fini or suspend pass disabling clockgating for hardware IPs. 2571 * Returns 0 on success, negative error code on failure.
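 *
 * Example (matching the callers later in this file): late init gates
 * clocks and power with
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 *
 * while the early fini and suspend paths call the same helpers with the
 * UNGATE states before touching the hardware.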
2572 */ 2573 2574 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2575 enum amd_clockgating_state state) 2576 { 2577 int i, j, r; 2578 2579 if (amdgpu_emu_mode == 1) 2580 return 0; 2581 2582 for (j = 0; j < adev->num_ip_blocks; j++) { 2583 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2584 if (!adev->ip_blocks[i].status.late_initialized) 2585 continue; 2586 /* skip CG for GFX, SDMA on S0ix */ 2587 if (adev->in_s0ix && 2588 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2589 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2590 continue; 2591 /* skip CG for VCE/UVD, it's handled specially */ 2592 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2593 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2594 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2595 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2596 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2597 /* enable clockgating to save power */ 2598 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2599 state); 2600 if (r) { 2601 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2602 adev->ip_blocks[i].version->funcs->name, r); 2603 return r; 2604 } 2605 } 2606 } 2607 2608 return 0; 2609 } 2610 2611 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2612 enum amd_powergating_state state) 2613 { 2614 int i, j, r; 2615 2616 if (amdgpu_emu_mode == 1) 2617 return 0; 2618 2619 for (j = 0; j < adev->num_ip_blocks; j++) { 2620 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2621 if (!adev->ip_blocks[i].status.late_initialized) 2622 continue; 2623 /* skip PG for GFX, SDMA on S0ix */ 2624 if (adev->in_s0ix && 2625 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2626 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2627 continue; 2628 /* skip CG for VCE/UVD, it's handled specially */ 2629 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2630 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2631 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2632 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2633 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2634 /* enable powergating to save power */ 2635 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2636 state); 2637 if (r) { 2638 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2639 adev->ip_blocks[i].version->funcs->name, r); 2640 return r; 2641 } 2642 } 2643 } 2644 return 0; 2645 } 2646 2647 static int amdgpu_device_enable_mgpu_fan_boost(void) 2648 { 2649 struct amdgpu_gpu_instance *gpu_ins; 2650 struct amdgpu_device *adev; 2651 int i, ret = 0; 2652 2653 mutex_lock(&mgpu_info.mutex); 2654 2655 /* 2656 * MGPU fan boost feature should be enabled 2657 * only when there are two or more dGPUs in 2658 * the system 2659 */ 2660 if (mgpu_info.num_dgpu < 2) 2661 goto out; 2662 2663 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2664 gpu_ins = &(mgpu_info.gpu_ins[i]); 2665 adev = gpu_ins->adev; 2666 if (!(adev->flags & AMD_IS_APU) && 2667 !gpu_ins->mgpu_fan_enabled) { 2668 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2669 if (ret) 2670 break; 2671 2672 gpu_ins->mgpu_fan_enabled = 1; 2673 } 2674 } 2675 2676 out: 2677 mutex_unlock(&mgpu_info.mutex); 2678 2679 return ret; 2680 } 2681 2682 /** 2683 * amdgpu_device_ip_late_init - run late init for hardware IPs 2684 * 2685 * @adev: 
amdgpu_device pointer 2686 * 2687 * Late initialization pass for hardware IPs. The list of all the hardware 2688 * IPs that make up the asic is walked and the late_init callbacks are run. 2689 * late_init covers any special initialization that an IP requires 2690 * after all of the IP blocks have been initialized or something that needs to happen 2691 * late in the init process. 2692 * Returns 0 on success, negative error code on failure. 2693 */ 2694 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2695 { 2696 struct amdgpu_gpu_instance *gpu_instance; 2697 int i = 0, r; 2698 2699 for (i = 0; i < adev->num_ip_blocks; i++) { 2700 if (!adev->ip_blocks[i].status.hw) 2701 continue; 2702 if (adev->ip_blocks[i].version->funcs->late_init) { 2703 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2704 if (r) { 2705 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2706 adev->ip_blocks[i].version->funcs->name, r); 2707 return r; 2708 } 2709 } 2710 adev->ip_blocks[i].status.late_initialized = true; 2711 } 2712 2713 r = amdgpu_ras_late_init(adev); 2714 if (r) { 2715 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2716 return r; 2717 } 2718 2719 amdgpu_ras_set_error_query_ready(adev, true); 2720 2721 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2722 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2723 2724 amdgpu_device_fill_reset_magic(adev); 2725 2726 r = amdgpu_device_enable_mgpu_fan_boost(); 2727 if (r) 2728 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2729 2730 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 2731 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)|| 2732 adev->asic_type == CHIP_ALDEBARAN )) 2733 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2734 2735 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2736 mutex_lock(&mgpu_info.mutex); 2737 2738 /* 2739 * Reset device p-state to low as this was booted with high. 2740 * 2741 * This should be performed only after all devices from the same 2742 * hive get initialized. 2743 * 2744 * However, the number of devices in the hive is not known in advance, 2745 * as it is counted one by one during device initialization. 2746 * 2747 * So, we wait until all XGMI interlinked devices are initialized. 2748 * This may bring some delays as those devices may come from 2749 * different hives. But that should be OK.
2750 */ 2751 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2752 for (i = 0; i < mgpu_info.num_gpu; i++) { 2753 gpu_instance = &(mgpu_info.gpu_ins[i]); 2754 if (gpu_instance->adev->flags & AMD_IS_APU) 2755 continue; 2756 2757 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2758 AMDGPU_XGMI_PSTATE_MIN); 2759 if (r) { 2760 DRM_ERROR("pstate setting failed (%d).\n", r); 2761 break; 2762 } 2763 } 2764 } 2765 2766 mutex_unlock(&mgpu_info.mutex); 2767 } 2768 2769 return 0; 2770 } 2771 2772 /** 2773 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2774 * 2775 * @adev: amdgpu_device pointer 2776 * 2777 * For ASICs that need to disable the SMC first 2778 */ 2779 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2780 { 2781 int i, r; 2782 2783 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2784 return; 2785 2786 for (i = 0; i < adev->num_ip_blocks; i++) { 2787 if (!adev->ip_blocks[i].status.hw) 2788 continue; 2789 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2790 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2791 /* XXX handle errors */ 2792 if (r) { 2793 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2794 adev->ip_blocks[i].version->funcs->name, r); 2795 } 2796 adev->ip_blocks[i].status.hw = false; 2797 break; 2798 } 2799 } 2800 } 2801 2802 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2803 { 2804 int i, r; 2805 2806 for (i = 0; i < adev->num_ip_blocks; i++) { 2807 if (!adev->ip_blocks[i].version->funcs->early_fini) 2808 continue; 2809 2810 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2811 if (r) { 2812 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2813 adev->ip_blocks[i].version->funcs->name, r); 2814 } 2815 } 2816 2817 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2818 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2819 2820 amdgpu_amdkfd_suspend(adev, false); 2821 2822 /* Workaround for ASICs that need to disable the SMC first */ 2823 amdgpu_device_smu_fini_early(adev); 2824 2825 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2826 if (!adev->ip_blocks[i].status.hw) 2827 continue; 2828 2829 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2830 /* XXX handle errors */ 2831 if (r) { 2832 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2833 adev->ip_blocks[i].version->funcs->name, r); 2834 } 2835 2836 adev->ip_blocks[i].status.hw = false; 2837 } 2838 2839 if (amdgpu_sriov_vf(adev)) { 2840 if (amdgpu_virt_release_full_gpu(adev, false)) 2841 DRM_ERROR("failed to release exclusive mode on fini\n"); 2842 } 2843 2844 return 0; 2845 } 2846 2847 /** 2848 * amdgpu_device_ip_fini - run fini for hardware IPs 2849 * 2850 * @adev: amdgpu_device pointer 2851 * 2852 * Main teardown pass for hardware IPs. The list of all the hardware 2853 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2854 * are run. hw_fini tears down the hardware associated with each IP 2855 * and sw_fini tears down any software state associated with each IP. 2856 * Returns 0 on success, negative error code on failure.
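 *
 * Rough ordering, as implemented below (sketch): XGMI and KFD teardown
 * first, then sw_fini for every block in reverse order (the GMC block
 * additionally frees the ucode BO, static CSA, writeback, memory scratch
 * and IB pool), then late_fini in reverse order, and finally
 * amdgpu_ras_fini().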
2857 */ 2858 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2859 { 2860 int i, r; 2861 2862 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2863 amdgpu_virt_release_ras_err_handler_data(adev); 2864 2865 if (adev->gmc.xgmi.num_physical_nodes > 1) 2866 amdgpu_xgmi_remove_device(adev); 2867 2868 amdgpu_amdkfd_device_fini_sw(adev); 2869 2870 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2871 if (!adev->ip_blocks[i].status.sw) 2872 continue; 2873 2874 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2875 amdgpu_ucode_free_bo(adev); 2876 amdgpu_free_static_csa(&adev->virt.csa_obj); 2877 amdgpu_device_wb_fini(adev); 2878 amdgpu_device_mem_scratch_fini(adev); 2879 amdgpu_ib_pool_fini(adev); 2880 } 2881 2882 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2883 /* XXX handle errors */ 2884 if (r) { 2885 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2886 adev->ip_blocks[i].version->funcs->name, r); 2887 } 2888 adev->ip_blocks[i].status.sw = false; 2889 adev->ip_blocks[i].status.valid = false; 2890 } 2891 2892 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2893 if (!adev->ip_blocks[i].status.late_initialized) 2894 continue; 2895 if (adev->ip_blocks[i].version->funcs->late_fini) 2896 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2897 adev->ip_blocks[i].status.late_initialized = false; 2898 } 2899 2900 amdgpu_ras_fini(adev); 2901 2902 return 0; 2903 } 2904 2905 /** 2906 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2907 * 2908 * @work: work_struct. 2909 */ 2910 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2911 { 2912 struct amdgpu_device *adev = 2913 container_of(work, struct amdgpu_device, delayed_init_work.work); 2914 int r; 2915 2916 r = amdgpu_ib_ring_tests(adev); 2917 if (r) 2918 DRM_ERROR("ib ring test failed (%d).\n", r); 2919 } 2920 2921 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2922 { 2923 struct amdgpu_device *adev = 2924 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2925 2926 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2927 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2928 2929 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2930 adev->gfx.gfx_off_state = true; 2931 } 2932 2933 /** 2934 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2935 * 2936 * @adev: amdgpu_device pointer 2937 * 2938 * Main suspend function for hardware IPs. The list of all the hardware 2939 * IPs that make up the asic is walked, clockgating is disabled and the 2940 * suspend callbacks are run. suspend puts the hardware and software state 2941 * in each IP into a state suitable for suspend. 2942 * Returns 0 on success, negative error code on failure. 2943 */ 2944 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2945 { 2946 int i, r; 2947 2948 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2949 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2950 2951 /* 2952 * Per PMFW team's suggestion, driver needs to handle gfxoff 2953 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2954 * scenario. Add the missing df cstate disablement here. 
2955 */ 2956 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2957 dev_warn(adev->dev, "Failed to disallow df cstate"); 2958 2959 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2960 if (!adev->ip_blocks[i].status.valid) 2961 continue; 2962 2963 /* displays are handled separately */ 2964 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2965 continue; 2966 2967 /* XXX handle errors */ 2968 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2969 /* XXX handle errors */ 2970 if (r) { 2971 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2972 adev->ip_blocks[i].version->funcs->name, r); 2973 return r; 2974 } 2975 2976 adev->ip_blocks[i].status.hw = false; 2977 } 2978 2979 return 0; 2980 } 2981 2982 /** 2983 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2984 * 2985 * @adev: amdgpu_device pointer 2986 * 2987 * Main suspend function for hardware IPs. The list of all the hardware 2988 * IPs that make up the asic is walked, clockgating is disabled and the 2989 * suspend callbacks are run. suspend puts the hardware and software state 2990 * in each IP into a state suitable for suspend. 2991 * Returns 0 on success, negative error code on failure. 2992 */ 2993 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2994 { 2995 int i, r; 2996 2997 if (adev->in_s0ix) 2998 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2999 3000 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3001 if (!adev->ip_blocks[i].status.valid) 3002 continue; 3003 /* displays are handled in phase1 */ 3004 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3005 continue; 3006 /* PSP lost connection when err_event_athub occurs */ 3007 if (amdgpu_ras_intr_triggered() && 3008 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3009 adev->ip_blocks[i].status.hw = false; 3010 continue; 3011 } 3012 3013 /* skip unnecessary suspend if we do not initialize them yet */ 3014 if (adev->gmc.xgmi.pending_reset && 3015 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3016 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3017 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3018 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3019 adev->ip_blocks[i].status.hw = false; 3020 continue; 3021 } 3022 3023 /* skip suspend of gfx/mes and psp for S0ix 3024 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3025 * like at runtime. PSP is also part of the always on hardware 3026 * so no need to suspend it. 
3027 */ 3028 if (adev->in_s0ix && 3029 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3030 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3031 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3032 continue; 3033 3034 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3035 if (adev->in_s0ix && 3036 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && 3037 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3038 continue; 3039 3040 /* XXX handle errors */ 3041 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3042 /* XXX handle errors */ 3043 if (r) { 3044 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3045 adev->ip_blocks[i].version->funcs->name, r); 3046 } 3047 adev->ip_blocks[i].status.hw = false; 3048 /* handle putting the SMC in the appropriate state */ 3049 if(!amdgpu_sriov_vf(adev)){ 3050 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3051 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3052 if (r) { 3053 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3054 adev->mp1_state, r); 3055 return r; 3056 } 3057 } 3058 } 3059 } 3060 3061 return 0; 3062 } 3063 3064 /** 3065 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3066 * 3067 * @adev: amdgpu_device pointer 3068 * 3069 * Main suspend function for hardware IPs. The list of all the hardware 3070 * IPs that make up the asic is walked, clockgating is disabled and the 3071 * suspend callbacks are run. suspend puts the hardware and software state 3072 * in each IP into a state suitable for suspend. 3073 * Returns 0 on success, negative error code on failure. 3074 */ 3075 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3076 { 3077 int r; 3078 3079 if (amdgpu_sriov_vf(adev)) { 3080 amdgpu_virt_fini_data_exchange(adev); 3081 amdgpu_virt_request_full_gpu(adev, false); 3082 } 3083 3084 r = amdgpu_device_ip_suspend_phase1(adev); 3085 if (r) 3086 return r; 3087 r = amdgpu_device_ip_suspend_phase2(adev); 3088 3089 if (amdgpu_sriov_vf(adev)) 3090 amdgpu_virt_release_full_gpu(adev, false); 3091 3092 return r; 3093 } 3094 3095 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3096 { 3097 int i, r; 3098 3099 static enum amd_ip_block_type ip_order[] = { 3100 AMD_IP_BLOCK_TYPE_COMMON, 3101 AMD_IP_BLOCK_TYPE_GMC, 3102 AMD_IP_BLOCK_TYPE_PSP, 3103 AMD_IP_BLOCK_TYPE_IH, 3104 }; 3105 3106 for (i = 0; i < adev->num_ip_blocks; i++) { 3107 int j; 3108 struct amdgpu_ip_block *block; 3109 3110 block = &adev->ip_blocks[i]; 3111 block->status.hw = false; 3112 3113 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3114 3115 if (block->version->type != ip_order[j] || 3116 !block->status.valid) 3117 continue; 3118 3119 r = block->version->funcs->hw_init(adev); 3120 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3121 if (r) 3122 return r; 3123 block->status.hw = true; 3124 } 3125 } 3126 3127 return 0; 3128 } 3129 3130 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3131 { 3132 int i, r; 3133 3134 static enum amd_ip_block_type ip_order[] = { 3135 AMD_IP_BLOCK_TYPE_SMC, 3136 AMD_IP_BLOCK_TYPE_DCE, 3137 AMD_IP_BLOCK_TYPE_GFX, 3138 AMD_IP_BLOCK_TYPE_SDMA, 3139 AMD_IP_BLOCK_TYPE_UVD, 3140 AMD_IP_BLOCK_TYPE_VCE, 3141 AMD_IP_BLOCK_TYPE_VCN 3142 }; 3143 3144 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3145 int j; 3146 struct amdgpu_ip_block *block; 3147 3148 for (j = 0; j < adev->num_ip_blocks; j++) { 3149 block = &adev->ip_blocks[j]; 3150 3151 if 
(block->version->type != ip_order[i] || 3152 !block->status.valid || 3153 block->status.hw) 3154 continue; 3155 3156 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3157 r = block->version->funcs->resume(adev); 3158 else 3159 r = block->version->funcs->hw_init(adev); 3160 3161 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3162 if (r) 3163 return r; 3164 block->status.hw = true; 3165 } 3166 } 3167 3168 return 0; 3169 } 3170 3171 /** 3172 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3173 * 3174 * @adev: amdgpu_device pointer 3175 * 3176 * First resume function for hardware IPs. The list of all the hardware 3177 * IPs that make up the asic is walked and the resume callbacks are run for 3178 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3179 * after a suspend and updates the software state as necessary. This 3180 * function is also used for restoring the GPU after a GPU reset. 3181 * Returns 0 on success, negative error code on failure. 3182 */ 3183 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3184 { 3185 int i, r; 3186 3187 for (i = 0; i < adev->num_ip_blocks; i++) { 3188 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3189 continue; 3190 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3191 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3192 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3193 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3194 3195 r = adev->ip_blocks[i].version->funcs->resume(adev); 3196 if (r) { 3197 DRM_ERROR("resume of IP block <%s> failed %d\n", 3198 adev->ip_blocks[i].version->funcs->name, r); 3199 return r; 3200 } 3201 adev->ip_blocks[i].status.hw = true; 3202 } 3203 } 3204 3205 return 0; 3206 } 3207 3208 /** 3209 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3210 * 3211 * @adev: amdgpu_device pointer 3212 * 3213 * First resume function for hardware IPs. The list of all the hardware 3214 * IPs that make up the asic is walked and the resume callbacks are run for 3215 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3216 * functional state after a suspend and updates the software state as 3217 * necessary. This function is also used for restoring the GPU after a GPU 3218 * reset. 3219 * Returns 0 on success, negative error code on failure. 3220 */ 3221 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3222 { 3223 int i, r; 3224 3225 for (i = 0; i < adev->num_ip_blocks; i++) { 3226 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3227 continue; 3228 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3229 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3230 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3231 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3232 continue; 3233 r = adev->ip_blocks[i].version->funcs->resume(adev); 3234 if (r) { 3235 DRM_ERROR("resume of IP block <%s> failed %d\n", 3236 adev->ip_blocks[i].version->funcs->name, r); 3237 return r; 3238 } 3239 adev->ip_blocks[i].status.hw = true; 3240 } 3241 3242 return 0; 3243 } 3244 3245 /** 3246 * amdgpu_device_ip_resume - run resume for hardware IPs 3247 * 3248 * @adev: amdgpu_device pointer 3249 * 3250 * Main resume function for hardware IPs. 
The hardware IPs 3251 * are split into two resume functions because they are 3252 * also used in recovering from a GPU reset and some additional 3253 * steps need to be taken between them. In this case (S3/S4) they are 3254 * run sequentially. 3255 * Returns 0 on success, negative error code on failure. 3256 */ 3257 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3258 { 3259 int r; 3260 3261 r = amdgpu_amdkfd_resume_iommu(adev); 3262 if (r) 3263 return r; 3264 3265 r = amdgpu_device_ip_resume_phase1(adev); 3266 if (r) 3267 return r; 3268 3269 r = amdgpu_device_fw_loading(adev); 3270 if (r) 3271 return r; 3272 3273 r = amdgpu_device_ip_resume_phase2(adev); 3274 3275 return r; 3276 } 3277 3278 /** 3279 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3280 * 3281 * @adev: amdgpu_device pointer 3282 * 3283 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3284 */ 3285 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3286 { 3287 if (amdgpu_sriov_vf(adev)) { 3288 if (adev->is_atom_fw) { 3289 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3290 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3291 } else { 3292 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3293 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3294 } 3295 3296 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3297 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3298 } 3299 } 3300 3301 /** 3302 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3303 * 3304 * @asic_type: AMD asic type 3305 * 3306 * Check if there is DC (new modesetting infrastructure) support for an asic. 3307 * Returns true if DC has support, false if not. 3308 */ 3309 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3310 { 3311 switch (asic_type) { 3312 #ifdef CONFIG_DRM_AMDGPU_SI 3313 case CHIP_HAINAN: 3314 #endif 3315 case CHIP_TOPAZ: 3316 /* chips with no display hardware */ 3317 return false; 3318 #if defined(CONFIG_DRM_AMD_DC) 3319 case CHIP_TAHITI: 3320 case CHIP_PITCAIRN: 3321 case CHIP_VERDE: 3322 case CHIP_OLAND: 3323 /* 3324 * We have systems in the wild with these ASICs that require 3325 * LVDS and VGA support which is not supported with DC. 3326 * 3327 * Fall back to the non-DC driver here by default so as not to 3328 * cause regressions. 3329 */ 3330 #if defined(CONFIG_DRM_AMD_DC_SI) 3331 return amdgpu_dc > 0; 3332 #else 3333 return false; 3334 #endif 3335 case CHIP_BONAIRE: 3336 case CHIP_KAVERI: 3337 case CHIP_KABINI: 3338 case CHIP_MULLINS: 3339 /* 3340 * We have systems in the wild with these ASICs that require 3341 * VGA support which is not supported with DC. 3342 * 3343 * Fall back to the non-DC driver here by default so as not to 3344 * cause regressions.
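 * (In practice DC is used on these ASICs only when the user explicitly
 *  opts in, e.g. by booting with amdgpu.dc=1.)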
3345 */ 3346 return amdgpu_dc > 0; 3347 default: 3348 return amdgpu_dc != 0; 3349 #else 3350 default: 3351 if (amdgpu_dc > 0) 3352 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3353 "but isn't supported by ASIC, ignoring\n"); 3354 return false; 3355 #endif 3356 } 3357 } 3358 3359 /** 3360 * amdgpu_device_has_dc_support - check if dc is supported 3361 * 3362 * @adev: amdgpu_device pointer 3363 * 3364 * Returns true for supported, false for not supported 3365 */ 3366 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3367 { 3368 if (adev->enable_virtual_display || 3369 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3370 return false; 3371 3372 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3373 } 3374 3375 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3376 { 3377 struct amdgpu_device *adev = 3378 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3379 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3380 3381 /* It's a bug to not have a hive within this function */ 3382 if (WARN_ON(!hive)) 3383 return; 3384 3385 /* 3386 * Use task barrier to synchronize all xgmi reset works across the 3387 * hive. task_barrier_enter and task_barrier_exit will block 3388 * until all the threads running the xgmi reset works reach 3389 * those points. task_barrier_full will do both blocks. 3390 */ 3391 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3392 3393 task_barrier_enter(&hive->tb); 3394 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3395 3396 if (adev->asic_reset_res) 3397 goto fail; 3398 3399 task_barrier_exit(&hive->tb); 3400 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3401 3402 if (adev->asic_reset_res) 3403 goto fail; 3404 3405 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3406 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3407 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3408 } else { 3409 3410 task_barrier_full(&hive->tb); 3411 adev->asic_reset_res = amdgpu_asic_reset(adev); 3412 } 3413 3414 fail: 3415 if (adev->asic_reset_res) 3416 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3417 adev->asic_reset_res, adev_to_drm(adev)->unique); 3418 amdgpu_put_xgmi_hive(hive); 3419 } 3420 3421 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3422 { 3423 char *input = amdgpu_lockup_timeout; 3424 char *timeout_setting = NULL; 3425 int index = 0; 3426 long timeout; 3427 int ret = 0; 3428 3429 /* 3430 * By default timeout for non compute jobs is 10000 3431 * and 60000 for compute jobs. 3432 * In SR-IOV or passthrough mode, timeout for compute 3433 * jobs are 60000 by default. 3434 */ 3435 adev->gfx_timeout = msecs_to_jiffies(10000); 3436 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3437 if (amdgpu_sriov_vf(adev)) 3438 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3439 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3440 else 3441 adev->compute_timeout = msecs_to_jiffies(60000); 3442 3443 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3444 while ((timeout_setting = strsep(&input, ",")) && 3445 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3446 ret = kstrtol(timeout_setting, 0, &timeout); 3447 if (ret) 3448 return ret; 3449 3450 if (timeout == 0) { 3451 index++; 3452 continue; 3453 } else if (timeout < 0) { 3454 timeout = MAX_SCHEDULE_TIMEOUT; 3455 dev_warn(adev->dev, "lockup timeout disabled"); 3456 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3457 } else { 3458 timeout = msecs_to_jiffies(timeout); 3459 } 3460 3461 switch (index++) { 3462 case 0: 3463 adev->gfx_timeout = timeout; 3464 break; 3465 case 1: 3466 adev->compute_timeout = timeout; 3467 break; 3468 case 2: 3469 adev->sdma_timeout = timeout; 3470 break; 3471 case 3: 3472 adev->video_timeout = timeout; 3473 break; 3474 default: 3475 break; 3476 } 3477 } 3478 /* 3479 * There is only one value specified and 3480 * it should apply to all non-compute jobs. 3481 */ 3482 if (index == 1) { 3483 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3484 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3485 adev->compute_timeout = adev->gfx_timeout; 3486 } 3487 } 3488 3489 return ret; 3490 } 3491 3492 /** 3493 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3494 * 3495 * @adev: amdgpu_device pointer 3496 * 3497 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3498 */ 3499 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3500 { 3501 struct iommu_domain *domain; 3502 3503 domain = iommu_get_domain_for_dev(adev->dev); 3504 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3505 adev->ram_is_direct_mapped = true; 3506 } 3507 3508 static const struct attribute *amdgpu_dev_attributes[] = { 3509 &dev_attr_product_name.attr, 3510 &dev_attr_product_number.attr, 3511 &dev_attr_serial_number.attr, 3512 &dev_attr_pcie_replay_count.attr, 3513 NULL 3514 }; 3515 3516 /** 3517 * amdgpu_device_init - initialize the driver 3518 * 3519 * @adev: amdgpu_device pointer 3520 * @flags: driver flags 3521 * 3522 * Initializes the driver info and hw (all asics). 3523 * Returns 0 for success or an error on failure. 3524 * Called at driver startup. 
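 *
 * Rough phases, as implemented below (sketch): software state setup
 * (locks, work items, invalid register accessors), MMIO mapping and
 * reset-domain creation, IP early init plus vBIOS detection/posting and
 * clock setup, amdgpu_device_ip_init() for sw_init/hw_init of all IP
 * blocks, and finally sysfs registration and amdgpu_device_ip_late_init().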
3525 */ 3526 int amdgpu_device_init(struct amdgpu_device *adev, 3527 uint32_t flags) 3528 { 3529 struct drm_device *ddev = adev_to_drm(adev); 3530 struct pci_dev *pdev = adev->pdev; 3531 int r, i; 3532 bool px = false; 3533 u32 max_MBps; 3534 3535 adev->shutdown = false; 3536 adev->flags = flags; 3537 3538 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3539 adev->asic_type = amdgpu_force_asic_type; 3540 else 3541 adev->asic_type = flags & AMD_ASIC_MASK; 3542 3543 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3544 if (amdgpu_emu_mode == 1) 3545 adev->usec_timeout *= 10; 3546 adev->gmc.gart_size = 512 * 1024 * 1024; 3547 adev->accel_working = false; 3548 adev->num_rings = 0; 3549 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3550 adev->mman.buffer_funcs = NULL; 3551 adev->mman.buffer_funcs_ring = NULL; 3552 adev->vm_manager.vm_pte_funcs = NULL; 3553 adev->vm_manager.vm_pte_num_scheds = 0; 3554 adev->gmc.gmc_funcs = NULL; 3555 adev->harvest_ip_mask = 0x0; 3556 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3557 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3558 3559 adev->smc_rreg = &amdgpu_invalid_rreg; 3560 adev->smc_wreg = &amdgpu_invalid_wreg; 3561 adev->pcie_rreg = &amdgpu_invalid_rreg; 3562 adev->pcie_wreg = &amdgpu_invalid_wreg; 3563 adev->pciep_rreg = &amdgpu_invalid_rreg; 3564 adev->pciep_wreg = &amdgpu_invalid_wreg; 3565 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3566 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3567 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3568 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3569 adev->didt_rreg = &amdgpu_invalid_rreg; 3570 adev->didt_wreg = &amdgpu_invalid_wreg; 3571 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3572 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3573 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3574 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3575 3576 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3577 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3578 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3579 3580 /* mutex initialization are all done here so we 3581 * can recall function without having locking issues */ 3582 mutex_init(&adev->firmware.mutex); 3583 mutex_init(&adev->pm.mutex); 3584 mutex_init(&adev->gfx.gpu_clock_mutex); 3585 mutex_init(&adev->srbm_mutex); 3586 mutex_init(&adev->gfx.pipe_reserve_mutex); 3587 mutex_init(&adev->gfx.gfx_off_mutex); 3588 mutex_init(&adev->grbm_idx_mutex); 3589 mutex_init(&adev->mn_lock); 3590 mutex_init(&adev->virt.vf_errors.lock); 3591 hash_init(adev->mn_hash); 3592 mutex_init(&adev->psp.mutex); 3593 mutex_init(&adev->notifier_lock); 3594 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3595 mutex_init(&adev->benchmark_mutex); 3596 3597 amdgpu_device_init_apu_flags(adev); 3598 3599 r = amdgpu_device_check_arguments(adev); 3600 if (r) 3601 return r; 3602 3603 spin_lock_init(&adev->mmio_idx_lock); 3604 spin_lock_init(&adev->smc_idx_lock); 3605 spin_lock_init(&adev->pcie_idx_lock); 3606 spin_lock_init(&adev->uvd_ctx_idx_lock); 3607 spin_lock_init(&adev->didt_idx_lock); 3608 spin_lock_init(&adev->gc_cac_idx_lock); 3609 spin_lock_init(&adev->se_cac_idx_lock); 3610 spin_lock_init(&adev->audio_endpt_idx_lock); 3611 spin_lock_init(&adev->mm_stats.lock); 3612 3613 INIT_LIST_HEAD(&adev->shadow_list); 3614 mutex_init(&adev->shadow_list_lock); 3615 3616 INIT_LIST_HEAD(&adev->reset_list); 3617 3618 INIT_LIST_HEAD(&adev->ras_list); 3619 3620 
INIT_DELAYED_WORK(&adev->delayed_init_work, 3621 amdgpu_device_delayed_init_work_handler); 3622 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3623 amdgpu_device_delay_enable_gfx_off); 3624 3625 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3626 3627 adev->gfx.gfx_off_req_count = 1; 3628 adev->gfx.gfx_off_residency = 0; 3629 adev->gfx.gfx_off_entrycount = 0; 3630 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3631 3632 atomic_set(&adev->throttling_logging_enabled, 1); 3633 /* 3634 * If throttling continues, logging will be performed every minute 3635 * to avoid log flooding. "-1" is subtracted since the thermal 3636 * throttling interrupt comes every second. Thus, the total logging 3637 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 3638 * for throttling interrupt) = 60 seconds. 3639 */ 3640 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3641 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3642 3643 /* Registers mapping */ 3644 /* TODO: block userspace mapping of io register */ 3645 if (adev->asic_type >= CHIP_BONAIRE) { 3646 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3647 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3648 } else { 3649 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3650 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3651 } 3652 3653 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3654 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3655 3656 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3657 if (adev->rmmio == NULL) { 3658 return -ENOMEM; 3659 } 3660 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3661 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3662 3663 amdgpu_device_get_pcie_info(adev); 3664 3665 if (amdgpu_mcbp) 3666 DRM_INFO("MCBP is enabled\n"); 3667 3668 /* 3669 * Reset domain needs to be present early, before the XGMI hive is discovered 3670 * (if any) and initialized to use reset sem and in_gpu reset flag 3671 * early on during init and before calling RREG32.
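 * (For XGMI configurations this per-device domain is only temporary:
 *  amdgpu_device_ip_init() drops it again and takes a reference on the
 *  hive's shared reset domain instead.)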
3672 */ 3673 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3674 if (!adev->reset_domain) 3675 return -ENOMEM; 3676 3677 /* detect hw virtualization here */ 3678 amdgpu_detect_virtualization(adev); 3679 3680 r = amdgpu_device_get_job_timeout_settings(adev); 3681 if (r) { 3682 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3683 return r; 3684 } 3685 3686 /* early init functions */ 3687 r = amdgpu_device_ip_early_init(adev); 3688 if (r) 3689 return r; 3690 3691 /* Enable TMZ based on IP_VERSION */ 3692 amdgpu_gmc_tmz_set(adev); 3693 3694 amdgpu_gmc_noretry_set(adev); 3695 /* Need to get xgmi info early to decide the reset behavior*/ 3696 if (adev->gmc.xgmi.supported) { 3697 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3698 if (r) 3699 return r; 3700 } 3701 3702 /* enable PCIE atomic ops */ 3703 if (amdgpu_sriov_vf(adev)) 3704 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3705 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3706 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3707 else 3708 adev->have_atomics_support = 3709 !pci_enable_atomic_ops_to_root(adev->pdev, 3710 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3711 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3712 if (!adev->have_atomics_support) 3713 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3714 3715 /* doorbell bar mapping and doorbell index init*/ 3716 amdgpu_device_doorbell_init(adev); 3717 3718 if (amdgpu_emu_mode == 1) { 3719 /* post the asic on emulation mode */ 3720 emu_soc_asic_init(adev); 3721 goto fence_driver_init; 3722 } 3723 3724 amdgpu_reset_init(adev); 3725 3726 /* detect if we are with an SRIOV vbios */ 3727 amdgpu_device_detect_sriov_bios(adev); 3728 3729 /* check if we need to reset the asic 3730 * E.g., driver was not cleanly unloaded previously, etc. 
3731 */ 3732 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3733 if (adev->gmc.xgmi.num_physical_nodes) { 3734 dev_info(adev->dev, "Pending hive reset.\n"); 3735 adev->gmc.xgmi.pending_reset = true; 3736 /* Only need to init necessary block for SMU to handle the reset */ 3737 for (i = 0; i < adev->num_ip_blocks; i++) { 3738 if (!adev->ip_blocks[i].status.valid) 3739 continue; 3740 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3741 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3742 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3743 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3744 DRM_DEBUG("IP %s disabled for hw_init.\n", 3745 adev->ip_blocks[i].version->funcs->name); 3746 adev->ip_blocks[i].status.hw = true; 3747 } 3748 } 3749 } else { 3750 r = amdgpu_asic_reset(adev); 3751 if (r) { 3752 dev_err(adev->dev, "asic reset on init failed\n"); 3753 goto failed; 3754 } 3755 } 3756 } 3757 3758 pci_enable_pcie_error_reporting(adev->pdev); 3759 3760 /* Post card if necessary */ 3761 if (amdgpu_device_need_post(adev)) { 3762 if (!adev->bios) { 3763 dev_err(adev->dev, "no vBIOS found\n"); 3764 r = -EINVAL; 3765 goto failed; 3766 } 3767 DRM_INFO("GPU posting now...\n"); 3768 r = amdgpu_device_asic_init(adev); 3769 if (r) { 3770 dev_err(adev->dev, "gpu post error!\n"); 3771 goto failed; 3772 } 3773 } 3774 3775 if (adev->is_atom_fw) { 3776 /* Initialize clocks */ 3777 r = amdgpu_atomfirmware_get_clock_info(adev); 3778 if (r) { 3779 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3780 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3781 goto failed; 3782 } 3783 } else { 3784 /* Initialize clocks */ 3785 r = amdgpu_atombios_get_clock_info(adev); 3786 if (r) { 3787 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3788 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3789 goto failed; 3790 } 3791 /* init i2c buses */ 3792 if (!amdgpu_device_has_dc_support(adev)) 3793 amdgpu_atombios_i2c_init(adev); 3794 } 3795 3796 fence_driver_init: 3797 /* Fence driver */ 3798 r = amdgpu_fence_driver_sw_init(adev); 3799 if (r) { 3800 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3801 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3802 goto failed; 3803 } 3804 3805 /* init the mode config */ 3806 drm_mode_config_init(adev_to_drm(adev)); 3807 3808 r = amdgpu_device_ip_init(adev); 3809 if (r) { 3810 /* failed in exclusive mode due to timeout */ 3811 if (amdgpu_sriov_vf(adev) && 3812 !amdgpu_sriov_runtime(adev) && 3813 amdgpu_virt_mmio_blocked(adev) && 3814 !amdgpu_virt_wait_reset(adev)) { 3815 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3816 /* Don't send request since VF is inactive. */ 3817 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3818 adev->virt.ops = NULL; 3819 r = -EAGAIN; 3820 goto release_ras_con; 3821 } 3822 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3823 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3824 goto release_ras_con; 3825 } 3826 3827 amdgpu_fence_driver_hw_init(adev); 3828 3829 dev_info(adev->dev, 3830 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3831 adev->gfx.config.max_shader_engines, 3832 adev->gfx.config.max_sh_per_se, 3833 adev->gfx.config.max_cu_per_sh, 3834 adev->gfx.cu_info.number); 3835 3836 adev->accel_working = true; 3837 3838 amdgpu_vm_check_compute_bug(adev); 3839 3840 /* Initialize the buffer migration limit. 
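 * (Worked example for the code below: with the default of 8 MB/s,
 *  log2_max_MBps = ilog2(8) = 3, so byte budgets can later be derived
 *  with shifts instead of divisions.)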
*/ 3841 if (amdgpu_moverate >= 0) 3842 max_MBps = amdgpu_moverate; 3843 else 3844 max_MBps = 8; /* Allow 8 MB/s. */ 3845 /* Get a log2 for easy divisions. */ 3846 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3847 3848 r = amdgpu_pm_sysfs_init(adev); 3849 if (r) { 3850 adev->pm_sysfs_en = false; 3851 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 3852 } else 3853 adev->pm_sysfs_en = true; 3854 3855 r = amdgpu_ucode_sysfs_init(adev); 3856 if (r) { 3857 adev->ucode_sysfs_en = false; 3858 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3859 } else 3860 adev->ucode_sysfs_en = true; 3861 3862 r = amdgpu_psp_sysfs_init(adev); 3863 if (r) { 3864 adev->psp_sysfs_en = false; 3865 if (!amdgpu_sriov_vf(adev)) 3866 DRM_ERROR("Creating psp sysfs failed\n"); 3867 } else 3868 adev->psp_sysfs_en = true; 3869 3870 /* 3871 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3872 * Otherwise the mgpu fan boost feature will be skipped because the 3873 * gpu instance count would be too low. 3874 */ 3875 amdgpu_register_gpu_instance(adev); 3876 3877 /* enable clockgating, etc., after ib tests, since some blocks require 3878 * explicit gating rather than handling it automatically. 3879 */ 3880 if (!adev->gmc.xgmi.pending_reset) { 3881 r = amdgpu_device_ip_late_init(adev); 3882 if (r) { 3883 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3884 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3885 goto release_ras_con; 3886 } 3887 /* must succeed. */ 3888 amdgpu_ras_resume(adev); 3889 queue_delayed_work(system_wq, &adev->delayed_init_work, 3890 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3891 } 3892 3893 if (amdgpu_sriov_vf(adev)) 3894 flush_delayed_work(&adev->delayed_init_work); 3895 3896 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3897 if (r) 3898 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3899 3900 if (IS_ENABLED(CONFIG_PERF_EVENTS)) { 3901 r = amdgpu_pmu_init(adev); 3902 if (r) 3903 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3904 } 3905 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */ 3906 if (amdgpu_device_cache_pci_state(adev->pdev)) 3907 pci_restore_state(pdev); 3908 3909 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3910 /* this will fail for cards that aren't VGA class devices, just 3911 * ignore it */ 3912 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3913 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3914 3915 if (amdgpu_device_supports_px(ddev)) { 3916 px = true; 3917 vga_switcheroo_register_client(adev->pdev, 3918 &amdgpu_switcheroo_ops, px); 3919 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3920 } 3921 3922 if (adev->gmc.xgmi.pending_reset) 3923 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3924 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3925 3926 amdgpu_device_check_iommu_direct_map(adev); 3927 3928 return 0; 3929 3930 release_ras_con: 3931 amdgpu_release_ras_context(adev); 3932 3933 failed: 3934 amdgpu_vf_error_trans_all(adev); 3935 3936 return r; 3937 } 3938 3939 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3940 { 3941 3942 /* Clear all CPU mappings pointing to this device */ 3943 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3944 3945 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3946 amdgpu_device_doorbell_fini(adev); 3947 3948 iounmap(adev->rmmio); 3949 adev->rmmio = NULL; 3950 if (adev->mman.aper_base_kaddr) 3951
iounmap(adev->mman.aper_base_kaddr); 3952 adev->mman.aper_base_kaddr = NULL; 3953 3954 /* Memory manager related */ 3955 if (!adev->gmc.xgmi.connected_to_cpu) { 3956 arch_phys_wc_del(adev->gmc.vram_mtrr); 3957 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3958 } 3959 } 3960 3961 /** 3962 * amdgpu_device_fini_hw - tear down the driver 3963 * 3964 * @adev: amdgpu_device pointer 3965 * 3966 * Tear down the driver info (all asics). 3967 * Called at driver shutdown. 3968 */ 3969 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3970 { 3971 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3972 flush_delayed_work(&adev->delayed_init_work); 3973 adev->shutdown = true; 3974 3975 /* make sure IB test finished before entering exclusive mode 3976 * to avoid preemption on IB test 3977 * */ 3978 if (amdgpu_sriov_vf(adev)) { 3979 amdgpu_virt_request_full_gpu(adev, false); 3980 amdgpu_virt_fini_data_exchange(adev); 3981 } 3982 3983 /* disable all interrupts */ 3984 amdgpu_irq_disable_all(adev); 3985 if (adev->mode_info.mode_config_initialized){ 3986 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3987 drm_helper_force_disable_all(adev_to_drm(adev)); 3988 else 3989 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3990 } 3991 amdgpu_fence_driver_hw_fini(adev); 3992 3993 if (adev->mman.initialized) { 3994 flush_delayed_work(&adev->mman.bdev.wq); 3995 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3996 } 3997 3998 if (adev->pm_sysfs_en) 3999 amdgpu_pm_sysfs_fini(adev); 4000 if (adev->ucode_sysfs_en) 4001 amdgpu_ucode_sysfs_fini(adev); 4002 if (adev->psp_sysfs_en) 4003 amdgpu_psp_sysfs_fini(adev); 4004 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4005 4006 /* disable ras feature must before hw fini */ 4007 amdgpu_ras_pre_fini(adev); 4008 4009 amdgpu_device_ip_fini_early(adev); 4010 4011 amdgpu_irq_fini_hw(adev); 4012 4013 if (adev->mman.initialized) 4014 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4015 4016 amdgpu_gart_dummy_page_fini(adev); 4017 4018 amdgpu_device_unmap_mmio(adev); 4019 4020 } 4021 4022 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4023 { 4024 int idx; 4025 4026 amdgpu_fence_driver_sw_fini(adev); 4027 amdgpu_device_ip_fini(adev); 4028 release_firmware(adev->firmware.gpu_info_fw); 4029 adev->firmware.gpu_info_fw = NULL; 4030 adev->accel_working = false; 4031 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4032 4033 amdgpu_reset_fini(adev); 4034 4035 /* free i2c buses */ 4036 if (!amdgpu_device_has_dc_support(adev)) 4037 amdgpu_i2c_fini(adev); 4038 4039 if (amdgpu_emu_mode != 1) 4040 amdgpu_atombios_fini(adev); 4041 4042 kfree(adev->bios); 4043 adev->bios = NULL; 4044 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 4045 vga_switcheroo_unregister_client(adev->pdev); 4046 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4047 } 4048 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4049 vga_client_unregister(adev->pdev); 4050 4051 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4052 4053 iounmap(adev->rmmio); 4054 adev->rmmio = NULL; 4055 amdgpu_device_doorbell_fini(adev); 4056 drm_dev_exit(idx); 4057 } 4058 4059 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4060 amdgpu_pmu_fini(adev); 4061 if (adev->mman.discovery_bin) 4062 amdgpu_discovery_fini(adev); 4063 4064 amdgpu_reset_put_reset_domain(adev->reset_domain); 4065 adev->reset_domain = NULL; 4066 4067 kfree(adev->pci_state); 4068 4069 } 4070 4071 /** 4072 * amdgpu_device_evict_resources - evict device resources 4073 * @adev: amdgpu device object 4074 * 4075 * Evicts 
all ttm device resources(vram BOs, gart table) from the lru list 4076 * of the vram memory type. Mainly used for evicting device resources 4077 * at suspend time. 4078 * 4079 */ 4080 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4081 { 4082 int ret; 4083 4084 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4085 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4086 return 0; 4087 4088 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4089 if (ret) 4090 DRM_WARN("evicting device resources failed\n"); 4091 return ret; 4092 } 4093 4094 /* 4095 * Suspend & resume. 4096 */ 4097 /** 4098 * amdgpu_device_suspend - initiate device suspend 4099 * 4100 * @dev: drm dev pointer 4101 * @fbcon : notify the fbdev of suspend 4102 * 4103 * Puts the hw in the suspend state (all asics). 4104 * Returns 0 for success or an error on failure. 4105 * Called at driver suspend. 4106 */ 4107 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4108 { 4109 struct amdgpu_device *adev = drm_to_adev(dev); 4110 int r = 0; 4111 4112 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4113 return 0; 4114 4115 adev->in_suspend = true; 4116 4117 /* Evict the majority of BOs before grabbing the full access */ 4118 r = amdgpu_device_evict_resources(adev); 4119 if (r) 4120 return r; 4121 4122 if (amdgpu_sriov_vf(adev)) { 4123 amdgpu_virt_fini_data_exchange(adev); 4124 r = amdgpu_virt_request_full_gpu(adev, false); 4125 if (r) 4126 return r; 4127 } 4128 4129 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4130 DRM_WARN("smart shift update failed\n"); 4131 4132 drm_kms_helper_poll_disable(dev); 4133 4134 if (fbcon) 4135 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4136 4137 cancel_delayed_work_sync(&adev->delayed_init_work); 4138 4139 amdgpu_ras_suspend(adev); 4140 4141 amdgpu_device_ip_suspend_phase1(adev); 4142 4143 if (!adev->in_s0ix) 4144 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4145 4146 r = amdgpu_device_evict_resources(adev); 4147 if (r) 4148 return r; 4149 4150 amdgpu_fence_driver_hw_fini(adev); 4151 4152 amdgpu_device_ip_suspend_phase2(adev); 4153 4154 if (amdgpu_sriov_vf(adev)) 4155 amdgpu_virt_release_full_gpu(adev, false); 4156 4157 return 0; 4158 } 4159 4160 /** 4161 * amdgpu_device_resume - initiate device resume 4162 * 4163 * @dev: drm dev pointer 4164 * @fbcon : notify the fbdev of resume 4165 * 4166 * Bring the hw back to operating state (all asics). 4167 * Returns 0 for success or an error on failure. 4168 * Called at driver resume. 
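 *
 * A minimal sketch of how a PM callback pairs this with
 * amdgpu_device_suspend() (illustrative only; the real system and
 * runtime PM hooks live in amdgpu_drv.c and add more bookkeeping):
 *
 *   static int example_pm_suspend(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_suspend(drm_dev, true);
 *   }
 *
 *   static int example_pm_resume(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_resume(drm_dev, true);
 *   }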
4169 */ 4170 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4171 { 4172 struct amdgpu_device *adev = drm_to_adev(dev); 4173 int r = 0; 4174 4175 if (amdgpu_sriov_vf(adev)) { 4176 r = amdgpu_virt_request_full_gpu(adev, true); 4177 if (r) 4178 return r; 4179 } 4180 4181 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4182 return 0; 4183 4184 if (adev->in_s0ix) 4185 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4186 4187 /* post card */ 4188 if (amdgpu_device_need_post(adev)) { 4189 r = amdgpu_device_asic_init(adev); 4190 if (r) 4191 dev_err(adev->dev, "amdgpu asic init failed\n"); 4192 } 4193 4194 r = amdgpu_device_ip_resume(adev); 4195 4196 if (r) { 4197 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4198 goto exit; 4199 } 4200 amdgpu_fence_driver_hw_init(adev); 4201 4202 r = amdgpu_device_ip_late_init(adev); 4203 if (r) 4204 goto exit; 4205 4206 queue_delayed_work(system_wq, &adev->delayed_init_work, 4207 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4208 4209 if (!adev->in_s0ix) { 4210 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4211 if (r) 4212 goto exit; 4213 } 4214 4215 exit: 4216 if (amdgpu_sriov_vf(adev)) { 4217 amdgpu_virt_init_data_exchange(adev); 4218 amdgpu_virt_release_full_gpu(adev, true); 4219 } 4220 4221 if (r) 4222 return r; 4223 4224 /* Make sure IB tests flushed */ 4225 flush_delayed_work(&adev->delayed_init_work); 4226 4227 if (fbcon) 4228 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4229 4230 drm_kms_helper_poll_enable(dev); 4231 4232 amdgpu_ras_resume(adev); 4233 4234 if (adev->mode_info.num_crtc) { 4235 /* 4236 * Most of the connector probing functions try to acquire runtime pm 4237 * refs to ensure that the GPU is powered on when connector polling is 4238 * performed. Since we're calling this from a runtime PM callback, 4239 * trying to acquire rpm refs will cause us to deadlock. 4240 * 4241 * Since we're guaranteed to be holding the rpm lock, it's safe to 4242 * temporarily disable the rpm helpers so this doesn't deadlock us. 4243 */ 4244 #ifdef CONFIG_PM 4245 dev->dev->power.disable_depth++; 4246 #endif 4247 if (!adev->dc_enabled) 4248 drm_helper_hpd_irq_event(dev); 4249 else 4250 drm_kms_helper_hotplug_event(dev); 4251 #ifdef CONFIG_PM 4252 dev->dev->power.disable_depth--; 4253 #endif 4254 } 4255 adev->in_suspend = false; 4256 4257 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4258 DRM_WARN("smart shift update failed\n"); 4259 4260 return 0; 4261 } 4262 4263 /** 4264 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4265 * 4266 * @adev: amdgpu_device pointer 4267 * 4268 * The list of all the hardware IPs that make up the asic is walked and 4269 * the check_soft_reset callbacks are run. check_soft_reset determines 4270 * if the asic is still hung or not. 4271 * Returns true if any of the IPs are still in a hung state, false if not. 
4272 */ 4273 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4274 { 4275 int i; 4276 bool asic_hang = false; 4277 4278 if (amdgpu_sriov_vf(adev)) 4279 return true; 4280 4281 if (amdgpu_asic_need_full_reset(adev)) 4282 return true; 4283 4284 for (i = 0; i < adev->num_ip_blocks; i++) { 4285 if (!adev->ip_blocks[i].status.valid) 4286 continue; 4287 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4288 adev->ip_blocks[i].status.hang = 4289 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4290 if (adev->ip_blocks[i].status.hang) { 4291 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4292 asic_hang = true; 4293 } 4294 } 4295 return asic_hang; 4296 } 4297 4298 /** 4299 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4300 * 4301 * @adev: amdgpu_device pointer 4302 * 4303 * The list of all the hardware IPs that make up the asic is walked and the 4304 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4305 * handles any IP specific hardware or software state changes that are 4306 * necessary for a soft reset to succeed. 4307 * Returns 0 on success, negative error code on failure. 4308 */ 4309 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4310 { 4311 int i, r = 0; 4312 4313 for (i = 0; i < adev->num_ip_blocks; i++) { 4314 if (!adev->ip_blocks[i].status.valid) 4315 continue; 4316 if (adev->ip_blocks[i].status.hang && 4317 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4318 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4319 if (r) 4320 return r; 4321 } 4322 } 4323 4324 return 0; 4325 } 4326 4327 /** 4328 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4329 * 4330 * @adev: amdgpu_device pointer 4331 * 4332 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4333 * reset is necessary to recover. 4334 * Returns true if a full asic reset is required, false if not. 4335 */ 4336 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4337 { 4338 int i; 4339 4340 if (amdgpu_asic_need_full_reset(adev)) 4341 return true; 4342 4343 for (i = 0; i < adev->num_ip_blocks; i++) { 4344 if (!adev->ip_blocks[i].status.valid) 4345 continue; 4346 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4347 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4348 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4349 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4350 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4351 if (adev->ip_blocks[i].status.hang) { 4352 dev_info(adev->dev, "Some block need full reset!\n"); 4353 return true; 4354 } 4355 } 4356 } 4357 return false; 4358 } 4359 4360 /** 4361 * amdgpu_device_ip_soft_reset - do a soft reset 4362 * 4363 * @adev: amdgpu_device pointer 4364 * 4365 * The list of all the hardware IPs that make up the asic is walked and the 4366 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4367 * IP specific hardware or software state changes that are necessary to soft 4368 * reset the IP. 4369 * Returns 0 on success, negative error code on failure. 
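 *
 * Together with the check_soft_reset/pre_soft_reset/post_soft_reset
 * helpers in this file, this forms the soft-reset path taken by
 * amdgpu_device_pre_asic_reset(), roughly:
 *
 *   if (amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           r = amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *   }
 *
 * and if r is set or a block is still hung afterwards the caller
 * escalates to a full ASIC reset.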
4370 */ 4371 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4372 { 4373 int i, r = 0; 4374 4375 for (i = 0; i < adev->num_ip_blocks; i++) { 4376 if (!adev->ip_blocks[i].status.valid) 4377 continue; 4378 if (adev->ip_blocks[i].status.hang && 4379 adev->ip_blocks[i].version->funcs->soft_reset) { 4380 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4381 if (r) 4382 return r; 4383 } 4384 } 4385 4386 return 0; 4387 } 4388 4389 /** 4390 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4391 * 4392 * @adev: amdgpu_device pointer 4393 * 4394 * The list of all the hardware IPs that make up the asic is walked and the 4395 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4396 * handles any IP specific hardware or software state changes that are 4397 * necessary after the IP has been soft reset. 4398 * Returns 0 on success, negative error code on failure. 4399 */ 4400 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4401 { 4402 int i, r = 0; 4403 4404 for (i = 0; i < adev->num_ip_blocks; i++) { 4405 if (!adev->ip_blocks[i].status.valid) 4406 continue; 4407 if (adev->ip_blocks[i].status.hang && 4408 adev->ip_blocks[i].version->funcs->post_soft_reset) 4409 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4410 if (r) 4411 return r; 4412 } 4413 4414 return 0; 4415 } 4416 4417 /** 4418 * amdgpu_device_recover_vram - Recover some VRAM contents 4419 * 4420 * @adev: amdgpu_device pointer 4421 * 4422 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4423 * restore things like GPUVM page tables after a GPU reset where 4424 * the contents of VRAM might be lost. 4425 * 4426 * Returns: 4427 * 0 on success, negative error code on failure. 4428 */ 4429 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4430 { 4431 struct dma_fence *fence = NULL, *next = NULL; 4432 struct amdgpu_bo *shadow; 4433 struct amdgpu_bo_vm *vmbo; 4434 long r = 1, tmo; 4435 4436 if (amdgpu_sriov_runtime(adev)) 4437 tmo = msecs_to_jiffies(8000); 4438 else 4439 tmo = msecs_to_jiffies(100); 4440 4441 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4442 mutex_lock(&adev->shadow_list_lock); 4443 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4444 shadow = &vmbo->bo; 4445 /* No need to recover an evicted BO */ 4446 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4447 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4448 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4449 continue; 4450 4451 r = amdgpu_bo_restore_shadow(shadow, &next); 4452 if (r) 4453 break; 4454 4455 if (fence) { 4456 tmo = dma_fence_wait_timeout(fence, false, tmo); 4457 dma_fence_put(fence); 4458 fence = next; 4459 if (tmo == 0) { 4460 r = -ETIMEDOUT; 4461 break; 4462 } else if (tmo < 0) { 4463 r = tmo; 4464 break; 4465 } 4466 } else { 4467 fence = next; 4468 } 4469 } 4470 mutex_unlock(&adev->shadow_list_lock); 4471 4472 if (fence) 4473 tmo = dma_fence_wait_timeout(fence, false, tmo); 4474 dma_fence_put(fence); 4475 4476 if (r < 0 || tmo <= 0) { 4477 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4478 return -EIO; 4479 } 4480 4481 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4482 return 0; 4483 } 4484 4485 4486 /** 4487 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4488 * 4489 * @adev: amdgpu_device pointer 4490 * @from_hypervisor: request from hypervisor 4491 * 4492 * do VF FLR and reinitialize Asic 4493 * return 0 means succeeded 
otherwise failed 4494 */ 4495 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4496 bool from_hypervisor) 4497 { 4498 int r; 4499 struct amdgpu_hive_info *hive = NULL; 4500 int retry_limit = 0; 4501 4502 retry: 4503 amdgpu_amdkfd_pre_reset(adev); 4504 4505 if (from_hypervisor) 4506 r = amdgpu_virt_request_full_gpu(adev, true); 4507 else 4508 r = amdgpu_virt_reset_gpu(adev); 4509 if (r) 4510 return r; 4511 4512 /* Resume IP prior to SMC */ 4513 r = amdgpu_device_ip_reinit_early_sriov(adev); 4514 if (r) 4515 goto error; 4516 4517 amdgpu_virt_init_data_exchange(adev); 4518 4519 r = amdgpu_device_fw_loading(adev); 4520 if (r) 4521 return r; 4522 4523 /* now we are okay to resume SMC/CP/SDMA */ 4524 r = amdgpu_device_ip_reinit_late_sriov(adev); 4525 if (r) 4526 goto error; 4527 4528 hive = amdgpu_get_xgmi_hive(adev); 4529 /* Update PSP FW topology after reset */ 4530 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4531 r = amdgpu_xgmi_update_topology(hive, adev); 4532 4533 if (hive) 4534 amdgpu_put_xgmi_hive(hive); 4535 4536 if (!r) { 4537 amdgpu_irq_gpu_reset_resume_helper(adev); 4538 r = amdgpu_ib_ring_tests(adev); 4539 4540 amdgpu_amdkfd_post_reset(adev); 4541 } 4542 4543 error: 4544 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4545 amdgpu_inc_vram_lost(adev); 4546 r = amdgpu_device_recover_vram(adev); 4547 } 4548 amdgpu_virt_release_full_gpu(adev, true); 4549 4550 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4551 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4552 retry_limit++; 4553 goto retry; 4554 } else 4555 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4556 } 4557 4558 return r; 4559 } 4560 4561 /** 4562 * amdgpu_device_has_job_running - check if there is any job in mirror list 4563 * 4564 * @adev: amdgpu_device pointer 4565 * 4566 * check if there is any job in mirror list 4567 */ 4568 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4569 { 4570 int i; 4571 struct drm_sched_job *job; 4572 4573 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4574 struct amdgpu_ring *ring = adev->rings[i]; 4575 4576 if (!ring || !ring->sched.thread) 4577 continue; 4578 4579 spin_lock(&ring->sched.job_list_lock); 4580 job = list_first_entry_or_null(&ring->sched.pending_list, 4581 struct drm_sched_job, list); 4582 spin_unlock(&ring->sched.job_list_lock); 4583 if (job) 4584 return true; 4585 } 4586 return false; 4587 } 4588 4589 /** 4590 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4591 * 4592 * @adev: amdgpu_device pointer 4593 * 4594 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4595 * a hung GPU. 
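 *
 * The amdgpu_gpu_recovery module parameter is honored as follows:
 *   0  - recovery disabled
 *   1  - recovery enabled
 *   -1 - auto: enabled except on the legacy ASICs listed below
 * Unless recovery is explicitly disabled, SR-IOV VFs and devices
 * without RAS poison-mode support always report true here.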
4596 */ 4597 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4598 { 4599 4600 if (amdgpu_gpu_recovery == 0) 4601 goto disabled; 4602 4603 /* Skip soft reset check in fatal error mode */ 4604 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4605 return true; 4606 4607 if (amdgpu_sriov_vf(adev)) 4608 return true; 4609 4610 if (amdgpu_gpu_recovery == -1) { 4611 switch (adev->asic_type) { 4612 #ifdef CONFIG_DRM_AMDGPU_SI 4613 case CHIP_VERDE: 4614 case CHIP_TAHITI: 4615 case CHIP_PITCAIRN: 4616 case CHIP_OLAND: 4617 case CHIP_HAINAN: 4618 #endif 4619 #ifdef CONFIG_DRM_AMDGPU_CIK 4620 case CHIP_KAVERI: 4621 case CHIP_KABINI: 4622 case CHIP_MULLINS: 4623 #endif 4624 case CHIP_CARRIZO: 4625 case CHIP_STONEY: 4626 case CHIP_CYAN_SKILLFISH: 4627 goto disabled; 4628 default: 4629 break; 4630 } 4631 } 4632 4633 return true; 4634 4635 disabled: 4636 dev_info(adev->dev, "GPU recovery disabled.\n"); 4637 return false; 4638 } 4639 4640 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4641 { 4642 u32 i; 4643 int ret = 0; 4644 4645 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4646 4647 dev_info(adev->dev, "GPU mode1 reset\n"); 4648 4649 /* disable BM */ 4650 pci_clear_master(adev->pdev); 4651 4652 amdgpu_device_cache_pci_state(adev->pdev); 4653 4654 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4655 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4656 ret = amdgpu_dpm_mode1_reset(adev); 4657 } else { 4658 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4659 ret = psp_gpu_reset(adev); 4660 } 4661 4662 if (ret) 4663 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4664 4665 amdgpu_device_load_pci_state(adev->pdev); 4666 4667 /* wait for asic to come out of reset */ 4668 for (i = 0; i < adev->usec_timeout; i++) { 4669 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4670 4671 if (memsize != 0xffffffff) 4672 break; 4673 udelay(1); 4674 } 4675 4676 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4677 return ret; 4678 } 4679 4680 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4681 struct amdgpu_reset_context *reset_context) 4682 { 4683 int i, r = 0; 4684 struct amdgpu_job *job = NULL; 4685 bool need_full_reset = 4686 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4687 4688 if (reset_context->reset_req_dev == adev) 4689 job = reset_context->job; 4690 4691 if (amdgpu_sriov_vf(adev)) { 4692 /* stop the data exchange thread */ 4693 amdgpu_virt_fini_data_exchange(adev); 4694 } 4695 4696 amdgpu_fence_driver_isr_toggle(adev, true); 4697 4698 /* block all schedulers and reset given job's ring */ 4699 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4700 struct amdgpu_ring *ring = adev->rings[i]; 4701 4702 if (!ring || !ring->sched.thread) 4703 continue; 4704 4705 /*clear job fence from fence drv to avoid force_completion 4706 *leave NULL and vm flush fence in fence drv */ 4707 amdgpu_fence_driver_clear_job_fences(ring); 4708 4709 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4710 amdgpu_fence_driver_force_completion(ring); 4711 } 4712 4713 amdgpu_fence_driver_isr_toggle(adev, false); 4714 4715 if (job && job->vm) 4716 drm_sched_increase_karma(&job->base); 4717 4718 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4719 /* If reset handler not implemented, continue; otherwise return */ 4720 if (r == -ENOSYS) 4721 r = 0; 4722 else 4723 return r; 4724 4725 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4726 if (!amdgpu_sriov_vf(adev)) { 4727 4728 if (!need_full_reset) 4729 need_full_reset = 
amdgpu_device_ip_need_full_reset(adev); 4730 4731 if (!need_full_reset && amdgpu_gpu_recovery && 4732 amdgpu_device_ip_check_soft_reset(adev)) { 4733 amdgpu_device_ip_pre_soft_reset(adev); 4734 r = amdgpu_device_ip_soft_reset(adev); 4735 amdgpu_device_ip_post_soft_reset(adev); 4736 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4737 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4738 need_full_reset = true; 4739 } 4740 } 4741 4742 if (need_full_reset) 4743 r = amdgpu_device_ip_suspend(adev); 4744 if (need_full_reset) 4745 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4746 else 4747 clear_bit(AMDGPU_NEED_FULL_RESET, 4748 &reset_context->flags); 4749 } 4750 4751 return r; 4752 } 4753 4754 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 4755 { 4756 int i; 4757 4758 lockdep_assert_held(&adev->reset_domain->sem); 4759 4760 for (i = 0; i < adev->num_regs; i++) { 4761 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); 4762 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], 4763 adev->reset_dump_reg_value[i]); 4764 } 4765 4766 return 0; 4767 } 4768 4769 #ifdef CONFIG_DEV_COREDUMP 4770 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, 4771 size_t count, void *data, size_t datalen) 4772 { 4773 struct drm_printer p; 4774 struct amdgpu_device *adev = data; 4775 struct drm_print_iterator iter; 4776 int i; 4777 4778 iter.data = buffer; 4779 iter.offset = 0; 4780 iter.start = offset; 4781 iter.remain = count; 4782 4783 p = drm_coredump_printer(&iter); 4784 4785 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 4786 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 4787 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 4788 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); 4789 if (adev->reset_task_info.pid) 4790 drm_printf(&p, "process_name: %s PID: %d\n", 4791 adev->reset_task_info.process_name, 4792 adev->reset_task_info.pid); 4793 4794 if (adev->reset_vram_lost) 4795 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 4796 if (adev->num_regs) { 4797 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 4798 4799 for (i = 0; i < adev->num_regs; i++) 4800 drm_printf(&p, "0x%08x: 0x%08x\n", 4801 adev->reset_dump_reg_list[i], 4802 adev->reset_dump_reg_value[i]); 4803 } 4804 4805 return count - iter.remain; 4806 } 4807 4808 static void amdgpu_devcoredump_free(void *data) 4809 { 4810 } 4811 4812 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) 4813 { 4814 struct drm_device *dev = adev_to_drm(adev); 4815 4816 ktime_get_ts64(&adev->reset_time); 4817 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, 4818 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 4819 } 4820 #endif 4821 4822 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4823 struct amdgpu_reset_context *reset_context) 4824 { 4825 struct amdgpu_device *tmp_adev = NULL; 4826 bool need_full_reset, skip_hw_reset, vram_lost = false; 4827 int r = 0; 4828 bool gpu_reset_for_dev_remove = 0; 4829 4830 /* Try reset handler method first */ 4831 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4832 reset_list); 4833 amdgpu_reset_reg_dumps(tmp_adev); 4834 4835 reset_context->reset_device_list = device_list_handle; 4836 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4837 /* If reset handler not implemented, continue; otherwise return */ 4838 if (r == -ENOSYS) 4839 r = 0; 4840 else 4841 return r; 4842 4843 /* Reset handler not implemented, use the 
default method */ 4844 need_full_reset = 4845 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4846 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4847 4848 gpu_reset_for_dev_remove = 4849 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 4850 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4851 4852 /* 4853 * ASIC reset has to be done on all XGMI hive nodes ASAP 4854 * to allow proper links negotiation in FW (within 1 sec) 4855 */ 4856 if (!skip_hw_reset && need_full_reset) { 4857 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4858 /* For XGMI run all resets in parallel to speed up the process */ 4859 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4860 tmp_adev->gmc.xgmi.pending_reset = false; 4861 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4862 r = -EALREADY; 4863 } else 4864 r = amdgpu_asic_reset(tmp_adev); 4865 4866 if (r) { 4867 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4868 r, adev_to_drm(tmp_adev)->unique); 4869 break; 4870 } 4871 } 4872 4873 /* For XGMI wait for all resets to complete before proceed */ 4874 if (!r) { 4875 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4876 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4877 flush_work(&tmp_adev->xgmi_reset_work); 4878 r = tmp_adev->asic_reset_res; 4879 if (r) 4880 break; 4881 } 4882 } 4883 } 4884 } 4885 4886 if (!r && amdgpu_ras_intr_triggered()) { 4887 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4888 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && 4889 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4890 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4891 } 4892 4893 amdgpu_ras_intr_cleared(); 4894 } 4895 4896 /* Since the mode1 reset affects base ip blocks, the 4897 * phase1 ip blocks need to be resumed. Otherwise there 4898 * will be a BIOS signature error and the psp bootloader 4899 * can't load kdb on the next amdgpu install. 
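 * For the device-remove flavour of reset only this phase1 resume is
 * performed and the full re-init below is skipped, since the device
 * is about to be torn down anyway.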
4900 */ 4901 if (gpu_reset_for_dev_remove) { 4902 list_for_each_entry(tmp_adev, device_list_handle, reset_list) 4903 amdgpu_device_ip_resume_phase1(tmp_adev); 4904 4905 goto end; 4906 } 4907 4908 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4909 if (need_full_reset) { 4910 /* post card */ 4911 r = amdgpu_device_asic_init(tmp_adev); 4912 if (r) { 4913 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4914 } else { 4915 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4916 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4917 if (r) 4918 goto out; 4919 4920 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4921 if (r) 4922 goto out; 4923 4924 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4925 #ifdef CONFIG_DEV_COREDUMP 4926 tmp_adev->reset_vram_lost = vram_lost; 4927 memset(&tmp_adev->reset_task_info, 0, 4928 sizeof(tmp_adev->reset_task_info)); 4929 if (reset_context->job && reset_context->job->vm) 4930 tmp_adev->reset_task_info = 4931 reset_context->job->vm->task_info; 4932 amdgpu_reset_capture_coredumpm(tmp_adev); 4933 #endif 4934 if (vram_lost) { 4935 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4936 amdgpu_inc_vram_lost(tmp_adev); 4937 } 4938 4939 r = amdgpu_device_fw_loading(tmp_adev); 4940 if (r) 4941 return r; 4942 4943 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4944 if (r) 4945 goto out; 4946 4947 if (vram_lost) 4948 amdgpu_device_fill_reset_magic(tmp_adev); 4949 4950 /* 4951 * Add this ASIC as tracked as reset was already 4952 * complete successfully. 4953 */ 4954 amdgpu_register_gpu_instance(tmp_adev); 4955 4956 if (!reset_context->hive && 4957 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4958 amdgpu_xgmi_add_device(tmp_adev); 4959 4960 r = amdgpu_device_ip_late_init(tmp_adev); 4961 if (r) 4962 goto out; 4963 4964 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4965 4966 /* 4967 * The GPU enters bad state once faulty pages 4968 * by ECC has reached the threshold, and ras 4969 * recovery is scheduled next. So add one check 4970 * here to break recovery if it indeed exceeds 4971 * bad page threshold, and remind user to 4972 * retire this GPU or setting one bigger 4973 * bad_page_threshold value to fix this once 4974 * probing driver again. 4975 */ 4976 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4977 /* must succeed. 
*/ 4978 amdgpu_ras_resume(tmp_adev); 4979 } else { 4980 r = -EINVAL; 4981 goto out; 4982 } 4983 4984 /* Update PSP FW topology after reset */ 4985 if (reset_context->hive && 4986 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4987 r = amdgpu_xgmi_update_topology( 4988 reset_context->hive, tmp_adev); 4989 } 4990 } 4991 4992 out: 4993 if (!r) { 4994 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4995 r = amdgpu_ib_ring_tests(tmp_adev); 4996 if (r) { 4997 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4998 need_full_reset = true; 4999 r = -EAGAIN; 5000 goto end; 5001 } 5002 } 5003 5004 if (!r) 5005 r = amdgpu_device_recover_vram(tmp_adev); 5006 else 5007 tmp_adev->asic_reset_res = r; 5008 } 5009 5010 end: 5011 if (need_full_reset) 5012 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5013 else 5014 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5015 return r; 5016 } 5017 5018 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5019 { 5020 5021 switch (amdgpu_asic_reset_method(adev)) { 5022 case AMD_RESET_METHOD_MODE1: 5023 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5024 break; 5025 case AMD_RESET_METHOD_MODE2: 5026 adev->mp1_state = PP_MP1_STATE_RESET; 5027 break; 5028 default: 5029 adev->mp1_state = PP_MP1_STATE_NONE; 5030 break; 5031 } 5032 } 5033 5034 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5035 { 5036 amdgpu_vf_error_trans_all(adev); 5037 adev->mp1_state = PP_MP1_STATE_NONE; 5038 } 5039 5040 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5041 { 5042 struct pci_dev *p = NULL; 5043 5044 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5045 adev->pdev->bus->number, 1); 5046 if (p) { 5047 pm_runtime_enable(&(p->dev)); 5048 pm_runtime_resume(&(p->dev)); 5049 } 5050 5051 pci_dev_put(p); 5052 } 5053 5054 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5055 { 5056 enum amd_reset_method reset_method; 5057 struct pci_dev *p = NULL; 5058 u64 expires; 5059 5060 /* 5061 * For now, only BACO and mode1 reset are confirmed 5062 * to suffer the audio issue without proper suspended. 5063 */ 5064 reset_method = amdgpu_asic_reset_method(adev); 5065 if ((reset_method != AMD_RESET_METHOD_BACO) && 5066 (reset_method != AMD_RESET_METHOD_MODE1)) 5067 return -EINVAL; 5068 5069 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5070 adev->pdev->bus->number, 1); 5071 if (!p) 5072 return -ENODEV; 5073 5074 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5075 if (!expires) 5076 /* 5077 * If we cannot get the audio device autosuspend delay, 5078 * a fixed 4S interval will be used. Considering 3S is 5079 * the audio controller default autosuspend delay setting. 5080 * 4S used here is guaranteed to cover that. 5081 */ 5082 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5083 5084 while (!pm_runtime_status_suspended(&(p->dev))) { 5085 if (!pm_runtime_suspend(&(p->dev))) 5086 break; 5087 5088 if (expires < ktime_get_mono_fast_ns()) { 5089 dev_warn(adev->dev, "failed to suspend display audio\n"); 5090 pci_dev_put(p); 5091 /* TODO: abort the succeeding gpu reset? 
*/ 5092 return -ETIMEDOUT; 5093 } 5094 } 5095 5096 pm_runtime_disable(&(p->dev)); 5097 5098 pci_dev_put(p); 5099 return 0; 5100 } 5101 5102 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5103 { 5104 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5105 5106 #if defined(CONFIG_DEBUG_FS) 5107 if (!amdgpu_sriov_vf(adev)) 5108 cancel_work(&adev->reset_work); 5109 #endif 5110 5111 if (adev->kfd.dev) 5112 cancel_work(&adev->kfd.reset_work); 5113 5114 if (amdgpu_sriov_vf(adev)) 5115 cancel_work(&adev->virt.flr_work); 5116 5117 if (con && adev->ras_enabled) 5118 cancel_work(&con->recovery_work); 5119 5120 } 5121 5122 /** 5123 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5124 * 5125 * @adev: amdgpu_device pointer 5126 * @job: which job trigger hang 5127 * 5128 * Attempt to reset the GPU if it has hung (all asics). 5129 * Attempt to do soft-reset or full-reset and reinitialize Asic 5130 * Returns 0 for success or an error on failure. 5131 */ 5132 5133 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5134 struct amdgpu_job *job, 5135 struct amdgpu_reset_context *reset_context) 5136 { 5137 struct list_head device_list, *device_list_handle = NULL; 5138 bool job_signaled = false; 5139 struct amdgpu_hive_info *hive = NULL; 5140 struct amdgpu_device *tmp_adev = NULL; 5141 int i, r = 0; 5142 bool need_emergency_restart = false; 5143 bool audio_suspended = false; 5144 bool gpu_reset_for_dev_remove = false; 5145 5146 gpu_reset_for_dev_remove = 5147 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && 5148 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5149 5150 /* 5151 * Special case: RAS triggered and full reset isn't supported 5152 */ 5153 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5154 5155 /* 5156 * Flush RAM to disk so that after reboot 5157 * the user can read log and see why the system rebooted. 5158 */ 5159 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5160 DRM_WARN("Emergency reboot."); 5161 5162 ksys_sync_helper(); 5163 emergency_restart(); 5164 } 5165 5166 dev_info(adev->dev, "GPU %s begin!\n", 5167 need_emergency_restart ? "jobs stop":"reset"); 5168 5169 if (!amdgpu_sriov_vf(adev)) 5170 hive = amdgpu_get_xgmi_hive(adev); 5171 if (hive) 5172 mutex_lock(&hive->hive_lock); 5173 5174 reset_context->job = job; 5175 reset_context->hive = hive; 5176 /* 5177 * Build list of devices to reset. 5178 * In case we are in XGMI hive mode, resort the device list 5179 * to put adev in the 1st position. 
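 * The hung adev is rotated to the front so it is the first device
 * handled in the pre/post reset loops below and the one whose reset
 * domain is locked for the whole recovery.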
5180 */ 5181 INIT_LIST_HEAD(&device_list); 5182 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5183 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5184 list_add_tail(&tmp_adev->reset_list, &device_list); 5185 if (gpu_reset_for_dev_remove && adev->shutdown) 5186 tmp_adev->shutdown = true; 5187 } 5188 if (!list_is_first(&adev->reset_list, &device_list)) 5189 list_rotate_to_front(&adev->reset_list, &device_list); 5190 device_list_handle = &device_list; 5191 } else { 5192 list_add_tail(&adev->reset_list, &device_list); 5193 device_list_handle = &device_list; 5194 } 5195 5196 /* We need to lock reset domain only once both for XGMI and single device */ 5197 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5198 reset_list); 5199 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5200 5201 /* block all schedulers and reset given job's ring */ 5202 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5203 5204 amdgpu_device_set_mp1_state(tmp_adev); 5205 5206 /* 5207 * Try to put the audio codec into suspend state 5208 * before gpu reset started. 5209 * 5210 * Due to the power domain of the graphics device 5211 * is shared with AZ power domain. Without this, 5212 * we may change the audio hardware from behind 5213 * the audio driver's back. That will trigger 5214 * some audio codec errors. 5215 */ 5216 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5217 audio_suspended = true; 5218 5219 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5220 5221 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5222 5223 if (!amdgpu_sriov_vf(tmp_adev)) 5224 amdgpu_amdkfd_pre_reset(tmp_adev); 5225 5226 /* 5227 * Mark these ASICs to be reseted as untracked first 5228 * And add them back after reset completed 5229 */ 5230 amdgpu_unregister_gpu_instance(tmp_adev); 5231 5232 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5233 5234 /* disable ras on ALL IPs */ 5235 if (!need_emergency_restart && 5236 amdgpu_device_ip_need_full_reset(tmp_adev)) 5237 amdgpu_ras_suspend(tmp_adev); 5238 5239 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5240 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5241 5242 if (!ring || !ring->sched.thread) 5243 continue; 5244 5245 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5246 5247 if (need_emergency_restart) 5248 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5249 } 5250 atomic_inc(&tmp_adev->gpu_reset_counter); 5251 } 5252 5253 if (need_emergency_restart) 5254 goto skip_sched_resume; 5255 5256 /* 5257 * Must check guilty signal here since after this point all old 5258 * HW fences are force signaled. 5259 * 5260 * job->base holds a reference to parent fence 5261 */ 5262 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5263 job_signaled = true; 5264 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5265 goto skip_hw_reset; 5266 } 5267 5268 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5269 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5270 if (gpu_reset_for_dev_remove) { 5271 /* Workaroud for ASICs need to disable SMC first */ 5272 amdgpu_device_smu_fini_early(tmp_adev); 5273 } 5274 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5275 /*TODO Should we stop ?*/ 5276 if (r) { 5277 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5278 r, adev_to_drm(tmp_adev)->unique); 5279 tmp_adev->asic_reset_res = r; 5280 } 5281 5282 /* 5283 * Drop all pending non scheduler resets. 
Scheduler resets 5284 * were already dropped during drm_sched_stop 5285 */ 5286 amdgpu_device_stop_pending_resets(tmp_adev); 5287 } 5288 5289 /* Actual ASIC resets if needed.*/ 5290 /* Host driver will handle XGMI hive reset for SRIOV */ 5291 if (amdgpu_sriov_vf(adev)) { 5292 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5293 if (r) 5294 adev->asic_reset_res = r; 5295 5296 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ 5297 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) 5298 amdgpu_ras_resume(adev); 5299 } else { 5300 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5301 if (r && r == -EAGAIN) 5302 goto retry; 5303 5304 if (!r && gpu_reset_for_dev_remove) 5305 goto recover_end; 5306 } 5307 5308 skip_hw_reset: 5309 5310 /* Post ASIC reset for all devs .*/ 5311 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5312 5313 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5314 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5315 5316 if (!ring || !ring->sched.thread) 5317 continue; 5318 5319 drm_sched_start(&ring->sched, true); 5320 } 5321 5322 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) 5323 amdgpu_mes_self_test(tmp_adev); 5324 5325 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5326 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5327 } 5328 5329 if (tmp_adev->asic_reset_res) 5330 r = tmp_adev->asic_reset_res; 5331 5332 tmp_adev->asic_reset_res = 0; 5333 5334 if (r) { 5335 /* bad news, how to tell it to userspace ? */ 5336 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5337 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5338 } else { 5339 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5340 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5341 DRM_WARN("smart shift update failed\n"); 5342 } 5343 } 5344 5345 skip_sched_resume: 5346 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5347 /* unlock kfd: SRIOV would do it separately */ 5348 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5349 amdgpu_amdkfd_post_reset(tmp_adev); 5350 5351 /* kfd_post_reset will do nothing if kfd device is not initialized, 5352 * need to bring up kfd here if it's not be initialized before 5353 */ 5354 if (!adev->kfd.init_complete) 5355 amdgpu_amdkfd_device_init(adev); 5356 5357 if (audio_suspended) 5358 amdgpu_device_resume_display_audio(tmp_adev); 5359 5360 amdgpu_device_unset_mp1_state(tmp_adev); 5361 5362 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5363 } 5364 5365 recover_end: 5366 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5367 reset_list); 5368 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5369 5370 if (hive) { 5371 mutex_unlock(&hive->hive_lock); 5372 amdgpu_put_xgmi_hive(hive); 5373 } 5374 5375 if (r) 5376 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5377 5378 atomic_set(&adev->reset_domain->reset_res, r); 5379 return r; 5380 } 5381 5382 /** 5383 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5384 * 5385 * @adev: amdgpu_device pointer 5386 * 5387 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5388 * and lanes) of the slot the device is in. Handles APUs and 5389 * virtualized environments where PCIE config space may not be available. 
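 *
 * The result is cached as CAIL_* capability bits in
 * adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask for later use by
 * the power-management code when selecting PCIe DPM levels. Both masks
 * can be forced with the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap
 * module parameters, and devices on a root bus (e.g. APUs) simply get
 * the AMDGPU_DEFAULT_* masks.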
5390 */ 5391 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5392 { 5393 struct pci_dev *pdev; 5394 enum pci_bus_speed speed_cap, platform_speed_cap; 5395 enum pcie_link_width platform_link_width; 5396 5397 if (amdgpu_pcie_gen_cap) 5398 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5399 5400 if (amdgpu_pcie_lane_cap) 5401 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5402 5403 /* covers APUs as well */ 5404 if (pci_is_root_bus(adev->pdev->bus)) { 5405 if (adev->pm.pcie_gen_mask == 0) 5406 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5407 if (adev->pm.pcie_mlw_mask == 0) 5408 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5409 return; 5410 } 5411 5412 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5413 return; 5414 5415 pcie_bandwidth_available(adev->pdev, NULL, 5416 &platform_speed_cap, &platform_link_width); 5417 5418 if (adev->pm.pcie_gen_mask == 0) { 5419 /* asic caps */ 5420 pdev = adev->pdev; 5421 speed_cap = pcie_get_speed_cap(pdev); 5422 if (speed_cap == PCI_SPEED_UNKNOWN) { 5423 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5424 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5425 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5426 } else { 5427 if (speed_cap == PCIE_SPEED_32_0GT) 5428 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5429 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5430 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5431 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5432 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5433 else if (speed_cap == PCIE_SPEED_16_0GT) 5434 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5435 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5436 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5437 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5438 else if (speed_cap == PCIE_SPEED_8_0GT) 5439 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5440 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5441 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5442 else if (speed_cap == PCIE_SPEED_5_0GT) 5443 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5444 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5445 else 5446 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5447 } 5448 /* platform caps */ 5449 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5450 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5451 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5452 } else { 5453 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5454 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5455 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5456 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5457 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5458 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5459 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5460 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5461 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5462 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5463 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5464 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5465 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5466 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5467 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5468 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5469 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5470 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5471 else 5472 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5473 5474 } 5475 } 5476 if (adev->pm.pcie_mlw_mask == 0) { 5477 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5478 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5479 } else { 5480 switch (platform_link_width) { 5481 case PCIE_LNK_X32: 5482 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5483 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5484 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5485 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5486 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5487 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5488 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5489 break; 5490 case PCIE_LNK_X16: 5491 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5492 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5493 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5494 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5495 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5496 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5497 break; 5498 case PCIE_LNK_X12: 5499 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5500 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5501 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5502 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5503 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5504 break; 5505 case PCIE_LNK_X8: 5506 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5507 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5508 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5509 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5510 break; 5511 case PCIE_LNK_X4: 5512 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5513 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5514 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5515 break; 5516 case PCIE_LNK_X2: 5517 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5518 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5519 break; 5520 case PCIE_LNK_X1: 5521 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5522 break; 5523 default: 5524 break; 5525 } 5526 } 5527 } 5528 } 5529 5530 /** 5531 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 5532 * 5533 * @adev: amdgpu_device pointer 5534 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 5535 * 5536 * Return true if @peer_adev can access (DMA) @adev through the PCIe 5537 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 5538 * @peer_adev. 5539 */ 5540 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 5541 struct amdgpu_device *peer_adev) 5542 { 5543 #ifdef CONFIG_HSA_AMD_P2P 5544 uint64_t address_mask = peer_adev->dev->dma_mask ? 
5545 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 5546 resource_size_t aper_limit = 5547 adev->gmc.aper_base + adev->gmc.aper_size - 1; 5548 bool p2p_access = 5549 !adev->gmc.xgmi.connected_to_cpu && 5550 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 5551 5552 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && 5553 adev->gmc.real_vram_size == adev->gmc.visible_vram_size && 5554 !(adev->gmc.aper_base & address_mask || 5555 aper_limit & address_mask)); 5556 #else 5557 return false; 5558 #endif 5559 } 5560 5561 int amdgpu_device_baco_enter(struct drm_device *dev) 5562 { 5563 struct amdgpu_device *adev = drm_to_adev(dev); 5564 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5565 5566 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5567 return -ENOTSUPP; 5568 5569 if (ras && adev->ras_enabled && 5570 adev->nbio.funcs->enable_doorbell_interrupt) 5571 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5572 5573 return amdgpu_dpm_baco_enter(adev); 5574 } 5575 5576 int amdgpu_device_baco_exit(struct drm_device *dev) 5577 { 5578 struct amdgpu_device *adev = drm_to_adev(dev); 5579 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5580 int ret = 0; 5581 5582 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5583 return -ENOTSUPP; 5584 5585 ret = amdgpu_dpm_baco_exit(adev); 5586 if (ret) 5587 return ret; 5588 5589 if (ras && adev->ras_enabled && 5590 adev->nbio.funcs->enable_doorbell_interrupt) 5591 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5592 5593 if (amdgpu_passthrough(adev) && 5594 adev->nbio.funcs->clear_doorbell_interrupt) 5595 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5596 5597 return 0; 5598 } 5599 5600 /** 5601 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5602 * @pdev: PCI device struct 5603 * @state: PCI channel state 5604 * 5605 * Description: Called when a PCI error is detected. 5606 * 5607 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
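 *
 * This is one of the PCIe AER callbacks. A driver wires them up through
 * a struct pci_error_handlers, for example (sketch only; the real table
 * lives in amdgpu_drv.c):
 *
 *   static const struct pci_error_handlers example_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };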
5608 */ 5609 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5610 { 5611 struct drm_device *dev = pci_get_drvdata(pdev); 5612 struct amdgpu_device *adev = drm_to_adev(dev); 5613 int i; 5614 5615 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5616 5617 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5618 DRM_WARN("No support for XGMI hive yet..."); 5619 return PCI_ERS_RESULT_DISCONNECT; 5620 } 5621 5622 adev->pci_channel_state = state; 5623 5624 switch (state) { 5625 case pci_channel_io_normal: 5626 return PCI_ERS_RESULT_CAN_RECOVER; 5627 /* Fatal error, prepare for slot reset */ 5628 case pci_channel_io_frozen: 5629 /* 5630 * Locking adev->reset_domain->sem will prevent any external access 5631 * to GPU during PCI error recovery 5632 */ 5633 amdgpu_device_lock_reset_domain(adev->reset_domain); 5634 amdgpu_device_set_mp1_state(adev); 5635 5636 /* 5637 * Block any work scheduling as we do for regular GPU reset 5638 * for the duration of the recovery 5639 */ 5640 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5641 struct amdgpu_ring *ring = adev->rings[i]; 5642 5643 if (!ring || !ring->sched.thread) 5644 continue; 5645 5646 drm_sched_stop(&ring->sched, NULL); 5647 } 5648 atomic_inc(&adev->gpu_reset_counter); 5649 return PCI_ERS_RESULT_NEED_RESET; 5650 case pci_channel_io_perm_failure: 5651 /* Permanent error, prepare for device removal */ 5652 return PCI_ERS_RESULT_DISCONNECT; 5653 } 5654 5655 return PCI_ERS_RESULT_NEED_RESET; 5656 } 5657 5658 /** 5659 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5660 * @pdev: pointer to PCI device 5661 */ 5662 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5663 { 5664 5665 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5666 5667 /* TODO - dump whatever for debugging purposes */ 5668 5669 /* This called only if amdgpu_pci_error_detected returns 5670 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5671 * works, no need to reset slot. 5672 */ 5673 5674 return PCI_ERS_RESULT_RECOVERED; 5675 } 5676 5677 /** 5678 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5679 * @pdev: PCI device struct 5680 * 5681 * Description: This routine is called by the pci error recovery 5682 * code after the PCI slot has been reset, just before we 5683 * should resume normal operations. 
5684 */ 5685 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5686 { 5687 struct drm_device *dev = pci_get_drvdata(pdev); 5688 struct amdgpu_device *adev = drm_to_adev(dev); 5689 int r, i; 5690 struct amdgpu_reset_context reset_context; 5691 u32 memsize; 5692 struct list_head device_list; 5693 5694 DRM_INFO("PCI error: slot reset callback!!\n"); 5695 5696 memset(&reset_context, 0, sizeof(reset_context)); 5697 5698 INIT_LIST_HEAD(&device_list); 5699 list_add_tail(&adev->reset_list, &device_list); 5700 5701 /* wait for asic to come out of reset */ 5702 msleep(500); 5703 5704 /* Restore PCI confspace */ 5705 amdgpu_device_load_pci_state(pdev); 5706 5707 /* confirm ASIC came out of reset */ 5708 for (i = 0; i < adev->usec_timeout; i++) { 5709 memsize = amdgpu_asic_get_config_memsize(adev); 5710 5711 if (memsize != 0xffffffff) 5712 break; 5713 udelay(1); 5714 } 5715 if (memsize == 0xffffffff) { 5716 r = -ETIME; 5717 goto out; 5718 } 5719 5720 reset_context.method = AMD_RESET_METHOD_NONE; 5721 reset_context.reset_req_dev = adev; 5722 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5723 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5724 5725 adev->no_hw_access = true; 5726 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5727 adev->no_hw_access = false; 5728 if (r) 5729 goto out; 5730 5731 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5732 5733 out: 5734 if (!r) { 5735 if (amdgpu_device_cache_pci_state(adev->pdev)) 5736 pci_restore_state(adev->pdev); 5737 5738 DRM_INFO("PCIe error recovery succeeded\n"); 5739 } else { 5740 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5741 amdgpu_device_unset_mp1_state(adev); 5742 amdgpu_device_unlock_reset_domain(adev->reset_domain); 5743 } 5744 5745 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5746 } 5747 5748 /** 5749 * amdgpu_pci_resume() - resume normal ops after PCI reset 5750 * @pdev: pointer to PCI device 5751 * 5752 * Called when the error recovery driver tells us that its 5753 * OK to resume normal operation. 
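 *
 * Only the pci_channel_io_frozen path in amdgpu_pci_error_detected()
 * stopped the schedulers and took the reset domain lock, so the ring
 * restart and unlock below are skipped for any other channel state.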

/**
 * amdgpu_device_cache_pci_state - cache the PCI config space of the device
 * @pdev: pointer to PCI device
 *
 * Saves the PCI configuration space and stores a copy in adev->pci_state so
 * that it can be restored after a GPU or PCI reset.
 *
 * Returns true on success, false otherwise.
 */
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 * @pdev: pointer to PCI device
 *
 * Loads the configuration space previously cached by
 * amdgpu_device_cache_pci_state() and writes it back to the device.
 *
 * Returns true on success, false otherwise.
 */
bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}
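
/*
 * Illustrative sketch, not part of the original file: the intended pairing of
 * the two helpers above is to cache the PCI config space once the device is
 * fully initialized, and to load it back after an ASIC reset before register
 * access resumes. The helper name below is hypothetical.
 */
static void amdgpu_pci_state_cache_example(struct amdgpu_device *adev)
{
	/* After init: snapshot a known-good PCI config space. */
	if (!amdgpu_device_cache_pci_state(adev->pdev))
		DRM_WARN("No cached PCI state, recovery will rely on defaults\n");

	/* ... ASIC reset would happen here ... */

	/* After reset: write the snapshot back before touching registers. */
	if (!amdgpu_device_load_pci_state(adev->pdev))
		DRM_WARN("Failed to restore cached PCI state\n");
}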

/**
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path)
 * @adev: amdgpu_device pointer
 * @ring: ring to emit the flush on, or NULL to flush via MMIO
 *
 * Flushes the HDP so that CPU writes reach memory before the GPU consumes
 * them. Skipped on APUs (except under passthrough) and on GPUs connected
 * to the CPU via XGMI.
 */
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP read cache
 * @adev: amdgpu_device pointer
 * @ring: ring associated with the invalidation, may be NULL
 *
 * Invalidates the HDP read cache so that subsequent CPU reads observe data
 * written by the GPU. Skipped under the same conditions as
 * amdgpu_device_flush_hdp().
 */
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 * @adev: amdgpu_device pointer
 * @reg: register index (in dwords)
 *
 * Reads the register through the NBIO index/data pair under the
 * pcie_idx_lock. Returns the register value.
 */
u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 * @adev: amdgpu_device pointer
 * @reg: register index (in dwords)
 * @v: value to write
 *
 * Writes the register through the NBIO index/data pair under the
 * pcie_idx_lock.
 */
void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang, or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

/**
 * amdgpu_device_has_display_hardware - check whether the ASIC has display hardware
 * @adev: amdgpu_device pointer
 *
 * Returns true if the ASIC provides display hardware, based on the ASIC type
 * for legacy parts, or on IP discovery and the DMU harvest mask otherwise.
 */
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!adev->ip_versions[DCE_HWIP][0] ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}
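
/*
 * Illustrative sketch, not part of the original file: given the return
 * semantics of amdgpu_device_switch_gang() documented above, a caller that
 * must install a new gang leader synchronously could loop until the switch
 * succeeds, waiting for the previous leader in between. The helper name is
 * hypothetical; actual submission paths may instead treat the returned fence
 * as a scheduler dependency.
 */
static int amdgpu_device_switch_gang_sync_example(struct amdgpu_device *adev,
						  struct dma_fence *gang)
{
	struct dma_fence *old;
	long r;

	while ((old = amdgpu_device_switch_gang(adev, gang))) {
		/* The previous gang leader is still running, wait for it. */
		r = dma_fence_wait(old, true);
		dma_fence_put(old);
		if (r)
			return r;
	}

	return 0;
}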