1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 #include "amdgpu_reset.h" 69 70 #include <linux/suspend.h> 71 #include <drm/task_barrier.h> 72 #include <linux/pm_runtime.h> 73 74 #include <drm/drm_drv.h> 75 76 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 85 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 86 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 87 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); 88 89 #define AMDGPU_RESUME_MS 2000 90 91 const char *amdgpu_asic_name[] = { 92 "TAHITI", 93 "PITCAIRN", 94 "VERDE", 95 "OLAND", 96 "HAINAN", 97 "BONAIRE", 98 "KAVERI", 99 "KABINI", 100 "HAWAII", 101 "MULLINS", 102 "TOPAZ", 103 "TONGA", 104 "FIJI", 105 "CARRIZO", 106 "STONEY", 107 "POLARIS10", 108 "POLARIS11", 109 "POLARIS12", 110 "VEGAM", 111 "VEGA10", 112 "VEGA12", 113 "VEGA20", 114 "RAVEN", 115 "ARCTURUS", 116 "RENOIR", 117 "ALDEBARAN", 118 "NAVI10", 
119 "NAVI14", 120 "NAVI12", 121 "SIENNA_CICHLID", 122 "NAVY_FLOUNDER", 123 "VANGOGH", 124 "DIMGREY_CAVEFISH", 125 "BEIGE_GOBY", 126 "YELLOW_CARP", 127 "LAST", 128 }; 129 130 /** 131 * DOC: pcie_replay_count 132 * 133 * The amdgpu driver provides a sysfs API for reporting the total number 134 * of PCIe replays (NAKs) 135 * The file pcie_replay_count is used for this and returns the total 136 * number of replays as a sum of the NAKs generated and NAKs received 137 */ 138 139 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 140 struct device_attribute *attr, char *buf) 141 { 142 struct drm_device *ddev = dev_get_drvdata(dev); 143 struct amdgpu_device *adev = drm_to_adev(ddev); 144 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 145 146 return sysfs_emit(buf, "%llu\n", cnt); 147 } 148 149 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 150 amdgpu_device_get_pcie_replay_count, NULL); 151 152 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 153 154 /** 155 * DOC: product_name 156 * 157 * The amdgpu driver provides a sysfs API for reporting the product name 158 * for the device 159 * The file serial_number is used for this and returns the product name 160 * as returned from the FRU. 161 * NOTE: This is only available for certain server cards 162 */ 163 164 static ssize_t amdgpu_device_get_product_name(struct device *dev, 165 struct device_attribute *attr, char *buf) 166 { 167 struct drm_device *ddev = dev_get_drvdata(dev); 168 struct amdgpu_device *adev = drm_to_adev(ddev); 169 170 return sysfs_emit(buf, "%s\n", adev->product_name); 171 } 172 173 static DEVICE_ATTR(product_name, S_IRUGO, 174 amdgpu_device_get_product_name, NULL); 175 176 /** 177 * DOC: product_number 178 * 179 * The amdgpu driver provides a sysfs API for reporting the part number 180 * for the device 181 * The file serial_number is used for this and returns the part number 182 * as returned from the FRU. 183 * NOTE: This is only available for certain server cards 184 */ 185 186 static ssize_t amdgpu_device_get_product_number(struct device *dev, 187 struct device_attribute *attr, char *buf) 188 { 189 struct drm_device *ddev = dev_get_drvdata(dev); 190 struct amdgpu_device *adev = drm_to_adev(ddev); 191 192 return sysfs_emit(buf, "%s\n", adev->product_number); 193 } 194 195 static DEVICE_ATTR(product_number, S_IRUGO, 196 amdgpu_device_get_product_number, NULL); 197 198 /** 199 * DOC: serial_number 200 * 201 * The amdgpu driver provides a sysfs API for reporting the serial number 202 * for the device 203 * The file serial_number is used for this and returns the serial number 204 * as returned from the FRU. 205 * NOTE: This is only available for certain server cards 206 */ 207 208 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 209 struct device_attribute *attr, char *buf) 210 { 211 struct drm_device *ddev = dev_get_drvdata(dev); 212 struct amdgpu_device *adev = drm_to_adev(ddev); 213 214 return sysfs_emit(buf, "%s\n", adev->serial); 215 } 216 217 static DEVICE_ATTR(serial_number, S_IRUGO, 218 amdgpu_device_get_serial_number, NULL); 219 220 /** 221 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 222 * 223 * @dev: drm_device pointer 224 * 225 * Returns true if the device is a dGPU with ATPX power control, 226 * otherwise return false. 
227 */ 228 bool amdgpu_device_supports_px(struct drm_device *dev) 229 { 230 struct amdgpu_device *adev = drm_to_adev(dev); 231 232 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 233 return true; 234 return false; 235 } 236 237 /** 238 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 239 * 240 * @dev: drm_device pointer 241 * 242 * Returns true if the device is a dGPU with ACPI power control, 243 * otherwise return false. 244 */ 245 bool amdgpu_device_supports_boco(struct drm_device *dev) 246 { 247 struct amdgpu_device *adev = drm_to_adev(dev); 248 249 if (adev->has_pr3 || 250 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 251 return true; 252 return false; 253 } 254 255 /** 256 * amdgpu_device_supports_baco - Does the device support BACO 257 * 258 * @dev: drm_device pointer 259 * 260 * Returns true if the device supporte BACO, 261 * otherwise return false. 262 */ 263 bool amdgpu_device_supports_baco(struct drm_device *dev) 264 { 265 struct amdgpu_device *adev = drm_to_adev(dev); 266 267 return amdgpu_asic_supports_baco(adev); 268 } 269 270 /** 271 * amdgpu_device_supports_smart_shift - Is the device dGPU with 272 * smart shift support 273 * 274 * @dev: drm_device pointer 275 * 276 * Returns true if the device is a dGPU with Smart Shift support, 277 * otherwise returns false. 278 */ 279 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 280 { 281 return (amdgpu_device_supports_boco(dev) && 282 amdgpu_acpi_is_power_shift_control_supported()); 283 } 284 285 /* 286 * VRAM access helper functions 287 */ 288 289 /** 290 * amdgpu_device_vram_access - read/write a buffer in vram 291 * 292 * @adev: amdgpu_device pointer 293 * @pos: offset of the buffer in vram 294 * @buf: virtual address of the buffer in system memory 295 * @size: read/write size, sizeof(@buf) must > @size 296 * @write: true - write to vram, otherwise - read from vram 297 */ 298 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 299 uint32_t *buf, size_t size, bool write) 300 { 301 unsigned long flags; 302 uint32_t hi = ~0; 303 uint64_t last; 304 int idx; 305 306 if (!drm_dev_enter(&adev->ddev, &idx)) 307 return; 308 309 #ifdef CONFIG_64BIT 310 last = min(pos + size, adev->gmc.visible_vram_size); 311 if (last > pos) { 312 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 313 size_t count = last - pos; 314 315 if (write) { 316 memcpy_toio(addr, buf, count); 317 mb(); 318 amdgpu_device_flush_hdp(adev, NULL); 319 } else { 320 amdgpu_device_invalidate_hdp(adev, NULL); 321 mb(); 322 memcpy_fromio(buf, addr, count); 323 } 324 325 if (count == size) 326 goto exit; 327 328 pos += count; 329 buf += count / 4; 330 size -= count; 331 } 332 #endif 333 334 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 335 for (last = pos + size; pos < last; pos += 4) { 336 uint32_t tmp = pos >> 31; 337 338 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 339 if (tmp != hi) { 340 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 341 hi = tmp; 342 } 343 if (write) 344 WREG32_NO_KIQ(mmMM_DATA, *buf++); 345 else 346 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 347 } 348 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 349 350 #ifdef CONFIG_64BIT 351 exit: 352 #endif 353 drm_dev_exit(idx); 354 } 355 356 /* 357 * register access helper functions. 
358 */ 359 360 /* Check if hw access should be skipped because of hotplug or device error */ 361 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 362 { 363 if (adev->no_hw_access) 364 return true; 365 366 #ifdef CONFIG_LOCKDEP 367 /* 368 * This is a bit complicated to understand, so worth a comment. What we assert 369 * here is that the GPU reset is not running on another thread in parallel. 370 * 371 * For this we trylock the read side of the reset semaphore, if that succeeds 372 * we know that the reset is not running in paralell. 373 * 374 * If the trylock fails we assert that we are either already holding the read 375 * side of the lock or are the reset thread itself and hold the write side of 376 * the lock. 377 */ 378 if (in_task()) { 379 if (down_read_trylock(&adev->reset_sem)) 380 up_read(&adev->reset_sem); 381 else 382 lockdep_assert_held(&adev->reset_sem); 383 } 384 #endif 385 return false; 386 } 387 388 /** 389 * amdgpu_device_rreg - read a memory mapped IO or indirect register 390 * 391 * @adev: amdgpu_device pointer 392 * @reg: dword aligned register offset 393 * @acc_flags: access flags which require special behavior 394 * 395 * Returns the 32 bit value from the offset specified. 396 */ 397 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 398 uint32_t reg, uint32_t acc_flags) 399 { 400 uint32_t ret; 401 402 if (amdgpu_device_skip_hw_access(adev)) 403 return 0; 404 405 if ((reg * 4) < adev->rmmio_size) { 406 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 407 amdgpu_sriov_runtime(adev) && 408 down_read_trylock(&adev->reset_sem)) { 409 ret = amdgpu_kiq_rreg(adev, reg); 410 up_read(&adev->reset_sem); 411 } else { 412 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 413 } 414 } else { 415 ret = adev->pcie_rreg(adev, reg * 4); 416 } 417 418 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 419 420 return ret; 421 } 422 423 /* 424 * MMIO register read with bytes helper functions 425 * @offset:bytes offset from MMIO start 426 * 427 */ 428 429 /** 430 * amdgpu_mm_rreg8 - read a memory mapped IO register 431 * 432 * @adev: amdgpu_device pointer 433 * @offset: byte aligned register offset 434 * 435 * Returns the 8 bit value from the offset specified. 436 */ 437 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 438 { 439 if (amdgpu_device_skip_hw_access(adev)) 440 return 0; 441 442 if (offset < adev->rmmio_size) 443 return (readb(adev->rmmio + offset)); 444 BUG(); 445 } 446 447 /* 448 * MMIO register write with bytes helper functions 449 * @offset:bytes offset from MMIO start 450 * @value: the value want to be written to the register 451 * 452 */ 453 /** 454 * amdgpu_mm_wreg8 - read a memory mapped IO register 455 * 456 * @adev: amdgpu_device pointer 457 * @offset: byte aligned register offset 458 * @value: 8 bit value to write 459 * 460 * Writes the value specified to the offset specified. 461 */ 462 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 463 { 464 if (amdgpu_device_skip_hw_access(adev)) 465 return; 466 467 if (offset < adev->rmmio_size) 468 writeb(value, adev->rmmio + offset); 469 else 470 BUG(); 471 } 472 473 /** 474 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 475 * 476 * @adev: amdgpu_device pointer 477 * @reg: dword aligned register offset 478 * @v: 32 bit value to write to the register 479 * @acc_flags: access flags which require special behavior 480 * 481 * Writes the value specified to the offset specified. 
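 *
 * Illustrative usage (a sketch, not code from this file; assumes the
 * RREG32()/WREG32() convenience macros from amdgpu.h, which wrap
 * amdgpu_device_rreg()/amdgpu_device_wreg() with acc_flags == 0;
 * mmSOME_REG and SOME_FIELD_MASK are placeholder names):
 *
 *   uint32_t tmp;
 *
 *   tmp = RREG32(mmSOME_REG);
 *   tmp |= SOME_FIELD_MASK;
 *   WREG32(mmSOME_REG, tmp);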
482 */ 483 void amdgpu_device_wreg(struct amdgpu_device *adev, 484 uint32_t reg, uint32_t v, 485 uint32_t acc_flags) 486 { 487 if (amdgpu_device_skip_hw_access(adev)) 488 return; 489 490 if ((reg * 4) < adev->rmmio_size) { 491 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 492 amdgpu_sriov_runtime(adev) && 493 down_read_trylock(&adev->reset_sem)) { 494 amdgpu_kiq_wreg(adev, reg, v); 495 up_read(&adev->reset_sem); 496 } else { 497 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 498 } 499 } else { 500 adev->pcie_wreg(adev, reg * 4, v); 501 } 502 503 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 504 } 505 506 /* 507 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 508 * 509 * this function is invoked only the debugfs register access 510 * */ 511 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 512 uint32_t reg, uint32_t v) 513 { 514 if (amdgpu_device_skip_hw_access(adev)) 515 return; 516 517 if (amdgpu_sriov_fullaccess(adev) && 518 adev->gfx.rlc.funcs && 519 adev->gfx.rlc.funcs->is_rlcg_access_range) { 520 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 521 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0, 0); 522 } else { 523 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 524 } 525 } 526 527 /** 528 * amdgpu_mm_rdoorbell - read a doorbell dword 529 * 530 * @adev: amdgpu_device pointer 531 * @index: doorbell index 532 * 533 * Returns the value in the doorbell aperture at the 534 * requested doorbell index (CIK). 535 */ 536 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 537 { 538 if (amdgpu_device_skip_hw_access(adev)) 539 return 0; 540 541 if (index < adev->doorbell.num_doorbells) { 542 return readl(adev->doorbell.ptr + index); 543 } else { 544 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 545 return 0; 546 } 547 } 548 549 /** 550 * amdgpu_mm_wdoorbell - write a doorbell dword 551 * 552 * @adev: amdgpu_device pointer 553 * @index: doorbell index 554 * @v: value to write 555 * 556 * Writes @v to the doorbell aperture at the 557 * requested doorbell index (CIK). 558 */ 559 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 560 { 561 if (amdgpu_device_skip_hw_access(adev)) 562 return; 563 564 if (index < adev->doorbell.num_doorbells) { 565 writel(v, adev->doorbell.ptr + index); 566 } else { 567 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 568 } 569 } 570 571 /** 572 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 573 * 574 * @adev: amdgpu_device pointer 575 * @index: doorbell index 576 * 577 * Returns the value in the doorbell aperture at the 578 * requested doorbell index (VEGA10+). 579 */ 580 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 581 { 582 if (amdgpu_device_skip_hw_access(adev)) 583 return 0; 584 585 if (index < adev->doorbell.num_doorbells) { 586 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 587 } else { 588 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 589 return 0; 590 } 591 } 592 593 /** 594 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 595 * 596 * @adev: amdgpu_device pointer 597 * @index: doorbell index 598 * @v: value to write 599 * 600 * Writes @v to the doorbell aperture at the 601 * requested doorbell index (VEGA10+). 
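 *
 * Illustrative usage (a sketch, not code from this file; assumes the
 * WDOORBELL64() convenience macro from amdgpu.h, which wraps this helper):
 *
 *   if (ring->use_doorbell)
 *           WDOORBELL64(ring->doorbell_index, ring->wptr);
 *
 * This mirrors how ring code typically publishes an updated 64 bit write
 * pointer to the hardware.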
602 */ 603 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 604 { 605 if (amdgpu_device_skip_hw_access(adev)) 606 return; 607 608 if (index < adev->doorbell.num_doorbells) { 609 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 610 } else { 611 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 612 } 613 } 614 615 /** 616 * amdgpu_device_indirect_rreg - read an indirect register 617 * 618 * @adev: amdgpu_device pointer 619 * @pcie_index: mmio register offset 620 * @pcie_data: mmio register offset 621 * @reg_addr: indirect register address to read from 622 * 623 * Returns the value of indirect register @reg_addr 624 */ 625 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 626 u32 pcie_index, u32 pcie_data, 627 u32 reg_addr) 628 { 629 unsigned long flags; 630 u32 r; 631 void __iomem *pcie_index_offset; 632 void __iomem *pcie_data_offset; 633 634 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 635 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 636 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 637 638 writel(reg_addr, pcie_index_offset); 639 readl(pcie_index_offset); 640 r = readl(pcie_data_offset); 641 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 642 643 return r; 644 } 645 646 /** 647 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 648 * 649 * @adev: amdgpu_device pointer 650 * @pcie_index: mmio register offset 651 * @pcie_data: mmio register offset 652 * @reg_addr: indirect register address to read from 653 * 654 * Returns the value of indirect register @reg_addr 655 */ 656 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 657 u32 pcie_index, u32 pcie_data, 658 u32 reg_addr) 659 { 660 unsigned long flags; 661 u64 r; 662 void __iomem *pcie_index_offset; 663 void __iomem *pcie_data_offset; 664 665 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 666 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 667 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 668 669 /* read low 32 bits */ 670 writel(reg_addr, pcie_index_offset); 671 readl(pcie_index_offset); 672 r = readl(pcie_data_offset); 673 /* read high 32 bits */ 674 writel(reg_addr + 4, pcie_index_offset); 675 readl(pcie_index_offset); 676 r |= ((u64)readl(pcie_data_offset) << 32); 677 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 678 679 return r; 680 } 681 682 /** 683 * amdgpu_device_indirect_wreg - write an indirect register address 684 * 685 * @adev: amdgpu_device pointer 686 * @pcie_index: mmio register offset 687 * @pcie_data: mmio register offset 688 * @reg_addr: indirect register offset 689 * @reg_data: indirect register data 690 * 691 */ 692 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 693 u32 pcie_index, u32 pcie_data, 694 u32 reg_addr, u32 reg_data) 695 { 696 unsigned long flags; 697 void __iomem *pcie_index_offset; 698 void __iomem *pcie_data_offset; 699 700 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 701 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 702 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 703 704 writel(reg_addr, pcie_index_offset); 705 readl(pcie_index_offset); 706 writel(reg_data, pcie_data_offset); 707 readl(pcie_data_offset); 708 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 709 } 710 711 /** 712 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 713 * 714 * @adev: amdgpu_device pointer 715 * @pcie_index: mmio register offset 716 * @pcie_data: mmio register 
offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
                                   u32 pcie_index, u32 pcie_data,
                                   u32 reg_addr, u64 reg_data)
{
        unsigned long flags;
        void __iomem *pcie_index_offset;
        void __iomem *pcie_data_offset;

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
        pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

        /* write low 32 bits */
        writel(reg_addr, pcie_index_offset);
        readl(pcie_index_offset);
        writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
        readl(pcie_data_offset);
        /* write high 32 bits */
        writel(reg_addr + 4, pcie_index_offset);
        readl(pcie_index_offset);
        writel((u32)(reg_data >> 32), pcie_data_offset);
        readl(pcie_data_offset);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
        BUG();
        return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
        DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
                  reg, v);
        BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
        DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
        BUG();
        return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
        DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
                  reg, v);
        BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
824 */ 825 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 826 uint32_t block, uint32_t reg) 827 { 828 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 829 reg, block); 830 BUG(); 831 return 0; 832 } 833 834 /** 835 * amdgpu_block_invalid_wreg - dummy reg write function 836 * 837 * @adev: amdgpu_device pointer 838 * @block: offset of instance 839 * @reg: offset of register 840 * @v: value to write to the register 841 * 842 * Dummy register read function. Used for register blocks 843 * that certain asics don't have (all asics). 844 */ 845 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 846 uint32_t block, 847 uint32_t reg, uint32_t v) 848 { 849 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 850 reg, block, v); 851 BUG(); 852 } 853 854 /** 855 * amdgpu_device_asic_init - Wrapper for atom asic_init 856 * 857 * @adev: amdgpu_device pointer 858 * 859 * Does any asic specific work and then calls atom asic init. 860 */ 861 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 862 { 863 amdgpu_asic_pre_asic_init(adev); 864 865 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 866 } 867 868 /** 869 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 870 * 871 * @adev: amdgpu_device pointer 872 * 873 * Allocates a scratch page of VRAM for use by various things in the 874 * driver. 875 */ 876 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 877 { 878 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 879 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 880 &adev->vram_scratch.robj, 881 &adev->vram_scratch.gpu_addr, 882 (void **)&adev->vram_scratch.ptr); 883 } 884 885 /** 886 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 887 * 888 * @adev: amdgpu_device pointer 889 * 890 * Frees the VRAM scratch page. 891 */ 892 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 893 { 894 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 895 } 896 897 /** 898 * amdgpu_device_program_register_sequence - program an array of registers. 899 * 900 * @adev: amdgpu_device pointer 901 * @registers: pointer to the register array 902 * @array_size: size of the register array 903 * 904 * Programs an array or registers with and and or masks. 905 * This is a helper for setting golden registers. 906 */ 907 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 908 const u32 *registers, 909 const u32 array_size) 910 { 911 u32 tmp, reg, and_mask, or_mask; 912 int i; 913 914 if (array_size % 3) 915 return; 916 917 for (i = 0; i < array_size; i +=3) { 918 reg = registers[i + 0]; 919 and_mask = registers[i + 1]; 920 or_mask = registers[i + 2]; 921 922 if (and_mask == 0xffffffff) { 923 tmp = or_mask; 924 } else { 925 tmp = RREG32(reg); 926 tmp &= ~and_mask; 927 if (adev->family >= AMDGPU_FAMILY_AI) 928 tmp |= (or_mask & and_mask); 929 else 930 tmp |= or_mask; 931 } 932 WREG32(reg, tmp); 933 } 934 } 935 936 /** 937 * amdgpu_device_pci_config_reset - reset the GPU 938 * 939 * @adev: amdgpu_device pointer 940 * 941 * Resets the GPU using the pci config reset sequence. 942 * Only applicable to asics prior to vega10. 
943 */ 944 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 945 { 946 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 947 } 948 949 /** 950 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 951 * 952 * @adev: amdgpu_device pointer 953 * 954 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 955 */ 956 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 957 { 958 return pci_reset_function(adev->pdev); 959 } 960 961 /* 962 * GPU doorbell aperture helpers function. 963 */ 964 /** 965 * amdgpu_device_doorbell_init - Init doorbell driver information. 966 * 967 * @adev: amdgpu_device pointer 968 * 969 * Init doorbell driver information (CIK) 970 * Returns 0 on success, error on failure. 971 */ 972 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 973 { 974 975 /* No doorbell on SI hardware generation */ 976 if (adev->asic_type < CHIP_BONAIRE) { 977 adev->doorbell.base = 0; 978 adev->doorbell.size = 0; 979 adev->doorbell.num_doorbells = 0; 980 adev->doorbell.ptr = NULL; 981 return 0; 982 } 983 984 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 985 return -EINVAL; 986 987 amdgpu_asic_init_doorbell_index(adev); 988 989 /* doorbell bar mapping */ 990 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 991 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 992 993 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 994 adev->doorbell_index.max_assignment+1); 995 if (adev->doorbell.num_doorbells == 0) 996 return -EINVAL; 997 998 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 999 * paging queue doorbell use the second page. The 1000 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1001 * doorbells are in the first page. So with paging queue enabled, 1002 * the max num_doorbells should + 1 page (0x400 in dword) 1003 */ 1004 if (adev->asic_type >= CHIP_VEGA10) 1005 adev->doorbell.num_doorbells += 0x400; 1006 1007 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1008 adev->doorbell.num_doorbells * 1009 sizeof(u32)); 1010 if (adev->doorbell.ptr == NULL) 1011 return -ENOMEM; 1012 1013 return 0; 1014 } 1015 1016 /** 1017 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1018 * 1019 * @adev: amdgpu_device pointer 1020 * 1021 * Tear down doorbell driver information (CIK) 1022 */ 1023 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1024 { 1025 iounmap(adev->doorbell.ptr); 1026 adev->doorbell.ptr = NULL; 1027 } 1028 1029 1030 1031 /* 1032 * amdgpu_device_wb_*() 1033 * Writeback is the method by which the GPU updates special pages in memory 1034 * with the status of certain GPU events (fences, ring pointers,etc.). 1035 */ 1036 1037 /** 1038 * amdgpu_device_wb_fini - Disable Writeback and free memory 1039 * 1040 * @adev: amdgpu_device pointer 1041 * 1042 * Disables Writeback and frees the Writeback memory (all asics). 1043 * Used at driver shutdown. 1044 */ 1045 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1046 { 1047 if (adev->wb.wb_obj) { 1048 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1049 &adev->wb.gpu_addr, 1050 (void **)&adev->wb.wb); 1051 adev->wb.wb_obj = NULL; 1052 } 1053 } 1054 1055 /** 1056 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1057 * 1058 * @adev: amdgpu_device pointer 1059 * 1060 * Initializes writeback and allocates writeback memory (all asics). 1061 * Used at driver startup. 1062 * Returns 0 on success or an -error on failure. 
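 *
 * Illustrative usage of the writeback slot helpers defined below (a sketch,
 * not code from this file): the returned index is a dword offset, so the GPU
 * address of the slot is adev->wb.gpu_addr + (wb * 4) and the CPU view of the
 * same slot is adev->wb.wb[wb].
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u64 wb_gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *           u32 value = adev->wb.wb[wb];
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }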
1063 */ 1064 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1065 { 1066 int r; 1067 1068 if (adev->wb.wb_obj == NULL) { 1069 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1070 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1071 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1072 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1073 (void **)&adev->wb.wb); 1074 if (r) { 1075 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1076 return r; 1077 } 1078 1079 adev->wb.num_wb = AMDGPU_MAX_WB; 1080 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1081 1082 /* clear wb memory */ 1083 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1084 } 1085 1086 return 0; 1087 } 1088 1089 /** 1090 * amdgpu_device_wb_get - Allocate a wb entry 1091 * 1092 * @adev: amdgpu_device pointer 1093 * @wb: wb index 1094 * 1095 * Allocate a wb slot for use by the driver (all asics). 1096 * Returns 0 on success or -EINVAL on failure. 1097 */ 1098 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1099 { 1100 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1101 1102 if (offset < adev->wb.num_wb) { 1103 __set_bit(offset, adev->wb.used); 1104 *wb = offset << 3; /* convert to dw offset */ 1105 return 0; 1106 } else { 1107 return -EINVAL; 1108 } 1109 } 1110 1111 /** 1112 * amdgpu_device_wb_free - Free a wb entry 1113 * 1114 * @adev: amdgpu_device pointer 1115 * @wb: wb index 1116 * 1117 * Free a wb slot allocated for use by the driver (all asics) 1118 */ 1119 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1120 { 1121 wb >>= 3; 1122 if (wb < adev->wb.num_wb) 1123 __clear_bit(wb, adev->wb.used); 1124 } 1125 1126 /** 1127 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1128 * 1129 * @adev: amdgpu_device pointer 1130 * 1131 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1132 * to fail, but if any of the BARs is not accessible after the size we abort 1133 * driver loading by returning -ENODEV. 1134 */ 1135 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1136 { 1137 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1138 struct pci_bus *root; 1139 struct resource *res; 1140 unsigned i; 1141 u16 cmd; 1142 int r; 1143 1144 /* Bypass for VF */ 1145 if (amdgpu_sriov_vf(adev)) 1146 return 0; 1147 1148 /* skip if the bios has already enabled large BAR */ 1149 if (adev->gmc.real_vram_size && 1150 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1151 return 0; 1152 1153 /* Check if the root BUS has 64bit memory resources */ 1154 root = adev->pdev->bus; 1155 while (root->parent) 1156 root = root->parent; 1157 1158 pci_bus_for_each_resource(root, res, i) { 1159 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1160 res->start > 0x100000000ull) 1161 break; 1162 } 1163 1164 /* Trying to resize is pointless without a root hub window above 4GB */ 1165 if (!res) 1166 return 0; 1167 1168 /* Limit the BAR size to what is available */ 1169 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1170 rbar_size); 1171 1172 /* Disable memory decoding while we change the BAR addresses and size */ 1173 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1174 pci_write_config_word(adev->pdev, PCI_COMMAND, 1175 cmd & ~PCI_COMMAND_MEMORY); 1176 1177 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
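         * Resizing BAR0 can cause the PCI core to reassign the other BARs as
         * well, so the doorbell mapping is torn down here and re-created after
         * the bus resources have been reassigned below.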
*/ 1178 amdgpu_device_doorbell_fini(adev); 1179 if (adev->asic_type >= CHIP_BONAIRE) 1180 pci_release_resource(adev->pdev, 2); 1181 1182 pci_release_resource(adev->pdev, 0); 1183 1184 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1185 if (r == -ENOSPC) 1186 DRM_INFO("Not enough PCI address space for a large BAR."); 1187 else if (r && r != -ENOTSUPP) 1188 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1189 1190 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1191 1192 /* When the doorbell or fb BAR isn't available we have no chance of 1193 * using the device. 1194 */ 1195 r = amdgpu_device_doorbell_init(adev); 1196 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1197 return -ENODEV; 1198 1199 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1200 1201 return 0; 1202 } 1203 1204 /* 1205 * GPU helpers function. 1206 */ 1207 /** 1208 * amdgpu_device_need_post - check if the hw need post or not 1209 * 1210 * @adev: amdgpu_device pointer 1211 * 1212 * Check if the asic has been initialized (all asics) at driver startup 1213 * or post is needed if hw reset is performed. 1214 * Returns true if need or false if not. 1215 */ 1216 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1217 { 1218 uint32_t reg; 1219 1220 if (amdgpu_sriov_vf(adev)) 1221 return false; 1222 1223 if (amdgpu_passthrough(adev)) { 1224 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1225 * some old smc fw still need driver do vPost otherwise gpu hang, while 1226 * those smc fw version above 22.15 doesn't have this flaw, so we force 1227 * vpost executed for smc version below 22.15 1228 */ 1229 if (adev->asic_type == CHIP_FIJI) { 1230 int err; 1231 uint32_t fw_ver; 1232 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1233 /* force vPost if error occured */ 1234 if (err) 1235 return true; 1236 1237 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1238 if (fw_ver < 0x00160e00) 1239 return true; 1240 } 1241 } 1242 1243 /* Don't post if we need to reset whole hive on init */ 1244 if (adev->gmc.xgmi.pending_reset) 1245 return false; 1246 1247 if (adev->has_hw_reset) { 1248 adev->has_hw_reset = false; 1249 return true; 1250 } 1251 1252 /* bios scratch used on CIK+ */ 1253 if (adev->asic_type >= CHIP_BONAIRE) 1254 return amdgpu_atombios_scratch_need_asic_init(adev); 1255 1256 /* check MEM_SIZE for older asics */ 1257 reg = amdgpu_asic_get_config_memsize(adev); 1258 1259 if ((reg != 0) && (reg != 0xffffffff)) 1260 return false; 1261 1262 return true; 1263 } 1264 1265 /* if we get transitioned to only one device, take VGA back */ 1266 /** 1267 * amdgpu_device_vga_set_decode - enable/disable vga decode 1268 * 1269 * @pdev: PCI device pointer 1270 * @state: enable/disable vga decode 1271 * 1272 * Enable/disable vga decode (all asics). 1273 * Returns VGA resource flags. 1274 */ 1275 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1276 bool state) 1277 { 1278 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1279 amdgpu_asic_set_vga_state(adev, state); 1280 if (state) 1281 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1282 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1283 else 1284 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1285 } 1286 1287 /** 1288 * amdgpu_device_check_block_size - validate the vm block size 1289 * 1290 * @adev: amdgpu_device pointer 1291 * 1292 * Validates the vm block size specified via module parameter. 
1293 * The vm block size defines number of bits in page table versus page directory, 1294 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1295 * page table and the remaining bits are in the page directory. 1296 */ 1297 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1298 { 1299 /* defines number of bits in page table versus page directory, 1300 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1301 * page table and the remaining bits are in the page directory */ 1302 if (amdgpu_vm_block_size == -1) 1303 return; 1304 1305 if (amdgpu_vm_block_size < 9) { 1306 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1307 amdgpu_vm_block_size); 1308 amdgpu_vm_block_size = -1; 1309 } 1310 } 1311 1312 /** 1313 * amdgpu_device_check_vm_size - validate the vm size 1314 * 1315 * @adev: amdgpu_device pointer 1316 * 1317 * Validates the vm size in GB specified via module parameter. 1318 * The VM size is the size of the GPU virtual memory space in GB. 1319 */ 1320 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1321 { 1322 /* no need to check the default value */ 1323 if (amdgpu_vm_size == -1) 1324 return; 1325 1326 if (amdgpu_vm_size < 1) { 1327 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1328 amdgpu_vm_size); 1329 amdgpu_vm_size = -1; 1330 } 1331 } 1332 1333 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1334 { 1335 struct sysinfo si; 1336 bool is_os_64 = (sizeof(void *) == 8); 1337 uint64_t total_memory; 1338 uint64_t dram_size_seven_GB = 0x1B8000000; 1339 uint64_t dram_size_three_GB = 0xB8000000; 1340 1341 if (amdgpu_smu_memory_pool_size == 0) 1342 return; 1343 1344 if (!is_os_64) { 1345 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1346 goto def_value; 1347 } 1348 si_meminfo(&si); 1349 total_memory = (uint64_t)si.totalram * si.mem_unit; 1350 1351 if ((amdgpu_smu_memory_pool_size == 1) || 1352 (amdgpu_smu_memory_pool_size == 2)) { 1353 if (total_memory < dram_size_three_GB) 1354 goto def_value1; 1355 } else if ((amdgpu_smu_memory_pool_size == 4) || 1356 (amdgpu_smu_memory_pool_size == 8)) { 1357 if (total_memory < dram_size_seven_GB) 1358 goto def_value1; 1359 } else { 1360 DRM_WARN("Smu memory pool size not supported\n"); 1361 goto def_value; 1362 } 1363 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1364 1365 return; 1366 1367 def_value1: 1368 DRM_WARN("No enough system memory\n"); 1369 def_value: 1370 adev->pm.smu_prv_buffer_size = 0; 1371 } 1372 1373 /** 1374 * amdgpu_device_check_arguments - validate module params 1375 * 1376 * @adev: amdgpu_device pointer 1377 * 1378 * Validates certain module parameters and updates 1379 * the associated values used by the driver (all asics). 
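 *
 * Illustrative example (a sketch; the parameter names are assumed to match
 * the module parameters declared in amdgpu_drv.c):
 *
 *   modprobe amdgpu sched_jobs=64 vm_size=512 vm_block_size=9
 *
 * Values like these pass the checks below unchanged, while out-of-range
 * values are clamped or reset to their defaults with a warning.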
1380 */ 1381 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1382 { 1383 if (amdgpu_sched_jobs < 4) { 1384 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1385 amdgpu_sched_jobs); 1386 amdgpu_sched_jobs = 4; 1387 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1388 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1389 amdgpu_sched_jobs); 1390 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1391 } 1392 1393 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1394 /* gart size must be greater or equal to 32M */ 1395 dev_warn(adev->dev, "gart size (%d) too small\n", 1396 amdgpu_gart_size); 1397 amdgpu_gart_size = -1; 1398 } 1399 1400 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1401 /* gtt size must be greater or equal to 32M */ 1402 dev_warn(adev->dev, "gtt size (%d) too small\n", 1403 amdgpu_gtt_size); 1404 amdgpu_gtt_size = -1; 1405 } 1406 1407 /* valid range is between 4 and 9 inclusive */ 1408 if (amdgpu_vm_fragment_size != -1 && 1409 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1410 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1411 amdgpu_vm_fragment_size = -1; 1412 } 1413 1414 if (amdgpu_sched_hw_submission < 2) { 1415 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1416 amdgpu_sched_hw_submission); 1417 amdgpu_sched_hw_submission = 2; 1418 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1419 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1420 amdgpu_sched_hw_submission); 1421 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1422 } 1423 1424 amdgpu_device_check_smu_prv_buffer_size(adev); 1425 1426 amdgpu_device_check_vm_size(adev); 1427 1428 amdgpu_device_check_block_size(adev); 1429 1430 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1431 1432 amdgpu_gmc_tmz_set(adev); 1433 1434 amdgpu_gmc_noretry_set(adev); 1435 1436 return 0; 1437 } 1438 1439 /** 1440 * amdgpu_switcheroo_set_state - set switcheroo state 1441 * 1442 * @pdev: pci dev pointer 1443 * @state: vga_switcheroo state 1444 * 1445 * Callback for the switcheroo driver. Suspends or resumes the 1446 * the asics before or after it is powered up using ACPI methods. 1447 */ 1448 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1449 enum vga_switcheroo_state state) 1450 { 1451 struct drm_device *dev = pci_get_drvdata(pdev); 1452 int r; 1453 1454 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1455 return; 1456 1457 if (state == VGA_SWITCHEROO_ON) { 1458 pr_info("switched on\n"); 1459 /* don't suspend or resume card normally */ 1460 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1461 1462 pci_set_power_state(pdev, PCI_D0); 1463 amdgpu_device_load_pci_state(pdev); 1464 r = pci_enable_device(pdev); 1465 if (r) 1466 DRM_WARN("pci_enable_device failed (%d)\n", r); 1467 amdgpu_device_resume(dev, true); 1468 1469 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1470 } else { 1471 pr_info("switched off\n"); 1472 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1473 amdgpu_device_suspend(dev, true); 1474 amdgpu_device_cache_pci_state(pdev); 1475 /* Shut down the device */ 1476 pci_disable_device(pdev); 1477 pci_set_power_state(pdev, PCI_D3cold); 1478 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1479 } 1480 } 1481 1482 /** 1483 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1484 * 1485 * @pdev: pci dev pointer 1486 * 1487 * Callback for the switcheroo driver. 
Check of the switcheroo 1488 * state can be changed. 1489 * Returns true if the state can be changed, false if not. 1490 */ 1491 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1492 { 1493 struct drm_device *dev = pci_get_drvdata(pdev); 1494 1495 /* 1496 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1497 * locking inversion with the driver load path. And the access here is 1498 * completely racy anyway. So don't bother with locking for now. 1499 */ 1500 return atomic_read(&dev->open_count) == 0; 1501 } 1502 1503 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1504 .set_gpu_state = amdgpu_switcheroo_set_state, 1505 .reprobe = NULL, 1506 .can_switch = amdgpu_switcheroo_can_switch, 1507 }; 1508 1509 /** 1510 * amdgpu_device_ip_set_clockgating_state - set the CG state 1511 * 1512 * @dev: amdgpu_device pointer 1513 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1514 * @state: clockgating state (gate or ungate) 1515 * 1516 * Sets the requested clockgating state for all instances of 1517 * the hardware IP specified. 1518 * Returns the error code from the last instance. 1519 */ 1520 int amdgpu_device_ip_set_clockgating_state(void *dev, 1521 enum amd_ip_block_type block_type, 1522 enum amd_clockgating_state state) 1523 { 1524 struct amdgpu_device *adev = dev; 1525 int i, r = 0; 1526 1527 for (i = 0; i < adev->num_ip_blocks; i++) { 1528 if (!adev->ip_blocks[i].status.valid) 1529 continue; 1530 if (adev->ip_blocks[i].version->type != block_type) 1531 continue; 1532 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1533 continue; 1534 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1535 (void *)adev, state); 1536 if (r) 1537 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1538 adev->ip_blocks[i].version->funcs->name, r); 1539 } 1540 return r; 1541 } 1542 1543 /** 1544 * amdgpu_device_ip_set_powergating_state - set the PG state 1545 * 1546 * @dev: amdgpu_device pointer 1547 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1548 * @state: powergating state (gate or ungate) 1549 * 1550 * Sets the requested powergating state for all instances of 1551 * the hardware IP specified. 1552 * Returns the error code from the last instance. 1553 */ 1554 int amdgpu_device_ip_set_powergating_state(void *dev, 1555 enum amd_ip_block_type block_type, 1556 enum amd_powergating_state state) 1557 { 1558 struct amdgpu_device *adev = dev; 1559 int i, r = 0; 1560 1561 for (i = 0; i < adev->num_ip_blocks; i++) { 1562 if (!adev->ip_blocks[i].status.valid) 1563 continue; 1564 if (adev->ip_blocks[i].version->type != block_type) 1565 continue; 1566 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1567 continue; 1568 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1569 (void *)adev, state); 1570 if (r) 1571 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1572 adev->ip_blocks[i].version->funcs->name, r); 1573 } 1574 return r; 1575 } 1576 1577 /** 1578 * amdgpu_device_ip_get_clockgating_state - get the CG state 1579 * 1580 * @adev: amdgpu_device pointer 1581 * @flags: clockgating feature flags 1582 * 1583 * Walks the list of IPs on the device and updates the clockgating 1584 * flags for each IP. 1585 * Updates @flags with the feature flags for each hardware IP where 1586 * clockgating is enabled. 
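 *
 * Illustrative usage (a sketch, not code from this file; the AMD_CG_SUPPORT_*
 * flags come from amd_shared.h):
 *
 *   u32 flags = 0;
 *
 *   amdgpu_device_ip_get_clockgating_state(adev, &flags);
 *   if (flags & AMD_CG_SUPPORT_GFX_MGCG)
 *           dev_info(adev->dev, "GFX medium grain clockgating is enabled\n");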
1587 */ 1588 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1589 u32 *flags) 1590 { 1591 int i; 1592 1593 for (i = 0; i < adev->num_ip_blocks; i++) { 1594 if (!adev->ip_blocks[i].status.valid) 1595 continue; 1596 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1597 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1598 } 1599 } 1600 1601 /** 1602 * amdgpu_device_ip_wait_for_idle - wait for idle 1603 * 1604 * @adev: amdgpu_device pointer 1605 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1606 * 1607 * Waits for the request hardware IP to be idle. 1608 * Returns 0 for success or a negative error code on failure. 1609 */ 1610 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1611 enum amd_ip_block_type block_type) 1612 { 1613 int i, r; 1614 1615 for (i = 0; i < adev->num_ip_blocks; i++) { 1616 if (!adev->ip_blocks[i].status.valid) 1617 continue; 1618 if (adev->ip_blocks[i].version->type == block_type) { 1619 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1620 if (r) 1621 return r; 1622 break; 1623 } 1624 } 1625 return 0; 1626 1627 } 1628 1629 /** 1630 * amdgpu_device_ip_is_idle - is the hardware IP idle 1631 * 1632 * @adev: amdgpu_device pointer 1633 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1634 * 1635 * Check if the hardware IP is idle or not. 1636 * Returns true if it the IP is idle, false if not. 1637 */ 1638 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1639 enum amd_ip_block_type block_type) 1640 { 1641 int i; 1642 1643 for (i = 0; i < adev->num_ip_blocks; i++) { 1644 if (!adev->ip_blocks[i].status.valid) 1645 continue; 1646 if (adev->ip_blocks[i].version->type == block_type) 1647 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1648 } 1649 return true; 1650 1651 } 1652 1653 /** 1654 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1655 * 1656 * @adev: amdgpu_device pointer 1657 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1658 * 1659 * Returns a pointer to the hardware IP block structure 1660 * if it exists for the asic, otherwise NULL. 1661 */ 1662 struct amdgpu_ip_block * 1663 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1664 enum amd_ip_block_type type) 1665 { 1666 int i; 1667 1668 for (i = 0; i < adev->num_ip_blocks; i++) 1669 if (adev->ip_blocks[i].version->type == type) 1670 return &adev->ip_blocks[i]; 1671 1672 return NULL; 1673 } 1674 1675 /** 1676 * amdgpu_device_ip_block_version_cmp 1677 * 1678 * @adev: amdgpu_device pointer 1679 * @type: enum amd_ip_block_type 1680 * @major: major version 1681 * @minor: minor version 1682 * 1683 * return 0 if equal or greater 1684 * return 1 if smaller or the ip_block doesn't exist 1685 */ 1686 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1687 enum amd_ip_block_type type, 1688 u32 major, u32 minor) 1689 { 1690 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1691 1692 if (ip_block && ((ip_block->version->major > major) || 1693 ((ip_block->version->major == major) && 1694 (ip_block->version->minor >= minor)))) 1695 return 0; 1696 1697 return 1; 1698 } 1699 1700 /** 1701 * amdgpu_device_ip_block_add 1702 * 1703 * @adev: amdgpu_device pointer 1704 * @ip_block_version: pointer to the IP to add 1705 * 1706 * Adds the IP block driver information to the collection of IPs 1707 * on the asic. 
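 *
 * Illustrative usage (a sketch of how SoC specific code registers its IP
 * blocks; the *_ip_block names are examples taken from other files such as
 * vi.c and gmc_v8_0.c, not definitions from this file):
 *
 *   amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *   amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);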
1708 */ 1709 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1710 const struct amdgpu_ip_block_version *ip_block_version) 1711 { 1712 if (!ip_block_version) 1713 return -EINVAL; 1714 1715 switch (ip_block_version->type) { 1716 case AMD_IP_BLOCK_TYPE_VCN: 1717 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1718 return 0; 1719 break; 1720 case AMD_IP_BLOCK_TYPE_JPEG: 1721 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1722 return 0; 1723 break; 1724 default: 1725 break; 1726 } 1727 1728 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1729 ip_block_version->funcs->name); 1730 1731 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1732 1733 return 0; 1734 } 1735 1736 /** 1737 * amdgpu_device_enable_virtual_display - enable virtual display feature 1738 * 1739 * @adev: amdgpu_device pointer 1740 * 1741 * Enabled the virtual display feature if the user has enabled it via 1742 * the module parameter virtual_display. This feature provides a virtual 1743 * display hardware on headless boards or in virtualized environments. 1744 * This function parses and validates the configuration string specified by 1745 * the user and configues the virtual display configuration (number of 1746 * virtual connectors, crtcs, etc.) specified. 1747 */ 1748 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1749 { 1750 adev->enable_virtual_display = false; 1751 1752 if (amdgpu_virtual_display) { 1753 const char *pci_address_name = pci_name(adev->pdev); 1754 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1755 1756 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1757 pciaddstr_tmp = pciaddstr; 1758 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1759 pciaddname = strsep(&pciaddname_tmp, ","); 1760 if (!strcmp("all", pciaddname) 1761 || !strcmp(pci_address_name, pciaddname)) { 1762 long num_crtc; 1763 int res = -1; 1764 1765 adev->enable_virtual_display = true; 1766 1767 if (pciaddname_tmp) 1768 res = kstrtol(pciaddname_tmp, 10, 1769 &num_crtc); 1770 1771 if (!res) { 1772 if (num_crtc < 1) 1773 num_crtc = 1; 1774 if (num_crtc > 6) 1775 num_crtc = 6; 1776 adev->mode_info.num_crtc = num_crtc; 1777 } else { 1778 adev->mode_info.num_crtc = 1; 1779 } 1780 break; 1781 } 1782 } 1783 1784 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1785 amdgpu_virtual_display, pci_address_name, 1786 adev->enable_virtual_display, adev->mode_info.num_crtc); 1787 1788 kfree(pciaddstr); 1789 } 1790 } 1791 1792 /** 1793 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1794 * 1795 * @adev: amdgpu_device pointer 1796 * 1797 * Parses the asic configuration parameters specified in the gpu info 1798 * firmware and makes them availale to the driver for use in configuring 1799 * the asic. 1800 * Returns 0 on success, -EINVAL on failure. 1801 */ 1802 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1803 { 1804 const char *chip_name; 1805 char fw_name[40]; 1806 int err; 1807 const struct gpu_info_firmware_header_v1_0 *hdr; 1808 1809 adev->firmware.gpu_info_fw = NULL; 1810 1811 if (adev->mman.discovery_bin) { 1812 amdgpu_discovery_get_gfx_info(adev); 1813 1814 /* 1815 * FIXME: The bounding box is still needed by Navi12, so 1816 * temporarily read it from gpu_info firmware. Should be droped 1817 * when DAL no longer needs it. 
1818 */ 1819 if (adev->asic_type != CHIP_NAVI12) 1820 return 0; 1821 } 1822 1823 switch (adev->asic_type) { 1824 #ifdef CONFIG_DRM_AMDGPU_SI 1825 case CHIP_VERDE: 1826 case CHIP_TAHITI: 1827 case CHIP_PITCAIRN: 1828 case CHIP_OLAND: 1829 case CHIP_HAINAN: 1830 #endif 1831 #ifdef CONFIG_DRM_AMDGPU_CIK 1832 case CHIP_BONAIRE: 1833 case CHIP_HAWAII: 1834 case CHIP_KAVERI: 1835 case CHIP_KABINI: 1836 case CHIP_MULLINS: 1837 #endif 1838 case CHIP_TOPAZ: 1839 case CHIP_TONGA: 1840 case CHIP_FIJI: 1841 case CHIP_POLARIS10: 1842 case CHIP_POLARIS11: 1843 case CHIP_POLARIS12: 1844 case CHIP_VEGAM: 1845 case CHIP_CARRIZO: 1846 case CHIP_STONEY: 1847 case CHIP_VEGA20: 1848 case CHIP_ALDEBARAN: 1849 case CHIP_SIENNA_CICHLID: 1850 case CHIP_NAVY_FLOUNDER: 1851 case CHIP_DIMGREY_CAVEFISH: 1852 case CHIP_BEIGE_GOBY: 1853 default: 1854 return 0; 1855 case CHIP_VEGA10: 1856 chip_name = "vega10"; 1857 break; 1858 case CHIP_VEGA12: 1859 chip_name = "vega12"; 1860 break; 1861 case CHIP_RAVEN: 1862 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1863 chip_name = "raven2"; 1864 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1865 chip_name = "picasso"; 1866 else 1867 chip_name = "raven"; 1868 break; 1869 case CHIP_ARCTURUS: 1870 chip_name = "arcturus"; 1871 break; 1872 case CHIP_RENOIR: 1873 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1874 chip_name = "renoir"; 1875 else 1876 chip_name = "green_sardine"; 1877 break; 1878 case CHIP_NAVI10: 1879 chip_name = "navi10"; 1880 break; 1881 case CHIP_NAVI14: 1882 chip_name = "navi14"; 1883 break; 1884 case CHIP_NAVI12: 1885 chip_name = "navi12"; 1886 break; 1887 case CHIP_VANGOGH: 1888 chip_name = "vangogh"; 1889 break; 1890 case CHIP_YELLOW_CARP: 1891 chip_name = "yellow_carp"; 1892 break; 1893 } 1894 1895 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1896 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1897 if (err) { 1898 dev_err(adev->dev, 1899 "Failed to load gpu_info firmware \"%s\"\n", 1900 fw_name); 1901 goto out; 1902 } 1903 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1904 if (err) { 1905 dev_err(adev->dev, 1906 "Failed to validate gpu_info firmware \"%s\"\n", 1907 fw_name); 1908 goto out; 1909 } 1910 1911 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1912 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1913 1914 switch (hdr->version_major) { 1915 case 1: 1916 { 1917 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1918 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1919 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1920 1921 /* 1922 * Should be droped when DAL no longer needs it. 
                 */
                if (adev->asic_type == CHIP_NAVI12)
                        goto parse_soc_bounding_box;

                adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
                adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
                adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
                adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
                adev->gfx.config.max_texture_channel_caches =
                        le32_to_cpu(gpu_info_fw->gc_num_tccs);
                adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
                adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
                adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
                adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
                adev->gfx.config.double_offchip_lds_buf =
                        le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
                adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
                adev->gfx.cu_info.max_waves_per_simd =
                        le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
                adev->gfx.cu_info.max_scratch_slots_per_cu =
                        le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
                adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
                if (hdr->version_minor >= 1) {
                        const struct gpu_info_firmware_v1_1 *gpu_info_fw =
                                (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
                                        le32_to_cpu(hdr->header.ucode_array_offset_bytes));
                        adev->gfx.config.num_sc_per_sh =
                                le32_to_cpu(gpu_info_fw->num_sc_per_sh);
                        adev->gfx.config.num_packer_per_sc =
                                le32_to_cpu(gpu_info_fw->num_packer_per_sc);
                }

parse_soc_bounding_box:
                /*
                 * The SoC bounding box info is not integrated into the
                 * discovery table, so we always need to parse it from the
                 * gpu_info firmware when needed.
                 */
                if (hdr->version_minor == 2) {
                        const struct gpu_info_firmware_v1_2 *gpu_info_fw =
                                (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
                                        le32_to_cpu(hdr->header.ucode_array_offset_bytes));
                        adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
                }
                break;
        }
        default:
                dev_err(adev->dev,
                        "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
                err = -EINVAL;
                goto out;
        }
out:
        return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
1987 */ 1988 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1989 { 1990 int i, r; 1991 1992 amdgpu_device_enable_virtual_display(adev); 1993 1994 if (amdgpu_sriov_vf(adev)) { 1995 r = amdgpu_virt_request_full_gpu(adev, true); 1996 if (r) 1997 return r; 1998 } 1999 2000 switch (adev->asic_type) { 2001 #ifdef CONFIG_DRM_AMDGPU_SI 2002 case CHIP_VERDE: 2003 case CHIP_TAHITI: 2004 case CHIP_PITCAIRN: 2005 case CHIP_OLAND: 2006 case CHIP_HAINAN: 2007 adev->family = AMDGPU_FAMILY_SI; 2008 r = si_set_ip_blocks(adev); 2009 if (r) 2010 return r; 2011 break; 2012 #endif 2013 #ifdef CONFIG_DRM_AMDGPU_CIK 2014 case CHIP_BONAIRE: 2015 case CHIP_HAWAII: 2016 case CHIP_KAVERI: 2017 case CHIP_KABINI: 2018 case CHIP_MULLINS: 2019 if (adev->flags & AMD_IS_APU) 2020 adev->family = AMDGPU_FAMILY_KV; 2021 else 2022 adev->family = AMDGPU_FAMILY_CI; 2023 2024 r = cik_set_ip_blocks(adev); 2025 if (r) 2026 return r; 2027 break; 2028 #endif 2029 case CHIP_TOPAZ: 2030 case CHIP_TONGA: 2031 case CHIP_FIJI: 2032 case CHIP_POLARIS10: 2033 case CHIP_POLARIS11: 2034 case CHIP_POLARIS12: 2035 case CHIP_VEGAM: 2036 case CHIP_CARRIZO: 2037 case CHIP_STONEY: 2038 if (adev->flags & AMD_IS_APU) 2039 adev->family = AMDGPU_FAMILY_CZ; 2040 else 2041 adev->family = AMDGPU_FAMILY_VI; 2042 2043 r = vi_set_ip_blocks(adev); 2044 if (r) 2045 return r; 2046 break; 2047 case CHIP_VEGA10: 2048 case CHIP_VEGA12: 2049 case CHIP_VEGA20: 2050 case CHIP_RAVEN: 2051 case CHIP_ARCTURUS: 2052 case CHIP_RENOIR: 2053 case CHIP_ALDEBARAN: 2054 if (adev->flags & AMD_IS_APU) 2055 adev->family = AMDGPU_FAMILY_RV; 2056 else 2057 adev->family = AMDGPU_FAMILY_AI; 2058 2059 r = soc15_set_ip_blocks(adev); 2060 if (r) 2061 return r; 2062 break; 2063 case CHIP_NAVI10: 2064 case CHIP_NAVI14: 2065 case CHIP_NAVI12: 2066 case CHIP_SIENNA_CICHLID: 2067 case CHIP_NAVY_FLOUNDER: 2068 case CHIP_DIMGREY_CAVEFISH: 2069 case CHIP_BEIGE_GOBY: 2070 case CHIP_VANGOGH: 2071 case CHIP_YELLOW_CARP: 2072 if (adev->asic_type == CHIP_VANGOGH) 2073 adev->family = AMDGPU_FAMILY_VGH; 2074 else if (adev->asic_type == CHIP_YELLOW_CARP) 2075 adev->family = AMDGPU_FAMILY_YC; 2076 else 2077 adev->family = AMDGPU_FAMILY_NV; 2078 2079 r = nv_set_ip_blocks(adev); 2080 if (r) 2081 return r; 2082 break; 2083 default: 2084 /* FIXME: not supported yet */ 2085 return -EINVAL; 2086 } 2087 2088 amdgpu_amdkfd_device_probe(adev); 2089 2090 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2091 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2092 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2093 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2094 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2095 2096 for (i = 0; i < adev->num_ip_blocks; i++) { 2097 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2098 DRM_ERROR("disabled ip block: %d <%s>\n", 2099 i, adev->ip_blocks[i].version->funcs->name); 2100 adev->ip_blocks[i].status.valid = false; 2101 } else { 2102 if (adev->ip_blocks[i].version->funcs->early_init) { 2103 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2104 if (r == -ENOENT) { 2105 adev->ip_blocks[i].status.valid = false; 2106 } else if (r) { 2107 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2108 adev->ip_blocks[i].version->funcs->name, r); 2109 return r; 2110 } else { 2111 adev->ip_blocks[i].status.valid = true; 2112 } 2113 } else { 2114 adev->ip_blocks[i].status.valid = true; 2115 } 2116 } 2117 /* get the vbios after the asic_funcs are set up */ 2118 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) 
{ 2119 r = amdgpu_device_parse_gpu_info_fw(adev); 2120 if (r) 2121 return r; 2122 2123 /* Read BIOS */ 2124 if (!amdgpu_get_bios(adev)) 2125 return -EINVAL; 2126 2127 r = amdgpu_atombios_init(adev); 2128 if (r) { 2129 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2130 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2131 return r; 2132 } 2133 2134 /*get pf2vf msg info at it's earliest time*/ 2135 if (amdgpu_sriov_vf(adev)) 2136 amdgpu_virt_init_data_exchange(adev); 2137 2138 } 2139 } 2140 2141 adev->cg_flags &= amdgpu_cg_mask; 2142 adev->pg_flags &= amdgpu_pg_mask; 2143 2144 return 0; 2145 } 2146 2147 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2148 { 2149 int i, r; 2150 2151 for (i = 0; i < adev->num_ip_blocks; i++) { 2152 if (!adev->ip_blocks[i].status.sw) 2153 continue; 2154 if (adev->ip_blocks[i].status.hw) 2155 continue; 2156 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2157 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2158 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2159 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2160 if (r) { 2161 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2162 adev->ip_blocks[i].version->funcs->name, r); 2163 return r; 2164 } 2165 adev->ip_blocks[i].status.hw = true; 2166 } 2167 } 2168 2169 return 0; 2170 } 2171 2172 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2173 { 2174 int i, r; 2175 2176 for (i = 0; i < adev->num_ip_blocks; i++) { 2177 if (!adev->ip_blocks[i].status.sw) 2178 continue; 2179 if (adev->ip_blocks[i].status.hw) 2180 continue; 2181 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2182 if (r) { 2183 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2184 adev->ip_blocks[i].version->funcs->name, r); 2185 return r; 2186 } 2187 adev->ip_blocks[i].status.hw = true; 2188 } 2189 2190 return 0; 2191 } 2192 2193 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2194 { 2195 int r = 0; 2196 int i; 2197 uint32_t smu_version; 2198 2199 if (adev->asic_type >= CHIP_VEGA10) { 2200 for (i = 0; i < adev->num_ip_blocks; i++) { 2201 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2202 continue; 2203 2204 if (!adev->ip_blocks[i].status.sw) 2205 continue; 2206 2207 /* no need to do the fw loading again if already done*/ 2208 if (adev->ip_blocks[i].status.hw == true) 2209 break; 2210 2211 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2212 r = adev->ip_blocks[i].version->funcs->resume(adev); 2213 if (r) { 2214 DRM_ERROR("resume of IP block <%s> failed %d\n", 2215 adev->ip_blocks[i].version->funcs->name, r); 2216 return r; 2217 } 2218 } else { 2219 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2220 if (r) { 2221 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2222 adev->ip_blocks[i].version->funcs->name, r); 2223 return r; 2224 } 2225 } 2226 2227 adev->ip_blocks[i].status.hw = true; 2228 break; 2229 } 2230 } 2231 2232 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2233 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2234 2235 return r; 2236 } 2237 2238 /** 2239 * amdgpu_device_ip_init - run init for hardware IPs 2240 * 2241 * @adev: amdgpu_device pointer 2242 * 2243 * Main initialization pass for hardware IPs. The list of all the hardware 2244 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2245 * are run. 
sw_init initializes the software state associated with each IP 2246 * and hw_init initializes the hardware associated with each IP. 2247 * Returns 0 on success, negative error code on failure. 2248 */ 2249 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2250 { 2251 int i, r; 2252 2253 r = amdgpu_ras_init(adev); 2254 if (r) 2255 return r; 2256 2257 for (i = 0; i < adev->num_ip_blocks; i++) { 2258 if (!adev->ip_blocks[i].status.valid) 2259 continue; 2260 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2261 if (r) { 2262 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2263 adev->ip_blocks[i].version->funcs->name, r); 2264 goto init_failed; 2265 } 2266 adev->ip_blocks[i].status.sw = true; 2267 2268 /* need to do gmc hw init early so we can allocate gpu mem */ 2269 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2270 r = amdgpu_device_vram_scratch_init(adev); 2271 if (r) { 2272 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2273 goto init_failed; 2274 } 2275 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2276 if (r) { 2277 DRM_ERROR("hw_init %d failed %d\n", i, r); 2278 goto init_failed; 2279 } 2280 r = amdgpu_device_wb_init(adev); 2281 if (r) { 2282 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2283 goto init_failed; 2284 } 2285 adev->ip_blocks[i].status.hw = true; 2286 2287 /* right after GMC hw init, we create CSA */ 2288 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2289 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2290 AMDGPU_GEM_DOMAIN_VRAM, 2291 AMDGPU_CSA_SIZE); 2292 if (r) { 2293 DRM_ERROR("allocate CSA failed %d\n", r); 2294 goto init_failed; 2295 } 2296 } 2297 } 2298 } 2299 2300 if (amdgpu_sriov_vf(adev)) 2301 amdgpu_virt_init_data_exchange(adev); 2302 2303 r = amdgpu_ib_pool_init(adev); 2304 if (r) { 2305 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2306 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2307 goto init_failed; 2308 } 2309 2310 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2311 if (r) 2312 goto init_failed; 2313 2314 r = amdgpu_device_ip_hw_init_phase1(adev); 2315 if (r) 2316 goto init_failed; 2317 2318 r = amdgpu_device_fw_loading(adev); 2319 if (r) 2320 goto init_failed; 2321 2322 r = amdgpu_device_ip_hw_init_phase2(adev); 2323 if (r) 2324 goto init_failed; 2325 2326 /* 2327 * retired pages will be loaded from eeprom and reserved here, 2328 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2329 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2330 * for I2C communication which only true at this point. 2331 * 2332 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2333 * failure from bad gpu situation and stop amdgpu init process 2334 * accordingly. For other failed cases, it will still release all 2335 * the resource and print error message, rather than returning one 2336 * negative value to upper level. 
2337 * 2338 * Note: theoretically, this should be called before all vram allocations 2339 * to protect retired page from abusing 2340 */ 2341 r = amdgpu_ras_recovery_init(adev); 2342 if (r) 2343 goto init_failed; 2344 2345 if (adev->gmc.xgmi.num_physical_nodes > 1) 2346 amdgpu_xgmi_add_device(adev); 2347 2348 /* Don't init kfd if whole hive need to be reset during init */ 2349 if (!adev->gmc.xgmi.pending_reset) 2350 amdgpu_amdkfd_device_init(adev); 2351 2352 amdgpu_fru_get_product_info(adev); 2353 2354 init_failed: 2355 if (amdgpu_sriov_vf(adev)) 2356 amdgpu_virt_release_full_gpu(adev, true); 2357 2358 return r; 2359 } 2360 2361 /** 2362 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2363 * 2364 * @adev: amdgpu_device pointer 2365 * 2366 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2367 * this function before a GPU reset. If the value is retained after a 2368 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2369 */ 2370 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2371 { 2372 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2373 } 2374 2375 /** 2376 * amdgpu_device_check_vram_lost - check if vram is valid 2377 * 2378 * @adev: amdgpu_device pointer 2379 * 2380 * Checks the reset magic value written to the gart pointer in VRAM. 2381 * The driver calls this after a GPU reset to see if the contents of 2382 * VRAM is lost or now. 2383 * returns true if vram is lost, false if not. 2384 */ 2385 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2386 { 2387 if (memcmp(adev->gart.ptr, adev->reset_magic, 2388 AMDGPU_RESET_MAGIC_NUM)) 2389 return true; 2390 2391 if (!amdgpu_in_reset(adev)) 2392 return false; 2393 2394 /* 2395 * For all ASICs with baco/mode1 reset, the VRAM is 2396 * always assumed to be lost. 2397 */ 2398 switch (amdgpu_asic_reset_method(adev)) { 2399 case AMD_RESET_METHOD_BACO: 2400 case AMD_RESET_METHOD_MODE1: 2401 return true; 2402 default: 2403 return false; 2404 } 2405 } 2406 2407 /** 2408 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2409 * 2410 * @adev: amdgpu_device pointer 2411 * @state: clockgating state (gate or ungate) 2412 * 2413 * The list of all the hardware IPs that make up the asic is walked and the 2414 * set_clockgating_state callbacks are run. 2415 * Late initialization pass enabling clockgating for hardware IPs. 2416 * Fini or suspend, pass disabling clockgating for hardware IPs. 2417 * Returns 0 on success, negative error code on failure. 2418 */ 2419 2420 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2421 enum amd_clockgating_state state) 2422 { 2423 int i, j, r; 2424 2425 if (amdgpu_emu_mode == 1) 2426 return 0; 2427 2428 for (j = 0; j < adev->num_ip_blocks; j++) { 2429 i = state == AMD_CG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 2430 if (!adev->ip_blocks[i].status.late_initialized) 2431 continue; 2432 /* skip CG for GFX on S0ix */ 2433 if (adev->in_s0ix && 2434 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2435 continue; 2436 /* skip CG for VCE/UVD, it's handled specially */ 2437 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2438 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2439 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2440 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2441 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2442 /* enable clockgating to save power */ 2443 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2444 state); 2445 if (r) { 2446 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2447 adev->ip_blocks[i].version->funcs->name, r); 2448 return r; 2449 } 2450 } 2451 } 2452 2453 return 0; 2454 } 2455 2456 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2457 enum amd_powergating_state state) 2458 { 2459 int i, j, r; 2460 2461 if (amdgpu_emu_mode == 1) 2462 return 0; 2463 2464 for (j = 0; j < adev->num_ip_blocks; j++) { 2465 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2466 if (!adev->ip_blocks[i].status.late_initialized) 2467 continue; 2468 /* skip PG for GFX on S0ix */ 2469 if (adev->in_s0ix && 2470 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2471 continue; 2472 /* skip CG for VCE/UVD, it's handled specially */ 2473 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2474 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2475 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2476 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2477 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2478 /* enable powergating to save power */ 2479 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2480 state); 2481 if (r) { 2482 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2483 adev->ip_blocks[i].version->funcs->name, r); 2484 return r; 2485 } 2486 } 2487 } 2488 return 0; 2489 } 2490 2491 static int amdgpu_device_enable_mgpu_fan_boost(void) 2492 { 2493 struct amdgpu_gpu_instance *gpu_ins; 2494 struct amdgpu_device *adev; 2495 int i, ret = 0; 2496 2497 mutex_lock(&mgpu_info.mutex); 2498 2499 /* 2500 * MGPU fan boost feature should be enabled 2501 * only when there are two or more dGPUs in 2502 * the system 2503 */ 2504 if (mgpu_info.num_dgpu < 2) 2505 goto out; 2506 2507 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2508 gpu_ins = &(mgpu_info.gpu_ins[i]); 2509 adev = gpu_ins->adev; 2510 if (!(adev->flags & AMD_IS_APU) && 2511 !gpu_ins->mgpu_fan_enabled) { 2512 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2513 if (ret) 2514 break; 2515 2516 gpu_ins->mgpu_fan_enabled = 1; 2517 } 2518 } 2519 2520 out: 2521 mutex_unlock(&mgpu_info.mutex); 2522 2523 return ret; 2524 } 2525 2526 /** 2527 * amdgpu_device_ip_late_init - run late init for hardware IPs 2528 * 2529 * @adev: amdgpu_device pointer 2530 * 2531 * Late initialization pass for hardware IPs. The list of all the hardware 2532 * IPs that make up the asic is walked and the late_init callbacks are run. 2533 * late_init covers any special initialization that an IP requires 2534 * after all of the have been initialized or something that needs to happen 2535 * late in the init process. 2536 * Returns 0 on success, negative error code on failure. 
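 * This is also the point where clockgating and powergating are enabled,
 * the reset magic is written, mGPU fan boost is turned on and, for XGMI
 * hives, the link p-state is dropped back to the minimum once all nodes
 * have come up.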
2537 */ 2538 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2539 { 2540 struct amdgpu_gpu_instance *gpu_instance; 2541 int i = 0, r; 2542 2543 for (i = 0; i < adev->num_ip_blocks; i++) { 2544 if (!adev->ip_blocks[i].status.hw) 2545 continue; 2546 if (adev->ip_blocks[i].version->funcs->late_init) { 2547 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2548 if (r) { 2549 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2550 adev->ip_blocks[i].version->funcs->name, r); 2551 return r; 2552 } 2553 } 2554 adev->ip_blocks[i].status.late_initialized = true; 2555 } 2556 2557 amdgpu_ras_set_error_query_ready(adev, true); 2558 2559 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2560 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2561 2562 amdgpu_device_fill_reset_magic(adev); 2563 2564 r = amdgpu_device_enable_mgpu_fan_boost(); 2565 if (r) 2566 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2567 2568 /* For XGMI + passthrough configuration on arcturus, enable light SBR */ 2569 if (adev->asic_type == CHIP_ARCTURUS && 2570 amdgpu_passthrough(adev) && 2571 adev->gmc.xgmi.num_physical_nodes > 1) 2572 smu_set_light_sbr(&adev->smu, true); 2573 2574 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2575 mutex_lock(&mgpu_info.mutex); 2576 2577 /* 2578 * Reset the device p-state to low, as it was booted with high. 2579 * 2580 * This should be performed only after all devices from the same 2581 * hive have been initialized. 2582 * 2583 * However, the number of devices in the hive is not known in 2584 * advance; it is counted one by one as each device initializes. 2585 * 2586 * So we wait until all XGMI-interlinked devices are initialized. 2587 * This may add some delay, since those devices may come from 2588 * different hives, but that should be OK.
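 * (The check below only fires once mgpu_info.num_dgpu has reached the
 * hive's number of physical nodes; APU members are skipped.)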
2589 */ 2590 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2591 for (i = 0; i < mgpu_info.num_gpu; i++) { 2592 gpu_instance = &(mgpu_info.gpu_ins[i]); 2593 if (gpu_instance->adev->flags & AMD_IS_APU) 2594 continue; 2595 2596 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2597 AMDGPU_XGMI_PSTATE_MIN); 2598 if (r) { 2599 DRM_ERROR("pstate setting failed (%d).\n", r); 2600 break; 2601 } 2602 } 2603 } 2604 2605 mutex_unlock(&mgpu_info.mutex); 2606 } 2607 2608 return 0; 2609 } 2610 2611 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2612 { 2613 int i, r; 2614 2615 for (i = 0; i < adev->num_ip_blocks; i++) { 2616 if (!adev->ip_blocks[i].version->funcs->early_fini) 2617 continue; 2618 2619 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2620 if (r) { 2621 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2622 adev->ip_blocks[i].version->funcs->name, r); 2623 } 2624 } 2625 2626 amdgpu_amdkfd_suspend(adev, false); 2627 2628 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2629 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2630 2631 /* need to disable SMC first */ 2632 for (i = 0; i < adev->num_ip_blocks; i++) { 2633 if (!adev->ip_blocks[i].status.hw) 2634 continue; 2635 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2636 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2637 /* XXX handle errors */ 2638 if (r) { 2639 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2640 adev->ip_blocks[i].version->funcs->name, r); 2641 } 2642 adev->ip_blocks[i].status.hw = false; 2643 break; 2644 } 2645 } 2646 2647 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2648 if (!adev->ip_blocks[i].status.hw) 2649 continue; 2650 2651 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2652 /* XXX handle errors */ 2653 if (r) { 2654 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2655 adev->ip_blocks[i].version->funcs->name, r); 2656 } 2657 2658 adev->ip_blocks[i].status.hw = false; 2659 } 2660 2661 return 0; 2662 } 2663 2664 /** 2665 * amdgpu_device_ip_fini - run fini for hardware IPs 2666 * 2667 * @adev: amdgpu_device pointer 2668 * 2669 * Main teardown pass for hardware IPs. The list of all the hardware 2670 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2671 * are run. hw_fini tears down the hardware associated with each IP 2672 * and sw_fini tears down any software state associated with each IP. 2673 * Returns 0 on success, negative error code on failure. 
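 * The walk runs in reverse init order; buffers owned by the GMC block
 * (ucode BO, static CSA, writeback, VRAM scratch and the IB pool) are
 * released when that block's turn comes, and late_fini runs in a second
 * pass afterwards.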
2674 */ 2675 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2676 { 2677 int i, r; 2678 2679 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2680 amdgpu_virt_release_ras_err_handler_data(adev); 2681 2682 amdgpu_ras_pre_fini(adev); 2683 2684 if (adev->gmc.xgmi.num_physical_nodes > 1) 2685 amdgpu_xgmi_remove_device(adev); 2686 2687 amdgpu_amdkfd_device_fini_sw(adev); 2688 2689 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2690 if (!adev->ip_blocks[i].status.sw) 2691 continue; 2692 2693 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2694 amdgpu_ucode_free_bo(adev); 2695 amdgpu_free_static_csa(&adev->virt.csa_obj); 2696 amdgpu_device_wb_fini(adev); 2697 amdgpu_device_vram_scratch_fini(adev); 2698 amdgpu_ib_pool_fini(adev); 2699 } 2700 2701 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2702 /* XXX handle errors */ 2703 if (r) { 2704 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2705 adev->ip_blocks[i].version->funcs->name, r); 2706 } 2707 adev->ip_blocks[i].status.sw = false; 2708 adev->ip_blocks[i].status.valid = false; 2709 } 2710 2711 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2712 if (!adev->ip_blocks[i].status.late_initialized) 2713 continue; 2714 if (adev->ip_blocks[i].version->funcs->late_fini) 2715 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2716 adev->ip_blocks[i].status.late_initialized = false; 2717 } 2718 2719 amdgpu_ras_fini(adev); 2720 2721 if (amdgpu_sriov_vf(adev)) 2722 if (amdgpu_virt_release_full_gpu(adev, false)) 2723 DRM_ERROR("failed to release exclusive mode on fini\n"); 2724 2725 return 0; 2726 } 2727 2728 /** 2729 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2730 * 2731 * @work: work_struct. 2732 */ 2733 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2734 { 2735 struct amdgpu_device *adev = 2736 container_of(work, struct amdgpu_device, delayed_init_work.work); 2737 int r; 2738 2739 r = amdgpu_ib_ring_tests(adev); 2740 if (r) 2741 DRM_ERROR("ib ring test failed (%d).\n", r); 2742 } 2743 2744 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2745 { 2746 struct amdgpu_device *adev = 2747 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2748 2749 mutex_lock(&adev->gfx.gfx_off_mutex); 2750 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2751 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2752 adev->gfx.gfx_off_state = true; 2753 } 2754 mutex_unlock(&adev->gfx.gfx_off_mutex); 2755 } 2756 2757 /** 2758 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2759 * 2760 * @adev: amdgpu_device pointer 2761 * 2762 * Main suspend function for hardware IPs. The list of all the hardware 2763 * IPs that make up the asic is walked, clockgating is disabled and the 2764 * suspend callbacks are run. suspend puts the hardware and software state 2765 * in each IP into a state suitable for suspend. 2766 * Returns 0 on success, negative error code on failure. 
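 * Phase 1 only suspends the display (DCE) blocks; all other blocks are
 * handled by amdgpu_device_ip_suspend_phase2().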
2767 */ 2768 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2769 { 2770 int i, r; 2771 2772 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2773 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2774 2775 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2776 if (!adev->ip_blocks[i].status.valid) 2777 continue; 2778 2779 /* displays are handled separately */ 2780 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2781 continue; 2782 2783 /* XXX handle errors */ 2784 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2785 /* XXX handle errors */ 2786 if (r) { 2787 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2788 adev->ip_blocks[i].version->funcs->name, r); 2789 return r; 2790 } 2791 2792 adev->ip_blocks[i].status.hw = false; 2793 } 2794 2795 return 0; 2796 } 2797 2798 /** 2799 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2800 * 2801 * @adev: amdgpu_device pointer 2802 * 2803 * Main suspend function for hardware IPs. The list of all the hardware 2804 * IPs that make up the asic is walked, clockgating is disabled and the 2805 * suspend callbacks are run. suspend puts the hardware and software state 2806 * in each IP into a state suitable for suspend. 2807 * Returns 0 on success, negative error code on failure. 2808 */ 2809 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2810 { 2811 int i, r; 2812 2813 if (adev->in_s0ix) 2814 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2815 2816 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2817 if (!adev->ip_blocks[i].status.valid) 2818 continue; 2819 /* displays are handled in phase1 */ 2820 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2821 continue; 2822 /* PSP lost connection when err_event_athub occurs */ 2823 if (amdgpu_ras_intr_triggered() && 2824 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2825 adev->ip_blocks[i].status.hw = false; 2826 continue; 2827 } 2828 2829 /* skip unnecessary suspend if we do not initialize them yet */ 2830 if (adev->gmc.xgmi.pending_reset && 2831 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2832 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2833 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2834 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2835 adev->ip_blocks[i].status.hw = false; 2836 continue; 2837 } 2838 2839 /* skip suspend of gfx and psp for S0ix 2840 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2841 * like at runtime. PSP is also part of the always on hardware 2842 * so no need to suspend it. 
2843 */ 2844 if (adev->in_s0ix && 2845 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2846 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2847 continue; 2848 2849 /* XXX handle errors */ 2850 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2851 /* XXX handle errors */ 2852 if (r) { 2853 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2854 adev->ip_blocks[i].version->funcs->name, r); 2855 } 2856 adev->ip_blocks[i].status.hw = false; 2857 /* handle putting the SMC in the appropriate state */ 2858 if(!amdgpu_sriov_vf(adev)){ 2859 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2860 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2861 if (r) { 2862 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2863 adev->mp1_state, r); 2864 return r; 2865 } 2866 } 2867 } 2868 } 2869 2870 return 0; 2871 } 2872 2873 /** 2874 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2875 * 2876 * @adev: amdgpu_device pointer 2877 * 2878 * Main suspend function for hardware IPs. The list of all the hardware 2879 * IPs that make up the asic is walked, clockgating is disabled and the 2880 * suspend callbacks are run. suspend puts the hardware and software state 2881 * in each IP into a state suitable for suspend. 2882 * Returns 0 on success, negative error code on failure. 2883 */ 2884 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2885 { 2886 int r; 2887 2888 if (amdgpu_sriov_vf(adev)) { 2889 amdgpu_virt_fini_data_exchange(adev); 2890 amdgpu_virt_request_full_gpu(adev, false); 2891 } 2892 2893 r = amdgpu_device_ip_suspend_phase1(adev); 2894 if (r) 2895 return r; 2896 r = amdgpu_device_ip_suspend_phase2(adev); 2897 2898 if (amdgpu_sriov_vf(adev)) 2899 amdgpu_virt_release_full_gpu(adev, false); 2900 2901 return r; 2902 } 2903 2904 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2905 { 2906 int i, r; 2907 2908 static enum amd_ip_block_type ip_order[] = { 2909 AMD_IP_BLOCK_TYPE_GMC, 2910 AMD_IP_BLOCK_TYPE_COMMON, 2911 AMD_IP_BLOCK_TYPE_PSP, 2912 AMD_IP_BLOCK_TYPE_IH, 2913 }; 2914 2915 for (i = 0; i < adev->num_ip_blocks; i++) { 2916 int j; 2917 struct amdgpu_ip_block *block; 2918 2919 block = &adev->ip_blocks[i]; 2920 block->status.hw = false; 2921 2922 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2923 2924 if (block->version->type != ip_order[j] || 2925 !block->status.valid) 2926 continue; 2927 2928 r = block->version->funcs->hw_init(adev); 2929 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2930 if (r) 2931 return r; 2932 block->status.hw = true; 2933 } 2934 } 2935 2936 return 0; 2937 } 2938 2939 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2940 { 2941 int i, r; 2942 2943 static enum amd_ip_block_type ip_order[] = { 2944 AMD_IP_BLOCK_TYPE_SMC, 2945 AMD_IP_BLOCK_TYPE_DCE, 2946 AMD_IP_BLOCK_TYPE_GFX, 2947 AMD_IP_BLOCK_TYPE_SDMA, 2948 AMD_IP_BLOCK_TYPE_UVD, 2949 AMD_IP_BLOCK_TYPE_VCE, 2950 AMD_IP_BLOCK_TYPE_VCN 2951 }; 2952 2953 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2954 int j; 2955 struct amdgpu_ip_block *block; 2956 2957 for (j = 0; j < adev->num_ip_blocks; j++) { 2958 block = &adev->ip_blocks[j]; 2959 2960 if (block->version->type != ip_order[i] || 2961 !block->status.valid || 2962 block->status.hw) 2963 continue; 2964 2965 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2966 r = block->version->funcs->resume(adev); 2967 else 2968 r = block->version->funcs->hw_init(adev); 2969 2970 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 2971 if (r) 2972 return r; 2973 block->status.hw = true; 2974 } 2975 } 2976 2977 return 0; 2978 } 2979 2980 /** 2981 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2982 * 2983 * @adev: amdgpu_device pointer 2984 * 2985 * First resume function for hardware IPs. The list of all the hardware 2986 * IPs that make up the asic is walked and the resume callbacks are run for 2987 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2988 * after a suspend and updates the software state as necessary. This 2989 * function is also used for restoring the GPU after a GPU reset. 2990 * Returns 0 on success, negative error code on failure. 2991 */ 2992 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2993 { 2994 int i, r; 2995 2996 for (i = 0; i < adev->num_ip_blocks; i++) { 2997 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2998 continue; 2999 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3000 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3001 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3002 3003 r = adev->ip_blocks[i].version->funcs->resume(adev); 3004 if (r) { 3005 DRM_ERROR("resume of IP block <%s> failed %d\n", 3006 adev->ip_blocks[i].version->funcs->name, r); 3007 return r; 3008 } 3009 adev->ip_blocks[i].status.hw = true; 3010 } 3011 } 3012 3013 return 0; 3014 } 3015 3016 /** 3017 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3018 * 3019 * @adev: amdgpu_device pointer 3020 * 3021 * Second resume function for hardware IPs. The list of all the hardware 3022 * IPs that make up the asic is walked and the resume callbacks are run for 3023 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3024 * functional state after a suspend and updates the software state as 3025 * necessary. This function is also used for restoring the GPU after a GPU 3026 * reset. 3027 * Returns 0 on success, negative error code on failure. 3028 */ 3029 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3030 { 3031 int i, r; 3032 3033 for (i = 0; i < adev->num_ip_blocks; i++) { 3034 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3035 continue; 3036 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3037 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3038 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3039 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3040 continue; 3041 r = adev->ip_blocks[i].version->funcs->resume(adev); 3042 if (r) { 3043 DRM_ERROR("resume of IP block <%s> failed %d\n", 3044 adev->ip_blocks[i].version->funcs->name, r); 3045 return r; 3046 } 3047 adev->ip_blocks[i].status.hw = true; 3048 } 3049 3050 return 0; 3051 } 3052 3053 /** 3054 * amdgpu_device_ip_resume - run resume for hardware IPs 3055 * 3056 * @adev: amdgpu_device pointer 3057 * 3058 * Main resume function for hardware IPs. The hardware IPs 3059 * are split into two resume functions because they are 3060 * also used in recovering from a GPU reset and some additional 3061 * steps need to be taken between them. In this case (S3/S4) they are 3062 * run sequentially. 3063 * Returns 0 on success, negative error code on failure.
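 * Firmware loading (amdgpu_device_fw_loading) is slotted in between the
 * two phases so that PSP is brought back up and the SMU firmware is
 * loaded before the remaining blocks resume.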
3064 */ 3065 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3066 { 3067 int r; 3068 3069 r = amdgpu_device_ip_resume_phase1(adev); 3070 if (r) 3071 return r; 3072 3073 r = amdgpu_device_fw_loading(adev); 3074 if (r) 3075 return r; 3076 3077 r = amdgpu_device_ip_resume_phase2(adev); 3078 3079 return r; 3080 } 3081 3082 /** 3083 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3084 * 3085 * @adev: amdgpu_device pointer 3086 * 3087 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3088 */ 3089 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3090 { 3091 if (amdgpu_sriov_vf(adev)) { 3092 if (adev->is_atom_fw) { 3093 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3094 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3095 } else { 3096 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3097 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3098 } 3099 3100 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3101 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3102 } 3103 } 3104 3105 /** 3106 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3107 * 3108 * @asic_type: AMD asic type 3109 * 3110 * Check if there is DC (new modesetting infrastructre) support for an asic. 3111 * returns true if DC has support, false if not. 3112 */ 3113 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3114 { 3115 switch (asic_type) { 3116 #if defined(CONFIG_DRM_AMD_DC) 3117 #if defined(CONFIG_DRM_AMD_DC_SI) 3118 case CHIP_TAHITI: 3119 case CHIP_PITCAIRN: 3120 case CHIP_VERDE: 3121 case CHIP_OLAND: 3122 #endif 3123 case CHIP_BONAIRE: 3124 case CHIP_KAVERI: 3125 case CHIP_KABINI: 3126 case CHIP_MULLINS: 3127 /* 3128 * We have systems in the wild with these ASICs that require 3129 * LVDS and VGA support which is not supported with DC. 3130 * 3131 * Fallback to the non-DC driver here by default so as not to 3132 * cause regressions. 
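 * Passing amdgpu.dc=1 on the kernel command line still opts these ASICs
 * in to DC explicitly (amdgpu_dc > 0 below).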
3133 */ 3134 return amdgpu_dc > 0; 3135 case CHIP_HAWAII: 3136 case CHIP_CARRIZO: 3137 case CHIP_STONEY: 3138 case CHIP_POLARIS10: 3139 case CHIP_POLARIS11: 3140 case CHIP_POLARIS12: 3141 case CHIP_VEGAM: 3142 case CHIP_TONGA: 3143 case CHIP_FIJI: 3144 case CHIP_VEGA10: 3145 case CHIP_VEGA12: 3146 case CHIP_VEGA20: 3147 #if defined(CONFIG_DRM_AMD_DC_DCN) 3148 case CHIP_RAVEN: 3149 case CHIP_NAVI10: 3150 case CHIP_NAVI14: 3151 case CHIP_NAVI12: 3152 case CHIP_RENOIR: 3153 case CHIP_SIENNA_CICHLID: 3154 case CHIP_NAVY_FLOUNDER: 3155 case CHIP_DIMGREY_CAVEFISH: 3156 case CHIP_BEIGE_GOBY: 3157 case CHIP_VANGOGH: 3158 case CHIP_YELLOW_CARP: 3159 #endif 3160 return amdgpu_dc != 0; 3161 #endif 3162 default: 3163 if (amdgpu_dc > 0) 3164 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3165 "but isn't supported by ASIC, ignoring\n"); 3166 return false; 3167 } 3168 } 3169 3170 /** 3171 * amdgpu_device_has_dc_support - check if dc is supported 3172 * 3173 * @adev: amdgpu_device pointer 3174 * 3175 * Returns true for supported, false for not supported 3176 */ 3177 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3178 { 3179 if (amdgpu_sriov_vf(adev) || 3180 adev->enable_virtual_display || 3181 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3182 return false; 3183 3184 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3185 } 3186 3187 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3188 { 3189 struct amdgpu_device *adev = 3190 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3191 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3192 3193 /* It's a bug to not have a hive within this function */ 3194 if (WARN_ON(!hive)) 3195 return; 3196 3197 /* 3198 * Use task barrier to synchronize all xgmi reset works across the 3199 * hive. task_barrier_enter and task_barrier_exit will block 3200 * until all the threads running the xgmi reset works reach 3201 * those points. task_barrier_full will do both blocks. 3202 */ 3203 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3204 3205 task_barrier_enter(&hive->tb); 3206 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3207 3208 if (adev->asic_reset_res) 3209 goto fail; 3210 3211 task_barrier_exit(&hive->tb); 3212 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3213 3214 if (adev->asic_reset_res) 3215 goto fail; 3216 3217 if (adev->mmhub.ras_funcs && 3218 adev->mmhub.ras_funcs->reset_ras_error_count) 3219 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 3220 } else { 3221 3222 task_barrier_full(&hive->tb); 3223 adev->asic_reset_res = amdgpu_asic_reset(adev); 3224 } 3225 3226 fail: 3227 if (adev->asic_reset_res) 3228 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3229 adev->asic_reset_res, adev_to_drm(adev)->unique); 3230 amdgpu_put_xgmi_hive(hive); 3231 } 3232 3233 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3234 { 3235 char *input = amdgpu_lockup_timeout; 3236 char *timeout_setting = NULL; 3237 int index = 0; 3238 long timeout; 3239 int ret = 0; 3240 3241 /* 3242 * By default timeout for non compute jobs is 10000 3243 * and 60000 for compute jobs. 3244 * In SR-IOV or passthrough mode, timeout for compute 3245 * jobs are 60000 by default. 3246 */ 3247 adev->gfx_timeout = msecs_to_jiffies(10000); 3248 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3249 if (amdgpu_sriov_vf(adev)) 3250 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3251 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3252 else 3253 adev->compute_timeout = msecs_to_jiffies(60000); 3254 3255 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3256 while ((timeout_setting = strsep(&input, ",")) && 3257 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3258 ret = kstrtol(timeout_setting, 0, &timeout); 3259 if (ret) 3260 return ret; 3261 3262 if (timeout == 0) { 3263 index++; 3264 continue; 3265 } else if (timeout < 0) { 3266 timeout = MAX_SCHEDULE_TIMEOUT; 3267 } else { 3268 timeout = msecs_to_jiffies(timeout); 3269 } 3270 3271 switch (index++) { 3272 case 0: 3273 adev->gfx_timeout = timeout; 3274 break; 3275 case 1: 3276 adev->compute_timeout = timeout; 3277 break; 3278 case 2: 3279 adev->sdma_timeout = timeout; 3280 break; 3281 case 3: 3282 adev->video_timeout = timeout; 3283 break; 3284 default: 3285 break; 3286 } 3287 } 3288 /* 3289 * There is only one value specified and 3290 * it should apply to all non-compute jobs. 3291 */ 3292 if (index == 1) { 3293 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3294 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3295 adev->compute_timeout = adev->gfx_timeout; 3296 } 3297 } 3298 3299 return ret; 3300 } 3301 3302 static const struct attribute *amdgpu_dev_attributes[] = { 3303 &dev_attr_product_name.attr, 3304 &dev_attr_product_number.attr, 3305 &dev_attr_serial_number.attr, 3306 &dev_attr_pcie_replay_count.attr, 3307 NULL 3308 }; 3309 3310 /** 3311 * amdgpu_device_init - initialize the driver 3312 * 3313 * @adev: amdgpu_device pointer 3314 * @flags: driver flags 3315 * 3316 * Initializes the driver info and hw (all asics). 3317 * Returns 0 for success or an error on failure. 3318 * Called at driver startup. 3319 */ 3320 int amdgpu_device_init(struct amdgpu_device *adev, 3321 uint32_t flags) 3322 { 3323 struct drm_device *ddev = adev_to_drm(adev); 3324 struct pci_dev *pdev = adev->pdev; 3325 int r, i; 3326 bool px = false; 3327 u32 max_MBps; 3328 3329 adev->shutdown = false; 3330 adev->flags = flags; 3331 3332 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3333 adev->asic_type = amdgpu_force_asic_type; 3334 else 3335 adev->asic_type = flags & AMD_ASIC_MASK; 3336 3337 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3338 if (amdgpu_emu_mode == 1) 3339 adev->usec_timeout *= 10; 3340 adev->gmc.gart_size = 512 * 1024 * 1024; 3341 adev->accel_working = false; 3342 adev->num_rings = 0; 3343 adev->mman.buffer_funcs = NULL; 3344 adev->mman.buffer_funcs_ring = NULL; 3345 adev->vm_manager.vm_pte_funcs = NULL; 3346 adev->vm_manager.vm_pte_num_scheds = 0; 3347 adev->gmc.gmc_funcs = NULL; 3348 adev->harvest_ip_mask = 0x0; 3349 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3350 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3351 3352 adev->smc_rreg = &amdgpu_invalid_rreg; 3353 adev->smc_wreg = &amdgpu_invalid_wreg; 3354 adev->pcie_rreg = &amdgpu_invalid_rreg; 3355 adev->pcie_wreg = &amdgpu_invalid_wreg; 3356 adev->pciep_rreg = &amdgpu_invalid_rreg; 3357 adev->pciep_wreg = &amdgpu_invalid_wreg; 3358 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3359 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3360 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3361 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3362 adev->didt_rreg = &amdgpu_invalid_rreg; 3363 adev->didt_wreg = &amdgpu_invalid_wreg; 3364 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3365 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3366 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3367 
adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3368 3369 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3370 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3371 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3372 3373 /* mutex initialization are all done here so we 3374 * can recall function without having locking issues */ 3375 mutex_init(&adev->firmware.mutex); 3376 mutex_init(&adev->pm.mutex); 3377 mutex_init(&adev->gfx.gpu_clock_mutex); 3378 mutex_init(&adev->srbm_mutex); 3379 mutex_init(&adev->gfx.pipe_reserve_mutex); 3380 mutex_init(&adev->gfx.gfx_off_mutex); 3381 mutex_init(&adev->grbm_idx_mutex); 3382 mutex_init(&adev->mn_lock); 3383 mutex_init(&adev->virt.vf_errors.lock); 3384 hash_init(adev->mn_hash); 3385 atomic_set(&adev->in_gpu_reset, 0); 3386 init_rwsem(&adev->reset_sem); 3387 mutex_init(&adev->psp.mutex); 3388 mutex_init(&adev->notifier_lock); 3389 3390 r = amdgpu_device_check_arguments(adev); 3391 if (r) 3392 return r; 3393 3394 spin_lock_init(&adev->mmio_idx_lock); 3395 spin_lock_init(&adev->smc_idx_lock); 3396 spin_lock_init(&adev->pcie_idx_lock); 3397 spin_lock_init(&adev->uvd_ctx_idx_lock); 3398 spin_lock_init(&adev->didt_idx_lock); 3399 spin_lock_init(&adev->gc_cac_idx_lock); 3400 spin_lock_init(&adev->se_cac_idx_lock); 3401 spin_lock_init(&adev->audio_endpt_idx_lock); 3402 spin_lock_init(&adev->mm_stats.lock); 3403 3404 INIT_LIST_HEAD(&adev->shadow_list); 3405 mutex_init(&adev->shadow_list_lock); 3406 3407 INIT_LIST_HEAD(&adev->reset_list); 3408 3409 INIT_DELAYED_WORK(&adev->delayed_init_work, 3410 amdgpu_device_delayed_init_work_handler); 3411 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3412 amdgpu_device_delay_enable_gfx_off); 3413 3414 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3415 3416 adev->gfx.gfx_off_req_count = 1; 3417 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3418 3419 atomic_set(&adev->throttling_logging_enabled, 1); 3420 /* 3421 * If throttling continues, logging will be performed every minute 3422 * to avoid log flooding. "-1" is subtracted since the thermal 3423 * throttling interrupt comes every second. Thus, the total logging 3424 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3425 * for throttling interrupt) = 60 seconds. 
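 * I.e. the ratelimit below allows a burst of one message per 59 second
 * window, which works out to at most one throttling warning per minute.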
3426 */ 3427 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3428 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3429 3430 /* Registers mapping */ 3431 /* TODO: block userspace mapping of io register */ 3432 if (adev->asic_type >= CHIP_BONAIRE) { 3433 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3434 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3435 } else { 3436 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3437 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3438 } 3439 3440 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3441 if (adev->rmmio == NULL) { 3442 return -ENOMEM; 3443 } 3444 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3445 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3446 3447 /* enable PCIE atomic ops */ 3448 r = pci_enable_atomic_ops_to_root(adev->pdev, 3449 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3450 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3451 if (r) { 3452 adev->have_atomics_support = false; 3453 DRM_INFO("PCIE atomic ops is not supported\n"); 3454 } else { 3455 adev->have_atomics_support = true; 3456 } 3457 3458 amdgpu_device_get_pcie_info(adev); 3459 3460 if (amdgpu_mcbp) 3461 DRM_INFO("MCBP is enabled\n"); 3462 3463 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3464 adev->enable_mes = true; 3465 3466 /* detect hw virtualization here */ 3467 amdgpu_detect_virtualization(adev); 3468 3469 r = amdgpu_device_get_job_timeout_settings(adev); 3470 if (r) { 3471 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3472 goto failed_unmap; 3473 } 3474 3475 /* early init functions */ 3476 r = amdgpu_device_ip_early_init(adev); 3477 if (r) 3478 goto failed_unmap; 3479 3480 /* doorbell bar mapping and doorbell index init*/ 3481 amdgpu_device_doorbell_init(adev); 3482 3483 if (amdgpu_emu_mode == 1) { 3484 /* post the asic on emulation mode */ 3485 emu_soc_asic_init(adev); 3486 goto fence_driver_init; 3487 } 3488 3489 amdgpu_reset_init(adev); 3490 3491 /* detect if we are with an SRIOV vbios */ 3492 amdgpu_device_detect_sriov_bios(adev); 3493 3494 /* check if we need to reset the asic 3495 * E.g., driver was not cleanly unloaded previously, etc. 
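 * For devices in an XGMI hive the reset is deferred: only the blocks the
 * SMU needs (GMC, COMMON, IH, SMC) are left enabled for hw_init below and
 * the whole hive is reset later via the delayed mgpu reset work.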
3496 */ 3497 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3498 if (adev->gmc.xgmi.num_physical_nodes) { 3499 dev_info(adev->dev, "Pending hive reset.\n"); 3500 adev->gmc.xgmi.pending_reset = true; 3501 /* Only need to init necessary block for SMU to handle the reset */ 3502 for (i = 0; i < adev->num_ip_blocks; i++) { 3503 if (!adev->ip_blocks[i].status.valid) 3504 continue; 3505 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3506 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3507 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3508 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3509 DRM_DEBUG("IP %s disabled for hw_init.\n", 3510 adev->ip_blocks[i].version->funcs->name); 3511 adev->ip_blocks[i].status.hw = true; 3512 } 3513 } 3514 } else { 3515 r = amdgpu_asic_reset(adev); 3516 if (r) { 3517 dev_err(adev->dev, "asic reset on init failed\n"); 3518 goto failed; 3519 } 3520 } 3521 } 3522 3523 pci_enable_pcie_error_reporting(adev->pdev); 3524 3525 /* Post card if necessary */ 3526 if (amdgpu_device_need_post(adev)) { 3527 if (!adev->bios) { 3528 dev_err(adev->dev, "no vBIOS found\n"); 3529 r = -EINVAL; 3530 goto failed; 3531 } 3532 DRM_INFO("GPU posting now...\n"); 3533 r = amdgpu_device_asic_init(adev); 3534 if (r) { 3535 dev_err(adev->dev, "gpu post error!\n"); 3536 goto failed; 3537 } 3538 } 3539 3540 if (adev->is_atom_fw) { 3541 /* Initialize clocks */ 3542 r = amdgpu_atomfirmware_get_clock_info(adev); 3543 if (r) { 3544 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3545 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3546 goto failed; 3547 } 3548 } else { 3549 /* Initialize clocks */ 3550 r = amdgpu_atombios_get_clock_info(adev); 3551 if (r) { 3552 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3553 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3554 goto failed; 3555 } 3556 /* init i2c buses */ 3557 if (!amdgpu_device_has_dc_support(adev)) 3558 amdgpu_atombios_i2c_init(adev); 3559 } 3560 3561 fence_driver_init: 3562 /* Fence driver */ 3563 r = amdgpu_fence_driver_init(adev); 3564 if (r) { 3565 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3566 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3567 goto failed; 3568 } 3569 3570 /* init the mode config */ 3571 drm_mode_config_init(adev_to_drm(adev)); 3572 3573 r = amdgpu_device_ip_init(adev); 3574 if (r) { 3575 /* failed in exclusive mode due to timeout */ 3576 if (amdgpu_sriov_vf(adev) && 3577 !amdgpu_sriov_runtime(adev) && 3578 amdgpu_virt_mmio_blocked(adev) && 3579 !amdgpu_virt_wait_reset(adev)) { 3580 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3581 /* Don't send request since VF is inactive. */ 3582 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3583 adev->virt.ops = NULL; 3584 r = -EAGAIN; 3585 goto release_ras_con; 3586 } 3587 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3588 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3589 goto release_ras_con; 3590 } 3591 3592 dev_info(adev->dev, 3593 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3594 adev->gfx.config.max_shader_engines, 3595 adev->gfx.config.max_sh_per_se, 3596 adev->gfx.config.max_cu_per_sh, 3597 adev->gfx.cu_info.number); 3598 3599 adev->accel_working = true; 3600 3601 amdgpu_vm_check_compute_bug(adev); 3602 3603 /* Initialize the buffer migration limit. 
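 * The moverate module parameter (amdgpu_moverate) is given in MB/s; a
 * negative value falls back to the 8 MB/s default, and the result is
 * stored as a log2 so that later divisions are cheap.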
*/ 3604 if (amdgpu_moverate >= 0) 3605 max_MBps = amdgpu_moverate; 3606 else 3607 max_MBps = 8; /* Allow 8 MB/s. */ 3608 /* Get a log2 for easy divisions. */ 3609 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3610 3611 amdgpu_fbdev_init(adev); 3612 3613 r = amdgpu_pm_sysfs_init(adev); 3614 if (r) { 3615 adev->pm_sysfs_en = false; 3616 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3617 } else 3618 adev->pm_sysfs_en = true; 3619 3620 r = amdgpu_ucode_sysfs_init(adev); 3621 if (r) { 3622 adev->ucode_sysfs_en = false; 3623 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3624 } else 3625 adev->ucode_sysfs_en = true; 3626 3627 if ((amdgpu_testing & 1)) { 3628 if (adev->accel_working) 3629 amdgpu_test_moves(adev); 3630 else 3631 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3632 } 3633 if (amdgpu_benchmarking) { 3634 if (adev->accel_working) 3635 amdgpu_benchmark(adev, amdgpu_benchmarking); 3636 else 3637 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3638 } 3639 3640 /* 3641 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3642 * Otherwise the mgpu fan boost feature will be skipped due to the 3643 * gpu instance is counted less. 3644 */ 3645 amdgpu_register_gpu_instance(adev); 3646 3647 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3648 * explicit gating rather than handling it automatically. 3649 */ 3650 if (!adev->gmc.xgmi.pending_reset) { 3651 r = amdgpu_device_ip_late_init(adev); 3652 if (r) { 3653 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3654 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3655 goto release_ras_con; 3656 } 3657 /* must succeed. */ 3658 amdgpu_ras_resume(adev); 3659 queue_delayed_work(system_wq, &adev->delayed_init_work, 3660 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3661 } 3662 3663 if (amdgpu_sriov_vf(adev)) 3664 flush_delayed_work(&adev->delayed_init_work); 3665 3666 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3667 if (r) 3668 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3669 3670 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3671 r = amdgpu_pmu_init(adev); 3672 if (r) 3673 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3674 3675 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3676 if (amdgpu_device_cache_pci_state(adev->pdev)) 3677 pci_restore_state(pdev); 3678 3679 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3680 /* this will fail for cards that aren't VGA class devices, just 3681 * ignore it */ 3682 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3683 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3684 3685 if (amdgpu_device_supports_px(ddev)) { 3686 px = true; 3687 vga_switcheroo_register_client(adev->pdev, 3688 &amdgpu_switcheroo_ops, px); 3689 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3690 } 3691 3692 if (adev->gmc.xgmi.pending_reset) 3693 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3694 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3695 3696 return 0; 3697 3698 release_ras_con: 3699 amdgpu_release_ras_context(adev); 3700 3701 failed: 3702 amdgpu_vf_error_trans_all(adev); 3703 3704 failed_unmap: 3705 iounmap(adev->rmmio); 3706 adev->rmmio = NULL; 3707 3708 return r; 3709 } 3710 3711 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3712 { 3713 /* Clear all CPU mappings pointing to this device */ 3714 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3715 
3716 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3717 amdgpu_device_doorbell_fini(adev); 3718 3719 iounmap(adev->rmmio); 3720 adev->rmmio = NULL; 3721 if (adev->mman.aper_base_kaddr) 3722 iounmap(adev->mman.aper_base_kaddr); 3723 adev->mman.aper_base_kaddr = NULL; 3724 3725 /* Memory manager related */ 3726 if (!adev->gmc.xgmi.connected_to_cpu) { 3727 arch_phys_wc_del(adev->gmc.vram_mtrr); 3728 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3729 } 3730 } 3731 3732 /** 3733 * amdgpu_device_fini - tear down the driver 3734 * 3735 * @adev: amdgpu_device pointer 3736 * 3737 * Tear down the driver info (all asics). 3738 * Called at driver shutdown. 3739 */ 3740 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3741 { 3742 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3743 flush_delayed_work(&adev->delayed_init_work); 3744 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3745 adev->shutdown = true; 3746 3747 /* make sure IB test finished before entering exclusive mode 3748 * to avoid preemption on IB test 3749 * */ 3750 if (amdgpu_sriov_vf(adev)) { 3751 amdgpu_virt_request_full_gpu(adev, false); 3752 amdgpu_virt_fini_data_exchange(adev); 3753 } 3754 3755 /* disable all interrupts */ 3756 amdgpu_irq_disable_all(adev); 3757 if (adev->mode_info.mode_config_initialized){ 3758 if (!amdgpu_device_has_dc_support(adev)) 3759 drm_helper_force_disable_all(adev_to_drm(adev)); 3760 else 3761 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3762 } 3763 amdgpu_fence_driver_fini_hw(adev); 3764 3765 if (adev->pm_sysfs_en) 3766 amdgpu_pm_sysfs_fini(adev); 3767 if (adev->ucode_sysfs_en) 3768 amdgpu_ucode_sysfs_fini(adev); 3769 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3770 3771 amdgpu_fbdev_fini(adev); 3772 3773 amdgpu_irq_fini_hw(adev); 3774 3775 amdgpu_device_ip_fini_early(adev); 3776 3777 amdgpu_gart_dummy_page_fini(adev); 3778 3779 amdgpu_device_unmap_mmio(adev); 3780 } 3781 3782 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 3783 { 3784 amdgpu_device_ip_fini(adev); 3785 amdgpu_fence_driver_fini_sw(adev); 3786 release_firmware(adev->firmware.gpu_info_fw); 3787 adev->firmware.gpu_info_fw = NULL; 3788 adev->accel_working = false; 3789 3790 amdgpu_reset_fini(adev); 3791 3792 /* free i2c buses */ 3793 if (!amdgpu_device_has_dc_support(adev)) 3794 amdgpu_i2c_fini(adev); 3795 3796 if (amdgpu_emu_mode != 1) 3797 amdgpu_atombios_fini(adev); 3798 3799 kfree(adev->bios); 3800 adev->bios = NULL; 3801 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 3802 vga_switcheroo_unregister_client(adev->pdev); 3803 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3804 } 3805 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3806 vga_client_unregister(adev->pdev); 3807 3808 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3809 amdgpu_pmu_fini(adev); 3810 if (adev->mman.discovery_bin) 3811 amdgpu_discovery_fini(adev); 3812 3813 kfree(adev->pci_state); 3814 3815 } 3816 3817 3818 /* 3819 * Suspend & resume. 3820 */ 3821 /** 3822 * amdgpu_device_suspend - initiate device suspend 3823 * 3824 * @dev: drm dev pointer 3825 * @fbcon : notify the fbdev of suspend 3826 * 3827 * Puts the hw in the suspend state (all asics). 3828 * Returns 0 for success or an error on failure. 3829 * Called at driver suspend. 
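 * VRAM is evicted twice on the way down: once before the non-display IP
 * blocks are suspended and once more afterwards so the GART page table
 * itself can be evicted using the CPU.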
3830 */ 3831 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3832 { 3833 struct amdgpu_device *adev = drm_to_adev(dev); 3834 3835 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3836 return 0; 3837 3838 adev->in_suspend = true; 3839 3840 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 3841 DRM_WARN("smart shift update failed\n"); 3842 3843 drm_kms_helper_poll_disable(dev); 3844 3845 if (fbcon) 3846 amdgpu_fbdev_set_suspend(adev, 1); 3847 3848 cancel_delayed_work_sync(&adev->delayed_init_work); 3849 3850 amdgpu_ras_suspend(adev); 3851 3852 amdgpu_device_ip_suspend_phase1(adev); 3853 3854 if (!adev->in_s0ix) 3855 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 3856 3857 /* evict vram memory */ 3858 amdgpu_bo_evict_vram(adev); 3859 3860 amdgpu_fence_driver_suspend(adev); 3861 3862 amdgpu_device_ip_suspend_phase2(adev); 3863 /* evict remaining vram memory 3864 * This second call to evict vram is to evict the gart page table 3865 * using the CPU. 3866 */ 3867 amdgpu_bo_evict_vram(adev); 3868 3869 return 0; 3870 } 3871 3872 /** 3873 * amdgpu_device_resume - initiate device resume 3874 * 3875 * @dev: drm dev pointer 3876 * @fbcon : notify the fbdev of resume 3877 * 3878 * Bring the hw back to operating state (all asics). 3879 * Returns 0 for success or an error on failure. 3880 * Called at driver resume. 3881 */ 3882 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3883 { 3884 struct amdgpu_device *adev = drm_to_adev(dev); 3885 int r = 0; 3886 3887 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3888 return 0; 3889 3890 if (adev->in_s0ix) 3891 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3892 3893 /* post card */ 3894 if (amdgpu_device_need_post(adev)) { 3895 r = amdgpu_device_asic_init(adev); 3896 if (r) 3897 dev_err(adev->dev, "amdgpu asic init failed\n"); 3898 } 3899 3900 r = amdgpu_device_ip_resume(adev); 3901 if (r) { 3902 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3903 return r; 3904 } 3905 amdgpu_fence_driver_resume(adev); 3906 3907 3908 r = amdgpu_device_ip_late_init(adev); 3909 if (r) 3910 return r; 3911 3912 queue_delayed_work(system_wq, &adev->delayed_init_work, 3913 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3914 3915 if (!adev->in_s0ix) { 3916 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 3917 if (r) 3918 return r; 3919 } 3920 3921 /* Make sure IB tests flushed */ 3922 flush_delayed_work(&adev->delayed_init_work); 3923 3924 if (fbcon) 3925 amdgpu_fbdev_set_suspend(adev, 0); 3926 3927 drm_kms_helper_poll_enable(dev); 3928 3929 amdgpu_ras_resume(adev); 3930 3931 /* 3932 * Most of the connector probing functions try to acquire runtime pm 3933 * refs to ensure that the GPU is powered on when connector polling is 3934 * performed. Since we're calling this from a runtime PM callback, 3935 * trying to acquire rpm refs will cause us to deadlock. 3936 * 3937 * Since we're guaranteed to be holding the rpm lock, it's safe to 3938 * temporarily disable the rpm helpers so this doesn't deadlock us. 
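 *
 * The power.disable_depth bump below is what actually keeps the runtime
 * PM helpers out of the way while the hotplug event is delivered; the
 * matching decrement re-enables them right afterwards.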
3939 */ 3940 #ifdef CONFIG_PM 3941 dev->dev->power.disable_depth++; 3942 #endif 3943 if (!amdgpu_device_has_dc_support(adev)) 3944 drm_helper_hpd_irq_event(dev); 3945 else 3946 drm_kms_helper_hotplug_event(dev); 3947 #ifdef CONFIG_PM 3948 dev->dev->power.disable_depth--; 3949 #endif 3950 adev->in_suspend = false; 3951 3952 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 3953 DRM_WARN("smart shift update failed\n"); 3954 3955 return 0; 3956 } 3957 3958 /** 3959 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3960 * 3961 * @adev: amdgpu_device pointer 3962 * 3963 * The list of all the hardware IPs that make up the asic is walked and 3964 * the check_soft_reset callbacks are run. check_soft_reset determines 3965 * if the asic is still hung or not. 3966 * Returns true if any of the IPs are still in a hung state, false if not. 3967 */ 3968 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3969 { 3970 int i; 3971 bool asic_hang = false; 3972 3973 if (amdgpu_sriov_vf(adev)) 3974 return true; 3975 3976 if (amdgpu_asic_need_full_reset(adev)) 3977 return true; 3978 3979 for (i = 0; i < adev->num_ip_blocks; i++) { 3980 if (!adev->ip_blocks[i].status.valid) 3981 continue; 3982 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3983 adev->ip_blocks[i].status.hang = 3984 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3985 if (adev->ip_blocks[i].status.hang) { 3986 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3987 asic_hang = true; 3988 } 3989 } 3990 return asic_hang; 3991 } 3992 3993 /** 3994 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3995 * 3996 * @adev: amdgpu_device pointer 3997 * 3998 * The list of all the hardware IPs that make up the asic is walked and the 3999 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4000 * handles any IP specific hardware or software state changes that are 4001 * necessary for a soft reset to succeed. 4002 * Returns 0 on success, negative error code on failure. 4003 */ 4004 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4005 { 4006 int i, r = 0; 4007 4008 for (i = 0; i < adev->num_ip_blocks; i++) { 4009 if (!adev->ip_blocks[i].status.valid) 4010 continue; 4011 if (adev->ip_blocks[i].status.hang && 4012 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4013 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4014 if (r) 4015 return r; 4016 } 4017 } 4018 4019 return 0; 4020 } 4021 4022 /** 4023 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4024 * 4025 * @adev: amdgpu_device pointer 4026 * 4027 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4028 * reset is necessary to recover. 4029 * Returns true if a full asic reset is required, false if not. 
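 *
 * In this implementation a hang in a GMC, SMC, ACP, DCE or PSP block is
 * treated as not soft-resettable and forces the full reset path.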
4030 */ 4031 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4032 { 4033 int i; 4034 4035 if (amdgpu_asic_need_full_reset(adev)) 4036 return true; 4037 4038 for (i = 0; i < adev->num_ip_blocks; i++) { 4039 if (!adev->ip_blocks[i].status.valid) 4040 continue; 4041 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4042 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4043 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4044 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4045 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4046 if (adev->ip_blocks[i].status.hang) { 4047 dev_info(adev->dev, "Some block need full reset!\n"); 4048 return true; 4049 } 4050 } 4051 } 4052 return false; 4053 } 4054 4055 /** 4056 * amdgpu_device_ip_soft_reset - do a soft reset 4057 * 4058 * @adev: amdgpu_device pointer 4059 * 4060 * The list of all the hardware IPs that make up the asic is walked and the 4061 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4062 * IP specific hardware or software state changes that are necessary to soft 4063 * reset the IP. 4064 * Returns 0 on success, negative error code on failure. 4065 */ 4066 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4067 { 4068 int i, r = 0; 4069 4070 for (i = 0; i < adev->num_ip_blocks; i++) { 4071 if (!adev->ip_blocks[i].status.valid) 4072 continue; 4073 if (adev->ip_blocks[i].status.hang && 4074 adev->ip_blocks[i].version->funcs->soft_reset) { 4075 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4076 if (r) 4077 return r; 4078 } 4079 } 4080 4081 return 0; 4082 } 4083 4084 /** 4085 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4086 * 4087 * @adev: amdgpu_device pointer 4088 * 4089 * The list of all the hardware IPs that make up the asic is walked and the 4090 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4091 * handles any IP specific hardware or software state changes that are 4092 * necessary after the IP has been soft reset. 4093 * Returns 0 on success, negative error code on failure. 4094 */ 4095 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4096 { 4097 int i, r = 0; 4098 4099 for (i = 0; i < adev->num_ip_blocks; i++) { 4100 if (!adev->ip_blocks[i].status.valid) 4101 continue; 4102 if (adev->ip_blocks[i].status.hang && 4103 adev->ip_blocks[i].version->funcs->post_soft_reset) 4104 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4105 if (r) 4106 return r; 4107 } 4108 4109 return 0; 4110 } 4111 4112 /** 4113 * amdgpu_device_recover_vram - Recover some VRAM contents 4114 * 4115 * @adev: amdgpu_device pointer 4116 * 4117 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4118 * restore things like GPUVM page tables after a GPU reset where 4119 * the contents of VRAM might be lost. 4120 * 4121 * Returns: 4122 * 0 on success, negative error code on failure. 
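 *
 * Only shadow BOs that currently live in GTT and whose parent BO sits in
 * VRAM are restored; evicted buffers are skipped, and each wait is bounded
 * by a timeout (longer under SR-IOV runtime).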
4123 */ 4124 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4125 { 4126 struct dma_fence *fence = NULL, *next = NULL; 4127 struct amdgpu_bo *shadow; 4128 long r = 1, tmo; 4129 4130 if (amdgpu_sriov_runtime(adev)) 4131 tmo = msecs_to_jiffies(8000); 4132 else 4133 tmo = msecs_to_jiffies(100); 4134 4135 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4136 mutex_lock(&adev->shadow_list_lock); 4137 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4138 4139 /* No need to recover an evicted BO */ 4140 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4141 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4142 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4143 continue; 4144 4145 r = amdgpu_bo_restore_shadow(shadow, &next); 4146 if (r) 4147 break; 4148 4149 if (fence) { 4150 tmo = dma_fence_wait_timeout(fence, false, tmo); 4151 dma_fence_put(fence); 4152 fence = next; 4153 if (tmo == 0) { 4154 r = -ETIMEDOUT; 4155 break; 4156 } else if (tmo < 0) { 4157 r = tmo; 4158 break; 4159 } 4160 } else { 4161 fence = next; 4162 } 4163 } 4164 mutex_unlock(&adev->shadow_list_lock); 4165 4166 if (fence) 4167 tmo = dma_fence_wait_timeout(fence, false, tmo); 4168 dma_fence_put(fence); 4169 4170 if (r < 0 || tmo <= 0) { 4171 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4172 return -EIO; 4173 } 4174 4175 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4176 return 0; 4177 } 4178 4179 4180 /** 4181 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4182 * 4183 * @adev: amdgpu_device pointer 4184 * @from_hypervisor: request from hypervisor 4185 * 4186 * do VF FLR and reinitialize Asic 4187 * return 0 means succeeded otherwise failed 4188 */ 4189 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4190 bool from_hypervisor) 4191 { 4192 int r; 4193 4194 if (from_hypervisor) 4195 r = amdgpu_virt_request_full_gpu(adev, true); 4196 else 4197 r = amdgpu_virt_reset_gpu(adev); 4198 if (r) 4199 return r; 4200 4201 amdgpu_amdkfd_pre_reset(adev); 4202 4203 /* Resume IP prior to SMC */ 4204 r = amdgpu_device_ip_reinit_early_sriov(adev); 4205 if (r) 4206 goto error; 4207 4208 amdgpu_virt_init_data_exchange(adev); 4209 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4210 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4211 4212 r = amdgpu_device_fw_loading(adev); 4213 if (r) 4214 return r; 4215 4216 /* now we are okay to resume SMC/CP/SDMA */ 4217 r = amdgpu_device_ip_reinit_late_sriov(adev); 4218 if (r) 4219 goto error; 4220 4221 amdgpu_irq_gpu_reset_resume_helper(adev); 4222 r = amdgpu_ib_ring_tests(adev); 4223 amdgpu_amdkfd_post_reset(adev); 4224 4225 error: 4226 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4227 amdgpu_inc_vram_lost(adev); 4228 r = amdgpu_device_recover_vram(adev); 4229 } 4230 amdgpu_virt_release_full_gpu(adev, true); 4231 4232 return r; 4233 } 4234 4235 /** 4236 * amdgpu_device_has_job_running - check if there is any job in mirror list 4237 * 4238 * @adev: amdgpu_device pointer 4239 * 4240 * check if there is any job in mirror list 4241 */ 4242 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4243 { 4244 int i; 4245 struct drm_sched_job *job; 4246 4247 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4248 struct amdgpu_ring *ring = adev->rings[i]; 4249 4250 if (!ring || !ring->sched.thread) 4251 continue; 4252 4253 spin_lock(&ring->sched.job_list_lock); 4254 job = 
list_first_entry_or_null(&ring->sched.pending_list, 4255 struct drm_sched_job, list); 4256 spin_unlock(&ring->sched.job_list_lock); 4257 if (job) 4258 return true; 4259 } 4260 return false; 4261 } 4262 4263 /** 4264 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4265 * 4266 * @adev: amdgpu_device pointer 4267 * 4268 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4269 * a hung GPU. 4270 */ 4271 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4272 { 4273 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4274 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4275 return false; 4276 } 4277 4278 if (amdgpu_gpu_recovery == 0) 4279 goto disabled; 4280 4281 if (amdgpu_sriov_vf(adev)) 4282 return true; 4283 4284 if (amdgpu_gpu_recovery == -1) { 4285 switch (adev->asic_type) { 4286 case CHIP_BONAIRE: 4287 case CHIP_HAWAII: 4288 case CHIP_TOPAZ: 4289 case CHIP_TONGA: 4290 case CHIP_FIJI: 4291 case CHIP_POLARIS10: 4292 case CHIP_POLARIS11: 4293 case CHIP_POLARIS12: 4294 case CHIP_VEGAM: 4295 case CHIP_VEGA20: 4296 case CHIP_VEGA10: 4297 case CHIP_VEGA12: 4298 case CHIP_RAVEN: 4299 case CHIP_ARCTURUS: 4300 case CHIP_RENOIR: 4301 case CHIP_NAVI10: 4302 case CHIP_NAVI14: 4303 case CHIP_NAVI12: 4304 case CHIP_SIENNA_CICHLID: 4305 case CHIP_NAVY_FLOUNDER: 4306 case CHIP_DIMGREY_CAVEFISH: 4307 case CHIP_VANGOGH: 4308 case CHIP_ALDEBARAN: 4309 break; 4310 default: 4311 goto disabled; 4312 } 4313 } 4314 4315 return true; 4316 4317 disabled: 4318 dev_info(adev->dev, "GPU recovery disabled.\n"); 4319 return false; 4320 } 4321 4322 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4323 { 4324 u32 i; 4325 int ret = 0; 4326 4327 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4328 4329 dev_info(adev->dev, "GPU mode1 reset\n"); 4330 4331 /* disable BM */ 4332 pci_clear_master(adev->pdev); 4333 4334 amdgpu_device_cache_pci_state(adev->pdev); 4335 4336 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4337 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4338 ret = amdgpu_dpm_mode1_reset(adev); 4339 } else { 4340 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4341 ret = psp_gpu_reset(adev); 4342 } 4343 4344 if (ret) 4345 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4346 4347 amdgpu_device_load_pci_state(adev->pdev); 4348 4349 /* wait for asic to come out of reset */ 4350 for (i = 0; i < adev->usec_timeout; i++) { 4351 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4352 4353 if (memsize != 0xffffffff) 4354 break; 4355 udelay(1); 4356 } 4357 4358 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4359 return ret; 4360 } 4361 4362 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4363 struct amdgpu_reset_context *reset_context) 4364 { 4365 int i, r = 0; 4366 struct amdgpu_job *job = NULL; 4367 bool need_full_reset = 4368 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4369 4370 if (reset_context->reset_req_dev == adev) 4371 job = reset_context->job; 4372 4373 /* no need to dump if device is not in good state during probe period */ 4374 if (!adev->gmc.xgmi.pending_reset) 4375 amdgpu_debugfs_wait_dump(adev); 4376 4377 if (amdgpu_sriov_vf(adev)) { 4378 /* stop the data exchange thread */ 4379 amdgpu_virt_fini_data_exchange(adev); 4380 } 4381 4382 /* block all schedulers and reset given job's ring */ 4383 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4384 struct amdgpu_ring *ring = adev->rings[i]; 4385 4386 if (!ring || !ring->sched.thread) 4387 continue; 4388 4389 /* after all hw jobs are reset, hw 
fence is meaningless, so force_completion */ 4390 amdgpu_fence_driver_force_completion(ring); 4391 } 4392 4393 if(job) 4394 drm_sched_increase_karma(&job->base); 4395 4396 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4397 /* If reset handler not implemented, continue; otherwise return */ 4398 if (r == -ENOSYS) 4399 r = 0; 4400 else 4401 return r; 4402 4403 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4404 if (!amdgpu_sriov_vf(adev)) { 4405 4406 if (!need_full_reset) 4407 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4408 4409 if (!need_full_reset) { 4410 amdgpu_device_ip_pre_soft_reset(adev); 4411 r = amdgpu_device_ip_soft_reset(adev); 4412 amdgpu_device_ip_post_soft_reset(adev); 4413 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4414 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4415 need_full_reset = true; 4416 } 4417 } 4418 4419 if (need_full_reset) 4420 r = amdgpu_device_ip_suspend(adev); 4421 if (need_full_reset) 4422 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4423 else 4424 clear_bit(AMDGPU_NEED_FULL_RESET, 4425 &reset_context->flags); 4426 } 4427 4428 return r; 4429 } 4430 4431 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4432 struct amdgpu_reset_context *reset_context) 4433 { 4434 struct amdgpu_device *tmp_adev = NULL; 4435 bool need_full_reset, skip_hw_reset, vram_lost = false; 4436 int r = 0; 4437 4438 /* Try reset handler method first */ 4439 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4440 reset_list); 4441 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4442 /* If reset handler not implemented, continue; otherwise return */ 4443 if (r == -ENOSYS) 4444 r = 0; 4445 else 4446 return r; 4447 4448 /* Reset handler not implemented, use the default method */ 4449 need_full_reset = 4450 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4451 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4452 4453 /* 4454 * ASIC reset has to be done on all XGMI hive nodes ASAP 4455 * to allow proper links negotiation in FW (within 1 sec) 4456 */ 4457 if (!skip_hw_reset && need_full_reset) { 4458 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4459 /* For XGMI run all resets in parallel to speed up the process */ 4460 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4461 tmp_adev->gmc.xgmi.pending_reset = false; 4462 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4463 r = -EALREADY; 4464 } else 4465 r = amdgpu_asic_reset(tmp_adev); 4466 4467 if (r) { 4468 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4469 r, adev_to_drm(tmp_adev)->unique); 4470 break; 4471 } 4472 } 4473 4474 /* For XGMI wait for all resets to complete before proceed */ 4475 if (!r) { 4476 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4477 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4478 flush_work(&tmp_adev->xgmi_reset_work); 4479 r = tmp_adev->asic_reset_res; 4480 if (r) 4481 break; 4482 } 4483 } 4484 } 4485 } 4486 4487 if (!r && amdgpu_ras_intr_triggered()) { 4488 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4489 if (tmp_adev->mmhub.ras_funcs && 4490 tmp_adev->mmhub.ras_funcs->reset_ras_error_count) 4491 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); 4492 } 4493 4494 amdgpu_ras_intr_cleared(); 4495 } 4496 4497 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4498 if (need_full_reset) { 4499 /* post card */ 4500 r = 
amdgpu_device_asic_init(tmp_adev); 4501 if (r) { 4502 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4503 } else { 4504 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4505 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4506 if (r) 4507 goto out; 4508 4509 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4510 if (vram_lost) { 4511 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4512 amdgpu_inc_vram_lost(tmp_adev); 4513 } 4514 4515 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4516 if (r) 4517 goto out; 4518 4519 r = amdgpu_device_fw_loading(tmp_adev); 4520 if (r) 4521 return r; 4522 4523 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4524 if (r) 4525 goto out; 4526 4527 if (vram_lost) 4528 amdgpu_device_fill_reset_magic(tmp_adev); 4529 4530 /* 4531 * Add this ASIC as tracked as reset was already 4532 * complete successfully. 4533 */ 4534 amdgpu_register_gpu_instance(tmp_adev); 4535 4536 if (!reset_context->hive && 4537 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4538 amdgpu_xgmi_add_device(tmp_adev); 4539 4540 r = amdgpu_device_ip_late_init(tmp_adev); 4541 if (r) 4542 goto out; 4543 4544 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4545 4546 /* 4547 * The GPU enters bad state once faulty pages 4548 * by ECC has reached the threshold, and ras 4549 * recovery is scheduled next. So add one check 4550 * here to break recovery if it indeed exceeds 4551 * bad page threshold, and remind user to 4552 * retire this GPU or setting one bigger 4553 * bad_page_threshold value to fix this once 4554 * probing driver again. 4555 */ 4556 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4557 /* must succeed. */ 4558 amdgpu_ras_resume(tmp_adev); 4559 } else { 4560 r = -EINVAL; 4561 goto out; 4562 } 4563 4564 /* Update PSP FW topology after reset */ 4565 if (reset_context->hive && 4566 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4567 r = amdgpu_xgmi_update_topology( 4568 reset_context->hive, tmp_adev); 4569 } 4570 } 4571 4572 out: 4573 if (!r) { 4574 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4575 r = amdgpu_ib_ring_tests(tmp_adev); 4576 if (r) { 4577 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4578 need_full_reset = true; 4579 r = -EAGAIN; 4580 goto end; 4581 } 4582 } 4583 4584 if (!r) 4585 r = amdgpu_device_recover_vram(tmp_adev); 4586 else 4587 tmp_adev->asic_reset_res = r; 4588 } 4589 4590 end: 4591 if (need_full_reset) 4592 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4593 else 4594 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4595 return r; 4596 } 4597 4598 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4599 struct amdgpu_hive_info *hive) 4600 { 4601 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4602 return false; 4603 4604 if (hive) { 4605 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4606 } else { 4607 down_write(&adev->reset_sem); 4608 } 4609 4610 switch (amdgpu_asic_reset_method(adev)) { 4611 case AMD_RESET_METHOD_MODE1: 4612 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4613 break; 4614 case AMD_RESET_METHOD_MODE2: 4615 adev->mp1_state = PP_MP1_STATE_RESET; 4616 break; 4617 default: 4618 adev->mp1_state = PP_MP1_STATE_NONE; 4619 break; 4620 } 4621 4622 return true; 4623 } 4624 4625 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4626 { 4627 amdgpu_vf_error_trans_all(adev); 4628 adev->mp1_state = PP_MP1_STATE_NONE; 4629 atomic_set(&adev->in_gpu_reset, 0); 4630 up_write(&adev->reset_sem); 4631 } 4632 4633 /* 4634 * to lockup a list of amdgpu devices in a 
hive safely. If the device is not part of a multi-node hive,
 * this behaves like amdgpu_device_lock_adev.
 *
 * Unlocking does not require a rollback.
 */
static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
			return -ENODEV;
		}
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			if (!amdgpu_device_lock_adev(tmp_adev, hive))
				goto roll_back;
		}
	} else if (!amdgpu_device_lock_adev(adev, hive))
		return -EAGAIN;

	return 0;
roll_back:
	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
		/*
		 * If the lock iteration breaks in the middle of a hive,
		 * there may be a race with another reset, or a hive device
		 * may have locked up independently. We may or may not be in
		 * trouble, so roll back the locks taken so far and emit a
		 * warning.
		 */
		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			amdgpu_device_unlock_adev(tmp_adev);
		}
	}
	return -EAGAIN;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer the audio issue without being properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4S interval is used. The audio controller's
		 * default autosuspend delay is 3S, so 4S is guaranteed
		 * to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			/* TODO: abort the succeeding gpu reset?
*/ 4722 return -ETIMEDOUT; 4723 } 4724 } 4725 4726 pm_runtime_disable(&(p->dev)); 4727 4728 return 0; 4729 } 4730 4731 static void amdgpu_device_recheck_guilty_jobs( 4732 struct amdgpu_device *adev, struct list_head *device_list_handle, 4733 struct amdgpu_reset_context *reset_context) 4734 { 4735 int i, r = 0; 4736 4737 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4738 struct amdgpu_ring *ring = adev->rings[i]; 4739 int ret = 0; 4740 struct drm_sched_job *s_job; 4741 4742 if (!ring || !ring->sched.thread) 4743 continue; 4744 4745 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4746 struct drm_sched_job, list); 4747 if (s_job == NULL) 4748 continue; 4749 4750 /* clear job's guilty and depend the folowing step to decide the real one */ 4751 drm_sched_reset_karma(s_job); 4752 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4753 4754 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4755 if (ret == 0) { /* timeout */ 4756 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n", 4757 ring->sched.name, s_job->id); 4758 4759 /* set guilty */ 4760 drm_sched_increase_karma(s_job); 4761 retry: 4762 /* do hw reset */ 4763 if (amdgpu_sriov_vf(adev)) { 4764 amdgpu_virt_fini_data_exchange(adev); 4765 r = amdgpu_device_reset_sriov(adev, false); 4766 if (r) 4767 adev->asic_reset_res = r; 4768 } else { 4769 clear_bit(AMDGPU_SKIP_HW_RESET, 4770 &reset_context->flags); 4771 r = amdgpu_do_asic_reset(device_list_handle, 4772 reset_context); 4773 if (r && r == -EAGAIN) 4774 goto retry; 4775 } 4776 4777 /* 4778 * add reset counter so that the following 4779 * resubmitted job could flush vmid 4780 */ 4781 atomic_inc(&adev->gpu_reset_counter); 4782 continue; 4783 } 4784 4785 /* got the hw fence, signal finished fence */ 4786 atomic_dec(ring->sched.score); 4787 dma_fence_get(&s_job->s_fence->finished); 4788 dma_fence_signal(&s_job->s_fence->finished); 4789 dma_fence_put(&s_job->s_fence->finished); 4790 4791 /* remove node from list and free the job */ 4792 spin_lock(&ring->sched.job_list_lock); 4793 list_del_init(&s_job->list); 4794 spin_unlock(&ring->sched.job_list_lock); 4795 ring->sched.ops->free_job(s_job); 4796 } 4797 } 4798 4799 /** 4800 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4801 * 4802 * @adev: amdgpu_device pointer 4803 * @job: which job trigger hang 4804 * 4805 * Attempt to reset the GPU if it has hung (all asics). 4806 * Attempt to do soft-reset or full-reset and reinitialize Asic 4807 * Returns 0 for success or an error on failure. 4808 */ 4809 4810 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4811 struct amdgpu_job *job) 4812 { 4813 struct list_head device_list, *device_list_handle = NULL; 4814 bool job_signaled = false; 4815 struct amdgpu_hive_info *hive = NULL; 4816 struct amdgpu_device *tmp_adev = NULL; 4817 int i, r = 0; 4818 bool need_emergency_restart = false; 4819 bool audio_suspended = false; 4820 int tmp_vram_lost_counter; 4821 struct amdgpu_reset_context reset_context; 4822 4823 memset(&reset_context, 0, sizeof(reset_context)); 4824 4825 /* 4826 * Special case: RAS triggered and full reset isn't supported 4827 */ 4828 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4829 4830 /* 4831 * Flush RAM to disk so that after reboot 4832 * the user can read log and see why the system rebooted. 
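 * ksys_sync_helper() below performs that flush before emergency_restart()
 * reboots the machine without a clean shutdown.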
4833 */ 4834 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4835 DRM_WARN("Emergency reboot."); 4836 4837 ksys_sync_helper(); 4838 emergency_restart(); 4839 } 4840 4841 dev_info(adev->dev, "GPU %s begin!\n", 4842 need_emergency_restart ? "jobs stop":"reset"); 4843 4844 /* 4845 * Here we trylock to avoid chain of resets executing from 4846 * either trigger by jobs on different adevs in XGMI hive or jobs on 4847 * different schedulers for same device while this TO handler is running. 4848 * We always reset all schedulers for device and all devices for XGMI 4849 * hive so that should take care of them too. 4850 */ 4851 hive = amdgpu_get_xgmi_hive(adev); 4852 if (hive) { 4853 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4854 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4855 job ? job->base.id : -1, hive->hive_id); 4856 amdgpu_put_xgmi_hive(hive); 4857 if (job) 4858 drm_sched_increase_karma(&job->base); 4859 return 0; 4860 } 4861 mutex_lock(&hive->hive_lock); 4862 } 4863 4864 reset_context.method = AMD_RESET_METHOD_NONE; 4865 reset_context.reset_req_dev = adev; 4866 reset_context.job = job; 4867 reset_context.hive = hive; 4868 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 4869 4870 /* 4871 * lock the device before we try to operate the linked list 4872 * if didn't get the device lock, don't touch the linked list since 4873 * others may iterating it. 4874 */ 4875 r = amdgpu_device_lock_hive_adev(adev, hive); 4876 if (r) { 4877 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4878 job ? job->base.id : -1); 4879 4880 /* even we skipped this reset, still need to set the job to guilty */ 4881 if (job) 4882 drm_sched_increase_karma(&job->base); 4883 goto skip_recovery; 4884 } 4885 4886 /* 4887 * Build list of devices to reset. 4888 * In case we are in XGMI hive mode, resort the device list 4889 * to put adev in the 1st position. 4890 */ 4891 INIT_LIST_HEAD(&device_list); 4892 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4893 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 4894 list_add_tail(&tmp_adev->reset_list, &device_list); 4895 if (!list_is_first(&adev->reset_list, &device_list)) 4896 list_rotate_to_front(&adev->reset_list, &device_list); 4897 device_list_handle = &device_list; 4898 } else { 4899 list_add_tail(&adev->reset_list, &device_list); 4900 device_list_handle = &device_list; 4901 } 4902 4903 /* block all schedulers and reset given job's ring */ 4904 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4905 /* 4906 * Try to put the audio codec into suspend state 4907 * before gpu reset started. 4908 * 4909 * Due to the power domain of the graphics device 4910 * is shared with AZ power domain. Without this, 4911 * we may change the audio hardware from behind 4912 * the audio driver's back. That will trigger 4913 * some audio codec errors. 
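 *
 * So, for the reset methods known to disturb it (BACO and mode1), the
 * call below tries to runtime-suspend the GPU's HDA audio function first.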
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
		/* TODO: should we stop? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
	/* Actual ASIC resets if needed. */
	/* TODO: implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		/*
		 * Sometimes a later bad compute job can block a good gfx job as gfx
		 * and compute ring share internal GC HW mutually. We add an additional
		 * guilty jobs recheck step to find the real guilty job: it synchronously
		 * submits and waits for the first job to be signaled. If that wait times
		 * out, we identify it as the real guilty job.
		 */
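		/*
		 * Note: the recheck below only runs when amdgpu_gpu_recovery=2
		 * was requested and VRAM contents survived the reset;
		 * resubmitting the old jobs would be meaningless otherwise.
		 */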
		if (amdgpu_gpu_recovery == 2 &&
			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
			amdgpu_device_recheck_guilty_jobs(
				tmp_adev, device_list_handle, &reset_context);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point in resubmitting jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it was not initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r && r != -EAGAIN)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
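 *
 * The result is stored as CAIL_* bitmasks in adev->pm.pcie_gen_mask and
 * adev->pm.pcie_mlw_mask; the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap
 * module parameters, when set, override the probed values.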
5076 */ 5077 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5078 { 5079 struct pci_dev *pdev; 5080 enum pci_bus_speed speed_cap, platform_speed_cap; 5081 enum pcie_link_width platform_link_width; 5082 5083 if (amdgpu_pcie_gen_cap) 5084 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5085 5086 if (amdgpu_pcie_lane_cap) 5087 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5088 5089 /* covers APUs as well */ 5090 if (pci_is_root_bus(adev->pdev->bus)) { 5091 if (adev->pm.pcie_gen_mask == 0) 5092 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5093 if (adev->pm.pcie_mlw_mask == 0) 5094 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5095 return; 5096 } 5097 5098 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5099 return; 5100 5101 pcie_bandwidth_available(adev->pdev, NULL, 5102 &platform_speed_cap, &platform_link_width); 5103 5104 if (adev->pm.pcie_gen_mask == 0) { 5105 /* asic caps */ 5106 pdev = adev->pdev; 5107 speed_cap = pcie_get_speed_cap(pdev); 5108 if (speed_cap == PCI_SPEED_UNKNOWN) { 5109 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5110 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5111 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5112 } else { 5113 if (speed_cap == PCIE_SPEED_32_0GT) 5114 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5115 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5116 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5117 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5118 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5119 else if (speed_cap == PCIE_SPEED_16_0GT) 5120 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5121 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5122 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5123 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5124 else if (speed_cap == PCIE_SPEED_8_0GT) 5125 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5126 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5127 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5128 else if (speed_cap == PCIE_SPEED_5_0GT) 5129 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5130 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5131 else 5132 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5133 } 5134 /* platform caps */ 5135 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5136 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5137 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5138 } else { 5139 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5140 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5141 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5142 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5143 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5144 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5145 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5146 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5147 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5148 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5149 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5150 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5151 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5152 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5153 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5154 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5155 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5156 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5157 else 5158 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5159 5160 } 5161 } 5162 if (adev->pm.pcie_mlw_mask == 0) { 5163 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5164 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5165 } else { 5166 switch (platform_link_width) { 5167 case PCIE_LNK_X32: 5168 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5169 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5170 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5171 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5172 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5173 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5174 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5175 break; 5176 case PCIE_LNK_X16: 5177 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5178 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5179 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5180 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5181 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5182 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5183 break; 5184 case PCIE_LNK_X12: 5185 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5186 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5187 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5188 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5189 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5190 break; 5191 case PCIE_LNK_X8: 5192 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5193 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5194 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5195 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5196 break; 5197 case PCIE_LNK_X4: 5198 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5199 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5200 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5201 break; 5202 case PCIE_LNK_X2: 5203 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5204 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5205 break; 5206 case PCIE_LNK_X1: 5207 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5208 break; 5209 default: 5210 break; 5211 } 5212 } 5213 } 5214 } 5215 5216 int amdgpu_device_baco_enter(struct drm_device *dev) 5217 { 5218 struct amdgpu_device *adev = drm_to_adev(dev); 5219 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5220 5221 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5222 return -ENOTSUPP; 5223 5224 if (ras && adev->ras_enabled && 5225 adev->nbio.funcs->enable_doorbell_interrupt) 5226 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5227 5228 return amdgpu_dpm_baco_enter(adev); 5229 } 5230 5231 int amdgpu_device_baco_exit(struct drm_device *dev) 5232 { 5233 struct amdgpu_device *adev = drm_to_adev(dev); 5234 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5235 int ret = 0; 5236 5237 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5238 return -ENOTSUPP; 5239 5240 ret = amdgpu_dpm_baco_exit(adev); 5241 if (ret) 5242 return ret; 5243 5244 if (ras && adev->ras_enabled && 5245 adev->nbio.funcs->enable_doorbell_interrupt) 5246 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5247 5248 return 0; 5249 } 5250 5251 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5252 { 5253 int i; 5254 5255 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5256 struct amdgpu_ring *ring = adev->rings[i]; 5257 5258 if (!ring || !ring->sched.thread) 5259 continue; 5260 5261 cancel_delayed_work_sync(&ring->sched.work_tdr); 5262 } 5263 } 5264 5265 /** 5266 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5267 * @pdev: PCI device struct 5268 * @state: PCI channel state 5269 * 5270 * Description: Called when a PCI error is detected. 5271 * 5272 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
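 *
 * A sketch of how these PCI error callbacks are typically wired up
 * (illustrative only; the actual pci_driver registration lives elsewhere
 * in the driver):
 *
 *   static const struct pci_error_handlers example_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };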
5273 */ 5274 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5275 { 5276 struct drm_device *dev = pci_get_drvdata(pdev); 5277 struct amdgpu_device *adev = drm_to_adev(dev); 5278 int i; 5279 5280 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5281 5282 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5283 DRM_WARN("No support for XGMI hive yet..."); 5284 return PCI_ERS_RESULT_DISCONNECT; 5285 } 5286 5287 switch (state) { 5288 case pci_channel_io_normal: 5289 return PCI_ERS_RESULT_CAN_RECOVER; 5290 /* Fatal error, prepare for slot reset */ 5291 case pci_channel_io_frozen: 5292 /* 5293 * Cancel and wait for all TDRs in progress if failing to 5294 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5295 * 5296 * Locking adev->reset_sem will prevent any external access 5297 * to GPU during PCI error recovery 5298 */ 5299 while (!amdgpu_device_lock_adev(adev, NULL)) 5300 amdgpu_cancel_all_tdr(adev); 5301 5302 /* 5303 * Block any work scheduling as we do for regular GPU reset 5304 * for the duration of the recovery 5305 */ 5306 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5307 struct amdgpu_ring *ring = adev->rings[i]; 5308 5309 if (!ring || !ring->sched.thread) 5310 continue; 5311 5312 drm_sched_stop(&ring->sched, NULL); 5313 } 5314 atomic_inc(&adev->gpu_reset_counter); 5315 return PCI_ERS_RESULT_NEED_RESET; 5316 case pci_channel_io_perm_failure: 5317 /* Permanent error, prepare for device removal */ 5318 return PCI_ERS_RESULT_DISCONNECT; 5319 } 5320 5321 return PCI_ERS_RESULT_NEED_RESET; 5322 } 5323 5324 /** 5325 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5326 * @pdev: pointer to PCI device 5327 */ 5328 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5329 { 5330 5331 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5332 5333 /* TODO - dump whatever for debugging purposes */ 5334 5335 /* This called only if amdgpu_pci_error_detected returns 5336 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5337 * works, no need to reset slot. 5338 */ 5339 5340 return PCI_ERS_RESULT_RECOVERED; 5341 } 5342 5343 /** 5344 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5345 * @pdev: PCI device struct 5346 * 5347 * Description: This routine is called by the pci error recovery 5348 * code after the PCI slot has been reset, just before we 5349 * should resume normal operations. 
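 *
 * Return: PCI_ERS_RESULT_RECOVERED if the ASIC came back and the reset
 * succeeded, PCI_ERS_RESULT_DISCONNECT otherwise.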
5350 */ 5351 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5352 { 5353 struct drm_device *dev = pci_get_drvdata(pdev); 5354 struct amdgpu_device *adev = drm_to_adev(dev); 5355 int r, i; 5356 struct amdgpu_reset_context reset_context; 5357 u32 memsize; 5358 struct list_head device_list; 5359 5360 DRM_INFO("PCI error: slot reset callback!!\n"); 5361 5362 memset(&reset_context, 0, sizeof(reset_context)); 5363 5364 INIT_LIST_HEAD(&device_list); 5365 list_add_tail(&adev->reset_list, &device_list); 5366 5367 /* wait for asic to come out of reset */ 5368 msleep(500); 5369 5370 /* Restore PCI confspace */ 5371 amdgpu_device_load_pci_state(pdev); 5372 5373 /* confirm ASIC came out of reset */ 5374 for (i = 0; i < adev->usec_timeout; i++) { 5375 memsize = amdgpu_asic_get_config_memsize(adev); 5376 5377 if (memsize != 0xffffffff) 5378 break; 5379 udelay(1); 5380 } 5381 if (memsize == 0xffffffff) { 5382 r = -ETIME; 5383 goto out; 5384 } 5385 5386 reset_context.method = AMD_RESET_METHOD_NONE; 5387 reset_context.reset_req_dev = adev; 5388 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5389 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5390 5391 adev->no_hw_access = true; 5392 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5393 adev->no_hw_access = false; 5394 if (r) 5395 goto out; 5396 5397 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5398 5399 out: 5400 if (!r) { 5401 if (amdgpu_device_cache_pci_state(adev->pdev)) 5402 pci_restore_state(adev->pdev); 5403 5404 DRM_INFO("PCIe error recovery succeeded\n"); 5405 } else { 5406 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5407 amdgpu_device_unlock_adev(adev); 5408 } 5409 5410 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5411 } 5412 5413 /** 5414 * amdgpu_pci_resume() - resume normal ops after PCI reset 5415 * @pdev: pointer to PCI device 5416 * 5417 * Called when the error recovery driver tells us that its 5418 * OK to resume normal operation. 
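 *
 * This restarts the DRM schedulers that were stopped in
 * amdgpu_pci_error_detected() and drops the reset lock taken there.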
5419 */ 5420 void amdgpu_pci_resume(struct pci_dev *pdev) 5421 { 5422 struct drm_device *dev = pci_get_drvdata(pdev); 5423 struct amdgpu_device *adev = drm_to_adev(dev); 5424 int i; 5425 5426 5427 DRM_INFO("PCI error: resume callback!!\n"); 5428 5429 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5430 struct amdgpu_ring *ring = adev->rings[i]; 5431 5432 if (!ring || !ring->sched.thread) 5433 continue; 5434 5435 5436 drm_sched_resubmit_jobs(&ring->sched); 5437 drm_sched_start(&ring->sched, true); 5438 } 5439 5440 amdgpu_device_unlock_adev(adev); 5441 } 5442 5443 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5444 { 5445 struct drm_device *dev = pci_get_drvdata(pdev); 5446 struct amdgpu_device *adev = drm_to_adev(dev); 5447 int r; 5448 5449 r = pci_save_state(pdev); 5450 if (!r) { 5451 kfree(adev->pci_state); 5452 5453 adev->pci_state = pci_store_saved_state(pdev); 5454 5455 if (!adev->pci_state) { 5456 DRM_ERROR("Failed to store PCI saved state"); 5457 return false; 5458 } 5459 } else { 5460 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5461 return false; 5462 } 5463 5464 return true; 5465 } 5466 5467 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5468 { 5469 struct drm_device *dev = pci_get_drvdata(pdev); 5470 struct amdgpu_device *adev = drm_to_adev(dev); 5471 int r; 5472 5473 if (!adev->pci_state) 5474 return false; 5475 5476 r = pci_load_saved_state(pdev, adev->pci_state); 5477 5478 if (!r) { 5479 pci_restore_state(pdev); 5480 } else { 5481 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5482 return false; 5483 } 5484 5485 return true; 5486 } 5487 5488 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 5489 struct amdgpu_ring *ring) 5490 { 5491 #ifdef CONFIG_X86_64 5492 if (adev->flags & AMD_IS_APU) 5493 return; 5494 #endif 5495 if (adev->gmc.xgmi.connected_to_cpu) 5496 return; 5497 5498 if (ring && ring->funcs->emit_hdp_flush) 5499 amdgpu_ring_emit_hdp_flush(ring); 5500 else 5501 amdgpu_asic_flush_hdp(adev, ring); 5502 } 5503 5504 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 5505 struct amdgpu_ring *ring) 5506 { 5507 #ifdef CONFIG_X86_64 5508 if (adev->flags & AMD_IS_APU) 5509 return; 5510 #endif 5511 if (adev->gmc.xgmi.connected_to_cpu) 5512 return; 5513 5514 amdgpu_asic_invalidate_hdp(adev, ring); 5515 } 5516