1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 #include "amdgpu_fru_eeprom.h" 68 #include "amdgpu_reset.h" 69 70 #include <linux/suspend.h> 71 #include <drm/task_barrier.h> 72 #include <linux/pm_runtime.h> 73 74 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 77 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 78 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 79 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 80 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); 81 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 82 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 83 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 84 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); 85 86 #define AMDGPU_RESUME_MS 2000 87 88 const char *amdgpu_asic_name[] = { 89 "TAHITI", 90 "PITCAIRN", 91 "VERDE", 92 "OLAND", 93 "HAINAN", 94 "BONAIRE", 95 "KAVERI", 96 "KABINI", 97 "HAWAII", 98 "MULLINS", 99 "TOPAZ", 100 "TONGA", 101 "FIJI", 102 "CARRIZO", 103 "STONEY", 104 "POLARIS10", 105 "POLARIS11", 106 "POLARIS12", 107 "VEGAM", 108 "VEGA10", 109 "VEGA12", 110 "VEGA20", 111 "RAVEN", 112 "ARCTURUS", 113 "RENOIR", 114 "ALDEBARAN", 115 "NAVI10", 116 "NAVI14", 117 "NAVI12", 118 "SIENNA_CICHLID", 119 "NAVY_FLOUNDER", 120 "VANGOGH", 
121 "DIMGREY_CAVEFISH", 122 "LAST", 123 }; 124 125 /** 126 * DOC: pcie_replay_count 127 * 128 * The amdgpu driver provides a sysfs API for reporting the total number 129 * of PCIe replays (NAKs) 130 * The file pcie_replay_count is used for this and returns the total 131 * number of replays as a sum of the NAKs generated and NAKs received 132 */ 133 134 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 135 struct device_attribute *attr, char *buf) 136 { 137 struct drm_device *ddev = dev_get_drvdata(dev); 138 struct amdgpu_device *adev = drm_to_adev(ddev); 139 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 140 141 return sysfs_emit(buf, "%llu\n", cnt); 142 } 143 144 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 145 amdgpu_device_get_pcie_replay_count, NULL); 146 147 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 148 149 /** 150 * DOC: product_name 151 * 152 * The amdgpu driver provides a sysfs API for reporting the product name 153 * for the device 154 * The file serial_number is used for this and returns the product name 155 * as returned from the FRU. 156 * NOTE: This is only available for certain server cards 157 */ 158 159 static ssize_t amdgpu_device_get_product_name(struct device *dev, 160 struct device_attribute *attr, char *buf) 161 { 162 struct drm_device *ddev = dev_get_drvdata(dev); 163 struct amdgpu_device *adev = drm_to_adev(ddev); 164 165 return sysfs_emit(buf, "%s\n", adev->product_name); 166 } 167 168 static DEVICE_ATTR(product_name, S_IRUGO, 169 amdgpu_device_get_product_name, NULL); 170 171 /** 172 * DOC: product_number 173 * 174 * The amdgpu driver provides a sysfs API for reporting the part number 175 * for the device 176 * The file serial_number is used for this and returns the part number 177 * as returned from the FRU. 178 * NOTE: This is only available for certain server cards 179 */ 180 181 static ssize_t amdgpu_device_get_product_number(struct device *dev, 182 struct device_attribute *attr, char *buf) 183 { 184 struct drm_device *ddev = dev_get_drvdata(dev); 185 struct amdgpu_device *adev = drm_to_adev(ddev); 186 187 return sysfs_emit(buf, "%s\n", adev->product_number); 188 } 189 190 static DEVICE_ATTR(product_number, S_IRUGO, 191 amdgpu_device_get_product_number, NULL); 192 193 /** 194 * DOC: serial_number 195 * 196 * The amdgpu driver provides a sysfs API for reporting the serial number 197 * for the device 198 * The file serial_number is used for this and returns the serial number 199 * as returned from the FRU. 200 * NOTE: This is only available for certain server cards 201 */ 202 203 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 204 struct device_attribute *attr, char *buf) 205 { 206 struct drm_device *ddev = dev_get_drvdata(dev); 207 struct amdgpu_device *adev = drm_to_adev(ddev); 208 209 return sysfs_emit(buf, "%s\n", adev->serial); 210 } 211 212 static DEVICE_ATTR(serial_number, S_IRUGO, 213 amdgpu_device_get_serial_number, NULL); 214 215 /** 216 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 217 * 218 * @dev: drm_device pointer 219 * 220 * Returns true if the device is a dGPU with ATPX power control, 221 * otherwise return false. 
222 */ 223 bool amdgpu_device_supports_px(struct drm_device *dev) 224 { 225 struct amdgpu_device *adev = drm_to_adev(dev); 226 227 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 228 return true; 229 return false; 230 } 231 232 /** 233 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 234 * 235 * @dev: drm_device pointer 236 * 237 * Returns true if the device is a dGPU with ACPI power control, 238 * otherwise return false. 239 */ 240 bool amdgpu_device_supports_boco(struct drm_device *dev) 241 { 242 struct amdgpu_device *adev = drm_to_adev(dev); 243 244 if (adev->has_pr3 || 245 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 246 return true; 247 return false; 248 } 249 250 /** 251 * amdgpu_device_supports_baco - Does the device support BACO 252 * 253 * @dev: drm_device pointer 254 * 255 * Returns true if the device supporte BACO, 256 * otherwise return false. 257 */ 258 bool amdgpu_device_supports_baco(struct drm_device *dev) 259 { 260 struct amdgpu_device *adev = drm_to_adev(dev); 261 262 return amdgpu_asic_supports_baco(adev); 263 } 264 265 /* 266 * VRAM access helper functions 267 */ 268 269 /** 270 * amdgpu_device_vram_access - read/write a buffer in vram 271 * 272 * @adev: amdgpu_device pointer 273 * @pos: offset of the buffer in vram 274 * @buf: virtual address of the buffer in system memory 275 * @size: read/write size, sizeof(@buf) must > @size 276 * @write: true - write to vram, otherwise - read from vram 277 */ 278 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 279 uint32_t *buf, size_t size, bool write) 280 { 281 unsigned long flags; 282 uint32_t hi = ~0; 283 uint64_t last; 284 285 286 #ifdef CONFIG_64BIT 287 last = min(pos + size, adev->gmc.visible_vram_size); 288 if (last > pos) { 289 void __iomem *addr = adev->mman.aper_base_kaddr + pos; 290 size_t count = last - pos; 291 292 if (write) { 293 memcpy_toio(addr, buf, count); 294 mb(); 295 amdgpu_asic_flush_hdp(adev, NULL); 296 } else { 297 amdgpu_asic_invalidate_hdp(adev, NULL); 298 mb(); 299 memcpy_fromio(buf, addr, count); 300 } 301 302 if (count == size) 303 return; 304 305 pos += count; 306 buf += count / 4; 307 size -= count; 308 } 309 #endif 310 311 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 312 for (last = pos + size; pos < last; pos += 4) { 313 uint32_t tmp = pos >> 31; 314 315 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 316 if (tmp != hi) { 317 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 318 hi = tmp; 319 } 320 if (write) 321 WREG32_NO_KIQ(mmMM_DATA, *buf++); 322 else 323 *buf++ = RREG32_NO_KIQ(mmMM_DATA); 324 } 325 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 326 } 327 328 /* 329 * register access helper functions. 330 */ 331 332 /* Check if hw access should be skipped because of hotplug or device error */ 333 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 334 { 335 if (adev->in_pci_err_recovery) 336 return true; 337 338 #ifdef CONFIG_LOCKDEP 339 /* 340 * This is a bit complicated to understand, so worth a comment. What we assert 341 * here is that the GPU reset is not running on another thread in parallel. 342 * 343 * For this we trylock the read side of the reset semaphore, if that succeeds 344 * we know that the reset is not running in paralell. 345 * 346 * If the trylock fails we assert that we are either already holding the read 347 * side of the lock or are the reset thread itself and hold the write side of 348 * the lock. 
349 */ 350 if (in_task()) { 351 if (down_read_trylock(&adev->reset_sem)) 352 up_read(&adev->reset_sem); 353 else 354 lockdep_assert_held(&adev->reset_sem); 355 } 356 #endif 357 return false; 358 } 359 360 /** 361 * amdgpu_device_rreg - read a memory mapped IO or indirect register 362 * 363 * @adev: amdgpu_device pointer 364 * @reg: dword aligned register offset 365 * @acc_flags: access flags which require special behavior 366 * 367 * Returns the 32 bit value from the offset specified. 368 */ 369 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 370 uint32_t reg, uint32_t acc_flags) 371 { 372 uint32_t ret; 373 374 if (amdgpu_device_skip_hw_access(adev)) 375 return 0; 376 377 if ((reg * 4) < adev->rmmio_size) { 378 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 379 amdgpu_sriov_runtime(adev) && 380 down_read_trylock(&adev->reset_sem)) { 381 ret = amdgpu_kiq_rreg(adev, reg); 382 up_read(&adev->reset_sem); 383 } else { 384 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 385 } 386 } else { 387 ret = adev->pcie_rreg(adev, reg * 4); 388 } 389 390 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 391 392 return ret; 393 } 394 395 /* 396 * MMIO register read with bytes helper functions 397 * @offset:bytes offset from MMIO start 398 * 399 */ 400 401 /** 402 * amdgpu_mm_rreg8 - read a memory mapped IO register 403 * 404 * @adev: amdgpu_device pointer 405 * @offset: byte aligned register offset 406 * 407 * Returns the 8 bit value from the offset specified. 408 */ 409 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 410 { 411 if (amdgpu_device_skip_hw_access(adev)) 412 return 0; 413 414 if (offset < adev->rmmio_size) 415 return (readb(adev->rmmio + offset)); 416 BUG(); 417 } 418 419 /* 420 * MMIO register write with bytes helper functions 421 * @offset:bytes offset from MMIO start 422 * @value: the value want to be written to the register 423 * 424 */ 425 /** 426 * amdgpu_mm_wreg8 - read a memory mapped IO register 427 * 428 * @adev: amdgpu_device pointer 429 * @offset: byte aligned register offset 430 * @value: 8 bit value to write 431 * 432 * Writes the value specified to the offset specified. 433 */ 434 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 435 { 436 if (amdgpu_device_skip_hw_access(adev)) 437 return; 438 439 if (offset < adev->rmmio_size) 440 writeb(value, adev->rmmio + offset); 441 else 442 BUG(); 443 } 444 445 /** 446 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 447 * 448 * @adev: amdgpu_device pointer 449 * @reg: dword aligned register offset 450 * @v: 32 bit value to write to the register 451 * @acc_flags: access flags which require special behavior 452 * 453 * Writes the value specified to the offset specified. 
454 */ 455 void amdgpu_device_wreg(struct amdgpu_device *adev, 456 uint32_t reg, uint32_t v, 457 uint32_t acc_flags) 458 { 459 if (amdgpu_device_skip_hw_access(adev)) 460 return; 461 462 if ((reg * 4) < adev->rmmio_size) { 463 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 464 amdgpu_sriov_runtime(adev) && 465 down_read_trylock(&adev->reset_sem)) { 466 amdgpu_kiq_wreg(adev, reg, v); 467 up_read(&adev->reset_sem); 468 } else { 469 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 470 } 471 } else { 472 adev->pcie_wreg(adev, reg * 4, v); 473 } 474 475 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 476 } 477 478 /* 479 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range 480 * 481 * this function is invoked only the debugfs register access 482 * */ 483 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 484 uint32_t reg, uint32_t v) 485 { 486 if (amdgpu_device_skip_hw_access(adev)) 487 return; 488 489 if (amdgpu_sriov_fullaccess(adev) && 490 adev->gfx.rlc.funcs && 491 adev->gfx.rlc.funcs->is_rlcg_access_range) { 492 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 493 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); 494 } else { 495 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 496 } 497 } 498 499 /** 500 * amdgpu_mm_rdoorbell - read a doorbell dword 501 * 502 * @adev: amdgpu_device pointer 503 * @index: doorbell index 504 * 505 * Returns the value in the doorbell aperture at the 506 * requested doorbell index (CIK). 507 */ 508 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 509 { 510 if (amdgpu_device_skip_hw_access(adev)) 511 return 0; 512 513 if (index < adev->doorbell.num_doorbells) { 514 return readl(adev->doorbell.ptr + index); 515 } else { 516 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 517 return 0; 518 } 519 } 520 521 /** 522 * amdgpu_mm_wdoorbell - write a doorbell dword 523 * 524 * @adev: amdgpu_device pointer 525 * @index: doorbell index 526 * @v: value to write 527 * 528 * Writes @v to the doorbell aperture at the 529 * requested doorbell index (CIK). 530 */ 531 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 532 { 533 if (amdgpu_device_skip_hw_access(adev)) 534 return; 535 536 if (index < adev->doorbell.num_doorbells) { 537 writel(v, adev->doorbell.ptr + index); 538 } else { 539 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 540 } 541 } 542 543 /** 544 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 545 * 546 * @adev: amdgpu_device pointer 547 * @index: doorbell index 548 * 549 * Returns the value in the doorbell aperture at the 550 * requested doorbell index (VEGA10+). 551 */ 552 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 553 { 554 if (amdgpu_device_skip_hw_access(adev)) 555 return 0; 556 557 if (index < adev->doorbell.num_doorbells) { 558 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 559 } else { 560 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 561 return 0; 562 } 563 } 564 565 /** 566 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 567 * 568 * @adev: amdgpu_device pointer 569 * @index: doorbell index 570 * @v: value to write 571 * 572 * Writes @v to the doorbell aperture at the 573 * requested doorbell index (VEGA10+). 
574 */ 575 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 576 { 577 if (amdgpu_device_skip_hw_access(adev)) 578 return; 579 580 if (index < adev->doorbell.num_doorbells) { 581 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 582 } else { 583 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 584 } 585 } 586 587 /** 588 * amdgpu_device_indirect_rreg - read an indirect register 589 * 590 * @adev: amdgpu_device pointer 591 * @pcie_index: mmio register offset 592 * @pcie_data: mmio register offset 593 * @reg_addr: indirect register address to read from 594 * 595 * Returns the value of indirect register @reg_addr 596 */ 597 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 598 u32 pcie_index, u32 pcie_data, 599 u32 reg_addr) 600 { 601 unsigned long flags; 602 u32 r; 603 void __iomem *pcie_index_offset; 604 void __iomem *pcie_data_offset; 605 606 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 607 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 608 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 609 610 writel(reg_addr, pcie_index_offset); 611 readl(pcie_index_offset); 612 r = readl(pcie_data_offset); 613 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 614 615 return r; 616 } 617 618 /** 619 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 620 * 621 * @adev: amdgpu_device pointer 622 * @pcie_index: mmio register offset 623 * @pcie_data: mmio register offset 624 * @reg_addr: indirect register address to read from 625 * 626 * Returns the value of indirect register @reg_addr 627 */ 628 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 629 u32 pcie_index, u32 pcie_data, 630 u32 reg_addr) 631 { 632 unsigned long flags; 633 u64 r; 634 void __iomem *pcie_index_offset; 635 void __iomem *pcie_data_offset; 636 637 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 638 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 639 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 640 641 /* read low 32 bits */ 642 writel(reg_addr, pcie_index_offset); 643 readl(pcie_index_offset); 644 r = readl(pcie_data_offset); 645 /* read high 32 bits */ 646 writel(reg_addr + 4, pcie_index_offset); 647 readl(pcie_index_offset); 648 r |= ((u64)readl(pcie_data_offset) << 32); 649 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 650 651 return r; 652 } 653 654 /** 655 * amdgpu_device_indirect_wreg - write an indirect register address 656 * 657 * @adev: amdgpu_device pointer 658 * @pcie_index: mmio register offset 659 * @pcie_data: mmio register offset 660 * @reg_addr: indirect register offset 661 * @reg_data: indirect register data 662 * 663 */ 664 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 665 u32 pcie_index, u32 pcie_data, 666 u32 reg_addr, u32 reg_data) 667 { 668 unsigned long flags; 669 void __iomem *pcie_index_offset; 670 void __iomem *pcie_data_offset; 671 672 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 673 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 674 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 675 676 writel(reg_addr, pcie_index_offset); 677 readl(pcie_index_offset); 678 writel(reg_data, pcie_data_offset); 679 readl(pcie_data_offset); 680 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 681 } 682 683 /** 684 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 685 * 686 * @adev: amdgpu_device pointer 687 * @pcie_index: mmio register offset 688 * @pcie_data: mmio register 
offset 689 * @reg_addr: indirect register offset 690 * @reg_data: indirect register data 691 * 692 */ 693 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 694 u32 pcie_index, u32 pcie_data, 695 u32 reg_addr, u64 reg_data) 696 { 697 unsigned long flags; 698 void __iomem *pcie_index_offset; 699 void __iomem *pcie_data_offset; 700 701 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 702 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 703 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 704 705 /* write low 32 bits */ 706 writel(reg_addr, pcie_index_offset); 707 readl(pcie_index_offset); 708 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 709 readl(pcie_data_offset); 710 /* write high 32 bits */ 711 writel(reg_addr + 4, pcie_index_offset); 712 readl(pcie_index_offset); 713 writel((u32)(reg_data >> 32), pcie_data_offset); 714 readl(pcie_data_offset); 715 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 716 } 717 718 /** 719 * amdgpu_invalid_rreg - dummy reg read function 720 * 721 * @adev: amdgpu_device pointer 722 * @reg: offset of register 723 * 724 * Dummy register read function. Used for register blocks 725 * that certain asics don't have (all asics). 726 * Returns the value in the register. 727 */ 728 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 729 { 730 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 731 BUG(); 732 return 0; 733 } 734 735 /** 736 * amdgpu_invalid_wreg - dummy reg write function 737 * 738 * @adev: amdgpu_device pointer 739 * @reg: offset of register 740 * @v: value to write to the register 741 * 742 * Dummy register read function. Used for register blocks 743 * that certain asics don't have (all asics). 744 */ 745 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 746 { 747 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 748 reg, v); 749 BUG(); 750 } 751 752 /** 753 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 754 * 755 * @adev: amdgpu_device pointer 756 * @reg: offset of register 757 * 758 * Dummy register read function. Used for register blocks 759 * that certain asics don't have (all asics). 760 * Returns the value in the register. 761 */ 762 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 763 { 764 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 765 BUG(); 766 return 0; 767 } 768 769 /** 770 * amdgpu_invalid_wreg64 - dummy reg write function 771 * 772 * @adev: amdgpu_device pointer 773 * @reg: offset of register 774 * @v: value to write to the register 775 * 776 * Dummy register read function. Used for register blocks 777 * that certain asics don't have (all asics). 778 */ 779 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 780 { 781 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 782 reg, v); 783 BUG(); 784 } 785 786 /** 787 * amdgpu_block_invalid_rreg - dummy reg read function 788 * 789 * @adev: amdgpu_device pointer 790 * @block: offset of instance 791 * @reg: offset of register 792 * 793 * Dummy register read function. Used for register blocks 794 * that certain asics don't have (all asics). 795 * Returns the value in the register. 
796 */ 797 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 798 uint32_t block, uint32_t reg) 799 { 800 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 801 reg, block); 802 BUG(); 803 return 0; 804 } 805 806 /** 807 * amdgpu_block_invalid_wreg - dummy reg write function 808 * 809 * @adev: amdgpu_device pointer 810 * @block: offset of instance 811 * @reg: offset of register 812 * @v: value to write to the register 813 * 814 * Dummy register read function. Used for register blocks 815 * that certain asics don't have (all asics). 816 */ 817 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 818 uint32_t block, 819 uint32_t reg, uint32_t v) 820 { 821 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 822 reg, block, v); 823 BUG(); 824 } 825 826 /** 827 * amdgpu_device_asic_init - Wrapper for atom asic_init 828 * 829 * @adev: amdgpu_device pointer 830 * 831 * Does any asic specific work and then calls atom asic init. 832 */ 833 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 834 { 835 amdgpu_asic_pre_asic_init(adev); 836 837 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 838 } 839 840 /** 841 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 842 * 843 * @adev: amdgpu_device pointer 844 * 845 * Allocates a scratch page of VRAM for use by various things in the 846 * driver. 847 */ 848 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 849 { 850 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 851 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 852 &adev->vram_scratch.robj, 853 &adev->vram_scratch.gpu_addr, 854 (void **)&adev->vram_scratch.ptr); 855 } 856 857 /** 858 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 859 * 860 * @adev: amdgpu_device pointer 861 * 862 * Frees the VRAM scratch page. 863 */ 864 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 865 { 866 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 867 } 868 869 /** 870 * amdgpu_device_program_register_sequence - program an array of registers. 871 * 872 * @adev: amdgpu_device pointer 873 * @registers: pointer to the register array 874 * @array_size: size of the register array 875 * 876 * Programs an array or registers with and and or masks. 877 * This is a helper for setting golden registers. 878 */ 879 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 880 const u32 *registers, 881 const u32 array_size) 882 { 883 u32 tmp, reg, and_mask, or_mask; 884 int i; 885 886 if (array_size % 3) 887 return; 888 889 for (i = 0; i < array_size; i +=3) { 890 reg = registers[i + 0]; 891 and_mask = registers[i + 1]; 892 or_mask = registers[i + 2]; 893 894 if (and_mask == 0xffffffff) { 895 tmp = or_mask; 896 } else { 897 tmp = RREG32(reg); 898 tmp &= ~and_mask; 899 if (adev->family >= AMDGPU_FAMILY_AI) 900 tmp |= (or_mask & and_mask); 901 else 902 tmp |= or_mask; 903 } 904 WREG32(reg, tmp); 905 } 906 } 907 908 /** 909 * amdgpu_device_pci_config_reset - reset the GPU 910 * 911 * @adev: amdgpu_device pointer 912 * 913 * Resets the GPU using the pci config reset sequence. 914 * Only applicable to asics prior to vega10. 
915 */ 916 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 917 { 918 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 919 } 920 921 /** 922 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 923 * 924 * @adev: amdgpu_device pointer 925 * 926 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 927 */ 928 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 929 { 930 return pci_reset_function(adev->pdev); 931 } 932 933 /* 934 * GPU doorbell aperture helpers function. 935 */ 936 /** 937 * amdgpu_device_doorbell_init - Init doorbell driver information. 938 * 939 * @adev: amdgpu_device pointer 940 * 941 * Init doorbell driver information (CIK) 942 * Returns 0 on success, error on failure. 943 */ 944 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 945 { 946 947 /* No doorbell on SI hardware generation */ 948 if (adev->asic_type < CHIP_BONAIRE) { 949 adev->doorbell.base = 0; 950 adev->doorbell.size = 0; 951 adev->doorbell.num_doorbells = 0; 952 adev->doorbell.ptr = NULL; 953 return 0; 954 } 955 956 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 957 return -EINVAL; 958 959 amdgpu_asic_init_doorbell_index(adev); 960 961 /* doorbell bar mapping */ 962 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 963 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 964 965 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 966 adev->doorbell_index.max_assignment+1); 967 if (adev->doorbell.num_doorbells == 0) 968 return -EINVAL; 969 970 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 971 * paging queue doorbell use the second page. The 972 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 973 * doorbells are in the first page. So with paging queue enabled, 974 * the max num_doorbells should + 1 page (0x400 in dword) 975 */ 976 if (adev->asic_type >= CHIP_VEGA10) 977 adev->doorbell.num_doorbells += 0x400; 978 979 adev->doorbell.ptr = ioremap(adev->doorbell.base, 980 adev->doorbell.num_doorbells * 981 sizeof(u32)); 982 if (adev->doorbell.ptr == NULL) 983 return -ENOMEM; 984 985 return 0; 986 } 987 988 /** 989 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 990 * 991 * @adev: amdgpu_device pointer 992 * 993 * Tear down doorbell driver information (CIK) 994 */ 995 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 996 { 997 iounmap(adev->doorbell.ptr); 998 adev->doorbell.ptr = NULL; 999 } 1000 1001 1002 1003 /* 1004 * amdgpu_device_wb_*() 1005 * Writeback is the method by which the GPU updates special pages in memory 1006 * with the status of certain GPU events (fences, ring pointers,etc.). 1007 */ 1008 1009 /** 1010 * amdgpu_device_wb_fini - Disable Writeback and free memory 1011 * 1012 * @adev: amdgpu_device pointer 1013 * 1014 * Disables Writeback and frees the Writeback memory (all asics). 1015 * Used at driver shutdown. 1016 */ 1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1018 { 1019 if (adev->wb.wb_obj) { 1020 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1021 &adev->wb.gpu_addr, 1022 (void **)&adev->wb.wb); 1023 adev->wb.wb_obj = NULL; 1024 } 1025 } 1026 1027 /** 1028 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 1029 * 1030 * @adev: amdgpu_device pointer 1031 * 1032 * Initializes writeback and allocates writeback memory (all asics). 1033 * Used at driver startup. 1034 * Returns 0 on success or an -error on failure. 
1035 */ 1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1037 { 1038 int r; 1039 1040 if (adev->wb.wb_obj == NULL) { 1041 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1042 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1043 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1044 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1045 (void **)&adev->wb.wb); 1046 if (r) { 1047 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1048 return r; 1049 } 1050 1051 adev->wb.num_wb = AMDGPU_MAX_WB; 1052 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1053 1054 /* clear wb memory */ 1055 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1056 } 1057 1058 return 0; 1059 } 1060 1061 /** 1062 * amdgpu_device_wb_get - Allocate a wb entry 1063 * 1064 * @adev: amdgpu_device pointer 1065 * @wb: wb index 1066 * 1067 * Allocate a wb slot for use by the driver (all asics). 1068 * Returns 0 on success or -EINVAL on failure. 1069 */ 1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1071 { 1072 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1073 1074 if (offset < adev->wb.num_wb) { 1075 __set_bit(offset, adev->wb.used); 1076 *wb = offset << 3; /* convert to dw offset */ 1077 return 0; 1078 } else { 1079 return -EINVAL; 1080 } 1081 } 1082 1083 /** 1084 * amdgpu_device_wb_free - Free a wb entry 1085 * 1086 * @adev: amdgpu_device pointer 1087 * @wb: wb index 1088 * 1089 * Free a wb slot allocated for use by the driver (all asics) 1090 */ 1091 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1092 { 1093 wb >>= 3; 1094 if (wb < adev->wb.num_wb) 1095 __clear_bit(wb, adev->wb.used); 1096 } 1097 1098 /** 1099 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1100 * 1101 * @adev: amdgpu_device pointer 1102 * 1103 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1104 * to fail, but if any of the BARs is not accessible after the size we abort 1105 * driver loading by returning -ENODEV. 1106 */ 1107 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1108 { 1109 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1110 struct pci_bus *root; 1111 struct resource *res; 1112 unsigned i; 1113 u16 cmd; 1114 int r; 1115 1116 /* Bypass for VF */ 1117 if (amdgpu_sriov_vf(adev)) 1118 return 0; 1119 1120 /* skip if the bios has already enabled large BAR */ 1121 if (adev->gmc.real_vram_size && 1122 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1123 return 0; 1124 1125 /* Check if the root BUS has 64bit memory resources */ 1126 root = adev->pdev->bus; 1127 while (root->parent) 1128 root = root->parent; 1129 1130 pci_bus_for_each_resource(root, res, i) { 1131 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1132 res->start > 0x100000000ull) 1133 break; 1134 } 1135 1136 /* Trying to resize is pointless without a root hub window above 4GB */ 1137 if (!res) 1138 return 0; 1139 1140 /* Limit the BAR size to what is available */ 1141 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1142 rbar_size); 1143 1144 /* Disable memory decoding while we change the BAR addresses and size */ 1145 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1146 pci_write_config_word(adev->pdev, PCI_COMMAND, 1147 cmd & ~PCI_COMMAND_MEMORY); 1148 1149 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 1150 amdgpu_device_doorbell_fini(adev); 1151 if (adev->asic_type >= CHIP_BONAIRE) 1152 pci_release_resource(adev->pdev, 2); 1153 1154 pci_release_resource(adev->pdev, 0); 1155 1156 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1157 if (r == -ENOSPC) 1158 DRM_INFO("Not enough PCI address space for a large BAR."); 1159 else if (r && r != -ENOTSUPP) 1160 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1161 1162 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1163 1164 /* When the doorbell or fb BAR isn't available we have no chance of 1165 * using the device. 1166 */ 1167 r = amdgpu_device_doorbell_init(adev); 1168 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1169 return -ENODEV; 1170 1171 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1172 1173 return 0; 1174 } 1175 1176 /* 1177 * GPU helpers function. 1178 */ 1179 /** 1180 * amdgpu_device_need_post - check if the hw need post or not 1181 * 1182 * @adev: amdgpu_device pointer 1183 * 1184 * Check if the asic has been initialized (all asics) at driver startup 1185 * or post is needed if hw reset is performed. 1186 * Returns true if need or false if not. 1187 */ 1188 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1189 { 1190 uint32_t reg; 1191 1192 if (amdgpu_sriov_vf(adev)) 1193 return false; 1194 1195 if (amdgpu_passthrough(adev)) { 1196 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1197 * some old smc fw still need driver do vPost otherwise gpu hang, while 1198 * those smc fw version above 22.15 doesn't have this flaw, so we force 1199 * vpost executed for smc version below 22.15 1200 */ 1201 if (adev->asic_type == CHIP_FIJI) { 1202 int err; 1203 uint32_t fw_ver; 1204 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1205 /* force vPost if error occured */ 1206 if (err) 1207 return true; 1208 1209 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1210 if (fw_ver < 0x00160e00) 1211 return true; 1212 } 1213 } 1214 1215 /* Don't post if we need to reset whole hive on init */ 1216 if (adev->gmc.xgmi.pending_reset) 1217 return false; 1218 1219 if (adev->has_hw_reset) { 1220 adev->has_hw_reset = false; 1221 return true; 1222 } 1223 1224 /* bios scratch used on CIK+ */ 1225 if (adev->asic_type >= CHIP_BONAIRE) 1226 return amdgpu_atombios_scratch_need_asic_init(adev); 1227 1228 /* check MEM_SIZE for older asics */ 1229 reg = amdgpu_asic_get_config_memsize(adev); 1230 1231 if ((reg != 0) && (reg != 0xffffffff)) 1232 return false; 1233 1234 return true; 1235 } 1236 1237 /* if we get transitioned to only one device, take VGA back */ 1238 /** 1239 * amdgpu_device_vga_set_decode - enable/disable vga decode 1240 * 1241 * @cookie: amdgpu_device pointer 1242 * @state: enable/disable vga decode 1243 * 1244 * Enable/disable vga decode (all asics). 1245 * Returns VGA resource flags. 1246 */ 1247 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 1248 { 1249 struct amdgpu_device *adev = cookie; 1250 amdgpu_asic_set_vga_state(adev, state); 1251 if (state) 1252 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1253 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1254 else 1255 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1256 } 1257 1258 /** 1259 * amdgpu_device_check_block_size - validate the vm block size 1260 * 1261 * @adev: amdgpu_device pointer 1262 * 1263 * Validates the vm block size specified via module parameter. 
1264 * The vm block size defines number of bits in page table versus page directory, 1265 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1266 * page table and the remaining bits are in the page directory. 1267 */ 1268 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1269 { 1270 /* defines number of bits in page table versus page directory, 1271 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1272 * page table and the remaining bits are in the page directory */ 1273 if (amdgpu_vm_block_size == -1) 1274 return; 1275 1276 if (amdgpu_vm_block_size < 9) { 1277 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1278 amdgpu_vm_block_size); 1279 amdgpu_vm_block_size = -1; 1280 } 1281 } 1282 1283 /** 1284 * amdgpu_device_check_vm_size - validate the vm size 1285 * 1286 * @adev: amdgpu_device pointer 1287 * 1288 * Validates the vm size in GB specified via module parameter. 1289 * The VM size is the size of the GPU virtual memory space in GB. 1290 */ 1291 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1292 { 1293 /* no need to check the default value */ 1294 if (amdgpu_vm_size == -1) 1295 return; 1296 1297 if (amdgpu_vm_size < 1) { 1298 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1299 amdgpu_vm_size); 1300 amdgpu_vm_size = -1; 1301 } 1302 } 1303 1304 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1305 { 1306 struct sysinfo si; 1307 bool is_os_64 = (sizeof(void *) == 8); 1308 uint64_t total_memory; 1309 uint64_t dram_size_seven_GB = 0x1B8000000; 1310 uint64_t dram_size_three_GB = 0xB8000000; 1311 1312 if (amdgpu_smu_memory_pool_size == 0) 1313 return; 1314 1315 if (!is_os_64) { 1316 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1317 goto def_value; 1318 } 1319 si_meminfo(&si); 1320 total_memory = (uint64_t)si.totalram * si.mem_unit; 1321 1322 if ((amdgpu_smu_memory_pool_size == 1) || 1323 (amdgpu_smu_memory_pool_size == 2)) { 1324 if (total_memory < dram_size_three_GB) 1325 goto def_value1; 1326 } else if ((amdgpu_smu_memory_pool_size == 4) || 1327 (amdgpu_smu_memory_pool_size == 8)) { 1328 if (total_memory < dram_size_seven_GB) 1329 goto def_value1; 1330 } else { 1331 DRM_WARN("Smu memory pool size not supported\n"); 1332 goto def_value; 1333 } 1334 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1335 1336 return; 1337 1338 def_value1: 1339 DRM_WARN("No enough system memory\n"); 1340 def_value: 1341 adev->pm.smu_prv_buffer_size = 0; 1342 } 1343 1344 /** 1345 * amdgpu_device_check_arguments - validate module params 1346 * 1347 * @adev: amdgpu_device pointer 1348 * 1349 * Validates certain module parameters and updates 1350 * the associated values used by the driver (all asics). 
1351 */ 1352 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1353 { 1354 if (amdgpu_sched_jobs < 4) { 1355 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1356 amdgpu_sched_jobs); 1357 amdgpu_sched_jobs = 4; 1358 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1359 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1360 amdgpu_sched_jobs); 1361 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1362 } 1363 1364 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1365 /* gart size must be greater or equal to 32M */ 1366 dev_warn(adev->dev, "gart size (%d) too small\n", 1367 amdgpu_gart_size); 1368 amdgpu_gart_size = -1; 1369 } 1370 1371 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1372 /* gtt size must be greater or equal to 32M */ 1373 dev_warn(adev->dev, "gtt size (%d) too small\n", 1374 amdgpu_gtt_size); 1375 amdgpu_gtt_size = -1; 1376 } 1377 1378 /* valid range is between 4 and 9 inclusive */ 1379 if (amdgpu_vm_fragment_size != -1 && 1380 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1381 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1382 amdgpu_vm_fragment_size = -1; 1383 } 1384 1385 if (amdgpu_sched_hw_submission < 2) { 1386 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1387 amdgpu_sched_hw_submission); 1388 amdgpu_sched_hw_submission = 2; 1389 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1390 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1391 amdgpu_sched_hw_submission); 1392 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1393 } 1394 1395 amdgpu_device_check_smu_prv_buffer_size(adev); 1396 1397 amdgpu_device_check_vm_size(adev); 1398 1399 amdgpu_device_check_block_size(adev); 1400 1401 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1402 1403 amdgpu_gmc_tmz_set(adev); 1404 1405 amdgpu_gmc_noretry_set(adev); 1406 1407 return 0; 1408 } 1409 1410 /** 1411 * amdgpu_switcheroo_set_state - set switcheroo state 1412 * 1413 * @pdev: pci dev pointer 1414 * @state: vga_switcheroo state 1415 * 1416 * Callback for the switcheroo driver. Suspends or resumes the 1417 * the asics before or after it is powered up using ACPI methods. 1418 */ 1419 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1420 enum vga_switcheroo_state state) 1421 { 1422 struct drm_device *dev = pci_get_drvdata(pdev); 1423 int r; 1424 1425 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1426 return; 1427 1428 if (state == VGA_SWITCHEROO_ON) { 1429 pr_info("switched on\n"); 1430 /* don't suspend or resume card normally */ 1431 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1432 1433 pci_set_power_state(pdev, PCI_D0); 1434 amdgpu_device_load_pci_state(pdev); 1435 r = pci_enable_device(pdev); 1436 if (r) 1437 DRM_WARN("pci_enable_device failed (%d)\n", r); 1438 amdgpu_device_resume(dev, true); 1439 1440 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1441 } else { 1442 pr_info("switched off\n"); 1443 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1444 amdgpu_device_suspend(dev, true); 1445 amdgpu_device_cache_pci_state(pdev); 1446 /* Shut down the device */ 1447 pci_disable_device(pdev); 1448 pci_set_power_state(pdev, PCI_D3cold); 1449 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1450 } 1451 } 1452 1453 /** 1454 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1455 * 1456 * @pdev: pci dev pointer 1457 * 1458 * Callback for the switcheroo driver. 
Check of the switcheroo 1459 * state can be changed. 1460 * Returns true if the state can be changed, false if not. 1461 */ 1462 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1463 { 1464 struct drm_device *dev = pci_get_drvdata(pdev); 1465 1466 /* 1467 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1468 * locking inversion with the driver load path. And the access here is 1469 * completely racy anyway. So don't bother with locking for now. 1470 */ 1471 return atomic_read(&dev->open_count) == 0; 1472 } 1473 1474 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1475 .set_gpu_state = amdgpu_switcheroo_set_state, 1476 .reprobe = NULL, 1477 .can_switch = amdgpu_switcheroo_can_switch, 1478 }; 1479 1480 /** 1481 * amdgpu_device_ip_set_clockgating_state - set the CG state 1482 * 1483 * @dev: amdgpu_device pointer 1484 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1485 * @state: clockgating state (gate or ungate) 1486 * 1487 * Sets the requested clockgating state for all instances of 1488 * the hardware IP specified. 1489 * Returns the error code from the last instance. 1490 */ 1491 int amdgpu_device_ip_set_clockgating_state(void *dev, 1492 enum amd_ip_block_type block_type, 1493 enum amd_clockgating_state state) 1494 { 1495 struct amdgpu_device *adev = dev; 1496 int i, r = 0; 1497 1498 for (i = 0; i < adev->num_ip_blocks; i++) { 1499 if (!adev->ip_blocks[i].status.valid) 1500 continue; 1501 if (adev->ip_blocks[i].version->type != block_type) 1502 continue; 1503 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1504 continue; 1505 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1506 (void *)adev, state); 1507 if (r) 1508 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1509 adev->ip_blocks[i].version->funcs->name, r); 1510 } 1511 return r; 1512 } 1513 1514 /** 1515 * amdgpu_device_ip_set_powergating_state - set the PG state 1516 * 1517 * @dev: amdgpu_device pointer 1518 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1519 * @state: powergating state (gate or ungate) 1520 * 1521 * Sets the requested powergating state for all instances of 1522 * the hardware IP specified. 1523 * Returns the error code from the last instance. 1524 */ 1525 int amdgpu_device_ip_set_powergating_state(void *dev, 1526 enum amd_ip_block_type block_type, 1527 enum amd_powergating_state state) 1528 { 1529 struct amdgpu_device *adev = dev; 1530 int i, r = 0; 1531 1532 for (i = 0; i < adev->num_ip_blocks; i++) { 1533 if (!adev->ip_blocks[i].status.valid) 1534 continue; 1535 if (adev->ip_blocks[i].version->type != block_type) 1536 continue; 1537 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1538 continue; 1539 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1540 (void *)adev, state); 1541 if (r) 1542 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1543 adev->ip_blocks[i].version->funcs->name, r); 1544 } 1545 return r; 1546 } 1547 1548 /** 1549 * amdgpu_device_ip_get_clockgating_state - get the CG state 1550 * 1551 * @adev: amdgpu_device pointer 1552 * @flags: clockgating feature flags 1553 * 1554 * Walks the list of IPs on the device and updates the clockgating 1555 * flags for each IP. 1556 * Updates @flags with the feature flags for each hardware IP where 1557 * clockgating is enabled. 
1558 */ 1559 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1560 u32 *flags) 1561 { 1562 int i; 1563 1564 for (i = 0; i < adev->num_ip_blocks; i++) { 1565 if (!adev->ip_blocks[i].status.valid) 1566 continue; 1567 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1568 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1569 } 1570 } 1571 1572 /** 1573 * amdgpu_device_ip_wait_for_idle - wait for idle 1574 * 1575 * @adev: amdgpu_device pointer 1576 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1577 * 1578 * Waits for the request hardware IP to be idle. 1579 * Returns 0 for success or a negative error code on failure. 1580 */ 1581 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1582 enum amd_ip_block_type block_type) 1583 { 1584 int i, r; 1585 1586 for (i = 0; i < adev->num_ip_blocks; i++) { 1587 if (!adev->ip_blocks[i].status.valid) 1588 continue; 1589 if (adev->ip_blocks[i].version->type == block_type) { 1590 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1591 if (r) 1592 return r; 1593 break; 1594 } 1595 } 1596 return 0; 1597 1598 } 1599 1600 /** 1601 * amdgpu_device_ip_is_idle - is the hardware IP idle 1602 * 1603 * @adev: amdgpu_device pointer 1604 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1605 * 1606 * Check if the hardware IP is idle or not. 1607 * Returns true if it the IP is idle, false if not. 1608 */ 1609 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1610 enum amd_ip_block_type block_type) 1611 { 1612 int i; 1613 1614 for (i = 0; i < adev->num_ip_blocks; i++) { 1615 if (!adev->ip_blocks[i].status.valid) 1616 continue; 1617 if (adev->ip_blocks[i].version->type == block_type) 1618 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1619 } 1620 return true; 1621 1622 } 1623 1624 /** 1625 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1626 * 1627 * @adev: amdgpu_device pointer 1628 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1629 * 1630 * Returns a pointer to the hardware IP block structure 1631 * if it exists for the asic, otherwise NULL. 1632 */ 1633 struct amdgpu_ip_block * 1634 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1635 enum amd_ip_block_type type) 1636 { 1637 int i; 1638 1639 for (i = 0; i < adev->num_ip_blocks; i++) 1640 if (adev->ip_blocks[i].version->type == type) 1641 return &adev->ip_blocks[i]; 1642 1643 return NULL; 1644 } 1645 1646 /** 1647 * amdgpu_device_ip_block_version_cmp 1648 * 1649 * @adev: amdgpu_device pointer 1650 * @type: enum amd_ip_block_type 1651 * @major: major version 1652 * @minor: minor version 1653 * 1654 * return 0 if equal or greater 1655 * return 1 if smaller or the ip_block doesn't exist 1656 */ 1657 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1658 enum amd_ip_block_type type, 1659 u32 major, u32 minor) 1660 { 1661 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1662 1663 if (ip_block && ((ip_block->version->major > major) || 1664 ((ip_block->version->major == major) && 1665 (ip_block->version->minor >= minor)))) 1666 return 0; 1667 1668 return 1; 1669 } 1670 1671 /** 1672 * amdgpu_device_ip_block_add 1673 * 1674 * @adev: amdgpu_device pointer 1675 * @ip_block_version: pointer to the IP to add 1676 * 1677 * Adds the IP block driver information to the collection of IPs 1678 * on the asic. 
1679 */ 1680 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1681 const struct amdgpu_ip_block_version *ip_block_version) 1682 { 1683 if (!ip_block_version) 1684 return -EINVAL; 1685 1686 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1687 ip_block_version->funcs->name); 1688 1689 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1690 1691 return 0; 1692 } 1693 1694 /** 1695 * amdgpu_device_enable_virtual_display - enable virtual display feature 1696 * 1697 * @adev: amdgpu_device pointer 1698 * 1699 * Enabled the virtual display feature if the user has enabled it via 1700 * the module parameter virtual_display. This feature provides a virtual 1701 * display hardware on headless boards or in virtualized environments. 1702 * This function parses and validates the configuration string specified by 1703 * the user and configues the virtual display configuration (number of 1704 * virtual connectors, crtcs, etc.) specified. 1705 */ 1706 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1707 { 1708 adev->enable_virtual_display = false; 1709 1710 if (amdgpu_virtual_display) { 1711 const char *pci_address_name = pci_name(adev->pdev); 1712 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1713 1714 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1715 pciaddstr_tmp = pciaddstr; 1716 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1717 pciaddname = strsep(&pciaddname_tmp, ","); 1718 if (!strcmp("all", pciaddname) 1719 || !strcmp(pci_address_name, pciaddname)) { 1720 long num_crtc; 1721 int res = -1; 1722 1723 adev->enable_virtual_display = true; 1724 1725 if (pciaddname_tmp) 1726 res = kstrtol(pciaddname_tmp, 10, 1727 &num_crtc); 1728 1729 if (!res) { 1730 if (num_crtc < 1) 1731 num_crtc = 1; 1732 if (num_crtc > 6) 1733 num_crtc = 6; 1734 adev->mode_info.num_crtc = num_crtc; 1735 } else { 1736 adev->mode_info.num_crtc = 1; 1737 } 1738 break; 1739 } 1740 } 1741 1742 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1743 amdgpu_virtual_display, pci_address_name, 1744 adev->enable_virtual_display, adev->mode_info.num_crtc); 1745 1746 kfree(pciaddstr); 1747 } 1748 } 1749 1750 /** 1751 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1752 * 1753 * @adev: amdgpu_device pointer 1754 * 1755 * Parses the asic configuration parameters specified in the gpu info 1756 * firmware and makes them availale to the driver for use in configuring 1757 * the asic. 1758 * Returns 0 on success, -EINVAL on failure. 1759 */ 1760 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1761 { 1762 const char *chip_name; 1763 char fw_name[40]; 1764 int err; 1765 const struct gpu_info_firmware_header_v1_0 *hdr; 1766 1767 adev->firmware.gpu_info_fw = NULL; 1768 1769 if (adev->mman.discovery_bin) { 1770 amdgpu_discovery_get_gfx_info(adev); 1771 1772 /* 1773 * FIXME: The bounding box is still needed by Navi12, so 1774 * temporarily read it from gpu_info firmware. Should be droped 1775 * when DAL no longer needs it. 
1776 */ 1777 if (adev->asic_type != CHIP_NAVI12) 1778 return 0; 1779 } 1780 1781 switch (adev->asic_type) { 1782 #ifdef CONFIG_DRM_AMDGPU_SI 1783 case CHIP_VERDE: 1784 case CHIP_TAHITI: 1785 case CHIP_PITCAIRN: 1786 case CHIP_OLAND: 1787 case CHIP_HAINAN: 1788 #endif 1789 #ifdef CONFIG_DRM_AMDGPU_CIK 1790 case CHIP_BONAIRE: 1791 case CHIP_HAWAII: 1792 case CHIP_KAVERI: 1793 case CHIP_KABINI: 1794 case CHIP_MULLINS: 1795 #endif 1796 case CHIP_TOPAZ: 1797 case CHIP_TONGA: 1798 case CHIP_FIJI: 1799 case CHIP_POLARIS10: 1800 case CHIP_POLARIS11: 1801 case CHIP_POLARIS12: 1802 case CHIP_VEGAM: 1803 case CHIP_CARRIZO: 1804 case CHIP_STONEY: 1805 case CHIP_VEGA20: 1806 case CHIP_ALDEBARAN: 1807 case CHIP_SIENNA_CICHLID: 1808 case CHIP_NAVY_FLOUNDER: 1809 case CHIP_DIMGREY_CAVEFISH: 1810 default: 1811 return 0; 1812 case CHIP_VEGA10: 1813 chip_name = "vega10"; 1814 break; 1815 case CHIP_VEGA12: 1816 chip_name = "vega12"; 1817 break; 1818 case CHIP_RAVEN: 1819 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1820 chip_name = "raven2"; 1821 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1822 chip_name = "picasso"; 1823 else 1824 chip_name = "raven"; 1825 break; 1826 case CHIP_ARCTURUS: 1827 chip_name = "arcturus"; 1828 break; 1829 case CHIP_RENOIR: 1830 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1831 chip_name = "renoir"; 1832 else 1833 chip_name = "green_sardine"; 1834 break; 1835 case CHIP_NAVI10: 1836 chip_name = "navi10"; 1837 break; 1838 case CHIP_NAVI14: 1839 chip_name = "navi14"; 1840 break; 1841 case CHIP_NAVI12: 1842 chip_name = "navi12"; 1843 break; 1844 case CHIP_VANGOGH: 1845 chip_name = "vangogh"; 1846 break; 1847 } 1848 1849 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1850 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1851 if (err) { 1852 dev_err(adev->dev, 1853 "Failed to load gpu_info firmware \"%s\"\n", 1854 fw_name); 1855 goto out; 1856 } 1857 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1858 if (err) { 1859 dev_err(adev->dev, 1860 "Failed to validate gpu_info firmware \"%s\"\n", 1861 fw_name); 1862 goto out; 1863 } 1864 1865 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1866 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1867 1868 switch (hdr->version_major) { 1869 case 1: 1870 { 1871 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1872 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1873 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1874 1875 /* 1876 * Should be droped when DAL no longer needs it. 
1877 */ 1878 if (adev->asic_type == CHIP_NAVI12) 1879 goto parse_soc_bounding_box; 1880 1881 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1882 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1883 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1884 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1885 adev->gfx.config.max_texture_channel_caches = 1886 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1887 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1888 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1889 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1890 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1891 adev->gfx.config.double_offchip_lds_buf = 1892 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1893 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1894 adev->gfx.cu_info.max_waves_per_simd = 1895 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1896 adev->gfx.cu_info.max_scratch_slots_per_cu = 1897 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1898 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1899 if (hdr->version_minor >= 1) { 1900 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1901 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1902 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1903 adev->gfx.config.num_sc_per_sh = 1904 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1905 adev->gfx.config.num_packer_per_sc = 1906 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1907 } 1908 1909 parse_soc_bounding_box: 1910 /* 1911 * soc bounding box info is not integrated in disocovery table, 1912 * we always need to parse it from gpu info firmware if needed. 1913 */ 1914 if (hdr->version_minor == 2) { 1915 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1916 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1917 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1918 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1919 } 1920 break; 1921 } 1922 default: 1923 dev_err(adev->dev, 1924 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1925 err = -EINVAL; 1926 goto out; 1927 } 1928 out: 1929 return err; 1930 } 1931 1932 /** 1933 * amdgpu_device_ip_early_init - run early init for hardware IPs 1934 * 1935 * @adev: amdgpu_device pointer 1936 * 1937 * Early initialization pass for hardware IPs. The hardware IPs that make 1938 * up each asic are discovered each IP's early_init callback is run. This 1939 * is the first stage in initializing the asic. 1940 * Returns 0 on success, negative error code on failure. 
1941 */ 1942 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1943 { 1944 int i, r; 1945 1946 amdgpu_device_enable_virtual_display(adev); 1947 1948 if (amdgpu_sriov_vf(adev)) { 1949 r = amdgpu_virt_request_full_gpu(adev, true); 1950 if (r) 1951 return r; 1952 } 1953 1954 switch (adev->asic_type) { 1955 #ifdef CONFIG_DRM_AMDGPU_SI 1956 case CHIP_VERDE: 1957 case CHIP_TAHITI: 1958 case CHIP_PITCAIRN: 1959 case CHIP_OLAND: 1960 case CHIP_HAINAN: 1961 adev->family = AMDGPU_FAMILY_SI; 1962 r = si_set_ip_blocks(adev); 1963 if (r) 1964 return r; 1965 break; 1966 #endif 1967 #ifdef CONFIG_DRM_AMDGPU_CIK 1968 case CHIP_BONAIRE: 1969 case CHIP_HAWAII: 1970 case CHIP_KAVERI: 1971 case CHIP_KABINI: 1972 case CHIP_MULLINS: 1973 if (adev->flags & AMD_IS_APU) 1974 adev->family = AMDGPU_FAMILY_KV; 1975 else 1976 adev->family = AMDGPU_FAMILY_CI; 1977 1978 r = cik_set_ip_blocks(adev); 1979 if (r) 1980 return r; 1981 break; 1982 #endif 1983 case CHIP_TOPAZ: 1984 case CHIP_TONGA: 1985 case CHIP_FIJI: 1986 case CHIP_POLARIS10: 1987 case CHIP_POLARIS11: 1988 case CHIP_POLARIS12: 1989 case CHIP_VEGAM: 1990 case CHIP_CARRIZO: 1991 case CHIP_STONEY: 1992 if (adev->flags & AMD_IS_APU) 1993 adev->family = AMDGPU_FAMILY_CZ; 1994 else 1995 adev->family = AMDGPU_FAMILY_VI; 1996 1997 r = vi_set_ip_blocks(adev); 1998 if (r) 1999 return r; 2000 break; 2001 case CHIP_VEGA10: 2002 case CHIP_VEGA12: 2003 case CHIP_VEGA20: 2004 case CHIP_RAVEN: 2005 case CHIP_ARCTURUS: 2006 case CHIP_RENOIR: 2007 case CHIP_ALDEBARAN: 2008 if (adev->flags & AMD_IS_APU) 2009 adev->family = AMDGPU_FAMILY_RV; 2010 else 2011 adev->family = AMDGPU_FAMILY_AI; 2012 2013 r = soc15_set_ip_blocks(adev); 2014 if (r) 2015 return r; 2016 break; 2017 case CHIP_NAVI10: 2018 case CHIP_NAVI14: 2019 case CHIP_NAVI12: 2020 case CHIP_SIENNA_CICHLID: 2021 case CHIP_NAVY_FLOUNDER: 2022 case CHIP_DIMGREY_CAVEFISH: 2023 case CHIP_VANGOGH: 2024 if (adev->asic_type == CHIP_VANGOGH) 2025 adev->family = AMDGPU_FAMILY_VGH; 2026 else 2027 adev->family = AMDGPU_FAMILY_NV; 2028 2029 r = nv_set_ip_blocks(adev); 2030 if (r) 2031 return r; 2032 break; 2033 default: 2034 /* FIXME: not supported yet */ 2035 return -EINVAL; 2036 } 2037 2038 amdgpu_amdkfd_device_probe(adev); 2039 2040 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2041 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2042 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2043 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2044 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2045 2046 for (i = 0; i < adev->num_ip_blocks; i++) { 2047 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2048 DRM_ERROR("disabled ip block: %d <%s>\n", 2049 i, adev->ip_blocks[i].version->funcs->name); 2050 adev->ip_blocks[i].status.valid = false; 2051 } else { 2052 if (adev->ip_blocks[i].version->funcs->early_init) { 2053 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2054 if (r == -ENOENT) { 2055 adev->ip_blocks[i].status.valid = false; 2056 } else if (r) { 2057 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2058 adev->ip_blocks[i].version->funcs->name, r); 2059 return r; 2060 } else { 2061 adev->ip_blocks[i].status.valid = true; 2062 } 2063 } else { 2064 adev->ip_blocks[i].status.valid = true; 2065 } 2066 } 2067 /* get the vbios after the asic_funcs are set up */ 2068 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2069 r = amdgpu_device_parse_gpu_info_fw(adev); 2070 if (r) 2071 return r; 2072 2073 /* Read BIOS */ 2074 if (!amdgpu_get_bios(adev)) 2075 
return -EINVAL; 2076 2077 r = amdgpu_atombios_init(adev); 2078 if (r) { 2079 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2080 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2081 return r; 2082 } 2083 } 2084 } 2085 2086 adev->cg_flags &= amdgpu_cg_mask; 2087 adev->pg_flags &= amdgpu_pg_mask; 2088 2089 return 0; 2090 } 2091 2092 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2093 { 2094 int i, r; 2095 2096 for (i = 0; i < adev->num_ip_blocks; i++) { 2097 if (!adev->ip_blocks[i].status.sw) 2098 continue; 2099 if (adev->ip_blocks[i].status.hw) 2100 continue; 2101 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2102 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2103 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2104 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2105 if (r) { 2106 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2107 adev->ip_blocks[i].version->funcs->name, r); 2108 return r; 2109 } 2110 adev->ip_blocks[i].status.hw = true; 2111 } 2112 } 2113 2114 return 0; 2115 } 2116 2117 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2118 { 2119 int i, r; 2120 2121 for (i = 0; i < adev->num_ip_blocks; i++) { 2122 if (!adev->ip_blocks[i].status.sw) 2123 continue; 2124 if (adev->ip_blocks[i].status.hw) 2125 continue; 2126 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2127 if (r) { 2128 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2129 adev->ip_blocks[i].version->funcs->name, r); 2130 return r; 2131 } 2132 adev->ip_blocks[i].status.hw = true; 2133 } 2134 2135 return 0; 2136 } 2137 2138 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2139 { 2140 int r = 0; 2141 int i; 2142 uint32_t smu_version; 2143 2144 if (adev->asic_type >= CHIP_VEGA10) { 2145 for (i = 0; i < adev->num_ip_blocks; i++) { 2146 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2147 continue; 2148 2149 if (!adev->ip_blocks[i].status.sw) 2150 continue; 2151 2152 /* no need to do the fw loading again if already done*/ 2153 if (adev->ip_blocks[i].status.hw == true) 2154 break; 2155 2156 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2157 r = adev->ip_blocks[i].version->funcs->resume(adev); 2158 if (r) { 2159 DRM_ERROR("resume of IP block <%s> failed %d\n", 2160 adev->ip_blocks[i].version->funcs->name, r); 2161 return r; 2162 } 2163 } else { 2164 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2165 if (r) { 2166 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2167 adev->ip_blocks[i].version->funcs->name, r); 2168 return r; 2169 } 2170 } 2171 2172 adev->ip_blocks[i].status.hw = true; 2173 break; 2174 } 2175 } 2176 2177 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2178 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2179 2180 return r; 2181 } 2182 2183 /** 2184 * amdgpu_device_ip_init - run init for hardware IPs 2185 * 2186 * @adev: amdgpu_device pointer 2187 * 2188 * Main initialization pass for hardware IPs. The list of all the hardware 2189 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2190 * are run. sw_init initializes the software state associated with each IP 2191 * and hw_init initializes the hardware associated with each IP. 2192 * Returns 0 on success, negative error code on failure. 
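 * The pass is ordered: sw_init runs for every valid block (with the GMC block
 * also getting an early hw_init so GPU memory can be allocated), then hw_init
 * runs in two phases via amdgpu_device_ip_hw_init_phase1() and
 * amdgpu_device_ip_hw_init_phase2(), with amdgpu_device_fw_loading() in
 * between.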
2193 */ 2194 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2195 { 2196 int i, r; 2197 2198 r = amdgpu_ras_init(adev); 2199 if (r) 2200 return r; 2201 2202 for (i = 0; i < adev->num_ip_blocks; i++) { 2203 if (!adev->ip_blocks[i].status.valid) 2204 continue; 2205 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2206 if (r) { 2207 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2208 adev->ip_blocks[i].version->funcs->name, r); 2209 goto init_failed; 2210 } 2211 adev->ip_blocks[i].status.sw = true; 2212 2213 /* need to do gmc hw init early so we can allocate gpu mem */ 2214 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2215 r = amdgpu_device_vram_scratch_init(adev); 2216 if (r) { 2217 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2218 goto init_failed; 2219 } 2220 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2221 if (r) { 2222 DRM_ERROR("hw_init %d failed %d\n", i, r); 2223 goto init_failed; 2224 } 2225 r = amdgpu_device_wb_init(adev); 2226 if (r) { 2227 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2228 goto init_failed; 2229 } 2230 adev->ip_blocks[i].status.hw = true; 2231 2232 /* right after GMC hw init, we create CSA */ 2233 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2234 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2235 AMDGPU_GEM_DOMAIN_VRAM, 2236 AMDGPU_CSA_SIZE); 2237 if (r) { 2238 DRM_ERROR("allocate CSA failed %d\n", r); 2239 goto init_failed; 2240 } 2241 } 2242 } 2243 } 2244 2245 if (amdgpu_sriov_vf(adev)) 2246 amdgpu_virt_init_data_exchange(adev); 2247 2248 r = amdgpu_ib_pool_init(adev); 2249 if (r) { 2250 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2251 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2252 goto init_failed; 2253 } 2254 2255 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2256 if (r) 2257 goto init_failed; 2258 2259 r = amdgpu_device_ip_hw_init_phase1(adev); 2260 if (r) 2261 goto init_failed; 2262 2263 r = amdgpu_device_fw_loading(adev); 2264 if (r) 2265 goto init_failed; 2266 2267 r = amdgpu_device_ip_hw_init_phase2(adev); 2268 if (r) 2269 goto init_failed; 2270 2271 /* 2272 * retired pages will be loaded from eeprom and reserved here, 2273 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2274 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2275 * for I2C communication which only true at this point. 2276 * 2277 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2278 * failure from bad gpu situation and stop amdgpu init process 2279 * accordingly. For other failed cases, it will still release all 2280 * the resource and print error message, rather than returning one 2281 * negative value to upper level. 
2282 * 2283 * Note: theoretically, this should be called before all vram allocations 2284 * to protect retired pages from being used again 2285 */ 2286 r = amdgpu_ras_recovery_init(adev); 2287 if (r) 2288 goto init_failed; 2289 2290 if (adev->gmc.xgmi.num_physical_nodes > 1) 2291 amdgpu_xgmi_add_device(adev); 2292 2293 /* Don't init kfd if whole hive need to be reset during init */ 2294 if (!adev->gmc.xgmi.pending_reset) 2295 amdgpu_amdkfd_device_init(adev); 2296 2297 amdgpu_fru_get_product_info(adev); 2298 2299 init_failed: 2300 if (amdgpu_sriov_vf(adev)) 2301 amdgpu_virt_release_full_gpu(adev, true); 2302 2303 return r; 2304 } 2305 2306 /** 2307 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2308 * 2309 * @adev: amdgpu_device pointer 2310 * 2311 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2312 * this function before a GPU reset. If the value is retained after a 2313 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2314 */ 2315 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2316 { 2317 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2318 } 2319 2320 /** 2321 * amdgpu_device_check_vram_lost - check if vram is valid 2322 * 2323 * @adev: amdgpu_device pointer 2324 * 2325 * Checks the reset magic value written to the gart pointer in VRAM. 2326 * The driver calls this after a GPU reset to see if the contents of 2327 * VRAM are lost or not. 2328 * Returns true if vram is lost, false if not. 2329 */ 2330 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2331 { 2332 if (memcmp(adev->gart.ptr, adev->reset_magic, 2333 AMDGPU_RESET_MAGIC_NUM)) 2334 return true; 2335 2336 if (!amdgpu_in_reset(adev)) 2337 return false; 2338 2339 /* 2340 * For all ASICs with baco/mode1 reset, the VRAM is 2341 * always assumed to be lost. 2342 */ 2343 switch (amdgpu_asic_reset_method(adev)) { 2344 case AMD_RESET_METHOD_BACO: 2345 case AMD_RESET_METHOD_MODE1: 2346 return true; 2347 default: 2348 return false; 2349 } 2350 } 2351 2352 /** 2353 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2354 * 2355 * @adev: amdgpu_device pointer 2356 * @state: clockgating state (gate or ungate) 2357 * 2358 * The list of all the hardware IPs that make up the asic is walked and the 2359 * set_clockgating_state callbacks are run. 2360 * During late init this pass enables clockgating for hardware IPs; during 2361 * fini or suspend it disables clockgating for hardware IPs. 2362 * Returns 0 on success, negative error code on failure. 2363 */ 2364 2365 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2366 enum amd_clockgating_state state) 2367 { 2368 int i, j, r; 2369 2370 if (amdgpu_emu_mode == 1) 2371 return 0; 2372 2373 for (j = 0; j < adev->num_ip_blocks; j++) { 2374 i = state == AMD_CG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 2375 if (!adev->ip_blocks[i].status.late_initialized) 2376 continue; 2377 /* skip CG for GFX on S0ix */ 2378 if (adev->in_s0ix && 2379 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2380 continue; 2381 /* skip CG for VCE/UVD, it's handled specially */ 2382 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2383 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2384 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2385 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2386 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2387 /* enable clockgating to save power */ 2388 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2389 state); 2390 if (r) { 2391 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2392 adev->ip_blocks[i].version->funcs->name, r); 2393 return r; 2394 } 2395 } 2396 } 2397 2398 return 0; 2399 } 2400 2401 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2402 enum amd_powergating_state state) 2403 { 2404 int i, j, r; 2405 2406 if (amdgpu_emu_mode == 1) 2407 return 0; 2408 2409 for (j = 0; j < adev->num_ip_blocks; j++) { 2410 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2411 if (!adev->ip_blocks[i].status.late_initialized) 2412 continue; 2413 /* skip PG for GFX on S0ix */ 2414 if (adev->in_s0ix && 2415 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2416 continue; 2417 /* skip PG for VCE/UVD, it's handled specially */ 2418 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2419 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2420 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2421 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2422 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2423 /* enable powergating to save power */ 2424 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2425 state); 2426 if (r) { 2427 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2428 adev->ip_blocks[i].version->funcs->name, r); 2429 return r; 2430 } 2431 } 2432 } 2433 return 0; 2434 } 2435 2436 static int amdgpu_device_enable_mgpu_fan_boost(void) 2437 { 2438 struct amdgpu_gpu_instance *gpu_ins; 2439 struct amdgpu_device *adev; 2440 int i, ret = 0; 2441 2442 mutex_lock(&mgpu_info.mutex); 2443 2444 /* 2445 * MGPU fan boost feature should be enabled 2446 * only when there are two or more dGPUs in 2447 * the system 2448 */ 2449 if (mgpu_info.num_dgpu < 2) 2450 goto out; 2451 2452 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2453 gpu_ins = &(mgpu_info.gpu_ins[i]); 2454 adev = gpu_ins->adev; 2455 if (!(adev->flags & AMD_IS_APU) && 2456 !gpu_ins->mgpu_fan_enabled) { 2457 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2458 if (ret) 2459 break; 2460 2461 gpu_ins->mgpu_fan_enabled = 1; 2462 } 2463 } 2464 2465 out: 2466 mutex_unlock(&mgpu_info.mutex); 2467 2468 return ret; 2469 } 2470 2471 /** 2472 * amdgpu_device_ip_late_init - run late init for hardware IPs 2473 * 2474 * @adev: amdgpu_device pointer 2475 * 2476 * Late initialization pass for hardware IPs. The list of all the hardware 2477 * IPs that make up the asic is walked and the late_init callbacks are run. 2478 * late_init covers any special initialization that an IP requires 2479 * after all of the IPs have been initialized or something that needs to happen 2480 * late in the init process. 2481 * Returns 0 on success, negative error code on failure.
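 * Concretely, late init is also the point where clockgating and powergating
 * are enabled, the VRAM reset magic is recorded, the mgpu fan boost is
 * enabled and, for XGMI hives, the link p-state is lowered once every device
 * in the hive has been counted (see amdgpu_device_ip_late_init() below).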
2482 */ 2483 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2484 { 2485 struct amdgpu_gpu_instance *gpu_instance; 2486 int i = 0, r; 2487 2488 for (i = 0; i < adev->num_ip_blocks; i++) { 2489 if (!adev->ip_blocks[i].status.hw) 2490 continue; 2491 if (adev->ip_blocks[i].version->funcs->late_init) { 2492 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2493 if (r) { 2494 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2495 adev->ip_blocks[i].version->funcs->name, r); 2496 return r; 2497 } 2498 } 2499 adev->ip_blocks[i].status.late_initialized = true; 2500 } 2501 2502 amdgpu_ras_set_error_query_ready(adev, true); 2503 2504 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2505 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2506 2507 amdgpu_device_fill_reset_magic(adev); 2508 2509 r = amdgpu_device_enable_mgpu_fan_boost(); 2510 if (r) 2511 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2512 2513 /* For XGMI + passthrough configuration on arcturus, enable light SBR */ 2514 if (adev->asic_type == CHIP_ARCTURUS && 2515 amdgpu_passthrough(adev) && 2516 adev->gmc.xgmi.num_physical_nodes > 1) 2517 smu_set_light_sbr(&adev->smu, true); 2518 2519 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2520 mutex_lock(&mgpu_info.mutex); 2521 2522 /* 2523 * Reset device p-state to low, as the device was booted with a high p-state. 2524 * 2525 * This should be performed only after all devices from the same 2526 * hive get initialized. 2527 * 2528 * However, the number of devices in the hive is not known in advance; 2529 * it is counted one by one as each device initializes. 2530 * 2531 * So, we wait for all XGMI interlinked devices to be initialized. 2532 * This may bring some delays as those devices may come from 2533 * different hives. But that should be OK. 2534 */ 2535 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2536 for (i = 0; i < mgpu_info.num_gpu; i++) { 2537 gpu_instance = &(mgpu_info.gpu_ins[i]); 2538 if (gpu_instance->adev->flags & AMD_IS_APU) 2539 continue; 2540 2541 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2542 AMDGPU_XGMI_PSTATE_MIN); 2543 if (r) { 2544 DRM_ERROR("pstate setting failed (%d).\n", r); 2545 break; 2546 } 2547 } 2548 } 2549 2550 mutex_unlock(&mgpu_info.mutex); 2551 } 2552 2553 return 0; 2554 } 2555 2556 /** 2557 * amdgpu_device_ip_fini - run fini for hardware IPs 2558 * 2559 * @adev: amdgpu_device pointer 2560 * 2561 * Main teardown pass for hardware IPs. The list of all the hardware 2562 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2563 * are run. hw_fini tears down the hardware associated with each IP 2564 * and sw_fini tears down any software state associated with each IP. 2565 * Returns 0 on success, negative error code on failure.
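 * Teardown happens in reverse: the SMC block is disabled first, then hw_fini
 * runs over the remaining blocks in reverse order, followed by sw_fini and
 * finally late_fini for every block that completed late init.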
2566 */ 2567 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2568 { 2569 int i, r; 2570 2571 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2572 amdgpu_virt_release_ras_err_handler_data(adev); 2573 2574 amdgpu_ras_pre_fini(adev); 2575 2576 if (adev->gmc.xgmi.num_physical_nodes > 1) 2577 amdgpu_xgmi_remove_device(adev); 2578 2579 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2580 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2581 2582 amdgpu_amdkfd_device_fini(adev); 2583 2584 /* need to disable SMC first */ 2585 for (i = 0; i < adev->num_ip_blocks; i++) { 2586 if (!adev->ip_blocks[i].status.hw) 2587 continue; 2588 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2589 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2590 /* XXX handle errors */ 2591 if (r) { 2592 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2593 adev->ip_blocks[i].version->funcs->name, r); 2594 } 2595 adev->ip_blocks[i].status.hw = false; 2596 break; 2597 } 2598 } 2599 2600 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2601 if (!adev->ip_blocks[i].status.hw) 2602 continue; 2603 2604 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2605 /* XXX handle errors */ 2606 if (r) { 2607 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2608 adev->ip_blocks[i].version->funcs->name, r); 2609 } 2610 2611 adev->ip_blocks[i].status.hw = false; 2612 } 2613 2614 2615 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2616 if (!adev->ip_blocks[i].status.sw) 2617 continue; 2618 2619 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2620 amdgpu_ucode_free_bo(adev); 2621 amdgpu_free_static_csa(&adev->virt.csa_obj); 2622 amdgpu_device_wb_fini(adev); 2623 amdgpu_device_vram_scratch_fini(adev); 2624 amdgpu_ib_pool_fini(adev); 2625 } 2626 2627 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2628 /* XXX handle errors */ 2629 if (r) { 2630 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2631 adev->ip_blocks[i].version->funcs->name, r); 2632 } 2633 adev->ip_blocks[i].status.sw = false; 2634 adev->ip_blocks[i].status.valid = false; 2635 } 2636 2637 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2638 if (!adev->ip_blocks[i].status.late_initialized) 2639 continue; 2640 if (adev->ip_blocks[i].version->funcs->late_fini) 2641 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2642 adev->ip_blocks[i].status.late_initialized = false; 2643 } 2644 2645 amdgpu_ras_fini(adev); 2646 2647 if (amdgpu_sriov_vf(adev)) 2648 if (amdgpu_virt_release_full_gpu(adev, false)) 2649 DRM_ERROR("failed to release exclusive mode on fini\n"); 2650 2651 return 0; 2652 } 2653 2654 /** 2655 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2656 * 2657 * @work: work_struct. 
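 *
 * Runs the deferred IB ring tests; the work is queued AMDGPU_RESUME_MS
 * milliseconds after init and after resume so the tests run once the
 * hardware has settled.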
2658 */ 2659 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2660 { 2661 struct amdgpu_device *adev = 2662 container_of(work, struct amdgpu_device, delayed_init_work.work); 2663 int r; 2664 2665 r = amdgpu_ib_ring_tests(adev); 2666 if (r) 2667 DRM_ERROR("ib ring test failed (%d).\n", r); 2668 } 2669 2670 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2671 { 2672 struct amdgpu_device *adev = 2673 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2674 2675 mutex_lock(&adev->gfx.gfx_off_mutex); 2676 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2677 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2678 adev->gfx.gfx_off_state = true; 2679 } 2680 mutex_unlock(&adev->gfx.gfx_off_mutex); 2681 } 2682 2683 /** 2684 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2685 * 2686 * @adev: amdgpu_device pointer 2687 * 2688 * Main suspend function for hardware IPs. The list of all the hardware 2689 * IPs that make up the asic is walked, clockgating is disabled and the 2690 * suspend callbacks are run. suspend puts the hardware and software state 2691 * in each IP into a state suitable for suspend. 2692 * Returns 0 on success, negative error code on failure. 2693 */ 2694 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2695 { 2696 int i, r; 2697 2698 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2699 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2700 2701 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2702 if (!adev->ip_blocks[i].status.valid) 2703 continue; 2704 2705 /* displays are handled separately */ 2706 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2707 continue; 2708 2709 /* XXX handle errors */ 2710 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2711 /* XXX handle errors */ 2712 if (r) { 2713 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2714 adev->ip_blocks[i].version->funcs->name, r); 2715 return r; 2716 } 2717 2718 adev->ip_blocks[i].status.hw = false; 2719 } 2720 2721 return 0; 2722 } 2723 2724 /** 2725 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2726 * 2727 * @adev: amdgpu_device pointer 2728 * 2729 * Main suspend function for hardware IPs. The list of all the hardware 2730 * IPs that make up the asic is walked, clockgating is disabled and the 2731 * suspend callbacks are run. suspend puts the hardware and software state 2732 * in each IP into a state suitable for suspend. 2733 * Returns 0 on success, negative error code on failure. 
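 * Phase 1 handles only the display (DCE) blocks; this phase handles every
 * other block, with special cases for PSP after a RAS err_event_athub
 * interrupt, for blocks other than GMC, SMC, COMMON and IH while an XGMI
 * hive reset is still pending, and for GFX and PSP on S0ix.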
2734 */ 2735 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2736 { 2737 int i, r; 2738 2739 if (adev->in_s0ix) 2740 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2741 2742 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2743 if (!adev->ip_blocks[i].status.valid) 2744 continue; 2745 /* displays are handled in phase1 */ 2746 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2747 continue; 2748 /* PSP lost connection when err_event_athub occurs */ 2749 if (amdgpu_ras_intr_triggered() && 2750 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2751 adev->ip_blocks[i].status.hw = false; 2752 continue; 2753 } 2754 2755 /* skip unnecessary suspend if we do not initialize them yet */ 2756 if (adev->gmc.xgmi.pending_reset && 2757 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2758 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2759 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2760 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2761 adev->ip_blocks[i].status.hw = false; 2762 continue; 2763 } 2764 2765 /* skip suspend of gfx and psp for S0ix 2766 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2767 * like at runtime. PSP is also part of the always on hardware 2768 * so no need to suspend it. 2769 */ 2770 if (adev->in_s0ix && 2771 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2772 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2773 continue; 2774 2775 /* XXX handle errors */ 2776 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2777 /* XXX handle errors */ 2778 if (r) { 2779 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2780 adev->ip_blocks[i].version->funcs->name, r); 2781 } 2782 adev->ip_blocks[i].status.hw = false; 2783 /* handle putting the SMC in the appropriate state */ 2784 if(!amdgpu_sriov_vf(adev)){ 2785 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2786 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2787 if (r) { 2788 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2789 adev->mp1_state, r); 2790 return r; 2791 } 2792 } 2793 } 2794 } 2795 2796 return 0; 2797 } 2798 2799 /** 2800 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2801 * 2802 * @adev: amdgpu_device pointer 2803 * 2804 * Main suspend function for hardware IPs. The list of all the hardware 2805 * IPs that make up the asic is walked, clockgating is disabled and the 2806 * suspend callbacks are run. suspend puts the hardware and software state 2807 * in each IP into a state suitable for suspend. 2808 * Returns 0 on success, negative error code on failure. 
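 * Under SR-IOV the two suspend phases are wrapped in a request for full GPU
 * access, which is released again once phase 2 has finished.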
2809 */ 2810 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2811 { 2812 int r; 2813 2814 if (amdgpu_sriov_vf(adev)) { 2815 amdgpu_virt_fini_data_exchange(adev); 2816 amdgpu_virt_request_full_gpu(adev, false); 2817 } 2818 2819 r = amdgpu_device_ip_suspend_phase1(adev); 2820 if (r) 2821 return r; 2822 r = amdgpu_device_ip_suspend_phase2(adev); 2823 2824 if (amdgpu_sriov_vf(adev)) 2825 amdgpu_virt_release_full_gpu(adev, false); 2826 2827 return r; 2828 } 2829 2830 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2831 { 2832 int i, r; 2833 2834 static enum amd_ip_block_type ip_order[] = { 2835 AMD_IP_BLOCK_TYPE_GMC, 2836 AMD_IP_BLOCK_TYPE_COMMON, 2837 AMD_IP_BLOCK_TYPE_PSP, 2838 AMD_IP_BLOCK_TYPE_IH, 2839 }; 2840 2841 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2842 int j; 2843 struct amdgpu_ip_block *block; 2844 2845 block = &adev->ip_blocks[i]; 2846 block->status.hw = false; 2847 2848 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2849 2850 if (block->version->type != ip_order[j] || 2851 !block->status.valid) 2852 continue; 2853 2854 r = block->version->funcs->hw_init(adev); 2855 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2856 if (r) 2857 return r; 2858 block->status.hw = true; 2859 } 2860 } 2861 2862 return 0; 2863 } 2864 2865 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2866 { 2867 int i, r; 2868 2869 static enum amd_ip_block_type ip_order[] = { 2870 AMD_IP_BLOCK_TYPE_SMC, 2871 AMD_IP_BLOCK_TYPE_DCE, 2872 AMD_IP_BLOCK_TYPE_GFX, 2873 AMD_IP_BLOCK_TYPE_SDMA, 2874 AMD_IP_BLOCK_TYPE_UVD, 2875 AMD_IP_BLOCK_TYPE_VCE, 2876 AMD_IP_BLOCK_TYPE_VCN 2877 }; 2878 2879 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2880 int j; 2881 struct amdgpu_ip_block *block; 2882 2883 for (j = 0; j < adev->num_ip_blocks; j++) { 2884 block = &adev->ip_blocks[j]; 2885 2886 if (block->version->type != ip_order[i] || 2887 !block->status.valid || 2888 block->status.hw) 2889 continue; 2890 2891 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2892 r = block->version->funcs->resume(adev); 2893 else 2894 r = block->version->funcs->hw_init(adev); 2895 2896 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2897 if (r) 2898 return r; 2899 block->status.hw = true; 2900 } 2901 } 2902 2903 return 0; 2904 } 2905 2906 /** 2907 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2908 * 2909 * @adev: amdgpu_device pointer 2910 * 2911 * First resume function for hardware IPs. The list of all the hardware 2912 * IPs that make up the asic is walked and the resume callbacks are run for 2913 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2914 * after a suspend and updates the software state as necessary. This 2915 * function is also used for restoring the GPU after a GPU reset. 2916 * Returns 0 on success, negative error code on failure. 
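 * The remaining blocks are resumed by amdgpu_device_ip_resume_phase2();
 * amdgpu_device_fw_loading() runs between the two phases (see
 * amdgpu_device_ip_resume()).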
2917 */ 2918 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2919 { 2920 int i, r; 2921 2922 for (i = 0; i < adev->num_ip_blocks; i++) { 2923 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2924 continue; 2925 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2926 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2927 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2928 2929 r = adev->ip_blocks[i].version->funcs->resume(adev); 2930 if (r) { 2931 DRM_ERROR("resume of IP block <%s> failed %d\n", 2932 adev->ip_blocks[i].version->funcs->name, r); 2933 return r; 2934 } 2935 adev->ip_blocks[i].status.hw = true; 2936 } 2937 } 2938 2939 return 0; 2940 } 2941 2942 /** 2943 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2944 * 2945 * @adev: amdgpu_device pointer 2946 * 2947 * Second resume function for hardware IPs. The list of all the hardware 2948 * IPs that make up the asic is walked and the resume callbacks are run for 2949 * all blocks except COMMON, GMC, IH, and PSP. resume puts the hardware into a 2950 * functional state after a suspend and updates the software state as 2951 * necessary. This function is also used for restoring the GPU after a GPU 2952 * reset. 2953 * Returns 0 on success, negative error code on failure. 2954 */ 2955 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2956 { 2957 int i, r; 2958 2959 for (i = 0; i < adev->num_ip_blocks; i++) { 2960 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2961 continue; 2962 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2963 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2964 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2965 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2966 continue; 2967 r = adev->ip_blocks[i].version->funcs->resume(adev); 2968 if (r) { 2969 DRM_ERROR("resume of IP block <%s> failed %d\n", 2970 adev->ip_blocks[i].version->funcs->name, r); 2971 return r; 2972 } 2973 adev->ip_blocks[i].status.hw = true; 2974 } 2975 2976 return 0; 2977 } 2978 2979 /** 2980 * amdgpu_device_ip_resume - run resume for hardware IPs 2981 * 2982 * @adev: amdgpu_device pointer 2983 * 2984 * Main resume function for hardware IPs. The hardware IPs 2985 * are split into two resume functions because they are 2986 * also used in recovering from a GPU reset and some additional 2987 * steps need to be taken between them. In this case (S3/S4) they are 2988 * run sequentially. 2989 * Returns 0 on success, negative error code on failure. 2990 */ 2991 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2992 { 2993 int r; 2994 2995 r = amdgpu_device_ip_resume_phase1(adev); 2996 if (r) 2997 return r; 2998 2999 r = amdgpu_device_fw_loading(adev); 3000 if (r) 3001 return r; 3002 3003 r = amdgpu_device_ip_resume_phase2(adev); 3004 3005 return r; 3006 } 3007 3008 /** 3009 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3010 * 3011 * @adev: amdgpu_device pointer 3012 * 3013 * Query the VBIOS data tables to determine if the board supports SR-IOV.
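 * On VFs, when virtualization support is detected,
 * AMDGPU_SRIOV_CAPS_SRIOV_VBIOS is set in adev->virt.caps; otherwise an
 * AMDGIM_ERROR_VF_NO_VBIOS error is recorded.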
3014 */ 3015 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3016 { 3017 if (amdgpu_sriov_vf(adev)) { 3018 if (adev->is_atom_fw) { 3019 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 3020 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3021 } else { 3022 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3023 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3024 } 3025 3026 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3027 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3028 } 3029 } 3030 3031 /** 3032 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3033 * 3034 * @asic_type: AMD asic type 3035 * 3036 * Check if there is DC (new modesetting infrastructre) support for an asic. 3037 * returns true if DC has support, false if not. 3038 */ 3039 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3040 { 3041 switch (asic_type) { 3042 #if defined(CONFIG_DRM_AMD_DC) 3043 #if defined(CONFIG_DRM_AMD_DC_SI) 3044 case CHIP_TAHITI: 3045 case CHIP_PITCAIRN: 3046 case CHIP_VERDE: 3047 case CHIP_OLAND: 3048 #endif 3049 case CHIP_BONAIRE: 3050 case CHIP_KAVERI: 3051 case CHIP_KABINI: 3052 case CHIP_MULLINS: 3053 /* 3054 * We have systems in the wild with these ASICs that require 3055 * LVDS and VGA support which is not supported with DC. 3056 * 3057 * Fallback to the non-DC driver here by default so as not to 3058 * cause regressions. 3059 */ 3060 return amdgpu_dc > 0; 3061 case CHIP_HAWAII: 3062 case CHIP_CARRIZO: 3063 case CHIP_STONEY: 3064 case CHIP_POLARIS10: 3065 case CHIP_POLARIS11: 3066 case CHIP_POLARIS12: 3067 case CHIP_VEGAM: 3068 case CHIP_TONGA: 3069 case CHIP_FIJI: 3070 case CHIP_VEGA10: 3071 case CHIP_VEGA12: 3072 case CHIP_VEGA20: 3073 #if defined(CONFIG_DRM_AMD_DC_DCN) 3074 case CHIP_RAVEN: 3075 case CHIP_NAVI10: 3076 case CHIP_NAVI14: 3077 case CHIP_NAVI12: 3078 case CHIP_RENOIR: 3079 case CHIP_SIENNA_CICHLID: 3080 case CHIP_NAVY_FLOUNDER: 3081 case CHIP_DIMGREY_CAVEFISH: 3082 case CHIP_VANGOGH: 3083 #endif 3084 return amdgpu_dc != 0; 3085 #endif 3086 default: 3087 if (amdgpu_dc > 0) 3088 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3089 "but isn't supported by ASIC, ignoring\n"); 3090 return false; 3091 } 3092 } 3093 3094 /** 3095 * amdgpu_device_has_dc_support - check if dc is supported 3096 * 3097 * @adev: amdgpu_device pointer 3098 * 3099 * Returns true for supported, false for not supported 3100 */ 3101 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3102 { 3103 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3104 return false; 3105 3106 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3107 } 3108 3109 3110 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3111 { 3112 struct amdgpu_device *adev = 3113 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3114 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3115 3116 /* It's a bug to not have a hive within this function */ 3117 if (WARN_ON(!hive)) 3118 return; 3119 3120 /* 3121 * Use task barrier to synchronize all xgmi reset works across the 3122 * hive. task_barrier_enter and task_barrier_exit will block 3123 * until all the threads running the xgmi reset works reach 3124 * those points. task_barrier_full will do both blocks. 
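 * For BACO resets the enter and exit are placed on opposite sides of the
 * barrier so every device in the hive enters BACO before any device exits;
 * for other reset methods a full barrier is taken before amdgpu_asic_reset().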
3125 */ 3126 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3127 3128 task_barrier_enter(&hive->tb); 3129 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3130 3131 if (adev->asic_reset_res) 3132 goto fail; 3133 3134 task_barrier_exit(&hive->tb); 3135 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3136 3137 if (adev->asic_reset_res) 3138 goto fail; 3139 3140 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count) 3141 adev->mmhub.funcs->reset_ras_error_count(adev); 3142 } else { 3143 3144 task_barrier_full(&hive->tb); 3145 adev->asic_reset_res = amdgpu_asic_reset(adev); 3146 } 3147 3148 fail: 3149 if (adev->asic_reset_res) 3150 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3151 adev->asic_reset_res, adev_to_drm(adev)->unique); 3152 amdgpu_put_xgmi_hive(hive); 3153 } 3154 3155 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3156 { 3157 char *input = amdgpu_lockup_timeout; 3158 char *timeout_setting = NULL; 3159 int index = 0; 3160 long timeout; 3161 int ret = 0; 3162 3163 /* 3164 * By default timeout for non compute jobs is 10000. 3165 * And there is no timeout enforced on compute jobs. 3166 * In SR-IOV or passthrough mode, timeout for compute 3167 * jobs are 60000 by default. 3168 */ 3169 adev->gfx_timeout = msecs_to_jiffies(10000); 3170 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3171 if (amdgpu_sriov_vf(adev)) 3172 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 3173 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3174 else if (amdgpu_passthrough(adev)) 3175 adev->compute_timeout = msecs_to_jiffies(60000); 3176 else 3177 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; 3178 3179 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3180 while ((timeout_setting = strsep(&input, ",")) && 3181 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3182 ret = kstrtol(timeout_setting, 0, &timeout); 3183 if (ret) 3184 return ret; 3185 3186 if (timeout == 0) { 3187 index++; 3188 continue; 3189 } else if (timeout < 0) { 3190 timeout = MAX_SCHEDULE_TIMEOUT; 3191 } else { 3192 timeout = msecs_to_jiffies(timeout); 3193 } 3194 3195 switch (index++) { 3196 case 0: 3197 adev->gfx_timeout = timeout; 3198 break; 3199 case 1: 3200 adev->compute_timeout = timeout; 3201 break; 3202 case 2: 3203 adev->sdma_timeout = timeout; 3204 break; 3205 case 3: 3206 adev->video_timeout = timeout; 3207 break; 3208 default: 3209 break; 3210 } 3211 } 3212 /* 3213 * There is only one value specified and 3214 * it should apply to all non-compute jobs. 3215 */ 3216 if (index == 1) { 3217 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3218 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3219 adev->compute_timeout = adev->gfx_timeout; 3220 } 3221 } 3222 3223 return ret; 3224 } 3225 3226 static const struct attribute *amdgpu_dev_attributes[] = { 3227 &dev_attr_product_name.attr, 3228 &dev_attr_product_number.attr, 3229 &dev_attr_serial_number.attr, 3230 &dev_attr_pcie_replay_count.attr, 3231 NULL 3232 }; 3233 3234 3235 /** 3236 * amdgpu_device_init - initialize the driver 3237 * 3238 * @adev: amdgpu_device pointer 3239 * @flags: driver flags 3240 * 3241 * Initializes the driver info and hw (all asics). 3242 * Returns 0 for success or an error on failure. 3243 * Called at driver startup. 
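 * Among other things, this parses the amdgpu_lockup_timeout parameter (see
 * amdgpu_device_get_job_timeout_settings() above): a comma-separated list of
 * millisecond values applied in the order gfx, compute, sdma, video, where 0
 * keeps the default and a negative value disables the timeout; a single
 * value (for example "10000") applies to all non-compute queues, and to
 * compute as well under SR-IOV or passthrough.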
3244 */ 3245 int amdgpu_device_init(struct amdgpu_device *adev, 3246 uint32_t flags) 3247 { 3248 struct drm_device *ddev = adev_to_drm(adev); 3249 struct pci_dev *pdev = adev->pdev; 3250 int r, i; 3251 bool px = false; 3252 u32 max_MBps; 3253 3254 adev->shutdown = false; 3255 adev->flags = flags; 3256 3257 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3258 adev->asic_type = amdgpu_force_asic_type; 3259 else 3260 adev->asic_type = flags & AMD_ASIC_MASK; 3261 3262 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3263 if (amdgpu_emu_mode == 1) 3264 adev->usec_timeout *= 10; 3265 adev->gmc.gart_size = 512 * 1024 * 1024; 3266 adev->accel_working = false; 3267 adev->num_rings = 0; 3268 adev->mman.buffer_funcs = NULL; 3269 adev->mman.buffer_funcs_ring = NULL; 3270 adev->vm_manager.vm_pte_funcs = NULL; 3271 adev->vm_manager.vm_pte_num_scheds = 0; 3272 adev->gmc.gmc_funcs = NULL; 3273 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3274 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3275 3276 adev->smc_rreg = &amdgpu_invalid_rreg; 3277 adev->smc_wreg = &amdgpu_invalid_wreg; 3278 adev->pcie_rreg = &amdgpu_invalid_rreg; 3279 adev->pcie_wreg = &amdgpu_invalid_wreg; 3280 adev->pciep_rreg = &amdgpu_invalid_rreg; 3281 adev->pciep_wreg = &amdgpu_invalid_wreg; 3282 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3283 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3284 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3285 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3286 adev->didt_rreg = &amdgpu_invalid_rreg; 3287 adev->didt_wreg = &amdgpu_invalid_wreg; 3288 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3289 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3290 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3291 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3292 3293 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3294 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3295 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3296 3297 /* mutex initialization are all done here so we 3298 * can recall function without having locking issues */ 3299 mutex_init(&adev->firmware.mutex); 3300 mutex_init(&adev->pm.mutex); 3301 mutex_init(&adev->gfx.gpu_clock_mutex); 3302 mutex_init(&adev->srbm_mutex); 3303 mutex_init(&adev->gfx.pipe_reserve_mutex); 3304 mutex_init(&adev->gfx.gfx_off_mutex); 3305 mutex_init(&adev->grbm_idx_mutex); 3306 mutex_init(&adev->mn_lock); 3307 mutex_init(&adev->virt.vf_errors.lock); 3308 hash_init(adev->mn_hash); 3309 atomic_set(&adev->in_gpu_reset, 0); 3310 init_rwsem(&adev->reset_sem); 3311 mutex_init(&adev->psp.mutex); 3312 mutex_init(&adev->notifier_lock); 3313 3314 r = amdgpu_device_check_arguments(adev); 3315 if (r) 3316 return r; 3317 3318 spin_lock_init(&adev->mmio_idx_lock); 3319 spin_lock_init(&adev->smc_idx_lock); 3320 spin_lock_init(&adev->pcie_idx_lock); 3321 spin_lock_init(&adev->uvd_ctx_idx_lock); 3322 spin_lock_init(&adev->didt_idx_lock); 3323 spin_lock_init(&adev->gc_cac_idx_lock); 3324 spin_lock_init(&adev->se_cac_idx_lock); 3325 spin_lock_init(&adev->audio_endpt_idx_lock); 3326 spin_lock_init(&adev->mm_stats.lock); 3327 3328 INIT_LIST_HEAD(&adev->shadow_list); 3329 mutex_init(&adev->shadow_list_lock); 3330 3331 INIT_LIST_HEAD(&adev->reset_list); 3332 3333 INIT_DELAYED_WORK(&adev->delayed_init_work, 3334 amdgpu_device_delayed_init_work_handler); 3335 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3336 amdgpu_device_delay_enable_gfx_off); 3337 3338 
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3339 3340 adev->gfx.gfx_off_req_count = 1; 3341 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3342 3343 atomic_set(&adev->throttling_logging_enabled, 1); 3344 /* 3345 * If throttling continues, logging will be performed every minute 3346 * to avoid log flooding. "-1" is subtracted since the thermal 3347 * throttling interrupt comes every second. Thus, the total logging 3348 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3349 * for throttling interrupt) = 60 seconds. 3350 */ 3351 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3352 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3353 3354 /* Registers mapping */ 3355 /* TODO: block userspace mapping of io register */ 3356 if (adev->asic_type >= CHIP_BONAIRE) { 3357 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3358 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3359 } else { 3360 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3361 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3362 } 3363 3364 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3365 if (adev->rmmio == NULL) { 3366 return -ENOMEM; 3367 } 3368 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3369 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3370 3371 /* enable PCIE atomic ops */ 3372 r = pci_enable_atomic_ops_to_root(adev->pdev, 3373 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3374 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3375 if (r) { 3376 adev->have_atomics_support = false; 3377 DRM_INFO("PCIE atomic ops is not supported\n"); 3378 } else { 3379 adev->have_atomics_support = true; 3380 } 3381 3382 amdgpu_device_get_pcie_info(adev); 3383 3384 if (amdgpu_mcbp) 3385 DRM_INFO("MCBP is enabled\n"); 3386 3387 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3388 adev->enable_mes = true; 3389 3390 /* detect hw virtualization here */ 3391 amdgpu_detect_virtualization(adev); 3392 3393 r = amdgpu_device_get_job_timeout_settings(adev); 3394 if (r) { 3395 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3396 goto failed_unmap; 3397 } 3398 3399 /* early init functions */ 3400 r = amdgpu_device_ip_early_init(adev); 3401 if (r) 3402 goto failed_unmap; 3403 3404 /* doorbell bar mapping and doorbell index init*/ 3405 amdgpu_device_doorbell_init(adev); 3406 3407 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3408 /* this will fail for cards that aren't VGA class devices, just 3409 * ignore it */ 3410 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3411 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3412 3413 if (amdgpu_device_supports_px(ddev)) { 3414 px = true; 3415 vga_switcheroo_register_client(adev->pdev, 3416 &amdgpu_switcheroo_ops, px); 3417 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3418 } 3419 3420 if (amdgpu_emu_mode == 1) { 3421 /* post the asic on emulation mode */ 3422 emu_soc_asic_init(adev); 3423 goto fence_driver_init; 3424 } 3425 3426 amdgpu_reset_init(adev); 3427 3428 /* detect if we are with an SRIOV vbios */ 3429 amdgpu_device_detect_sriov_bios(adev); 3430 3431 /* check if we need to reset the asic 3432 * E.g., driver was not cleanly unloaded previously, etc. 
3433 */ 3434 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3435 if (adev->gmc.xgmi.num_physical_nodes) { 3436 dev_info(adev->dev, "Pending hive reset.\n"); 3437 adev->gmc.xgmi.pending_reset = true; 3438 /* Only need to init necessary block for SMU to handle the reset */ 3439 for (i = 0; i < adev->num_ip_blocks; i++) { 3440 if (!adev->ip_blocks[i].status.valid) 3441 continue; 3442 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3443 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3444 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3445 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3446 DRM_DEBUG("IP %s disabled for hw_init.\n", 3447 adev->ip_blocks[i].version->funcs->name); 3448 adev->ip_blocks[i].status.hw = true; 3449 } 3450 } 3451 } else { 3452 r = amdgpu_asic_reset(adev); 3453 if (r) { 3454 dev_err(adev->dev, "asic reset on init failed\n"); 3455 goto failed; 3456 } 3457 } 3458 } 3459 3460 pci_enable_pcie_error_reporting(adev->pdev); 3461 3462 /* Post card if necessary */ 3463 if (amdgpu_device_need_post(adev)) { 3464 if (!adev->bios) { 3465 dev_err(adev->dev, "no vBIOS found\n"); 3466 r = -EINVAL; 3467 goto failed; 3468 } 3469 DRM_INFO("GPU posting now...\n"); 3470 r = amdgpu_device_asic_init(adev); 3471 if (r) { 3472 dev_err(adev->dev, "gpu post error!\n"); 3473 goto failed; 3474 } 3475 } 3476 3477 if (adev->is_atom_fw) { 3478 /* Initialize clocks */ 3479 r = amdgpu_atomfirmware_get_clock_info(adev); 3480 if (r) { 3481 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3482 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3483 goto failed; 3484 } 3485 } else { 3486 /* Initialize clocks */ 3487 r = amdgpu_atombios_get_clock_info(adev); 3488 if (r) { 3489 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3490 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3491 goto failed; 3492 } 3493 /* init i2c buses */ 3494 if (!amdgpu_device_has_dc_support(adev)) 3495 amdgpu_atombios_i2c_init(adev); 3496 } 3497 3498 fence_driver_init: 3499 /* Fence driver */ 3500 r = amdgpu_fence_driver_init(adev); 3501 if (r) { 3502 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3503 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3504 goto failed; 3505 } 3506 3507 /* init the mode config */ 3508 drm_mode_config_init(adev_to_drm(adev)); 3509 3510 r = amdgpu_device_ip_init(adev); 3511 if (r) { 3512 /* failed in exclusive mode due to timeout */ 3513 if (amdgpu_sriov_vf(adev) && 3514 !amdgpu_sriov_runtime(adev) && 3515 amdgpu_virt_mmio_blocked(adev) && 3516 !amdgpu_virt_wait_reset(adev)) { 3517 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3518 /* Don't send request since VF is inactive. */ 3519 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3520 adev->virt.ops = NULL; 3521 r = -EAGAIN; 3522 goto release_ras_con; 3523 } 3524 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3525 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3526 goto release_ras_con; 3527 } 3528 3529 dev_info(adev->dev, 3530 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3531 adev->gfx.config.max_shader_engines, 3532 adev->gfx.config.max_sh_per_se, 3533 adev->gfx.config.max_cu_per_sh, 3534 adev->gfx.cu_info.number); 3535 3536 adev->accel_working = true; 3537 3538 amdgpu_vm_check_compute_bug(adev); 3539 3540 /* Initialize the buffer migration limit. 
*/ 3541 if (amdgpu_moverate >= 0) 3542 max_MBps = amdgpu_moverate; 3543 else 3544 max_MBps = 8; /* Allow 8 MB/s. */ 3545 /* Get a log2 for easy divisions. */ 3546 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3547 3548 amdgpu_fbdev_init(adev); 3549 3550 r = amdgpu_pm_sysfs_init(adev); 3551 if (r) { 3552 adev->pm_sysfs_en = false; 3553 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3554 } else 3555 adev->pm_sysfs_en = true; 3556 3557 r = amdgpu_ucode_sysfs_init(adev); 3558 if (r) { 3559 adev->ucode_sysfs_en = false; 3560 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3561 } else 3562 adev->ucode_sysfs_en = true; 3563 3564 if ((amdgpu_testing & 1)) { 3565 if (adev->accel_working) 3566 amdgpu_test_moves(adev); 3567 else 3568 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3569 } 3570 if (amdgpu_benchmarking) { 3571 if (adev->accel_working) 3572 amdgpu_benchmark(adev, amdgpu_benchmarking); 3573 else 3574 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3575 } 3576 3577 /* 3578 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3579 * Otherwise the mgpu fan boost feature will be skipped due to the 3580 * gpu instance is counted less. 3581 */ 3582 amdgpu_register_gpu_instance(adev); 3583 3584 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3585 * explicit gating rather than handling it automatically. 3586 */ 3587 if (!adev->gmc.xgmi.pending_reset) { 3588 r = amdgpu_device_ip_late_init(adev); 3589 if (r) { 3590 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3591 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3592 goto release_ras_con; 3593 } 3594 /* must succeed. */ 3595 amdgpu_ras_resume(adev); 3596 queue_delayed_work(system_wq, &adev->delayed_init_work, 3597 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3598 } 3599 3600 if (amdgpu_sriov_vf(adev)) 3601 flush_delayed_work(&adev->delayed_init_work); 3602 3603 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3604 if (r) 3605 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3606 3607 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3608 r = amdgpu_pmu_init(adev); 3609 if (r) 3610 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3611 3612 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3613 if (amdgpu_device_cache_pci_state(adev->pdev)) 3614 pci_restore_state(pdev); 3615 3616 if (adev->gmc.xgmi.pending_reset) 3617 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3618 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3619 3620 return 0; 3621 3622 release_ras_con: 3623 amdgpu_release_ras_context(adev); 3624 3625 failed: 3626 amdgpu_vf_error_trans_all(adev); 3627 if (px) 3628 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3629 3630 failed_unmap: 3631 iounmap(adev->rmmio); 3632 adev->rmmio = NULL; 3633 3634 return r; 3635 } 3636 3637 /** 3638 * amdgpu_device_fini - tear down the driver 3639 * 3640 * @adev: amdgpu_device pointer 3641 * 3642 * Tear down the driver info (all asics). 3643 * Called at driver shutdown. 
3644 */ 3645 void amdgpu_device_fini(struct amdgpu_device *adev) 3646 { 3647 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3648 flush_delayed_work(&adev->delayed_init_work); 3649 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3650 adev->shutdown = true; 3651 3652 kfree(adev->pci_state); 3653 3654 /* make sure IB test finished before entering exclusive mode 3655 * to avoid preemption on IB test 3656 * */ 3657 if (amdgpu_sriov_vf(adev)) { 3658 amdgpu_virt_request_full_gpu(adev, false); 3659 amdgpu_virt_fini_data_exchange(adev); 3660 } 3661 3662 /* disable all interrupts */ 3663 amdgpu_irq_disable_all(adev); 3664 if (adev->mode_info.mode_config_initialized){ 3665 if (!amdgpu_device_has_dc_support(adev)) 3666 drm_helper_force_disable_all(adev_to_drm(adev)); 3667 else 3668 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3669 } 3670 amdgpu_fence_driver_fini(adev); 3671 if (adev->pm_sysfs_en) 3672 amdgpu_pm_sysfs_fini(adev); 3673 amdgpu_fbdev_fini(adev); 3674 amdgpu_device_ip_fini(adev); 3675 release_firmware(adev->firmware.gpu_info_fw); 3676 adev->firmware.gpu_info_fw = NULL; 3677 adev->accel_working = false; 3678 3679 amdgpu_reset_fini(adev); 3680 3681 /* free i2c buses */ 3682 if (!amdgpu_device_has_dc_support(adev)) 3683 amdgpu_i2c_fini(adev); 3684 3685 if (amdgpu_emu_mode != 1) 3686 amdgpu_atombios_fini(adev); 3687 3688 kfree(adev->bios); 3689 adev->bios = NULL; 3690 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 3691 vga_switcheroo_unregister_client(adev->pdev); 3692 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3693 } 3694 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3695 vga_client_register(adev->pdev, NULL, NULL, NULL); 3696 iounmap(adev->rmmio); 3697 adev->rmmio = NULL; 3698 amdgpu_device_doorbell_fini(adev); 3699 3700 if (adev->ucode_sysfs_en) 3701 amdgpu_ucode_sysfs_fini(adev); 3702 3703 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3704 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3705 amdgpu_pmu_fini(adev); 3706 if (adev->mman.discovery_bin) 3707 amdgpu_discovery_fini(adev); 3708 } 3709 3710 3711 /* 3712 * Suspend & resume. 3713 */ 3714 /** 3715 * amdgpu_device_suspend - initiate device suspend 3716 * 3717 * @dev: drm dev pointer 3718 * @fbcon : notify the fbdev of suspend 3719 * 3720 * Puts the hw in the suspend state (all asics). 3721 * Returns 0 for success or an error on failure. 3722 * Called at driver suspend. 3723 */ 3724 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3725 { 3726 struct amdgpu_device *adev = drm_to_adev(dev); 3727 int r; 3728 3729 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3730 return 0; 3731 3732 adev->in_suspend = true; 3733 drm_kms_helper_poll_disable(dev); 3734 3735 if (fbcon) 3736 amdgpu_fbdev_set_suspend(adev, 1); 3737 3738 cancel_delayed_work_sync(&adev->delayed_init_work); 3739 3740 amdgpu_ras_suspend(adev); 3741 3742 r = amdgpu_device_ip_suspend_phase1(adev); 3743 3744 if (!adev->in_s0ix) 3745 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 3746 3747 /* evict vram memory */ 3748 amdgpu_bo_evict_vram(adev); 3749 3750 amdgpu_fence_driver_suspend(adev); 3751 3752 r = amdgpu_device_ip_suspend_phase2(adev); 3753 /* evict remaining vram memory 3754 * This second call to evict vram is to evict the gart page table 3755 * using the CPU. 3756 */ 3757 amdgpu_bo_evict_vram(adev); 3758 3759 return 0; 3760 } 3761 3762 /** 3763 * amdgpu_device_resume - initiate device resume 3764 * 3765 * @dev: drm dev pointer 3766 * @fbcon : notify the fbdev of resume 3767 * 3768 * Bring the hw back to operating state (all asics). 
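 * For S0ix the GFX D0Entry state change is signalled first and the KFD
 * resume is skipped, mirroring what the suspend path did.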
3769 * Returns 0 for success or an error on failure. 3770 * Called at driver resume. 3771 */ 3772 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3773 { 3774 struct amdgpu_device *adev = drm_to_adev(dev); 3775 int r = 0; 3776 3777 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3778 return 0; 3779 3780 if (adev->in_s0ix) 3781 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3782 3783 /* post card */ 3784 if (amdgpu_device_need_post(adev)) { 3785 r = amdgpu_device_asic_init(adev); 3786 if (r) 3787 dev_err(adev->dev, "amdgpu asic init failed\n"); 3788 } 3789 3790 r = amdgpu_device_ip_resume(adev); 3791 if (r) { 3792 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3793 return r; 3794 } 3795 amdgpu_fence_driver_resume(adev); 3796 3797 3798 r = amdgpu_device_ip_late_init(adev); 3799 if (r) 3800 return r; 3801 3802 queue_delayed_work(system_wq, &adev->delayed_init_work, 3803 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3804 3805 if (!adev->in_s0ix) { 3806 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 3807 if (r) 3808 return r; 3809 } 3810 3811 /* Make sure IB tests flushed */ 3812 flush_delayed_work(&adev->delayed_init_work); 3813 3814 if (fbcon) 3815 amdgpu_fbdev_set_suspend(adev, 0); 3816 3817 drm_kms_helper_poll_enable(dev); 3818 3819 amdgpu_ras_resume(adev); 3820 3821 /* 3822 * Most of the connector probing functions try to acquire runtime pm 3823 * refs to ensure that the GPU is powered on when connector polling is 3824 * performed. Since we're calling this from a runtime PM callback, 3825 * trying to acquire rpm refs will cause us to deadlock. 3826 * 3827 * Since we're guaranteed to be holding the rpm lock, it's safe to 3828 * temporarily disable the rpm helpers so this doesn't deadlock us. 3829 */ 3830 #ifdef CONFIG_PM 3831 dev->dev->power.disable_depth++; 3832 #endif 3833 if (!amdgpu_device_has_dc_support(adev)) 3834 drm_helper_hpd_irq_event(dev); 3835 else 3836 drm_kms_helper_hotplug_event(dev); 3837 #ifdef CONFIG_PM 3838 dev->dev->power.disable_depth--; 3839 #endif 3840 adev->in_suspend = false; 3841 3842 return 0; 3843 } 3844 3845 /** 3846 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3847 * 3848 * @adev: amdgpu_device pointer 3849 * 3850 * The list of all the hardware IPs that make up the asic is walked and 3851 * the check_soft_reset callbacks are run. check_soft_reset determines 3852 * if the asic is still hung or not. 3853 * Returns true if any of the IPs are still in a hung state, false if not. 3854 */ 3855 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3856 { 3857 int i; 3858 bool asic_hang = false; 3859 3860 if (amdgpu_sriov_vf(adev)) 3861 return true; 3862 3863 if (amdgpu_asic_need_full_reset(adev)) 3864 return true; 3865 3866 for (i = 0; i < adev->num_ip_blocks; i++) { 3867 if (!adev->ip_blocks[i].status.valid) 3868 continue; 3869 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3870 adev->ip_blocks[i].status.hang = 3871 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3872 if (adev->ip_blocks[i].status.hang) { 3873 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3874 asic_hang = true; 3875 } 3876 } 3877 return asic_hang; 3878 } 3879 3880 /** 3881 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3882 * 3883 * @adev: amdgpu_device pointer 3884 * 3885 * The list of all the hardware IPs that make up the asic is walked and the 3886 * pre_soft_reset callbacks are run if the block is hung. 
pre_soft_reset 3887 * handles any IP specific hardware or software state changes that are 3888 * necessary for a soft reset to succeed. 3889 * Returns 0 on success, negative error code on failure. 3890 */ 3891 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3892 { 3893 int i, r = 0; 3894 3895 for (i = 0; i < adev->num_ip_blocks; i++) { 3896 if (!adev->ip_blocks[i].status.valid) 3897 continue; 3898 if (adev->ip_blocks[i].status.hang && 3899 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3900 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3901 if (r) 3902 return r; 3903 } 3904 } 3905 3906 return 0; 3907 } 3908 3909 /** 3910 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3911 * 3912 * @adev: amdgpu_device pointer 3913 * 3914 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3915 * reset is necessary to recover. 3916 * Returns true if a full asic reset is required, false if not. 3917 */ 3918 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3919 { 3920 int i; 3921 3922 if (amdgpu_asic_need_full_reset(adev)) 3923 return true; 3924 3925 for (i = 0; i < adev->num_ip_blocks; i++) { 3926 if (!adev->ip_blocks[i].status.valid) 3927 continue; 3928 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3929 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3930 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3931 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3932 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3933 if (adev->ip_blocks[i].status.hang) { 3934 dev_info(adev->dev, "Some block need full reset!\n"); 3935 return true; 3936 } 3937 } 3938 } 3939 return false; 3940 } 3941 3942 /** 3943 * amdgpu_device_ip_soft_reset - do a soft reset 3944 * 3945 * @adev: amdgpu_device pointer 3946 * 3947 * The list of all the hardware IPs that make up the asic is walked and the 3948 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3949 * IP specific hardware or software state changes that are necessary to soft 3950 * reset the IP. 3951 * Returns 0 on success, negative error code on failure. 3952 */ 3953 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3954 { 3955 int i, r = 0; 3956 3957 for (i = 0; i < adev->num_ip_blocks; i++) { 3958 if (!adev->ip_blocks[i].status.valid) 3959 continue; 3960 if (adev->ip_blocks[i].status.hang && 3961 adev->ip_blocks[i].version->funcs->soft_reset) { 3962 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3963 if (r) 3964 return r; 3965 } 3966 } 3967 3968 return 0; 3969 } 3970 3971 /** 3972 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3973 * 3974 * @adev: amdgpu_device pointer 3975 * 3976 * The list of all the hardware IPs that make up the asic is walked and the 3977 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3978 * handles any IP specific hardware or software state changes that are 3979 * necessary after the IP has been soft reset. 3980 * Returns 0 on success, negative error code on failure. 
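 *
 * Together with check_soft_reset, pre_soft_reset and soft_reset this forms
 * the per-IP soft reset sequence that amdgpu_device_pre_asic_reset() runs
 * when a full ASIC reset is not required.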
3981 */ 3982 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3983 { 3984 int i, r = 0; 3985 3986 for (i = 0; i < adev->num_ip_blocks; i++) { 3987 if (!adev->ip_blocks[i].status.valid) 3988 continue; 3989 if (adev->ip_blocks[i].status.hang && 3990 adev->ip_blocks[i].version->funcs->post_soft_reset) 3991 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3992 if (r) 3993 return r; 3994 } 3995 3996 return 0; 3997 } 3998 3999 /** 4000 * amdgpu_device_recover_vram - Recover some VRAM contents 4001 * 4002 * @adev: amdgpu_device pointer 4003 * 4004 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4005 * restore things like GPUVM page tables after a GPU reset where 4006 * the contents of VRAM might be lost. 4007 * 4008 * Returns: 4009 * 0 on success, negative error code on failure. 4010 */ 4011 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4012 { 4013 struct dma_fence *fence = NULL, *next = NULL; 4014 struct amdgpu_bo *shadow; 4015 long r = 1, tmo; 4016 4017 if (amdgpu_sriov_runtime(adev)) 4018 tmo = msecs_to_jiffies(8000); 4019 else 4020 tmo = msecs_to_jiffies(100); 4021 4022 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4023 mutex_lock(&adev->shadow_list_lock); 4024 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4025 4026 /* No need to recover an evicted BO */ 4027 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 4028 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 4029 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 4030 continue; 4031 4032 r = amdgpu_bo_restore_shadow(shadow, &next); 4033 if (r) 4034 break; 4035 4036 if (fence) { 4037 tmo = dma_fence_wait_timeout(fence, false, tmo); 4038 dma_fence_put(fence); 4039 fence = next; 4040 if (tmo == 0) { 4041 r = -ETIMEDOUT; 4042 break; 4043 } else if (tmo < 0) { 4044 r = tmo; 4045 break; 4046 } 4047 } else { 4048 fence = next; 4049 } 4050 } 4051 mutex_unlock(&adev->shadow_list_lock); 4052 4053 if (fence) 4054 tmo = dma_fence_wait_timeout(fence, false, tmo); 4055 dma_fence_put(fence); 4056 4057 if (r < 0 || tmo <= 0) { 4058 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4059 return -EIO; 4060 } 4061 4062 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4063 return 0; 4064 } 4065 4066 4067 /** 4068 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4069 * 4070 * @adev: amdgpu_device pointer 4071 * @from_hypervisor: request from hypervisor 4072 * 4073 * do VF FLR and reinitialize Asic 4074 * return 0 means succeeded otherwise failed 4075 */ 4076 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4077 bool from_hypervisor) 4078 { 4079 int r; 4080 4081 if (from_hypervisor) 4082 r = amdgpu_virt_request_full_gpu(adev, true); 4083 else 4084 r = amdgpu_virt_reset_gpu(adev); 4085 if (r) 4086 return r; 4087 4088 amdgpu_amdkfd_pre_reset(adev); 4089 4090 /* Resume IP prior to SMC */ 4091 r = amdgpu_device_ip_reinit_early_sriov(adev); 4092 if (r) 4093 goto error; 4094 4095 amdgpu_virt_init_data_exchange(adev); 4096 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4097 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4098 4099 r = amdgpu_device_fw_loading(adev); 4100 if (r) 4101 return r; 4102 4103 /* now we are okay to resume SMC/CP/SDMA */ 4104 r = amdgpu_device_ip_reinit_late_sriov(adev); 4105 if (r) 4106 goto error; 4107 4108 amdgpu_irq_gpu_reset_resume_helper(adev); 4109 r = amdgpu_ib_ring_tests(adev); 4110 amdgpu_amdkfd_post_reset(adev); 4111 4112 
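/*
 * Both the success and the failure paths fall through to the error label
 * below: if the GIM feature flags indicate VRAM is lost across the FLR,
 * bump the VRAM-lost counter and try to restore buffers from their GTT
 * shadows before releasing exclusive GPU access back to the hypervisor.
 */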
error: 4113 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4114 amdgpu_inc_vram_lost(adev); 4115 r = amdgpu_device_recover_vram(adev); 4116 } 4117 amdgpu_virt_release_full_gpu(adev, true); 4118 4119 return r; 4120 } 4121 4122 /** 4123 * amdgpu_device_has_job_running - check if there is any job in mirror list 4124 * 4125 * @adev: amdgpu_device pointer 4126 * 4127 * check if there is any job in mirror list 4128 */ 4129 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4130 { 4131 int i; 4132 struct drm_sched_job *job; 4133 4134 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4135 struct amdgpu_ring *ring = adev->rings[i]; 4136 4137 if (!ring || !ring->sched.thread) 4138 continue; 4139 4140 spin_lock(&ring->sched.job_list_lock); 4141 job = list_first_entry_or_null(&ring->sched.pending_list, 4142 struct drm_sched_job, list); 4143 spin_unlock(&ring->sched.job_list_lock); 4144 if (job) 4145 return true; 4146 } 4147 return false; 4148 } 4149 4150 /** 4151 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4152 * 4153 * @adev: amdgpu_device pointer 4154 * 4155 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4156 * a hung GPU. 4157 */ 4158 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4159 { 4160 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4161 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4162 return false; 4163 } 4164 4165 if (amdgpu_gpu_recovery == 0) 4166 goto disabled; 4167 4168 if (amdgpu_sriov_vf(adev)) 4169 return true; 4170 4171 if (amdgpu_gpu_recovery == -1) { 4172 switch (adev->asic_type) { 4173 case CHIP_BONAIRE: 4174 case CHIP_HAWAII: 4175 case CHIP_TOPAZ: 4176 case CHIP_TONGA: 4177 case CHIP_FIJI: 4178 case CHIP_POLARIS10: 4179 case CHIP_POLARIS11: 4180 case CHIP_POLARIS12: 4181 case CHIP_VEGAM: 4182 case CHIP_VEGA20: 4183 case CHIP_VEGA10: 4184 case CHIP_VEGA12: 4185 case CHIP_RAVEN: 4186 case CHIP_ARCTURUS: 4187 case CHIP_RENOIR: 4188 case CHIP_NAVI10: 4189 case CHIP_NAVI14: 4190 case CHIP_NAVI12: 4191 case CHIP_SIENNA_CICHLID: 4192 case CHIP_NAVY_FLOUNDER: 4193 case CHIP_DIMGREY_CAVEFISH: 4194 case CHIP_VANGOGH: 4195 case CHIP_ALDEBARAN: 4196 break; 4197 default: 4198 goto disabled; 4199 } 4200 } 4201 4202 return true; 4203 4204 disabled: 4205 dev_info(adev->dev, "GPU recovery disabled.\n"); 4206 return false; 4207 } 4208 4209 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4210 { 4211 u32 i; 4212 int ret = 0; 4213 4214 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4215 4216 dev_info(adev->dev, "GPU mode1 reset\n"); 4217 4218 /* disable BM */ 4219 pci_clear_master(adev->pdev); 4220 4221 amdgpu_device_cache_pci_state(adev->pdev); 4222 4223 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4224 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4225 ret = amdgpu_dpm_mode1_reset(adev); 4226 } else { 4227 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4228 ret = psp_gpu_reset(adev); 4229 } 4230 4231 if (ret) 4232 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4233 4234 amdgpu_device_load_pci_state(adev->pdev); 4235 4236 /* wait for asic to come out of reset */ 4237 for (i = 0; i < adev->usec_timeout; i++) { 4238 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4239 4240 if (memsize != 0xffffffff) 4241 break; 4242 udelay(1); 4243 } 4244 4245 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4246 return ret; 4247 } 4248 4249 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4250 struct amdgpu_reset_context *reset_context) 4251 { 4252 
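/*
 * Per-device preparation for an ASIC reset: wait for the debugfs dump,
 * stop SR-IOV data exchange, force-complete the hardware fences and raise
 * the karma of the offending job. If no dedicated reset handler claims the
 * prepare step (-ENOSYS), fall back on bare metal to a per-IP soft reset
 * and, failing that, suspend the IPs and set AMDGPU_NEED_FULL_RESET in
 * reset_context->flags.
 */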
int i, r = 0; 4253 struct amdgpu_job *job = NULL; 4254 bool need_full_reset = 4255 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4256 4257 if (reset_context->reset_req_dev == adev) 4258 job = reset_context->job; 4259 4260 /* no need to dump if device is not in good state during probe period */ 4261 if (!adev->gmc.xgmi.pending_reset) 4262 amdgpu_debugfs_wait_dump(adev); 4263 4264 if (amdgpu_sriov_vf(adev)) { 4265 /* stop the data exchange thread */ 4266 amdgpu_virt_fini_data_exchange(adev); 4267 } 4268 4269 /* block all schedulers and reset given job's ring */ 4270 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4271 struct amdgpu_ring *ring = adev->rings[i]; 4272 4273 if (!ring || !ring->sched.thread) 4274 continue; 4275 4276 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4277 amdgpu_fence_driver_force_completion(ring); 4278 } 4279 4280 if(job) 4281 drm_sched_increase_karma(&job->base); 4282 4283 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4284 /* If reset handler not implemented, continue; otherwise return */ 4285 if (r == -ENOSYS) 4286 r = 0; 4287 else 4288 return r; 4289 4290 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4291 if (!amdgpu_sriov_vf(adev)) { 4292 4293 if (!need_full_reset) 4294 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4295 4296 if (!need_full_reset) { 4297 amdgpu_device_ip_pre_soft_reset(adev); 4298 r = amdgpu_device_ip_soft_reset(adev); 4299 amdgpu_device_ip_post_soft_reset(adev); 4300 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4301 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4302 need_full_reset = true; 4303 } 4304 } 4305 4306 if (need_full_reset) 4307 r = amdgpu_device_ip_suspend(adev); 4308 if (need_full_reset) 4309 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4310 else 4311 clear_bit(AMDGPU_NEED_FULL_RESET, 4312 &reset_context->flags); 4313 } 4314 4315 return r; 4316 } 4317 4318 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4319 struct amdgpu_reset_context *reset_context) 4320 { 4321 struct amdgpu_device *tmp_adev = NULL; 4322 bool need_full_reset, skip_hw_reset, vram_lost = false; 4323 int r = 0; 4324 4325 /* Try reset handler method first */ 4326 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4327 reset_list); 4328 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4329 /* If reset handler not implemented, continue; otherwise return */ 4330 if (r == -ENOSYS) 4331 r = 0; 4332 else 4333 return r; 4334 4335 /* Reset handler not implemented, use the default method */ 4336 need_full_reset = 4337 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4338 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4339 4340 /* 4341 * ASIC reset has to be done on all XGMI hive nodes ASAP 4342 * to allow proper links negotiation in FW (within 1 sec) 4343 */ 4344 if (!skip_hw_reset && need_full_reset) { 4345 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4346 /* For XGMI run all resets in parallel to speed up the process */ 4347 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4348 tmp_adev->gmc.xgmi.pending_reset = false; 4349 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4350 r = -EALREADY; 4351 } else 4352 r = amdgpu_asic_reset(tmp_adev); 4353 4354 if (r) { 4355 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4356 r, adev_to_drm(tmp_adev)->unique); 4357 break; 4358 } 4359 } 4360 4361 /* For XGMI wait for all 
resets to complete before proceed */ 4362 if (!r) { 4363 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4364 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4365 flush_work(&tmp_adev->xgmi_reset_work); 4366 r = tmp_adev->asic_reset_res; 4367 if (r) 4368 break; 4369 } 4370 } 4371 } 4372 } 4373 4374 if (!r && amdgpu_ras_intr_triggered()) { 4375 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4376 if (tmp_adev->mmhub.funcs && 4377 tmp_adev->mmhub.funcs->reset_ras_error_count) 4378 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev); 4379 } 4380 4381 amdgpu_ras_intr_cleared(); 4382 } 4383 4384 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4385 if (need_full_reset) { 4386 /* post card */ 4387 r = amdgpu_device_asic_init(tmp_adev); 4388 if (r) { 4389 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4390 } else { 4391 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4392 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4393 if (r) 4394 goto out; 4395 4396 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4397 if (vram_lost) { 4398 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4399 amdgpu_inc_vram_lost(tmp_adev); 4400 } 4401 4402 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4403 if (r) 4404 goto out; 4405 4406 r = amdgpu_device_fw_loading(tmp_adev); 4407 if (r) 4408 return r; 4409 4410 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4411 if (r) 4412 goto out; 4413 4414 if (vram_lost) 4415 amdgpu_device_fill_reset_magic(tmp_adev); 4416 4417 /* 4418 * Add this ASIC as tracked as reset was already 4419 * complete successfully. 4420 */ 4421 amdgpu_register_gpu_instance(tmp_adev); 4422 4423 if (!reset_context->hive && 4424 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4425 amdgpu_xgmi_add_device(tmp_adev); 4426 4427 r = amdgpu_device_ip_late_init(tmp_adev); 4428 if (r) 4429 goto out; 4430 4431 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4432 4433 /* 4434 * The GPU enters bad state once faulty pages 4435 * by ECC has reached the threshold, and ras 4436 * recovery is scheduled next. So add one check 4437 * here to break recovery if it indeed exceeds 4438 * bad page threshold, and remind user to 4439 * retire this GPU or setting one bigger 4440 * bad_page_threshold value to fix this once 4441 * probing driver again. 4442 */ 4443 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4444 /* must succeed. 
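 * RAS is resumed here only because the bad-page count is still below the
 * threshold checked above; otherwise the reset is failed with -EINVAL.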
*/ 4445 amdgpu_ras_resume(tmp_adev); 4446 } else { 4447 r = -EINVAL; 4448 goto out; 4449 } 4450 4451 /* Update PSP FW topology after reset */ 4452 if (reset_context->hive && 4453 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4454 r = amdgpu_xgmi_update_topology( 4455 reset_context->hive, tmp_adev); 4456 } 4457 } 4458 4459 out: 4460 if (!r) { 4461 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4462 r = amdgpu_ib_ring_tests(tmp_adev); 4463 if (r) { 4464 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4465 r = amdgpu_device_ip_suspend(tmp_adev); 4466 need_full_reset = true; 4467 r = -EAGAIN; 4468 goto end; 4469 } 4470 } 4471 4472 if (!r) 4473 r = amdgpu_device_recover_vram(tmp_adev); 4474 else 4475 tmp_adev->asic_reset_res = r; 4476 } 4477 4478 end: 4479 if (need_full_reset) 4480 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4481 else 4482 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4483 return r; 4484 } 4485 4486 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4487 struct amdgpu_hive_info *hive) 4488 { 4489 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4490 return false; 4491 4492 if (hive) { 4493 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4494 } else { 4495 down_write(&adev->reset_sem); 4496 } 4497 4498 switch (amdgpu_asic_reset_method(adev)) { 4499 case AMD_RESET_METHOD_MODE1: 4500 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4501 break; 4502 case AMD_RESET_METHOD_MODE2: 4503 adev->mp1_state = PP_MP1_STATE_RESET; 4504 break; 4505 default: 4506 adev->mp1_state = PP_MP1_STATE_NONE; 4507 break; 4508 } 4509 4510 return true; 4511 } 4512 4513 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4514 { 4515 amdgpu_vf_error_trans_all(adev); 4516 adev->mp1_state = PP_MP1_STATE_NONE; 4517 atomic_set(&adev->in_gpu_reset, 0); 4518 up_write(&adev->reset_sem); 4519 } 4520 4521 /* 4522 * lock a list of amdgpu devices in a hive safely; if this is not a hive 4523 * with multiple nodes, it behaves the same as amdgpu_device_lock_adev. 4524 * 4525 * unlock does not require a roll back. 4526 */ 4527 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) 4528 { 4529 struct amdgpu_device *tmp_adev = NULL; 4530 4531 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4532 if (!hive) { 4533 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); 4534 return -ENODEV; 4535 } 4536 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4537 if (!amdgpu_device_lock_adev(tmp_adev, hive)) 4538 goto roll_back; 4539 } 4540 } else if (!amdgpu_device_lock_adev(adev, hive)) 4541 return -EAGAIN; 4542 4543 return 0; 4544 roll_back: 4545 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { 4546 /* 4547 * if the locking iteration breaks in the middle of a hive, 4548 * it may mean there is a race issue, 4549 * or a hive device locked up independently. 4550 * we may or may not be in trouble, so try to roll back 4551 * the locks and print a warning. 4552 */ 4553 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle.
Rolling back to unlock"); 4554 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4555 amdgpu_device_unlock_adev(tmp_adev); 4556 } 4557 } 4558 return -EAGAIN; 4559 } 4560 4561 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4562 { 4563 struct pci_dev *p = NULL; 4564 4565 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4566 adev->pdev->bus->number, 1); 4567 if (p) { 4568 pm_runtime_enable(&(p->dev)); 4569 pm_runtime_resume(&(p->dev)); 4570 } 4571 } 4572 4573 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4574 { 4575 enum amd_reset_method reset_method; 4576 struct pci_dev *p = NULL; 4577 u64 expires; 4578 4579 /* 4580 * For now, only BACO and mode1 reset are confirmed 4581 * to suffer the audio issue without proper suspended. 4582 */ 4583 reset_method = amdgpu_asic_reset_method(adev); 4584 if ((reset_method != AMD_RESET_METHOD_BACO) && 4585 (reset_method != AMD_RESET_METHOD_MODE1)) 4586 return -EINVAL; 4587 4588 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4589 adev->pdev->bus->number, 1); 4590 if (!p) 4591 return -ENODEV; 4592 4593 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4594 if (!expires) 4595 /* 4596 * If we cannot get the audio device autosuspend delay, 4597 * a fixed 4S interval will be used. Considering 3S is 4598 * the audio controller default autosuspend delay setting. 4599 * 4S used here is guaranteed to cover that. 4600 */ 4601 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4602 4603 while (!pm_runtime_status_suspended(&(p->dev))) { 4604 if (!pm_runtime_suspend(&(p->dev))) 4605 break; 4606 4607 if (expires < ktime_get_mono_fast_ns()) { 4608 dev_warn(adev->dev, "failed to suspend display audio\n"); 4609 /* TODO: abort the succeeding gpu reset? */ 4610 return -ETIMEDOUT; 4611 } 4612 } 4613 4614 pm_runtime_disable(&(p->dev)); 4615 4616 return 0; 4617 } 4618 4619 void amdgpu_device_recheck_guilty_jobs( 4620 struct amdgpu_device *adev, struct list_head *device_list_handle, 4621 struct amdgpu_reset_context *reset_context) 4622 { 4623 int i, r = 0; 4624 4625 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4626 struct amdgpu_ring *ring = adev->rings[i]; 4627 int ret = 0; 4628 struct drm_sched_job *s_job; 4629 4630 if (!ring || !ring->sched.thread) 4631 continue; 4632 4633 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4634 struct drm_sched_job, list); 4635 if (s_job == NULL) 4636 continue; 4637 4638 /* clear job's guilty and depend the folowing step to decide the real one */ 4639 drm_sched_reset_karma(s_job); 4640 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4641 4642 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4643 if (ret == 0) { /* timeout */ 4644 DRM_ERROR("Found the real bad job! 
ring:%s, job_id:%llx\n", 4645 ring->sched.name, s_job->id); 4646 4647 /* set guilty */ 4648 drm_sched_increase_karma(s_job); 4649 retry: 4650 /* do hw reset */ 4651 if (amdgpu_sriov_vf(adev)) { 4652 amdgpu_virt_fini_data_exchange(adev); 4653 r = amdgpu_device_reset_sriov(adev, false); 4654 if (r) 4655 adev->asic_reset_res = r; 4656 } else { 4657 clear_bit(AMDGPU_SKIP_HW_RESET, 4658 &reset_context->flags); 4659 r = amdgpu_do_asic_reset(device_list_handle, 4660 reset_context); 4661 if (r && r == -EAGAIN) 4662 goto retry; 4663 } 4664 4665 /* 4666 * add reset counter so that the following 4667 * resubmitted job could flush vmid 4668 */ 4669 atomic_inc(&adev->gpu_reset_counter); 4670 continue; 4671 } 4672 4673 /* got the hw fence, signal finished fence */ 4674 atomic_dec(ring->sched.score); 4675 dma_fence_get(&s_job->s_fence->finished); 4676 dma_fence_signal(&s_job->s_fence->finished); 4677 dma_fence_put(&s_job->s_fence->finished); 4678 4679 /* remove node from list and free the job */ 4680 spin_lock(&ring->sched.job_list_lock); 4681 list_del_init(&s_job->list); 4682 spin_unlock(&ring->sched.job_list_lock); 4683 ring->sched.ops->free_job(s_job); 4684 } 4685 } 4686 4687 /** 4688 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 4689 * 4690 * @adev: amdgpu_device pointer 4691 * @job: which job trigger hang 4692 * 4693 * Attempt to reset the GPU if it has hung (all asics). 4694 * Attempt to do soft-reset or full-reset and reinitialize Asic 4695 * Returns 0 for success or an error on failure. 4696 */ 4697 4698 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 4699 struct amdgpu_job *job) 4700 { 4701 struct list_head device_list, *device_list_handle = NULL; 4702 bool job_signaled = false; 4703 struct amdgpu_hive_info *hive = NULL; 4704 struct amdgpu_device *tmp_adev = NULL; 4705 int i, r = 0; 4706 bool need_emergency_restart = false; 4707 bool audio_suspended = false; 4708 int tmp_vram_lost_counter; 4709 struct amdgpu_reset_context reset_context; 4710 4711 memset(&reset_context, 0, sizeof(reset_context)); 4712 4713 /* 4714 * Special case: RAS triggered and full reset isn't supported 4715 */ 4716 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 4717 4718 /* 4719 * Flush RAM to disk so that after reboot 4720 * the user can read log and see why the system rebooted. 4721 */ 4722 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 4723 DRM_WARN("Emergency reboot."); 4724 4725 ksys_sync_helper(); 4726 emergency_restart(); 4727 } 4728 4729 dev_info(adev->dev, "GPU %s begin!\n", 4730 need_emergency_restart ? "jobs stop":"reset"); 4731 4732 /* 4733 * Here we trylock to avoid chain of resets executing from 4734 * either trigger by jobs on different adevs in XGMI hive or jobs on 4735 * different schedulers for same device while this TO handler is running. 4736 * We always reset all schedulers for device and all devices for XGMI 4737 * hive so that should take care of them too. 4738 */ 4739 hive = amdgpu_get_xgmi_hive(adev); 4740 if (hive) { 4741 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 4742 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 4743 job ? 
job->base.id : -1, hive->hive_id); 4744 amdgpu_put_xgmi_hive(hive); 4745 if (job) 4746 drm_sched_increase_karma(&job->base); 4747 return 0; 4748 } 4749 mutex_lock(&hive->hive_lock); 4750 } 4751 4752 reset_context.method = AMD_RESET_METHOD_NONE; 4753 reset_context.reset_req_dev = adev; 4754 reset_context.job = job; 4755 reset_context.hive = hive; 4756 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 4757 4758 /* 4759 * lock the device before we try to operate the linked list 4760 * if didn't get the device lock, don't touch the linked list since 4761 * others may iterating it. 4762 */ 4763 r = amdgpu_device_lock_hive_adev(adev, hive); 4764 if (r) { 4765 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 4766 job ? job->base.id : -1); 4767 4768 /* even we skipped this reset, still need to set the job to guilty */ 4769 if (job) 4770 drm_sched_increase_karma(&job->base); 4771 goto skip_recovery; 4772 } 4773 4774 /* 4775 * Build list of devices to reset. 4776 * In case we are in XGMI hive mode, resort the device list 4777 * to put adev in the 1st position. 4778 */ 4779 INIT_LIST_HEAD(&device_list); 4780 if (adev->gmc.xgmi.num_physical_nodes > 1) { 4781 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 4782 list_add_tail(&tmp_adev->reset_list, &device_list); 4783 if (!list_is_first(&adev->reset_list, &device_list)) 4784 list_rotate_to_front(&adev->reset_list, &device_list); 4785 device_list_handle = &device_list; 4786 } else { 4787 list_add_tail(&adev->reset_list, &device_list); 4788 device_list_handle = &device_list; 4789 } 4790 4791 /* block all schedulers and reset given job's ring */ 4792 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4793 /* 4794 * Try to put the audio codec into suspend state 4795 * before gpu reset started. 4796 * 4797 * Due to the power domain of the graphics device 4798 * is shared with AZ power domain. Without this, 4799 * we may change the audio hardware from behind 4800 * the audio driver's back. That will trigger 4801 * some audio codec errors. 4802 */ 4803 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 4804 audio_suspended = true; 4805 4806 amdgpu_ras_set_error_query_ready(tmp_adev, false); 4807 4808 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 4809 4810 if (!amdgpu_sriov_vf(tmp_adev)) 4811 amdgpu_amdkfd_pre_reset(tmp_adev); 4812 4813 /* 4814 * Mark these ASICs to be reseted as untracked first 4815 * And add them back after reset completed 4816 */ 4817 amdgpu_unregister_gpu_instance(tmp_adev); 4818 4819 amdgpu_fbdev_set_suspend(tmp_adev, 1); 4820 4821 /* disable ras on ALL IPs */ 4822 if (!need_emergency_restart && 4823 amdgpu_device_ip_need_full_reset(tmp_adev)) 4824 amdgpu_ras_suspend(tmp_adev); 4825 4826 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4827 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4828 4829 if (!ring || !ring->sched.thread) 4830 continue; 4831 4832 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 4833 4834 if (need_emergency_restart) 4835 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 4836 } 4837 atomic_inc(&tmp_adev->gpu_reset_counter); 4838 } 4839 4840 if (need_emergency_restart) 4841 goto skip_sched_resume; 4842 4843 /* 4844 * Must check guilty signal here since after this point all old 4845 * HW fences are force signaled. 
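 * If the fence of the job that triggered this timeout has already
 * signaled, the hang resolved itself and the HW reset below is skipped.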
4846 * 4847 * job->base holds a reference to parent fence 4848 */ 4849 if (job && job->base.s_fence->parent && 4850 dma_fence_is_signaled(job->base.s_fence->parent)) { 4851 job_signaled = true; 4852 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 4853 goto skip_hw_reset; 4854 } 4855 4856 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4857 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4858 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 4859 /*TODO Should we stop ?*/ 4860 if (r) { 4861 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 4862 r, adev_to_drm(tmp_adev)->unique); 4863 tmp_adev->asic_reset_res = r; 4864 } 4865 } 4866 4867 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 4868 /* Actual ASIC resets if needed.*/ 4869 /* TODO Implement XGMI hive reset logic for SRIOV */ 4870 if (amdgpu_sriov_vf(adev)) { 4871 r = amdgpu_device_reset_sriov(adev, job ? false : true); 4872 if (r) 4873 adev->asic_reset_res = r; 4874 } else { 4875 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 4876 if (r && r == -EAGAIN) 4877 goto retry; 4878 } 4879 4880 skip_hw_reset: 4881 4882 /* Post ASIC reset for all devs .*/ 4883 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4884 4885 /* 4886 * Sometimes a later bad compute job can block a good gfx job as gfx 4887 * and compute ring share internal GC HW mutually. We add an additional 4888 * guilty jobs recheck step to find the real guilty job, it synchronously 4889 * submits and pends for the first job being signaled. If it gets timeout, 4890 * we identify it as a real guilty job. 4891 */ 4892 if (amdgpu_gpu_recovery == 2 && 4893 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 4894 amdgpu_device_recheck_guilty_jobs( 4895 tmp_adev, device_list_handle, &reset_context); 4896 4897 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4898 struct amdgpu_ring *ring = tmp_adev->rings[i]; 4899 4900 if (!ring || !ring->sched.thread) 4901 continue; 4902 4903 /* No point to resubmit jobs if we didn't HW reset*/ 4904 if (!tmp_adev->asic_reset_res && !job_signaled) 4905 drm_sched_resubmit_jobs(&ring->sched); 4906 4907 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 4908 } 4909 4910 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { 4911 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 4912 } 4913 4914 tmp_adev->asic_reset_res = 0; 4915 4916 if (r) { 4917 /* bad news, how to tell it to userspace ? 
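 * for now the failure is only logged and forwarded through the VF error
 * bookkeeping below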
*/ 4918 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4919 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 4920 } else { 4921 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 4922 } 4923 } 4924 4925 skip_sched_resume: 4926 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4927 /* unlock kfd: SRIOV would do it separately */ 4928 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 4929 amdgpu_amdkfd_post_reset(tmp_adev); 4930 4931 /* kfd_post_reset will do nothing if kfd device is not initialized, 4932 * need to bring up kfd here if it's not be initialized before 4933 */ 4934 if (!adev->kfd.init_complete) 4935 amdgpu_amdkfd_device_init(adev); 4936 4937 if (audio_suspended) 4938 amdgpu_device_resume_display_audio(tmp_adev); 4939 amdgpu_device_unlock_adev(tmp_adev); 4940 } 4941 4942 skip_recovery: 4943 if (hive) { 4944 atomic_set(&hive->in_reset, 0); 4945 mutex_unlock(&hive->hive_lock); 4946 amdgpu_put_xgmi_hive(hive); 4947 } 4948 4949 if (r && r != -EAGAIN) 4950 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 4951 return r; 4952 } 4953 4954 /** 4955 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 4956 * 4957 * @adev: amdgpu_device pointer 4958 * 4959 * Fetchs and stores in the driver the PCIE capabilities (gen speed 4960 * and lanes) of the slot the device is in. Handles APUs and 4961 * virtualized environments where PCIE config space may not be available. 4962 */ 4963 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 4964 { 4965 struct pci_dev *pdev; 4966 enum pci_bus_speed speed_cap, platform_speed_cap; 4967 enum pcie_link_width platform_link_width; 4968 4969 if (amdgpu_pcie_gen_cap) 4970 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 4971 4972 if (amdgpu_pcie_lane_cap) 4973 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 4974 4975 /* covers APUs as well */ 4976 if (pci_is_root_bus(adev->pdev->bus)) { 4977 if (adev->pm.pcie_gen_mask == 0) 4978 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 4979 if (adev->pm.pcie_mlw_mask == 0) 4980 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 4981 return; 4982 } 4983 4984 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 4985 return; 4986 4987 pcie_bandwidth_available(adev->pdev, NULL, 4988 &platform_speed_cap, &platform_link_width); 4989 4990 if (adev->pm.pcie_gen_mask == 0) { 4991 /* asic caps */ 4992 pdev = adev->pdev; 4993 speed_cap = pcie_get_speed_cap(pdev); 4994 if (speed_cap == PCI_SPEED_UNKNOWN) { 4995 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 4996 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 4997 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 4998 } else { 4999 if (speed_cap == PCIE_SPEED_32_0GT) 5000 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5001 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5002 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5003 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5004 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5005 else if (speed_cap == PCIE_SPEED_16_0GT) 5006 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5007 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5008 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5009 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5010 else if (speed_cap == PCIE_SPEED_8_0GT) 5011 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5012 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5013 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5014 else if (speed_cap 
== PCIE_SPEED_5_0GT) 5015 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5016 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5017 else 5018 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5019 } 5020 /* platform caps */ 5021 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5022 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5023 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5024 } else { 5025 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5026 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5027 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5028 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5029 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5030 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5031 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5032 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5033 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5034 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5035 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5036 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5037 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5038 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5039 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5040 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5041 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5042 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5043 else 5044 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5045 5046 } 5047 } 5048 if (adev->pm.pcie_mlw_mask == 0) { 5049 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5050 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5051 } else { 5052 switch (platform_link_width) { 5053 case PCIE_LNK_X32: 5054 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5055 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5056 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5057 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5058 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5059 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5060 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5061 break; 5062 case PCIE_LNK_X16: 5063 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5064 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5065 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5066 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5067 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5068 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5069 break; 5070 case PCIE_LNK_X12: 5071 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5072 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5073 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5074 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5075 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5076 break; 5077 case PCIE_LNK_X8: 5078 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5079 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5080 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5081 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5082 break; 5083 case PCIE_LNK_X4: 5084 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5085 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5086 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5087 break; 5088 case PCIE_LNK_X2: 5089 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5090 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5091 break; 5092 case PCIE_LNK_X1: 5093 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5094 break; 5095 default: 5096 break; 5097 } 5098 } 5099 } 5100 } 5101 5102 int amdgpu_device_baco_enter(struct drm_device *dev) 5103 { 5104 struct amdgpu_device *adev = drm_to_adev(dev); 5105 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5106 5107 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5108 return -ENOTSUPP; 5109 5110 if (ras && ras->supported && 
adev->nbio.funcs->enable_doorbell_interrupt) 5111 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5112 5113 return amdgpu_dpm_baco_enter(adev); 5114 } 5115 5116 int amdgpu_device_baco_exit(struct drm_device *dev) 5117 { 5118 struct amdgpu_device *adev = drm_to_adev(dev); 5119 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5120 int ret = 0; 5121 5122 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5123 return -ENOTSUPP; 5124 5125 ret = amdgpu_dpm_baco_exit(adev); 5126 if (ret) 5127 return ret; 5128 5129 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) 5130 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5131 5132 return 0; 5133 } 5134 5135 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5136 { 5137 int i; 5138 5139 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5140 struct amdgpu_ring *ring = adev->rings[i]; 5141 5142 if (!ring || !ring->sched.thread) 5143 continue; 5144 5145 cancel_delayed_work_sync(&ring->sched.work_tdr); 5146 } 5147 } 5148 5149 /** 5150 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5151 * @pdev: PCI device struct 5152 * @state: PCI channel state 5153 * 5154 * Description: Called when a PCI error is detected. 5155 * 5156 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 5157 */ 5158 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5159 { 5160 struct drm_device *dev = pci_get_drvdata(pdev); 5161 struct amdgpu_device *adev = drm_to_adev(dev); 5162 int i; 5163 5164 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5165 5166 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5167 DRM_WARN("No support for XGMI hive yet..."); 5168 return PCI_ERS_RESULT_DISCONNECT; 5169 } 5170 5171 switch (state) { 5172 case pci_channel_io_normal: 5173 return PCI_ERS_RESULT_CAN_RECOVER; 5174 /* Fatal error, prepare for slot reset */ 5175 case pci_channel_io_frozen: 5176 /* 5177 * Cancel and wait for all TDRs in progress if failing to 5178 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5179 * 5180 * Locking adev->reset_sem will prevent any external access 5181 * to GPU during PCI error recovery 5182 */ 5183 while (!amdgpu_device_lock_adev(adev, NULL)) 5184 amdgpu_cancel_all_tdr(adev); 5185 5186 /* 5187 * Block any work scheduling as we do for regular GPU reset 5188 * for the duration of the recovery 5189 */ 5190 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5191 struct amdgpu_ring *ring = adev->rings[i]; 5192 5193 if (!ring || !ring->sched.thread) 5194 continue; 5195 5196 drm_sched_stop(&ring->sched, NULL); 5197 } 5198 atomic_inc(&adev->gpu_reset_counter); 5199 return PCI_ERS_RESULT_NEED_RESET; 5200 case pci_channel_io_perm_failure: 5201 /* Permanent error, prepare for device removal */ 5202 return PCI_ERS_RESULT_DISCONNECT; 5203 } 5204 5205 return PCI_ERS_RESULT_NEED_RESET; 5206 } 5207 5208 /** 5209 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5210 * @pdev: pointer to PCI device 5211 */ 5212 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5213 { 5214 5215 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5216 5217 /* TODO - dump whatever for debugging purposes */ 5218 5219 /* This called only if amdgpu_pci_error_detected returns 5220 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5221 * works, no need to reset slot. 5222 */ 5223 5224 return PCI_ERS_RESULT_RECOVERED; 5225 } 5226 5227 /** 5228 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 
5229 * @pdev: PCI device struct 5230 * 5231 * Description: This routine is called by the pci error recovery 5232 * code after the PCI slot has been reset, just before we 5233 * should resume normal operations. 5234 */ 5235 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5236 { 5237 struct drm_device *dev = pci_get_drvdata(pdev); 5238 struct amdgpu_device *adev = drm_to_adev(dev); 5239 int r, i; 5240 struct amdgpu_reset_context reset_context; 5241 u32 memsize; 5242 struct list_head device_list; 5243 5244 DRM_INFO("PCI error: slot reset callback!!\n"); 5245 5246 memset(&reset_context, 0, sizeof(reset_context)); 5247 5248 INIT_LIST_HEAD(&device_list); 5249 list_add_tail(&adev->reset_list, &device_list); 5250 5251 /* wait for asic to come out of reset */ 5252 msleep(500); 5253 5254 /* Restore PCI confspace */ 5255 amdgpu_device_load_pci_state(pdev); 5256 5257 /* confirm ASIC came out of reset */ 5258 for (i = 0; i < adev->usec_timeout; i++) { 5259 memsize = amdgpu_asic_get_config_memsize(adev); 5260 5261 if (memsize != 0xffffffff) 5262 break; 5263 udelay(1); 5264 } 5265 if (memsize == 0xffffffff) { 5266 r = -ETIME; 5267 goto out; 5268 } 5269 5270 reset_context.method = AMD_RESET_METHOD_NONE; 5271 reset_context.reset_req_dev = adev; 5272 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5273 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5274 5275 adev->in_pci_err_recovery = true; 5276 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5277 adev->in_pci_err_recovery = false; 5278 if (r) 5279 goto out; 5280 5281 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5282 5283 out: 5284 if (!r) { 5285 if (amdgpu_device_cache_pci_state(adev->pdev)) 5286 pci_restore_state(adev->pdev); 5287 5288 DRM_INFO("PCIe error recovery succeeded\n"); 5289 } else { 5290 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5291 amdgpu_device_unlock_adev(adev); 5292 } 5293 5294 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5295 } 5296 5297 /** 5298 * amdgpu_pci_resume() - resume normal ops after PCI reset 5299 * @pdev: pointer to PCI device 5300 * 5301 * Called when the error recovery driver tells us that its 5302 * OK to resume normal operation. 
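 * Restarts the schedulers that were stopped in amdgpu_pci_error_detected()
 * and drops the reset lock taken there via amdgpu_device_unlock_adev().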
5303 */ 5304 void amdgpu_pci_resume(struct pci_dev *pdev) 5305 { 5306 struct drm_device *dev = pci_get_drvdata(pdev); 5307 struct amdgpu_device *adev = drm_to_adev(dev); 5308 int i; 5309 5310 5311 DRM_INFO("PCI error: resume callback!!\n"); 5312 5313 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5314 struct amdgpu_ring *ring = adev->rings[i]; 5315 5316 if (!ring || !ring->sched.thread) 5317 continue; 5318 5319 5320 drm_sched_resubmit_jobs(&ring->sched); 5321 drm_sched_start(&ring->sched, true); 5322 } 5323 5324 amdgpu_device_unlock_adev(adev); 5325 } 5326 5327 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5328 { 5329 struct drm_device *dev = pci_get_drvdata(pdev); 5330 struct amdgpu_device *adev = drm_to_adev(dev); 5331 int r; 5332 5333 r = pci_save_state(pdev); 5334 if (!r) { 5335 kfree(adev->pci_state); 5336 5337 adev->pci_state = pci_store_saved_state(pdev); 5338 5339 if (!adev->pci_state) { 5340 DRM_ERROR("Failed to store PCI saved state"); 5341 return false; 5342 } 5343 } else { 5344 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5345 return false; 5346 } 5347 5348 return true; 5349 } 5350 5351 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5352 { 5353 struct drm_device *dev = pci_get_drvdata(pdev); 5354 struct amdgpu_device *adev = drm_to_adev(dev); 5355 int r; 5356 5357 if (!adev->pci_state) 5358 return false; 5359 5360 r = pci_load_saved_state(pdev, adev->pci_state); 5361 5362 if (!r) { 5363 pci_restore_state(pdev); 5364 } else { 5365 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5366 return false; 5367 } 5368 5369 return true; 5370 } 5371 5372 5373
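/*
 * A minimal sketch (illustrative only; the actual table is registered by the
 * PCI driver glue and the name used here is hypothetical) of how the error
 * handlers above are typically exposed to the PCI core through
 * struct pci_driver::err_handler:
 *
 *   static const struct pci_error_handlers example_pci_err_handlers = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };
 */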