/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(&adev->ddev, &idx))
		return;

#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			goto exit;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);

#ifdef CONFIG_64BIT
exit:
#endif
	drm_dev_exit(idx);
}
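/*
 * Editorial usage sketch (not part of the original file): a caller that
 * wants to copy a small, dword-aligned blob out of VRAM could use the
 * helper above roughly as follows, where "vram_offset" is a placeholder
 * for a real VRAM offset. The buffer must be at least @size bytes, and
 * offsets/sizes are expected to be multiples of 4 since the
 * MM_INDEX/MM_DATA fallback path moves one dword per access:
 *
 *	u32 data[4];
 *
 *	amdgpu_device_vram_access(adev, vram_offset, data,
 *				  sizeof(data), false);
 *
 * Accesses inside the CPU-visible aperture take the memcpy_to/fromio
 * fast path; anything beyond it goes through the indexed register
 * window under adev->mmio_idx_lock.
 */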
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 *
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 *
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset
 * @pcie_data: mmio register offset
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
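/*
 * Editorial note (not in the original source): the four indirect helpers
 * above all follow the same index/data pattern: the target address is
 * written to the PCIE index register, a readl() flushes the posted
 * write, and the payload is then read from or written to the PCIE data
 * register, with 64 bit values split into two 32 bit halves at
 * @reg_addr and @reg_addr + 4. A hypothetical per-SoC wrapper could
 * look like:
 *
 *	static u32 soc_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		return amdgpu_device_indirect_rreg(adev, mmPCIE_INDEX2,
 *						   mmPCIE_DATA2, reg);
 *	}
 *
 * where mmPCIE_INDEX2/mmPCIE_DATA2 stand in for whatever index/data
 * register pair the ASIC actually provides.
 */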
/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
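/*
 * Illustrative example (editorial addition, not from the original file):
 * golden register lists consumed by the helper above are flat arrays of
 * (register, AND mask, OR mask) triplets. A made-up sequence that
 * rewrites bits 0-3 of one register and hard-codes another could look
 * like:
 *
 *	static const u32 fake_golden_settings[] = {
 *		mmSOME_REG,	0x0000000f, 0x00000002,
 *		mmOTHER_REG,	0xffffffff, 0x12345678,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, fake_golden_settings,
 *						ARRAY_SIZE(fake_golden_settings));
 *
 * The bits set in the AND mask are cleared and then OR'd with the new
 * value; an AND mask of 0xffffffff means "write the OR mask verbatim".
 * mmSOME_REG/mmOTHER_REG are placeholders, not real register names.
 */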
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{

	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment+1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
	 * paging queue doorbell use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page. So with paging queue enabled,
	 * the max num_doorbells should be increased by one page (0x400 dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}


/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
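/*
 * Usage sketch (editorial addition): a ring or IP block that needs a
 * writeback slot typically grabs one at init time and converts the
 * returned dword offset into a GPU/CPU address pair, roughly:
 *
 *	u32 wb;
 *
 *	if (amdgpu_device_wb_get(adev, &wb))
 *		return -EINVAL;
 *	gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *	cpu_ptr  = &adev->wb.wb[wb];
 *	...
 *	amdgpu_device_wb_free(adev, wb);
 *
 * The index handed back is already shifted to a dword offset (each
 * allocation covers one 256 bit slot), which is why wb_free() shifts it
 * back down before clearing the bitmap bit. gpu_addr/cpu_ptr are
 * placeholders for wherever the caller stores them.
 */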
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if a post is needed because a hw reset was performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole GPU pass-through virtualization case, after a VM
		 * reboot some old smc fw still needs the driver to do a vPost, otherwise
		 * the gpu hangs, while smc fw versions above 22.15 don't have this flaw,
		 * so we force vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;
			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;
	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the page directory;
 * a page is 4KB so we have 12 bits of offset, a minimum of 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}
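/*
 * Editorial note: amdgpu_smu_memory_pool_size is expressed in units of
 * 256MB (the accepted value is shifted left by 28 bits above), so the
 * supported values map as 1 -> 256MB, 2 -> 512MB, 4 -> 1GB and 8 -> 2GB.
 * The 1GB/2GB pools additionally require roughly 7GB of system RAM and
 * the smaller pools roughly 3GB, per the checks against
 * dram_size_seven_GB/dram_size_three_GB.
 */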
/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	amdgpu_gmc_noretry_set(adev);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		pr_info("switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;

}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;

}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * return 0 if equal or greater
 * return 1 if smaller or the ip_block doesn't exist
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}
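/*
 * Usage sketch (editorial addition): callers use the comparison helper
 * above to gate features on a minimum IP version, treating a return of
 * 0 as "present and at least this version", e.g.:
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       9, 0) == 0) {
 *		// GFX 9.0 or newer is available on this asic
 *	}
 */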
/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			 amdgpu_virtual_display, pci_address_name,
			 adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}
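/*
 * Editorial note: the virtual_display string parsed above is a semicolon
 * separated list of "<pci address>,<number of crtcs>" entries, where the
 * address may also be the literal "all". For example, a hypothetical
 * invocation could be:
 *
 *	modprobe amdgpu virtual_display=0000:03:00.0,2
 *
 * which enables two virtual crtcs on that device only, while
 * virtual_display=all,1 would enable a single virtual crtc on every
 * amdgpu device in the system. The crtc count is clamped to the 1-6
 * range; a missing or unparsable count falls back to 1.
 */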
/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	char fw_name[40];
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	if (adev->mman.discovery_bin) {
		amdgpu_discovery_get_gfx_info(adev);

		/*
		 * FIXME: The bounding box is still needed by Navi12, so
		 * temporarily read it from gpu_info firmware. Should be dropped
		 * when DAL no longer needs it.
		 */
		if (adev->asic_type != CHIP_NAVI12)
			return 0;
	}

	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_VERDE:
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_OLAND:
	case CHIP_HAINAN:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TOPAZ:
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
	case CHIP_VEGA20:
	case CHIP_ALDEBARAN:
	case CHIP_SIENNA_CICHLID:
	case CHIP_NAVY_FLOUNDER:
	case CHIP_DIMGREY_CAVEFISH:
	case CHIP_BEIGE_GOBY:
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_RENOIR:
		if (adev->apu_flags & AMD_APU_IS_RENOIR)
			chip_name = "renoir";
		else
			chip_name = "green_sardine";
		break;
	case CHIP_NAVI10:
		chip_name = "navi10";
		break;
	case CHIP_NAVI14:
		chip_name = "navi14";
		break;
	case CHIP_NAVI12:
		chip_name = "navi12";
		break;
	case CHIP_VANGOGH:
		chip_name = "vangogh";
		break;
	}

	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
	if (err) {
		dev_err(adev->dev,
			"Failed to load gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}
	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
	if (err) {
		dev_err(adev->dev,
			"Failed to validate gpu_info firmware \"%s\"\n",
			fw_name);
		goto out;
	}

	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);

	switch (hdr->version_major) {
	case 1:
	{
		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
								le32_to_cpu(hdr->header.ucode_array_offset_bytes));

		/*
		 * Should be dropped when DAL no longer needs it.
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in the discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
1966 */ 1967 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1968 { 1969 int i, r; 1970 1971 amdgpu_device_enable_virtual_display(adev); 1972 1973 if (amdgpu_sriov_vf(adev)) { 1974 r = amdgpu_virt_request_full_gpu(adev, true); 1975 if (r) 1976 return r; 1977 } 1978 1979 switch (adev->asic_type) { 1980 #ifdef CONFIG_DRM_AMDGPU_SI 1981 case CHIP_VERDE: 1982 case CHIP_TAHITI: 1983 case CHIP_PITCAIRN: 1984 case CHIP_OLAND: 1985 case CHIP_HAINAN: 1986 adev->family = AMDGPU_FAMILY_SI; 1987 r = si_set_ip_blocks(adev); 1988 if (r) 1989 return r; 1990 break; 1991 #endif 1992 #ifdef CONFIG_DRM_AMDGPU_CIK 1993 case CHIP_BONAIRE: 1994 case CHIP_HAWAII: 1995 case CHIP_KAVERI: 1996 case CHIP_KABINI: 1997 case CHIP_MULLINS: 1998 if (adev->flags & AMD_IS_APU) 1999 adev->family = AMDGPU_FAMILY_KV; 2000 else 2001 adev->family = AMDGPU_FAMILY_CI; 2002 2003 r = cik_set_ip_blocks(adev); 2004 if (r) 2005 return r; 2006 break; 2007 #endif 2008 case CHIP_TOPAZ: 2009 case CHIP_TONGA: 2010 case CHIP_FIJI: 2011 case CHIP_POLARIS10: 2012 case CHIP_POLARIS11: 2013 case CHIP_POLARIS12: 2014 case CHIP_VEGAM: 2015 case CHIP_CARRIZO: 2016 case CHIP_STONEY: 2017 if (adev->flags & AMD_IS_APU) 2018 adev->family = AMDGPU_FAMILY_CZ; 2019 else 2020 adev->family = AMDGPU_FAMILY_VI; 2021 2022 r = vi_set_ip_blocks(adev); 2023 if (r) 2024 return r; 2025 break; 2026 case CHIP_VEGA10: 2027 case CHIP_VEGA12: 2028 case CHIP_VEGA20: 2029 case CHIP_RAVEN: 2030 case CHIP_ARCTURUS: 2031 case CHIP_RENOIR: 2032 case CHIP_ALDEBARAN: 2033 if (adev->flags & AMD_IS_APU) 2034 adev->family = AMDGPU_FAMILY_RV; 2035 else 2036 adev->family = AMDGPU_FAMILY_AI; 2037 2038 r = soc15_set_ip_blocks(adev); 2039 if (r) 2040 return r; 2041 break; 2042 case CHIP_NAVI10: 2043 case CHIP_NAVI14: 2044 case CHIP_NAVI12: 2045 case CHIP_SIENNA_CICHLID: 2046 case CHIP_NAVY_FLOUNDER: 2047 case CHIP_DIMGREY_CAVEFISH: 2048 case CHIP_BEIGE_GOBY: 2049 case CHIP_VANGOGH: 2050 if (adev->asic_type == CHIP_VANGOGH) 2051 adev->family = AMDGPU_FAMILY_VGH; 2052 else 2053 adev->family = AMDGPU_FAMILY_NV; 2054 2055 r = nv_set_ip_blocks(adev); 2056 if (r) 2057 return r; 2058 break; 2059 default: 2060 /* FIXME: not supported yet */ 2061 return -EINVAL; 2062 } 2063 2064 amdgpu_amdkfd_device_probe(adev); 2065 2066 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2067 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2068 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2069 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2070 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2071 2072 for (i = 0; i < adev->num_ip_blocks; i++) { 2073 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2074 DRM_ERROR("disabled ip block: %d <%s>\n", 2075 i, adev->ip_blocks[i].version->funcs->name); 2076 adev->ip_blocks[i].status.valid = false; 2077 } else { 2078 if (adev->ip_blocks[i].version->funcs->early_init) { 2079 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2080 if (r == -ENOENT) { 2081 adev->ip_blocks[i].status.valid = false; 2082 } else if (r) { 2083 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2084 adev->ip_blocks[i].version->funcs->name, r); 2085 return r; 2086 } else { 2087 adev->ip_blocks[i].status.valid = true; 2088 } 2089 } else { 2090 adev->ip_blocks[i].status.valid = true; 2091 } 2092 } 2093 /* get the vbios after the asic_funcs are set up */ 2094 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2095 r = amdgpu_device_parse_gpu_info_fw(adev); 2096 if (r) 2097 return r; 2098 2099 /* Read BIOS */ 2100 if 
(!amdgpu_get_bios(adev)) 2101 return -EINVAL; 2102 2103 r = amdgpu_atombios_init(adev); 2104 if (r) { 2105 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2106 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2107 return r; 2108 } 2109 2110 /*get pf2vf msg info at it's earliest time*/ 2111 if (amdgpu_sriov_vf(adev)) 2112 amdgpu_virt_init_data_exchange(adev); 2113 2114 } 2115 } 2116 2117 adev->cg_flags &= amdgpu_cg_mask; 2118 adev->pg_flags &= amdgpu_pg_mask; 2119 2120 return 0; 2121 } 2122 2123 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2124 { 2125 int i, r; 2126 2127 for (i = 0; i < adev->num_ip_blocks; i++) { 2128 if (!adev->ip_blocks[i].status.sw) 2129 continue; 2130 if (adev->ip_blocks[i].status.hw) 2131 continue; 2132 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2133 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2134 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2135 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2136 if (r) { 2137 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2138 adev->ip_blocks[i].version->funcs->name, r); 2139 return r; 2140 } 2141 adev->ip_blocks[i].status.hw = true; 2142 } 2143 } 2144 2145 return 0; 2146 } 2147 2148 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2149 { 2150 int i, r; 2151 2152 for (i = 0; i < adev->num_ip_blocks; i++) { 2153 if (!adev->ip_blocks[i].status.sw) 2154 continue; 2155 if (adev->ip_blocks[i].status.hw) 2156 continue; 2157 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2158 if (r) { 2159 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2160 adev->ip_blocks[i].version->funcs->name, r); 2161 return r; 2162 } 2163 adev->ip_blocks[i].status.hw = true; 2164 } 2165 2166 return 0; 2167 } 2168 2169 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2170 { 2171 int r = 0; 2172 int i; 2173 uint32_t smu_version; 2174 2175 if (adev->asic_type >= CHIP_VEGA10) { 2176 for (i = 0; i < adev->num_ip_blocks; i++) { 2177 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2178 continue; 2179 2180 if (!adev->ip_blocks[i].status.sw) 2181 continue; 2182 2183 /* no need to do the fw loading again if already done*/ 2184 if (adev->ip_blocks[i].status.hw == true) 2185 break; 2186 2187 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2188 r = adev->ip_blocks[i].version->funcs->resume(adev); 2189 if (r) { 2190 DRM_ERROR("resume of IP block <%s> failed %d\n", 2191 adev->ip_blocks[i].version->funcs->name, r); 2192 return r; 2193 } 2194 } else { 2195 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2196 if (r) { 2197 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2198 adev->ip_blocks[i].version->funcs->name, r); 2199 return r; 2200 } 2201 } 2202 2203 adev->ip_blocks[i].status.hw = true; 2204 break; 2205 } 2206 } 2207 2208 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2209 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2210 2211 return r; 2212 } 2213 2214 /** 2215 * amdgpu_device_ip_init - run init for hardware IPs 2216 * 2217 * @adev: amdgpu_device pointer 2218 * 2219 * Main initialization pass for hardware IPs. The list of all the hardware 2220 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2221 * are run. sw_init initializes the software state associated with each IP 2222 * and hw_init initializes the hardware associated with each IP. 2223 * Returns 0 on success, negative error code on failure. 
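 *
 * Note that, as the function body below shows, the GMC block is hw-initialized
 * early so that GPU memory for the VRAM scratch buffer, the writeback slots
 * and, when needed, the static CSA can be allocated; the remaining blocks are
 * then brought up in two phases with firmware loading in between.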
2224 */ 2225 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2226 { 2227 int i, r; 2228 2229 r = amdgpu_ras_init(adev); 2230 if (r) 2231 return r; 2232 2233 for (i = 0; i < adev->num_ip_blocks; i++) { 2234 if (!adev->ip_blocks[i].status.valid) 2235 continue; 2236 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2237 if (r) { 2238 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2239 adev->ip_blocks[i].version->funcs->name, r); 2240 goto init_failed; 2241 } 2242 adev->ip_blocks[i].status.sw = true; 2243 2244 /* need to do gmc hw init early so we can allocate gpu mem */ 2245 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2246 r = amdgpu_device_vram_scratch_init(adev); 2247 if (r) { 2248 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2249 goto init_failed; 2250 } 2251 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2252 if (r) { 2253 DRM_ERROR("hw_init %d failed %d\n", i, r); 2254 goto init_failed; 2255 } 2256 r = amdgpu_device_wb_init(adev); 2257 if (r) { 2258 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2259 goto init_failed; 2260 } 2261 adev->ip_blocks[i].status.hw = true; 2262 2263 /* right after GMC hw init, we create CSA */ 2264 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2265 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2266 AMDGPU_GEM_DOMAIN_VRAM, 2267 AMDGPU_CSA_SIZE); 2268 if (r) { 2269 DRM_ERROR("allocate CSA failed %d\n", r); 2270 goto init_failed; 2271 } 2272 } 2273 } 2274 } 2275 2276 if (amdgpu_sriov_vf(adev)) 2277 amdgpu_virt_init_data_exchange(adev); 2278 2279 r = amdgpu_ib_pool_init(adev); 2280 if (r) { 2281 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2282 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2283 goto init_failed; 2284 } 2285 2286 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2287 if (r) 2288 goto init_failed; 2289 2290 r = amdgpu_device_ip_hw_init_phase1(adev); 2291 if (r) 2292 goto init_failed; 2293 2294 r = amdgpu_device_fw_loading(adev); 2295 if (r) 2296 goto init_failed; 2297 2298 r = amdgpu_device_ip_hw_init_phase2(adev); 2299 if (r) 2300 goto init_failed; 2301 2302 /* 2303 * retired pages will be loaded from eeprom and reserved here, 2304 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2305 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2306 * for I2C communication which only true at this point. 2307 * 2308 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2309 * failure from bad gpu situation and stop amdgpu init process 2310 * accordingly. For other failed cases, it will still release all 2311 * the resource and print error message, rather than returning one 2312 * negative value to upper level. 
 *
 * Note: theoretically, this should be called before all VRAM allocations
 * so that retired pages are reserved before anything else can use them.
 */
	r = amdgpu_ras_recovery_init(adev);
	if (r)
		goto init_failed;

	if (adev->gmc.xgmi.num_physical_nodes > 1)
		amdgpu_xgmi_add_device(adev);

	/* Don't init kfd if the whole hive needs to be reset during init */
	if (!adev->gmc.xgmi.pending_reset)
		amdgpu_amdkfd_device_init(adev);

	amdgpu_fru_get_product_info(adev);

init_failed:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM have been lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run. During late init this enables
 * clockgating for the hardware IPs; during fini or suspend it disables
 * clockgating again.
 * Returns 0 on success, negative error code on failure.
 */

int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
			       enum amd_clockgating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_CG_STATE_GATE ?
		    j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for GFX on S0ix */
		if (adev->in_s0ix &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
										      state);
			if (r) {
				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}

	return 0;
}

int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
			       enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX on S0ix */
		if (adev->in_s0ix &&
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
										      state);
			if (r) {
				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * The MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system.
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
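 *
 * As the function body below shows, this pass also enables clock- and
 * powergating, records the reset magic, enables the MGPU fan boost feature
 * and, for XGMI configurations, lowers the link p-state once all devices in
 * the hive have initialized.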
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
	if (adev->asic_type == CHIP_ARCTURUS &&
	    amdgpu_passthrough(adev) &&
	    adev->gmc.xgmi.num_physical_nodes > 1)
		smu_set_light_sbr(&adev->smu, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset the device p-state to low, as it boots in the high
		 * p-state.
		 *
		 * This should be performed only after all devices from the
		 * same hive have been initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance; it is counted one by one as each device
		 * initializes.
		 *
		 * So we wait until all XGMI-interlinked devices have
		 * initialized. This may introduce some delay, since those
		 * devices may come from different hives, but that should be OK.
2565 */ 2566 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2567 for (i = 0; i < mgpu_info.num_gpu; i++) { 2568 gpu_instance = &(mgpu_info.gpu_ins[i]); 2569 if (gpu_instance->adev->flags & AMD_IS_APU) 2570 continue; 2571 2572 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2573 AMDGPU_XGMI_PSTATE_MIN); 2574 if (r) { 2575 DRM_ERROR("pstate setting failed (%d).\n", r); 2576 break; 2577 } 2578 } 2579 } 2580 2581 mutex_unlock(&mgpu_info.mutex); 2582 } 2583 2584 return 0; 2585 } 2586 2587 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2588 { 2589 int i, r; 2590 2591 for (i = 0; i < adev->num_ip_blocks; i++) { 2592 if (!adev->ip_blocks[i].version->funcs->early_fini) 2593 continue; 2594 2595 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2596 if (r) { 2597 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2598 adev->ip_blocks[i].version->funcs->name, r); 2599 } 2600 } 2601 2602 amdgpu_amdkfd_suspend(adev, false); 2603 2604 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2605 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2606 2607 /* need to disable SMC first */ 2608 for (i = 0; i < adev->num_ip_blocks; i++) { 2609 if (!adev->ip_blocks[i].status.hw) 2610 continue; 2611 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2612 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2613 /* XXX handle errors */ 2614 if (r) { 2615 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2616 adev->ip_blocks[i].version->funcs->name, r); 2617 } 2618 adev->ip_blocks[i].status.hw = false; 2619 break; 2620 } 2621 } 2622 2623 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2624 if (!adev->ip_blocks[i].status.hw) 2625 continue; 2626 2627 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2628 /* XXX handle errors */ 2629 if (r) { 2630 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2631 adev->ip_blocks[i].version->funcs->name, r); 2632 } 2633 2634 adev->ip_blocks[i].status.hw = false; 2635 } 2636 2637 return 0; 2638 } 2639 2640 /** 2641 * amdgpu_device_ip_fini - run fini for hardware IPs 2642 * 2643 * @adev: amdgpu_device pointer 2644 * 2645 * Main teardown pass for hardware IPs. The list of all the hardware 2646 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2647 * are run. hw_fini tears down the hardware associated with each IP 2648 * and sw_fini tears down any software state associated with each IP. 2649 * Returns 0 on success, negative error code on failure. 
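 *
 * When the GMC block is torn down, the ucode BO, the static CSA, the
 * writeback slots, the VRAM scratch buffer and the IB pool are freed as
 * well (see the function body below).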
2650 */ 2651 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2652 { 2653 int i, r; 2654 2655 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2656 amdgpu_virt_release_ras_err_handler_data(adev); 2657 2658 amdgpu_ras_pre_fini(adev); 2659 2660 if (adev->gmc.xgmi.num_physical_nodes > 1) 2661 amdgpu_xgmi_remove_device(adev); 2662 2663 amdgpu_amdkfd_device_fini_sw(adev); 2664 2665 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2666 if (!adev->ip_blocks[i].status.sw) 2667 continue; 2668 2669 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2670 amdgpu_ucode_free_bo(adev); 2671 amdgpu_free_static_csa(&adev->virt.csa_obj); 2672 amdgpu_device_wb_fini(adev); 2673 amdgpu_device_vram_scratch_fini(adev); 2674 amdgpu_ib_pool_fini(adev); 2675 } 2676 2677 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2678 /* XXX handle errors */ 2679 if (r) { 2680 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2681 adev->ip_blocks[i].version->funcs->name, r); 2682 } 2683 adev->ip_blocks[i].status.sw = false; 2684 adev->ip_blocks[i].status.valid = false; 2685 } 2686 2687 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2688 if (!adev->ip_blocks[i].status.late_initialized) 2689 continue; 2690 if (adev->ip_blocks[i].version->funcs->late_fini) 2691 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2692 adev->ip_blocks[i].status.late_initialized = false; 2693 } 2694 2695 amdgpu_ras_fini(adev); 2696 2697 if (amdgpu_sriov_vf(adev)) 2698 if (amdgpu_virt_release_full_gpu(adev, false)) 2699 DRM_ERROR("failed to release exclusive mode on fini\n"); 2700 2701 return 0; 2702 } 2703 2704 /** 2705 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2706 * 2707 * @work: work_struct. 2708 */ 2709 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2710 { 2711 struct amdgpu_device *adev = 2712 container_of(work, struct amdgpu_device, delayed_init_work.work); 2713 int r; 2714 2715 r = amdgpu_ib_ring_tests(adev); 2716 if (r) 2717 DRM_ERROR("ib ring test failed (%d).\n", r); 2718 } 2719 2720 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2721 { 2722 struct amdgpu_device *adev = 2723 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2724 2725 mutex_lock(&adev->gfx.gfx_off_mutex); 2726 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2727 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2728 adev->gfx.gfx_off_state = true; 2729 } 2730 mutex_unlock(&adev->gfx.gfx_off_mutex); 2731 } 2732 2733 /** 2734 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2735 * 2736 * @adev: amdgpu_device pointer 2737 * 2738 * Main suspend function for hardware IPs. The list of all the hardware 2739 * IPs that make up the asic is walked, clockgating is disabled and the 2740 * suspend callbacks are run. suspend puts the hardware and software state 2741 * in each IP into a state suitable for suspend. 2742 * Returns 0 on success, negative error code on failure. 
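 *
 * Phase 1 only suspends the display (DCE) blocks; all other blocks are
 * handled in amdgpu_device_ip_suspend_phase2().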
2743 */ 2744 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2745 { 2746 int i, r; 2747 2748 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2749 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2750 2751 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2752 if (!adev->ip_blocks[i].status.valid) 2753 continue; 2754 2755 /* displays are handled separately */ 2756 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2757 continue; 2758 2759 /* XXX handle errors */ 2760 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2761 /* XXX handle errors */ 2762 if (r) { 2763 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2764 adev->ip_blocks[i].version->funcs->name, r); 2765 return r; 2766 } 2767 2768 adev->ip_blocks[i].status.hw = false; 2769 } 2770 2771 return 0; 2772 } 2773 2774 /** 2775 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2776 * 2777 * @adev: amdgpu_device pointer 2778 * 2779 * Main suspend function for hardware IPs. The list of all the hardware 2780 * IPs that make up the asic is walked, clockgating is disabled and the 2781 * suspend callbacks are run. suspend puts the hardware and software state 2782 * in each IP into a state suitable for suspend. 2783 * Returns 0 on success, negative error code on failure. 2784 */ 2785 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2786 { 2787 int i, r; 2788 2789 if (adev->in_s0ix) 2790 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry); 2791 2792 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2793 if (!adev->ip_blocks[i].status.valid) 2794 continue; 2795 /* displays are handled in phase1 */ 2796 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2797 continue; 2798 /* PSP lost connection when err_event_athub occurs */ 2799 if (amdgpu_ras_intr_triggered() && 2800 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2801 adev->ip_blocks[i].status.hw = false; 2802 continue; 2803 } 2804 2805 /* skip unnecessary suspend if we do not initialize them yet */ 2806 if (adev->gmc.xgmi.pending_reset && 2807 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2808 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2809 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2810 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2811 adev->ip_blocks[i].status.hw = false; 2812 continue; 2813 } 2814 2815 /* skip suspend of gfx and psp for S0ix 2816 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2817 * like at runtime. PSP is also part of the always on hardware 2818 * so no need to suspend it. 
2819 */ 2820 if (adev->in_s0ix && 2821 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2822 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2823 continue; 2824 2825 /* XXX handle errors */ 2826 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2827 /* XXX handle errors */ 2828 if (r) { 2829 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2830 adev->ip_blocks[i].version->funcs->name, r); 2831 } 2832 adev->ip_blocks[i].status.hw = false; 2833 /* handle putting the SMC in the appropriate state */ 2834 if(!amdgpu_sriov_vf(adev)){ 2835 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2836 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2837 if (r) { 2838 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2839 adev->mp1_state, r); 2840 return r; 2841 } 2842 } 2843 } 2844 } 2845 2846 return 0; 2847 } 2848 2849 /** 2850 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2851 * 2852 * @adev: amdgpu_device pointer 2853 * 2854 * Main suspend function for hardware IPs. The list of all the hardware 2855 * IPs that make up the asic is walked, clockgating is disabled and the 2856 * suspend callbacks are run. suspend puts the hardware and software state 2857 * in each IP into a state suitable for suspend. 2858 * Returns 0 on success, negative error code on failure. 2859 */ 2860 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2861 { 2862 int r; 2863 2864 if (amdgpu_sriov_vf(adev)) { 2865 amdgpu_virt_fini_data_exchange(adev); 2866 amdgpu_virt_request_full_gpu(adev, false); 2867 } 2868 2869 r = amdgpu_device_ip_suspend_phase1(adev); 2870 if (r) 2871 return r; 2872 r = amdgpu_device_ip_suspend_phase2(adev); 2873 2874 if (amdgpu_sriov_vf(adev)) 2875 amdgpu_virt_release_full_gpu(adev, false); 2876 2877 return r; 2878 } 2879 2880 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2881 { 2882 int i, r; 2883 2884 static enum amd_ip_block_type ip_order[] = { 2885 AMD_IP_BLOCK_TYPE_GMC, 2886 AMD_IP_BLOCK_TYPE_COMMON, 2887 AMD_IP_BLOCK_TYPE_PSP, 2888 AMD_IP_BLOCK_TYPE_IH, 2889 }; 2890 2891 for (i = 0; i < adev->num_ip_blocks; i++) { 2892 int j; 2893 struct amdgpu_ip_block *block; 2894 2895 block = &adev->ip_blocks[i]; 2896 block->status.hw = false; 2897 2898 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 2899 2900 if (block->version->type != ip_order[j] || 2901 !block->status.valid) 2902 continue; 2903 2904 r = block->version->funcs->hw_init(adev); 2905 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2906 if (r) 2907 return r; 2908 block->status.hw = true; 2909 } 2910 } 2911 2912 return 0; 2913 } 2914 2915 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2916 { 2917 int i, r; 2918 2919 static enum amd_ip_block_type ip_order[] = { 2920 AMD_IP_BLOCK_TYPE_SMC, 2921 AMD_IP_BLOCK_TYPE_DCE, 2922 AMD_IP_BLOCK_TYPE_GFX, 2923 AMD_IP_BLOCK_TYPE_SDMA, 2924 AMD_IP_BLOCK_TYPE_UVD, 2925 AMD_IP_BLOCK_TYPE_VCE, 2926 AMD_IP_BLOCK_TYPE_VCN 2927 }; 2928 2929 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2930 int j; 2931 struct amdgpu_ip_block *block; 2932 2933 for (j = 0; j < adev->num_ip_blocks; j++) { 2934 block = &adev->ip_blocks[j]; 2935 2936 if (block->version->type != ip_order[i] || 2937 !block->status.valid || 2938 block->status.hw) 2939 continue; 2940 2941 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 2942 r = block->version->funcs->resume(adev); 2943 else 2944 r = block->version->funcs->hw_init(adev); 2945 2946 DRM_INFO("RE-INIT-late: %s %s\n", 
				 block->version->funcs->name, r?"failed":"succeeded");
			if (r)
				return r;
			block->status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * First resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * COMMON, GMC, and IH. resume puts the hardware into a functional state
 * after a suspend and updates the software state as necessary. This
 * function is also used for restoring the GPU after a GPU reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {

			r = adev->ip_blocks[i].version->funcs->resume(adev);
			if (r) {
				DRM_ERROR("resume of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
			adev->ip_blocks[i].status.hw = true;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Second resume function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the resume callbacks are run for
 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
 * functional state after a suspend and updates the software state as
 * necessary. This function is also used for restoring the GPU after a GPU
 * reset.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
			continue;
		r = adev->ip_blocks[i].version->funcs->resume(adev);
		if (r) {
			DRM_ERROR("resume of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
			return r;
		}
		adev->ip_blocks[i].status.hw = true;
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs are split into
 * two resume functions because they are also used in recovering from a GPU
 * reset and some additional steps need to be taken between them. In this
 * case (S3/S4) they are run sequentially.
 * Returns 0 on success, negative error code on failure.
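 *
 * As the function body below shows, amdgpu_device_fw_loading() runs between
 * the two phases so that the required firmware is in place before the
 * remaining blocks are resumed.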
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#if defined(CONFIG_DRM_AMD_DC)
#if defined(CONFIG_DRM_AMD_DC_SI)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
3109 */ 3110 return amdgpu_dc > 0; 3111 case CHIP_HAWAII: 3112 case CHIP_CARRIZO: 3113 case CHIP_STONEY: 3114 case CHIP_POLARIS10: 3115 case CHIP_POLARIS11: 3116 case CHIP_POLARIS12: 3117 case CHIP_VEGAM: 3118 case CHIP_TONGA: 3119 case CHIP_FIJI: 3120 case CHIP_VEGA10: 3121 case CHIP_VEGA12: 3122 case CHIP_VEGA20: 3123 #if defined(CONFIG_DRM_AMD_DC_DCN) 3124 case CHIP_RAVEN: 3125 case CHIP_NAVI10: 3126 case CHIP_NAVI14: 3127 case CHIP_NAVI12: 3128 case CHIP_RENOIR: 3129 case CHIP_SIENNA_CICHLID: 3130 case CHIP_NAVY_FLOUNDER: 3131 case CHIP_DIMGREY_CAVEFISH: 3132 case CHIP_BEIGE_GOBY: 3133 case CHIP_VANGOGH: 3134 #endif 3135 return amdgpu_dc != 0; 3136 #endif 3137 default: 3138 if (amdgpu_dc > 0) 3139 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3140 "but isn't supported by ASIC, ignoring\n"); 3141 return false; 3142 } 3143 } 3144 3145 /** 3146 * amdgpu_device_has_dc_support - check if dc is supported 3147 * 3148 * @adev: amdgpu_device pointer 3149 * 3150 * Returns true for supported, false for not supported 3151 */ 3152 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3153 { 3154 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) 3155 return false; 3156 3157 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3158 } 3159 3160 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3161 { 3162 struct amdgpu_device *adev = 3163 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3164 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3165 3166 /* It's a bug to not have a hive within this function */ 3167 if (WARN_ON(!hive)) 3168 return; 3169 3170 /* 3171 * Use task barrier to synchronize all xgmi reset works across the 3172 * hive. task_barrier_enter and task_barrier_exit will block 3173 * until all the threads running the xgmi reset works reach 3174 * those points. task_barrier_full will do both blocks. 3175 */ 3176 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3177 3178 task_barrier_enter(&hive->tb); 3179 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3180 3181 if (adev->asic_reset_res) 3182 goto fail; 3183 3184 task_barrier_exit(&hive->tb); 3185 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3186 3187 if (adev->asic_reset_res) 3188 goto fail; 3189 3190 if (adev->mmhub.ras_funcs && 3191 adev->mmhub.ras_funcs->reset_ras_error_count) 3192 adev->mmhub.ras_funcs->reset_ras_error_count(adev); 3193 } else { 3194 3195 task_barrier_full(&hive->tb); 3196 adev->asic_reset_res = amdgpu_asic_reset(adev); 3197 } 3198 3199 fail: 3200 if (adev->asic_reset_res) 3201 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3202 adev->asic_reset_res, adev_to_drm(adev)->unique); 3203 amdgpu_put_xgmi_hive(hive); 3204 } 3205 3206 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3207 { 3208 char *input = amdgpu_lockup_timeout; 3209 char *timeout_setting = NULL; 3210 int index = 0; 3211 long timeout; 3212 int ret = 0; 3213 3214 /* 3215 * By default timeout for non compute jobs is 10000 3216 * and 60000 for compute jobs. 3217 * In SR-IOV or passthrough mode, timeout for compute 3218 * jobs are 60000 by default. 3219 */ 3220 adev->gfx_timeout = msecs_to_jiffies(10000); 3221 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3222 if (amdgpu_sriov_vf(adev)) 3223 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3224 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3225 else 3226 adev->compute_timeout = msecs_to_jiffies(60000); 3227 3228 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3229 while ((timeout_setting = strsep(&input, ",")) && 3230 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3231 ret = kstrtol(timeout_setting, 0, &timeout); 3232 if (ret) 3233 return ret; 3234 3235 if (timeout == 0) { 3236 index++; 3237 continue; 3238 } else if (timeout < 0) { 3239 timeout = MAX_SCHEDULE_TIMEOUT; 3240 } else { 3241 timeout = msecs_to_jiffies(timeout); 3242 } 3243 3244 switch (index++) { 3245 case 0: 3246 adev->gfx_timeout = timeout; 3247 break; 3248 case 1: 3249 adev->compute_timeout = timeout; 3250 break; 3251 case 2: 3252 adev->sdma_timeout = timeout; 3253 break; 3254 case 3: 3255 adev->video_timeout = timeout; 3256 break; 3257 default: 3258 break; 3259 } 3260 } 3261 /* 3262 * There is only one value specified and 3263 * it should apply to all non-compute jobs. 3264 */ 3265 if (index == 1) { 3266 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3267 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3268 adev->compute_timeout = adev->gfx_timeout; 3269 } 3270 } 3271 3272 return ret; 3273 } 3274 3275 static const struct attribute *amdgpu_dev_attributes[] = { 3276 &dev_attr_product_name.attr, 3277 &dev_attr_product_number.attr, 3278 &dev_attr_serial_number.attr, 3279 &dev_attr_pcie_replay_count.attr, 3280 NULL 3281 }; 3282 3283 /** 3284 * amdgpu_device_init - initialize the driver 3285 * 3286 * @adev: amdgpu_device pointer 3287 * @flags: driver flags 3288 * 3289 * Initializes the driver info and hw (all asics). 3290 * Returns 0 for success or an error on failure. 3291 * Called at driver startup. 3292 */ 3293 int amdgpu_device_init(struct amdgpu_device *adev, 3294 uint32_t flags) 3295 { 3296 struct drm_device *ddev = adev_to_drm(adev); 3297 struct pci_dev *pdev = adev->pdev; 3298 int r, i; 3299 bool px = false; 3300 u32 max_MBps; 3301 3302 adev->shutdown = false; 3303 adev->flags = flags; 3304 3305 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3306 adev->asic_type = amdgpu_force_asic_type; 3307 else 3308 adev->asic_type = flags & AMD_ASIC_MASK; 3309 3310 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3311 if (amdgpu_emu_mode == 1) 3312 adev->usec_timeout *= 10; 3313 adev->gmc.gart_size = 512 * 1024 * 1024; 3314 adev->accel_working = false; 3315 adev->num_rings = 0; 3316 adev->mman.buffer_funcs = NULL; 3317 adev->mman.buffer_funcs_ring = NULL; 3318 adev->vm_manager.vm_pte_funcs = NULL; 3319 adev->vm_manager.vm_pte_num_scheds = 0; 3320 adev->gmc.gmc_funcs = NULL; 3321 adev->harvest_ip_mask = 0x0; 3322 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3323 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3324 3325 adev->smc_rreg = &amdgpu_invalid_rreg; 3326 adev->smc_wreg = &amdgpu_invalid_wreg; 3327 adev->pcie_rreg = &amdgpu_invalid_rreg; 3328 adev->pcie_wreg = &amdgpu_invalid_wreg; 3329 adev->pciep_rreg = &amdgpu_invalid_rreg; 3330 adev->pciep_wreg = &amdgpu_invalid_wreg; 3331 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3332 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3333 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3334 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3335 adev->didt_rreg = &amdgpu_invalid_rreg; 3336 adev->didt_wreg = &amdgpu_invalid_wreg; 3337 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3338 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3339 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3340 
adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3341 3342 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3343 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3344 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3345 3346 /* mutex initialization are all done here so we 3347 * can recall function without having locking issues */ 3348 mutex_init(&adev->firmware.mutex); 3349 mutex_init(&adev->pm.mutex); 3350 mutex_init(&adev->gfx.gpu_clock_mutex); 3351 mutex_init(&adev->srbm_mutex); 3352 mutex_init(&adev->gfx.pipe_reserve_mutex); 3353 mutex_init(&adev->gfx.gfx_off_mutex); 3354 mutex_init(&adev->grbm_idx_mutex); 3355 mutex_init(&adev->mn_lock); 3356 mutex_init(&adev->virt.vf_errors.lock); 3357 hash_init(adev->mn_hash); 3358 atomic_set(&adev->in_gpu_reset, 0); 3359 init_rwsem(&adev->reset_sem); 3360 mutex_init(&adev->psp.mutex); 3361 mutex_init(&adev->notifier_lock); 3362 3363 r = amdgpu_device_check_arguments(adev); 3364 if (r) 3365 return r; 3366 3367 spin_lock_init(&adev->mmio_idx_lock); 3368 spin_lock_init(&adev->smc_idx_lock); 3369 spin_lock_init(&adev->pcie_idx_lock); 3370 spin_lock_init(&adev->uvd_ctx_idx_lock); 3371 spin_lock_init(&adev->didt_idx_lock); 3372 spin_lock_init(&adev->gc_cac_idx_lock); 3373 spin_lock_init(&adev->se_cac_idx_lock); 3374 spin_lock_init(&adev->audio_endpt_idx_lock); 3375 spin_lock_init(&adev->mm_stats.lock); 3376 3377 INIT_LIST_HEAD(&adev->shadow_list); 3378 mutex_init(&adev->shadow_list_lock); 3379 3380 INIT_LIST_HEAD(&adev->reset_list); 3381 3382 INIT_DELAYED_WORK(&adev->delayed_init_work, 3383 amdgpu_device_delayed_init_work_handler); 3384 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3385 amdgpu_device_delay_enable_gfx_off); 3386 3387 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3388 3389 adev->gfx.gfx_off_req_count = 1; 3390 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3391 3392 atomic_set(&adev->throttling_logging_enabled, 1); 3393 /* 3394 * If throttling continues, logging will be performed every minute 3395 * to avoid log flooding. "-1" is subtracted since the thermal 3396 * throttling interrupt comes every second. Thus, the total logging 3397 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3398 * for throttling interrupt) = 60 seconds. 
3399 */ 3400 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3401 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3402 3403 /* Registers mapping */ 3404 /* TODO: block userspace mapping of io register */ 3405 if (adev->asic_type >= CHIP_BONAIRE) { 3406 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3407 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3408 } else { 3409 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3410 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3411 } 3412 3413 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3414 if (adev->rmmio == NULL) { 3415 return -ENOMEM; 3416 } 3417 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3418 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3419 3420 /* enable PCIE atomic ops */ 3421 r = pci_enable_atomic_ops_to_root(adev->pdev, 3422 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3423 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3424 if (r) { 3425 adev->have_atomics_support = false; 3426 DRM_INFO("PCIE atomic ops is not supported\n"); 3427 } else { 3428 adev->have_atomics_support = true; 3429 } 3430 3431 amdgpu_device_get_pcie_info(adev); 3432 3433 if (amdgpu_mcbp) 3434 DRM_INFO("MCBP is enabled\n"); 3435 3436 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3437 adev->enable_mes = true; 3438 3439 /* detect hw virtualization here */ 3440 amdgpu_detect_virtualization(adev); 3441 3442 r = amdgpu_device_get_job_timeout_settings(adev); 3443 if (r) { 3444 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3445 goto failed_unmap; 3446 } 3447 3448 /* early init functions */ 3449 r = amdgpu_device_ip_early_init(adev); 3450 if (r) 3451 goto failed_unmap; 3452 3453 /* doorbell bar mapping and doorbell index init*/ 3454 amdgpu_device_doorbell_init(adev); 3455 3456 if (amdgpu_emu_mode == 1) { 3457 /* post the asic on emulation mode */ 3458 emu_soc_asic_init(adev); 3459 goto fence_driver_init; 3460 } 3461 3462 amdgpu_reset_init(adev); 3463 3464 /* detect if we are with an SRIOV vbios */ 3465 amdgpu_device_detect_sriov_bios(adev); 3466 3467 /* check if we need to reset the asic 3468 * E.g., driver was not cleanly unloaded previously, etc. 
3469 */ 3470 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3471 if (adev->gmc.xgmi.num_physical_nodes) { 3472 dev_info(adev->dev, "Pending hive reset.\n"); 3473 adev->gmc.xgmi.pending_reset = true; 3474 /* Only need to init necessary block for SMU to handle the reset */ 3475 for (i = 0; i < adev->num_ip_blocks; i++) { 3476 if (!adev->ip_blocks[i].status.valid) 3477 continue; 3478 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3479 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3480 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3481 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3482 DRM_DEBUG("IP %s disabled for hw_init.\n", 3483 adev->ip_blocks[i].version->funcs->name); 3484 adev->ip_blocks[i].status.hw = true; 3485 } 3486 } 3487 } else { 3488 r = amdgpu_asic_reset(adev); 3489 if (r) { 3490 dev_err(adev->dev, "asic reset on init failed\n"); 3491 goto failed; 3492 } 3493 } 3494 } 3495 3496 pci_enable_pcie_error_reporting(adev->pdev); 3497 3498 /* Post card if necessary */ 3499 if (amdgpu_device_need_post(adev)) { 3500 if (!adev->bios) { 3501 dev_err(adev->dev, "no vBIOS found\n"); 3502 r = -EINVAL; 3503 goto failed; 3504 } 3505 DRM_INFO("GPU posting now...\n"); 3506 r = amdgpu_device_asic_init(adev); 3507 if (r) { 3508 dev_err(adev->dev, "gpu post error!\n"); 3509 goto failed; 3510 } 3511 } 3512 3513 if (adev->is_atom_fw) { 3514 /* Initialize clocks */ 3515 r = amdgpu_atomfirmware_get_clock_info(adev); 3516 if (r) { 3517 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3518 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3519 goto failed; 3520 } 3521 } else { 3522 /* Initialize clocks */ 3523 r = amdgpu_atombios_get_clock_info(adev); 3524 if (r) { 3525 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3526 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3527 goto failed; 3528 } 3529 /* init i2c buses */ 3530 if (!amdgpu_device_has_dc_support(adev)) 3531 amdgpu_atombios_i2c_init(adev); 3532 } 3533 3534 fence_driver_init: 3535 /* Fence driver */ 3536 r = amdgpu_fence_driver_init(adev); 3537 if (r) { 3538 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 3539 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3540 goto failed; 3541 } 3542 3543 /* init the mode config */ 3544 drm_mode_config_init(adev_to_drm(adev)); 3545 3546 r = amdgpu_device_ip_init(adev); 3547 if (r) { 3548 /* failed in exclusive mode due to timeout */ 3549 if (amdgpu_sriov_vf(adev) && 3550 !amdgpu_sriov_runtime(adev) && 3551 amdgpu_virt_mmio_blocked(adev) && 3552 !amdgpu_virt_wait_reset(adev)) { 3553 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3554 /* Don't send request since VF is inactive. */ 3555 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3556 adev->virt.ops = NULL; 3557 r = -EAGAIN; 3558 goto release_ras_con; 3559 } 3560 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3561 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3562 goto release_ras_con; 3563 } 3564 3565 dev_info(adev->dev, 3566 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3567 adev->gfx.config.max_shader_engines, 3568 adev->gfx.config.max_sh_per_se, 3569 adev->gfx.config.max_cu_per_sh, 3570 adev->gfx.cu_info.number); 3571 3572 adev->accel_working = true; 3573 3574 amdgpu_vm_check_compute_bug(adev); 3575 3576 /* Initialize the buffer migration limit. 
*/ 3577 if (amdgpu_moverate >= 0) 3578 max_MBps = amdgpu_moverate; 3579 else 3580 max_MBps = 8; /* Allow 8 MB/s. */ 3581 /* Get a log2 for easy divisions. */ 3582 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3583 3584 amdgpu_fbdev_init(adev); 3585 3586 r = amdgpu_pm_sysfs_init(adev); 3587 if (r) { 3588 adev->pm_sysfs_en = false; 3589 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3590 } else 3591 adev->pm_sysfs_en = true; 3592 3593 r = amdgpu_ucode_sysfs_init(adev); 3594 if (r) { 3595 adev->ucode_sysfs_en = false; 3596 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3597 } else 3598 adev->ucode_sysfs_en = true; 3599 3600 if ((amdgpu_testing & 1)) { 3601 if (adev->accel_working) 3602 amdgpu_test_moves(adev); 3603 else 3604 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3605 } 3606 if (amdgpu_benchmarking) { 3607 if (adev->accel_working) 3608 amdgpu_benchmark(adev, amdgpu_benchmarking); 3609 else 3610 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3611 } 3612 3613 /* 3614 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3615 * Otherwise the mgpu fan boost feature will be skipped due to the 3616 * gpu instance is counted less. 3617 */ 3618 amdgpu_register_gpu_instance(adev); 3619 3620 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3621 * explicit gating rather than handling it automatically. 3622 */ 3623 if (!adev->gmc.xgmi.pending_reset) { 3624 r = amdgpu_device_ip_late_init(adev); 3625 if (r) { 3626 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3627 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3628 goto release_ras_con; 3629 } 3630 /* must succeed. */ 3631 amdgpu_ras_resume(adev); 3632 queue_delayed_work(system_wq, &adev->delayed_init_work, 3633 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3634 } 3635 3636 if (amdgpu_sriov_vf(adev)) 3637 flush_delayed_work(&adev->delayed_init_work); 3638 3639 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3640 if (r) 3641 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3642 3643 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3644 r = amdgpu_pmu_init(adev); 3645 if (r) 3646 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3647 3648 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3649 if (amdgpu_device_cache_pci_state(adev->pdev)) 3650 pci_restore_state(pdev); 3651 3652 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3653 /* this will fail for cards that aren't VGA class devices, just 3654 * ignore it */ 3655 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3656 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 3657 3658 if (amdgpu_device_supports_px(ddev)) { 3659 px = true; 3660 vga_switcheroo_register_client(adev->pdev, 3661 &amdgpu_switcheroo_ops, px); 3662 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3663 } 3664 3665 if (adev->gmc.xgmi.pending_reset) 3666 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3667 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3668 3669 return 0; 3670 3671 release_ras_con: 3672 amdgpu_release_ras_context(adev); 3673 3674 failed: 3675 amdgpu_vf_error_trans_all(adev); 3676 3677 failed_unmap: 3678 iounmap(adev->rmmio); 3679 adev->rmmio = NULL; 3680 3681 return r; 3682 } 3683 3684 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3685 { 3686 /* Clear all CPU mappings pointing to this device */ 3687 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 
0, 1); 3688 3689 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 3690 amdgpu_device_doorbell_fini(adev); 3691 3692 iounmap(adev->rmmio); 3693 adev->rmmio = NULL; 3694 if (adev->mman.aper_base_kaddr) 3695 iounmap(adev->mman.aper_base_kaddr); 3696 adev->mman.aper_base_kaddr = NULL; 3697 3698 /* Memory manager related */ 3699 if (!adev->gmc.xgmi.connected_to_cpu) { 3700 arch_phys_wc_del(adev->gmc.vram_mtrr); 3701 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3702 } 3703 } 3704 3705 /** 3706 * amdgpu_device_fini - tear down the driver 3707 * 3708 * @adev: amdgpu_device pointer 3709 * 3710 * Tear down the driver info (all asics). 3711 * Called at driver shutdown. 3712 */ 3713 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3714 { 3715 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3716 flush_delayed_work(&adev->delayed_init_work); 3717 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3718 adev->shutdown = true; 3719 3720 /* make sure IB test finished before entering exclusive mode 3721 * to avoid preemption on IB test 3722 * */ 3723 if (amdgpu_sriov_vf(adev)) { 3724 amdgpu_virt_request_full_gpu(adev, false); 3725 amdgpu_virt_fini_data_exchange(adev); 3726 } 3727 3728 /* disable all interrupts */ 3729 amdgpu_irq_disable_all(adev); 3730 if (adev->mode_info.mode_config_initialized){ 3731 if (!amdgpu_device_has_dc_support(adev)) 3732 drm_helper_force_disable_all(adev_to_drm(adev)); 3733 else 3734 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3735 } 3736 amdgpu_fence_driver_fini_hw(adev); 3737 3738 if (adev->pm_sysfs_en) 3739 amdgpu_pm_sysfs_fini(adev); 3740 if (adev->ucode_sysfs_en) 3741 amdgpu_ucode_sysfs_fini(adev); 3742 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3743 3744 amdgpu_fbdev_fini(adev); 3745 3746 amdgpu_irq_fini_hw(adev); 3747 3748 amdgpu_device_ip_fini_early(adev); 3749 3750 amdgpu_gart_dummy_page_fini(adev); 3751 3752 amdgpu_device_unmap_mmio(adev); 3753 } 3754 3755 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 3756 { 3757 amdgpu_device_ip_fini(adev); 3758 amdgpu_fence_driver_fini_sw(adev); 3759 release_firmware(adev->firmware.gpu_info_fw); 3760 adev->firmware.gpu_info_fw = NULL; 3761 adev->accel_working = false; 3762 3763 amdgpu_reset_fini(adev); 3764 3765 /* free i2c buses */ 3766 if (!amdgpu_device_has_dc_support(adev)) 3767 amdgpu_i2c_fini(adev); 3768 3769 if (amdgpu_emu_mode != 1) 3770 amdgpu_atombios_fini(adev); 3771 3772 kfree(adev->bios); 3773 adev->bios = NULL; 3774 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 3775 vga_switcheroo_unregister_client(adev->pdev); 3776 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3777 } 3778 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3779 vga_client_register(adev->pdev, NULL, NULL, NULL); 3780 3781 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3782 amdgpu_pmu_fini(adev); 3783 if (adev->mman.discovery_bin) 3784 amdgpu_discovery_fini(adev); 3785 3786 kfree(adev->pci_state); 3787 3788 } 3789 3790 3791 /* 3792 * Suspend & resume. 3793 */ 3794 /** 3795 * amdgpu_device_suspend - initiate device suspend 3796 * 3797 * @dev: drm dev pointer 3798 * @fbcon : notify the fbdev of suspend 3799 * 3800 * Puts the hw in the suspend state (all asics). 3801 * Returns 0 for success or an error on failure. 3802 * Called at driver suspend. 
3803 */ 3804 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 3805 { 3806 struct amdgpu_device *adev = drm_to_adev(dev); 3807 3808 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3809 return 0; 3810 3811 adev->in_suspend = true; 3812 drm_kms_helper_poll_disable(dev); 3813 3814 if (fbcon) 3815 amdgpu_fbdev_set_suspend(adev, 1); 3816 3817 cancel_delayed_work_sync(&adev->delayed_init_work); 3818 3819 amdgpu_ras_suspend(adev); 3820 3821 amdgpu_device_ip_suspend_phase1(adev); 3822 3823 if (!adev->in_s0ix) 3824 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 3825 3826 /* evict vram memory */ 3827 amdgpu_bo_evict_vram(adev); 3828 3829 amdgpu_fence_driver_suspend(adev); 3830 3831 amdgpu_device_ip_suspend_phase2(adev); 3832 /* evict remaining vram memory 3833 * This second call to evict vram is to evict the gart page table 3834 * using the CPU. 3835 */ 3836 amdgpu_bo_evict_vram(adev); 3837 3838 return 0; 3839 } 3840 3841 /** 3842 * amdgpu_device_resume - initiate device resume 3843 * 3844 * @dev: drm dev pointer 3845 * @fbcon : notify the fbdev of resume 3846 * 3847 * Bring the hw back to operating state (all asics). 3848 * Returns 0 for success or an error on failure. 3849 * Called at driver resume. 3850 */ 3851 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 3852 { 3853 struct amdgpu_device *adev = drm_to_adev(dev); 3854 int r = 0; 3855 3856 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3857 return 0; 3858 3859 if (adev->in_s0ix) 3860 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry); 3861 3862 /* post card */ 3863 if (amdgpu_device_need_post(adev)) { 3864 r = amdgpu_device_asic_init(adev); 3865 if (r) 3866 dev_err(adev->dev, "amdgpu asic init failed\n"); 3867 } 3868 3869 r = amdgpu_device_ip_resume(adev); 3870 if (r) { 3871 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 3872 return r; 3873 } 3874 amdgpu_fence_driver_resume(adev); 3875 3876 3877 r = amdgpu_device_ip_late_init(adev); 3878 if (r) 3879 return r; 3880 3881 queue_delayed_work(system_wq, &adev->delayed_init_work, 3882 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3883 3884 if (!adev->in_s0ix) { 3885 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 3886 if (r) 3887 return r; 3888 } 3889 3890 /* Make sure IB tests flushed */ 3891 flush_delayed_work(&adev->delayed_init_work); 3892 3893 if (fbcon) 3894 amdgpu_fbdev_set_suspend(adev, 0); 3895 3896 drm_kms_helper_poll_enable(dev); 3897 3898 amdgpu_ras_resume(adev); 3899 3900 /* 3901 * Most of the connector probing functions try to acquire runtime pm 3902 * refs to ensure that the GPU is powered on when connector polling is 3903 * performed. Since we're calling this from a runtime PM callback, 3904 * trying to acquire rpm refs will cause us to deadlock. 3905 * 3906 * Since we're guaranteed to be holding the rpm lock, it's safe to 3907 * temporarily disable the rpm helpers so this doesn't deadlock us. 3908 */ 3909 #ifdef CONFIG_PM 3910 dev->dev->power.disable_depth++; 3911 #endif 3912 if (!amdgpu_device_has_dc_support(adev)) 3913 drm_helper_hpd_irq_event(dev); 3914 else 3915 drm_kms_helper_hotplug_event(dev); 3916 #ifdef CONFIG_PM 3917 dev->dev->power.disable_depth--; 3918 #endif 3919 adev->in_suspend = false; 3920 3921 return 0; 3922 } 3923 3924 /** 3925 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3926 * 3927 * @adev: amdgpu_device pointer 3928 * 3929 * The list of all the hardware IPs that make up the asic is walked and 3930 * the check_soft_reset callbacks are run. 
check_soft_reset determines 3931 * if the asic is still hung or not. 3932 * Returns true if any of the IPs are still in a hung state, false if not. 3933 */ 3934 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3935 { 3936 int i; 3937 bool asic_hang = false; 3938 3939 if (amdgpu_sriov_vf(adev)) 3940 return true; 3941 3942 if (amdgpu_asic_need_full_reset(adev)) 3943 return true; 3944 3945 for (i = 0; i < adev->num_ip_blocks; i++) { 3946 if (!adev->ip_blocks[i].status.valid) 3947 continue; 3948 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3949 adev->ip_blocks[i].status.hang = 3950 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3951 if (adev->ip_blocks[i].status.hang) { 3952 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3953 asic_hang = true; 3954 } 3955 } 3956 return asic_hang; 3957 } 3958 3959 /** 3960 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3961 * 3962 * @adev: amdgpu_device pointer 3963 * 3964 * The list of all the hardware IPs that make up the asic is walked and the 3965 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3966 * handles any IP specific hardware or software state changes that are 3967 * necessary for a soft reset to succeed. 3968 * Returns 0 on success, negative error code on failure. 3969 */ 3970 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3971 { 3972 int i, r = 0; 3973 3974 for (i = 0; i < adev->num_ip_blocks; i++) { 3975 if (!adev->ip_blocks[i].status.valid) 3976 continue; 3977 if (adev->ip_blocks[i].status.hang && 3978 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3979 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3980 if (r) 3981 return r; 3982 } 3983 } 3984 3985 return 0; 3986 } 3987 3988 /** 3989 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3990 * 3991 * @adev: amdgpu_device pointer 3992 * 3993 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3994 * reset is necessary to recover. 3995 * Returns true if a full asic reset is required, false if not. 3996 */ 3997 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3998 { 3999 int i; 4000 4001 if (amdgpu_asic_need_full_reset(adev)) 4002 return true; 4003 4004 for (i = 0; i < adev->num_ip_blocks; i++) { 4005 if (!adev->ip_blocks[i].status.valid) 4006 continue; 4007 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4008 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4009 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4010 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4011 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4012 if (adev->ip_blocks[i].status.hang) { 4013 dev_info(adev->dev, "Some block need full reset!\n"); 4014 return true; 4015 } 4016 } 4017 } 4018 return false; 4019 } 4020 4021 /** 4022 * amdgpu_device_ip_soft_reset - do a soft reset 4023 * 4024 * @adev: amdgpu_device pointer 4025 * 4026 * The list of all the hardware IPs that make up the asic is walked and the 4027 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4028 * IP specific hardware or software state changes that are necessary to soft 4029 * reset the IP. 4030 * Returns 0 on success, negative error code on failure. 
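 *
 * Illustrative ordering, matching how the pre-reset path later in this file
 * combines the soft-reset helpers (sketch only):
 *
 *	amdgpu_device_ip_pre_soft_reset(adev);
 *	r = amdgpu_device_ip_soft_reset(adev);
 *	amdgpu_device_ip_post_soft_reset(adev);
 *	if (r || amdgpu_device_ip_check_soft_reset(adev))
 *		need_full_reset = true;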
4031 */ 4032 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4033 { 4034 int i, r = 0; 4035 4036 for (i = 0; i < adev->num_ip_blocks; i++) { 4037 if (!adev->ip_blocks[i].status.valid) 4038 continue; 4039 if (adev->ip_blocks[i].status.hang && 4040 adev->ip_blocks[i].version->funcs->soft_reset) { 4041 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4042 if (r) 4043 return r; 4044 } 4045 } 4046 4047 return 0; 4048 } 4049 4050 /** 4051 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4052 * 4053 * @adev: amdgpu_device pointer 4054 * 4055 * The list of all the hardware IPs that make up the asic is walked and the 4056 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4057 * handles any IP specific hardware or software state changes that are 4058 * necessary after the IP has been soft reset. 4059 * Returns 0 on success, negative error code on failure. 4060 */ 4061 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4062 { 4063 int i, r = 0; 4064 4065 for (i = 0; i < adev->num_ip_blocks; i++) { 4066 if (!adev->ip_blocks[i].status.valid) 4067 continue; 4068 if (adev->ip_blocks[i].status.hang && 4069 adev->ip_blocks[i].version->funcs->post_soft_reset) 4070 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4071 if (r) 4072 return r; 4073 } 4074 4075 return 0; 4076 } 4077 4078 /** 4079 * amdgpu_device_recover_vram - Recover some VRAM contents 4080 * 4081 * @adev: amdgpu_device pointer 4082 * 4083 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4084 * restore things like GPUVM page tables after a GPU reset where 4085 * the contents of VRAM might be lost. 4086 * 4087 * Returns: 4088 * 0 on success, negative error code on failure. 4089 */ 4090 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4091 { 4092 struct dma_fence *fence = NULL, *next = NULL; 4093 struct amdgpu_bo *shadow; 4094 long r = 1, tmo; 4095 4096 if (amdgpu_sriov_runtime(adev)) 4097 tmo = msecs_to_jiffies(8000); 4098 else 4099 tmo = msecs_to_jiffies(100); 4100 4101 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4102 mutex_lock(&adev->shadow_list_lock); 4103 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 4104 4105 /* No need to recover an evicted BO */ 4106 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4107 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4108 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4109 continue; 4110 4111 r = amdgpu_bo_restore_shadow(shadow, &next); 4112 if (r) 4113 break; 4114 4115 if (fence) { 4116 tmo = dma_fence_wait_timeout(fence, false, tmo); 4117 dma_fence_put(fence); 4118 fence = next; 4119 if (tmo == 0) { 4120 r = -ETIMEDOUT; 4121 break; 4122 } else if (tmo < 0) { 4123 r = tmo; 4124 break; 4125 } 4126 } else { 4127 fence = next; 4128 } 4129 } 4130 mutex_unlock(&adev->shadow_list_lock); 4131 4132 if (fence) 4133 tmo = dma_fence_wait_timeout(fence, false, tmo); 4134 dma_fence_put(fence); 4135 4136 if (r < 0 || tmo <= 0) { 4137 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4138 return -EIO; 4139 } 4140 4141 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4142 return 0; 4143 } 4144 4145 4146 /** 4147 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4148 * 4149 * @adev: amdgpu_device pointer 4150 * @from_hypervisor: request from hypervisor 4151 * 4152 * do VF FLR and reinitialize Asic 4153 * return 0 means succeeded otherwise failed 4154 */ 4155 static int 
amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4156 bool from_hypervisor) 4157 { 4158 int r; 4159 4160 if (from_hypervisor) 4161 r = amdgpu_virt_request_full_gpu(adev, true); 4162 else 4163 r = amdgpu_virt_reset_gpu(adev); 4164 if (r) 4165 return r; 4166 4167 amdgpu_amdkfd_pre_reset(adev); 4168 4169 /* Resume IP prior to SMC */ 4170 r = amdgpu_device_ip_reinit_early_sriov(adev); 4171 if (r) 4172 goto error; 4173 4174 amdgpu_virt_init_data_exchange(adev); 4175 /* we need recover gart prior to run SMC/CP/SDMA resume */ 4176 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); 4177 4178 r = amdgpu_device_fw_loading(adev); 4179 if (r) 4180 return r; 4181 4182 /* now we are okay to resume SMC/CP/SDMA */ 4183 r = amdgpu_device_ip_reinit_late_sriov(adev); 4184 if (r) 4185 goto error; 4186 4187 amdgpu_irq_gpu_reset_resume_helper(adev); 4188 r = amdgpu_ib_ring_tests(adev); 4189 amdgpu_amdkfd_post_reset(adev); 4190 4191 error: 4192 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4193 amdgpu_inc_vram_lost(adev); 4194 r = amdgpu_device_recover_vram(adev); 4195 } 4196 amdgpu_virt_release_full_gpu(adev, true); 4197 4198 return r; 4199 } 4200 4201 /** 4202 * amdgpu_device_has_job_running - check if there is any job in mirror list 4203 * 4204 * @adev: amdgpu_device pointer 4205 * 4206 * check if there is any job in mirror list 4207 */ 4208 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4209 { 4210 int i; 4211 struct drm_sched_job *job; 4212 4213 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4214 struct amdgpu_ring *ring = adev->rings[i]; 4215 4216 if (!ring || !ring->sched.thread) 4217 continue; 4218 4219 spin_lock(&ring->sched.job_list_lock); 4220 job = list_first_entry_or_null(&ring->sched.pending_list, 4221 struct drm_sched_job, list); 4222 spin_unlock(&ring->sched.job_list_lock); 4223 if (job) 4224 return true; 4225 } 4226 return false; 4227 } 4228 4229 /** 4230 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4231 * 4232 * @adev: amdgpu_device pointer 4233 * 4234 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4235 * a hung GPU. 
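 *
 * Illustrative use from a job timeout handler (sketch only; the actual
 * handler lives in the job code, not in this file):
 *
 *	if (amdgpu_device_should_recover_gpu(ring->adev))
 *		amdgpu_device_gpu_recover(ring->adev, job);
 *	else
 *		drm_sched_suspend_timeout(&ring->sched);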
4236 */ 4237 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4238 { 4239 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4240 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4241 return false; 4242 } 4243 4244 if (amdgpu_gpu_recovery == 0) 4245 goto disabled; 4246 4247 if (amdgpu_sriov_vf(adev)) 4248 return true; 4249 4250 if (amdgpu_gpu_recovery == -1) { 4251 switch (adev->asic_type) { 4252 case CHIP_BONAIRE: 4253 case CHIP_HAWAII: 4254 case CHIP_TOPAZ: 4255 case CHIP_TONGA: 4256 case CHIP_FIJI: 4257 case CHIP_POLARIS10: 4258 case CHIP_POLARIS11: 4259 case CHIP_POLARIS12: 4260 case CHIP_VEGAM: 4261 case CHIP_VEGA20: 4262 case CHIP_VEGA10: 4263 case CHIP_VEGA12: 4264 case CHIP_RAVEN: 4265 case CHIP_ARCTURUS: 4266 case CHIP_RENOIR: 4267 case CHIP_NAVI10: 4268 case CHIP_NAVI14: 4269 case CHIP_NAVI12: 4270 case CHIP_SIENNA_CICHLID: 4271 case CHIP_NAVY_FLOUNDER: 4272 case CHIP_DIMGREY_CAVEFISH: 4273 case CHIP_VANGOGH: 4274 case CHIP_ALDEBARAN: 4275 break; 4276 default: 4277 goto disabled; 4278 } 4279 } 4280 4281 return true; 4282 4283 disabled: 4284 dev_info(adev->dev, "GPU recovery disabled.\n"); 4285 return false; 4286 } 4287 4288 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4289 { 4290 u32 i; 4291 int ret = 0; 4292 4293 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4294 4295 dev_info(adev->dev, "GPU mode1 reset\n"); 4296 4297 /* disable BM */ 4298 pci_clear_master(adev->pdev); 4299 4300 amdgpu_device_cache_pci_state(adev->pdev); 4301 4302 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4303 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4304 ret = amdgpu_dpm_mode1_reset(adev); 4305 } else { 4306 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4307 ret = psp_gpu_reset(adev); 4308 } 4309 4310 if (ret) 4311 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4312 4313 amdgpu_device_load_pci_state(adev->pdev); 4314 4315 /* wait for asic to come out of reset */ 4316 for (i = 0; i < adev->usec_timeout; i++) { 4317 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4318 4319 if (memsize != 0xffffffff) 4320 break; 4321 udelay(1); 4322 } 4323 4324 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4325 return ret; 4326 } 4327 4328 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4329 struct amdgpu_reset_context *reset_context) 4330 { 4331 int i, r = 0; 4332 struct amdgpu_job *job = NULL; 4333 bool need_full_reset = 4334 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4335 4336 if (reset_context->reset_req_dev == adev) 4337 job = reset_context->job; 4338 4339 /* no need to dump if device is not in good state during probe period */ 4340 if (!adev->gmc.xgmi.pending_reset) 4341 amdgpu_debugfs_wait_dump(adev); 4342 4343 if (amdgpu_sriov_vf(adev)) { 4344 /* stop the data exchange thread */ 4345 amdgpu_virt_fini_data_exchange(adev); 4346 } 4347 4348 /* block all schedulers and reset given job's ring */ 4349 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4350 struct amdgpu_ring *ring = adev->rings[i]; 4351 4352 if (!ring || !ring->sched.thread) 4353 continue; 4354 4355 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4356 amdgpu_fence_driver_force_completion(ring); 4357 } 4358 4359 if(job) 4360 drm_sched_increase_karma(&job->base); 4361 4362 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4363 /* If reset handler not implemented, continue; otherwise return */ 4364 if (r == -ENOSYS) 4365 r = 0; 4366 else 4367 return r; 4368 4369 /* Don't suspend on bare metal if we are not going to HW reset the ASIC 
*/ 4370 if (!amdgpu_sriov_vf(adev)) { 4371 4372 if (!need_full_reset) 4373 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4374 4375 if (!need_full_reset) { 4376 amdgpu_device_ip_pre_soft_reset(adev); 4377 r = amdgpu_device_ip_soft_reset(adev); 4378 amdgpu_device_ip_post_soft_reset(adev); 4379 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4380 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4381 need_full_reset = true; 4382 } 4383 } 4384 4385 if (need_full_reset) 4386 r = amdgpu_device_ip_suspend(adev); 4387 if (need_full_reset) 4388 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4389 else 4390 clear_bit(AMDGPU_NEED_FULL_RESET, 4391 &reset_context->flags); 4392 } 4393 4394 return r; 4395 } 4396 4397 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4398 struct amdgpu_reset_context *reset_context) 4399 { 4400 struct amdgpu_device *tmp_adev = NULL; 4401 bool need_full_reset, skip_hw_reset, vram_lost = false; 4402 int r = 0; 4403 4404 /* Try reset handler method first */ 4405 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4406 reset_list); 4407 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4408 /* If reset handler not implemented, continue; otherwise return */ 4409 if (r == -ENOSYS) 4410 r = 0; 4411 else 4412 return r; 4413 4414 /* Reset handler not implemented, use the default method */ 4415 need_full_reset = 4416 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4417 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4418 4419 /* 4420 * ASIC reset has to be done on all XGMI hive nodes ASAP 4421 * to allow proper links negotiation in FW (within 1 sec) 4422 */ 4423 if (!skip_hw_reset && need_full_reset) { 4424 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4425 /* For XGMI run all resets in parallel to speed up the process */ 4426 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4427 tmp_adev->gmc.xgmi.pending_reset = false; 4428 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4429 r = -EALREADY; 4430 } else 4431 r = amdgpu_asic_reset(tmp_adev); 4432 4433 if (r) { 4434 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4435 r, adev_to_drm(tmp_adev)->unique); 4436 break; 4437 } 4438 } 4439 4440 /* For XGMI wait for all resets to complete before proceed */ 4441 if (!r) { 4442 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4443 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4444 flush_work(&tmp_adev->xgmi_reset_work); 4445 r = tmp_adev->asic_reset_res; 4446 if (r) 4447 break; 4448 } 4449 } 4450 } 4451 } 4452 4453 if (!r && amdgpu_ras_intr_triggered()) { 4454 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4455 if (tmp_adev->mmhub.ras_funcs && 4456 tmp_adev->mmhub.ras_funcs->reset_ras_error_count) 4457 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev); 4458 } 4459 4460 amdgpu_ras_intr_cleared(); 4461 } 4462 4463 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4464 if (need_full_reset) { 4465 /* post card */ 4466 r = amdgpu_device_asic_init(tmp_adev); 4467 if (r) { 4468 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4469 } else { 4470 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4471 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4472 if (r) 4473 goto out; 4474 4475 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4476 if (vram_lost) { 4477 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4478 amdgpu_inc_vram_lost(tmp_adev); 4479 } 
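				/*
				 * The GART/GTT manager must be recovered before
				 * the firmware loading and phase2 IP resume below
				 * can proceed.
				 */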
4480 4481 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); 4482 if (r) 4483 goto out; 4484 4485 r = amdgpu_device_fw_loading(tmp_adev); 4486 if (r) 4487 return r; 4488 4489 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4490 if (r) 4491 goto out; 4492 4493 if (vram_lost) 4494 amdgpu_device_fill_reset_magic(tmp_adev); 4495 4496 /* 4497 * Add this ASIC as tracked as reset was already 4498 * complete successfully. 4499 */ 4500 amdgpu_register_gpu_instance(tmp_adev); 4501 4502 if (!reset_context->hive && 4503 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4504 amdgpu_xgmi_add_device(tmp_adev); 4505 4506 r = amdgpu_device_ip_late_init(tmp_adev); 4507 if (r) 4508 goto out; 4509 4510 amdgpu_fbdev_set_suspend(tmp_adev, 0); 4511 4512 /* 4513 * The GPU enters bad state once faulty pages 4514 * by ECC has reached the threshold, and ras 4515 * recovery is scheduled next. So add one check 4516 * here to break recovery if it indeed exceeds 4517 * bad page threshold, and remind user to 4518 * retire this GPU or setting one bigger 4519 * bad_page_threshold value to fix this once 4520 * probing driver again. 4521 */ 4522 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4523 /* must succeed. */ 4524 amdgpu_ras_resume(tmp_adev); 4525 } else { 4526 r = -EINVAL; 4527 goto out; 4528 } 4529 4530 /* Update PSP FW topology after reset */ 4531 if (reset_context->hive && 4532 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4533 r = amdgpu_xgmi_update_topology( 4534 reset_context->hive, tmp_adev); 4535 } 4536 } 4537 4538 out: 4539 if (!r) { 4540 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4541 r = amdgpu_ib_ring_tests(tmp_adev); 4542 if (r) { 4543 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4544 need_full_reset = true; 4545 r = -EAGAIN; 4546 goto end; 4547 } 4548 } 4549 4550 if (!r) 4551 r = amdgpu_device_recover_vram(tmp_adev); 4552 else 4553 tmp_adev->asic_reset_res = r; 4554 } 4555 4556 end: 4557 if (need_full_reset) 4558 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4559 else 4560 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4561 return r; 4562 } 4563 4564 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4565 struct amdgpu_hive_info *hive) 4566 { 4567 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4568 return false; 4569 4570 if (hive) { 4571 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4572 } else { 4573 down_write(&adev->reset_sem); 4574 } 4575 4576 switch (amdgpu_asic_reset_method(adev)) { 4577 case AMD_RESET_METHOD_MODE1: 4578 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4579 break; 4580 case AMD_RESET_METHOD_MODE2: 4581 adev->mp1_state = PP_MP1_STATE_RESET; 4582 break; 4583 default: 4584 adev->mp1_state = PP_MP1_STATE_NONE; 4585 break; 4586 } 4587 4588 return true; 4589 } 4590 4591 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4592 { 4593 amdgpu_vf_error_trans_all(adev); 4594 adev->mp1_state = PP_MP1_STATE_NONE; 4595 atomic_set(&adev->in_gpu_reset, 0); 4596 up_write(&adev->reset_sem); 4597 } 4598 4599 /* 4600 * to lockup a list of amdgpu devices in a hive safely, if not a hive 4601 * with multiple nodes, it will be similar as amdgpu_device_lock_adev. 4602 * 4603 * unlock won't require roll back. 
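 *
 * Illustrative pairing, mirroring how the recovery path below uses it
 * (sketch only):
 *
 *	r = amdgpu_device_lock_hive_adev(adev, hive);
 *	...
 *	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
 *		amdgpu_device_unlock_adev(tmp_adev);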
 */
static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (!hive) {
			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
			return -ENODEV;
		}
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			if (!amdgpu_device_lock_adev(tmp_adev, hive))
				goto roll_back;
		}
	} else if (!amdgpu_device_lock_adev(adev, hive))
		return -EAGAIN;

	return 0;
roll_back:
	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
		/*
		 * If the locking iteration breaks in the middle of a hive,
		 * it may mean there is a race, or that a hive device locked
		 * up independently. We may or may not be in trouble, so roll
		 * back the locks taken so far and give out a warning.
		 */
		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			amdgpu_device_unlock_adev(tmp_adev);
		}
	}
	return -EAGAIN;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed to suffer
	 * the audio issue if the codec is not properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4S interval will be used. Since 3S is the audio
		 * controller's default autosuspend delay setting, the 4S
		 * used here is guaranteed to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			/* TODO: abort the succeeding gpu reset?
			 */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	return 0;
}

void amdgpu_device_recheck_guilty_jobs(
	struct amdgpu_device *adev, struct list_head *device_list_handle,
	struct amdgpu_reset_context *reset_context)
{
	int i, r = 0;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];
		int ret = 0;
		struct drm_sched_job *s_job;

		if (!ring || !ring->sched.thread)
			continue;

		s_job = list_first_entry_or_null(&ring->sched.pending_list,
				struct drm_sched_job, list);
		if (s_job == NULL)
			continue;

		/* clear the job's guilty flag and rely on the following step to decide the real one */
		drm_sched_reset_karma(s_job);
		drm_sched_resubmit_jobs_ext(&ring->sched, 1);

		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
		if (ret == 0) { /* timeout */
			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
				  ring->sched.name, s_job->id);

			/* set guilty */
			drm_sched_increase_karma(s_job);
retry:
			/* do hw reset */
			if (amdgpu_sriov_vf(adev)) {
				amdgpu_virt_fini_data_exchange(adev);
				r = amdgpu_device_reset_sriov(adev, false);
				if (r)
					adev->asic_reset_res = r;
			} else {
				clear_bit(AMDGPU_SKIP_HW_RESET,
					  &reset_context->flags);
				r = amdgpu_do_asic_reset(device_list_handle,
							 reset_context);
				if (r && r == -EAGAIN)
					goto retry;
			}

			/*
			 * add reset counter so that the following
			 * resubmitted job could flush vmid
			 */
			atomic_inc(&adev->gpu_reset_counter);
			continue;
		}

		/* got the hw fence, signal finished fence */
		atomic_dec(ring->sched.score);
		dma_fence_get(&s_job->s_fence->finished);
		dma_fence_signal(&s_job->s_fence->finished);
		dma_fence_put(&s_job->s_fence->finished);

		/* remove node from list and free the job */
		spin_lock(&ring->sched.job_list_lock);
		list_del_init(&s_job->list);
		spin_unlock(&ring->sched.job_list_lock);
		ring->sched.ops->free_job(s_job);
	}
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: which job triggered the hang
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;
	int tmp_vram_lost_counter;
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		need_emergency_restart ? "jobs stop":"reset");

	/*
	 * We trylock here to avoid a chain of resets executing, whether
	 * triggered by jobs on different adevs in an XGMI hive or by jobs on
	 * different schedulers for the same device, while this TO handler is
	 * running. We always reset all schedulers for a device and all
	 * devices in an XGMI hive, so that should take care of them too.
	 */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
				job ? job->base.id : -1, hive->hive_id);
			amdgpu_put_xgmi_hive(hive);
			if (job)
				drm_sched_increase_karma(&job->base);
			return 0;
		}
		mutex_lock(&hive->hive_lock);
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	reset_context.job = job;
	reset_context.hive = hive;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	/*
	 * Lock the device before we try to operate on the linked list.
	 * If we did not get the device lock, don't touch the linked list
	 * since others may be iterating over it.
	 */
	r = amdgpu_device_lock_hive_adev(adev, hive);
	if (r) {
		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
					job ? job->base.id : -1);

		/* even though we skipped this reset, we still need to mark the job guilty */
		if (job)
			drm_sched_increase_karma(&job->base);
		goto skip_recovery;
	}

	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list, &device_list);
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/*
		 * Try to put the audio codec into suspend state
		 * before the gpu reset starts.
		 *
		 * Because the power domain of the graphics device is shared
		 * with the AZ power domain, we may otherwise change the audio
		 * hardware from behind the audio driver's back and trigger
		 * audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		if (!amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_pre_reset(tmp_adev);

		/*
		 * Mark these ASICs as untracked first, and add them
		 * back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		amdgpu_fbdev_set_suspend(tmp_adev, 1);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		      amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check the guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
		/*TODO Should we stop ?*/
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				  r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
	/* Actual ASIC resets if needed.*/
	/* TODO Implement XGMI hive reset logic for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs .*/
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		/*
		 * Sometimes a later bad compute job can block a good gfx job,
		 * because the gfx and compute rings share internal GC hardware.
		 * We add an additional guilty-job recheck step to find the real
		 * guilty job: it synchronously resubmits and waits for the
		 * first job to signal, and if that times out we identify it as
		 * the real guilty job.
		 */
		if (amdgpu_gpu_recovery == 2 &&
			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
			amdgpu_device_recheck_guilty_jobs(
				tmp_adev, device_list_handle, &reset_context);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point to resubmit jobs if we didn't HW reset*/
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not
		 * initialized, so bring up kfd here if it has not been
		 * initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);
		amdgpu_device_unlock_adev(tmp_adev);
	}

skip_recovery:
	if (hive) {
		atomic_set(&hive->in_reset, 0);
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r && r != -EAGAIN)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
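 *
 * Both masks can be overridden with the amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap module parameters; a Gen3-capable ASIC mask, for
 * example, ends up looking like (illustrative only):
 *
 *	adev->pm.pcie_gen_mask = CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 *				 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 *				 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3;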
5040 */ 5041 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5042 { 5043 struct pci_dev *pdev; 5044 enum pci_bus_speed speed_cap, platform_speed_cap; 5045 enum pcie_link_width platform_link_width; 5046 5047 if (amdgpu_pcie_gen_cap) 5048 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5049 5050 if (amdgpu_pcie_lane_cap) 5051 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5052 5053 /* covers APUs as well */ 5054 if (pci_is_root_bus(adev->pdev->bus)) { 5055 if (adev->pm.pcie_gen_mask == 0) 5056 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5057 if (adev->pm.pcie_mlw_mask == 0) 5058 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5059 return; 5060 } 5061 5062 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5063 return; 5064 5065 pcie_bandwidth_available(adev->pdev, NULL, 5066 &platform_speed_cap, &platform_link_width); 5067 5068 if (adev->pm.pcie_gen_mask == 0) { 5069 /* asic caps */ 5070 pdev = adev->pdev; 5071 speed_cap = pcie_get_speed_cap(pdev); 5072 if (speed_cap == PCI_SPEED_UNKNOWN) { 5073 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5074 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5075 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5076 } else { 5077 if (speed_cap == PCIE_SPEED_32_0GT) 5078 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5079 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5080 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5081 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5082 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5083 else if (speed_cap == PCIE_SPEED_16_0GT) 5084 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5085 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5086 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5087 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5088 else if (speed_cap == PCIE_SPEED_8_0GT) 5089 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5090 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5091 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5092 else if (speed_cap == PCIE_SPEED_5_0GT) 5093 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5094 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5095 else 5096 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5097 } 5098 /* platform caps */ 5099 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5100 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5101 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5102 } else { 5103 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5104 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5105 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5106 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5107 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5108 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5109 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5110 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5111 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5112 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5113 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5114 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5115 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5116 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5117 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5118 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5119 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5120 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5121 else 5122 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5123 5124 } 5125 } 5126 if (adev->pm.pcie_mlw_mask == 0) { 5127 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5128 adev->pm.pcie_mlw_mask 
|= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5129 } else { 5130 switch (platform_link_width) { 5131 case PCIE_LNK_X32: 5132 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5133 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5134 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5135 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5136 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5137 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5138 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5139 break; 5140 case PCIE_LNK_X16: 5141 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5142 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5143 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5144 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5145 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5146 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5147 break; 5148 case PCIE_LNK_X12: 5149 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5150 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5151 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5152 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5153 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5154 break; 5155 case PCIE_LNK_X8: 5156 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5157 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5158 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5159 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5160 break; 5161 case PCIE_LNK_X4: 5162 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5163 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5164 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5165 break; 5166 case PCIE_LNK_X2: 5167 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5168 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5169 break; 5170 case PCIE_LNK_X1: 5171 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5172 break; 5173 default: 5174 break; 5175 } 5176 } 5177 } 5178 } 5179 5180 int amdgpu_device_baco_enter(struct drm_device *dev) 5181 { 5182 struct amdgpu_device *adev = drm_to_adev(dev); 5183 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5184 5185 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5186 return -ENOTSUPP; 5187 5188 if (ras && adev->ras_enabled && 5189 adev->nbio.funcs->enable_doorbell_interrupt) 5190 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5191 5192 return amdgpu_dpm_baco_enter(adev); 5193 } 5194 5195 int amdgpu_device_baco_exit(struct drm_device *dev) 5196 { 5197 struct amdgpu_device *adev = drm_to_adev(dev); 5198 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5199 int ret = 0; 5200 5201 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5202 return -ENOTSUPP; 5203 5204 ret = amdgpu_dpm_baco_exit(adev); 5205 if (ret) 5206 return ret; 5207 5208 if (ras && adev->ras_enabled && 5209 adev->nbio.funcs->enable_doorbell_interrupt) 5210 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5211 5212 return 0; 5213 } 5214 5215 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5216 { 5217 int i; 5218 5219 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5220 struct amdgpu_ring *ring = adev->rings[i]; 5221 5222 if (!ring || !ring->sched.thread) 5223 continue; 5224 5225 cancel_delayed_work_sync(&ring->sched.work_tdr); 5226 } 5227 } 5228 5229 /** 5230 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5231 * @pdev: PCI device struct 5232 * @state: PCI channel state 5233 * 5234 * Description: Called when a PCI error is detected. 5235 * 5236 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
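 *
 * These callbacks are typically wired into the PCI driver via a
 * struct pci_error_handlers (illustrative sketch; the actual registration
 * lives in the driver code, not in this file):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};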
5237 */ 5238 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5239 { 5240 struct drm_device *dev = pci_get_drvdata(pdev); 5241 struct amdgpu_device *adev = drm_to_adev(dev); 5242 int i; 5243 5244 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5245 5246 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5247 DRM_WARN("No support for XGMI hive yet..."); 5248 return PCI_ERS_RESULT_DISCONNECT; 5249 } 5250 5251 switch (state) { 5252 case pci_channel_io_normal: 5253 return PCI_ERS_RESULT_CAN_RECOVER; 5254 /* Fatal error, prepare for slot reset */ 5255 case pci_channel_io_frozen: 5256 /* 5257 * Cancel and wait for all TDRs in progress if failing to 5258 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5259 * 5260 * Locking adev->reset_sem will prevent any external access 5261 * to GPU during PCI error recovery 5262 */ 5263 while (!amdgpu_device_lock_adev(adev, NULL)) 5264 amdgpu_cancel_all_tdr(adev); 5265 5266 /* 5267 * Block any work scheduling as we do for regular GPU reset 5268 * for the duration of the recovery 5269 */ 5270 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5271 struct amdgpu_ring *ring = adev->rings[i]; 5272 5273 if (!ring || !ring->sched.thread) 5274 continue; 5275 5276 drm_sched_stop(&ring->sched, NULL); 5277 } 5278 atomic_inc(&adev->gpu_reset_counter); 5279 return PCI_ERS_RESULT_NEED_RESET; 5280 case pci_channel_io_perm_failure: 5281 /* Permanent error, prepare for device removal */ 5282 return PCI_ERS_RESULT_DISCONNECT; 5283 } 5284 5285 return PCI_ERS_RESULT_NEED_RESET; 5286 } 5287 5288 /** 5289 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5290 * @pdev: pointer to PCI device 5291 */ 5292 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5293 { 5294 5295 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5296 5297 /* TODO - dump whatever for debugging purposes */ 5298 5299 /* This called only if amdgpu_pci_error_detected returns 5300 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5301 * works, no need to reset slot. 5302 */ 5303 5304 return PCI_ERS_RESULT_RECOVERED; 5305 } 5306 5307 /** 5308 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5309 * @pdev: PCI device struct 5310 * 5311 * Description: This routine is called by the pci error recovery 5312 * code after the PCI slot has been reset, just before we 5313 * should resume normal operations. 
5314 */ 5315 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5316 { 5317 struct drm_device *dev = pci_get_drvdata(pdev); 5318 struct amdgpu_device *adev = drm_to_adev(dev); 5319 int r, i; 5320 struct amdgpu_reset_context reset_context; 5321 u32 memsize; 5322 struct list_head device_list; 5323 5324 DRM_INFO("PCI error: slot reset callback!!\n"); 5325 5326 memset(&reset_context, 0, sizeof(reset_context)); 5327 5328 INIT_LIST_HEAD(&device_list); 5329 list_add_tail(&adev->reset_list, &device_list); 5330 5331 /* wait for asic to come out of reset */ 5332 msleep(500); 5333 5334 /* Restore PCI confspace */ 5335 amdgpu_device_load_pci_state(pdev); 5336 5337 /* confirm ASIC came out of reset */ 5338 for (i = 0; i < adev->usec_timeout; i++) { 5339 memsize = amdgpu_asic_get_config_memsize(adev); 5340 5341 if (memsize != 0xffffffff) 5342 break; 5343 udelay(1); 5344 } 5345 if (memsize == 0xffffffff) { 5346 r = -ETIME; 5347 goto out; 5348 } 5349 5350 reset_context.method = AMD_RESET_METHOD_NONE; 5351 reset_context.reset_req_dev = adev; 5352 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5353 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5354 5355 adev->no_hw_access = true; 5356 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5357 adev->no_hw_access = false; 5358 if (r) 5359 goto out; 5360 5361 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5362 5363 out: 5364 if (!r) { 5365 if (amdgpu_device_cache_pci_state(adev->pdev)) 5366 pci_restore_state(adev->pdev); 5367 5368 DRM_INFO("PCIe error recovery succeeded\n"); 5369 } else { 5370 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5371 amdgpu_device_unlock_adev(adev); 5372 } 5373 5374 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5375 } 5376 5377 /** 5378 * amdgpu_pci_resume() - resume normal ops after PCI reset 5379 * @pdev: pointer to PCI device 5380 * 5381 * Called when the error recovery driver tells us that its 5382 * OK to resume normal operation. 
5383 */ 5384 void amdgpu_pci_resume(struct pci_dev *pdev) 5385 { 5386 struct drm_device *dev = pci_get_drvdata(pdev); 5387 struct amdgpu_device *adev = drm_to_adev(dev); 5388 int i; 5389 5390 5391 DRM_INFO("PCI error: resume callback!!\n"); 5392 5393 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5394 struct amdgpu_ring *ring = adev->rings[i]; 5395 5396 if (!ring || !ring->sched.thread) 5397 continue; 5398 5399 5400 drm_sched_resubmit_jobs(&ring->sched); 5401 drm_sched_start(&ring->sched, true); 5402 } 5403 5404 amdgpu_device_unlock_adev(adev); 5405 } 5406 5407 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 5408 { 5409 struct drm_device *dev = pci_get_drvdata(pdev); 5410 struct amdgpu_device *adev = drm_to_adev(dev); 5411 int r; 5412 5413 r = pci_save_state(pdev); 5414 if (!r) { 5415 kfree(adev->pci_state); 5416 5417 adev->pci_state = pci_store_saved_state(pdev); 5418 5419 if (!adev->pci_state) { 5420 DRM_ERROR("Failed to store PCI saved state"); 5421 return false; 5422 } 5423 } else { 5424 DRM_WARN("Failed to save PCI state, err:%d\n", r); 5425 return false; 5426 } 5427 5428 return true; 5429 } 5430 5431 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 5432 { 5433 struct drm_device *dev = pci_get_drvdata(pdev); 5434 struct amdgpu_device *adev = drm_to_adev(dev); 5435 int r; 5436 5437 if (!adev->pci_state) 5438 return false; 5439 5440 r = pci_load_saved_state(pdev, adev->pci_state); 5441 5442 if (!r) { 5443 pci_restore_state(pdev); 5444 } else { 5445 DRM_WARN("Failed to load PCI state, err:%d\n", r); 5446 return false; 5447 } 5448 5449 return true; 5450 } 5451 5452 5453
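/*
 * Illustrative pairing (sketch only): callers that want a restorable copy of
 * the PCI config space for sudden-error recovery use these helpers the same
 * way the init path earlier in this file does, e.g.
 *
 *	if (amdgpu_device_cache_pci_state(adev->pdev))
 *		pci_restore_state(adev->pdev);
 */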