1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 34 #include <drm/drm_atomic_helper.h> 35 #include <drm/drm_probe_helper.h> 36 #include <drm/amdgpu_drm.h> 37 #include <linux/vgaarb.h> 38 #include <linux/vga_switcheroo.h> 39 #include <linux/efi.h> 40 #include "amdgpu.h" 41 #include "amdgpu_trace.h" 42 #include "amdgpu_i2c.h" 43 #include "atom.h" 44 #include "amdgpu_atombios.h" 45 #include "amdgpu_atomfirmware.h" 46 #include "amd_pcie.h" 47 #ifdef CONFIG_DRM_AMDGPU_SI 48 #include "si.h" 49 #endif 50 #ifdef CONFIG_DRM_AMDGPU_CIK 51 #include "cik.h" 52 #endif 53 #include "vi.h" 54 #include "soc15.h" 55 #include "nv.h" 56 #include "bif/bif_4_1_d.h" 57 #include <linux/pci.h> 58 #include <linux/firmware.h> 59 #include "amdgpu_vf_error.h" 60 61 #include "amdgpu_amdkfd.h" 62 #include "amdgpu_pm.h" 63 64 #include "amdgpu_xgmi.h" 65 #include "amdgpu_ras.h" 66 #include "amdgpu_pmu.h" 67 68 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 69 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 70 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 71 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 72 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 73 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 74 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); 75 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); 76 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 77 78 #define AMDGPU_RESUME_MS 2000 79 80 static const char *amdgpu_asic_name[] = { 81 "TAHITI", 82 "PITCAIRN", 83 "VERDE", 84 "OLAND", 85 "HAINAN", 86 "BONAIRE", 87 "KAVERI", 88 "KABINI", 89 "HAWAII", 90 "MULLINS", 91 "TOPAZ", 92 "TONGA", 93 "FIJI", 94 "CARRIZO", 95 "STONEY", 96 "POLARIS10", 97 "POLARIS11", 98 "POLARIS12", 99 "VEGAM", 100 "VEGA10", 101 "VEGA12", 102 "VEGA20", 103 "RAVEN", 104 "ARCTURUS", 105 "NAVI10", 106 "NAVI14", 107 "NAVI12", 108 "LAST", 109 }; 110 111 /** 112 * DOC: pcie_replay_count 113 * 114 * The amdgpu driver provides a sysfs API for reporting the total number 115 * of PCIe replays (NAKs) 116 * The file pcie_replay_count is used for this and returns the total 117 * number of replays as a sum of the NAKs generated and NAKs received 118 */ 119 120 static ssize_t 
amdgpu_device_get_pcie_replay_count(struct device *dev, 121 struct device_attribute *attr, char *buf) 122 { 123 struct drm_device *ddev = dev_get_drvdata(dev); 124 struct amdgpu_device *adev = ddev->dev_private; 125 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 126 127 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); 128 } 129 130 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 131 amdgpu_device_get_pcie_replay_count, NULL); 132 133 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 134 135 /** 136 * amdgpu_device_is_px - Is the device is a dGPU with HG/PX power control 137 * 138 * @dev: drm_device pointer 139 * 140 * Returns true if the device is a dGPU with HG/PX power control, 141 * otherwise return false. 142 */ 143 bool amdgpu_device_is_px(struct drm_device *dev) 144 { 145 struct amdgpu_device *adev = dev->dev_private; 146 147 if (adev->flags & AMD_IS_PX) 148 return true; 149 return false; 150 } 151 152 /* 153 * MMIO register access helper functions. 154 */ 155 /** 156 * amdgpu_mm_rreg - read a memory mapped IO register 157 * 158 * @adev: amdgpu_device pointer 159 * @reg: dword aligned register offset 160 * @acc_flags: access flags which require special behavior 161 * 162 * Returns the 32 bit value from the offset specified. 163 */ 164 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, 165 uint32_t acc_flags) 166 { 167 uint32_t ret; 168 169 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) 170 return amdgpu_virt_kiq_rreg(adev, reg); 171 172 if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) 173 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 174 else { 175 unsigned long flags; 176 177 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 178 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 179 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 180 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 181 } 182 trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); 183 return ret; 184 } 185 186 /* 187 * MMIO register read with bytes helper functions 188 * @offset:bytes offset from MMIO start 189 * 190 */ 191 192 /** 193 * amdgpu_mm_rreg8 - read a memory mapped IO register 194 * 195 * @adev: amdgpu_device pointer 196 * @offset: byte aligned register offset 197 * 198 * Returns the 8 bit value from the offset specified. 199 */ 200 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { 201 if (offset < adev->rmmio_size) 202 return (readb(adev->rmmio + offset)); 203 BUG(); 204 } 205 206 /* 207 * MMIO register write with bytes helper functions 208 * @offset:bytes offset from MMIO start 209 * @value: the value want to be written to the register 210 * 211 */ 212 /** 213 * amdgpu_mm_wreg8 - read a memory mapped IO register 214 * 215 * @adev: amdgpu_device pointer 216 * @offset: byte aligned register offset 217 * @value: 8 bit value to write 218 * 219 * Writes the value specified to the offset specified. 220 */ 221 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { 222 if (offset < adev->rmmio_size) 223 writeb(value, adev->rmmio + offset); 224 else 225 BUG(); 226 } 227 228 /** 229 * amdgpu_mm_wreg - write to a memory mapped IO register 230 * 231 * @adev: amdgpu_device pointer 232 * @reg: dword aligned register offset 233 * @v: 32 bit value to write to the register 234 * @acc_flags: access flags which require special behavior 235 * 236 * Writes the value specified to the offset specified. 
237 */ 238 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, 239 uint32_t acc_flags) 240 { 241 trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); 242 243 if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { 244 adev->last_mm_index = v; 245 } 246 247 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) 248 return amdgpu_virt_kiq_wreg(adev, reg, v); 249 250 if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) 251 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 252 else { 253 unsigned long flags; 254 255 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 256 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); 257 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); 258 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 259 } 260 261 if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { 262 udelay(500); 263 } 264 } 265 266 /** 267 * amdgpu_io_rreg - read an IO register 268 * 269 * @adev: amdgpu_device pointer 270 * @reg: dword aligned register offset 271 * 272 * Returns the 32 bit value from the offset specified. 273 */ 274 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) 275 { 276 if ((reg * 4) < adev->rio_mem_size) 277 return ioread32(adev->rio_mem + (reg * 4)); 278 else { 279 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 280 return ioread32(adev->rio_mem + (mmMM_DATA * 4)); 281 } 282 } 283 284 /** 285 * amdgpu_io_wreg - write to an IO register 286 * 287 * @adev: amdgpu_device pointer 288 * @reg: dword aligned register offset 289 * @v: 32 bit value to write to the register 290 * 291 * Writes the value specified to the offset specified. 292 */ 293 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) 294 { 295 if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { 296 adev->last_mm_index = v; 297 } 298 299 if ((reg * 4) < adev->rio_mem_size) 300 iowrite32(v, adev->rio_mem + (reg * 4)); 301 else { 302 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); 303 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); 304 } 305 306 if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { 307 udelay(500); 308 } 309 } 310 311 /** 312 * amdgpu_mm_rdoorbell - read a doorbell dword 313 * 314 * @adev: amdgpu_device pointer 315 * @index: doorbell index 316 * 317 * Returns the value in the doorbell aperture at the 318 * requested doorbell index (CIK). 319 */ 320 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 321 { 322 if (index < adev->doorbell.num_doorbells) { 323 return readl(adev->doorbell.ptr + index); 324 } else { 325 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 326 return 0; 327 } 328 } 329 330 /** 331 * amdgpu_mm_wdoorbell - write a doorbell dword 332 * 333 * @adev: amdgpu_device pointer 334 * @index: doorbell index 335 * @v: value to write 336 * 337 * Writes @v to the doorbell aperture at the 338 * requested doorbell index (CIK). 339 */ 340 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 341 { 342 if (index < adev->doorbell.num_doorbells) { 343 writel(v, adev->doorbell.ptr + index); 344 } else { 345 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 346 } 347 } 348 349 /** 350 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 351 * 352 * @adev: amdgpu_device pointer 353 * @index: doorbell index 354 * 355 * Returns the value in the doorbell aperture at the 356 * requested doorbell index (VEGA10+). 
357 */ 358 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 359 { 360 if (index < adev->doorbell.num_doorbells) { 361 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 362 } else { 363 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 364 return 0; 365 } 366 } 367 368 /** 369 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 370 * 371 * @adev: amdgpu_device pointer 372 * @index: doorbell index 373 * @v: value to write 374 * 375 * Writes @v to the doorbell aperture at the 376 * requested doorbell index (VEGA10+). 377 */ 378 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 379 { 380 if (index < adev->doorbell.num_doorbells) { 381 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 382 } else { 383 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 384 } 385 } 386 387 /** 388 * amdgpu_invalid_rreg - dummy reg read function 389 * 390 * @adev: amdgpu device pointer 391 * @reg: offset of register 392 * 393 * Dummy register read function. Used for register blocks 394 * that certain asics don't have (all asics). 395 * Returns the value in the register. 396 */ 397 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 398 { 399 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 400 BUG(); 401 return 0; 402 } 403 404 /** 405 * amdgpu_invalid_wreg - dummy reg write function 406 * 407 * @adev: amdgpu device pointer 408 * @reg: offset of register 409 * @v: value to write to the register 410 * 411 * Dummy register read function. Used for register blocks 412 * that certain asics don't have (all asics). 413 */ 414 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 415 { 416 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 417 reg, v); 418 BUG(); 419 } 420 421 /** 422 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 423 * 424 * @adev: amdgpu device pointer 425 * @reg: offset of register 426 * 427 * Dummy register read function. Used for register blocks 428 * that certain asics don't have (all asics). 429 * Returns the value in the register. 430 */ 431 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 432 { 433 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 434 BUG(); 435 return 0; 436 } 437 438 /** 439 * amdgpu_invalid_wreg64 - dummy reg write function 440 * 441 * @adev: amdgpu device pointer 442 * @reg: offset of register 443 * @v: value to write to the register 444 * 445 * Dummy register read function. Used for register blocks 446 * that certain asics don't have (all asics). 447 */ 448 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 449 { 450 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 451 reg, v); 452 BUG(); 453 } 454 455 /** 456 * amdgpu_block_invalid_rreg - dummy reg read function 457 * 458 * @adev: amdgpu device pointer 459 * @block: offset of instance 460 * @reg: offset of register 461 * 462 * Dummy register read function. Used for register blocks 463 * that certain asics don't have (all asics). 464 * Returns the value in the register. 
465 */ 466 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 467 uint32_t block, uint32_t reg) 468 { 469 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 470 reg, block); 471 BUG(); 472 return 0; 473 } 474 475 /** 476 * amdgpu_block_invalid_wreg - dummy reg write function 477 * 478 * @adev: amdgpu device pointer 479 * @block: offset of instance 480 * @reg: offset of register 481 * @v: value to write to the register 482 * 483 * Dummy register read function. Used for register blocks 484 * that certain asics don't have (all asics). 485 */ 486 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 487 uint32_t block, 488 uint32_t reg, uint32_t v) 489 { 490 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 491 reg, block, v); 492 BUG(); 493 } 494 495 /** 496 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 497 * 498 * @adev: amdgpu device pointer 499 * 500 * Allocates a scratch page of VRAM for use by various things in the 501 * driver. 502 */ 503 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 504 { 505 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 506 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 507 &adev->vram_scratch.robj, 508 &adev->vram_scratch.gpu_addr, 509 (void **)&adev->vram_scratch.ptr); 510 } 511 512 /** 513 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 514 * 515 * @adev: amdgpu device pointer 516 * 517 * Frees the VRAM scratch page. 518 */ 519 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 520 { 521 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 522 } 523 524 /** 525 * amdgpu_device_program_register_sequence - program an array of registers. 526 * 527 * @adev: amdgpu_device pointer 528 * @registers: pointer to the register array 529 * @array_size: size of the register array 530 * 531 * Programs an array or registers with and and or masks. 532 * This is a helper for setting golden registers. 533 */ 534 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 535 const u32 *registers, 536 const u32 array_size) 537 { 538 u32 tmp, reg, and_mask, or_mask; 539 int i; 540 541 if (array_size % 3) 542 return; 543 544 for (i = 0; i < array_size; i +=3) { 545 reg = registers[i + 0]; 546 and_mask = registers[i + 1]; 547 or_mask = registers[i + 2]; 548 549 if (and_mask == 0xffffffff) { 550 tmp = or_mask; 551 } else { 552 tmp = RREG32(reg); 553 tmp &= ~and_mask; 554 if (adev->family >= AMDGPU_FAMILY_AI) 555 tmp |= (or_mask & and_mask); 556 else 557 tmp |= or_mask; 558 } 559 WREG32(reg, tmp); 560 } 561 } 562 563 /** 564 * amdgpu_device_pci_config_reset - reset the GPU 565 * 566 * @adev: amdgpu_device pointer 567 * 568 * Resets the GPU using the pci config reset sequence. 569 * Only applicable to asics prior to vega10. 570 */ 571 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 572 { 573 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 574 } 575 576 /* 577 * GPU doorbell aperture helpers function. 578 */ 579 /** 580 * amdgpu_device_doorbell_init - Init doorbell driver information. 581 * 582 * @adev: amdgpu_device pointer 583 * 584 * Init doorbell driver information (CIK) 585 * Returns 0 on success, error on failure. 
586 */ 587 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 588 { 589 590 /* No doorbell on SI hardware generation */ 591 if (adev->asic_type < CHIP_BONAIRE) { 592 adev->doorbell.base = 0; 593 adev->doorbell.size = 0; 594 adev->doorbell.num_doorbells = 0; 595 adev->doorbell.ptr = NULL; 596 return 0; 597 } 598 599 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 600 return -EINVAL; 601 602 amdgpu_asic_init_doorbell_index(adev); 603 604 /* doorbell bar mapping */ 605 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 606 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 607 608 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 609 adev->doorbell_index.max_assignment+1); 610 if (adev->doorbell.num_doorbells == 0) 611 return -EINVAL; 612 613 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 614 * paging queue doorbell use the second page. The 615 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 616 * doorbells are in the first page. So with paging queue enabled, 617 * the max num_doorbells should + 1 page (0x400 in dword) 618 */ 619 if (adev->asic_type >= CHIP_VEGA10) 620 adev->doorbell.num_doorbells += 0x400; 621 622 adev->doorbell.ptr = ioremap(adev->doorbell.base, 623 adev->doorbell.num_doorbells * 624 sizeof(u32)); 625 if (adev->doorbell.ptr == NULL) 626 return -ENOMEM; 627 628 return 0; 629 } 630 631 /** 632 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 633 * 634 * @adev: amdgpu_device pointer 635 * 636 * Tear down doorbell driver information (CIK) 637 */ 638 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 639 { 640 iounmap(adev->doorbell.ptr); 641 adev->doorbell.ptr = NULL; 642 } 643 644 645 646 /* 647 * amdgpu_device_wb_*() 648 * Writeback is the method by which the GPU updates special pages in memory 649 * with the status of certain GPU events (fences, ring pointers,etc.). 650 */ 651 652 /** 653 * amdgpu_device_wb_fini - Disable Writeback and free memory 654 * 655 * @adev: amdgpu_device pointer 656 * 657 * Disables Writeback and frees the Writeback memory (all asics). 658 * Used at driver shutdown. 659 */ 660 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 661 { 662 if (adev->wb.wb_obj) { 663 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 664 &adev->wb.gpu_addr, 665 (void **)&adev->wb.wb); 666 adev->wb.wb_obj = NULL; 667 } 668 } 669 670 /** 671 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory 672 * 673 * @adev: amdgpu_device pointer 674 * 675 * Initializes writeback and allocates writeback memory (all asics). 676 * Used at driver startup. 677 * Returns 0 on success or an -error on failure. 
678 */ 679 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 680 { 681 int r; 682 683 if (adev->wb.wb_obj == NULL) { 684 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 685 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 686 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 687 &adev->wb.wb_obj, &adev->wb.gpu_addr, 688 (void **)&adev->wb.wb); 689 if (r) { 690 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 691 return r; 692 } 693 694 adev->wb.num_wb = AMDGPU_MAX_WB; 695 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 696 697 /* clear wb memory */ 698 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 699 } 700 701 return 0; 702 } 703 704 /** 705 * amdgpu_device_wb_get - Allocate a wb entry 706 * 707 * @adev: amdgpu_device pointer 708 * @wb: wb index 709 * 710 * Allocate a wb slot for use by the driver (all asics). 711 * Returns 0 on success or -EINVAL on failure. 712 */ 713 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 714 { 715 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 716 717 if (offset < adev->wb.num_wb) { 718 __set_bit(offset, adev->wb.used); 719 *wb = offset << 3; /* convert to dw offset */ 720 return 0; 721 } else { 722 return -EINVAL; 723 } 724 } 725 726 /** 727 * amdgpu_device_wb_free - Free a wb entry 728 * 729 * @adev: amdgpu_device pointer 730 * @wb: wb index 731 * 732 * Free a wb slot allocated for use by the driver (all asics) 733 */ 734 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 735 { 736 wb >>= 3; 737 if (wb < adev->wb.num_wb) 738 __clear_bit(wb, adev->wb.used); 739 } 740 741 /** 742 * amdgpu_device_resize_fb_bar - try to resize FB BAR 743 * 744 * @adev: amdgpu_device pointer 745 * 746 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 747 * to fail, but if any of the BARs is not accessible after the size we abort 748 * driver loading by returning -ENODEV. 749 */ 750 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 751 { 752 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size); 753 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1; 754 struct pci_bus *root; 755 struct resource *res; 756 unsigned i; 757 u16 cmd; 758 int r; 759 760 /* Bypass for VF */ 761 if (amdgpu_sriov_vf(adev)) 762 return 0; 763 764 /* Check if the root BUS has 64bit memory resources */ 765 root = adev->pdev->bus; 766 while (root->parent) 767 root = root->parent; 768 769 pci_bus_for_each_resource(root, res, i) { 770 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 771 res->start > 0x100000000ull) 772 break; 773 } 774 775 /* Trying to resize is pointless without a root hub window above 4GB */ 776 if (!res) 777 return 0; 778 779 /* Disable memory decoding while we change the BAR addresses and size */ 780 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 781 pci_write_config_word(adev->pdev, PCI_COMMAND, 782 cmd & ~PCI_COMMAND_MEMORY); 783 784 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 785 amdgpu_device_doorbell_fini(adev); 786 if (adev->asic_type >= CHIP_BONAIRE) 787 pci_release_resource(adev->pdev, 2); 788 789 pci_release_resource(adev->pdev, 0); 790 791 r = pci_resize_resource(adev->pdev, 0, rbar_size); 792 if (r == -ENOSPC) 793 DRM_INFO("Not enough PCI address space for a large BAR."); 794 else if (r && r != -ENOTSUPP) 795 DRM_ERROR("Problem resizing BAR0 (%d).", r); 796 797 pci_assign_unassigned_bus_resources(adev->pdev->bus); 798 799 /* When the doorbell or fb BAR isn't available we have no chance of 800 * using the device. 801 */ 802 r = amdgpu_device_doorbell_init(adev); 803 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 804 return -ENODEV; 805 806 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 807 808 return 0; 809 } 810 811 /* 812 * GPU helpers function. 813 */ 814 /** 815 * amdgpu_device_need_post - check if the hw need post or not 816 * 817 * @adev: amdgpu_device pointer 818 * 819 * Check if the asic has been initialized (all asics) at driver startup 820 * or post is needed if hw reset is performed. 821 * Returns true if need or false if not. 822 */ 823 bool amdgpu_device_need_post(struct amdgpu_device *adev) 824 { 825 uint32_t reg; 826 827 if (amdgpu_sriov_vf(adev)) 828 return false; 829 830 if (amdgpu_passthrough(adev)) { 831 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 832 * some old smc fw still need driver do vPost otherwise gpu hang, while 833 * those smc fw version above 22.15 doesn't have this flaw, so we force 834 * vpost executed for smc version below 22.15 835 */ 836 if (adev->asic_type == CHIP_FIJI) { 837 int err; 838 uint32_t fw_ver; 839 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 840 /* force vPost if error occured */ 841 if (err) 842 return true; 843 844 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 845 if (fw_ver < 0x00160e00) 846 return true; 847 } 848 } 849 850 if (adev->has_hw_reset) { 851 adev->has_hw_reset = false; 852 return true; 853 } 854 855 /* bios scratch used on CIK+ */ 856 if (adev->asic_type >= CHIP_BONAIRE) 857 return amdgpu_atombios_scratch_need_asic_init(adev); 858 859 /* check MEM_SIZE for older asics */ 860 reg = amdgpu_asic_get_config_memsize(adev); 861 862 if ((reg != 0) && (reg != 0xffffffff)) 863 return false; 864 865 return true; 866 } 867 868 /* if we get transitioned to only one device, take VGA back */ 869 /** 870 * amdgpu_device_vga_set_decode - enable/disable vga decode 871 * 872 * @cookie: amdgpu_device pointer 873 * @state: enable/disable vga decode 874 * 875 * Enable/disable vga decode (all asics). 876 * Returns VGA resource flags. 877 */ 878 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state) 879 { 880 struct amdgpu_device *adev = cookie; 881 amdgpu_asic_set_vga_state(adev, state); 882 if (state) 883 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 884 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 885 else 886 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 887 } 888 889 /** 890 * amdgpu_device_check_block_size - validate the vm block size 891 * 892 * @adev: amdgpu_device pointer 893 * 894 * Validates the vm block size specified via module parameter. 895 * The vm block size defines number of bits in page table versus page directory, 896 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 897 * page table and the remaining bits are in the page directory. 
898 */ 899 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 900 { 901 /* defines number of bits in page table versus page directory, 902 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 903 * page table and the remaining bits are in the page directory */ 904 if (amdgpu_vm_block_size == -1) 905 return; 906 907 if (amdgpu_vm_block_size < 9) { 908 dev_warn(adev->dev, "VM page table size (%d) too small\n", 909 amdgpu_vm_block_size); 910 amdgpu_vm_block_size = -1; 911 } 912 } 913 914 /** 915 * amdgpu_device_check_vm_size - validate the vm size 916 * 917 * @adev: amdgpu_device pointer 918 * 919 * Validates the vm size in GB specified via module parameter. 920 * The VM size is the size of the GPU virtual memory space in GB. 921 */ 922 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 923 { 924 /* no need to check the default value */ 925 if (amdgpu_vm_size == -1) 926 return; 927 928 if (amdgpu_vm_size < 1) { 929 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 930 amdgpu_vm_size); 931 amdgpu_vm_size = -1; 932 } 933 } 934 935 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 936 { 937 struct sysinfo si; 938 bool is_os_64 = (sizeof(void *) == 8) ? true : false; 939 uint64_t total_memory; 940 uint64_t dram_size_seven_GB = 0x1B8000000; 941 uint64_t dram_size_three_GB = 0xB8000000; 942 943 if (amdgpu_smu_memory_pool_size == 0) 944 return; 945 946 if (!is_os_64) { 947 DRM_WARN("Not 64-bit OS, feature not supported\n"); 948 goto def_value; 949 } 950 si_meminfo(&si); 951 total_memory = (uint64_t)si.totalram * si.mem_unit; 952 953 if ((amdgpu_smu_memory_pool_size == 1) || 954 (amdgpu_smu_memory_pool_size == 2)) { 955 if (total_memory < dram_size_three_GB) 956 goto def_value1; 957 } else if ((amdgpu_smu_memory_pool_size == 4) || 958 (amdgpu_smu_memory_pool_size == 8)) { 959 if (total_memory < dram_size_seven_GB) 960 goto def_value1; 961 } else { 962 DRM_WARN("Smu memory pool size not supported\n"); 963 goto def_value; 964 } 965 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 966 967 return; 968 969 def_value1: 970 DRM_WARN("No enough system memory\n"); 971 def_value: 972 adev->pm.smu_prv_buffer_size = 0; 973 } 974 975 /** 976 * amdgpu_device_check_arguments - validate module params 977 * 978 * @adev: amdgpu_device pointer 979 * 980 * Validates certain module parameters and updates 981 * the associated values used by the driver (all asics). 
982 */ 983 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 984 { 985 int ret = 0; 986 987 if (amdgpu_sched_jobs < 4) { 988 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 989 amdgpu_sched_jobs); 990 amdgpu_sched_jobs = 4; 991 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 992 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 993 amdgpu_sched_jobs); 994 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 995 } 996 997 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 998 /* gart size must be greater or equal to 32M */ 999 dev_warn(adev->dev, "gart size (%d) too small\n", 1000 amdgpu_gart_size); 1001 amdgpu_gart_size = -1; 1002 } 1003 1004 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1005 /* gtt size must be greater or equal to 32M */ 1006 dev_warn(adev->dev, "gtt size (%d) too small\n", 1007 amdgpu_gtt_size); 1008 amdgpu_gtt_size = -1; 1009 } 1010 1011 /* valid range is between 4 and 9 inclusive */ 1012 if (amdgpu_vm_fragment_size != -1 && 1013 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1014 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1015 amdgpu_vm_fragment_size = -1; 1016 } 1017 1018 amdgpu_device_check_smu_prv_buffer_size(adev); 1019 1020 amdgpu_device_check_vm_size(adev); 1021 1022 amdgpu_device_check_block_size(adev); 1023 1024 ret = amdgpu_device_get_job_timeout_settings(adev); 1025 if (ret) { 1026 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 1027 return ret; 1028 } 1029 1030 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1031 1032 return ret; 1033 } 1034 1035 /** 1036 * amdgpu_switcheroo_set_state - set switcheroo state 1037 * 1038 * @pdev: pci dev pointer 1039 * @state: vga_switcheroo state 1040 * 1041 * Callback for the switcheroo driver. Suspends or resumes the 1042 * the asics before or after it is powered up using ACPI methods. 1043 */ 1044 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state) 1045 { 1046 struct drm_device *dev = pci_get_drvdata(pdev); 1047 1048 if (amdgpu_device_is_px(dev) && state == VGA_SWITCHEROO_OFF) 1049 return; 1050 1051 if (state == VGA_SWITCHEROO_ON) { 1052 pr_info("amdgpu: switched on\n"); 1053 /* don't suspend or resume card normally */ 1054 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1055 1056 amdgpu_device_resume(dev, true, true); 1057 1058 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1059 drm_kms_helper_poll_enable(dev); 1060 } else { 1061 pr_info("amdgpu: switched off\n"); 1062 drm_kms_helper_poll_disable(dev); 1063 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1064 amdgpu_device_suspend(dev, true, true); 1065 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1066 } 1067 } 1068 1069 /** 1070 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1071 * 1072 * @pdev: pci dev pointer 1073 * 1074 * Callback for the switcheroo driver. Check of the switcheroo 1075 * state can be changed. 1076 * Returns true if the state can be changed, false if not. 1077 */ 1078 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1079 { 1080 struct drm_device *dev = pci_get_drvdata(pdev); 1081 1082 /* 1083 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1084 * locking inversion with the driver load path. And the access here is 1085 * completely racy anyway. So don't bother with locking for now. 
1086 */ 1087 return dev->open_count == 0; 1088 } 1089 1090 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1091 .set_gpu_state = amdgpu_switcheroo_set_state, 1092 .reprobe = NULL, 1093 .can_switch = amdgpu_switcheroo_can_switch, 1094 }; 1095 1096 /** 1097 * amdgpu_device_ip_set_clockgating_state - set the CG state 1098 * 1099 * @dev: amdgpu_device pointer 1100 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1101 * @state: clockgating state (gate or ungate) 1102 * 1103 * Sets the requested clockgating state for all instances of 1104 * the hardware IP specified. 1105 * Returns the error code from the last instance. 1106 */ 1107 int amdgpu_device_ip_set_clockgating_state(void *dev, 1108 enum amd_ip_block_type block_type, 1109 enum amd_clockgating_state state) 1110 { 1111 struct amdgpu_device *adev = dev; 1112 int i, r = 0; 1113 1114 for (i = 0; i < adev->num_ip_blocks; i++) { 1115 if (!adev->ip_blocks[i].status.valid) 1116 continue; 1117 if (adev->ip_blocks[i].version->type != block_type) 1118 continue; 1119 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1120 continue; 1121 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1122 (void *)adev, state); 1123 if (r) 1124 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1125 adev->ip_blocks[i].version->funcs->name, r); 1126 } 1127 return r; 1128 } 1129 1130 /** 1131 * amdgpu_device_ip_set_powergating_state - set the PG state 1132 * 1133 * @dev: amdgpu_device pointer 1134 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1135 * @state: powergating state (gate or ungate) 1136 * 1137 * Sets the requested powergating state for all instances of 1138 * the hardware IP specified. 1139 * Returns the error code from the last instance. 1140 */ 1141 int amdgpu_device_ip_set_powergating_state(void *dev, 1142 enum amd_ip_block_type block_type, 1143 enum amd_powergating_state state) 1144 { 1145 struct amdgpu_device *adev = dev; 1146 int i, r = 0; 1147 1148 for (i = 0; i < adev->num_ip_blocks; i++) { 1149 if (!adev->ip_blocks[i].status.valid) 1150 continue; 1151 if (adev->ip_blocks[i].version->type != block_type) 1152 continue; 1153 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1154 continue; 1155 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1156 (void *)adev, state); 1157 if (r) 1158 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1159 adev->ip_blocks[i].version->funcs->name, r); 1160 } 1161 return r; 1162 } 1163 1164 /** 1165 * amdgpu_device_ip_get_clockgating_state - get the CG state 1166 * 1167 * @adev: amdgpu_device pointer 1168 * @flags: clockgating feature flags 1169 * 1170 * Walks the list of IPs on the device and updates the clockgating 1171 * flags for each IP. 1172 * Updates @flags with the feature flags for each hardware IP where 1173 * clockgating is enabled. 1174 */ 1175 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1176 u32 *flags) 1177 { 1178 int i; 1179 1180 for (i = 0; i < adev->num_ip_blocks; i++) { 1181 if (!adev->ip_blocks[i].status.valid) 1182 continue; 1183 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1184 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1185 } 1186 } 1187 1188 /** 1189 * amdgpu_device_ip_wait_for_idle - wait for idle 1190 * 1191 * @adev: amdgpu_device pointer 1192 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1193 * 1194 * Waits for the request hardware IP to be idle. 
1195 * Returns 0 for success or a negative error code on failure. 1196 */ 1197 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1198 enum amd_ip_block_type block_type) 1199 { 1200 int i, r; 1201 1202 for (i = 0; i < adev->num_ip_blocks; i++) { 1203 if (!adev->ip_blocks[i].status.valid) 1204 continue; 1205 if (adev->ip_blocks[i].version->type == block_type) { 1206 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1207 if (r) 1208 return r; 1209 break; 1210 } 1211 } 1212 return 0; 1213 1214 } 1215 1216 /** 1217 * amdgpu_device_ip_is_idle - is the hardware IP idle 1218 * 1219 * @adev: amdgpu_device pointer 1220 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1221 * 1222 * Check if the hardware IP is idle or not. 1223 * Returns true if it the IP is idle, false if not. 1224 */ 1225 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1226 enum amd_ip_block_type block_type) 1227 { 1228 int i; 1229 1230 for (i = 0; i < adev->num_ip_blocks; i++) { 1231 if (!adev->ip_blocks[i].status.valid) 1232 continue; 1233 if (adev->ip_blocks[i].version->type == block_type) 1234 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1235 } 1236 return true; 1237 1238 } 1239 1240 /** 1241 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1242 * 1243 * @adev: amdgpu_device pointer 1244 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1245 * 1246 * Returns a pointer to the hardware IP block structure 1247 * if it exists for the asic, otherwise NULL. 1248 */ 1249 struct amdgpu_ip_block * 1250 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1251 enum amd_ip_block_type type) 1252 { 1253 int i; 1254 1255 for (i = 0; i < adev->num_ip_blocks; i++) 1256 if (adev->ip_blocks[i].version->type == type) 1257 return &adev->ip_blocks[i]; 1258 1259 return NULL; 1260 } 1261 1262 /** 1263 * amdgpu_device_ip_block_version_cmp 1264 * 1265 * @adev: amdgpu_device pointer 1266 * @type: enum amd_ip_block_type 1267 * @major: major version 1268 * @minor: minor version 1269 * 1270 * return 0 if equal or greater 1271 * return 1 if smaller or the ip_block doesn't exist 1272 */ 1273 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1274 enum amd_ip_block_type type, 1275 u32 major, u32 minor) 1276 { 1277 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1278 1279 if (ip_block && ((ip_block->version->major > major) || 1280 ((ip_block->version->major == major) && 1281 (ip_block->version->minor >= minor)))) 1282 return 0; 1283 1284 return 1; 1285 } 1286 1287 /** 1288 * amdgpu_device_ip_block_add 1289 * 1290 * @adev: amdgpu_device pointer 1291 * @ip_block_version: pointer to the IP to add 1292 * 1293 * Adds the IP block driver information to the collection of IPs 1294 * on the asic. 1295 */ 1296 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1297 const struct amdgpu_ip_block_version *ip_block_version) 1298 { 1299 if (!ip_block_version) 1300 return -EINVAL; 1301 1302 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1303 ip_block_version->funcs->name); 1304 1305 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1306 1307 return 0; 1308 } 1309 1310 /** 1311 * amdgpu_device_enable_virtual_display - enable virtual display feature 1312 * 1313 * @adev: amdgpu_device pointer 1314 * 1315 * Enabled the virtual display feature if the user has enabled it via 1316 * the module parameter virtual_display. 
This feature provides a virtual 1317 * display hardware on headless boards or in virtualized environments. 1318 * This function parses and validates the configuration string specified by 1319 * the user and configues the virtual display configuration (number of 1320 * virtual connectors, crtcs, etc.) specified. 1321 */ 1322 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1323 { 1324 adev->enable_virtual_display = false; 1325 1326 if (amdgpu_virtual_display) { 1327 struct drm_device *ddev = adev->ddev; 1328 const char *pci_address_name = pci_name(ddev->pdev); 1329 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1330 1331 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1332 pciaddstr_tmp = pciaddstr; 1333 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1334 pciaddname = strsep(&pciaddname_tmp, ","); 1335 if (!strcmp("all", pciaddname) 1336 || !strcmp(pci_address_name, pciaddname)) { 1337 long num_crtc; 1338 int res = -1; 1339 1340 adev->enable_virtual_display = true; 1341 1342 if (pciaddname_tmp) 1343 res = kstrtol(pciaddname_tmp, 10, 1344 &num_crtc); 1345 1346 if (!res) { 1347 if (num_crtc < 1) 1348 num_crtc = 1; 1349 if (num_crtc > 6) 1350 num_crtc = 6; 1351 adev->mode_info.num_crtc = num_crtc; 1352 } else { 1353 adev->mode_info.num_crtc = 1; 1354 } 1355 break; 1356 } 1357 } 1358 1359 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1360 amdgpu_virtual_display, pci_address_name, 1361 adev->enable_virtual_display, adev->mode_info.num_crtc); 1362 1363 kfree(pciaddstr); 1364 } 1365 } 1366 1367 /** 1368 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1369 * 1370 * @adev: amdgpu_device pointer 1371 * 1372 * Parses the asic configuration parameters specified in the gpu info 1373 * firmware and makes them availale to the driver for use in configuring 1374 * the asic. 1375 * Returns 0 on success, -EINVAL on failure. 
1376 */ 1377 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1378 { 1379 const char *chip_name; 1380 char fw_name[30]; 1381 int err; 1382 const struct gpu_info_firmware_header_v1_0 *hdr; 1383 1384 adev->firmware.gpu_info_fw = NULL; 1385 1386 switch (adev->asic_type) { 1387 case CHIP_TOPAZ: 1388 case CHIP_TONGA: 1389 case CHIP_FIJI: 1390 case CHIP_POLARIS10: 1391 case CHIP_POLARIS11: 1392 case CHIP_POLARIS12: 1393 case CHIP_VEGAM: 1394 case CHIP_CARRIZO: 1395 case CHIP_STONEY: 1396 #ifdef CONFIG_DRM_AMDGPU_SI 1397 case CHIP_VERDE: 1398 case CHIP_TAHITI: 1399 case CHIP_PITCAIRN: 1400 case CHIP_OLAND: 1401 case CHIP_HAINAN: 1402 #endif 1403 #ifdef CONFIG_DRM_AMDGPU_CIK 1404 case CHIP_BONAIRE: 1405 case CHIP_HAWAII: 1406 case CHIP_KAVERI: 1407 case CHIP_KABINI: 1408 case CHIP_MULLINS: 1409 #endif 1410 case CHIP_VEGA20: 1411 default: 1412 return 0; 1413 case CHIP_VEGA10: 1414 chip_name = "vega10"; 1415 break; 1416 case CHIP_VEGA12: 1417 chip_name = "vega12"; 1418 break; 1419 case CHIP_RAVEN: 1420 if (adev->rev_id >= 8) 1421 chip_name = "raven2"; 1422 else if (adev->pdev->device == 0x15d8) 1423 chip_name = "picasso"; 1424 else 1425 chip_name = "raven"; 1426 break; 1427 case CHIP_ARCTURUS: 1428 chip_name = "arcturus"; 1429 break; 1430 case CHIP_NAVI10: 1431 chip_name = "navi10"; 1432 break; 1433 case CHIP_NAVI14: 1434 chip_name = "navi14"; 1435 break; 1436 case CHIP_NAVI12: 1437 chip_name = "navi12"; 1438 break; 1439 } 1440 1441 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 1442 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 1443 if (err) { 1444 dev_err(adev->dev, 1445 "Failed to load gpu_info firmware \"%s\"\n", 1446 fw_name); 1447 goto out; 1448 } 1449 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 1450 if (err) { 1451 dev_err(adev->dev, 1452 "Failed to validate gpu_info firmware \"%s\"\n", 1453 fw_name); 1454 goto out; 1455 } 1456 1457 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1458 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1459 1460 switch (hdr->version_major) { 1461 case 1: 1462 { 1463 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1464 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1465 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1466 1467 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1468 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1469 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1470 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1471 adev->gfx.config.max_texture_channel_caches = 1472 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1473 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1474 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1475 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1476 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1477 adev->gfx.config.double_offchip_lds_buf = 1478 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1479 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1480 adev->gfx.cu_info.max_waves_per_simd = 1481 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1482 adev->gfx.cu_info.max_scratch_slots_per_cu = 1483 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1484 adev->gfx.cu_info.lds_size = 
le32_to_cpu(gpu_info_fw->gc_lds_size); 1485 if (hdr->version_minor >= 1) { 1486 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1487 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1488 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1489 adev->gfx.config.num_sc_per_sh = 1490 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1491 adev->gfx.config.num_packer_per_sc = 1492 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1493 } 1494 #ifdef CONFIG_DRM_AMD_DC_DCN2_0 1495 if (hdr->version_minor == 2) { 1496 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1497 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1498 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1499 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1500 } 1501 #endif 1502 break; 1503 } 1504 default: 1505 dev_err(adev->dev, 1506 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1507 err = -EINVAL; 1508 goto out; 1509 } 1510 out: 1511 return err; 1512 } 1513 1514 /** 1515 * amdgpu_device_ip_early_init - run early init for hardware IPs 1516 * 1517 * @adev: amdgpu_device pointer 1518 * 1519 * Early initialization pass for hardware IPs. The hardware IPs that make 1520 * up each asic are discovered each IP's early_init callback is run. This 1521 * is the first stage in initializing the asic. 1522 * Returns 0 on success, negative error code on failure. 1523 */ 1524 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1525 { 1526 int i, r; 1527 1528 amdgpu_device_enable_virtual_display(adev); 1529 1530 switch (adev->asic_type) { 1531 case CHIP_TOPAZ: 1532 case CHIP_TONGA: 1533 case CHIP_FIJI: 1534 case CHIP_POLARIS10: 1535 case CHIP_POLARIS11: 1536 case CHIP_POLARIS12: 1537 case CHIP_VEGAM: 1538 case CHIP_CARRIZO: 1539 case CHIP_STONEY: 1540 if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY) 1541 adev->family = AMDGPU_FAMILY_CZ; 1542 else 1543 adev->family = AMDGPU_FAMILY_VI; 1544 1545 r = vi_set_ip_blocks(adev); 1546 if (r) 1547 return r; 1548 break; 1549 #ifdef CONFIG_DRM_AMDGPU_SI 1550 case CHIP_VERDE: 1551 case CHIP_TAHITI: 1552 case CHIP_PITCAIRN: 1553 case CHIP_OLAND: 1554 case CHIP_HAINAN: 1555 adev->family = AMDGPU_FAMILY_SI; 1556 r = si_set_ip_blocks(adev); 1557 if (r) 1558 return r; 1559 break; 1560 #endif 1561 #ifdef CONFIG_DRM_AMDGPU_CIK 1562 case CHIP_BONAIRE: 1563 case CHIP_HAWAII: 1564 case CHIP_KAVERI: 1565 case CHIP_KABINI: 1566 case CHIP_MULLINS: 1567 if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII)) 1568 adev->family = AMDGPU_FAMILY_CI; 1569 else 1570 adev->family = AMDGPU_FAMILY_KV; 1571 1572 r = cik_set_ip_blocks(adev); 1573 if (r) 1574 return r; 1575 break; 1576 #endif 1577 case CHIP_VEGA10: 1578 case CHIP_VEGA12: 1579 case CHIP_VEGA20: 1580 case CHIP_RAVEN: 1581 case CHIP_ARCTURUS: 1582 if (adev->asic_type == CHIP_RAVEN) 1583 adev->family = AMDGPU_FAMILY_RV; 1584 else 1585 adev->family = AMDGPU_FAMILY_AI; 1586 1587 r = soc15_set_ip_blocks(adev); 1588 if (r) 1589 return r; 1590 break; 1591 case CHIP_NAVI10: 1592 case CHIP_NAVI14: 1593 case CHIP_NAVI12: 1594 adev->family = AMDGPU_FAMILY_NV; 1595 1596 r = nv_set_ip_blocks(adev); 1597 if (r) 1598 return r; 1599 break; 1600 default: 1601 /* FIXME: not supported yet */ 1602 return -EINVAL; 1603 } 1604 1605 r = amdgpu_device_parse_gpu_info_fw(adev); 1606 if (r) 1607 return r; 1608 1609 amdgpu_amdkfd_device_probe(adev); 1610 1611 if (amdgpu_sriov_vf(adev)) { 1612 r = amdgpu_virt_request_full_gpu(adev, true); 1613 if (r) 1614 return 
-EAGAIN; 1615 } 1616 1617 adev->pm.pp_feature = amdgpu_pp_feature_mask; 1618 if (amdgpu_sriov_vf(adev)) 1619 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 1620 1621 for (i = 0; i < adev->num_ip_blocks; i++) { 1622 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 1623 DRM_ERROR("disabled ip block: %d <%s>\n", 1624 i, adev->ip_blocks[i].version->funcs->name); 1625 adev->ip_blocks[i].status.valid = false; 1626 } else { 1627 if (adev->ip_blocks[i].version->funcs->early_init) { 1628 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 1629 if (r == -ENOENT) { 1630 adev->ip_blocks[i].status.valid = false; 1631 } else if (r) { 1632 DRM_ERROR("early_init of IP block <%s> failed %d\n", 1633 adev->ip_blocks[i].version->funcs->name, r); 1634 return r; 1635 } else { 1636 adev->ip_blocks[i].status.valid = true; 1637 } 1638 } else { 1639 adev->ip_blocks[i].status.valid = true; 1640 } 1641 } 1642 /* get the vbios after the asic_funcs are set up */ 1643 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 1644 /* Read BIOS */ 1645 if (!amdgpu_get_bios(adev)) 1646 return -EINVAL; 1647 1648 r = amdgpu_atombios_init(adev); 1649 if (r) { 1650 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 1651 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 1652 return r; 1653 } 1654 } 1655 } 1656 1657 adev->cg_flags &= amdgpu_cg_mask; 1658 adev->pg_flags &= amdgpu_pg_mask; 1659 1660 return 0; 1661 } 1662 1663 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 1664 { 1665 int i, r; 1666 1667 for (i = 0; i < adev->num_ip_blocks; i++) { 1668 if (!adev->ip_blocks[i].status.sw) 1669 continue; 1670 if (adev->ip_blocks[i].status.hw) 1671 continue; 1672 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 1673 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 1674 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 1675 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1676 if (r) { 1677 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1678 adev->ip_blocks[i].version->funcs->name, r); 1679 return r; 1680 } 1681 adev->ip_blocks[i].status.hw = true; 1682 } 1683 } 1684 1685 return 0; 1686 } 1687 1688 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 1689 { 1690 int i, r; 1691 1692 for (i = 0; i < adev->num_ip_blocks; i++) { 1693 if (!adev->ip_blocks[i].status.sw) 1694 continue; 1695 if (adev->ip_blocks[i].status.hw) 1696 continue; 1697 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1698 if (r) { 1699 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1700 adev->ip_blocks[i].version->funcs->name, r); 1701 return r; 1702 } 1703 adev->ip_blocks[i].status.hw = true; 1704 } 1705 1706 return 0; 1707 } 1708 1709 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 1710 { 1711 int r = 0; 1712 int i; 1713 uint32_t smu_version; 1714 1715 if (adev->asic_type >= CHIP_VEGA10) { 1716 for (i = 0; i < adev->num_ip_blocks; i++) { 1717 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 1718 continue; 1719 1720 /* no need to do the fw loading again if already done*/ 1721 if (adev->ip_blocks[i].status.hw == true) 1722 break; 1723 1724 if (adev->in_gpu_reset || adev->in_suspend) { 1725 r = adev->ip_blocks[i].version->funcs->resume(adev); 1726 if (r) { 1727 DRM_ERROR("resume of IP block <%s> failed %d\n", 1728 adev->ip_blocks[i].version->funcs->name, r); 1729 return r; 1730 } 1731 } else { 1732 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 1733 if (r) { 1734 
DRM_ERROR("hw_init of IP block <%s> failed %d\n", 1735 adev->ip_blocks[i].version->funcs->name, r); 1736 return r; 1737 } 1738 } 1739 1740 adev->ip_blocks[i].status.hw = true; 1741 break; 1742 } 1743 } 1744 1745 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 1746 1747 return r; 1748 } 1749 1750 /** 1751 * amdgpu_device_ip_init - run init for hardware IPs 1752 * 1753 * @adev: amdgpu_device pointer 1754 * 1755 * Main initialization pass for hardware IPs. The list of all the hardware 1756 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 1757 * are run. sw_init initializes the software state associated with each IP 1758 * and hw_init initializes the hardware associated with each IP. 1759 * Returns 0 on success, negative error code on failure. 1760 */ 1761 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 1762 { 1763 int i, r; 1764 1765 r = amdgpu_ras_init(adev); 1766 if (r) 1767 return r; 1768 1769 for (i = 0; i < adev->num_ip_blocks; i++) { 1770 if (!adev->ip_blocks[i].status.valid) 1771 continue; 1772 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 1773 if (r) { 1774 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 1775 adev->ip_blocks[i].version->funcs->name, r); 1776 goto init_failed; 1777 } 1778 adev->ip_blocks[i].status.sw = true; 1779 1780 /* need to do gmc hw init early so we can allocate gpu mem */ 1781 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 1782 r = amdgpu_device_vram_scratch_init(adev); 1783 if (r) { 1784 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 1785 goto init_failed; 1786 } 1787 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 1788 if (r) { 1789 DRM_ERROR("hw_init %d failed %d\n", i, r); 1790 goto init_failed; 1791 } 1792 r = amdgpu_device_wb_init(adev); 1793 if (r) { 1794 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 1795 goto init_failed; 1796 } 1797 adev->ip_blocks[i].status.hw = true; 1798 1799 /* right after GMC hw init, we create CSA */ 1800 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 1801 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 1802 AMDGPU_GEM_DOMAIN_VRAM, 1803 AMDGPU_CSA_SIZE); 1804 if (r) { 1805 DRM_ERROR("allocate CSA failed %d\n", r); 1806 goto init_failed; 1807 } 1808 } 1809 } 1810 } 1811 1812 r = amdgpu_ib_pool_init(adev); 1813 if (r) { 1814 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 1815 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 1816 goto init_failed; 1817 } 1818 1819 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 1820 if (r) 1821 goto init_failed; 1822 1823 r = amdgpu_device_ip_hw_init_phase1(adev); 1824 if (r) 1825 goto init_failed; 1826 1827 r = amdgpu_device_fw_loading(adev); 1828 if (r) 1829 goto init_failed; 1830 1831 r = amdgpu_device_ip_hw_init_phase2(adev); 1832 if (r) 1833 goto init_failed; 1834 1835 if (adev->gmc.xgmi.num_physical_nodes > 1) 1836 amdgpu_xgmi_add_device(adev); 1837 amdgpu_amdkfd_device_init(adev); 1838 1839 init_failed: 1840 if (amdgpu_sriov_vf(adev)) { 1841 if (!r) 1842 amdgpu_virt_init_data_exchange(adev); 1843 amdgpu_virt_release_full_gpu(adev, true); 1844 } 1845 1846 return r; 1847 } 1848 1849 /** 1850 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 1851 * 1852 * @adev: amdgpu_device pointer 1853 * 1854 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 1855 * this function before a GPU reset. If the value is retained after a 1856 * GPU reset, VRAM has not been lost. 
Some GPU resets may destry VRAM contents. 1857 */ 1858 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 1859 { 1860 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 1861 } 1862 1863 /** 1864 * amdgpu_device_check_vram_lost - check if vram is valid 1865 * 1866 * @adev: amdgpu_device pointer 1867 * 1868 * Checks the reset magic value written to the gart pointer in VRAM. 1869 * The driver calls this after a GPU reset to see if the contents of 1870 * VRAM is lost or now. 1871 * returns true if vram is lost, false if not. 1872 */ 1873 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 1874 { 1875 return !!memcmp(adev->gart.ptr, adev->reset_magic, 1876 AMDGPU_RESET_MAGIC_NUM); 1877 } 1878 1879 /** 1880 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 1881 * 1882 * @adev: amdgpu_device pointer 1883 * 1884 * The list of all the hardware IPs that make up the asic is walked and the 1885 * set_clockgating_state callbacks are run. 1886 * Late initialization pass enabling clockgating for hardware IPs. 1887 * Fini or suspend, pass disabling clockgating for hardware IPs. 1888 * Returns 0 on success, negative error code on failure. 1889 */ 1890 1891 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 1892 enum amd_clockgating_state state) 1893 { 1894 int i, j, r; 1895 1896 if (amdgpu_emu_mode == 1) 1897 return 0; 1898 1899 for (j = 0; j < adev->num_ip_blocks; j++) { 1900 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 1901 if (!adev->ip_blocks[i].status.late_initialized) 1902 continue; 1903 /* skip CG for VCE/UVD, it's handled specially */ 1904 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 1905 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 1906 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 1907 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 1908 /* enable clockgating to save power */ 1909 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 1910 state); 1911 if (r) { 1912 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 1913 adev->ip_blocks[i].version->funcs->name, r); 1914 return r; 1915 } 1916 } 1917 } 1918 1919 return 0; 1920 } 1921 1922 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state) 1923 { 1924 int i, j, r; 1925 1926 if (amdgpu_emu_mode == 1) 1927 return 0; 1928 1929 for (j = 0; j < adev->num_ip_blocks; j++) { 1930 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 1931 if (!adev->ip_blocks[i].status.late_initialized) 1932 continue; 1933 /* skip CG for VCE/UVD, it's handled specially */ 1934 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 1935 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 1936 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 1937 adev->ip_blocks[i].version->funcs->set_powergating_state) { 1938 /* enable powergating to save power */ 1939 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 1940 state); 1941 if (r) { 1942 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 1943 adev->ip_blocks[i].version->funcs->name, r); 1944 return r; 1945 } 1946 } 1947 } 1948 return 0; 1949 } 1950 1951 static int amdgpu_device_enable_mgpu_fan_boost(void) 1952 { 1953 struct amdgpu_gpu_instance *gpu_ins; 1954 struct amdgpu_device *adev; 1955 int i, ret = 0; 1956 1957 mutex_lock(&mgpu_info.mutex); 1958 1959 /* 1960 * MGPU fan boost feature should be enabled 1961 * only when there are two or more dGPUs in 1962 * the system 1963 */ 1964 if (mgpu_info.num_dgpu < 2) 1965 goto out; 1966 1967 for (i = 0; i < mgpu_info.num_dgpu; i++) { 1968 gpu_ins = &(mgpu_info.gpu_ins[i]); 1969 adev = gpu_ins->adev; 1970 if (!(adev->flags & AMD_IS_APU) && 1971 !gpu_ins->mgpu_fan_enabled && 1972 adev->powerplay.pp_funcs && 1973 adev->powerplay.pp_funcs->enable_mgpu_fan_boost) { 1974 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 1975 if (ret) 1976 break; 1977 1978 gpu_ins->mgpu_fan_enabled = 1; 1979 } 1980 } 1981 1982 out: 1983 mutex_unlock(&mgpu_info.mutex); 1984 1985 return ret; 1986 } 1987 1988 /** 1989 * amdgpu_device_ip_late_init - run late init for hardware IPs 1990 * 1991 * @adev: amdgpu_device pointer 1992 * 1993 * Late initialization pass for hardware IPs. The list of all the hardware 1994 * IPs that make up the asic is walked and the late_init callbacks are run. 1995 * late_init covers any special initialization that an IP requires 1996 * after all of the have been initialized or something that needs to happen 1997 * late in the init process. 1998 * Returns 0 on success, negative error code on failure. 1999 */ 2000 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2001 { 2002 int i = 0, r; 2003 2004 for (i = 0; i < adev->num_ip_blocks; i++) { 2005 if (!adev->ip_blocks[i].status.hw) 2006 continue; 2007 if (adev->ip_blocks[i].version->funcs->late_init) { 2008 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2009 if (r) { 2010 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2011 adev->ip_blocks[i].version->funcs->name, r); 2012 return r; 2013 } 2014 } 2015 adev->ip_blocks[i].status.late_initialized = true; 2016 } 2017 2018 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2019 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2020 2021 amdgpu_device_fill_reset_magic(adev); 2022 2023 r = amdgpu_device_enable_mgpu_fan_boost(); 2024 if (r) 2025 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2026 2027 /* set to low pstate by default */ 2028 amdgpu_xgmi_set_pstate(adev, 0); 2029 2030 return 0; 2031 } 2032 2033 /** 2034 * amdgpu_device_ip_fini - run fini for hardware IPs 2035 * 2036 * @adev: amdgpu_device pointer 2037 * 2038 * Main teardown pass for hardware IPs. The list of all the hardware 2039 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2040 * are run. 
hw_fini tears down the hardware associated with each IP 2041 * and sw_fini tears down any software state associated with each IP. 2042 * Returns 0 on success, negative error code on failure. 2043 */ 2044 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2045 { 2046 int i, r; 2047 2048 amdgpu_ras_pre_fini(adev); 2049 2050 if (adev->gmc.xgmi.num_physical_nodes > 1) 2051 amdgpu_xgmi_remove_device(adev); 2052 2053 amdgpu_amdkfd_device_fini(adev); 2054 2055 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2056 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2057 2058 /* need to disable SMC first */ 2059 for (i = 0; i < adev->num_ip_blocks; i++) { 2060 if (!adev->ip_blocks[i].status.hw) 2061 continue; 2062 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2063 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2064 /* XXX handle errors */ 2065 if (r) { 2066 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2067 adev->ip_blocks[i].version->funcs->name, r); 2068 } 2069 adev->ip_blocks[i].status.hw = false; 2070 break; 2071 } 2072 } 2073 2074 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2075 if (!adev->ip_blocks[i].status.hw) 2076 continue; 2077 2078 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2079 /* XXX handle errors */ 2080 if (r) { 2081 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2082 adev->ip_blocks[i].version->funcs->name, r); 2083 } 2084 2085 adev->ip_blocks[i].status.hw = false; 2086 } 2087 2088 2089 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2090 if (!adev->ip_blocks[i].status.sw) 2091 continue; 2092 2093 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2094 amdgpu_ucode_free_bo(adev); 2095 amdgpu_free_static_csa(&adev->virt.csa_obj); 2096 amdgpu_device_wb_fini(adev); 2097 amdgpu_device_vram_scratch_fini(adev); 2098 amdgpu_ib_pool_fini(adev); 2099 } 2100 2101 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2102 /* XXX handle errors */ 2103 if (r) { 2104 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2105 adev->ip_blocks[i].version->funcs->name, r); 2106 } 2107 adev->ip_blocks[i].status.sw = false; 2108 adev->ip_blocks[i].status.valid = false; 2109 } 2110 2111 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2112 if (!adev->ip_blocks[i].status.late_initialized) 2113 continue; 2114 if (adev->ip_blocks[i].version->funcs->late_fini) 2115 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2116 adev->ip_blocks[i].status.late_initialized = false; 2117 } 2118 2119 amdgpu_ras_fini(adev); 2120 2121 if (amdgpu_sriov_vf(adev)) 2122 if (amdgpu_virt_release_full_gpu(adev, false)) 2123 DRM_ERROR("failed to release exclusive mode on fini\n"); 2124 2125 return 0; 2126 } 2127 2128 /** 2129 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2130 * 2131 * @work: work_struct. 
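 * Runs the deferred IB ring tests; the work is queued from amdgpu_device_init() and amdgpu_device_resume() with an AMDGPU_RESUME_MS delay.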
2132 */ 2133 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2134 { 2135 struct amdgpu_device *adev = 2136 container_of(work, struct amdgpu_device, delayed_init_work.work); 2137 int r; 2138 2139 r = amdgpu_ib_ring_tests(adev); 2140 if (r) 2141 DRM_ERROR("ib ring test failed (%d).\n", r); 2142 } 2143 2144 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2145 { 2146 struct amdgpu_device *adev = 2147 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2148 2149 mutex_lock(&adev->gfx.gfx_off_mutex); 2150 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) { 2151 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2152 adev->gfx.gfx_off_state = true; 2153 } 2154 mutex_unlock(&adev->gfx.gfx_off_mutex); 2155 } 2156 2157 /** 2158 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2159 * 2160 * @adev: amdgpu_device pointer 2161 * 2162 * Main suspend function for hardware IPs. The list of all the hardware 2163 * IPs that make up the asic is walked, clockgating is disabled and the 2164 * suspend callbacks are run. suspend puts the hardware and software state 2165 * in each IP into a state suitable for suspend. 2166 * Returns 0 on success, negative error code on failure. 2167 */ 2168 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2169 { 2170 int i, r; 2171 2172 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2173 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2174 2175 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2176 if (!adev->ip_blocks[i].status.valid) 2177 continue; 2178 /* displays are handled separately */ 2179 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 2180 /* XXX handle errors */ 2181 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2182 /* XXX handle errors */ 2183 if (r) { 2184 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2185 adev->ip_blocks[i].version->funcs->name, r); 2186 return r; 2187 } 2188 adev->ip_blocks[i].status.hw = false; 2189 } 2190 } 2191 2192 return 0; 2193 } 2194 2195 /** 2196 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2197 * 2198 * @adev: amdgpu_device pointer 2199 * 2200 * Main suspend function for hardware IPs. The list of all the hardware 2201 * IPs that make up the asic is walked, clockgating is disabled and the 2202 * suspend callbacks are run. suspend puts the hardware and software state 2203 * in each IP into a state suitable for suspend. 2204 * Returns 0 on success, negative error code on failure. 
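 * Phase 2 covers every block except the display (DCE) blocks, which are already suspended in phase 1.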
2205 */ 2206 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2207 { 2208 int i, r; 2209 2210 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2211 if (!adev->ip_blocks[i].status.valid) 2212 continue; 2213 /* displays are handled in phase1 */ 2214 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2215 continue; 2216 /* XXX handle errors */ 2217 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2218 /* XXX handle errors */ 2219 if (r) { 2220 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2221 adev->ip_blocks[i].version->funcs->name, r); 2222 } 2223 adev->ip_blocks[i].status.hw = false; 2224 /* handle putting the SMC in the appropriate state */ 2225 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2226 if (is_support_sw_smu(adev)) { 2227 /* todo */ 2228 } else if (adev->powerplay.pp_funcs && 2229 adev->powerplay.pp_funcs->set_mp1_state) { 2230 r = adev->powerplay.pp_funcs->set_mp1_state( 2231 adev->powerplay.pp_handle, 2232 adev->mp1_state); 2233 if (r) { 2234 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2235 adev->mp1_state, r); 2236 return r; 2237 } 2238 } 2239 } 2240 2241 adev->ip_blocks[i].status.hw = false; 2242 } 2243 2244 return 0; 2245 } 2246 2247 /** 2248 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2249 * 2250 * @adev: amdgpu_device pointer 2251 * 2252 * Main suspend function for hardware IPs. The list of all the hardware 2253 * IPs that make up the asic is walked, clockgating is disabled and the 2254 * suspend callbacks are run. suspend puts the hardware and software state 2255 * in each IP into a state suitable for suspend. 2256 * Returns 0 on success, negative error code on failure. 2257 */ 2258 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 2259 { 2260 int r; 2261 2262 if (amdgpu_sriov_vf(adev)) 2263 amdgpu_virt_request_full_gpu(adev, false); 2264 2265 r = amdgpu_device_ip_suspend_phase1(adev); 2266 if (r) 2267 return r; 2268 r = amdgpu_device_ip_suspend_phase2(adev); 2269 2270 if (amdgpu_sriov_vf(adev)) 2271 amdgpu_virt_release_full_gpu(adev, false); 2272 2273 return r; 2274 } 2275 2276 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 2277 { 2278 int i, r; 2279 2280 static enum amd_ip_block_type ip_order[] = { 2281 AMD_IP_BLOCK_TYPE_GMC, 2282 AMD_IP_BLOCK_TYPE_COMMON, 2283 AMD_IP_BLOCK_TYPE_PSP, 2284 AMD_IP_BLOCK_TYPE_IH, 2285 }; 2286 2287 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2288 int j; 2289 struct amdgpu_ip_block *block; 2290 2291 for (j = 0; j < adev->num_ip_blocks; j++) { 2292 block = &adev->ip_blocks[j]; 2293 2294 block->status.hw = false; 2295 if (block->version->type != ip_order[i] || 2296 !block->status.valid) 2297 continue; 2298 2299 r = block->version->funcs->hw_init(adev); 2300 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2301 if (r) 2302 return r; 2303 block->status.hw = true; 2304 } 2305 } 2306 2307 return 0; 2308 } 2309 2310 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 2311 { 2312 int i, r; 2313 2314 static enum amd_ip_block_type ip_order[] = { 2315 AMD_IP_BLOCK_TYPE_SMC, 2316 AMD_IP_BLOCK_TYPE_DCE, 2317 AMD_IP_BLOCK_TYPE_GFX, 2318 AMD_IP_BLOCK_TYPE_SDMA, 2319 AMD_IP_BLOCK_TYPE_UVD, 2320 AMD_IP_BLOCK_TYPE_VCE 2321 }; 2322 2323 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 2324 int j; 2325 struct amdgpu_ip_block *block; 2326 2327 for (j = 0; j < adev->num_ip_blocks; j++) { 2328 block = &adev->ip_blocks[j]; 2329 2330 if (block->version->type != ip_order[i] || 2331 
!block->status.valid || 2332 block->status.hw) 2333 continue; 2334 2335 r = block->version->funcs->hw_init(adev); 2336 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 2337 if (r) 2338 return r; 2339 block->status.hw = true; 2340 } 2341 } 2342 2343 return 0; 2344 } 2345 2346 /** 2347 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 2348 * 2349 * @adev: amdgpu_device pointer 2350 * 2351 * First resume function for hardware IPs. The list of all the hardware 2352 * IPs that make up the asic is walked and the resume callbacks are run for 2353 * COMMON, GMC, and IH. resume puts the hardware into a functional state 2354 * after a suspend and updates the software state as necessary. This 2355 * function is also used for restoring the GPU after a GPU reset. 2356 * Returns 0 on success, negative error code on failure. 2357 */ 2358 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 2359 { 2360 int i, r; 2361 2362 for (i = 0; i < adev->num_ip_blocks; i++) { 2363 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2364 continue; 2365 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2366 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2367 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2368 2369 r = adev->ip_blocks[i].version->funcs->resume(adev); 2370 if (r) { 2371 DRM_ERROR("resume of IP block <%s> failed %d\n", 2372 adev->ip_blocks[i].version->funcs->name, r); 2373 return r; 2374 } 2375 adev->ip_blocks[i].status.hw = true; 2376 } 2377 } 2378 2379 return 0; 2380 } 2381 2382 /** 2383 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 2384 * 2385 * @adev: amdgpu_device pointer 2386 * 2387 * Second resume function for hardware IPs. The list of all the hardware 2388 * IPs that make up the asic is walked and the resume callbacks are run for 2389 * all blocks except COMMON, GMC, IH, and PSP. resume puts the hardware into a 2390 * functional state after a suspend and updates the software state as 2391 * necessary. This function is also used for restoring the GPU after a GPU 2392 * reset. 2393 * Returns 0 on success, negative error code on failure. 2394 */ 2395 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 2396 { 2397 int i, r; 2398 2399 for (i = 0; i < adev->num_ip_blocks; i++) { 2400 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 2401 continue; 2402 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2403 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2404 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 2405 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 2406 continue; 2407 r = adev->ip_blocks[i].version->funcs->resume(adev); 2408 if (r) { 2409 DRM_ERROR("resume of IP block <%s> failed %d\n", 2410 adev->ip_blocks[i].version->funcs->name, r); 2411 return r; 2412 } 2413 adev->ip_blocks[i].status.hw = true; 2414 } 2415 2416 return 0; 2417 } 2418 2419 /** 2420 * amdgpu_device_ip_resume - run resume for hardware IPs 2421 * 2422 * @adev: amdgpu_device pointer 2423 * 2424 * Main resume function for hardware IPs. The hardware IPs 2425 * are split into two resume functions because they are 2426 * also used in recovering from a GPU reset and some additional 2427 * steps need to be taken between them. In this case (S3/S4) they are 2428 * run sequentially. 2429 * Returns 0 on success, negative error code on failure.
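 * Firmware loading (amdgpu_device_fw_loading()) runs between the two phases, mirroring the hardware init path.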
2430 */ 2431 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 2432 { 2433 int r; 2434 2435 r = amdgpu_device_ip_resume_phase1(adev); 2436 if (r) 2437 return r; 2438 2439 r = amdgpu_device_fw_loading(adev); 2440 if (r) 2441 return r; 2442 2443 r = amdgpu_device_ip_resume_phase2(adev); 2444 2445 return r; 2446 } 2447 2448 /** 2449 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 2450 * 2451 * @adev: amdgpu_device pointer 2452 * 2453 * Query the VBIOS data tables to determine if the board supports SR-IOV. 2454 */ 2455 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 2456 { 2457 if (amdgpu_sriov_vf(adev)) { 2458 if (adev->is_atom_fw) { 2459 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) 2460 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2461 } else { 2462 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 2463 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 2464 } 2465 2466 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 2467 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 2468 } 2469 } 2470 2471 /** 2472 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 2473 * 2474 * @asic_type: AMD asic type 2475 * 2476 * Check if there is DC (new modesetting infrastructre) support for an asic. 2477 * returns true if DC has support, false if not. 2478 */ 2479 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 2480 { 2481 switch (asic_type) { 2482 #if defined(CONFIG_DRM_AMD_DC) 2483 case CHIP_BONAIRE: 2484 case CHIP_KAVERI: 2485 case CHIP_KABINI: 2486 case CHIP_MULLINS: 2487 /* 2488 * We have systems in the wild with these ASICs that require 2489 * LVDS and VGA support which is not supported with DC. 2490 * 2491 * Fallback to the non-DC driver here by default so as not to 2492 * cause regressions. 2493 */ 2494 return amdgpu_dc > 0; 2495 case CHIP_HAWAII: 2496 case CHIP_CARRIZO: 2497 case CHIP_STONEY: 2498 case CHIP_POLARIS10: 2499 case CHIP_POLARIS11: 2500 case CHIP_POLARIS12: 2501 case CHIP_VEGAM: 2502 case CHIP_TONGA: 2503 case CHIP_FIJI: 2504 case CHIP_VEGA10: 2505 case CHIP_VEGA12: 2506 case CHIP_VEGA20: 2507 #if defined(CONFIG_DRM_AMD_DC_DCN1_0) 2508 case CHIP_RAVEN: 2509 #endif 2510 #if defined(CONFIG_DRM_AMD_DC_DCN2_0) 2511 case CHIP_NAVI10: 2512 case CHIP_NAVI14: 2513 case CHIP_NAVI12: 2514 #endif 2515 return amdgpu_dc != 0; 2516 #endif 2517 default: 2518 return false; 2519 } 2520 } 2521 2522 /** 2523 * amdgpu_device_has_dc_support - check if dc is supported 2524 * 2525 * @adev: amdgpu_device_pointer 2526 * 2527 * Returns true for supported, false for not supported 2528 */ 2529 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 2530 { 2531 if (amdgpu_sriov_vf(adev)) 2532 return false; 2533 2534 return amdgpu_device_asic_has_dc_support(adev->asic_type); 2535 } 2536 2537 2538 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 2539 { 2540 struct amdgpu_device *adev = 2541 container_of(__work, struct amdgpu_device, xgmi_reset_work); 2542 2543 adev->asic_reset_res = amdgpu_asic_reset(adev); 2544 if (adev->asic_reset_res) 2545 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 2546 adev->asic_reset_res, adev->ddev->unique); 2547 } 2548 2549 2550 /** 2551 * amdgpu_device_init - initialize the driver 2552 * 2553 * @adev: amdgpu_device pointer 2554 * @ddev: drm dev pointer 2555 * @pdev: pci dev pointer 2556 * @flags: driver flags 2557 * 2558 * Initializes the driver info and hw (all asics). 
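 * Maps the MMIO and doorbell BARs, sets up locks and work items, posts the asic if necessary and brings up all IP blocks.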
2559 * Returns 0 for success or an error on failure. 2560 * Called at driver startup. 2561 */ 2562 int amdgpu_device_init(struct amdgpu_device *adev, 2563 struct drm_device *ddev, 2564 struct pci_dev *pdev, 2565 uint32_t flags) 2566 { 2567 int r, i; 2568 bool runtime = false; 2569 u32 max_MBps; 2570 2571 adev->shutdown = false; 2572 adev->dev = &pdev->dev; 2573 adev->ddev = ddev; 2574 adev->pdev = pdev; 2575 adev->flags = flags; 2576 adev->asic_type = flags & AMD_ASIC_MASK; 2577 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 2578 if (amdgpu_emu_mode == 1) 2579 adev->usec_timeout *= 2; 2580 adev->gmc.gart_size = 512 * 1024 * 1024; 2581 adev->accel_working = false; 2582 adev->num_rings = 0; 2583 adev->mman.buffer_funcs = NULL; 2584 adev->mman.buffer_funcs_ring = NULL; 2585 adev->vm_manager.vm_pte_funcs = NULL; 2586 adev->vm_manager.vm_pte_num_rqs = 0; 2587 adev->gmc.gmc_funcs = NULL; 2588 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 2589 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 2590 2591 adev->smc_rreg = &amdgpu_invalid_rreg; 2592 adev->smc_wreg = &amdgpu_invalid_wreg; 2593 adev->pcie_rreg = &amdgpu_invalid_rreg; 2594 adev->pcie_wreg = &amdgpu_invalid_wreg; 2595 adev->pciep_rreg = &amdgpu_invalid_rreg; 2596 adev->pciep_wreg = &amdgpu_invalid_wreg; 2597 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 2598 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 2599 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 2600 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 2601 adev->didt_rreg = &amdgpu_invalid_rreg; 2602 adev->didt_wreg = &amdgpu_invalid_wreg; 2603 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 2604 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 2605 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 2606 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 2607 2608 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 2609 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 2610 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 2611 2612 /* mutex initialization are all done here so we 2613 * can recall function without having locking issues */ 2614 atomic_set(&adev->irq.ih.lock, 0); 2615 mutex_init(&adev->firmware.mutex); 2616 mutex_init(&adev->pm.mutex); 2617 mutex_init(&adev->gfx.gpu_clock_mutex); 2618 mutex_init(&adev->srbm_mutex); 2619 mutex_init(&adev->gfx.pipe_reserve_mutex); 2620 mutex_init(&adev->gfx.gfx_off_mutex); 2621 mutex_init(&adev->grbm_idx_mutex); 2622 mutex_init(&adev->mn_lock); 2623 mutex_init(&adev->virt.vf_errors.lock); 2624 hash_init(adev->mn_hash); 2625 mutex_init(&adev->lock_reset); 2626 mutex_init(&adev->virt.dpm_mutex); 2627 mutex_init(&adev->psp.mutex); 2628 2629 r = amdgpu_device_check_arguments(adev); 2630 if (r) 2631 return r; 2632 2633 spin_lock_init(&adev->mmio_idx_lock); 2634 spin_lock_init(&adev->smc_idx_lock); 2635 spin_lock_init(&adev->pcie_idx_lock); 2636 spin_lock_init(&adev->uvd_ctx_idx_lock); 2637 spin_lock_init(&adev->didt_idx_lock); 2638 spin_lock_init(&adev->gc_cac_idx_lock); 2639 spin_lock_init(&adev->se_cac_idx_lock); 2640 spin_lock_init(&adev->audio_endpt_idx_lock); 2641 spin_lock_init(&adev->mm_stats.lock); 2642 2643 INIT_LIST_HEAD(&adev->shadow_list); 2644 mutex_init(&adev->shadow_list_lock); 2645 2646 INIT_LIST_HEAD(&adev->ring_lru_list); 2647 spin_lock_init(&adev->ring_lru_list_lock); 2648 2649 INIT_DELAYED_WORK(&adev->delayed_init_work, 2650 amdgpu_device_delayed_init_work_handler); 2651 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 2652 
amdgpu_device_delay_enable_gfx_off); 2653 2654 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 2655 2656 adev->gfx.gfx_off_req_count = 1; 2657 adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; 2658 2659 /* Registers mapping */ 2660 /* TODO: block userspace mapping of io register */ 2661 if (adev->asic_type >= CHIP_BONAIRE) { 2662 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 2663 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 2664 } else { 2665 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 2666 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 2667 } 2668 2669 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 2670 if (adev->rmmio == NULL) { 2671 return -ENOMEM; 2672 } 2673 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 2674 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 2675 2676 /* io port mapping */ 2677 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { 2678 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) { 2679 adev->rio_mem_size = pci_resource_len(adev->pdev, i); 2680 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size); 2681 break; 2682 } 2683 } 2684 if (adev->rio_mem == NULL) 2685 DRM_INFO("PCI I/O BAR is not found.\n"); 2686 2687 /* enable PCIE atomic ops */ 2688 r = pci_enable_atomic_ops_to_root(adev->pdev, 2689 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 2690 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 2691 if (r) { 2692 adev->have_atomics_support = false; 2693 DRM_INFO("PCIE atomic ops is not supported\n"); 2694 } else { 2695 adev->have_atomics_support = true; 2696 } 2697 2698 amdgpu_device_get_pcie_info(adev); 2699 2700 if (amdgpu_mcbp) 2701 DRM_INFO("MCBP is enabled\n"); 2702 2703 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 2704 adev->enable_mes = true; 2705 2706 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) { 2707 r = amdgpu_discovery_init(adev); 2708 if (r) { 2709 dev_err(adev->dev, "amdgpu_discovery_init failed\n"); 2710 return r; 2711 } 2712 } 2713 2714 /* early init functions */ 2715 r = amdgpu_device_ip_early_init(adev); 2716 if (r) 2717 return r; 2718 2719 /* doorbell bar mapping and doorbell index init*/ 2720 amdgpu_device_doorbell_init(adev); 2721 2722 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 2723 /* this will fail for cards that aren't VGA class devices, just 2724 * ignore it */ 2725 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); 2726 2727 if (amdgpu_device_is_px(ddev)) 2728 runtime = true; 2729 if (!pci_is_thunderbolt_attached(adev->pdev)) 2730 vga_switcheroo_register_client(adev->pdev, 2731 &amdgpu_switcheroo_ops, runtime); 2732 if (runtime) 2733 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 2734 2735 if (amdgpu_emu_mode == 1) { 2736 /* post the asic on emulation mode */ 2737 emu_soc_asic_init(adev); 2738 goto fence_driver_init; 2739 } 2740 2741 /* detect if we are with an SRIOV vbios */ 2742 amdgpu_device_detect_sriov_bios(adev); 2743 2744 /* check if we need to reset the asic 2745 * E.g., driver was not cleanly unloaded previously, etc. 
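 * Only attempted on bare metal; SR-IOV VFs skip this reset.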
2746 */ 2747 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 2748 r = amdgpu_asic_reset(adev); 2749 if (r) { 2750 dev_err(adev->dev, "asic reset on init failed\n"); 2751 goto failed; 2752 } 2753 } 2754 2755 /* Post card if necessary */ 2756 if (amdgpu_device_need_post(adev)) { 2757 if (!adev->bios) { 2758 dev_err(adev->dev, "no vBIOS found\n"); 2759 r = -EINVAL; 2760 goto failed; 2761 } 2762 DRM_INFO("GPU posting now...\n"); 2763 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 2764 if (r) { 2765 dev_err(adev->dev, "gpu post error!\n"); 2766 goto failed; 2767 } 2768 } 2769 2770 if (adev->is_atom_fw) { 2771 /* Initialize clocks */ 2772 r = amdgpu_atomfirmware_get_clock_info(adev); 2773 if (r) { 2774 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 2775 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 2776 goto failed; 2777 } 2778 } else { 2779 /* Initialize clocks */ 2780 r = amdgpu_atombios_get_clock_info(adev); 2781 if (r) { 2782 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 2783 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 2784 goto failed; 2785 } 2786 /* init i2c buses */ 2787 if (!amdgpu_device_has_dc_support(adev)) 2788 amdgpu_atombios_i2c_init(adev); 2789 } 2790 2791 fence_driver_init: 2792 /* Fence driver */ 2793 r = amdgpu_fence_driver_init(adev); 2794 if (r) { 2795 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n"); 2796 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 2797 goto failed; 2798 } 2799 2800 /* init the mode config */ 2801 drm_mode_config_init(adev->ddev); 2802 2803 r = amdgpu_device_ip_init(adev); 2804 if (r) { 2805 /* failed in exclusive mode due to timeout */ 2806 if (amdgpu_sriov_vf(adev) && 2807 !amdgpu_sriov_runtime(adev) && 2808 amdgpu_virt_mmio_blocked(adev) && 2809 !amdgpu_virt_wait_reset(adev)) { 2810 dev_err(adev->dev, "VF exclusive mode timeout\n"); 2811 /* Don't send request since VF is inactive. */ 2812 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 2813 adev->virt.ops = NULL; 2814 r = -EAGAIN; 2815 goto failed; 2816 } 2817 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 2818 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 2819 if (amdgpu_virt_request_full_gpu(adev, false)) 2820 amdgpu_virt_release_full_gpu(adev, false); 2821 goto failed; 2822 } 2823 2824 adev->accel_working = true; 2825 2826 amdgpu_vm_check_compute_bug(adev); 2827 2828 /* Initialize the buffer migration limit. */ 2829 if (amdgpu_moverate >= 0) 2830 max_MBps = amdgpu_moverate; 2831 else 2832 max_MBps = 8; /* Allow 8 MB/s. */ 2833 /* Get a log2 for easy divisions. 
*/ 2834 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 2835 2836 amdgpu_fbdev_init(adev); 2837 2838 if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev)) 2839 amdgpu_pm_virt_sysfs_init(adev); 2840 2841 r = amdgpu_pm_sysfs_init(adev); 2842 if (r) 2843 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 2844 2845 r = amdgpu_ucode_sysfs_init(adev); 2846 if (r) 2847 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 2848 2849 r = amdgpu_debugfs_gem_init(adev); 2850 if (r) 2851 DRM_ERROR("registering gem debugfs failed (%d).\n", r); 2852 2853 r = amdgpu_debugfs_regs_init(adev); 2854 if (r) 2855 DRM_ERROR("registering register debugfs failed (%d).\n", r); 2856 2857 r = amdgpu_debugfs_firmware_init(adev); 2858 if (r) 2859 DRM_ERROR("registering firmware debugfs failed (%d).\n", r); 2860 2861 r = amdgpu_debugfs_init(adev); 2862 if (r) 2863 DRM_ERROR("Creating debugfs files failed (%d).\n", r); 2864 2865 if ((amdgpu_testing & 1)) { 2866 if (adev->accel_working) 2867 amdgpu_test_moves(adev); 2868 else 2869 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 2870 } 2871 if (amdgpu_benchmarking) { 2872 if (adev->accel_working) 2873 amdgpu_benchmark(adev, amdgpu_benchmarking); 2874 else 2875 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 2876 } 2877 2878 /* enable clockgating, etc. after ib tests, etc. since some blocks require 2879 * explicit gating rather than handling it automatically. 2880 */ 2881 r = amdgpu_device_ip_late_init(adev); 2882 if (r) { 2883 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 2884 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 2885 goto failed; 2886 } 2887 2888 /* must succeed. */ 2889 amdgpu_ras_resume(adev); 2890 2891 queue_delayed_work(system_wq, &adev->delayed_init_work, 2892 msecs_to_jiffies(AMDGPU_RESUME_MS)); 2893 2894 r = device_create_file(adev->dev, &dev_attr_pcie_replay_count); 2895 if (r) { 2896 dev_err(adev->dev, "Could not create pcie_replay_count"); 2897 return r; 2898 } 2899 2900 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 2901 r = amdgpu_pmu_init(adev); 2902 if (r) 2903 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 2904 2905 return 0; 2906 2907 failed: 2908 amdgpu_vf_error_trans_all(adev); 2909 if (runtime) 2910 vga_switcheroo_fini_domain_pm_ops(adev->dev); 2911 2912 return r; 2913 } 2914 2915 /** 2916 * amdgpu_device_fini - tear down the driver 2917 * 2918 * @adev: amdgpu_device pointer 2919 * 2920 * Tear down the driver info (all asics). 2921 * Called at driver shutdown. 
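 * Reverses amdgpu_device_init(): IP blocks are torn down, sysfs and debugfs entries are removed, and the MMIO and doorbell mappings are released.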
2922 */ 2923 void amdgpu_device_fini(struct amdgpu_device *adev) 2924 { 2925 int r; 2926 2927 DRM_INFO("amdgpu: finishing device.\n"); 2928 adev->shutdown = true; 2929 /* disable all interrupts */ 2930 amdgpu_irq_disable_all(adev); 2931 if (adev->mode_info.mode_config_initialized){ 2932 if (!amdgpu_device_has_dc_support(adev)) 2933 drm_helper_force_disable_all(adev->ddev); 2934 else 2935 drm_atomic_helper_shutdown(adev->ddev); 2936 } 2937 amdgpu_fence_driver_fini(adev); 2938 amdgpu_pm_sysfs_fini(adev); 2939 amdgpu_fbdev_fini(adev); 2940 r = amdgpu_device_ip_fini(adev); 2941 if (adev->firmware.gpu_info_fw) { 2942 release_firmware(adev->firmware.gpu_info_fw); 2943 adev->firmware.gpu_info_fw = NULL; 2944 } 2945 adev->accel_working = false; 2946 cancel_delayed_work_sync(&adev->delayed_init_work); 2947 /* free i2c buses */ 2948 if (!amdgpu_device_has_dc_support(adev)) 2949 amdgpu_i2c_fini(adev); 2950 2951 if (amdgpu_emu_mode != 1) 2952 amdgpu_atombios_fini(adev); 2953 2954 kfree(adev->bios); 2955 adev->bios = NULL; 2956 if (!pci_is_thunderbolt_attached(adev->pdev)) 2957 vga_switcheroo_unregister_client(adev->pdev); 2958 if (adev->flags & AMD_IS_PX) 2959 vga_switcheroo_fini_domain_pm_ops(adev->dev); 2960 vga_client_register(adev->pdev, NULL, NULL, NULL); 2961 if (adev->rio_mem) 2962 pci_iounmap(adev->pdev, adev->rio_mem); 2963 adev->rio_mem = NULL; 2964 iounmap(adev->rmmio); 2965 adev->rmmio = NULL; 2966 amdgpu_device_doorbell_fini(adev); 2967 if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev)) 2968 amdgpu_pm_virt_sysfs_fini(adev); 2969 2970 amdgpu_debugfs_regs_cleanup(adev); 2971 device_remove_file(adev->dev, &dev_attr_pcie_replay_count); 2972 amdgpu_ucode_sysfs_fini(adev); 2973 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 2974 amdgpu_pmu_fini(adev); 2975 amdgpu_debugfs_preempt_cleanup(adev); 2976 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) 2977 amdgpu_discovery_fini(adev); 2978 } 2979 2980 2981 /* 2982 * Suspend & resume. 2983 */ 2984 /** 2985 * amdgpu_device_suspend - initiate device suspend 2986 * 2987 * @dev: drm dev pointer 2988 * @suspend: suspend state 2989 * @fbcon : notify the fbdev of suspend 2990 * 2991 * Puts the hw in the suspend state (all asics). 2992 * Returns 0 for success or an error on failure. 2993 * Called at driver suspend. 
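 * Display buffers and cursors are unpinned first, then VRAM is evicted and the IP blocks are suspended in two phases.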
2994 */ 2995 int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) 2996 { 2997 struct amdgpu_device *adev; 2998 struct drm_crtc *crtc; 2999 struct drm_connector *connector; 3000 int r; 3001 3002 if (dev == NULL || dev->dev_private == NULL) { 3003 return -ENODEV; 3004 } 3005 3006 adev = dev->dev_private; 3007 3008 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3009 return 0; 3010 3011 adev->in_suspend = true; 3012 drm_kms_helper_poll_disable(dev); 3013 3014 if (fbcon) 3015 amdgpu_fbdev_set_suspend(adev, 1); 3016 3017 cancel_delayed_work_sync(&adev->delayed_init_work); 3018 3019 if (!amdgpu_device_has_dc_support(adev)) { 3020 /* turn off display hw */ 3021 drm_modeset_lock_all(dev); 3022 list_for_each_entry(connector, &dev->mode_config.connector_list, head) { 3023 drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); 3024 } 3025 drm_modeset_unlock_all(dev); 3026 /* unpin the front buffers and cursors */ 3027 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3028 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3029 struct drm_framebuffer *fb = crtc->primary->fb; 3030 struct amdgpu_bo *robj; 3031 3032 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3033 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3034 r = amdgpu_bo_reserve(aobj, true); 3035 if (r == 0) { 3036 amdgpu_bo_unpin(aobj); 3037 amdgpu_bo_unreserve(aobj); 3038 } 3039 } 3040 3041 if (fb == NULL || fb->obj[0] == NULL) { 3042 continue; 3043 } 3044 robj = gem_to_amdgpu_bo(fb->obj[0]); 3045 /* don't unpin kernel fb objects */ 3046 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) { 3047 r = amdgpu_bo_reserve(robj, true); 3048 if (r == 0) { 3049 amdgpu_bo_unpin(robj); 3050 amdgpu_bo_unreserve(robj); 3051 } 3052 } 3053 } 3054 } 3055 3056 amdgpu_amdkfd_suspend(adev); 3057 3058 amdgpu_ras_suspend(adev); 3059 3060 r = amdgpu_device_ip_suspend_phase1(adev); 3061 3062 /* evict vram memory */ 3063 amdgpu_bo_evict_vram(adev); 3064 3065 amdgpu_fence_driver_suspend(adev); 3066 3067 r = amdgpu_device_ip_suspend_phase2(adev); 3068 3069 /* evict remaining vram memory 3070 * This second call to evict vram is to evict the gart page table 3071 * using the CPU. 3072 */ 3073 amdgpu_bo_evict_vram(adev); 3074 3075 pci_save_state(dev->pdev); 3076 if (suspend) { 3077 /* Shut down the device */ 3078 pci_disable_device(dev->pdev); 3079 pci_set_power_state(dev->pdev, PCI_D3hot); 3080 } else { 3081 r = amdgpu_asic_reset(adev); 3082 if (r) 3083 DRM_ERROR("amdgpu asic reset failed\n"); 3084 } 3085 3086 return 0; 3087 } 3088 3089 /** 3090 * amdgpu_device_resume - initiate device resume 3091 * 3092 * @dev: drm dev pointer 3093 * @resume: resume state 3094 * @fbcon : notify the fbdev of resume 3095 * 3096 * Bring the hw back to operating state (all asics). 3097 * Returns 0 for success or an error on failure. 3098 * Called at driver resume. 
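 * The asic is re-posted if necessary, IP blocks are resumed, cursors are re-pinned and the display state is restored.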
3099 */ 3100 int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) 3101 { 3102 struct drm_connector *connector; 3103 struct amdgpu_device *adev = dev->dev_private; 3104 struct drm_crtc *crtc; 3105 int r = 0; 3106 3107 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 3108 return 0; 3109 3110 if (resume) { 3111 pci_set_power_state(dev->pdev, PCI_D0); 3112 pci_restore_state(dev->pdev); 3113 r = pci_enable_device(dev->pdev); 3114 if (r) 3115 return r; 3116 } 3117 3118 /* post card */ 3119 if (amdgpu_device_need_post(adev)) { 3120 r = amdgpu_atom_asic_init(adev->mode_info.atom_context); 3121 if (r) 3122 DRM_ERROR("amdgpu asic init failed\n"); 3123 } 3124 3125 r = amdgpu_device_ip_resume(adev); 3126 if (r) { 3127 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); 3128 return r; 3129 } 3130 amdgpu_fence_driver_resume(adev); 3131 3132 3133 r = amdgpu_device_ip_late_init(adev); 3134 if (r) 3135 return r; 3136 3137 queue_delayed_work(system_wq, &adev->delayed_init_work, 3138 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3139 3140 if (!amdgpu_device_has_dc_support(adev)) { 3141 /* pin cursors */ 3142 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { 3143 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc); 3144 3145 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) { 3146 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo); 3147 r = amdgpu_bo_reserve(aobj, true); 3148 if (r == 0) { 3149 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); 3150 if (r != 0) 3151 DRM_ERROR("Failed to pin cursor BO (%d)\n", r); 3152 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); 3153 amdgpu_bo_unreserve(aobj); 3154 } 3155 } 3156 } 3157 } 3158 r = amdgpu_amdkfd_resume(adev); 3159 if (r) 3160 return r; 3161 3162 /* Make sure IB tests flushed */ 3163 flush_delayed_work(&adev->delayed_init_work); 3164 3165 /* blat the mode back in */ 3166 if (fbcon) { 3167 if (!amdgpu_device_has_dc_support(adev)) { 3168 /* pre DCE11 */ 3169 drm_helper_resume_force_mode(dev); 3170 3171 /* turn on display hw */ 3172 drm_modeset_lock_all(dev); 3173 list_for_each_entry(connector, &dev->mode_config.connector_list, head) { 3174 drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); 3175 } 3176 drm_modeset_unlock_all(dev); 3177 } 3178 amdgpu_fbdev_set_suspend(adev, 0); 3179 } 3180 3181 drm_kms_helper_poll_enable(dev); 3182 3183 amdgpu_ras_resume(adev); 3184 3185 /* 3186 * Most of the connector probing functions try to acquire runtime pm 3187 * refs to ensure that the GPU is powered on when connector polling is 3188 * performed. Since we're calling this from a runtime PM callback, 3189 * trying to acquire rpm refs will cause us to deadlock. 3190 * 3191 * Since we're guaranteed to be holding the rpm lock, it's safe to 3192 * temporarily disable the rpm helpers so this doesn't deadlock us. 3193 */ 3194 #ifdef CONFIG_PM 3195 dev->dev->power.disable_depth++; 3196 #endif 3197 if (!amdgpu_device_has_dc_support(adev)) 3198 drm_helper_hpd_irq_event(dev); 3199 else 3200 drm_kms_helper_hotplug_event(dev); 3201 #ifdef CONFIG_PM 3202 dev->dev->power.disable_depth--; 3203 #endif 3204 adev->in_suspend = false; 3205 3206 return 0; 3207 } 3208 3209 /** 3210 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 3211 * 3212 * @adev: amdgpu_device pointer 3213 * 3214 * The list of all the hardware IPs that make up the asic is walked and 3215 * the check_soft_reset callbacks are run. check_soft_reset determines 3216 * if the asic is still hung or not. 
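 * On SR-IOV, or when the asic requires a full reset, a hang is always reported.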
3217 * Returns true if any of the IPs are still in a hung state, false if not. 3218 */ 3219 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 3220 { 3221 int i; 3222 bool asic_hang = false; 3223 3224 if (amdgpu_sriov_vf(adev)) 3225 return true; 3226 3227 if (amdgpu_asic_need_full_reset(adev)) 3228 return true; 3229 3230 for (i = 0; i < adev->num_ip_blocks; i++) { 3231 if (!adev->ip_blocks[i].status.valid) 3232 continue; 3233 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 3234 adev->ip_blocks[i].status.hang = 3235 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 3236 if (adev->ip_blocks[i].status.hang) { 3237 DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 3238 asic_hang = true; 3239 } 3240 } 3241 return asic_hang; 3242 } 3243 3244 /** 3245 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 3246 * 3247 * @adev: amdgpu_device pointer 3248 * 3249 * The list of all the hardware IPs that make up the asic is walked and the 3250 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 3251 * handles any IP specific hardware or software state changes that are 3252 * necessary for a soft reset to succeed. 3253 * Returns 0 on success, negative error code on failure. 3254 */ 3255 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 3256 { 3257 int i, r = 0; 3258 3259 for (i = 0; i < adev->num_ip_blocks; i++) { 3260 if (!adev->ip_blocks[i].status.valid) 3261 continue; 3262 if (adev->ip_blocks[i].status.hang && 3263 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 3264 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 3265 if (r) 3266 return r; 3267 } 3268 } 3269 3270 return 0; 3271 } 3272 3273 /** 3274 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 3275 * 3276 * @adev: amdgpu_device pointer 3277 * 3278 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 3279 * reset is necessary to recover. 3280 * Returns true if a full asic reset is required, false if not. 3281 */ 3282 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 3283 { 3284 int i; 3285 3286 if (amdgpu_asic_need_full_reset(adev)) 3287 return true; 3288 3289 for (i = 0; i < adev->num_ip_blocks; i++) { 3290 if (!adev->ip_blocks[i].status.valid) 3291 continue; 3292 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 3293 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 3294 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 3295 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 3296 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3297 if (adev->ip_blocks[i].status.hang) { 3298 DRM_INFO("Some block need full reset!\n"); 3299 return true; 3300 } 3301 } 3302 } 3303 return false; 3304 } 3305 3306 /** 3307 * amdgpu_device_ip_soft_reset - do a soft reset 3308 * 3309 * @adev: amdgpu_device pointer 3310 * 3311 * The list of all the hardware IPs that make up the asic is walked and the 3312 * soft_reset callbacks are run if the block is hung. soft_reset handles any 3313 * IP specific hardware or software state changes that are necessary to soft 3314 * reset the IP. 3315 * Returns 0 on success, negative error code on failure. 
3316 */ 3317 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 3318 { 3319 int i, r = 0; 3320 3321 for (i = 0; i < adev->num_ip_blocks; i++) { 3322 if (!adev->ip_blocks[i].status.valid) 3323 continue; 3324 if (adev->ip_blocks[i].status.hang && 3325 adev->ip_blocks[i].version->funcs->soft_reset) { 3326 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 3327 if (r) 3328 return r; 3329 } 3330 } 3331 3332 return 0; 3333 } 3334 3335 /** 3336 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 3337 * 3338 * @adev: amdgpu_device pointer 3339 * 3340 * The list of all the hardware IPs that make up the asic is walked and the 3341 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 3342 * handles any IP specific hardware or software state changes that are 3343 * necessary after the IP has been soft reset. 3344 * Returns 0 on success, negative error code on failure. 3345 */ 3346 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 3347 { 3348 int i, r = 0; 3349 3350 for (i = 0; i < adev->num_ip_blocks; i++) { 3351 if (!adev->ip_blocks[i].status.valid) 3352 continue; 3353 if (adev->ip_blocks[i].status.hang && 3354 adev->ip_blocks[i].version->funcs->post_soft_reset) 3355 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 3356 if (r) 3357 return r; 3358 } 3359 3360 return 0; 3361 } 3362 3363 /** 3364 * amdgpu_device_recover_vram - Recover some VRAM contents 3365 * 3366 * @adev: amdgpu_device pointer 3367 * 3368 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 3369 * restore things like GPUVM page tables after a GPU reset where 3370 * the contents of VRAM might be lost. 3371 * 3372 * Returns: 3373 * 0 on success, negative error code on failure. 3374 */ 3375 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 3376 { 3377 struct dma_fence *fence = NULL, *next = NULL; 3378 struct amdgpu_bo *shadow; 3379 long r = 1, tmo; 3380 3381 if (amdgpu_sriov_runtime(adev)) 3382 tmo = msecs_to_jiffies(8000); 3383 else 3384 tmo = msecs_to_jiffies(100); 3385 3386 DRM_INFO("recover vram bo from shadow start\n"); 3387 mutex_lock(&adev->shadow_list_lock); 3388 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { 3389 3390 /* No need to recover an evicted BO */ 3391 if (shadow->tbo.mem.mem_type != TTM_PL_TT || 3392 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET || 3393 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM) 3394 continue; 3395 3396 r = amdgpu_bo_restore_shadow(shadow, &next); 3397 if (r) 3398 break; 3399 3400 if (fence) { 3401 tmo = dma_fence_wait_timeout(fence, false, tmo); 3402 dma_fence_put(fence); 3403 fence = next; 3404 if (tmo == 0) { 3405 r = -ETIMEDOUT; 3406 break; 3407 } else if (tmo < 0) { 3408 r = tmo; 3409 break; 3410 } 3411 } else { 3412 fence = next; 3413 } 3414 } 3415 mutex_unlock(&adev->shadow_list_lock); 3416 3417 if (fence) 3418 tmo = dma_fence_wait_timeout(fence, false, tmo); 3419 dma_fence_put(fence); 3420 3421 if (r < 0 || tmo <= 0) { 3422 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 3423 return -EIO; 3424 } 3425 3426 DRM_INFO("recover vram bo from shadow done\n"); 3427 return 0; 3428 } 3429 3430 3431 /** 3432 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 3433 * 3434 * @adev: amdgpu device pointer 3435 * @from_hypervisor: request from hypervisor 3436 * 3437 * do VF FLR and reinitialize Asic 3438 * return 0 means succeeded otherwise failed 3439 */ 3440 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 
3441 bool from_hypervisor) 3442 { 3443 int r; 3444 3445 if (from_hypervisor) 3446 r = amdgpu_virt_request_full_gpu(adev, true); 3447 else 3448 r = amdgpu_virt_reset_gpu(adev); 3449 if (r) 3450 return r; 3451 3452 amdgpu_amdkfd_pre_reset(adev); 3453 3454 /* Resume IP prior to SMC */ 3455 r = amdgpu_device_ip_reinit_early_sriov(adev); 3456 if (r) 3457 goto error; 3458 3459 /* we need recover gart prior to run SMC/CP/SDMA resume */ 3460 amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); 3461 3462 r = amdgpu_device_fw_loading(adev); 3463 if (r) 3464 return r; 3465 3466 /* now we are okay to resume SMC/CP/SDMA */ 3467 r = amdgpu_device_ip_reinit_late_sriov(adev); 3468 if (r) 3469 goto error; 3470 3471 amdgpu_irq_gpu_reset_resume_helper(adev); 3472 r = amdgpu_ib_ring_tests(adev); 3473 amdgpu_amdkfd_post_reset(adev); 3474 3475 error: 3476 amdgpu_virt_init_data_exchange(adev); 3477 amdgpu_virt_release_full_gpu(adev, true); 3478 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 3479 atomic_inc(&adev->vram_lost_counter); 3480 r = amdgpu_device_recover_vram(adev); 3481 } 3482 3483 return r; 3484 } 3485 3486 /** 3487 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 3488 * 3489 * @adev: amdgpu device pointer 3490 * 3491 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 3492 * a hung GPU. 3493 */ 3494 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 3495 { 3496 if (!amdgpu_device_ip_check_soft_reset(adev)) { 3497 DRM_INFO("Timeout, but no hardware hang detected.\n"); 3498 return false; 3499 } 3500 3501 if (amdgpu_gpu_recovery == 0) 3502 goto disabled; 3503 3504 if (amdgpu_sriov_vf(adev)) 3505 return true; 3506 3507 if (amdgpu_gpu_recovery == -1) { 3508 switch (adev->asic_type) { 3509 case CHIP_BONAIRE: 3510 case CHIP_HAWAII: 3511 case CHIP_TOPAZ: 3512 case CHIP_TONGA: 3513 case CHIP_FIJI: 3514 case CHIP_POLARIS10: 3515 case CHIP_POLARIS11: 3516 case CHIP_POLARIS12: 3517 case CHIP_VEGAM: 3518 case CHIP_VEGA20: 3519 case CHIP_VEGA10: 3520 case CHIP_VEGA12: 3521 break; 3522 default: 3523 goto disabled; 3524 } 3525 } 3526 3527 return true; 3528 3529 disabled: 3530 DRM_INFO("GPU recovery disabled.\n"); 3531 return false; 3532 } 3533 3534 3535 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 3536 struct amdgpu_job *job, 3537 bool *need_full_reset_arg) 3538 { 3539 int i, r = 0; 3540 bool need_full_reset = *need_full_reset_arg; 3541 3542 /* block all schedulers and reset given job's ring */ 3543 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3544 struct amdgpu_ring *ring = adev->rings[i]; 3545 3546 if (!ring || !ring->sched.thread) 3547 continue; 3548 3549 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 3550 amdgpu_fence_driver_force_completion(ring); 3551 } 3552 3553 if(job) 3554 drm_sched_increase_karma(&job->base); 3555 3556 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 3557 if (!amdgpu_sriov_vf(adev)) { 3558 3559 if (!need_full_reset) 3560 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 3561 3562 if (!need_full_reset) { 3563 amdgpu_device_ip_pre_soft_reset(adev); 3564 r = amdgpu_device_ip_soft_reset(adev); 3565 amdgpu_device_ip_post_soft_reset(adev); 3566 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 3567 DRM_INFO("soft reset failed, will fallback to full reset!\n"); 3568 need_full_reset = true; 3569 } 3570 } 3571 3572 if (need_full_reset) 3573 r = amdgpu_device_ip_suspend(adev); 3574 3575 *need_full_reset_arg = 
need_full_reset; 3576 } 3577 3578 return r; 3579 } 3580 3581 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 3582 struct list_head *device_list_handle, 3583 bool *need_full_reset_arg) 3584 { 3585 struct amdgpu_device *tmp_adev = NULL; 3586 bool need_full_reset = *need_full_reset_arg, vram_lost = false; 3587 int r = 0; 3588 3589 /* 3590 * ASIC reset has to be done on all HGMI hive nodes ASAP 3591 * to allow proper links negotiation in FW (within 1 sec) 3592 */ 3593 if (need_full_reset) { 3594 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3595 /* For XGMI run all resets in parallel to speed up the process */ 3596 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 3597 if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) 3598 r = -EALREADY; 3599 } else 3600 r = amdgpu_asic_reset(tmp_adev); 3601 3602 if (r) { 3603 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", 3604 r, tmp_adev->ddev->unique); 3605 break; 3606 } 3607 } 3608 3609 /* For XGMI wait for all PSP resets to complete before proceed */ 3610 if (!r) { 3611 list_for_each_entry(tmp_adev, device_list_handle, 3612 gmc.xgmi.head) { 3613 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 3614 flush_work(&tmp_adev->xgmi_reset_work); 3615 r = tmp_adev->asic_reset_res; 3616 if (r) 3617 break; 3618 } 3619 } 3620 3621 list_for_each_entry(tmp_adev, device_list_handle, 3622 gmc.xgmi.head) { 3623 amdgpu_ras_reserve_bad_pages(tmp_adev); 3624 } 3625 } 3626 } 3627 3628 3629 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3630 if (need_full_reset) { 3631 /* post card */ 3632 if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) 3633 DRM_WARN("asic atom init failed!"); 3634 3635 if (!r) { 3636 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 3637 r = amdgpu_device_ip_resume_phase1(tmp_adev); 3638 if (r) 3639 goto out; 3640 3641 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 3642 if (vram_lost) { 3643 DRM_INFO("VRAM is lost due to GPU reset!\n"); 3644 atomic_inc(&tmp_adev->vram_lost_counter); 3645 } 3646 3647 r = amdgpu_gtt_mgr_recover( 3648 &tmp_adev->mman.bdev.man[TTM_PL_TT]); 3649 if (r) 3650 goto out; 3651 3652 r = amdgpu_device_fw_loading(tmp_adev); 3653 if (r) 3654 return r; 3655 3656 r = amdgpu_device_ip_resume_phase2(tmp_adev); 3657 if (r) 3658 goto out; 3659 3660 if (vram_lost) 3661 amdgpu_device_fill_reset_magic(tmp_adev); 3662 3663 /* 3664 * Add this ASIC as tracked as reset was already 3665 * complete successfully. 3666 */ 3667 amdgpu_register_gpu_instance(tmp_adev); 3668 3669 r = amdgpu_device_ip_late_init(tmp_adev); 3670 if (r) 3671 goto out; 3672 3673 /* must succeed. 
*/ 3674 amdgpu_ras_resume(tmp_adev); 3675 3676 /* Update PSP FW topology after reset */ 3677 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 3678 r = amdgpu_xgmi_update_topology(hive, tmp_adev); 3679 } 3680 } 3681 3682 3683 out: 3684 if (!r) { 3685 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 3686 r = amdgpu_ib_ring_tests(tmp_adev); 3687 if (r) { 3688 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 3689 r = amdgpu_device_ip_suspend(tmp_adev); 3690 need_full_reset = true; 3691 r = -EAGAIN; 3692 goto end; 3693 } 3694 } 3695 3696 if (!r) 3697 r = amdgpu_device_recover_vram(tmp_adev); 3698 else 3699 tmp_adev->asic_reset_res = r; 3700 } 3701 3702 end: 3703 *need_full_reset_arg = need_full_reset; 3704 return r; 3705 } 3706 3707 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) 3708 { 3709 if (trylock) { 3710 if (!mutex_trylock(&adev->lock_reset)) 3711 return false; 3712 } else 3713 mutex_lock(&adev->lock_reset); 3714 3715 atomic_inc(&adev->gpu_reset_counter); 3716 adev->in_gpu_reset = 1; 3717 switch (amdgpu_asic_reset_method(adev)) { 3718 case AMD_RESET_METHOD_MODE1: 3719 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 3720 break; 3721 case AMD_RESET_METHOD_MODE2: 3722 adev->mp1_state = PP_MP1_STATE_RESET; 3723 break; 3724 default: 3725 adev->mp1_state = PP_MP1_STATE_NONE; 3726 break; 3727 } 3728 /* Block kfd: SRIOV would do it separately */ 3729 if (!amdgpu_sriov_vf(adev)) 3730 amdgpu_amdkfd_pre_reset(adev); 3731 3732 return true; 3733 } 3734 3735 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 3736 { 3737 /*unlock kfd: SRIOV would do it separately */ 3738 if (!amdgpu_sriov_vf(adev)) 3739 amdgpu_amdkfd_post_reset(adev); 3740 amdgpu_vf_error_trans_all(adev); 3741 adev->mp1_state = PP_MP1_STATE_NONE; 3742 adev->in_gpu_reset = 0; 3743 mutex_unlock(&adev->lock_reset); 3744 } 3745 3746 3747 /** 3748 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 3749 * 3750 * @adev: amdgpu device pointer 3751 * @job: which job trigger hang 3752 * 3753 * Attempt to reset the GPU if it has hung (all asics). 3754 * Attempt to do soft-reset or full-reset and reinitialize Asic 3755 * Returns 0 for success or an error on failure. 3756 */ 3757 3758 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 3759 struct amdgpu_job *job) 3760 { 3761 struct list_head device_list, *device_list_handle = NULL; 3762 bool need_full_reset, job_signaled; 3763 struct amdgpu_hive_info *hive = NULL; 3764 struct amdgpu_device *tmp_adev = NULL; 3765 int i, r = 0; 3766 3767 need_full_reset = job_signaled = false; 3768 INIT_LIST_HEAD(&device_list); 3769 3770 dev_info(adev->dev, "GPU reset begin!\n"); 3771 3772 cancel_delayed_work_sync(&adev->delayed_init_work); 3773 3774 hive = amdgpu_get_xgmi_hive(adev, false); 3775 3776 /* 3777 * Here we trylock to avoid chain of resets executing from 3778 * either trigger by jobs on different adevs in XGMI hive or jobs on 3779 * different schedulers for same device while this TO handler is running. 3780 * We always reset all schedulers for device and all devices for XGMI 3781 * hive so that should take care of them too. 
3782 */ 3783 3784 if (hive && !mutex_trylock(&hive->reset_lock)) { 3785 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 3786 job->base.id, hive->hive_id); 3787 return 0; 3788 } 3789 3790 /* Start with adev pre asic reset first for soft reset check.*/ 3791 if (!amdgpu_device_lock_adev(adev, !hive)) { 3792 DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", 3793 job->base.id); 3794 return 0; 3795 } 3796 3797 /* Build list of devices to reset */ 3798 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3799 if (!hive) { 3800 amdgpu_device_unlock_adev(adev); 3801 return -ENODEV; 3802 } 3803 3804 /* 3805 * In case we are in XGMI hive mode device reset is done for all the 3806 * nodes in the hive to retrain all XGMI links and hence the reset 3807 * sequence is executed in loop on all nodes. 3808 */ 3809 device_list_handle = &hive->device_list; 3810 } else { 3811 list_add_tail(&adev->gmc.xgmi.head, &device_list); 3812 device_list_handle = &device_list; 3813 } 3814 3815 /* 3816 * Mark these ASICs to be reseted as untracked first 3817 * And add them back after reset completed 3818 */ 3819 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) 3820 amdgpu_unregister_gpu_instance(tmp_adev); 3821 3822 /* block all schedulers and reset given job's ring */ 3823 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3824 /* disable ras on ALL IPs */ 3825 if (amdgpu_device_ip_need_full_reset(tmp_adev)) 3826 amdgpu_ras_suspend(tmp_adev); 3827 3828 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3829 struct amdgpu_ring *ring = tmp_adev->rings[i]; 3830 3831 if (!ring || !ring->sched.thread) 3832 continue; 3833 3834 drm_sched_stop(&ring->sched, &job->base); 3835 } 3836 } 3837 3838 3839 /* 3840 * Must check guilty signal here since after this point all old 3841 * HW fences are force signaled. 3842 * 3843 * job->base holds a reference to parent fence 3844 */ 3845 if (job && job->base.s_fence->parent && 3846 dma_fence_is_signaled(job->base.s_fence->parent)) 3847 job_signaled = true; 3848 3849 if (!amdgpu_device_ip_need_full_reset(adev)) 3850 device_list_handle = &device_list; 3851 3852 if (job_signaled) { 3853 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 3854 goto skip_hw_reset; 3855 } 3856 3857 3858 /* Guilty job will be freed after this*/ 3859 r = amdgpu_device_pre_asic_reset(adev, 3860 job, 3861 &need_full_reset); 3862 if (r) { 3863 /*TODO Should we stop ?*/ 3864 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", 3865 r, adev->ddev->unique); 3866 adev->asic_reset_res = r; 3867 } 3868 3869 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 3870 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { 3871 3872 if (tmp_adev == adev) 3873 continue; 3874 3875 amdgpu_device_lock_adev(tmp_adev, false); 3876 r = amdgpu_device_pre_asic_reset(tmp_adev, 3877 NULL, 3878 &need_full_reset); 3879 /*TODO Should we stop ?*/ 3880 if (r) { 3881 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", 3882 r, tmp_adev->ddev->unique); 3883 tmp_adev->asic_reset_res = r; 3884 } 3885 } 3886 3887 /* Actual ASIC resets if needed.*/ 3888 /* TODO Implement XGMI hive reset logic for SRIOV */ 3889 if (amdgpu_sriov_vf(adev)) { 3890 r = amdgpu_device_reset_sriov(adev, job ? 

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!ring || !ring->sched.thread)
				continue;

			/* No point in resubmitting jobs if we didn't HW reset */
			if (!tmp_adev->asic_reset_res && !job_signaled)
				drm_sched_resubmit_jobs(&ring->sched);

			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
		}

		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
			drm_helper_resume_force_mode(tmp_adev->ddev);
		}

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how to tell it to userspace ? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
				 atomic_read(&adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
				 atomic_read(&adev->gpu_reset_counter));
		}

		amdgpu_device_unlock_adev(tmp_adev);
	}

	if (hive)
		mutex_unlock(&hive->reset_lock);

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
	return r;
}
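
/*
 * Usage sketch (illustrative only, not part of this file): GPU recovery is
 * normally kicked off from a scheduler job-timeout handler, roughly along
 * the lines below. The handler name and the surrounding checks are
 * simplified assumptions; the actual entry point lives in the job/timeout
 * code, not here.
 *
 *	static void example_job_timedout(struct drm_sched_job *s_job)
 *	{
 *		struct amdgpu_job *job = to_amdgpu_job(s_job);
 *		struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
 *
 *		if (amdgpu_device_should_recover_gpu(ring->adev))
 *			amdgpu_device_gpu_recover(ring->adev, job);
 *	}
 */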

/**
 * amdgpu_device_get_pcie_info - fetch PCIE info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	pcie_bandwidth_available(adev->pdev, NULL,
				 &platform_speed_cap, &platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
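
/*
 * Illustrative sketch, not part of the upstream driver: shows how the
 * pcie_gen_mask computed above can be reduced to the highest supported PCIE
 * generation. The helper name is an assumption made for this example; it
 * only reads the CAIL_PCIE_LINK_SPEED_SUPPORT_GEN* bits populated by
 * amdgpu_device_get_pcie_info().
 */
static unsigned int __maybe_unused
amdgpu_device_example_max_pcie_gen(struct amdgpu_device *adev)
{
	/* Check the platform speed-support bits, highest generation first. */
	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
		return 4;
	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
		return 3;
	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2)
		return 2;
	return 1;
}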