1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 #include <linux/nospec.h> 40 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_client_event.h> 43 #include <drm/drm_crtc_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/device.h> 47 #include <linux/vgaarb.h> 48 #include <linux/vga_switcheroo.h> 49 #include <linux/efi.h> 50 #include "amdgpu.h" 51 #include "amdgpu_trace.h" 52 #include "amdgpu_i2c.h" 53 #include "atom.h" 54 #include "amdgpu_atombios.h" 55 #include "amdgpu_atomfirmware.h" 56 #include "amd_pcie.h" 57 #ifdef CONFIG_DRM_AMDGPU_SI 58 #include "si.h" 59 #endif 60 #ifdef CONFIG_DRM_AMDGPU_CIK 61 #include "cik.h" 62 #endif 63 #include "vi.h" 64 #include "soc15.h" 65 #include "nv.h" 66 #include "bif/bif_4_1_d.h" 67 #include <linux/firmware.h> 68 #include "amdgpu_vf_error.h" 69 70 #include "amdgpu_amdkfd.h" 71 #include "amdgpu_pm.h" 72 73 #include "amdgpu_xgmi.h" 74 #include "amdgpu_ras.h" 75 #include "amdgpu_ras_mgr.h" 76 #include "amdgpu_pmu.h" 77 #include "amdgpu_fru_eeprom.h" 78 #include "amdgpu_reset.h" 79 #include "amdgpu_virt.h" 80 #include "amdgpu_dev_coredump.h" 81 82 #include <linux/suspend.h> 83 #include <drm/task_barrier.h> 84 #include <linux/pm_runtime.h> 85 86 #include <drm/drm_drv.h> 87 88 #if IS_ENABLED(CONFIG_X86) 89 #include <asm/intel-family.h> 90 #include <asm/cpu_device_id.h> 91 #endif 92 93 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 98 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 99 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 100 MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin"); 101 102 #define AMDGPU_RESUME_MS 2000 103 #define AMDGPU_MAX_RETRY_LIMIT 2 104 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 105 #define 
AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 106 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 107 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 108 109 #define AMDGPU_VBIOS_SKIP (1U << 0) 110 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 111 112 static const struct drm_driver amdgpu_kms_driver; 113 114 const char *amdgpu_asic_name[] = { 115 "TAHITI", 116 "PITCAIRN", 117 "VERDE", 118 "OLAND", 119 "HAINAN", 120 "BONAIRE", 121 "KAVERI", 122 "KABINI", 123 "HAWAII", 124 "MULLINS", 125 "TOPAZ", 126 "TONGA", 127 "FIJI", 128 "CARRIZO", 129 "STONEY", 130 "POLARIS10", 131 "POLARIS11", 132 "POLARIS12", 133 "VEGAM", 134 "VEGA10", 135 "VEGA12", 136 "VEGA20", 137 "RAVEN", 138 "ARCTURUS", 139 "RENOIR", 140 "ALDEBARAN", 141 "NAVI10", 142 "CYAN_SKILLFISH", 143 "NAVI14", 144 "NAVI12", 145 "SIENNA_CICHLID", 146 "NAVY_FLOUNDER", 147 "VANGOGH", 148 "DIMGREY_CAVEFISH", 149 "BEIGE_GOBY", 150 "YELLOW_CARP", 151 "IP DISCOVERY", 152 "LAST", 153 }; 154 155 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 156 /* 157 * Default init level where all blocks are expected to be initialized. This is 158 * the level of initialization expected by default and also after a full reset 159 * of the device. 160 */ 161 struct amdgpu_init_level amdgpu_init_default = { 162 .level = AMDGPU_INIT_LEVEL_DEFAULT, 163 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 164 }; 165 166 struct amdgpu_init_level amdgpu_init_recovery = { 167 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 168 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 169 }; 170 171 /* 172 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 173 * is used for cases like reset on initialization where the entire hive needs to 174 * be reset before first use. 175 */ 176 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 177 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 178 .hwini_ip_block_mask = 179 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 180 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 181 BIT(AMD_IP_BLOCK_TYPE_PSP) 182 }; 183 184 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev); 185 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev); 186 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev); 187 188 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); 189 190 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 191 enum amd_ip_block_type block) 192 { 193 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 194 } 195 196 void amdgpu_set_init_level(struct amdgpu_device *adev, 197 enum amdgpu_init_lvl_id lvl) 198 { 199 switch (lvl) { 200 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 201 adev->init_lvl = &amdgpu_init_minimal_xgmi; 202 break; 203 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 204 adev->init_lvl = &amdgpu_init_recovery; 205 break; 206 case AMDGPU_INIT_LEVEL_DEFAULT: 207 fallthrough; 208 default: 209 adev->init_lvl = &amdgpu_init_default; 210 break; 211 } 212 } 213 214 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 215 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 216 void *data); 217 218 /** 219 * DOC: pcie_replay_count 220 * 221 * The amdgpu driver provides a sysfs API for reporting the total number 222 * of PCIe replays (NAKs). 223 * The file pcie_replay_count is used for this and returns the total 224 * number of replays as a sum of the NAKs generated and NAKs received. 
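 *
 * Reading the value from user space is a plain sysfs read, for example
 * (the path below is illustrative; the attribute lives in the device's
 * sysfs directory, which may be reached through a different path)::
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count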
225 */ 226 227 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 228 struct device_attribute *attr, char *buf) 229 { 230 struct drm_device *ddev = dev_get_drvdata(dev); 231 struct amdgpu_device *adev = drm_to_adev(ddev); 232 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 233 234 return sysfs_emit(buf, "%llu\n", cnt); 235 } 236 237 static DEVICE_ATTR(pcie_replay_count, 0444, 238 amdgpu_device_get_pcie_replay_count, NULL); 239 240 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 241 { 242 int ret = 0; 243 244 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 245 ret = sysfs_create_file(&adev->dev->kobj, 246 &dev_attr_pcie_replay_count.attr); 247 248 return ret; 249 } 250 251 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 252 { 253 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 254 sysfs_remove_file(&adev->dev->kobj, 255 &dev_attr_pcie_replay_count.attr); 256 } 257 258 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 259 const struct bin_attribute *attr, char *buf, 260 loff_t ppos, size_t count) 261 { 262 struct device *dev = kobj_to_dev(kobj); 263 struct drm_device *ddev = dev_get_drvdata(dev); 264 struct amdgpu_device *adev = drm_to_adev(ddev); 265 ssize_t bytes_read; 266 267 switch (ppos) { 268 case AMDGPU_SYS_REG_STATE_XGMI: 269 bytes_read = amdgpu_asic_get_reg_state( 270 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 271 break; 272 case AMDGPU_SYS_REG_STATE_WAFL: 273 bytes_read = amdgpu_asic_get_reg_state( 274 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 275 break; 276 case AMDGPU_SYS_REG_STATE_PCIE: 277 bytes_read = amdgpu_asic_get_reg_state( 278 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 279 break; 280 case AMDGPU_SYS_REG_STATE_USR: 281 bytes_read = amdgpu_asic_get_reg_state( 282 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 283 break; 284 case AMDGPU_SYS_REG_STATE_USR_1: 285 bytes_read = amdgpu_asic_get_reg_state( 286 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 287 break; 288 default: 289 return -EINVAL; 290 } 291 292 return bytes_read; 293 } 294 295 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 296 AMDGPU_SYS_REG_STATE_END); 297 298 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 299 { 300 int ret; 301 302 if (!amdgpu_asic_get_reg_state_supported(adev)) 303 return 0; 304 305 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 306 307 return ret; 308 } 309 310 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 311 { 312 if (!amdgpu_asic_get_reg_state_supported(adev)) 313 return; 314 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 315 } 316 317 /** 318 * DOC: board_info 319 * 320 * The amdgpu driver provides a sysfs API for giving board related information. 
321 * It provides the form factor information in the format 322 * 323 * type : form factor 324 * 325 * Possible form factor values 326 * 327 * - "cem" - PCIE CEM card 328 * - "oam" - Open Compute Accelerator Module 329 * - "unknown" - Not known 330 * 331 */ 332 333 static ssize_t amdgpu_device_get_board_info(struct device *dev, 334 struct device_attribute *attr, 335 char *buf) 336 { 337 struct drm_device *ddev = dev_get_drvdata(dev); 338 struct amdgpu_device *adev = drm_to_adev(ddev); 339 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 340 const char *pkg; 341 342 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 343 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 344 345 switch (pkg_type) { 346 case AMDGPU_PKG_TYPE_CEM: 347 pkg = "cem"; 348 break; 349 case AMDGPU_PKG_TYPE_OAM: 350 pkg = "oam"; 351 break; 352 default: 353 pkg = "unknown"; 354 break; 355 } 356 357 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 358 } 359 360 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 361 362 static struct attribute *amdgpu_board_attrs[] = { 363 &dev_attr_board_info.attr, 364 NULL, 365 }; 366 367 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 368 struct attribute *attr, int n) 369 { 370 struct device *dev = kobj_to_dev(kobj); 371 struct drm_device *ddev = dev_get_drvdata(dev); 372 struct amdgpu_device *adev = drm_to_adev(ddev); 373 374 if (adev->flags & AMD_IS_APU) 375 return 0; 376 377 return attr->mode; 378 } 379 380 static const struct attribute_group amdgpu_board_attrs_group = { 381 .attrs = amdgpu_board_attrs, 382 .is_visible = amdgpu_board_attrs_is_visible 383 }; 384 385 /** 386 * DOC: uma/carveout_options 387 * 388 * This is a read-only file that lists all available UMA allocation 389 * options and their corresponding indices. Example output:: 390 * 391 * $ cat uma/carveout_options 392 * 0: Minimum (512 MB) 393 * 1: (1 GB) 394 * 2: (2 GB) 395 * 3: (4 GB) 396 * 4: (6 GB) 397 * 5: (8 GB) 398 * 6: (12 GB) 399 * 7: Medium (16 GB) 400 * 8: (24 GB) 401 * 9: High (32 GB) 402 */ 403 static ssize_t carveout_options_show(struct device *dev, 404 struct device_attribute *attr, 405 char *buf) 406 { 407 struct drm_device *ddev = dev_get_drvdata(dev); 408 struct amdgpu_device *adev = drm_to_adev(ddev); 409 struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info; 410 uint32_t memory_carved; 411 ssize_t size = 0; 412 413 if (!uma_info || !uma_info->num_entries) 414 return -ENODEV; 415 416 for (int i = 0; i < uma_info->num_entries; i++) { 417 memory_carved = uma_info->entries[i].memory_carved_mb; 418 if (memory_carved >= SZ_1G/SZ_1M) { 419 size += sysfs_emit_at(buf, size, "%d: %s (%u GB)\n", 420 i, 421 uma_info->entries[i].name, 422 memory_carved >> 10); 423 } else { 424 size += sysfs_emit_at(buf, size, "%d: %s (%u MB)\n", 425 i, 426 uma_info->entries[i].name, 427 memory_carved); 428 } 429 } 430 431 return size; 432 } 433 static DEVICE_ATTR_RO(carveout_options); 434 435 /** 436 * DOC: uma/carveout 437 * 438 * This file is both readable and writable. When read, it shows the 439 * index of the current setting. Writing a valid index to this file 440 * allows users to change the UMA carveout size to the selected option 441 * on the next boot. 442 * 443 * The available options and their corresponding indices can be read 444 * from the uma/carveout_options file. 
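 *
 * For example, selecting the option at index 7 from the list shown by
 * uma/carveout_options could be done with a plain sysfs write (the path
 * below is illustrative)::
 *
 *   $ echo 7 > /sys/class/drm/card0/device/uma/carveout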
445 */ 446 static ssize_t carveout_show(struct device *dev, 447 struct device_attribute *attr, 448 char *buf) 449 { 450 struct drm_device *ddev = dev_get_drvdata(dev); 451 struct amdgpu_device *adev = drm_to_adev(ddev); 452 453 return sysfs_emit(buf, "%u\n", adev->uma_info.uma_option_index); 454 } 455 456 static ssize_t carveout_store(struct device *dev, 457 struct device_attribute *attr, 458 const char *buf, size_t count) 459 { 460 struct drm_device *ddev = dev_get_drvdata(dev); 461 struct amdgpu_device *adev = drm_to_adev(ddev); 462 struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info; 463 struct amdgpu_uma_carveout_option *opt; 464 unsigned long val; 465 uint8_t flags; 466 int r; 467 468 r = kstrtoul(buf, 10, &val); 469 if (r) 470 return r; 471 472 if (val >= uma_info->num_entries) 473 return -EINVAL; 474 475 val = array_index_nospec(val, uma_info->num_entries); 476 opt = &uma_info->entries[val]; 477 478 if (!(opt->flags & AMDGPU_UMA_FLAG_AUTO) && 479 !(opt->flags & AMDGPU_UMA_FLAG_CUSTOM)) { 480 drm_err_once(ddev, "Option %lu not supported due to lack of Custom/Auto flag", val); 481 return -EINVAL; 482 } 483 484 flags = opt->flags; 485 flags &= ~((flags & AMDGPU_UMA_FLAG_AUTO) >> 1); 486 487 guard(mutex)(&uma_info->update_lock); 488 489 r = amdgpu_acpi_set_uma_allocation_size(adev, val, flags); 490 if (r) 491 return r; 492 493 uma_info->uma_option_index = val; 494 495 return count; 496 } 497 static DEVICE_ATTR_RW(carveout); 498 499 static struct attribute *amdgpu_uma_attrs[] = { 500 &dev_attr_carveout.attr, 501 &dev_attr_carveout_options.attr, 502 NULL 503 }; 504 505 const struct attribute_group amdgpu_uma_attr_group = { 506 .name = "uma", 507 .attrs = amdgpu_uma_attrs 508 }; 509 510 static void amdgpu_uma_sysfs_init(struct amdgpu_device *adev) 511 { 512 int rc; 513 514 if (!(adev->flags & AMD_IS_APU)) 515 return; 516 517 if (!amdgpu_acpi_is_set_uma_allocation_size_supported()) 518 return; 519 520 rc = amdgpu_atomfirmware_get_uma_carveout_info(adev, &adev->uma_info); 521 if (rc) { 522 drm_dbg(adev_to_drm(adev), 523 "Failed to parse UMA carveout info from VBIOS: %d\n", rc); 524 goto out_info; 525 } 526 527 mutex_init(&adev->uma_info.update_lock); 528 529 rc = devm_device_add_group(adev->dev, &amdgpu_uma_attr_group); 530 if (rc) { 531 drm_dbg(adev_to_drm(adev), "Failed to add UMA carveout sysfs interfaces %d\n", rc); 532 goto out_attr; 533 } 534 535 return; 536 537 out_attr: 538 mutex_destroy(&adev->uma_info.update_lock); 539 out_info: 540 return; 541 } 542 543 static void amdgpu_uma_sysfs_fini(struct amdgpu_device *adev) 544 { 545 struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info; 546 547 if (!amdgpu_acpi_is_set_uma_allocation_size_supported()) 548 return; 549 550 mutex_destroy(&uma_info->update_lock); 551 uma_info->num_entries = 0; 552 } 553 554 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 555 556 /** 557 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 558 * 559 * @adev: amdgpu device pointer 560 * 561 * Returns true if the device is a dGPU with ATPX power control, 562 * otherwise return false. 563 */ 564 bool amdgpu_device_supports_px(struct amdgpu_device *adev) 565 { 566 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 567 return true; 568 return false; 569 } 570 571 /** 572 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 573 * 574 * @adev: amdgpu device pointer 575 * 576 * Returns true if the device is a dGPU with ACPI power control, 577 * otherwise return false. 
578 */ 579 bool amdgpu_device_supports_boco(struct amdgpu_device *adev) 580 { 581 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 582 return false; 583 584 if (adev->has_pr3 || 585 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 586 return true; 587 return false; 588 } 589 590 /** 591 * amdgpu_device_supports_baco - Does the device support BACO 592 * 593 * @adev: amdgpu device pointer 594 * 595 * Return: 596 * 1 if the device supports BACO; 597 * 3 if the device supports MACO (only works if BACO is supported) 598 * otherwise return 0. 599 */ 600 int amdgpu_device_supports_baco(struct amdgpu_device *adev) 601 { 602 return amdgpu_asic_supports_baco(adev); 603 } 604 605 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) 606 { 607 int bamaco_support; 608 609 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; 610 bamaco_support = amdgpu_device_supports_baco(adev); 611 612 switch (amdgpu_runtime_pm) { 613 case 2: 614 if (bamaco_support & MACO_SUPPORT) { 615 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 616 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n"); 617 } else if (bamaco_support == BACO_SUPPORT) { 618 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 619 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n"); 620 } 621 break; 622 case 1: 623 if (bamaco_support & BACO_SUPPORT) { 624 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 625 dev_info(adev->dev, "Forcing BACO for runtime pm\n"); 626 } 627 break; 628 case -1: 629 case -2: 630 if (amdgpu_device_supports_px(adev)) { 631 /* enable PX as runtime mode */ 632 adev->pm.rpm_mode = AMDGPU_RUNPM_PX; 633 dev_info(adev->dev, "Using ATPX for runtime pm\n"); 634 } else if (amdgpu_device_supports_boco(adev)) { 635 /* enable boco as runtime mode */ 636 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; 637 dev_info(adev->dev, "Using BOCO for runtime pm\n"); 638 } else { 639 if (!bamaco_support) 640 goto no_runtime_pm; 641 642 switch (adev->asic_type) { 643 case CHIP_VEGA20: 644 case CHIP_ARCTURUS: 645 /* BACO are not supported on vega20 and arctrus */ 646 break; 647 case CHIP_VEGA10: 648 /* enable BACO as runpm mode if noretry=0 */ 649 if (!adev->gmc.noretry && !amdgpu_passthrough(adev)) 650 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 651 break; 652 default: 653 /* enable BACO as runpm mode on CI+ */ 654 if (!amdgpu_passthrough(adev)) 655 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 656 break; 657 } 658 659 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { 660 if (bamaco_support & MACO_SUPPORT) { 661 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 662 dev_info(adev->dev, "Using BAMACO for runtime pm\n"); 663 } else { 664 dev_info(adev->dev, "Using BACO for runtime pm\n"); 665 } 666 } 667 } 668 break; 669 case 0: 670 dev_info(adev->dev, "runtime pm is manually disabled\n"); 671 break; 672 default: 673 break; 674 } 675 676 no_runtime_pm: 677 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) 678 dev_info(adev->dev, "Runtime PM not available\n"); 679 } 680 /** 681 * amdgpu_device_supports_smart_shift - Is the device dGPU with 682 * smart shift support 683 * 684 * @adev: amdgpu device pointer 685 * 686 * Returns true if the device is a dGPU with Smart Shift support, 687 * otherwise returns false. 
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value is the number of bytes transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte offset helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with byte offset helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
964 */ 965 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 966 { 967 if (amdgpu_device_skip_hw_access(adev)) 968 return; 969 970 if (offset < adev->rmmio_size) 971 writeb(value, adev->rmmio + offset); 972 else 973 BUG(); 974 } 975 976 /** 977 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 978 * 979 * @adev: amdgpu_device pointer 980 * @reg: dword aligned register offset 981 * @v: 32 bit value to write to the register 982 * @acc_flags: access flags which require special behavior 983 * 984 * Writes the value specified to the offset specified. 985 */ 986 void amdgpu_device_wreg(struct amdgpu_device *adev, 987 uint32_t reg, uint32_t v, 988 uint32_t acc_flags) 989 { 990 if (amdgpu_device_skip_hw_access(adev)) 991 return; 992 993 if ((reg * 4) < adev->rmmio_size) { 994 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 995 amdgpu_sriov_runtime(adev) && 996 down_read_trylock(&adev->reset_domain->sem)) { 997 amdgpu_kiq_wreg(adev, reg, v, 0); 998 up_read(&adev->reset_domain->sem); 999 } else { 1000 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 1001 } 1002 } else { 1003 adev->pcie_wreg(adev, reg * 4, v); 1004 } 1005 1006 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 1007 } 1008 1009 /** 1010 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 1011 * 1012 * @adev: amdgpu_device pointer 1013 * @reg: mmio/rlc register 1014 * @v: value to write 1015 * @xcc_id: xcc accelerated compute core id 1016 * 1017 * this function is invoked only for the debugfs register access 1018 */ 1019 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 1020 uint32_t reg, uint32_t v, 1021 uint32_t xcc_id) 1022 { 1023 if (amdgpu_device_skip_hw_access(adev)) 1024 return; 1025 1026 if (amdgpu_sriov_fullaccess(adev) && 1027 adev->gfx.rlc.funcs && 1028 adev->gfx.rlc.funcs->is_rlcg_access_range) { 1029 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 1030 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 1031 } else if ((reg * 4) >= adev->rmmio_size) { 1032 adev->pcie_wreg(adev, reg * 4, v); 1033 } else { 1034 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 1035 } 1036 } 1037 1038 /** 1039 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 1040 * 1041 * @adev: amdgpu_device pointer 1042 * @reg: dword aligned register offset 1043 * @v: 32 bit value to write to the register 1044 * @acc_flags: access flags which require special behavior 1045 * @xcc_id: xcc accelerated compute core id 1046 * 1047 * Writes the value specified to the offset specified. 
1048 */ 1049 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 1050 uint32_t reg, uint32_t v, 1051 uint32_t acc_flags, uint32_t xcc_id) 1052 { 1053 uint32_t rlcg_flag; 1054 1055 if (amdgpu_device_skip_hw_access(adev)) 1056 return; 1057 1058 if ((reg * 4) < adev->rmmio_size) { 1059 if (amdgpu_sriov_vf(adev) && 1060 !amdgpu_sriov_runtime(adev) && 1061 adev->gfx.rlc.rlcg_reg_access_supported && 1062 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 1063 GC_HWIP, true, 1064 &rlcg_flag)) { 1065 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 1066 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 1067 amdgpu_sriov_runtime(adev) && 1068 down_read_trylock(&adev->reset_domain->sem)) { 1069 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 1070 up_read(&adev->reset_domain->sem); 1071 } else { 1072 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 1073 } 1074 } else { 1075 adev->pcie_wreg(adev, reg * 4, v); 1076 } 1077 } 1078 1079 /** 1080 * amdgpu_device_indirect_rreg - read an indirect register 1081 * 1082 * @adev: amdgpu_device pointer 1083 * @reg_addr: indirect register address to read from 1084 * 1085 * Returns the value of indirect register @reg_addr 1086 */ 1087 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 1088 u32 reg_addr) 1089 { 1090 unsigned long flags, pcie_index, pcie_data; 1091 void __iomem *pcie_index_offset; 1092 void __iomem *pcie_data_offset; 1093 u32 r; 1094 1095 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1096 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1097 1098 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1099 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1100 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1101 1102 writel(reg_addr, pcie_index_offset); 1103 readl(pcie_index_offset); 1104 r = readl(pcie_data_offset); 1105 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1106 1107 return r; 1108 } 1109 1110 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 1111 u64 reg_addr) 1112 { 1113 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1114 u32 r; 1115 void __iomem *pcie_index_offset; 1116 void __iomem *pcie_index_hi_offset; 1117 void __iomem *pcie_data_offset; 1118 1119 if (unlikely(!adev->nbio.funcs)) { 1120 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 1121 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 1122 } else { 1123 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1124 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1125 } 1126 1127 if (reg_addr >> 32) { 1128 if (unlikely(!adev->nbio.funcs)) 1129 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 1130 else 1131 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1132 } else { 1133 pcie_index_hi = 0; 1134 } 1135 1136 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1137 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1138 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1139 if (pcie_index_hi != 0) 1140 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1141 pcie_index_hi * 4; 1142 1143 writel(reg_addr, pcie_index_offset); 1144 readl(pcie_index_offset); 1145 if (pcie_index_hi != 0) { 1146 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1147 readl(pcie_index_hi_offset); 1148 } 1149 r = readl(pcie_data_offset); 1150 1151 /* clear the high bits */ 1152 if (pcie_index_hi != 0) { 1153 writel(0, pcie_index_hi_offset); 1154 readl(pcie_index_hi_offset); 1155 } 1156 1157 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1158 1159 return 
r; 1160 } 1161 1162 /** 1163 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 1164 * 1165 * @adev: amdgpu_device pointer 1166 * @reg_addr: indirect register address to read from 1167 * 1168 * Returns the value of indirect register @reg_addr 1169 */ 1170 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1171 u32 reg_addr) 1172 { 1173 unsigned long flags, pcie_index, pcie_data; 1174 void __iomem *pcie_index_offset; 1175 void __iomem *pcie_data_offset; 1176 u64 r; 1177 1178 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1179 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1180 1181 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1182 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1183 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1184 1185 /* read low 32 bits */ 1186 writel(reg_addr, pcie_index_offset); 1187 readl(pcie_index_offset); 1188 r = readl(pcie_data_offset); 1189 /* read high 32 bits */ 1190 writel(reg_addr + 4, pcie_index_offset); 1191 readl(pcie_index_offset); 1192 r |= ((u64)readl(pcie_data_offset) << 32); 1193 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1194 1195 return r; 1196 } 1197 1198 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1199 u64 reg_addr) 1200 { 1201 unsigned long flags, pcie_index, pcie_data; 1202 unsigned long pcie_index_hi = 0; 1203 void __iomem *pcie_index_offset; 1204 void __iomem *pcie_index_hi_offset; 1205 void __iomem *pcie_data_offset; 1206 u64 r; 1207 1208 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1209 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1210 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1211 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1212 1213 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1214 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1215 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1216 if (pcie_index_hi != 0) 1217 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1218 pcie_index_hi * 4; 1219 1220 /* read low 32 bits */ 1221 writel(reg_addr, pcie_index_offset); 1222 readl(pcie_index_offset); 1223 if (pcie_index_hi != 0) { 1224 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1225 readl(pcie_index_hi_offset); 1226 } 1227 r = readl(pcie_data_offset); 1228 /* read high 32 bits */ 1229 writel(reg_addr + 4, pcie_index_offset); 1230 readl(pcie_index_offset); 1231 if (pcie_index_hi != 0) { 1232 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1233 readl(pcie_index_hi_offset); 1234 } 1235 r |= ((u64)readl(pcie_data_offset) << 32); 1236 1237 /* clear the high bits */ 1238 if (pcie_index_hi != 0) { 1239 writel(0, pcie_index_hi_offset); 1240 readl(pcie_index_hi_offset); 1241 } 1242 1243 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1244 1245 return r; 1246 } 1247 1248 /** 1249 * amdgpu_device_indirect_wreg - write an indirect register address 1250 * 1251 * @adev: amdgpu_device pointer 1252 * @reg_addr: indirect register offset 1253 * @reg_data: indirect register data 1254 * 1255 */ 1256 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1257 u32 reg_addr, u32 reg_data) 1258 { 1259 unsigned long flags, pcie_index, pcie_data; 1260 void __iomem *pcie_index_offset; 1261 void __iomem *pcie_data_offset; 1262 1263 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1264 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1265 1266 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 
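	/*
	 * Indirect access goes through an index/data register pair: the target
	 * address is programmed into the PCIE index register and the value is
	 * then transferred through the PCIE data register.  The readl() after
	 * each writel() below flushes the posted write so the accesses reach
	 * the device in order.
	 */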
1267 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1268 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1269 1270 writel(reg_addr, pcie_index_offset); 1271 readl(pcie_index_offset); 1272 writel(reg_data, pcie_data_offset); 1273 readl(pcie_data_offset); 1274 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1275 } 1276 1277 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1278 u64 reg_addr, u32 reg_data) 1279 { 1280 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1281 void __iomem *pcie_index_offset; 1282 void __iomem *pcie_index_hi_offset; 1283 void __iomem *pcie_data_offset; 1284 1285 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1286 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1287 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1288 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1289 else 1290 pcie_index_hi = 0; 1291 1292 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1293 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1294 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1295 if (pcie_index_hi != 0) 1296 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1297 pcie_index_hi * 4; 1298 1299 writel(reg_addr, pcie_index_offset); 1300 readl(pcie_index_offset); 1301 if (pcie_index_hi != 0) { 1302 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1303 readl(pcie_index_hi_offset); 1304 } 1305 writel(reg_data, pcie_data_offset); 1306 readl(pcie_data_offset); 1307 1308 /* clear the high bits */ 1309 if (pcie_index_hi != 0) { 1310 writel(0, pcie_index_hi_offset); 1311 readl(pcie_index_hi_offset); 1312 } 1313 1314 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1315 } 1316 1317 /** 1318 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1319 * 1320 * @adev: amdgpu_device pointer 1321 * @reg_addr: indirect register offset 1322 * @reg_data: indirect register data 1323 * 1324 */ 1325 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1326 u32 reg_addr, u64 reg_data) 1327 { 1328 unsigned long flags, pcie_index, pcie_data; 1329 void __iomem *pcie_index_offset; 1330 void __iomem *pcie_data_offset; 1331 1332 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1333 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1334 1335 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1336 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1337 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1338 1339 /* write low 32 bits */ 1340 writel(reg_addr, pcie_index_offset); 1341 readl(pcie_index_offset); 1342 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1343 readl(pcie_data_offset); 1344 /* write high 32 bits */ 1345 writel(reg_addr + 4, pcie_index_offset); 1346 readl(pcie_index_offset); 1347 writel((u32)(reg_data >> 32), pcie_data_offset); 1348 readl(pcie_data_offset); 1349 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1350 } 1351 1352 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1353 u64 reg_addr, u64 reg_data) 1354 { 1355 unsigned long flags, pcie_index, pcie_data; 1356 unsigned long pcie_index_hi = 0; 1357 void __iomem *pcie_index_offset; 1358 void __iomem *pcie_index_hi_offset; 1359 void __iomem *pcie_data_offset; 1360 1361 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1362 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1363 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1692 */ 1693 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1694 { 1695 return pci_reset_function(adev->pdev); 1696 } 1697 1698 /* 1699 * amdgpu_device_wb_*() 1700 * Writeback is the method by which the GPU updates special pages in memory 1701 * with the status of certain GPU events (fences, ring pointers,etc.). 1702 */ 1703 1704 /** 1705 * amdgpu_device_wb_fini - Disable Writeback and free memory 1706 * 1707 * @adev: amdgpu_device pointer 1708 * 1709 * Disables Writeback and frees the Writeback memory (all asics). 1710 * Used at driver shutdown. 1711 */ 1712 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1713 { 1714 if (adev->wb.wb_obj) { 1715 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1716 &adev->wb.gpu_addr, 1717 (void **)&adev->wb.wb); 1718 adev->wb.wb_obj = NULL; 1719 } 1720 } 1721 1722 /** 1723 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1724 * 1725 * @adev: amdgpu_device pointer 1726 * 1727 * Initializes writeback and allocates writeback memory (all asics). 1728 * Used at driver startup. 1729 * Returns 0 on success or an -error on failure. 1730 */ 1731 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1732 { 1733 int r; 1734 1735 if (adev->wb.wb_obj == NULL) { 1736 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1737 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1738 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1739 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1740 (void **)&adev->wb.wb); 1741 if (r) { 1742 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1743 return r; 1744 } 1745 1746 adev->wb.num_wb = AMDGPU_MAX_WB; 1747 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1748 1749 /* clear wb memory */ 1750 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1751 } 1752 1753 return 0; 1754 } 1755 1756 /** 1757 * amdgpu_device_wb_get - Allocate a wb entry 1758 * 1759 * @adev: amdgpu_device pointer 1760 * @wb: wb index 1761 * 1762 * Allocate a wb slot for use by the driver (all asics). 1763 * Returns 0 on success or -EINVAL on failure. 1764 */ 1765 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1766 { 1767 unsigned long flags, offset; 1768 1769 spin_lock_irqsave(&adev->wb.lock, flags); 1770 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1771 if (offset < adev->wb.num_wb) { 1772 __set_bit(offset, adev->wb.used); 1773 spin_unlock_irqrestore(&adev->wb.lock, flags); 1774 *wb = offset << 3; /* convert to dw offset */ 1775 return 0; 1776 } else { 1777 spin_unlock_irqrestore(&adev->wb.lock, flags); 1778 return -EINVAL; 1779 } 1780 } 1781 1782 /** 1783 * amdgpu_device_wb_free - Free a wb entry 1784 * 1785 * @adev: amdgpu_device pointer 1786 * @wb: wb index 1787 * 1788 * Free a wb slot allocated for use by the driver (all asics) 1789 */ 1790 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1791 { 1792 unsigned long flags; 1793 1794 wb >>= 3; 1795 spin_lock_irqsave(&adev->wb.lock, flags); 1796 if (wb < adev->wb.num_wb) 1797 __clear_bit(wb, adev->wb.used); 1798 spin_unlock_irqrestore(&adev->wb.lock, flags); 1799 } 1800 1801 /** 1802 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1803 * 1804 * @adev: amdgpu_device pointer 1805 * 1806 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1807 * to fail, but if any of the BARs is not accessible after the size we abort 1808 * driver loading by returning -ENODEV. 
1809 */ 1810 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1811 { 1812 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1813 struct pci_bus *root; 1814 struct resource *res; 1815 unsigned int i; 1816 u16 cmd; 1817 int r; 1818 1819 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1820 return 0; 1821 1822 /* Bypass for VF */ 1823 if (amdgpu_sriov_vf(adev)) 1824 return 0; 1825 1826 if (!amdgpu_rebar) 1827 return 0; 1828 1829 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1830 if ((amdgpu_runtime_pm != 0) && 1831 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1832 adev->pdev->device == 0x731f && 1833 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1834 return 0; 1835 1836 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1837 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1838 dev_warn( 1839 adev->dev, 1840 "System can't access extended configuration space, please check!!\n"); 1841 1842 /* skip if the bios has already enabled large BAR */ 1843 if (adev->gmc.real_vram_size && 1844 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1845 return 0; 1846 1847 /* Check if the root BUS has 64bit memory resources */ 1848 root = adev->pdev->bus; 1849 while (root->parent) 1850 root = root->parent; 1851 1852 pci_bus_for_each_resource(root, res, i) { 1853 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1854 res->start > 0x100000000ull) 1855 break; 1856 } 1857 1858 /* Trying to resize is pointless without a root hub window above 4GB */ 1859 if (!res) 1860 return 0; 1861 1862 /* Limit the BAR size to what is available */ 1863 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1864 rbar_size); 1865 1866 /* Disable memory decoding while we change the BAR addresses and size */ 1867 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1868 pci_write_config_word(adev->pdev, PCI_COMMAND, 1869 cmd & ~PCI_COMMAND_MEMORY); 1870 1871 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1872 amdgpu_doorbell_fini(adev); 1873 if (adev->asic_type >= CHIP_BONAIRE) 1874 pci_release_resource(adev->pdev, 2); 1875 1876 pci_release_resource(adev->pdev, 0); 1877 1878 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1879 if (r == -ENOSPC) 1880 dev_info(adev->dev, 1881 "Not enough PCI address space for a large BAR."); 1882 else if (r && r != -ENOTSUPP) 1883 dev_err(adev->dev, "Problem resizing BAR0 (%d).", r); 1884 1885 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1886 1887 /* When the doorbell or fb BAR isn't available we have no chance of 1888 * using the device. 1889 */ 1890 r = amdgpu_doorbell_init(adev); 1891 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1892 return -ENODEV; 1893 1894 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1895 1896 return 0; 1897 } 1898 1899 /* 1900 * GPU helpers function. 1901 */ 1902 /** 1903 * amdgpu_device_need_post - check if the hw need post or not 1904 * 1905 * @adev: amdgpu_device pointer 1906 * 1907 * Check if the asic has been initialized (all asics) at driver startup 1908 * or post is needed if hw reset is performed. 1909 * Returns true if need or false if not. 
1910 */ 1911 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1912 { 1913 uint32_t reg, flags; 1914 1915 if (amdgpu_sriov_vf(adev)) 1916 return false; 1917 1918 flags = amdgpu_device_get_vbios_flags(adev); 1919 if (flags & AMDGPU_VBIOS_SKIP) 1920 return false; 1921 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1922 return false; 1923 1924 if (amdgpu_passthrough(adev)) { 1925 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1926 * some old smc fw still need driver do vPost otherwise gpu hang, while 1927 * those smc fw version above 22.15 doesn't have this flaw, so we force 1928 * vpost executed for smc version below 22.15 1929 */ 1930 if (adev->asic_type == CHIP_FIJI) { 1931 int err; 1932 uint32_t fw_ver; 1933 1934 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1935 /* force vPost if error occurred */ 1936 if (err) 1937 return true; 1938 1939 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1940 release_firmware(adev->pm.fw); 1941 if (fw_ver < 0x00160e00) 1942 return true; 1943 } 1944 } 1945 1946 /* Don't post if we need to reset whole hive on init */ 1947 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1948 return false; 1949 1950 if (adev->has_hw_reset) { 1951 adev->has_hw_reset = false; 1952 return true; 1953 } 1954 1955 /* bios scratch used on CIK+ */ 1956 if (adev->asic_type >= CHIP_BONAIRE) 1957 return amdgpu_atombios_scratch_need_asic_init(adev); 1958 1959 /* check MEM_SIZE for older asics */ 1960 reg = amdgpu_asic_get_config_memsize(adev); 1961 1962 if ((reg != 0) && (reg != 0xffffffff)) 1963 return false; 1964 1965 return true; 1966 } 1967 1968 /* 1969 * Check whether seamless boot is supported. 1970 * 1971 * So far we only support seamless boot on DCE 3.0 or later. 1972 * If users report that it works on older ASICS as well, we may 1973 * loosen this. 1974 */ 1975 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1976 { 1977 switch (amdgpu_seamless) { 1978 case -1: 1979 break; 1980 case 1: 1981 return true; 1982 case 0: 1983 return false; 1984 default: 1985 dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n", 1986 amdgpu_seamless); 1987 return false; 1988 } 1989 1990 if (!(adev->flags & AMD_IS_APU)) 1991 return false; 1992 1993 if (adev->mman.keep_stolen_vga_memory) 1994 return false; 1995 1996 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1997 } 1998 1999 /* 2000 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 2001 * don't support dynamic speed switching. Until we have confirmation from Intel 2002 * that a specific host supports it, it's safer that we keep it disabled for all. 2003 * 2004 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 2005 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 2006 */ 2007 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 2008 { 2009 #if IS_ENABLED(CONFIG_X86) 2010 struct cpuinfo_x86 *c = &cpu_data(0); 2011 2012 /* eGPU change speeds based on USB4 fabric conditions */ 2013 if (dev_is_removable(adev->dev)) 2014 return true; 2015 2016 if (c->x86_vendor == X86_VENDOR_INTEL) 2017 return false; 2018 #endif 2019 return true; 2020 } 2021 2022 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) 2023 { 2024 /* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4. 
2025 * It's unclear if this is a platform-specific or GPU-specific issue. 2026 * Disable ASPM on SI for the time being. 2027 */ 2028 if (adev->family == AMDGPU_FAMILY_SI) 2029 return true; 2030 2031 #if IS_ENABLED(CONFIG_X86) 2032 struct cpuinfo_x86 *c = &cpu_data(0); 2033 2034 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 2035 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 2036 return false; 2037 2038 if (c->x86 == 6 && 2039 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 2040 switch (c->x86_model) { 2041 case VFM_MODEL(INTEL_ALDERLAKE): 2042 case VFM_MODEL(INTEL_ALDERLAKE_L): 2043 case VFM_MODEL(INTEL_RAPTORLAKE): 2044 case VFM_MODEL(INTEL_RAPTORLAKE_P): 2045 case VFM_MODEL(INTEL_RAPTORLAKE_S): 2046 return true; 2047 default: 2048 return false; 2049 } 2050 } else { 2051 return false; 2052 } 2053 #else 2054 return false; 2055 #endif 2056 } 2057 2058 /** 2059 * amdgpu_device_should_use_aspm - check if the device should program ASPM 2060 * 2061 * @adev: amdgpu_device pointer 2062 * 2063 * Confirm whether the module parameter and pcie bridge agree that ASPM should 2064 * be set for this device. 2065 * 2066 * Returns true if it should be used or false if not. 2067 */ 2068 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 2069 { 2070 switch (amdgpu_aspm) { 2071 case -1: 2072 break; 2073 case 0: 2074 return false; 2075 case 1: 2076 return true; 2077 default: 2078 return false; 2079 } 2080 if (adev->flags & AMD_IS_APU) 2081 return false; 2082 if (amdgpu_device_aspm_support_quirk(adev)) 2083 return false; 2084 return pcie_aspm_enabled(adev->pdev); 2085 } 2086 2087 /* if we get transitioned to only one device, take VGA back */ 2088 /** 2089 * amdgpu_device_vga_set_decode - enable/disable vga decode 2090 * 2091 * @pdev: PCI device pointer 2092 * @state: enable/disable vga decode 2093 * 2094 * Enable/disable vga decode (all asics). 2095 * Returns VGA resource flags. 2096 */ 2097 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 2098 bool state) 2099 { 2100 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 2101 2102 amdgpu_asic_set_vga_state(adev, state); 2103 if (state) 2104 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 2105 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 2106 else 2107 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 2108 } 2109 2110 /** 2111 * amdgpu_device_check_block_size - validate the vm block size 2112 * 2113 * @adev: amdgpu_device pointer 2114 * 2115 * Validates the vm block size specified via module parameter. 2116 * The vm block size defines number of bits in page table versus page directory, 2117 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 2118 * page table and the remaining bits are in the page directory. 2119 */ 2120 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 2121 { 2122 /* defines number of bits in page table versus page directory, 2123 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 2124 * page table and the remaining bits are in the page directory 2125 */ 2126 if (amdgpu_vm_block_size == -1) 2127 return; 2128 2129 if (amdgpu_vm_block_size < 9) { 2130 dev_warn(adev->dev, "VM page table size (%d) too small\n", 2131 amdgpu_vm_block_size); 2132 amdgpu_vm_block_size = -1; 2133 } 2134 } 2135 2136 /** 2137 * amdgpu_device_check_vm_size - validate the vm size 2138 * 2139 * @adev: amdgpu_device pointer 2140 * 2141 * Validates the vm size in GB specified via module parameter. 
2142 * The VM size is the size of the GPU virtual memory space in GB. 2143 */ 2144 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2145 { 2146 /* no need to check the default value */ 2147 if (amdgpu_vm_size == -1) 2148 return; 2149 2150 if (amdgpu_vm_size < 1) { 2151 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2152 amdgpu_vm_size); 2153 amdgpu_vm_size = -1; 2154 } 2155 } 2156 2157 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2158 { 2159 struct sysinfo si; 2160 bool is_os_64 = (sizeof(void *) == 8); 2161 uint64_t total_memory; 2162 uint64_t dram_size_seven_GB = 0x1B8000000; 2163 uint64_t dram_size_three_GB = 0xB8000000; 2164 2165 if (amdgpu_smu_memory_pool_size == 0) 2166 return; 2167 2168 if (!is_os_64) { 2169 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2170 goto def_value; 2171 } 2172 si_meminfo(&si); 2173 total_memory = (uint64_t)si.totalram * si.mem_unit; 2174 2175 if ((amdgpu_smu_memory_pool_size == 1) || 2176 (amdgpu_smu_memory_pool_size == 2)) { 2177 if (total_memory < dram_size_three_GB) 2178 goto def_value1; 2179 } else if ((amdgpu_smu_memory_pool_size == 4) || 2180 (amdgpu_smu_memory_pool_size == 8)) { 2181 if (total_memory < dram_size_seven_GB) 2182 goto def_value1; 2183 } else { 2184 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2185 goto def_value; 2186 } 2187 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2188 2189 return; 2190 2191 def_value1: 2192 dev_warn(adev->dev, "No enough system memory\n"); 2193 def_value: 2194 adev->pm.smu_prv_buffer_size = 0; 2195 } 2196 2197 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2198 { 2199 if (!(adev->flags & AMD_IS_APU) || 2200 adev->asic_type < CHIP_RAVEN) 2201 return 0; 2202 2203 switch (adev->asic_type) { 2204 case CHIP_RAVEN: 2205 if (adev->pdev->device == 0x15dd) 2206 adev->apu_flags |= AMD_APU_IS_RAVEN; 2207 if (adev->pdev->device == 0x15d8) 2208 adev->apu_flags |= AMD_APU_IS_PICASSO; 2209 break; 2210 case CHIP_RENOIR: 2211 if ((adev->pdev->device == 0x1636) || 2212 (adev->pdev->device == 0x164c)) 2213 adev->apu_flags |= AMD_APU_IS_RENOIR; 2214 else 2215 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2216 break; 2217 case CHIP_VANGOGH: 2218 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2219 break; 2220 case CHIP_YELLOW_CARP: 2221 break; 2222 case CHIP_CYAN_SKILLFISH: 2223 if ((adev->pdev->device == 0x13FE) || 2224 (adev->pdev->device == 0x143F)) 2225 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2226 break; 2227 default: 2228 break; 2229 } 2230 2231 return 0; 2232 } 2233 2234 /** 2235 * amdgpu_device_check_arguments - validate module params 2236 * 2237 * @adev: amdgpu_device pointer 2238 * 2239 * Validates certain module parameters and updates 2240 * the associated values used by the driver (all asics). 
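 * Out-of-range values are either raised to a minimum (sched_jobs,
 * sched_hw_submission), rounded up to the next power of two, or reset to
 * their defaults (gart/gtt size, vm_fragment_size, reset_method), with a
 * warning printed in each case.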
2241 */ 2242 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2243 { 2244 int i; 2245 2246 if (amdgpu_sched_jobs < 4) { 2247 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2248 amdgpu_sched_jobs); 2249 amdgpu_sched_jobs = 4; 2250 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2251 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2252 amdgpu_sched_jobs); 2253 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2254 } 2255 2256 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2257 /* gart size must be greater or equal to 32M */ 2258 dev_warn(adev->dev, "gart size (%d) too small\n", 2259 amdgpu_gart_size); 2260 amdgpu_gart_size = -1; 2261 } 2262 2263 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2264 /* gtt size must be greater or equal to 32M */ 2265 dev_warn(adev->dev, "gtt size (%d) too small\n", 2266 amdgpu_gtt_size); 2267 amdgpu_gtt_size = -1; 2268 } 2269 2270 /* valid range is between 4 and 9 inclusive */ 2271 if (amdgpu_vm_fragment_size != -1 && 2272 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2273 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2274 amdgpu_vm_fragment_size = -1; 2275 } 2276 2277 if (amdgpu_sched_hw_submission < 2) { 2278 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2279 amdgpu_sched_hw_submission); 2280 amdgpu_sched_hw_submission = 2; 2281 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2282 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2283 amdgpu_sched_hw_submission); 2284 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2285 } 2286 2287 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2288 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2289 amdgpu_reset_method = -1; 2290 } 2291 2292 amdgpu_device_check_smu_prv_buffer_size(adev); 2293 2294 amdgpu_device_check_vm_size(adev); 2295 2296 amdgpu_device_check_block_size(adev); 2297 2298 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2299 2300 for (i = 0; i < MAX_XCP; i++) { 2301 switch (amdgpu_enforce_isolation) { 2302 case -1: 2303 case 0: 2304 default: 2305 /* disable */ 2306 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2307 break; 2308 case 1: 2309 /* enable */ 2310 adev->enforce_isolation[i] = 2311 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2312 break; 2313 case 2: 2314 /* enable legacy mode */ 2315 adev->enforce_isolation[i] = 2316 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2317 break; 2318 case 3: 2319 /* enable only process isolation without submitting cleaner shader */ 2320 adev->enforce_isolation[i] = 2321 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2322 break; 2323 } 2324 } 2325 2326 return 0; 2327 } 2328 2329 /** 2330 * amdgpu_switcheroo_set_state - set switcheroo state 2331 * 2332 * @pdev: pci dev pointer 2333 * @state: vga_switcheroo state 2334 * 2335 * Callback for the switcheroo driver. Suspends or resumes 2336 * the asics before or after it is powered up using ACPI methods. 
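 * When switched on, the device is put into D0, its PCI state is restored,
 * the PCI device is re-enabled and the device is resumed; when switched
 * off, the device is prepared, suspended, its PCI state is cached and it
 * is put into D3cold. Switch-off requests are ignored for PX platforms.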
2337 */ 2338 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2339 enum vga_switcheroo_state state) 2340 { 2341 struct drm_device *dev = pci_get_drvdata(pdev); 2342 int r; 2343 2344 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2345 state == VGA_SWITCHEROO_OFF) 2346 return; 2347 2348 if (state == VGA_SWITCHEROO_ON) { 2349 pr_info("switched on\n"); 2350 /* don't suspend or resume card normally */ 2351 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2352 2353 pci_set_power_state(pdev, PCI_D0); 2354 amdgpu_device_load_pci_state(pdev); 2355 r = pci_enable_device(pdev); 2356 if (r) 2357 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2358 r); 2359 amdgpu_device_resume(dev, true); 2360 2361 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2362 } else { 2363 dev_info(&pdev->dev, "switched off\n"); 2364 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2365 amdgpu_device_prepare(dev); 2366 amdgpu_device_suspend(dev, true); 2367 amdgpu_device_cache_pci_state(pdev); 2368 /* Shut down the device */ 2369 pci_disable_device(pdev); 2370 pci_set_power_state(pdev, PCI_D3cold); 2371 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2372 } 2373 } 2374 2375 /** 2376 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2377 * 2378 * @pdev: pci dev pointer 2379 * 2380 * Callback for the switcheroo driver. Checks if the switcheroo 2381 * state can be changed. 2382 * Returns true if the state can be changed, false if not. 2383 */ 2384 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2385 { 2386 struct drm_device *dev = pci_get_drvdata(pdev); 2387 2388 /* 2389 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2390 * locking inversion with the driver load path. And the access here is 2391 * completely racy anyway. So don't bother with locking for now. 2392 */ 2393 return atomic_read(&dev->open_count) == 0; 2394 } 2395 2396 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2397 .set_gpu_state = amdgpu_switcheroo_set_state, 2398 .reprobe = NULL, 2399 .can_switch = amdgpu_switcheroo_can_switch, 2400 }; 2401 2402 /** 2403 * amdgpu_device_enable_virtual_display - enable virtual display feature 2404 * 2405 * @adev: amdgpu_device pointer 2406 * 2407 * Enables the virtual display feature if the user has enabled it via 2408 * the module parameter virtual_display. This feature provides virtual 2409 * display hardware on headless boards or in virtualized environments. 2410 * This function parses and validates the configuration string specified by 2411 * the user and configures the virtual display configuration (number of 2412 * virtual connectors, crtcs, etc.) specified.
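 * Based on the parsing below, the string is a semicolon-separated list of
 * entries, each either "all" or a PCI address, optionally followed by a
 * comma and the number of CRTCs to expose (clamped to 1-6, default 1),
 * e.g. "0000:03:00.0,2" (the address here is only an illustrative example).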
2413 */ 2414 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2415 { 2416 adev->enable_virtual_display = false; 2417 2418 if (amdgpu_virtual_display) { 2419 const char *pci_address_name = pci_name(adev->pdev); 2420 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2421 2422 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2423 pciaddstr_tmp = pciaddstr; 2424 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2425 pciaddname = strsep(&pciaddname_tmp, ","); 2426 if (!strcmp("all", pciaddname) 2427 || !strcmp(pci_address_name, pciaddname)) { 2428 long num_crtc; 2429 int res = -1; 2430 2431 adev->enable_virtual_display = true; 2432 2433 if (pciaddname_tmp) 2434 res = kstrtol(pciaddname_tmp, 10, 2435 &num_crtc); 2436 2437 if (!res) { 2438 if (num_crtc < 1) 2439 num_crtc = 1; 2440 if (num_crtc > 6) 2441 num_crtc = 6; 2442 adev->mode_info.num_crtc = num_crtc; 2443 } else { 2444 adev->mode_info.num_crtc = 1; 2445 } 2446 break; 2447 } 2448 } 2449 2450 dev_info( 2451 adev->dev, 2452 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2453 amdgpu_virtual_display, pci_address_name, 2454 adev->enable_virtual_display, adev->mode_info.num_crtc); 2455 2456 kfree(pciaddstr); 2457 } 2458 } 2459 2460 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2461 { 2462 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2463 adev->mode_info.num_crtc = 1; 2464 adev->enable_virtual_display = true; 2465 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2466 adev->enable_virtual_display, 2467 adev->mode_info.num_crtc); 2468 } 2469 } 2470 2471 /** 2472 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2473 * 2474 * @adev: amdgpu_device pointer 2475 * 2476 * Parses the asic configuration parameters specified in the gpu info 2477 * firmware and makes them available to the driver for use in configuring 2478 * the asic. 2479 * Returns 0 on success, -EINVAL on failure. 
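 * Only a fixed set of ASICs (vega10, vega12, the raven variants, arcturus,
 * navi12 and cyan_skillfish) carry a gpu_info firmware; for all others this
 * is a no-op, and for navi12/cyan_skillfish it is skipped when an IP
 * discovery binary is present.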
2480 */ 2481 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2482 { 2483 const char *chip_name; 2484 int err; 2485 const struct gpu_info_firmware_header_v1_0 *hdr; 2486 2487 adev->firmware.gpu_info_fw = NULL; 2488 2489 switch (adev->asic_type) { 2490 default: 2491 return 0; 2492 case CHIP_VEGA10: 2493 chip_name = "vega10"; 2494 break; 2495 case CHIP_VEGA12: 2496 chip_name = "vega12"; 2497 break; 2498 case CHIP_RAVEN: 2499 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2500 chip_name = "raven2"; 2501 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2502 chip_name = "picasso"; 2503 else 2504 chip_name = "raven"; 2505 break; 2506 case CHIP_ARCTURUS: 2507 chip_name = "arcturus"; 2508 break; 2509 case CHIP_NAVI12: 2510 if (adev->discovery.bin) 2511 return 0; 2512 chip_name = "navi12"; 2513 break; 2514 case CHIP_CYAN_SKILLFISH: 2515 if (adev->discovery.bin) 2516 return 0; 2517 chip_name = "cyan_skillfish"; 2518 break; 2519 } 2520 2521 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2522 AMDGPU_UCODE_OPTIONAL, 2523 "amdgpu/%s_gpu_info.bin", chip_name); 2524 if (err) { 2525 dev_err(adev->dev, 2526 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2527 chip_name); 2528 goto out; 2529 } 2530 2531 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2532 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2533 2534 switch (hdr->version_major) { 2535 case 1: 2536 { 2537 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2538 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2539 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2540 2541 /* 2542 * Should be dropped when DAL no longer needs it. 2543 */ 2544 if (adev->asic_type == CHIP_NAVI12) 2545 goto parse_soc_bounding_box; 2546 2547 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2548 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2549 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2550 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2551 adev->gfx.config.max_texture_channel_caches = 2552 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2553 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2554 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2555 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2556 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2557 adev->gfx.config.double_offchip_lds_buf = 2558 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2559 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2560 adev->gfx.cu_info.max_waves_per_simd = 2561 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2562 adev->gfx.cu_info.max_scratch_slots_per_cu = 2563 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2564 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2565 if (hdr->version_minor >= 1) { 2566 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2567 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2568 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2569 adev->gfx.config.num_sc_per_sh = 2570 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2571 adev->gfx.config.num_packer_per_sc = 2572 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2573 } 2574 2575 parse_soc_bounding_box: 2576 /* 2577 * soc bounding box info is not integrated in disocovery table, 2578 * we 
always need to parse it from gpu info firmware if needed. 2579 */ 2580 if (hdr->version_minor == 2) { 2581 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2582 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2583 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2584 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2585 } 2586 break; 2587 } 2588 default: 2589 dev_err(adev->dev, 2590 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2591 err = -EINVAL; 2592 goto out; 2593 } 2594 out: 2595 return err; 2596 } 2597 2598 static void amdgpu_uid_init(struct amdgpu_device *adev) 2599 { 2600 /* Initialize the UID for the device */ 2601 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2602 if (!adev->uid_info) { 2603 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2604 return; 2605 } 2606 adev->uid_info->adev = adev; 2607 } 2608 2609 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2610 { 2611 /* Free the UID memory */ 2612 kfree(adev->uid_info); 2613 adev->uid_info = NULL; 2614 } 2615 2616 /** 2617 * amdgpu_device_ip_early_init - run early init for hardware IPs 2618 * 2619 * @adev: amdgpu_device pointer 2620 * 2621 * Early initialization pass for hardware IPs. The hardware IPs that make 2622 * up each asic are discovered each IP's early_init callback is run. This 2623 * is the first stage in initializing the asic. 2624 * Returns 0 on success, negative error code on failure. 2625 */ 2626 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2627 { 2628 struct amdgpu_ip_block *ip_block; 2629 struct pci_dev *parent; 2630 bool total, skip_bios; 2631 uint32_t bios_flags; 2632 int i, r; 2633 2634 amdgpu_device_enable_virtual_display(adev); 2635 2636 if (amdgpu_sriov_vf(adev)) { 2637 r = amdgpu_virt_request_full_gpu(adev, true); 2638 if (r) 2639 return r; 2640 2641 r = amdgpu_virt_init_critical_region(adev); 2642 if (r) 2643 return r; 2644 } 2645 2646 switch (adev->asic_type) { 2647 #ifdef CONFIG_DRM_AMDGPU_SI 2648 case CHIP_VERDE: 2649 case CHIP_TAHITI: 2650 case CHIP_PITCAIRN: 2651 case CHIP_OLAND: 2652 case CHIP_HAINAN: 2653 adev->family = AMDGPU_FAMILY_SI; 2654 r = si_set_ip_blocks(adev); 2655 if (r) 2656 return r; 2657 break; 2658 #endif 2659 #ifdef CONFIG_DRM_AMDGPU_CIK 2660 case CHIP_BONAIRE: 2661 case CHIP_HAWAII: 2662 case CHIP_KAVERI: 2663 case CHIP_KABINI: 2664 case CHIP_MULLINS: 2665 if (adev->flags & AMD_IS_APU) 2666 adev->family = AMDGPU_FAMILY_KV; 2667 else 2668 adev->family = AMDGPU_FAMILY_CI; 2669 2670 r = cik_set_ip_blocks(adev); 2671 if (r) 2672 return r; 2673 break; 2674 #endif 2675 case CHIP_TOPAZ: 2676 case CHIP_TONGA: 2677 case CHIP_FIJI: 2678 case CHIP_POLARIS10: 2679 case CHIP_POLARIS11: 2680 case CHIP_POLARIS12: 2681 case CHIP_VEGAM: 2682 case CHIP_CARRIZO: 2683 case CHIP_STONEY: 2684 if (adev->flags & AMD_IS_APU) 2685 adev->family = AMDGPU_FAMILY_CZ; 2686 else 2687 adev->family = AMDGPU_FAMILY_VI; 2688 2689 r = vi_set_ip_blocks(adev); 2690 if (r) 2691 return r; 2692 break; 2693 default: 2694 r = amdgpu_discovery_set_ip_blocks(adev); 2695 if (r) 2696 return r; 2697 break; 2698 } 2699 2700 /* Check for IP version 9.4.3 with A0 hardware */ 2701 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2702 !amdgpu_device_get_rev_id(adev)) { 2703 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2704 return -ENODEV; /* device unsupported - no device error */ 2705 } 2706 2707 if (amdgpu_has_atpx() && 2708 (amdgpu_is_atpx_hybrid() || 2709 amdgpu_has_atpx_dgpu_power_cntl()) && 
2710 ((adev->flags & AMD_IS_APU) == 0) && 2711 !dev_is_removable(&adev->pdev->dev)) 2712 adev->flags |= AMD_IS_PX; 2713 2714 if (!(adev->flags & AMD_IS_APU)) { 2715 parent = pcie_find_root_port(adev->pdev); 2716 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2717 } 2718 2719 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2720 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2721 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2722 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2723 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2724 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2725 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2726 2727 adev->virt.is_xgmi_node_migrate_enabled = false; 2728 if (amdgpu_sriov_vf(adev)) { 2729 adev->virt.is_xgmi_node_migrate_enabled = 2730 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2731 } 2732 2733 total = true; 2734 for (i = 0; i < adev->num_ip_blocks; i++) { 2735 ip_block = &adev->ip_blocks[i]; 2736 2737 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2738 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2739 adev->ip_blocks[i].version->funcs->name); 2740 adev->ip_blocks[i].status.valid = false; 2741 } else if (ip_block->version->funcs->early_init) { 2742 r = ip_block->version->funcs->early_init(ip_block); 2743 if (r == -ENOENT) { 2744 adev->ip_blocks[i].status.valid = false; 2745 } else if (r) { 2746 dev_err(adev->dev, 2747 "early_init of IP block <%s> failed %d\n", 2748 adev->ip_blocks[i].version->funcs->name, 2749 r); 2750 total = false; 2751 } else { 2752 adev->ip_blocks[i].status.valid = true; 2753 } 2754 } else { 2755 adev->ip_blocks[i].status.valid = true; 2756 } 2757 /* get the vbios after the asic_funcs are set up */ 2758 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2759 r = amdgpu_device_parse_gpu_info_fw(adev); 2760 if (r) 2761 return r; 2762 2763 bios_flags = amdgpu_device_get_vbios_flags(adev); 2764 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2765 /* Read BIOS */ 2766 if (!skip_bios) { 2767 bool optional = 2768 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2769 if (!amdgpu_get_bios(adev) && !optional) 2770 return -EINVAL; 2771 2772 if (optional && !adev->bios) 2773 dev_info( 2774 adev->dev, 2775 "VBIOS image optional, proceeding without VBIOS image"); 2776 2777 if (adev->bios) { 2778 r = amdgpu_atombios_init(adev); 2779 if (r) { 2780 dev_err(adev->dev, 2781 "amdgpu_atombios_init failed\n"); 2782 amdgpu_vf_error_put( 2783 adev, 2784 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2785 0, 0); 2786 return r; 2787 } 2788 } 2789 } 2790 2791 /*get pf2vf msg info at it's earliest time*/ 2792 if (amdgpu_sriov_vf(adev)) 2793 amdgpu_virt_init_data_exchange(adev); 2794 2795 } 2796 } 2797 if (!total) 2798 return -ENODEV; 2799 2800 if (adev->gmc.xgmi.supported) 2801 amdgpu_xgmi_early_init(adev); 2802 2803 if (amdgpu_is_multi_aid(adev)) 2804 amdgpu_uid_init(adev); 2805 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2806 if (ip_block->status.valid != false) 2807 amdgpu_amdkfd_device_probe(adev); 2808 2809 adev->cg_flags &= amdgpu_cg_mask; 2810 adev->pg_flags &= amdgpu_pg_mask; 2811 2812 return 0; 2813 } 2814 2815 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2816 { 2817 int i, r; 2818 2819 for (i = 0; i < adev->num_ip_blocks; i++) { 2820 if (!adev->ip_blocks[i].status.sw) 2821 continue; 2822 if (adev->ip_blocks[i].status.hw) 2823 continue; 2824 if (!amdgpu_ip_member_of_hwini( 2825 adev, adev->ip_blocks[i].version->type)) 2826 
continue; 2827 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2828 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2829 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2830 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2831 if (r) { 2832 dev_err(adev->dev, 2833 "hw_init of IP block <%s> failed %d\n", 2834 adev->ip_blocks[i].version->funcs->name, 2835 r); 2836 return r; 2837 } 2838 adev->ip_blocks[i].status.hw = true; 2839 } 2840 } 2841 2842 return 0; 2843 } 2844 2845 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2846 { 2847 int i, r; 2848 2849 for (i = 0; i < adev->num_ip_blocks; i++) { 2850 if (!adev->ip_blocks[i].status.sw) 2851 continue; 2852 if (adev->ip_blocks[i].status.hw) 2853 continue; 2854 if (!amdgpu_ip_member_of_hwini( 2855 adev, adev->ip_blocks[i].version->type)) 2856 continue; 2857 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2858 if (r) { 2859 dev_err(adev->dev, 2860 "hw_init of IP block <%s> failed %d\n", 2861 adev->ip_blocks[i].version->funcs->name, r); 2862 return r; 2863 } 2864 adev->ip_blocks[i].status.hw = true; 2865 } 2866 2867 return 0; 2868 } 2869 2870 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2871 { 2872 int r = 0; 2873 int i; 2874 uint32_t smu_version; 2875 2876 if (adev->asic_type >= CHIP_VEGA10) { 2877 for (i = 0; i < adev->num_ip_blocks; i++) { 2878 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2879 continue; 2880 2881 if (!amdgpu_ip_member_of_hwini(adev, 2882 AMD_IP_BLOCK_TYPE_PSP)) 2883 break; 2884 2885 if (!adev->ip_blocks[i].status.sw) 2886 continue; 2887 2888 /* no need to do the fw loading again if already done*/ 2889 if (adev->ip_blocks[i].status.hw == true) 2890 break; 2891 2892 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2893 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2894 if (r) 2895 return r; 2896 } else { 2897 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2898 if (r) { 2899 dev_err(adev->dev, 2900 "hw_init of IP block <%s> failed %d\n", 2901 adev->ip_blocks[i] 2902 .version->funcs->name, 2903 r); 2904 return r; 2905 } 2906 adev->ip_blocks[i].status.hw = true; 2907 } 2908 break; 2909 } 2910 } 2911 2912 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2913 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2914 2915 return r; 2916 } 2917 2918 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2919 { 2920 struct drm_sched_init_args args = { 2921 .ops = &amdgpu_sched_ops, 2922 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2923 .timeout_wq = adev->reset_domain->wq, 2924 .dev = adev->dev, 2925 }; 2926 long timeout; 2927 int r, i; 2928 2929 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2930 struct amdgpu_ring *ring = adev->rings[i]; 2931 2932 /* No need to setup the GPU scheduler for rings that don't need it */ 2933 if (!ring || ring->no_scheduler) 2934 continue; 2935 2936 switch (ring->funcs->type) { 2937 case AMDGPU_RING_TYPE_GFX: 2938 timeout = adev->gfx_timeout; 2939 break; 2940 case AMDGPU_RING_TYPE_COMPUTE: 2941 timeout = adev->compute_timeout; 2942 break; 2943 case AMDGPU_RING_TYPE_SDMA: 2944 timeout = adev->sdma_timeout; 2945 break; 2946 default: 2947 timeout = adev->video_timeout; 2948 break; 2949 } 2950 2951 args.timeout = timeout; 2952 args.credit_limit = ring->num_hw_submission; 2953 args.score = ring->sched_score; 2954 args.name = ring->name; 2955 2956 r = drm_sched_init(&ring->sched, &args); 2957 if (r) { 2958 
dev_err(adev->dev, 2959 "Failed to create scheduler on ring %s.\n", 2960 ring->name); 2961 return r; 2962 } 2963 r = amdgpu_uvd_entity_init(adev, ring); 2964 if (r) { 2965 dev_err(adev->dev, 2966 "Failed to create UVD scheduling entity on ring %s.\n", 2967 ring->name); 2968 return r; 2969 } 2970 r = amdgpu_vce_entity_init(adev, ring); 2971 if (r) { 2972 dev_err(adev->dev, 2973 "Failed to create VCE scheduling entity on ring %s.\n", 2974 ring->name); 2975 return r; 2976 } 2977 } 2978 2979 if (adev->xcp_mgr) 2980 amdgpu_xcp_update_partition_sched_list(adev); 2981 2982 return 0; 2983 } 2984 2985 2986 /** 2987 * amdgpu_device_ip_init - run init for hardware IPs 2988 * 2989 * @adev: amdgpu_device pointer 2990 * 2991 * Main initialization pass for hardware IPs. The list of all the hardware 2992 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2993 * are run. sw_init initializes the software state associated with each IP 2994 * and hw_init initializes the hardware associated with each IP. 2995 * Returns 0 on success, negative error code on failure. 2996 */ 2997 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2998 { 2999 bool init_badpage; 3000 int i, r; 3001 3002 r = amdgpu_ras_init(adev); 3003 if (r) 3004 return r; 3005 3006 for (i = 0; i < adev->num_ip_blocks; i++) { 3007 if (!adev->ip_blocks[i].status.valid) 3008 continue; 3009 if (adev->ip_blocks[i].version->funcs->sw_init) { 3010 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3011 if (r) { 3012 dev_err(adev->dev, 3013 "sw_init of IP block <%s> failed %d\n", 3014 adev->ip_blocks[i].version->funcs->name, 3015 r); 3016 goto init_failed; 3017 } 3018 } 3019 adev->ip_blocks[i].status.sw = true; 3020 3021 if (!amdgpu_ip_member_of_hwini( 3022 adev, adev->ip_blocks[i].version->type)) 3023 continue; 3024 3025 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3026 /* need to do common hw init early so everything is set up for gmc */ 3027 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3028 if (r) { 3029 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3030 r); 3031 goto init_failed; 3032 } 3033 adev->ip_blocks[i].status.hw = true; 3034 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3035 /* need to do gmc hw init early so we can allocate gpu mem */ 3036 /* Try to reserve bad pages early */ 3037 if (amdgpu_sriov_vf(adev)) 3038 amdgpu_virt_exchange_data(adev); 3039 3040 r = amdgpu_device_mem_scratch_init(adev); 3041 if (r) { 3042 dev_err(adev->dev, 3043 "amdgpu_mem_scratch_init failed %d\n", 3044 r); 3045 goto init_failed; 3046 } 3047 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3048 if (r) { 3049 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3050 r); 3051 goto init_failed; 3052 } 3053 r = amdgpu_device_wb_init(adev); 3054 if (r) { 3055 dev_err(adev->dev, 3056 "amdgpu_device_wb_init failed %d\n", r); 3057 goto init_failed; 3058 } 3059 adev->ip_blocks[i].status.hw = true; 3060 3061 /* right after GMC hw init, we create CSA */ 3062 if (adev->gfx.mcbp) { 3063 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3064 AMDGPU_GEM_DOMAIN_VRAM | 3065 AMDGPU_GEM_DOMAIN_GTT, 3066 AMDGPU_CSA_SIZE); 3067 if (r) { 3068 dev_err(adev->dev, 3069 "allocate CSA failed %d\n", r); 3070 goto init_failed; 3071 } 3072 } 3073 3074 r = amdgpu_seq64_init(adev); 3075 if (r) { 3076 dev_err(adev->dev, "allocate seq64 failed %d\n", 3077 r); 3078 goto init_failed; 3079 } 3080 } 3081 } 3082 3083 if (amdgpu_sriov_vf(adev)) 3084 
amdgpu_virt_init_data_exchange(adev); 3085 3086 r = amdgpu_ib_pool_init(adev); 3087 if (r) { 3088 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3089 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3090 goto init_failed; 3091 } 3092 3093 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3094 if (r) 3095 goto init_failed; 3096 3097 r = amdgpu_device_ip_hw_init_phase1(adev); 3098 if (r) 3099 goto init_failed; 3100 3101 r = amdgpu_device_fw_loading(adev); 3102 if (r) 3103 goto init_failed; 3104 3105 r = amdgpu_device_ip_hw_init_phase2(adev); 3106 if (r) 3107 goto init_failed; 3108 3109 /* 3110 * retired pages will be loaded from eeprom and reserved here, 3111 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3112 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3113 * for I2C communication which only true at this point. 3114 * 3115 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3116 * failure from bad gpu situation and stop amdgpu init process 3117 * accordingly. For other failed cases, it will still release all 3118 * the resource and print error message, rather than returning one 3119 * negative value to upper level. 3120 * 3121 * Note: theoretically, this should be called before all vram allocations 3122 * to protect retired page from abusing 3123 */ 3124 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3125 r = amdgpu_ras_recovery_init(adev, init_badpage); 3126 if (r) 3127 goto init_failed; 3128 3129 /** 3130 * In case of XGMI grab extra reference for reset domain for this device 3131 */ 3132 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3133 if (amdgpu_xgmi_add_device(adev) == 0) { 3134 if (!amdgpu_sriov_vf(adev)) { 3135 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3136 3137 if (WARN_ON(!hive)) { 3138 r = -ENOENT; 3139 goto init_failed; 3140 } 3141 3142 if (!hive->reset_domain || 3143 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3144 r = -ENOENT; 3145 amdgpu_put_xgmi_hive(hive); 3146 goto init_failed; 3147 } 3148 3149 /* Drop the early temporary reset domain we created for device */ 3150 amdgpu_reset_put_reset_domain(adev->reset_domain); 3151 adev->reset_domain = hive->reset_domain; 3152 amdgpu_put_xgmi_hive(hive); 3153 } 3154 } 3155 } 3156 3157 r = amdgpu_device_init_schedulers(adev); 3158 if (r) 3159 goto init_failed; 3160 3161 if (adev->mman.buffer_funcs_ring && 3162 adev->mman.buffer_funcs_ring->sched.ready) 3163 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3164 3165 /* Don't init kfd if whole hive need to be reset during init */ 3166 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3167 amdgpu_amdkfd_device_init(adev); 3168 } 3169 3170 amdgpu_fru_get_product_info(adev); 3171 3172 r = amdgpu_cper_init(adev); 3173 3174 init_failed: 3175 3176 return r; 3177 } 3178 3179 /** 3180 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3181 * 3182 * @adev: amdgpu_device pointer 3183 * 3184 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3185 * this function before a GPU reset. If the value is retained after a 3186 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 
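 * In the current implementation the snapshot actually goes the other way:
 * AMDGPU_RESET_MAGIC_NUM bytes are copied from the CPU mapping of the GART
 * table into adev->reset_magic, and amdgpu_device_check_vram_lost()
 * compares the two after the reset.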
3187 */ 3188 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3189 { 3190 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3191 } 3192 3193 /** 3194 * amdgpu_device_check_vram_lost - check if vram is valid 3195 * 3196 * @adev: amdgpu_device pointer 3197 * 3198 * Checks the reset magic value written to the gart pointer in VRAM. 3199 * The driver calls this after a GPU reset to see if the contents of 3200 * VRAM are lost or not. 3201 * Returns true if vram is lost, false if not. 3202 */ 3203 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3204 { 3205 if (memcmp(adev->gart.ptr, adev->reset_magic, 3206 AMDGPU_RESET_MAGIC_NUM)) 3207 return true; 3208 3209 if (!amdgpu_in_reset(adev)) 3210 return false; 3211 3212 /* 3213 * For all ASICs with baco/mode1 reset, the VRAM is 3214 * always assumed to be lost. 3215 */ 3216 switch (amdgpu_asic_reset_method(adev)) { 3217 case AMD_RESET_METHOD_LEGACY: 3218 case AMD_RESET_METHOD_LINK: 3219 case AMD_RESET_METHOD_BACO: 3220 case AMD_RESET_METHOD_MODE1: 3221 return true; 3222 default: 3223 return false; 3224 } 3225 } 3226 3227 /** 3228 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3229 * 3230 * @adev: amdgpu_device pointer 3231 * @state: clockgating state (gate or ungate) 3232 * 3233 * The list of all the hardware IPs that make up the asic is walked and the 3234 * set_clockgating_state callbacks are run. 3235 * During late init this pass enables clockgating for hardware IPs; 3236 * during fini or suspend it disables clockgating for hardware IPs. 3237 * Returns 0 on success, negative error code on failure. 3238 */ 3239 3240 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3241 enum amd_clockgating_state state) 3242 { 3243 int i, j, r; 3244 3245 if (amdgpu_emu_mode == 1) 3246 return 0; 3247 3248 for (j = 0; j < adev->num_ip_blocks; j++) { 3249 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3250 if (!adev->ip_blocks[i].status.late_initialized) 3251 continue; 3252 /* skip CG for GFX, SDMA on S0ix */ 3253 if (adev->in_s0ix && 3254 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3255 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3256 continue; 3257 /* skip CG for VCE/UVD, it's handled specially */ 3258 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3259 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3260 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3261 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3262 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3263 /* enable clockgating to save power */ 3264 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3265 state); 3266 if (r) { 3267 dev_err(adev->dev, 3268 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3269 adev->ip_blocks[i].version->funcs->name, 3270 r); 3271 return r; 3272 } 3273 } 3274 } 3275 3276 return 0; 3277 } 3278 3279 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3280 enum amd_powergating_state state) 3281 { 3282 int i, j, r; 3283 3284 if (amdgpu_emu_mode == 1) 3285 return 0; 3286 3287 for (j = 0; j < adev->num_ip_blocks; j++) { 3288 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 3289 if (!adev->ip_blocks[i].status.late_initialized) 3290 continue; 3291 /* skip PG for GFX, SDMA on S0ix */ 3292 if (adev->in_s0ix && 3293 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3294 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3295 continue; 3296 /* skip PG for VCE/UVD, it's handled specially */ 3297 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3298 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3299 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3300 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3301 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3302 /* enable powergating to save power */ 3303 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3304 state); 3305 if (r) { 3306 dev_err(adev->dev, 3307 "set_powergating_state(gate) of IP block <%s> failed %d\n", 3308 adev->ip_blocks[i].version->funcs->name, 3309 r); 3310 return r; 3311 } 3312 } 3313 } 3314 return 0; 3315 } 3316 3317 static int amdgpu_device_enable_mgpu_fan_boost(void) 3318 { 3319 struct amdgpu_gpu_instance *gpu_ins; 3320 struct amdgpu_device *adev; 3321 int i, ret = 0; 3322 3323 mutex_lock(&mgpu_info.mutex); 3324 3325 /* 3326 * MGPU fan boost feature should be enabled 3327 * only when there are two or more dGPUs in 3328 * the system 3329 */ 3330 if (mgpu_info.num_dgpu < 2) 3331 goto out; 3332 3333 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3334 gpu_ins = &(mgpu_info.gpu_ins[i]); 3335 adev = gpu_ins->adev; 3336 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) && 3337 !gpu_ins->mgpu_fan_enabled) { 3338 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3339 if (ret) 3340 break; 3341 3342 gpu_ins->mgpu_fan_enabled = 1; 3343 } 3344 } 3345 3346 out: 3347 mutex_unlock(&mgpu_info.mutex); 3348 3349 return ret; 3350 } 3351 3352 /** 3353 * amdgpu_device_ip_late_init - run late init for hardware IPs 3354 * 3355 * @adev: amdgpu_device pointer 3356 * 3357 * Late initialization pass for hardware IPs. The list of all the hardware 3358 * IPs that make up the asic is walked and the late_init callbacks are run. 3359 * late_init covers any special initialization that an IP requires 3360 * after all of them have been initialized or something that needs to happen 3361 * late in the init process. 3362 * Returns 0 on success, negative error code on failure.
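 * Besides the per-IP late_init callbacks, this also enables clockgating
 * and powergating, records the reset magic, enables mGPU fan boost where
 * applicable and lowers the XGMI p-state once the whole hive is up.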
3363 */ 3364 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3365 { 3366 struct amdgpu_gpu_instance *gpu_instance; 3367 int i = 0, r; 3368 3369 for (i = 0; i < adev->num_ip_blocks; i++) { 3370 if (!adev->ip_blocks[i].status.hw) 3371 continue; 3372 if (adev->ip_blocks[i].version->funcs->late_init) { 3373 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3374 if (r) { 3375 dev_err(adev->dev, 3376 "late_init of IP block <%s> failed %d\n", 3377 adev->ip_blocks[i].version->funcs->name, 3378 r); 3379 return r; 3380 } 3381 } 3382 adev->ip_blocks[i].status.late_initialized = true; 3383 } 3384 3385 r = amdgpu_ras_late_init(adev); 3386 if (r) { 3387 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3388 return r; 3389 } 3390 3391 if (!amdgpu_reset_in_recovery(adev)) 3392 amdgpu_ras_set_error_query_ready(adev, true); 3393 3394 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3395 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3396 3397 amdgpu_device_fill_reset_magic(adev); 3398 3399 r = amdgpu_device_enable_mgpu_fan_boost(); 3400 if (r) 3401 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3402 3403 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3404 if (amdgpu_passthrough(adev) && 3405 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3406 adev->asic_type == CHIP_ALDEBARAN)) 3407 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3408 3409 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3410 mutex_lock(&mgpu_info.mutex); 3411 3412 /* 3413 * Reset the device's p-state to low, as it was booted with high. 3414 * 3415 * This should be done only after all devices from the same 3416 * hive have been initialized. 3417 * 3418 * However, the number of devices in a hive is not known in 3419 * advance; it is counted one by one as devices initialize. 3420 * 3421 * So we wait until all XGMI interlinked devices have initialized. 3422 * This may add some delay, as those devices may come from 3423 * different hives. But that should be OK.
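 * The check below compares the number of initialized dGPUs against the
 * number of physical nodes in this device's hive and only lowers the
 * p-state once they match.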
3424 */ 3425 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3426 for (i = 0; i < mgpu_info.num_gpu; i++) { 3427 gpu_instance = &(mgpu_info.gpu_ins[i]); 3428 if (gpu_instance->adev->flags & AMD_IS_APU) 3429 continue; 3430 3431 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3432 AMDGPU_XGMI_PSTATE_MIN); 3433 if (r) { 3434 dev_err(adev->dev, 3435 "pstate setting failed (%d).\n", 3436 r); 3437 break; 3438 } 3439 } 3440 } 3441 3442 mutex_unlock(&mgpu_info.mutex); 3443 } 3444 3445 return 0; 3446 } 3447 3448 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3449 { 3450 struct amdgpu_device *adev = ip_block->adev; 3451 int r; 3452 3453 if (!ip_block->version->funcs->hw_fini) { 3454 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3455 ip_block->version->funcs->name); 3456 } else { 3457 r = ip_block->version->funcs->hw_fini(ip_block); 3458 /* XXX handle errors */ 3459 if (r) { 3460 dev_dbg(adev->dev, 3461 "hw_fini of IP block <%s> failed %d\n", 3462 ip_block->version->funcs->name, r); 3463 } 3464 } 3465 3466 ip_block->status.hw = false; 3467 } 3468 3469 /** 3470 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3471 * 3472 * @adev: amdgpu_device pointer 3473 * 3474 * For ASICs need to disable SMC first 3475 */ 3476 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3477 { 3478 int i; 3479 3480 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3481 return; 3482 3483 for (i = 0; i < adev->num_ip_blocks; i++) { 3484 if (!adev->ip_blocks[i].status.hw) 3485 continue; 3486 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3487 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3488 break; 3489 } 3490 } 3491 } 3492 3493 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3494 { 3495 int i, r; 3496 3497 for (i = 0; i < adev->num_ip_blocks; i++) { 3498 if (!adev->ip_blocks[i].version->funcs->early_fini) 3499 continue; 3500 3501 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3502 if (r) { 3503 dev_dbg(adev->dev, 3504 "early_fini of IP block <%s> failed %d\n", 3505 adev->ip_blocks[i].version->funcs->name, r); 3506 } 3507 } 3508 3509 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3510 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3511 3512 amdgpu_amdkfd_suspend(adev, true); 3513 amdgpu_userq_suspend(adev); 3514 3515 /* Workaround for ASICs need to disable SMC first */ 3516 amdgpu_device_smu_fini_early(adev); 3517 3518 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3519 if (!adev->ip_blocks[i].status.hw) 3520 continue; 3521 3522 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3523 } 3524 3525 if (amdgpu_sriov_vf(adev)) { 3526 if (amdgpu_virt_release_full_gpu(adev, false)) 3527 dev_err(adev->dev, 3528 "failed to release exclusive mode on fini\n"); 3529 } 3530 3531 /* 3532 * Driver reload on the APU can fail due to firmware validation because 3533 * the PSP is always running, as it is shared across the whole SoC. 3534 * This same issue does not occur on dGPU because it has a mechanism 3535 * that checks whether the PSP is running. A solution for those issues 3536 * in the APU is to trigger a GPU reset, but this should be done during 3537 * the unload phase to avoid adding boot latency and screen flicker. 
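 * (The reset below is accordingly limited to APUs that are not app APUs.)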
3538 */ 3539 if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) { 3540 r = amdgpu_asic_reset(adev); 3541 if (r) 3542 dev_err(adev->dev, "asic reset on %s failed\n", __func__); 3543 } 3544 3545 return 0; 3546 } 3547 3548 /** 3549 * amdgpu_device_ip_fini - run fini for hardware IPs 3550 * 3551 * @adev: amdgpu_device pointer 3552 * 3553 * Main teardown pass for hardware IPs. The list of all the hardware 3554 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3555 * are run. hw_fini tears down the hardware associated with each IP 3556 * and sw_fini tears down any software state associated with each IP. 3557 * Returns 0 on success, negative error code on failure. 3558 */ 3559 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3560 { 3561 int i, r; 3562 3563 amdgpu_cper_fini(adev); 3564 3565 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3566 amdgpu_virt_release_ras_err_handler_data(adev); 3567 3568 if (adev->gmc.xgmi.num_physical_nodes > 1) 3569 amdgpu_xgmi_remove_device(adev); 3570 3571 amdgpu_amdkfd_device_fini_sw(adev); 3572 3573 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3574 if (!adev->ip_blocks[i].status.sw) 3575 continue; 3576 3577 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3578 amdgpu_ucode_free_bo(adev); 3579 amdgpu_free_static_csa(&adev->virt.csa_obj); 3580 amdgpu_device_wb_fini(adev); 3581 amdgpu_device_mem_scratch_fini(adev); 3582 amdgpu_ib_pool_fini(adev); 3583 amdgpu_seq64_fini(adev); 3584 amdgpu_doorbell_fini(adev); 3585 } 3586 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3587 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3588 /* XXX handle errors */ 3589 if (r) { 3590 dev_dbg(adev->dev, 3591 "sw_fini of IP block <%s> failed %d\n", 3592 adev->ip_blocks[i].version->funcs->name, 3593 r); 3594 } 3595 } 3596 adev->ip_blocks[i].status.sw = false; 3597 adev->ip_blocks[i].status.valid = false; 3598 } 3599 3600 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3601 if (!adev->ip_blocks[i].status.late_initialized) 3602 continue; 3603 if (adev->ip_blocks[i].version->funcs->late_fini) 3604 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3605 adev->ip_blocks[i].status.late_initialized = false; 3606 } 3607 3608 amdgpu_ras_fini(adev); 3609 amdgpu_uid_fini(adev); 3610 3611 return 0; 3612 } 3613 3614 /** 3615 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3616 * 3617 * @work: work_struct. 3618 */ 3619 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3620 { 3621 struct amdgpu_device *adev = 3622 container_of(work, struct amdgpu_device, delayed_init_work.work); 3623 int r; 3624 3625 r = amdgpu_ib_ring_tests(adev); 3626 if (r) 3627 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3628 } 3629 3630 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3631 { 3632 struct amdgpu_device *adev = 3633 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3634 3635 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3636 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3637 3638 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3639 adev->gfx.gfx_off_state = true; 3640 } 3641 3642 /** 3643 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3644 * 3645 * @adev: amdgpu_device pointer 3646 * 3647 * Main suspend function for hardware IPs. 
The list of all the hardware 3648 * IPs that make up the asic is walked, clockgating is disabled and the 3649 * suspend callbacks are run. suspend puts the hardware and software state 3650 * in each IP into a state suitable for suspend. 3651 * Returns 0 on success, negative error code on failure. 3652 */ 3653 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3654 { 3655 int i, r, rec; 3656 3657 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3658 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3659 3660 /* 3661 * Per PMFW team's suggestion, driver needs to handle gfxoff 3662 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3663 * scenario. Add the missing df cstate disablement here. 3664 */ 3665 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3666 dev_warn(adev->dev, "Failed to disallow df cstate"); 3667 3668 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3669 if (!adev->ip_blocks[i].status.valid) 3670 continue; 3671 3672 /* displays are handled separately */ 3673 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3674 continue; 3675 3676 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3677 if (r) 3678 goto unwind; 3679 } 3680 3681 return 0; 3682 unwind: 3683 rec = amdgpu_device_ip_resume_phase3(adev); 3684 if (rec) 3685 dev_err(adev->dev, 3686 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3687 rec); 3688 3689 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3690 3691 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3692 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3693 3694 return r; 3695 } 3696 3697 /** 3698 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3699 * 3700 * @adev: amdgpu_device pointer 3701 * 3702 * Main suspend function for hardware IPs. The list of all the hardware 3703 * IPs that make up the asic is walked, clockgating is disabled and the 3704 * suspend callbacks are run. suspend puts the hardware and software state 3705 * in each IP into a state suitable for suspend. 3706 * Returns 0 on success, negative error code on failure. 3707 */ 3708 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3709 { 3710 int i, r, rec; 3711 3712 if (adev->in_s0ix) 3713 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3714 3715 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3716 if (!adev->ip_blocks[i].status.valid) 3717 continue; 3718 /* displays are handled in phase1 */ 3719 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3720 continue; 3721 /* PSP lost connection when err_event_athub occurs */ 3722 if (amdgpu_ras_intr_triggered() && 3723 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3724 adev->ip_blocks[i].status.hw = false; 3725 continue; 3726 } 3727 3728 /* skip unnecessary suspend if we do not initialize them yet */ 3729 if (!amdgpu_ip_member_of_hwini( 3730 adev, adev->ip_blocks[i].version->type)) 3731 continue; 3732 3733 /* Since we skip suspend for S0i3, we need to cancel the delayed 3734 * idle work here as the suspend callback never gets called. 3735 */ 3736 if (adev->in_s0ix && 3737 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3738 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3739 cancel_delayed_work_sync(&adev->gfx.idle_work); 3740 /* skip suspend of gfx/mes and psp for S0ix 3741 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3742 * like at runtime. PSP is also part of the always on hardware 3743 * so no need to suspend it. 
3744 */ 3745 if (adev->in_s0ix && 3746 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3747 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3748 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3749 continue; 3750 3751 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3752 if (adev->in_s0ix && 3753 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3754 IP_VERSION(5, 0, 0)) && 3755 (adev->ip_blocks[i].version->type == 3756 AMD_IP_BLOCK_TYPE_SDMA)) 3757 continue; 3758 3759 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3760 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3761 * from this location and RLC Autoload automatically also gets loaded 3762 * from here based on PMFW -> PSP message during re-init sequence. 3763 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3764 * the TMR and reload FWs again for IMU enabled APU ASICs. 3765 */ 3766 if (amdgpu_in_reset(adev) && 3767 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3768 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3769 continue; 3770 3771 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3772 if (r) 3773 goto unwind; 3774 3775 /* handle putting the SMC in the appropriate state */ 3776 if (!amdgpu_sriov_vf(adev)) { 3777 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3778 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3779 if (r) { 3780 dev_err(adev->dev, 3781 "SMC failed to set mp1 state %d, %d\n", 3782 adev->mp1_state, r); 3783 goto unwind; 3784 } 3785 } 3786 } 3787 } 3788 3789 return 0; 3790 unwind: 3791 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3792 rec = amdgpu_device_ip_resume_phase1(adev); 3793 if (rec) { 3794 dev_err(adev->dev, 3795 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3796 rec); 3797 return r; 3798 } 3799 3800 rec = amdgpu_device_fw_loading(adev); 3801 if (rec) { 3802 dev_err(adev->dev, 3803 "amdgpu_device_fw_loading failed during unwind: %d\n", 3804 rec); 3805 return r; 3806 } 3807 3808 rec = amdgpu_device_ip_resume_phase2(adev); 3809 if (rec) { 3810 dev_err(adev->dev, 3811 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3812 rec); 3813 return r; 3814 } 3815 3816 return r; 3817 } 3818 3819 /** 3820 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3821 * 3822 * @adev: amdgpu_device pointer 3823 * 3824 * Main suspend function for hardware IPs. The list of all the hardware 3825 * IPs that make up the asic is walked, clockgating is disabled and the 3826 * suspend callbacks are run. suspend puts the hardware and software state 3827 * in each IP into a state suitable for suspend. 3828 * Returns 0 on success, negative error code on failure. 
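 * Suspend runs in two steps: phase1 handles the display IPs and phase2
 * handles everything else. For SR-IOV VFs, data exchange is stopped and
 * full GPU access is requested before the sequence and released afterwards;
 * the TTM buffer functions are disabled up front.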
3829 */ 3830 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3831 { 3832 int r; 3833 3834 if (amdgpu_sriov_vf(adev)) { 3835 amdgpu_virt_fini_data_exchange(adev); 3836 amdgpu_virt_request_full_gpu(adev, false); 3837 } 3838 3839 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3840 3841 r = amdgpu_device_ip_suspend_phase1(adev); 3842 if (r) 3843 return r; 3844 r = amdgpu_device_ip_suspend_phase2(adev); 3845 3846 if (amdgpu_sriov_vf(adev)) 3847 amdgpu_virt_release_full_gpu(adev, false); 3848 3849 return r; 3850 } 3851 3852 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3853 { 3854 int i, r; 3855 3856 static enum amd_ip_block_type ip_order[] = { 3857 AMD_IP_BLOCK_TYPE_COMMON, 3858 AMD_IP_BLOCK_TYPE_GMC, 3859 AMD_IP_BLOCK_TYPE_PSP, 3860 AMD_IP_BLOCK_TYPE_IH, 3861 }; 3862 3863 for (i = 0; i < adev->num_ip_blocks; i++) { 3864 int j; 3865 struct amdgpu_ip_block *block; 3866 3867 block = &adev->ip_blocks[i]; 3868 block->status.hw = false; 3869 3870 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3871 3872 if (block->version->type != ip_order[j] || 3873 !block->status.valid) 3874 continue; 3875 3876 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3877 if (r) { 3878 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3879 block->version->funcs->name); 3880 return r; 3881 } 3882 block->status.hw = true; 3883 } 3884 } 3885 3886 return 0; 3887 } 3888 3889 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3890 { 3891 struct amdgpu_ip_block *block; 3892 int i, r = 0; 3893 3894 static enum amd_ip_block_type ip_order[] = { 3895 AMD_IP_BLOCK_TYPE_SMC, 3896 AMD_IP_BLOCK_TYPE_DCE, 3897 AMD_IP_BLOCK_TYPE_GFX, 3898 AMD_IP_BLOCK_TYPE_SDMA, 3899 AMD_IP_BLOCK_TYPE_MES, 3900 AMD_IP_BLOCK_TYPE_UVD, 3901 AMD_IP_BLOCK_TYPE_VCE, 3902 AMD_IP_BLOCK_TYPE_VCN, 3903 AMD_IP_BLOCK_TYPE_JPEG 3904 }; 3905 3906 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3907 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3908 3909 if (!block) 3910 continue; 3911 3912 if (block->status.valid && !block->status.hw) { 3913 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3914 r = amdgpu_ip_block_resume(block); 3915 } else { 3916 r = block->version->funcs->hw_init(block); 3917 } 3918 3919 if (r) { 3920 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3921 block->version->funcs->name); 3922 break; 3923 } 3924 block->status.hw = true; 3925 } 3926 } 3927 3928 return r; 3929 } 3930 3931 /** 3932 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3933 * 3934 * @adev: amdgpu_device pointer 3935 * 3936 * First resume function for hardware IPs. The list of all the hardware 3937 * IPs that make up the asic is walked and the resume callbacks are run for 3938 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3939 * after a suspend and updates the software state as necessary. This 3940 * function is also used for restoring the GPU after a GPU reset. 3941 * Returns 0 on success, negative error code on failure. 
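 * Under SR-IOV the PSP block is also resumed in this phase (see the check
 * below).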
3942 */ 3943 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3944 { 3945 int i, r; 3946 3947 for (i = 0; i < adev->num_ip_blocks; i++) { 3948 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3949 continue; 3950 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3951 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3952 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3953 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3954 3955 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3956 if (r) 3957 return r; 3958 } 3959 } 3960 3961 return 0; 3962 } 3963 3964 /** 3965 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3966 * 3967 * @adev: amdgpu_device pointer 3968 * 3969 * Second resume function for hardware IPs. The list of all the hardware 3970 * IPs that make up the asic is walked and the resume callbacks are run for 3971 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3972 * functional state after a suspend and updates the software state as 3973 * necessary. This function is also used for restoring the GPU after a GPU 3974 * reset. 3975 * Returns 0 on success, negative error code on failure. 3976 */ 3977 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3978 { 3979 int i, r; 3980 3981 for (i = 0; i < adev->num_ip_blocks; i++) { 3982 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3983 continue; 3984 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3985 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3986 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3987 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3988 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3989 continue; 3990 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3991 if (r) 3992 return r; 3993 } 3994 3995 return 0; 3996 } 3997 3998 /** 3999 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4000 * 4001 * @adev: amdgpu_device pointer 4002 * 4003 * Third resume function for hardware IPs. The list of all the hardware 4004 * IPs that make up the asic is walked and the resume callbacks are run for 4005 * the DCE (display) blocks. resume puts the hardware into a functional state after a suspend 4006 * and updates the software state as necessary. This function is also used 4007 * for restoring the GPU after a GPU reset. 4008 * 4009 * Returns 0 on success, negative error code on failure. 4010 */ 4011 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4012 { 4013 int i, r; 4014 4015 for (i = 0; i < adev->num_ip_blocks; i++) { 4016 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4017 continue; 4018 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4019 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4020 if (r) 4021 return r; 4022 } 4023 } 4024 4025 return 0; 4026 } 4027 4028 /** 4029 * amdgpu_device_ip_resume - run resume for hardware IPs 4030 * 4031 * @adev: amdgpu_device pointer 4032 * 4033 * Main resume function for hardware IPs. The resume sequence is 4034 * split into multiple phases because the phases are also used in 4035 * recovering from a GPU reset, where some additional steps need to 4036 * be taken between them. In this case (S3/S4) the phases are 4037 * run sequentially. 4038 * Returns 0 on success, negative error code on failure.
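 * The full sequence, as run below, is: phase1 (COMMON/GMC/IH, plus PSP
 * under SR-IOV), firmware loading, phase2, fence driver HW init, then
 * phase3 (DCE).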
4039 */ 4040 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 4041 { 4042 int r; 4043 4044 r = amdgpu_device_ip_resume_phase1(adev); 4045 if (r) 4046 return r; 4047 4048 r = amdgpu_device_fw_loading(adev); 4049 if (r) 4050 return r; 4051 4052 r = amdgpu_device_ip_resume_phase2(adev); 4053 4054 if (adev->mman.buffer_funcs_ring->sched.ready) 4055 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4056 4057 if (r) 4058 return r; 4059 4060 amdgpu_fence_driver_hw_init(adev); 4061 4062 r = amdgpu_device_ip_resume_phase3(adev); 4063 4064 return r; 4065 } 4066 4067 /** 4068 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4069 * 4070 * @adev: amdgpu_device pointer 4071 * 4072 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4073 */ 4074 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4075 { 4076 if (amdgpu_sriov_vf(adev)) { 4077 if (adev->is_atom_fw) { 4078 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4079 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4080 } else { 4081 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4082 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4083 } 4084 4085 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4086 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4087 } 4088 } 4089 4090 /** 4091 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4092 * 4093 * @pdev : pci device context 4094 * @asic_type: AMD asic type 4095 * 4096 * Check if there is DC (new modesetting infrastructre) support for an asic. 4097 * returns true if DC has support, false if not. 4098 */ 4099 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 4100 enum amd_asic_type asic_type) 4101 { 4102 switch (asic_type) { 4103 #ifdef CONFIG_DRM_AMDGPU_SI 4104 case CHIP_HAINAN: 4105 #endif 4106 case CHIP_TOPAZ: 4107 /* chips with no display hardware */ 4108 return false; 4109 #if defined(CONFIG_DRM_AMD_DC) 4110 case CHIP_TAHITI: 4111 case CHIP_PITCAIRN: 4112 case CHIP_VERDE: 4113 case CHIP_OLAND: 4114 return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI); 4115 case CHIP_KAVERI: 4116 case CHIP_KABINI: 4117 case CHIP_MULLINS: 4118 /* 4119 * We have systems in the wild with these ASICs that require 4120 * TRAVIS and NUTMEG support which is not supported with DC. 4121 * 4122 * Fallback to the non-DC driver here by default so as not to 4123 * cause regressions. 
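 * Only an explicit opt-in (amdgpu_dc > 0, checked below) enables DC on
 * these parts.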
4124 */ 4125 return amdgpu_dc > 0; 4126 default: 4127 return amdgpu_dc != 0; 4128 #else 4129 default: 4130 if (amdgpu_dc > 0) 4131 dev_info_once( 4132 &pdev->dev, 4133 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4134 return false; 4135 #endif 4136 } 4137 } 4138 4139 /** 4140 * amdgpu_device_has_dc_support - check if dc is supported 4141 * 4142 * @adev: amdgpu_device pointer 4143 * 4144 * Returns true for supported, false for not supported 4145 */ 4146 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4147 { 4148 if (adev->enable_virtual_display || 4149 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4150 return false; 4151 4152 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4153 } 4154 4155 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4156 { 4157 struct amdgpu_device *adev = 4158 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4159 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4160 4161 /* It's a bug to not have a hive within this function */ 4162 if (WARN_ON(!hive)) 4163 return; 4164 4165 /* 4166 * Use task barrier to synchronize all xgmi reset works across the 4167 * hive. task_barrier_enter and task_barrier_exit will block 4168 * until all the threads running the xgmi reset works reach 4169 * those points. task_barrier_full will do both blocks. 4170 */ 4171 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4172 4173 task_barrier_enter(&hive->tb); 4174 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4175 4176 if (adev->asic_reset_res) 4177 goto fail; 4178 4179 task_barrier_exit(&hive->tb); 4180 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4181 4182 if (adev->asic_reset_res) 4183 goto fail; 4184 4185 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4186 } else { 4187 4188 task_barrier_full(&hive->tb); 4189 adev->asic_reset_res = amdgpu_asic_reset(adev); 4190 } 4191 4192 fail: 4193 if (adev->asic_reset_res) 4194 dev_warn(adev->dev, 4195 "ASIC reset failed with error, %d for drm dev, %s", 4196 adev->asic_reset_res, adev_to_drm(adev)->unique); 4197 amdgpu_put_xgmi_hive(hive); 4198 } 4199 4200 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4201 { 4202 char *input = amdgpu_lockup_timeout; 4203 char *timeout_setting = NULL; 4204 int index = 0; 4205 long timeout; 4206 int ret = 0; 4207 4208 /* By default timeout for all queues is 2 sec */ 4209 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4210 adev->video_timeout = msecs_to_jiffies(2000); 4211 4212 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 4213 return 0; 4214 4215 while ((timeout_setting = strsep(&input, ",")) && 4216 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4217 ret = kstrtol(timeout_setting, 0, &timeout); 4218 if (ret) 4219 return ret; 4220 4221 if (timeout == 0) { 4222 index++; 4223 continue; 4224 } else if (timeout < 0) { 4225 timeout = MAX_SCHEDULE_TIMEOUT; 4226 dev_warn(adev->dev, "lockup timeout disabled"); 4227 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4228 } else { 4229 timeout = msecs_to_jiffies(timeout); 4230 } 4231 4232 switch (index++) { 4233 case 0: 4234 adev->gfx_timeout = timeout; 4235 break; 4236 case 1: 4237 adev->compute_timeout = timeout; 4238 break; 4239 case 2: 4240 adev->sdma_timeout = timeout; 4241 break; 4242 case 3: 4243 adev->video_timeout = timeout; 4244 break; 4245 default: 4246 break; 4247 } 4248 } 4249 4250 /* When only one value specified apply it 
to all queues. */ 4251 if (index == 1) 4252 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4253 adev->video_timeout = timeout; 4254 4255 return ret; 4256 } 4257 4258 /** 4259 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4260 * 4261 * @adev: amdgpu_device pointer 4262 * 4263 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4264 */ 4265 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4266 { 4267 struct iommu_domain *domain; 4268 4269 domain = iommu_get_domain_for_dev(adev->dev); 4270 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4271 adev->ram_is_direct_mapped = true; 4272 } 4273 4274 #if defined(CONFIG_HSA_AMD_P2P) 4275 /** 4276 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4277 * 4278 * @adev: amdgpu_device pointer 4279 * 4280 * return if IOMMU remapping bar address 4281 */ 4282 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4283 { 4284 struct iommu_domain *domain; 4285 4286 domain = iommu_get_domain_for_dev(adev->dev); 4287 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4288 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4289 return true; 4290 4291 return false; 4292 } 4293 #endif 4294 4295 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4296 { 4297 if (amdgpu_mcbp == 1) 4298 adev->gfx.mcbp = true; 4299 else if (amdgpu_mcbp == 0) 4300 adev->gfx.mcbp = false; 4301 4302 if (amdgpu_sriov_vf(adev)) 4303 adev->gfx.mcbp = true; 4304 4305 if (adev->gfx.mcbp) 4306 dev_info(adev->dev, "MCBP is enabled\n"); 4307 } 4308 4309 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev) 4310 { 4311 int r; 4312 4313 r = amdgpu_atombios_sysfs_init(adev); 4314 if (r) 4315 drm_err(&adev->ddev, 4316 "registering atombios sysfs failed (%d).\n", r); 4317 4318 r = amdgpu_pm_sysfs_init(adev); 4319 if (r) 4320 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4321 4322 r = amdgpu_ucode_sysfs_init(adev); 4323 if (r) { 4324 adev->ucode_sysfs_en = false; 4325 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4326 } else 4327 adev->ucode_sysfs_en = true; 4328 4329 r = amdgpu_device_attr_sysfs_init(adev); 4330 if (r) 4331 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4332 4333 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4334 if (r) 4335 dev_err(adev->dev, 4336 "Could not create amdgpu board attributes\n"); 4337 4338 amdgpu_fru_sysfs_init(adev); 4339 amdgpu_reg_state_sysfs_init(adev); 4340 amdgpu_xcp_sysfs_init(adev); 4341 amdgpu_uma_sysfs_init(adev); 4342 4343 return r; 4344 } 4345 4346 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev) 4347 { 4348 if (adev->pm.sysfs_initialized) 4349 amdgpu_pm_sysfs_fini(adev); 4350 if (adev->ucode_sysfs_en) 4351 amdgpu_ucode_sysfs_fini(adev); 4352 amdgpu_device_attr_sysfs_fini(adev); 4353 amdgpu_fru_sysfs_fini(adev); 4354 4355 amdgpu_reg_state_sysfs_fini(adev); 4356 amdgpu_xcp_sysfs_fini(adev); 4357 amdgpu_uma_sysfs_fini(adev); 4358 } 4359 4360 /** 4361 * amdgpu_device_init - initialize the driver 4362 * 4363 * @adev: amdgpu_device pointer 4364 * @flags: driver flags 4365 * 4366 * Initializes the driver info and hw (all asics). 4367 * Returns 0 for success or an error on failure. 4368 * Called at driver startup. 
4369 */ 4370 int amdgpu_device_init(struct amdgpu_device *adev, 4371 uint32_t flags) 4372 { 4373 struct pci_dev *pdev = adev->pdev; 4374 int r, i; 4375 bool px = false; 4376 u32 max_MBps; 4377 int tmp; 4378 4379 adev->shutdown = false; 4380 adev->flags = flags; 4381 4382 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4383 adev->asic_type = amdgpu_force_asic_type; 4384 else 4385 adev->asic_type = flags & AMD_ASIC_MASK; 4386 4387 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4388 if (amdgpu_emu_mode == 1) 4389 adev->usec_timeout *= 10; 4390 adev->gmc.gart_size = 512 * 1024 * 1024; 4391 adev->accel_working = false; 4392 adev->num_rings = 0; 4393 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4394 adev->mman.buffer_funcs = NULL; 4395 adev->mman.buffer_funcs_ring = NULL; 4396 adev->vm_manager.vm_pte_funcs = NULL; 4397 adev->vm_manager.vm_pte_num_scheds = 0; 4398 adev->gmc.gmc_funcs = NULL; 4399 adev->harvest_ip_mask = 0x0; 4400 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4401 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4402 4403 adev->smc_rreg = &amdgpu_invalid_rreg; 4404 adev->smc_wreg = &amdgpu_invalid_wreg; 4405 adev->pcie_rreg = &amdgpu_invalid_rreg; 4406 adev->pcie_wreg = &amdgpu_invalid_wreg; 4407 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4408 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4409 adev->pciep_rreg = &amdgpu_invalid_rreg; 4410 adev->pciep_wreg = &amdgpu_invalid_wreg; 4411 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4412 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4413 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4414 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4415 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4416 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4417 adev->didt_rreg = &amdgpu_invalid_rreg; 4418 adev->didt_wreg = &amdgpu_invalid_wreg; 4419 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4420 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4421 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4422 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4423 4424 dev_info( 4425 adev->dev, 4426 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4427 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4428 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4429 4430 /* mutex initialization are all done here so we 4431 * can recall function without having locking issues 4432 */ 4433 mutex_init(&adev->firmware.mutex); 4434 mutex_init(&adev->pm.mutex); 4435 mutex_init(&adev->gfx.gpu_clock_mutex); 4436 mutex_init(&adev->srbm_mutex); 4437 mutex_init(&adev->gfx.pipe_reserve_mutex); 4438 mutex_init(&adev->gfx.gfx_off_mutex); 4439 mutex_init(&adev->gfx.partition_mutex); 4440 mutex_init(&adev->grbm_idx_mutex); 4441 mutex_init(&adev->mn_lock); 4442 mutex_init(&adev->virt.vf_errors.lock); 4443 hash_init(adev->mn_hash); 4444 mutex_init(&adev->psp.mutex); 4445 mutex_init(&adev->notifier_lock); 4446 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4447 mutex_init(&adev->benchmark_mutex); 4448 mutex_init(&adev->gfx.reset_sem_mutex); 4449 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4450 mutex_init(&adev->enforce_isolation_mutex); 4451 for (i = 0; i < MAX_XCP; ++i) { 4452 adev->isolation[i].spearhead = dma_fence_get_stub(); 4453 amdgpu_sync_create(&adev->isolation[i].active); 4454 amdgpu_sync_create(&adev->isolation[i].prev); 4455 } 4456 mutex_init(&adev->gfx.userq_sch_mutex); 4457 
mutex_init(&adev->gfx.workload_profile_mutex); 4458 mutex_init(&adev->vcn.workload_profile_mutex); 4459 4460 amdgpu_device_init_apu_flags(adev); 4461 4462 r = amdgpu_device_check_arguments(adev); 4463 if (r) 4464 return r; 4465 4466 spin_lock_init(&adev->mmio_idx_lock); 4467 spin_lock_init(&adev->smc_idx_lock); 4468 spin_lock_init(&adev->pcie_idx_lock); 4469 spin_lock_init(&adev->uvd_ctx_idx_lock); 4470 spin_lock_init(&adev->didt_idx_lock); 4471 spin_lock_init(&adev->gc_cac_idx_lock); 4472 spin_lock_init(&adev->se_cac_idx_lock); 4473 spin_lock_init(&adev->audio_endpt_idx_lock); 4474 spin_lock_init(&adev->mm_stats.lock); 4475 spin_lock_init(&adev->virt.rlcg_reg_lock); 4476 spin_lock_init(&adev->wb.lock); 4477 4478 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4479 4480 INIT_LIST_HEAD(&adev->reset_list); 4481 4482 INIT_LIST_HEAD(&adev->ras_list); 4483 4484 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4485 4486 xa_init(&adev->userq_doorbell_xa); 4487 4488 INIT_DELAYED_WORK(&adev->delayed_init_work, 4489 amdgpu_device_delayed_init_work_handler); 4490 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4491 amdgpu_device_delay_enable_gfx_off); 4492 /* 4493 * Initialize the enforce_isolation work structures for each XCP 4494 * partition. This work handler is responsible for enforcing shader 4495 * isolation on AMD GPUs. It counts the number of emitted fences for 4496 * each GFX and compute ring. If there are any fences, it schedules 4497 * the `enforce_isolation_work` to be run after a delay. If there are 4498 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4499 * runqueue. 4500 */ 4501 for (i = 0; i < MAX_XCP; i++) { 4502 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4503 amdgpu_gfx_enforce_isolation_handler); 4504 adev->gfx.enforce_isolation[i].adev = adev; 4505 adev->gfx.enforce_isolation[i].xcp_id = i; 4506 } 4507 4508 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4509 INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); 4510 4511 adev->gfx.gfx_off_req_count = 1; 4512 adev->gfx.gfx_off_residency = 0; 4513 adev->gfx.gfx_off_entrycount = 0; 4514 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4515 4516 atomic_set(&adev->throttling_logging_enabled, 1); 4517 /* 4518 * If throttling continues, logging will be performed every minute 4519 * to avoid log flooding. "-1" is subtracted since the thermal 4520 * throttling interrupt comes every second. Thus, the total logging 4521 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4522 * for throttling interrupt) = 60 seconds.
4523 */ 4524 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4525 4526 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4527 4528 /* Registers mapping */ 4529 /* TODO: block userspace mapping of io register */ 4530 if (adev->asic_type >= CHIP_BONAIRE) { 4531 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4532 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4533 } else { 4534 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4535 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4536 } 4537 4538 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4539 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4540 4541 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4542 if (!adev->rmmio) 4543 return -ENOMEM; 4544 4545 dev_info(adev->dev, "register mmio base: 0x%08X\n", 4546 (uint32_t)adev->rmmio_base); 4547 dev_info(adev->dev, "register mmio size: %u\n", 4548 (unsigned int)adev->rmmio_size); 4549 4550 /* 4551 * Reset domain needs to be present early, before XGMI hive discovered 4552 * (if any) and initialized to use reset sem and in_gpu reset flag 4553 * early on during init and before calling to RREG32. 4554 */ 4555 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4556 if (!adev->reset_domain) 4557 return -ENOMEM; 4558 4559 /* detect hw virtualization here */ 4560 amdgpu_virt_init(adev); 4561 4562 amdgpu_device_get_pcie_info(adev); 4563 4564 r = amdgpu_device_get_job_timeout_settings(adev); 4565 if (r) { 4566 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4567 return r; 4568 } 4569 4570 amdgpu_device_set_mcbp(adev); 4571 4572 /* 4573 * By default, use default mode where all blocks are expected to be 4574 * initialized. At present a 'swinit' of blocks is required to be 4575 * completed before the need for a different level is detected. 4576 */ 4577 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4578 /* early init functions */ 4579 r = amdgpu_device_ip_early_init(adev); 4580 if (r) 4581 return r; 4582 4583 /* 4584 * No need to remove conflicting FBs for non-display class devices. 4585 * This prevents the sysfb from being freed accidentally. 4586 */ 4587 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4588 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4589 /* Get rid of things like offb */ 4590 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4591 if (r) 4592 return r; 4593 } 4594 4595 /* Enable TMZ based on IP_VERSION */ 4596 amdgpu_gmc_tmz_set(adev); 4597 4598 if (amdgpu_sriov_vf(adev) && 4599 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4600 /* VF MMIO access (except mailbox range) from CPU 4601 * will be blocked during sriov runtime 4602 */ 4603 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4604 4605 amdgpu_gmc_noretry_set(adev); 4606 /* Need to get xgmi info early to decide the reset behavior */ 4607 if (adev->gmc.xgmi.supported) { 4608 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4609 if (r) 4610 return r; 4611 } 4612 4613 /* enable PCIE atomic ops */ 4614 if (amdgpu_sriov_vf(adev)) { 4615 if (adev->virt.fw_reserve.p_pf2vf) 4616 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4617 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4618 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4619 /* APUs with gfx9 and newer don't rely on PCIe atomics; their 4620 * internal path natively supports atomics, so set have_atomics_support to true.
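 * Bare-metal APUs are handled in the branch below; discrete parts go
 * through pci_enable_atomic_ops_to_root().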
4621 */ 4622 } else if ((adev->flags & AMD_IS_APU) && 4623 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4624 IP_VERSION(9, 0, 0))) { 4625 adev->have_atomics_support = true; 4626 } else { 4627 adev->have_atomics_support = 4628 !pci_enable_atomic_ops_to_root(adev->pdev, 4629 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4630 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4631 } 4632 4633 if (!adev->have_atomics_support) 4634 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4635 4636 /* doorbell bar mapping and doorbell index init*/ 4637 amdgpu_doorbell_init(adev); 4638 4639 if (amdgpu_emu_mode == 1) { 4640 /* post the asic on emulation mode */ 4641 emu_soc_asic_init(adev); 4642 goto fence_driver_init; 4643 } 4644 4645 amdgpu_reset_init(adev); 4646 4647 /* detect if we are with an SRIOV vbios */ 4648 if (adev->bios) 4649 amdgpu_device_detect_sriov_bios(adev); 4650 4651 /* check if we need to reset the asic 4652 * E.g., driver was not cleanly unloaded previously, etc. 4653 */ 4654 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4655 if (adev->gmc.xgmi.num_physical_nodes) { 4656 dev_info(adev->dev, "Pending hive reset.\n"); 4657 amdgpu_set_init_level(adev, 4658 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4659 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4660 !amdgpu_device_has_display_hardware(adev)) { 4661 r = psp_gpu_reset(adev); 4662 } else { 4663 tmp = amdgpu_reset_method; 4664 /* It should do a default reset when loading or reloading the driver, 4665 * regardless of the module parameter reset_method. 4666 */ 4667 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4668 r = amdgpu_asic_reset(adev); 4669 amdgpu_reset_method = tmp; 4670 } 4671 4672 if (r) { 4673 dev_err(adev->dev, "asic reset on init failed\n"); 4674 goto failed; 4675 } 4676 } 4677 4678 /* Post card if necessary */ 4679 if (amdgpu_device_need_post(adev)) { 4680 if (!adev->bios) { 4681 dev_err(adev->dev, "no vBIOS found\n"); 4682 r = -EINVAL; 4683 goto failed; 4684 } 4685 dev_info(adev->dev, "GPU posting now...\n"); 4686 r = amdgpu_device_asic_init(adev); 4687 if (r) { 4688 dev_err(adev->dev, "gpu post error!\n"); 4689 goto failed; 4690 } 4691 } 4692 4693 if (adev->bios) { 4694 if (adev->is_atom_fw) { 4695 /* Initialize clocks */ 4696 r = amdgpu_atomfirmware_get_clock_info(adev); 4697 if (r) { 4698 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4699 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4700 goto failed; 4701 } 4702 } else { 4703 /* Initialize clocks */ 4704 r = amdgpu_atombios_get_clock_info(adev); 4705 if (r) { 4706 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4707 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4708 goto failed; 4709 } 4710 /* init i2c buses */ 4711 amdgpu_i2c_init(adev); 4712 } 4713 } 4714 4715 fence_driver_init: 4716 /* Fence driver */ 4717 r = amdgpu_fence_driver_sw_init(adev); 4718 if (r) { 4719 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4720 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4721 goto failed; 4722 } 4723 4724 /* init the mode config */ 4725 drm_mode_config_init(adev_to_drm(adev)); 4726 4727 r = amdgpu_device_ip_init(adev); 4728 if (r) { 4729 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4730 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4731 goto release_ras_con; 4732 } 4733 4734 amdgpu_fence_driver_hw_init(adev); 4735 4736 dev_info(adev->dev, 4737 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4738 
adev->gfx.config.max_shader_engines, 4739 adev->gfx.config.max_sh_per_se, 4740 adev->gfx.config.max_cu_per_sh, 4741 adev->gfx.cu_info.number); 4742 4743 adev->accel_working = true; 4744 4745 amdgpu_vm_check_compute_bug(adev); 4746 4747 /* Initialize the buffer migration limit. */ 4748 if (amdgpu_moverate >= 0) 4749 max_MBps = amdgpu_moverate; 4750 else 4751 max_MBps = 8; /* Allow 8 MB/s. */ 4752 /* Get a log2 for easy divisions. */ 4753 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4754 4755 /* 4756 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4757 * Otherwise the mgpu fan boost feature will be skipped because this 4758 * gpu instance would not yet be counted. 4759 */ 4760 amdgpu_register_gpu_instance(adev); 4761 4762 /* enable clockgating, etc. after ib tests, since some blocks require 4763 * explicit gating rather than handling it automatically. 4764 */ 4765 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4766 r = amdgpu_device_ip_late_init(adev); 4767 if (r) { 4768 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4769 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4770 goto release_ras_con; 4771 } 4772 /* must succeed. */ 4773 amdgpu_ras_resume(adev); 4774 queue_delayed_work(system_wq, &adev->delayed_init_work, 4775 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4776 } 4777 4778 if (amdgpu_sriov_vf(adev)) { 4779 amdgpu_virt_release_full_gpu(adev, true); 4780 flush_delayed_work(&adev->delayed_init_work); 4781 } 4782 4783 /* Don't init kfd if the whole hive needs to be reset during init */ 4784 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4785 kgd2kfd_init_zone_device(adev); 4786 kfd_update_svm_support_properties(adev); 4787 } 4788 4789 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4790 amdgpu_xgmi_reset_on_init(adev); 4791 4792 /* 4793 * Register these sysfs interfaces after `late_init`, since some of the 4794 * operations performed in `late_init` can affect how the sysfs 4795 * interfaces are created.
4796 */ 4797 r = amdgpu_device_sys_interface_init(adev); 4798 4799 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4800 r = amdgpu_pmu_init(adev); 4801 if (r) 4802 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4803 4804 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4805 if (amdgpu_device_cache_pci_state(adev->pdev)) 4806 pci_restore_state(pdev); 4807 4808 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4809 /* this will fail for cards that aren't VGA class devices, just 4810 * ignore it 4811 */ 4812 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4813 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4814 4815 px = amdgpu_device_supports_px(adev); 4816 4817 if (px || (!dev_is_removable(&adev->pdev->dev) && 4818 apple_gmux_detect(NULL, NULL))) 4819 vga_switcheroo_register_client(adev->pdev, 4820 &amdgpu_switcheroo_ops, px); 4821 4822 if (px) 4823 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4824 4825 amdgpu_device_check_iommu_direct_map(adev); 4826 4827 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4828 r = register_pm_notifier(&adev->pm_nb); 4829 if (r) 4830 goto failed; 4831 4832 return 0; 4833 4834 release_ras_con: 4835 if (amdgpu_sriov_vf(adev)) 4836 amdgpu_virt_release_full_gpu(adev, true); 4837 4838 /* failed in exclusive mode due to timeout */ 4839 if (amdgpu_sriov_vf(adev) && 4840 !amdgpu_sriov_runtime(adev) && 4841 amdgpu_virt_mmio_blocked(adev) && 4842 !amdgpu_virt_wait_reset(adev)) { 4843 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4844 /* Don't send request since VF is inactive. */ 4845 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4846 adev->virt.ops = NULL; 4847 r = -EAGAIN; 4848 } 4849 amdgpu_release_ras_context(adev); 4850 4851 failed: 4852 amdgpu_vf_error_trans_all(adev); 4853 4854 return r; 4855 } 4856 4857 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4858 { 4859 4860 /* Clear all CPU mappings pointing to this device */ 4861 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4862 4863 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4864 amdgpu_doorbell_fini(adev); 4865 4866 iounmap(adev->rmmio); 4867 adev->rmmio = NULL; 4868 if (adev->mman.aper_base_kaddr) 4869 iounmap(adev->mman.aper_base_kaddr); 4870 adev->mman.aper_base_kaddr = NULL; 4871 4872 /* Memory manager related */ 4873 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4874 arch_phys_wc_del(adev->gmc.vram_mtrr); 4875 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4876 } 4877 } 4878 4879 /** 4880 * amdgpu_device_fini_hw - tear down the driver 4881 * 4882 * @adev: amdgpu_device pointer 4883 * 4884 * Tear down the driver info (all asics). 4885 * Called at driver shutdown. 
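 * This handles the hardware-facing teardown; the remaining software state
 * is released afterwards by amdgpu_device_fini_sw().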
4886 */ 4887 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4888 { 4889 dev_info(adev->dev, "finishing device.\n"); 4890 flush_delayed_work(&adev->delayed_init_work); 4891 4892 if (adev->mman.initialized) 4893 drain_workqueue(adev->mman.bdev.wq); 4894 adev->shutdown = true; 4895 4896 unregister_pm_notifier(&adev->pm_nb); 4897 4898 /* make sure IB test finished before entering exclusive mode 4899 * to avoid preemption on IB test 4900 */ 4901 if (amdgpu_sriov_vf(adev)) { 4902 amdgpu_virt_request_full_gpu(adev, false); 4903 amdgpu_virt_fini_data_exchange(adev); 4904 } 4905 4906 /* disable all interrupts */ 4907 amdgpu_irq_disable_all(adev); 4908 if (adev->mode_info.mode_config_initialized) { 4909 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4910 drm_helper_force_disable_all(adev_to_drm(adev)); 4911 else 4912 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4913 } 4914 amdgpu_fence_driver_hw_fini(adev); 4915 4916 amdgpu_device_sys_interface_fini(adev); 4917 4918 /* disable ras feature must before hw fini */ 4919 amdgpu_ras_pre_fini(adev); 4920 4921 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4922 4923 /* 4924 * device went through surprise hotplug; we need to destroy topology 4925 * before ip_fini_early to prevent kfd locking refcount issues by calling 4926 * amdgpu_amdkfd_suspend() 4927 */ 4928 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4929 amdgpu_amdkfd_device_fini_sw(adev); 4930 4931 amdgpu_device_ip_fini_early(adev); 4932 4933 amdgpu_irq_fini_hw(adev); 4934 4935 if (adev->mman.initialized) 4936 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4937 4938 amdgpu_gart_dummy_page_fini(adev); 4939 4940 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4941 amdgpu_device_unmap_mmio(adev); 4942 4943 } 4944 4945 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4946 { 4947 int i, idx; 4948 bool px; 4949 4950 amdgpu_device_ip_fini(adev); 4951 amdgpu_fence_driver_sw_fini(adev); 4952 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4953 adev->accel_working = false; 4954 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4955 for (i = 0; i < MAX_XCP; ++i) { 4956 dma_fence_put(adev->isolation[i].spearhead); 4957 amdgpu_sync_free(&adev->isolation[i].active); 4958 amdgpu_sync_free(&adev->isolation[i].prev); 4959 } 4960 4961 amdgpu_reset_fini(adev); 4962 4963 /* free i2c buses */ 4964 amdgpu_i2c_fini(adev); 4965 4966 if (adev->bios) { 4967 if (amdgpu_emu_mode != 1) 4968 amdgpu_atombios_fini(adev); 4969 amdgpu_bios_release(adev); 4970 } 4971 4972 kfree(adev->fru_info); 4973 adev->fru_info = NULL; 4974 4975 kfree(adev->xcp_mgr); 4976 adev->xcp_mgr = NULL; 4977 4978 px = amdgpu_device_supports_px(adev); 4979 4980 if (px || (!dev_is_removable(&adev->pdev->dev) && 4981 apple_gmux_detect(NULL, NULL))) 4982 vga_switcheroo_unregister_client(adev->pdev); 4983 4984 if (px) 4985 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4986 4987 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4988 vga_client_unregister(adev->pdev); 4989 4990 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4991 4992 iounmap(adev->rmmio); 4993 adev->rmmio = NULL; 4994 drm_dev_exit(idx); 4995 } 4996 4997 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4998 amdgpu_pmu_fini(adev); 4999 if (adev->discovery.bin) 5000 amdgpu_discovery_fini(adev); 5001 5002 amdgpu_reset_put_reset_domain(adev->reset_domain); 5003 adev->reset_domain = NULL; 5004 5005 kfree(adev->pci_state); 5006 kfree(adev->pcie_reset_ctx.swds_pcistate); 5007 kfree(adev->pcie_reset_ctx.swus_pcistate); 5008 } 5009 5010 /** 5011 * 
amdgpu_device_evict_resources - evict device resources 5012 * @adev: amdgpu device object 5013 * 5014 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5015 * of the vram memory type. Mainly used for evicting device resources 5016 * at suspend time. 5017 * 5018 */ 5019 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5020 { 5021 int ret; 5022 5023 /* No need to evict vram on APUs unless going to S4 */ 5024 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5025 return 0; 5026 5027 /* No need to evict when going to S5 through S4 callbacks */ 5028 if (system_state == SYSTEM_POWER_OFF) 5029 return 0; 5030 5031 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5032 if (ret) { 5033 dev_warn(adev->dev, "evicting device resources failed\n"); 5034 return ret; 5035 } 5036 5037 if (adev->in_s4) { 5038 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5039 if (ret) 5040 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5041 } 5042 return ret; 5043 } 5044 5045 /* 5046 * Suspend & resume. 5047 */ 5048 /** 5049 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5050 * @nb: notifier block 5051 * @mode: suspend mode 5052 * @data: data 5053 * 5054 * This function is called when the system is about to suspend or hibernate. 5055 * It is used to set the appropriate flags so that eviction can be optimized 5056 * in the pm prepare callback. 5057 */ 5058 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5059 void *data) 5060 { 5061 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5062 5063 switch (mode) { 5064 case PM_HIBERNATION_PREPARE: 5065 adev->in_s4 = true; 5066 break; 5067 case PM_POST_HIBERNATION: 5068 adev->in_s4 = false; 5069 break; 5070 } 5071 5072 return NOTIFY_DONE; 5073 } 5074 5075 /** 5076 * amdgpu_device_prepare - prepare for device suspend 5077 * 5078 * @dev: drm dev pointer 5079 * 5080 * Prepare to put the hw in the suspend state (all asics). 5081 * Returns 0 for success or an error on failure. 5082 * Called at driver suspend. 5083 */ 5084 int amdgpu_device_prepare(struct drm_device *dev) 5085 { 5086 struct amdgpu_device *adev = drm_to_adev(dev); 5087 int i, r; 5088 5089 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5090 return 0; 5091 5092 /* Evict the majority of BOs before starting suspend sequence */ 5093 r = amdgpu_device_evict_resources(adev); 5094 if (r) 5095 return r; 5096 5097 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5098 5099 for (i = 0; i < adev->num_ip_blocks; i++) { 5100 if (!adev->ip_blocks[i].status.valid) 5101 continue; 5102 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5103 continue; 5104 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5105 if (r) 5106 return r; 5107 } 5108 5109 return 0; 5110 } 5111 5112 /** 5113 * amdgpu_device_complete - complete power state transition 5114 * 5115 * @dev: drm dev pointer 5116 * 5117 * Undo the changes from amdgpu_device_prepare. This will be 5118 * called on all resume transitions, including those that failed. 
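 * Each IP block that implements a ->complete() callback is given a chance
 * to undo whatever its ->prepare_suspend() callback did.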
5119 */ 5120 void amdgpu_device_complete(struct drm_device *dev) 5121 { 5122 struct amdgpu_device *adev = drm_to_adev(dev); 5123 int i; 5124 5125 for (i = 0; i < adev->num_ip_blocks; i++) { 5126 if (!adev->ip_blocks[i].status.valid) 5127 continue; 5128 if (!adev->ip_blocks[i].version->funcs->complete) 5129 continue; 5130 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5131 } 5132 } 5133 5134 /** 5135 * amdgpu_device_suspend - initiate device suspend 5136 * 5137 * @dev: drm dev pointer 5138 * @notify_clients: notify in-kernel DRM clients 5139 * 5140 * Puts the hw in the suspend state (all asics). 5141 * Returns 0 for success or an error on failure. 5142 * Called at driver suspend. 5143 */ 5144 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5145 { 5146 struct amdgpu_device *adev = drm_to_adev(dev); 5147 int r, rec; 5148 5149 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5150 return 0; 5151 5152 adev->in_suspend = true; 5153 5154 if (amdgpu_sriov_vf(adev)) { 5155 if (!adev->in_runpm) 5156 amdgpu_amdkfd_suspend_process(adev); 5157 amdgpu_virt_fini_data_exchange(adev); 5158 r = amdgpu_virt_request_full_gpu(adev, false); 5159 if (r) 5160 return r; 5161 } 5162 5163 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 5164 if (r) 5165 goto unwind_sriov; 5166 5167 if (notify_clients) 5168 drm_client_dev_suspend(adev_to_drm(adev)); 5169 5170 cancel_delayed_work_sync(&adev->delayed_init_work); 5171 5172 amdgpu_ras_suspend(adev); 5173 5174 r = amdgpu_device_ip_suspend_phase1(adev); 5175 if (r) 5176 goto unwind_smartshift; 5177 5178 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5179 r = amdgpu_userq_suspend(adev); 5180 if (r) 5181 goto unwind_ip_phase1; 5182 5183 r = amdgpu_device_evict_resources(adev); 5184 if (r) 5185 goto unwind_userq; 5186 5187 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5188 5189 amdgpu_fence_driver_hw_fini(adev); 5190 5191 r = amdgpu_device_ip_suspend_phase2(adev); 5192 if (r) 5193 goto unwind_evict; 5194 5195 if (amdgpu_sriov_vf(adev)) 5196 amdgpu_virt_release_full_gpu(adev, false); 5197 5198 return 0; 5199 5200 unwind_evict: 5201 if (adev->mman.buffer_funcs_ring->sched.ready) 5202 amdgpu_ttm_set_buffer_funcs_status(adev, true); 5203 amdgpu_fence_driver_hw_init(adev); 5204 5205 unwind_userq: 5206 rec = amdgpu_userq_resume(adev); 5207 if (rec) { 5208 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 5209 return r; 5210 } 5211 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5212 if (rec) { 5213 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 5214 return r; 5215 } 5216 5217 unwind_ip_phase1: 5218 /* suspend phase 1 = resume phase 3 */ 5219 rec = amdgpu_device_ip_resume_phase3(adev); 5220 if (rec) { 5221 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 5222 return r; 5223 } 5224 5225 unwind_smartshift: 5226 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 5227 if (rec) { 5228 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 5229 return r; 5230 } 5231 5232 if (notify_clients) 5233 drm_client_dev_resume(adev_to_drm(adev)); 5234 5235 amdgpu_ras_resume(adev); 5236 5237 unwind_sriov: 5238 if (amdgpu_sriov_vf(adev)) { 5239 rec = amdgpu_virt_request_full_gpu(adev, true); 5240 if (rec) { 5241 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 5242 return r; 5243 } 5244 } 5245 5246 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 5247 5248 return r; 5249 } 5250 5251 
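/*
 * amdgpu_virt_resume - restore VF state after the guest VM itself resumes
 *
 * Re-arms MSIX (see the comment below), re-reads the XGMI physical node id,
 * which may have changed if the VM was migrated to a different node, and
 * recomputes the VRAM base offset from the new node id.
 */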
static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5252 { 5253 int r; 5254 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5255 5256 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5257 * may not work. The access could be blocked by nBIF protection as VF isn't in 5258 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5259 * so that QEMU reprograms MSIX table. 5260 */ 5261 amdgpu_restore_msix(adev); 5262 5263 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5264 if (r) 5265 return r; 5266 5267 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5268 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5269 5270 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5271 adev->vm_manager.vram_base_offset += 5272 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5273 5274 return 0; 5275 } 5276 5277 /** 5278 * amdgpu_device_resume - initiate device resume 5279 * 5280 * @dev: drm dev pointer 5281 * @notify_clients: notify in-kernel DRM clients 5282 * 5283 * Bring the hw back to operating state (all asics). 5284 * Returns 0 for success or an error on failure. 5285 * Called at driver resume. 5286 */ 5287 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5288 { 5289 struct amdgpu_device *adev = drm_to_adev(dev); 5290 int r = 0; 5291 5292 if (amdgpu_sriov_vf(adev)) { 5293 r = amdgpu_virt_request_full_gpu(adev, true); 5294 if (r) 5295 return r; 5296 } 5297 5298 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5299 r = amdgpu_virt_resume(adev); 5300 if (r) 5301 goto exit; 5302 } 5303 5304 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5305 return 0; 5306 5307 if (adev->in_s0ix) 5308 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5309 5310 /* post card */ 5311 if (amdgpu_device_need_post(adev)) { 5312 r = amdgpu_device_asic_init(adev); 5313 if (r) 5314 dev_err(adev->dev, "amdgpu asic init failed\n"); 5315 } 5316 5317 r = amdgpu_device_ip_resume(adev); 5318 5319 if (r) { 5320 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5321 goto exit; 5322 } 5323 5324 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5325 if (r) 5326 goto exit; 5327 5328 r = amdgpu_userq_resume(adev); 5329 if (r) 5330 goto exit; 5331 5332 r = amdgpu_device_ip_late_init(adev); 5333 if (r) 5334 goto exit; 5335 5336 queue_delayed_work(system_wq, &adev->delayed_init_work, 5337 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5338 exit: 5339 if (amdgpu_sriov_vf(adev)) { 5340 amdgpu_virt_init_data_exchange(adev); 5341 amdgpu_virt_release_full_gpu(adev, true); 5342 5343 if (!r && !adev->in_runpm) 5344 r = amdgpu_amdkfd_resume_process(adev); 5345 } 5346 5347 if (r) 5348 return r; 5349 5350 /* Make sure IB tests flushed */ 5351 flush_delayed_work(&adev->delayed_init_work); 5352 5353 if (notify_clients) 5354 drm_client_dev_resume(adev_to_drm(adev)); 5355 5356 amdgpu_ras_resume(adev); 5357 5358 if (adev->mode_info.num_crtc) { 5359 /* 5360 * Most of the connector probing functions try to acquire runtime pm 5361 * refs to ensure that the GPU is powered on when connector polling is 5362 * performed. Since we're calling this from a runtime PM callback, 5363 * trying to acquire rpm refs will cause us to deadlock. 5364 * 5365 * Since we're guaranteed to be holding the rpm lock, it's safe to 5366 * temporarily disable the rpm helpers so this doesn't deadlock us. 
5367 */ 5368 #ifdef CONFIG_PM 5369 dev->dev->power.disable_depth++; 5370 #endif 5371 if (!adev->dc_enabled) 5372 drm_helper_hpd_irq_event(dev); 5373 else 5374 drm_kms_helper_hotplug_event(dev); 5375 #ifdef CONFIG_PM 5376 dev->dev->power.disable_depth--; 5377 #endif 5378 } 5379 5380 amdgpu_vram_mgr_clear_reset_blocks(adev); 5381 adev->in_suspend = false; 5382 5383 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5384 dev_warn(adev->dev, "smart shift update failed\n"); 5385 5386 return 0; 5387 } 5388 5389 /** 5390 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5391 * 5392 * @adev: amdgpu_device pointer 5393 * 5394 * The list of all the hardware IPs that make up the asic is walked and 5395 * the check_soft_reset callbacks are run. check_soft_reset determines 5396 * if the asic is still hung or not. 5397 * Returns true if any of the IPs are still in a hung state, false if not. 5398 */ 5399 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5400 { 5401 int i; 5402 bool asic_hang = false; 5403 5404 if (amdgpu_sriov_vf(adev)) 5405 return true; 5406 5407 if (amdgpu_asic_need_full_reset(adev)) 5408 return true; 5409 5410 for (i = 0; i < adev->num_ip_blocks; i++) { 5411 if (!adev->ip_blocks[i].status.valid) 5412 continue; 5413 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5414 adev->ip_blocks[i].status.hang = 5415 adev->ip_blocks[i].version->funcs->check_soft_reset( 5416 &adev->ip_blocks[i]); 5417 if (adev->ip_blocks[i].status.hang) { 5418 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5419 asic_hang = true; 5420 } 5421 } 5422 return asic_hang; 5423 } 5424 5425 /** 5426 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5427 * 5428 * @adev: amdgpu_device pointer 5429 * 5430 * The list of all the hardware IPs that make up the asic is walked and the 5431 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5432 * handles any IP specific hardware or software state changes that are 5433 * necessary for a soft reset to succeed. 5434 * Returns 0 on success, negative error code on failure. 5435 */ 5436 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5437 { 5438 int i, r = 0; 5439 5440 for (i = 0; i < adev->num_ip_blocks; i++) { 5441 if (!adev->ip_blocks[i].status.valid) 5442 continue; 5443 if (adev->ip_blocks[i].status.hang && 5444 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5445 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5446 if (r) 5447 return r; 5448 } 5449 } 5450 5451 return 0; 5452 } 5453 5454 /** 5455 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5456 * 5457 * @adev: amdgpu_device pointer 5458 * 5459 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5460 * reset is necessary to recover. 5461 * Returns true if a full asic reset is required, false if not. 
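 * A hang in a GMC, SMC, ACP, DCE or PSP block always requires a full
 * reset, as the checks below show.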
5462 */ 5463 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5464 { 5465 int i; 5466 5467 if (amdgpu_asic_need_full_reset(adev)) 5468 return true; 5469 5470 for (i = 0; i < adev->num_ip_blocks; i++) { 5471 if (!adev->ip_blocks[i].status.valid) 5472 continue; 5473 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5474 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5475 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5476 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5477 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5478 if (adev->ip_blocks[i].status.hang) { 5479 dev_info(adev->dev, "Some block need full reset!\n"); 5480 return true; 5481 } 5482 } 5483 } 5484 return false; 5485 } 5486 5487 /** 5488 * amdgpu_device_ip_soft_reset - do a soft reset 5489 * 5490 * @adev: amdgpu_device pointer 5491 * 5492 * The list of all the hardware IPs that make up the asic is walked and the 5493 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5494 * IP specific hardware or software state changes that are necessary to soft 5495 * reset the IP. 5496 * Returns 0 on success, negative error code on failure. 5497 */ 5498 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5499 { 5500 int i, r = 0; 5501 5502 for (i = 0; i < adev->num_ip_blocks; i++) { 5503 if (!adev->ip_blocks[i].status.valid) 5504 continue; 5505 if (adev->ip_blocks[i].status.hang && 5506 adev->ip_blocks[i].version->funcs->soft_reset) { 5507 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5508 if (r) 5509 return r; 5510 } 5511 } 5512 5513 return 0; 5514 } 5515 5516 /** 5517 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5518 * 5519 * @adev: amdgpu_device pointer 5520 * 5521 * The list of all the hardware IPs that make up the asic is walked and the 5522 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5523 * handles any IP specific hardware or software state changes that are 5524 * necessary after the IP has been soft reset. 5525 * Returns 0 on success, negative error code on failure. 
5526 */ 5527 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5528 { 5529 int i, r = 0; 5530 5531 for (i = 0; i < adev->num_ip_blocks; i++) { 5532 if (!adev->ip_blocks[i].status.valid) 5533 continue; 5534 if (adev->ip_blocks[i].status.hang && 5535 adev->ip_blocks[i].version->funcs->post_soft_reset) 5536 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5537 if (r) 5538 return r; 5539 } 5540 5541 return 0; 5542 } 5543 5544 /** 5545 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5546 * 5547 * @adev: amdgpu_device pointer 5548 * @reset_context: amdgpu reset context pointer 5549 * 5550 * do VF FLR and reinitialize Asic 5551 * return 0 means succeeded otherwise failed 5552 */ 5553 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5554 struct amdgpu_reset_context *reset_context) 5555 { 5556 int r; 5557 struct amdgpu_hive_info *hive = NULL; 5558 5559 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5560 if (!amdgpu_ras_get_fed_status(adev)) 5561 amdgpu_virt_ready_to_reset(adev); 5562 amdgpu_virt_wait_reset(adev); 5563 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5564 r = amdgpu_virt_request_full_gpu(adev, true); 5565 } else { 5566 r = amdgpu_virt_reset_gpu(adev); 5567 } 5568 if (r) 5569 return r; 5570 5571 amdgpu_ras_clear_err_state(adev); 5572 amdgpu_irq_gpu_reset_resume_helper(adev); 5573 5574 /* some sw clean up VF needs to do before recover */ 5575 amdgpu_virt_post_reset(adev); 5576 5577 /* Resume IP prior to SMC */ 5578 r = amdgpu_device_ip_reinit_early_sriov(adev); 5579 if (r) 5580 return r; 5581 5582 amdgpu_virt_init_data_exchange(adev); 5583 5584 r = amdgpu_device_fw_loading(adev); 5585 if (r) 5586 return r; 5587 5588 /* now we are okay to resume SMC/CP/SDMA */ 5589 r = amdgpu_device_ip_reinit_late_sriov(adev); 5590 if (r) 5591 return r; 5592 5593 hive = amdgpu_get_xgmi_hive(adev); 5594 /* Update PSP FW topology after reset */ 5595 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5596 r = amdgpu_xgmi_update_topology(hive, adev); 5597 if (hive) 5598 amdgpu_put_xgmi_hive(hive); 5599 if (r) 5600 return r; 5601 5602 r = amdgpu_ib_ring_tests(adev); 5603 if (r) 5604 return r; 5605 5606 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5607 amdgpu_inc_vram_lost(adev); 5608 5609 /* need to be called during full access so we can't do it later like 5610 * bare-metal does. 5611 */ 5612 amdgpu_amdkfd_post_reset(adev); 5613 amdgpu_virt_release_full_gpu(adev, true); 5614 5615 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5616 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5617 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5618 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5619 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5620 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5621 amdgpu_ras_resume(adev); 5622 5623 amdgpu_virt_ras_telemetry_post_reset(adev); 5624 5625 return 0; 5626 } 5627 5628 /** 5629 * amdgpu_device_has_job_running - check if there is any unfinished job 5630 * 5631 * @adev: amdgpu_device pointer 5632 * 5633 * check if there is any job running on the device when guest driver receives 5634 * FLR notification from host driver. If there are still jobs running, then 5635 * the guest driver will not respond the FLR reset. Instead, let the job hit 5636 * the timeout and guest driver then issue the reset request. 
5637 */ 5638 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5639 { 5640 int i; 5641 5642 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5643 struct amdgpu_ring *ring = adev->rings[i]; 5644 5645 if (!amdgpu_ring_sched_ready(ring)) 5646 continue; 5647 5648 if (amdgpu_fence_count_emitted(ring)) 5649 return true; 5650 } 5651 return false; 5652 } 5653 5654 /** 5655 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5656 * 5657 * @adev: amdgpu_device pointer 5658 * 5659 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5660 * a hung GPU. 5661 */ 5662 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5663 { 5664 5665 if (amdgpu_gpu_recovery == 0) 5666 goto disabled; 5667 5668 /* Skip soft reset check in fatal error mode */ 5669 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5670 return true; 5671 5672 if (amdgpu_sriov_vf(adev)) 5673 return true; 5674 5675 if (amdgpu_gpu_recovery == -1) { 5676 switch (adev->asic_type) { 5677 #ifdef CONFIG_DRM_AMDGPU_SI 5678 case CHIP_VERDE: 5679 case CHIP_TAHITI: 5680 case CHIP_PITCAIRN: 5681 case CHIP_OLAND: 5682 case CHIP_HAINAN: 5683 #endif 5684 #ifdef CONFIG_DRM_AMDGPU_CIK 5685 case CHIP_KAVERI: 5686 case CHIP_KABINI: 5687 case CHIP_MULLINS: 5688 #endif 5689 case CHIP_CARRIZO: 5690 case CHIP_STONEY: 5691 case CHIP_CYAN_SKILLFISH: 5692 goto disabled; 5693 default: 5694 break; 5695 } 5696 } 5697 5698 return true; 5699 5700 disabled: 5701 dev_info(adev->dev, "GPU recovery disabled.\n"); 5702 return false; 5703 } 5704 5705 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5706 { 5707 u32 i; 5708 int ret = 0; 5709 5710 if (adev->bios) 5711 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5712 5713 dev_info(adev->dev, "GPU mode1 reset\n"); 5714 5715 /* Cache the state before bus master disable. The saved config space 5716 * values are used in other cases like restore after mode-2 reset. 
5717 */ 5718 amdgpu_device_cache_pci_state(adev->pdev); 5719 5720 /* disable BM */ 5721 pci_clear_master(adev->pdev); 5722 5723 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5724 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5725 ret = amdgpu_dpm_mode1_reset(adev); 5726 } else { 5727 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5728 ret = psp_gpu_reset(adev); 5729 } 5730 5731 if (ret) 5732 goto mode1_reset_failed; 5733 5734 /* enable mmio access after mode 1 reset completed */ 5735 adev->no_hw_access = false; 5736 5737 amdgpu_device_load_pci_state(adev->pdev); 5738 ret = amdgpu_psp_wait_for_bootloader(adev); 5739 if (ret) 5740 goto mode1_reset_failed; 5741 5742 /* wait for asic to come out of reset */ 5743 for (i = 0; i < adev->usec_timeout; i++) { 5744 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5745 5746 if (memsize != 0xffffffff) 5747 break; 5748 udelay(1); 5749 } 5750 5751 if (i >= adev->usec_timeout) { 5752 ret = -ETIMEDOUT; 5753 goto mode1_reset_failed; 5754 } 5755 5756 if (adev->bios) 5757 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5758 5759 return 0; 5760 5761 mode1_reset_failed: 5762 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5763 return ret; 5764 } 5765 5766 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5767 { 5768 int ret = 0; 5769 5770 dev_info(adev->dev, "GPU link reset\n"); 5771 5772 if (!amdgpu_reset_in_dpc(adev)) 5773 ret = amdgpu_dpm_link_reset(adev); 5774 5775 if (ret) 5776 goto link_reset_failed; 5777 5778 ret = amdgpu_psp_wait_for_bootloader(adev); 5779 if (ret) 5780 goto link_reset_failed; 5781 5782 return 0; 5783 5784 link_reset_failed: 5785 dev_err(adev->dev, "GPU link reset failed\n"); 5786 return ret; 5787 } 5788 5789 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5790 struct amdgpu_reset_context *reset_context) 5791 { 5792 int i, r = 0; 5793 struct amdgpu_job *job = NULL; 5794 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5795 bool need_full_reset = 5796 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5797 5798 if (reset_context->reset_req_dev == adev) 5799 job = reset_context->job; 5800 5801 if (amdgpu_sriov_vf(adev)) 5802 amdgpu_virt_pre_reset(adev); 5803 5804 amdgpu_fence_driver_isr_toggle(adev, true); 5805 5806 /* block all schedulers and reset given job's ring */ 5807 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5808 struct amdgpu_ring *ring = adev->rings[i]; 5809 5810 if (!amdgpu_ring_sched_ready(ring)) 5811 continue; 5812 5813 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5814 amdgpu_fence_driver_force_completion(ring); 5815 } 5816 5817 amdgpu_fence_driver_isr_toggle(adev, false); 5818 5819 if (job && job->vm) 5820 drm_sched_increase_karma(&job->base); 5821 5822 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5823 /* If reset handler not implemented, continue; otherwise return */ 5824 if (r == -EOPNOTSUPP) 5825 r = 0; 5826 else 5827 return r; 5828 5829 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5830 if (!amdgpu_sriov_vf(adev)) { 5831 5832 if (!need_full_reset) 5833 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5834 5835 if (!need_full_reset && amdgpu_gpu_recovery && 5836 amdgpu_device_ip_check_soft_reset(adev)) { 5837 amdgpu_device_ip_pre_soft_reset(adev); 5838 r = amdgpu_device_ip_soft_reset(adev); 5839 amdgpu_device_ip_post_soft_reset(adev); 5840 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5841 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5842 need_full_reset 
= true; 5843 } 5844 } 5845 5846 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5847 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5848 /* Trigger ip dump before we reset the asic */ 5849 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5850 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5851 tmp_adev->ip_blocks[i].version->funcs 5852 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5853 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5854 } 5855 5856 if (need_full_reset) 5857 r = amdgpu_device_ip_suspend(adev); 5858 if (need_full_reset) 5859 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5860 else 5861 clear_bit(AMDGPU_NEED_FULL_RESET, 5862 &reset_context->flags); 5863 } 5864 5865 return r; 5866 } 5867 5868 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5869 { 5870 struct list_head *device_list_handle; 5871 bool full_reset, vram_lost = false; 5872 struct amdgpu_device *tmp_adev; 5873 int r, init_level; 5874 5875 device_list_handle = reset_context->reset_device_list; 5876 5877 if (!device_list_handle) 5878 return -EINVAL; 5879 5880 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5881 5882 /** 5883 * If it's reset on init, it's default init level, otherwise keep level 5884 * as recovery level. 5885 */ 5886 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5887 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5888 else 5889 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5890 5891 r = 0; 5892 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5893 amdgpu_set_init_level(tmp_adev, init_level); 5894 if (full_reset) { 5895 /* post card */ 5896 amdgpu_reset_set_dpc_status(tmp_adev, false); 5897 amdgpu_ras_clear_err_state(tmp_adev); 5898 r = amdgpu_device_asic_init(tmp_adev); 5899 if (r) { 5900 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5901 } else { 5902 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5903 5904 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5905 if (r) 5906 goto out; 5907 5908 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5909 5910 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5911 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5912 5913 if (vram_lost) { 5914 dev_info( 5915 tmp_adev->dev, 5916 "VRAM is lost due to GPU reset!\n"); 5917 amdgpu_inc_vram_lost(tmp_adev); 5918 } 5919 5920 r = amdgpu_device_fw_loading(tmp_adev); 5921 if (r) 5922 return r; 5923 5924 r = amdgpu_xcp_restore_partition_mode( 5925 tmp_adev->xcp_mgr); 5926 if (r) 5927 goto out; 5928 5929 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5930 if (r) 5931 goto out; 5932 5933 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5934 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5935 5936 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5937 if (r) 5938 goto out; 5939 5940 if (vram_lost) 5941 amdgpu_device_fill_reset_magic(tmp_adev); 5942 5943 /* 5944 * Add this ASIC as tracked as reset was already 5945 * complete successfully. 
5946 */ 5947 amdgpu_register_gpu_instance(tmp_adev); 5948 5949 if (!reset_context->hive && 5950 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5951 amdgpu_xgmi_add_device(tmp_adev); 5952 5953 r = amdgpu_device_ip_late_init(tmp_adev); 5954 if (r) 5955 goto out; 5956 5957 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 5958 if (r) 5959 goto out; 5960 5961 drm_client_dev_resume(adev_to_drm(tmp_adev)); 5962 5963 /* 5964 * The GPU enters bad state once faulty pages 5965 * by ECC has reached the threshold, and ras 5966 * recovery is scheduled next. So add one check 5967 * here to break recovery if it indeed exceeds 5968 * bad page threshold, and remind user to 5969 * retire this GPU or setting one bigger 5970 * bad_page_threshold value to fix this once 5971 * probing driver again. 5972 */ 5973 if (!amdgpu_ras_is_rma(tmp_adev)) { 5974 /* must succeed. */ 5975 amdgpu_ras_resume(tmp_adev); 5976 } else { 5977 r = -EINVAL; 5978 goto out; 5979 } 5980 5981 /* Update PSP FW topology after reset */ 5982 if (reset_context->hive && 5983 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5984 r = amdgpu_xgmi_update_topology( 5985 reset_context->hive, tmp_adev); 5986 } 5987 } 5988 5989 out: 5990 if (!r) { 5991 /* IP init is complete now, set level as default */ 5992 amdgpu_set_init_level(tmp_adev, 5993 AMDGPU_INIT_LEVEL_DEFAULT); 5994 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5995 r = amdgpu_ib_ring_tests(tmp_adev); 5996 if (r) { 5997 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5998 r = -EAGAIN; 5999 goto end; 6000 } 6001 } 6002 6003 if (r) 6004 tmp_adev->asic_reset_res = r; 6005 } 6006 6007 end: 6008 return r; 6009 } 6010 6011 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 6012 struct amdgpu_reset_context *reset_context) 6013 { 6014 struct amdgpu_device *tmp_adev = NULL; 6015 bool need_full_reset, skip_hw_reset; 6016 int r = 0; 6017 6018 /* Try reset handler method first */ 6019 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6020 reset_list); 6021 6022 reset_context->reset_device_list = device_list_handle; 6023 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 6024 /* If reset handler not implemented, continue; otherwise return */ 6025 if (r == -EOPNOTSUPP) 6026 r = 0; 6027 else 6028 return r; 6029 6030 /* Reset handler not implemented, use the default method */ 6031 need_full_reset = 6032 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6033 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 6034 6035 /* 6036 * ASIC reset has to be done on all XGMI hive nodes ASAP 6037 * to allow proper links negotiation in FW (within 1 sec) 6038 */ 6039 if (!skip_hw_reset && need_full_reset) { 6040 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6041 /* For XGMI run all resets in parallel to speed up the process */ 6042 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6043 if (!queue_work(system_unbound_wq, 6044 &tmp_adev->xgmi_reset_work)) 6045 r = -EALREADY; 6046 } else 6047 r = amdgpu_asic_reset(tmp_adev); 6048 6049 if (r) { 6050 dev_err(tmp_adev->dev, 6051 "ASIC reset failed with error, %d for drm dev, %s", 6052 r, adev_to_drm(tmp_adev)->unique); 6053 goto out; 6054 } 6055 } 6056 6057 /* For XGMI wait for all resets to complete before proceed */ 6058 if (!r) { 6059 list_for_each_entry(tmp_adev, device_list_handle, 6060 reset_list) { 6061 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6062 flush_work(&tmp_adev->xgmi_reset_work); 6063 r = tmp_adev->asic_reset_res; 6064 if (r) 6065 break; 6066 } 6067 } 6068 } 6069 } 6070 
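/* If a RAS fatal-error interrupt triggered this reset, clear the MMHUB error counters on every device in the list and acknowledge the interrupt before reinitializing the IP blocks. */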
6071 if (!r && amdgpu_ras_intr_triggered()) { 6072 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6073 amdgpu_ras_reset_error_count(tmp_adev, 6074 AMDGPU_RAS_BLOCK__MMHUB); 6075 } 6076 6077 amdgpu_ras_intr_cleared(); 6078 } 6079 6080 r = amdgpu_device_reinit_after_reset(reset_context); 6081 if (r == -EAGAIN) 6082 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6083 else 6084 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6085 6086 out: 6087 return r; 6088 } 6089 6090 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6091 { 6092 6093 switch (amdgpu_asic_reset_method(adev)) { 6094 case AMD_RESET_METHOD_MODE1: 6095 case AMD_RESET_METHOD_LINK: 6096 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6097 break; 6098 case AMD_RESET_METHOD_MODE2: 6099 adev->mp1_state = PP_MP1_STATE_RESET; 6100 break; 6101 default: 6102 adev->mp1_state = PP_MP1_STATE_NONE; 6103 break; 6104 } 6105 } 6106 6107 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6108 { 6109 amdgpu_vf_error_trans_all(adev); 6110 adev->mp1_state = PP_MP1_STATE_NONE; 6111 } 6112 6113 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6114 { 6115 struct pci_dev *p = NULL; 6116 6117 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6118 adev->pdev->bus->number, 1); 6119 if (p) { 6120 pm_runtime_enable(&(p->dev)); 6121 pm_runtime_resume(&(p->dev)); 6122 } 6123 6124 pci_dev_put(p); 6125 } 6126 6127 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6128 { 6129 enum amd_reset_method reset_method; 6130 struct pci_dev *p = NULL; 6131 u64 expires; 6132 6133 /* 6134 * For now, only BACO and mode1 reset are confirmed 6135 * to suffer the audio issue without proper suspended. 6136 */ 6137 reset_method = amdgpu_asic_reset_method(adev); 6138 if ((reset_method != AMD_RESET_METHOD_BACO) && 6139 (reset_method != AMD_RESET_METHOD_MODE1)) 6140 return -EINVAL; 6141 6142 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6143 adev->pdev->bus->number, 1); 6144 if (!p) 6145 return -ENODEV; 6146 6147 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6148 if (!expires) 6149 /* 6150 * If we cannot get the audio device autosuspend delay, 6151 * a fixed 4S interval will be used. Considering 3S is 6152 * the audio controller default autosuspend delay setting. 6153 * 4S used here is guaranteed to cover that. 6154 */ 6155 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6156 6157 while (!pm_runtime_status_suspended(&(p->dev))) { 6158 if (!pm_runtime_suspend(&(p->dev))) 6159 break; 6160 6161 if (expires < ktime_get_mono_fast_ns()) { 6162 dev_warn(adev->dev, "failed to suspend display audio\n"); 6163 pci_dev_put(p); 6164 /* TODO: abort the succeeding gpu reset? 
*/ 6165 return -ETIMEDOUT; 6166 } 6167 } 6168 6169 pm_runtime_disable(&(p->dev)); 6170 6171 pci_dev_put(p); 6172 return 0; 6173 } 6174 6175 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6176 { 6177 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6178 6179 #if defined(CONFIG_DEBUG_FS) 6180 if (!amdgpu_sriov_vf(adev)) 6181 cancel_work(&adev->reset_work); 6182 #endif 6183 cancel_work(&adev->userq_reset_work); 6184 6185 if (adev->kfd.dev) 6186 cancel_work(&adev->kfd.reset_work); 6187 6188 if (amdgpu_sriov_vf(adev)) 6189 cancel_work(&adev->virt.flr_work); 6190 6191 if (con && adev->ras_enabled) 6192 cancel_work(&con->recovery_work); 6193 6194 } 6195 6196 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6197 { 6198 struct amdgpu_device *tmp_adev; 6199 int ret = 0; 6200 6201 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6202 ret |= amdgpu_device_bus_status_check(tmp_adev); 6203 } 6204 6205 return ret; 6206 } 6207 6208 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6209 struct list_head *device_list, 6210 struct amdgpu_hive_info *hive) 6211 { 6212 struct amdgpu_device *tmp_adev = NULL; 6213 6214 /* 6215 * Build list of devices to reset. 6216 * In case we are in XGMI hive mode, resort the device list 6217 * to put adev in the 1st position. 6218 */ 6219 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6220 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6221 list_add_tail(&tmp_adev->reset_list, device_list); 6222 if (adev->shutdown) 6223 tmp_adev->shutdown = true; 6224 if (amdgpu_reset_in_dpc(adev)) 6225 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6226 } 6227 if (!list_is_first(&adev->reset_list, device_list)) 6228 list_rotate_to_front(&adev->reset_list, device_list); 6229 } else { 6230 list_add_tail(&adev->reset_list, device_list); 6231 } 6232 } 6233 6234 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6235 struct list_head *device_list) 6236 { 6237 struct amdgpu_device *tmp_adev = NULL; 6238 6239 if (list_empty(device_list)) 6240 return; 6241 tmp_adev = 6242 list_first_entry(device_list, struct amdgpu_device, reset_list); 6243 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6244 } 6245 6246 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6247 struct list_head *device_list) 6248 { 6249 struct amdgpu_device *tmp_adev = NULL; 6250 6251 if (list_empty(device_list)) 6252 return; 6253 tmp_adev = 6254 list_first_entry(device_list, struct amdgpu_device, reset_list); 6255 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6256 } 6257 6258 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6259 struct amdgpu_job *job, 6260 struct amdgpu_reset_context *reset_context, 6261 struct list_head *device_list, 6262 struct amdgpu_hive_info *hive, 6263 bool need_emergency_restart) 6264 { 6265 struct amdgpu_device *tmp_adev = NULL; 6266 int i; 6267 6268 /* block all schedulers and reset given job's ring */ 6269 list_for_each_entry(tmp_adev, device_list, reset_list) { 6270 amdgpu_device_set_mp1_state(tmp_adev); 6271 6272 /* 6273 * Try to put the audio codec into suspend state 6274 * before gpu reset started. 6275 * 6276 * Due to the power domain of the graphics device 6277 * is shared with AZ power domain. Without this, 6278 * we may change the audio hardware from behind 6279 * the audio driver's back. That will trigger 6280 * some audio codec errors. 
6281 */ 6282 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6283 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6284 6285 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6286 6287 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6288 6289 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6290 6291 /* 6292 * Mark these ASICs to be reset as untracked first 6293 * And add them back after reset completed 6294 */ 6295 amdgpu_unregister_gpu_instance(tmp_adev); 6296 6297 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 6298 6299 /* disable ras on ALL IPs */ 6300 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6301 amdgpu_device_ip_need_full_reset(tmp_adev)) 6302 amdgpu_ras_suspend(tmp_adev); 6303 6304 amdgpu_userq_pre_reset(tmp_adev); 6305 6306 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6307 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6308 6309 if (!amdgpu_ring_sched_ready(ring)) 6310 continue; 6311 6312 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6313 6314 if (need_emergency_restart) 6315 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6316 } 6317 atomic_inc(&tmp_adev->gpu_reset_counter); 6318 } 6319 } 6320 6321 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6322 struct list_head *device_list, 6323 struct amdgpu_reset_context *reset_context) 6324 { 6325 struct amdgpu_device *tmp_adev = NULL; 6326 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6327 int r = 0; 6328 6329 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6330 list_for_each_entry(tmp_adev, device_list, reset_list) { 6331 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6332 /*TODO Should we stop ?*/ 6333 if (r) { 6334 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6335 r, adev_to_drm(tmp_adev)->unique); 6336 tmp_adev->asic_reset_res = r; 6337 } 6338 } 6339 6340 /* Actual ASIC resets if needed.*/ 6341 /* Host driver will handle XGMI hive reset for SRIOV */ 6342 if (amdgpu_sriov_vf(adev)) { 6343 6344 /* Bail out of reset early */ 6345 if (amdgpu_ras_is_rma(adev)) 6346 return -ENODEV; 6347 6348 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6349 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6350 amdgpu_ras_set_fed(adev, true); 6351 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6352 } 6353 6354 r = amdgpu_device_reset_sriov(adev, reset_context); 6355 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6356 amdgpu_virt_release_full_gpu(adev, true); 6357 goto retry; 6358 } 6359 if (r) 6360 adev->asic_reset_res = r; 6361 } else { 6362 r = amdgpu_do_asic_reset(device_list, reset_context); 6363 if (r && r == -EAGAIN) 6364 goto retry; 6365 } 6366 6367 list_for_each_entry(tmp_adev, device_list, reset_list) { 6368 /* 6369 * Drop any pending non scheduler resets queued before reset is done. 6370 * Any reset scheduled after this point would be valid. Scheduler resets 6371 * were already dropped during drm_sched_stop and no new ones can come 6372 * in before drm_sched_start. 
6373 */ 6374 amdgpu_device_stop_pending_resets(tmp_adev); 6375 } 6376 6377 return r; 6378 } 6379 6380 static int amdgpu_device_sched_resume(struct list_head *device_list, 6381 struct amdgpu_reset_context *reset_context, 6382 bool job_signaled) 6383 { 6384 struct amdgpu_device *tmp_adev = NULL; 6385 int i, r = 0; 6386 6387 /* Post ASIC reset for all devs .*/ 6388 list_for_each_entry(tmp_adev, device_list, reset_list) { 6389 6390 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6391 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6392 6393 if (!amdgpu_ring_sched_ready(ring)) 6394 continue; 6395 6396 drm_sched_start(&ring->sched, 0); 6397 } 6398 6399 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6400 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6401 6402 if (tmp_adev->asic_reset_res) { 6403 /* bad news, how to tell it to userspace ? 6404 * for ras error, we should report GPU bad status instead of 6405 * reset failure 6406 */ 6407 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6408 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6409 dev_info( 6410 tmp_adev->dev, 6411 "GPU reset(%d) failed with error %d\n", 6412 atomic_read( 6413 &tmp_adev->gpu_reset_counter), 6414 tmp_adev->asic_reset_res); 6415 amdgpu_vf_error_put(tmp_adev, 6416 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6417 tmp_adev->asic_reset_res); 6418 if (!r) 6419 r = tmp_adev->asic_reset_res; 6420 tmp_adev->asic_reset_res = 0; 6421 } else { 6422 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6423 atomic_read(&tmp_adev->gpu_reset_counter)); 6424 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6425 AMDGPU_SS_DEV_D0)) 6426 dev_warn(tmp_adev->dev, 6427 "smart shift update failed\n"); 6428 } 6429 } 6430 6431 return r; 6432 } 6433 6434 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6435 struct list_head *device_list, 6436 bool need_emergency_restart) 6437 { 6438 struct amdgpu_device *tmp_adev = NULL; 6439 6440 list_for_each_entry(tmp_adev, device_list, reset_list) { 6441 /* unlock kfd: SRIOV would do it separately */ 6442 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6443 amdgpu_amdkfd_post_reset(tmp_adev); 6444 6445 /* kfd_post_reset will do nothing if kfd device is not initialized, 6446 * need to bring up kfd here if it's not be initialized before 6447 */ 6448 if (!adev->kfd.init_complete) 6449 amdgpu_amdkfd_device_init(adev); 6450 6451 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6452 amdgpu_device_resume_display_audio(tmp_adev); 6453 6454 amdgpu_device_unset_mp1_state(tmp_adev); 6455 6456 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6457 6458 } 6459 } 6460 6461 6462 /** 6463 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6464 * 6465 * @adev: amdgpu_device pointer 6466 * @job: which job trigger hang 6467 * @reset_context: amdgpu reset context pointer 6468 * 6469 * Attempt to reset the GPU if it has hung (all asics). 6470 * Attempt to do soft-reset or full-reset and reinitialize Asic 6471 * Returns 0 for success or an error on failure. 6472 */ 6473 6474 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6475 struct amdgpu_job *job, 6476 struct amdgpu_reset_context *reset_context) 6477 { 6478 struct list_head device_list; 6479 bool job_signaled = false; 6480 struct amdgpu_hive_info *hive = NULL; 6481 int r = 0; 6482 bool need_emergency_restart = false; 6483 /* save the pasid here as the job may be freed before the end of the reset */ 6484 int pasid = job ? 
job->pasid : -EINVAL; 6485 6486 /* 6487 * If it reaches here because of hang/timeout and a RAS error is 6488 * detected at the same time, let RAS recovery take care of it. 6489 */ 6490 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6491 !amdgpu_sriov_vf(adev) && 6492 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6493 dev_dbg(adev->dev, 6494 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6495 reset_context->src); 6496 return 0; 6497 } 6498 6499 /* 6500 * Special case: RAS triggered and full reset isn't supported 6501 */ 6502 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6503 6504 /* 6505 * Flush RAM to disk so that after reboot 6506 * the user can read log and see why the system rebooted. 6507 */ 6508 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6509 amdgpu_ras_get_context(adev)->reboot) { 6510 dev_warn(adev->dev, "Emergency reboot."); 6511 6512 ksys_sync_helper(); 6513 emergency_restart(); 6514 } 6515 6516 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6517 need_emergency_restart ? "jobs stop" : "reset", 6518 reset_context->src); 6519 6520 if (!amdgpu_sriov_vf(adev)) 6521 hive = amdgpu_get_xgmi_hive(adev); 6522 if (hive) 6523 mutex_lock(&hive->hive_lock); 6524 6525 reset_context->job = job; 6526 reset_context->hive = hive; 6527 INIT_LIST_HEAD(&device_list); 6528 6529 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6530 6531 if (!amdgpu_sriov_vf(adev)) { 6532 r = amdgpu_device_health_check(&device_list); 6533 if (r) 6534 goto end_reset; 6535 } 6536 6537 /* Cannot be called after locking reset domain */ 6538 amdgpu_ras_pre_reset(adev, &device_list); 6539 6540 /* We need to lock reset domain only once both for XGMI and single device */ 6541 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6542 6543 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6544 hive, need_emergency_restart); 6545 if (need_emergency_restart) 6546 goto skip_sched_resume; 6547 /* 6548 * Must check guilty signal here since after this point all old 6549 * HW fences are force signaled. 6550 * 6551 * job->base holds a reference to parent fence 6552 */ 6553 if (job && (dma_fence_get_status(&job->hw_fence->base) > 0)) { 6554 job_signaled = true; 6555 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6556 goto skip_hw_reset; 6557 } 6558 6559 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6560 if (r) 6561 goto reset_unlock; 6562 skip_hw_reset: 6563 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6564 if (r) 6565 goto reset_unlock; 6566 skip_sched_resume: 6567 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6568 reset_unlock: 6569 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6570 amdgpu_ras_post_reset(adev, &device_list); 6571 end_reset: 6572 if (hive) { 6573 mutex_unlock(&hive->hive_lock); 6574 amdgpu_put_xgmi_hive(hive); 6575 } 6576 6577 if (r) 6578 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6579 6580 atomic_set(&adev->reset_domain->reset_res, r); 6581 6582 if (!r) { 6583 struct amdgpu_task_info *ti = NULL; 6584 6585 /* 6586 * The job may already be freed at this point via the sched tdr workqueue so 6587 * use the cached pasid. 6588 */ 6589 if (pasid >= 0) 6590 ti = amdgpu_vm_get_task_info_pasid(adev, pasid); 6591 6592 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6593 ti ? 
&ti->task : NULL); 6594 6595 amdgpu_vm_put_task_info(ti); 6596 } 6597 6598 return r; 6599 } 6600 6601 /** 6602 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6603 * 6604 * @adev: amdgpu_device pointer 6605 * @speed: pointer to the speed of the link 6606 * @width: pointer to the width of the link 6607 * 6608 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6609 * first physical partner to an AMD dGPU. 6610 * This will exclude any virtual switches and links. 6611 */ 6612 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6613 enum pci_bus_speed *speed, 6614 enum pcie_link_width *width) 6615 { 6616 struct pci_dev *parent = adev->pdev; 6617 6618 if (!speed || !width) 6619 return; 6620 6621 *speed = PCI_SPEED_UNKNOWN; 6622 *width = PCIE_LNK_WIDTH_UNKNOWN; 6623 6624 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6625 while ((parent = pci_upstream_bridge(parent))) { 6626 /* skip upstream/downstream switches internal to dGPU*/ 6627 if (parent->vendor == PCI_VENDOR_ID_ATI) 6628 continue; 6629 *speed = pcie_get_speed_cap(parent); 6630 *width = pcie_get_width_cap(parent); 6631 break; 6632 } 6633 } else { 6634 /* use the current speeds rather than max if switching is not supported */ 6635 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6636 } 6637 } 6638 6639 /** 6640 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6641 * 6642 * @adev: amdgpu_device pointer 6643 * @speed: pointer to the speed of the link 6644 * @width: pointer to the width of the link 6645 * 6646 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6647 * AMD dGPU which may be a virtual upstream bridge. 6648 */ 6649 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6650 enum pci_bus_speed *speed, 6651 enum pcie_link_width *width) 6652 { 6653 struct pci_dev *parent = adev->pdev; 6654 6655 if (!speed || !width) 6656 return; 6657 6658 parent = pci_upstream_bridge(parent); 6659 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6660 /* use the upstream/downstream switches internal to dGPU */ 6661 *speed = pcie_get_speed_cap(parent); 6662 *width = pcie_get_width_cap(parent); 6663 while ((parent = pci_upstream_bridge(parent))) { 6664 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6665 /* use the upstream/downstream switches internal to dGPU */ 6666 *speed = pcie_get_speed_cap(parent); 6667 *width = pcie_get_width_cap(parent); 6668 } 6669 } 6670 } else { 6671 /* use the device itself */ 6672 *speed = pcie_get_speed_cap(adev->pdev); 6673 *width = pcie_get_width_cap(adev->pdev); 6674 } 6675 } 6676 6677 /** 6678 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 6679 * 6680 * @adev: amdgpu_device pointer 6681 * 6682 * Fetches and stores in the driver the PCIE capabilities (gen speed 6683 * and lanes) of the slot the device is in. Handles APUs and 6684 * virtualized environments where PCIE config space may not be available.
6685 */ 6686 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6687 { 6688 enum pci_bus_speed speed_cap, platform_speed_cap; 6689 enum pcie_link_width platform_link_width, link_width; 6690 6691 if (amdgpu_pcie_gen_cap) 6692 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6693 6694 if (amdgpu_pcie_lane_cap) 6695 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6696 6697 /* covers APUs as well */ 6698 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6699 if (adev->pm.pcie_gen_mask == 0) 6700 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6701 if (adev->pm.pcie_mlw_mask == 0) 6702 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6703 return; 6704 } 6705 6706 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6707 return; 6708 6709 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6710 &platform_link_width); 6711 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6712 6713 if (adev->pm.pcie_gen_mask == 0) { 6714 /* asic caps */ 6715 if (speed_cap == PCI_SPEED_UNKNOWN) { 6716 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6717 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6718 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6719 } else { 6720 if (speed_cap == PCIE_SPEED_32_0GT) 6721 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6722 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6723 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6724 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6725 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6726 else if (speed_cap == PCIE_SPEED_16_0GT) 6727 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6728 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6729 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6730 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6731 else if (speed_cap == PCIE_SPEED_8_0GT) 6732 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6733 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6734 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6735 else if (speed_cap == PCIE_SPEED_5_0GT) 6736 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6737 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6738 else 6739 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6740 } 6741 /* platform caps */ 6742 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6743 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6744 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6745 } else { 6746 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6747 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6748 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6749 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6750 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6751 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6752 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6753 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6754 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6755 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6756 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6757 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6758 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6759 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6760 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6761 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6762 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6763 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6764 else 6765 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6766 6767 } 6768 } 6769 if (adev->pm.pcie_mlw_mask == 0) { 6770 /* asic caps */ 6771 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6772 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6773 } else { 6774 switch (link_width) { 6775 case PCIE_LNK_X32: 6776 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6777 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6778 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6779 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6780 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6781 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6782 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6783 break; 6784 case PCIE_LNK_X16: 6785 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6786 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6787 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6788 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6789 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6790 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6791 break; 6792 case PCIE_LNK_X12: 6793 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6794 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6795 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6796 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6797 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6798 break; 6799 case PCIE_LNK_X8: 6800 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6801 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6802 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6803 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6804 break; 6805 case PCIE_LNK_X4: 6806 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6807 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6808 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6809 break; 6810 case PCIE_LNK_X2: 6811 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6812 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6813 break; 6814 case PCIE_LNK_X1: 6815 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6816 break; 6817 default: 6818 break; 6819 } 6820 } 6821 /* platform caps */ 6822 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6823 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6824 } else { 6825 switch (platform_link_width) { 6826 case PCIE_LNK_X32: 6827 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6828 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6829 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6830 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6831 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6832 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6833 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6834 break; 6835 case PCIE_LNK_X16: 6836 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6837 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6838 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6839 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6840 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6841 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6842 break; 6843 case PCIE_LNK_X12: 6844 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6845 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6846 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6847 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6848 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6849 break; 6850 case PCIE_LNK_X8: 6851 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6852 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6853 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6854 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6855 break; 6856 case PCIE_LNK_X4: 6857 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6858 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6859 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6860 break; 6861 case PCIE_LNK_X2: 6862 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6863 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6864 break; 6865 case PCIE_LNK_X1: 6866 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6867 break; 6868 
default: 6869 break; 6870 } 6871 } 6872 } 6873 } 6874 6875 /** 6876 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6877 * 6878 * @adev: amdgpu_device pointer 6879 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6880 * 6881 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6882 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6883 * @peer_adev. 6884 */ 6885 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6886 struct amdgpu_device *peer_adev) 6887 { 6888 #ifdef CONFIG_HSA_AMD_P2P 6889 bool p2p_access = 6890 !adev->gmc.xgmi.connected_to_cpu && 6891 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6892 if (!p2p_access) 6893 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6894 pci_name(peer_adev->pdev)); 6895 6896 bool is_large_bar = adev->gmc.visible_vram_size && 6897 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6898 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6899 6900 if (!p2p_addressable) { 6901 uint64_t address_mask = peer_adev->dev->dma_mask ? 6902 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6903 resource_size_t aper_limit = 6904 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6905 6906 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6907 aper_limit & address_mask); 6908 } 6909 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6910 #else 6911 return false; 6912 #endif 6913 } 6914 6915 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 6916 { 6917 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6918 6919 if (!amdgpu_device_supports_baco(adev)) 6920 return -ENOTSUPP; 6921 6922 if (ras && adev->ras_enabled && 6923 adev->nbio.funcs->enable_doorbell_interrupt) 6924 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6925 6926 return amdgpu_dpm_baco_enter(adev); 6927 } 6928 6929 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 6930 { 6931 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6932 int ret = 0; 6933 6934 if (!amdgpu_device_supports_baco(adev)) 6935 return -ENOTSUPP; 6936 6937 ret = amdgpu_dpm_baco_exit(adev); 6938 if (ret) 6939 return ret; 6940 6941 if (ras && adev->ras_enabled && 6942 adev->nbio.funcs->enable_doorbell_interrupt) 6943 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6944 6945 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6946 adev->nbio.funcs->clear_doorbell_interrupt) 6947 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6948 6949 return 0; 6950 } 6951 6952 /** 6953 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6954 * @pdev: PCI device struct 6955 * @state: PCI channel state 6956 * 6957 * Description: Called when a PCI error is detected. 6958 * 6959 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
6960 */ 6961 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6962 { 6963 struct drm_device *dev = pci_get_drvdata(pdev); 6964 struct amdgpu_device *adev = drm_to_adev(dev); 6965 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 6966 amdgpu_get_xgmi_hive(adev); 6967 struct amdgpu_reset_context reset_context; 6968 struct list_head device_list; 6969 6970 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6971 6972 adev->pci_channel_state = state; 6973 6974 switch (state) { 6975 case pci_channel_io_normal: 6976 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6977 return PCI_ERS_RESULT_CAN_RECOVER; 6978 case pci_channel_io_frozen: 6979 /* Fatal error, prepare for slot reset */ 6980 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6981 if (hive) { 6982 /* Hive devices should be able to support FW based 6983 * link reset on other devices, if not return. 6984 */ 6985 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6986 dev_warn(adev->dev, 6987 "No support for XGMI hive yet...\n"); 6988 return PCI_ERS_RESULT_DISCONNECT; 6989 } 6990 /* Set dpc status only if device is part of hive 6991 * Non-hive devices should be able to recover after 6992 * link reset. 6993 */ 6994 amdgpu_reset_set_dpc_status(adev, true); 6995 6996 mutex_lock(&hive->hive_lock); 6997 } 6998 memset(&reset_context, 0, sizeof(reset_context)); 6999 INIT_LIST_HEAD(&device_list); 7000 7001 amdgpu_device_recovery_prepare(adev, &device_list, hive); 7002 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 7003 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 7004 hive, false); 7005 if (hive) 7006 mutex_unlock(&hive->hive_lock); 7007 return PCI_ERS_RESULT_NEED_RESET; 7008 case pci_channel_io_perm_failure: 7009 /* Permanent error, prepare for device removal */ 7010 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 7011 return PCI_ERS_RESULT_DISCONNECT; 7012 } 7013 7014 return PCI_ERS_RESULT_NEED_RESET; 7015 } 7016 7017 /** 7018 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 7019 * @pdev: pointer to PCI device 7020 */ 7021 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 7022 { 7023 struct drm_device *dev = pci_get_drvdata(pdev); 7024 struct amdgpu_device *adev = drm_to_adev(dev); 7025 7026 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 7027 7028 /* TODO - dump whatever for debugging purposes */ 7029 7030 /* This called only if amdgpu_pci_error_detected returns 7031 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 7032 * works, no need to reset slot. 7033 */ 7034 7035 return PCI_ERS_RESULT_RECOVERED; 7036 } 7037 7038 /** 7039 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 7040 * @pdev: PCI device struct 7041 * 7042 * Description: This routine is called by the pci error recovery 7043 * code after the PCI slot has been reset, just before we 7044 * should resume normal operations. 
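*
* Return: PCI_ERS_RESULT_RECOVERED if the device was reinitialized successfully, PCI_ERS_RESULT_DISCONNECT otherwise.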
7045 */ 7046 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 7047 { 7048 struct drm_device *dev = pci_get_drvdata(pdev); 7049 struct amdgpu_device *adev = drm_to_adev(dev); 7050 struct amdgpu_reset_context reset_context; 7051 struct amdgpu_device *tmp_adev; 7052 struct amdgpu_hive_info *hive; 7053 struct list_head device_list; 7054 struct pci_dev *link_dev; 7055 int r = 0, i, timeout; 7056 u32 memsize; 7057 u16 status; 7058 7059 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 7060 7061 memset(&reset_context, 0, sizeof(reset_context)); 7062 7063 if (adev->pcie_reset_ctx.swus) 7064 link_dev = adev->pcie_reset_ctx.swus; 7065 else 7066 link_dev = adev->pdev; 7067 /* wait for asic to come out of reset, timeout = 10s */ 7068 timeout = 10000; 7069 do { 7070 usleep_range(10000, 10500); 7071 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7072 timeout -= 10; 7073 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7074 (status != PCI_VENDOR_ID_AMD)); 7075 7076 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7077 r = -ETIME; 7078 goto out; 7079 } 7080 7081 amdgpu_device_load_switch_state(adev); 7082 /* Restore PCI confspace */ 7083 amdgpu_device_load_pci_state(pdev); 7084 7085 /* confirm ASIC came out of reset */ 7086 for (i = 0; i < adev->usec_timeout; i++) { 7087 memsize = amdgpu_asic_get_config_memsize(adev); 7088 7089 if (memsize != 0xffffffff) 7090 break; 7091 udelay(1); 7092 } 7093 if (memsize == 0xffffffff) { 7094 r = -ETIME; 7095 goto out; 7096 } 7097 7098 reset_context.method = AMD_RESET_METHOD_NONE; 7099 reset_context.reset_req_dev = adev; 7100 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7101 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7102 INIT_LIST_HEAD(&device_list); 7103 7104 hive = amdgpu_get_xgmi_hive(adev); 7105 if (hive) { 7106 mutex_lock(&hive->hive_lock); 7107 reset_context.hive = hive; 7108 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7109 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7110 list_add_tail(&tmp_adev->reset_list, &device_list); 7111 } 7112 } else { 7113 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7114 list_add_tail(&adev->reset_list, &device_list); 7115 } 7116 7117 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7118 out: 7119 if (!r) { 7120 if (amdgpu_device_cache_pci_state(adev->pdev)) 7121 pci_restore_state(adev->pdev); 7122 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7123 } else { 7124 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7125 if (hive) { 7126 list_for_each_entry(tmp_adev, &device_list, reset_list) 7127 amdgpu_device_unset_mp1_state(tmp_adev); 7128 } 7129 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7130 } 7131 7132 if (hive) { 7133 mutex_unlock(&hive->hive_lock); 7134 amdgpu_put_xgmi_hive(hive); 7135 } 7136 7137 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7138 } 7139 7140 /** 7141 * amdgpu_pci_resume() - resume normal ops after PCI reset 7142 * @pdev: pointer to PCI device 7143 * 7144 * Called when the error recovery driver tells us that its 7145 * OK to resume normal operation. 
7146 */ 7147 void amdgpu_pci_resume(struct pci_dev *pdev) 7148 { 7149 struct drm_device *dev = pci_get_drvdata(pdev); 7150 struct amdgpu_device *adev = drm_to_adev(dev); 7151 struct list_head device_list; 7152 struct amdgpu_hive_info *hive = NULL; 7153 struct amdgpu_device *tmp_adev = NULL; 7154 7155 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7156 7157 /* Only continue execution for the case of pci_channel_io_frozen */ 7158 if (adev->pci_channel_state != pci_channel_io_frozen) 7159 return; 7160 7161 INIT_LIST_HEAD(&device_list); 7162 7163 hive = amdgpu_get_xgmi_hive(adev); 7164 if (hive) { 7165 mutex_lock(&hive->hive_lock); 7166 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7167 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7168 list_add_tail(&tmp_adev->reset_list, &device_list); 7169 } 7170 } else 7171 list_add_tail(&adev->reset_list, &device_list); 7172 7173 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7174 amdgpu_device_gpu_resume(adev, &device_list, false); 7175 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7176 7177 if (hive) { 7178 mutex_unlock(&hive->hive_lock); 7179 amdgpu_put_xgmi_hive(hive); 7180 } 7181 } 7182 7183 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7184 { 7185 struct pci_dev *swus, *swds; 7186 int r; 7187 7188 swds = pci_upstream_bridge(adev->pdev); 7189 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7190 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7191 return; 7192 swus = pci_upstream_bridge(swds); 7193 if (!swus || 7194 (swus->vendor != PCI_VENDOR_ID_ATI && 7195 swus->vendor != PCI_VENDOR_ID_AMD) || 7196 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7197 return; 7198 7199 /* If already saved, return */ 7200 if (adev->pcie_reset_ctx.swus) 7201 return; 7202 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7203 r = pci_save_state(swds); 7204 if (r) 7205 return; 7206 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7207 7208 r = pci_save_state(swus); 7209 if (r) 7210 return; 7211 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 7212 7213 adev->pcie_reset_ctx.swus = swus; 7214 } 7215 7216 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7217 { 7218 struct pci_dev *pdev; 7219 int r; 7220 7221 if (!adev->pcie_reset_ctx.swds_pcistate || 7222 !adev->pcie_reset_ctx.swus_pcistate) 7223 return; 7224 7225 pdev = adev->pcie_reset_ctx.swus; 7226 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7227 if (!r) { 7228 pci_restore_state(pdev); 7229 } else { 7230 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7231 return; 7232 } 7233 7234 pdev = pci_upstream_bridge(adev->pdev); 7235 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7236 if (!r) 7237 pci_restore_state(pdev); 7238 else 7239 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7240 } 7241 7242 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7243 { 7244 struct drm_device *dev = pci_get_drvdata(pdev); 7245 struct amdgpu_device *adev = drm_to_adev(dev); 7246 int r; 7247 7248 if (amdgpu_sriov_vf(adev)) 7249 return false; 7250 7251 r = pci_save_state(pdev); 7252 if (!r) { 7253 kfree(adev->pci_state); 7254 7255 adev->pci_state = pci_store_saved_state(pdev); 7256 7257 if (!adev->pci_state) { 7258 dev_err(adev->dev, "Failed to store PCI saved state"); 7259 return false; 7260 } 7261 } else { 7262 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7263 return false; 7264 } 7265 7266 
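/* Also cache the config space of any ATI/AMD upstream switch (SWUS/SWDS) above the GPU so it can be restored during link-reset recovery */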
amdgpu_device_cache_switch_state(adev); 7267 7268 return true; 7269 } 7270 7271 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7272 { 7273 struct drm_device *dev = pci_get_drvdata(pdev); 7274 struct amdgpu_device *adev = drm_to_adev(dev); 7275 int r; 7276 7277 if (!adev->pci_state) 7278 return false; 7279 7280 r = pci_load_saved_state(pdev, adev->pci_state); 7281 7282 if (!r) { 7283 pci_restore_state(pdev); 7284 } else { 7285 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7286 return false; 7287 } 7288 7289 return true; 7290 } 7291 7292 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7293 struct amdgpu_ring *ring) 7294 { 7295 #ifdef CONFIG_X86_64 7296 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7297 return; 7298 #endif 7299 if (adev->gmc.xgmi.connected_to_cpu) 7300 return; 7301 7302 if (ring && ring->funcs->emit_hdp_flush) { 7303 amdgpu_ring_emit_hdp_flush(ring); 7304 return; 7305 } 7306 7307 if (!ring && amdgpu_sriov_runtime(adev)) { 7308 if (!amdgpu_kiq_hdp_flush(adev)) 7309 return; 7310 } 7311 7312 amdgpu_hdp_flush(adev, ring); 7313 } 7314 7315 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7316 struct amdgpu_ring *ring) 7317 { 7318 #ifdef CONFIG_X86_64 7319 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7320 return; 7321 #endif 7322 if (adev->gmc.xgmi.connected_to_cpu) 7323 return; 7324 7325 amdgpu_hdp_invalidate(adev, ring); 7326 } 7327 7328 int amdgpu_in_reset(struct amdgpu_device *adev) 7329 { 7330 return atomic_read(&adev->reset_domain->in_gpu_reset); 7331 } 7332 7333 /** 7334 * amdgpu_device_halt() - bring hardware to some kind of halt state 7335 * 7336 * @adev: amdgpu_device pointer 7337 * 7338 * Bring hardware to some kind of halt state so that no one can touch it 7339 * any more. It helps to maintain the error context when an error occurs. 7340 * Compared to a simple hang, the system will stay stable at least for SSH 7341 * access. Then it should be trivial to inspect the hardware state and 7342 * see what's going on. Implemented as follows: 7343 * 7344 * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc), 7345 * clears all CPU mappings to device, disallows remappings through page faults 7346 * 2. amdgpu_irq_disable_all() disables all interrupts 7347 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7348 * 4. set adev->no_hw_access to avoid potential crashes after step 5 7349 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7350 * 6.
pci_disable_device() and pci_wait_for_pending_transaction() 7351 * flush any in flight DMA operations 7352 */ 7353 void amdgpu_device_halt(struct amdgpu_device *adev) 7354 { 7355 struct pci_dev *pdev = adev->pdev; 7356 struct drm_device *ddev = adev_to_drm(adev); 7357 7358 amdgpu_xcp_dev_unplug(adev); 7359 drm_dev_unplug(ddev); 7360 7361 amdgpu_irq_disable_all(adev); 7362 7363 amdgpu_fence_driver_hw_fini(adev); 7364 7365 adev->no_hw_access = true; 7366 7367 amdgpu_device_unmap_mmio(adev); 7368 7369 pci_disable_device(pdev); 7370 pci_wait_for_pending_transaction(pdev); 7371 } 7372 7373 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7374 u32 reg) 7375 { 7376 unsigned long flags, address, data; 7377 u32 r; 7378 7379 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7380 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7381 7382 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7383 WREG32(address, reg * 4); 7384 (void)RREG32(address); 7385 r = RREG32(data); 7386 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7387 return r; 7388 } 7389 7390 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7391 u32 reg, u32 v) 7392 { 7393 unsigned long flags, address, data; 7394 7395 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7396 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7397 7398 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7399 WREG32(address, reg * 4); 7400 (void)RREG32(address); 7401 WREG32(data, v); 7402 (void)RREG32(data); 7403 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7404 } 7405 7406 /** 7407 * amdgpu_device_get_gang - return a reference to the current gang 7408 * @adev: amdgpu_device pointer 7409 * 7410 * Returns: A new reference to the current gang leader. 7411 */ 7412 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7413 { 7414 struct dma_fence *fence; 7415 7416 rcu_read_lock(); 7417 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7418 rcu_read_unlock(); 7419 return fence; 7420 } 7421 7422 /** 7423 * amdgpu_device_switch_gang - switch to a new gang 7424 * @adev: amdgpu_device pointer 7425 * @gang: the gang to switch to 7426 * 7427 * Try to switch to a new gang. 7428 * Returns: NULL if we switched to the new gang or a reference to the current 7429 * gang leader. 7430 */ 7431 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7432 struct dma_fence *gang) 7433 { 7434 struct dma_fence *old = NULL; 7435 7436 dma_fence_get(gang); 7437 do { 7438 dma_fence_put(old); 7439 old = amdgpu_device_get_gang(adev); 7440 if (old == gang) 7441 break; 7442 7443 if (!dma_fence_is_signaled(old)) { 7444 dma_fence_put(gang); 7445 return old; 7446 } 7447 7448 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7449 old, gang) != old); 7450 7451 /* 7452 * Drop it once for the exchanged reference in adev and once for the 7453 * thread local reference acquired in amdgpu_device_get_gang(). 7454 */ 7455 dma_fence_put(old); 7456 dma_fence_put(old); 7457 return NULL; 7458 } 7459 7460 /** 7461 * amdgpu_device_enforce_isolation - enforce HW isolation 7462 * @adev: the amdgpu device pointer 7463 * @ring: the HW ring the job is supposed to run on 7464 * @job: the job which is about to be pushed to the HW ring 7465 * 7466 * Makes sure that only one client at a time can use the GFX block. 7467 * Returns: The dependency to wait on before the job can be pushed to the HW. 7468 * The function is called multiple times until NULL is returned. 
7469 */ 7470 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7471 struct amdgpu_ring *ring, 7472 struct amdgpu_job *job) 7473 { 7474 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7475 struct drm_sched_fence *f = job->base.s_fence; 7476 struct dma_fence *dep; 7477 void *owner; 7478 int r; 7479 7480 /* 7481 * For now enforce isolation only for the GFX block since we only need 7482 * the cleaner shader on those rings. 7483 */ 7484 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7485 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7486 return NULL; 7487 7488 /* 7489 * All submissions where enforce isolation is false are handled as if 7490 * they come from a single client. Use ~0l as the owner to distinct it 7491 * from kernel submissions where the owner is NULL. 7492 */ 7493 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7494 7495 mutex_lock(&adev->enforce_isolation_mutex); 7496 7497 /* 7498 * The "spearhead" submission is the first one which changes the 7499 * ownership to its client. We always need to wait for it to be 7500 * pushed to the HW before proceeding with anything. 7501 */ 7502 if (&f->scheduled != isolation->spearhead && 7503 !dma_fence_is_signaled(isolation->spearhead)) { 7504 dep = isolation->spearhead; 7505 goto out_grab_ref; 7506 } 7507 7508 if (isolation->owner != owner) { 7509 7510 /* 7511 * Wait for any gang to be assembled before switching to a 7512 * different owner or otherwise we could deadlock the 7513 * submissions. 7514 */ 7515 if (!job->gang_submit) { 7516 dep = amdgpu_device_get_gang(adev); 7517 if (!dma_fence_is_signaled(dep)) 7518 goto out_return_dep; 7519 dma_fence_put(dep); 7520 } 7521 7522 dma_fence_put(isolation->spearhead); 7523 isolation->spearhead = dma_fence_get(&f->scheduled); 7524 amdgpu_sync_move(&isolation->active, &isolation->prev); 7525 trace_amdgpu_isolation(isolation->owner, owner); 7526 isolation->owner = owner; 7527 } 7528 7529 /* 7530 * Specifying the ring here helps to pipeline submissions even when 7531 * isolation is enabled. If that is not desired for testing NULL can be 7532 * used instead of the ring to enforce a CPU round trip while switching 7533 * between clients. 
7534 */ 7535 dep = amdgpu_sync_peek_fence(&isolation->prev, ring); 7536 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); 7537 if (r) 7538 dev_warn(adev->dev, "out of memory while tracking isolation\n"); 7539 7540 out_grab_ref: 7541 dma_fence_get(dep); 7542 out_return_dep: 7543 mutex_unlock(&adev->enforce_isolation_mutex); 7544 return dep; 7545 } 7546 7547 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 7548 { 7549 switch (adev->asic_type) { 7550 #ifdef CONFIG_DRM_AMDGPU_SI 7551 case CHIP_HAINAN: 7552 #endif 7553 case CHIP_TOPAZ: 7554 /* chips with no display hardware */ 7555 return false; 7556 #ifdef CONFIG_DRM_AMDGPU_SI 7557 case CHIP_TAHITI: 7558 case CHIP_PITCAIRN: 7559 case CHIP_VERDE: 7560 case CHIP_OLAND: 7561 #endif 7562 #ifdef CONFIG_DRM_AMDGPU_CIK 7563 case CHIP_BONAIRE: 7564 case CHIP_HAWAII: 7565 case CHIP_KAVERI: 7566 case CHIP_KABINI: 7567 case CHIP_MULLINS: 7568 #endif 7569 case CHIP_TONGA: 7570 case CHIP_FIJI: 7571 case CHIP_POLARIS10: 7572 case CHIP_POLARIS11: 7573 case CHIP_POLARIS12: 7574 case CHIP_VEGAM: 7575 case CHIP_CARRIZO: 7576 case CHIP_STONEY: 7577 /* chips with display hardware */ 7578 return true; 7579 default: 7580 /* IP discovery */ 7581 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 7582 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 7583 return false; 7584 return true; 7585 } 7586 } 7587 7588 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7589 uint32_t inst, uint32_t reg_addr, char reg_name[], 7590 uint32_t expected_value, uint32_t mask) 7591 { 7592 uint32_t ret = 0; 7593 uint32_t old_ = 0; 7594 uint32_t tmp_ = RREG32(reg_addr); 7595 uint32_t loop = adev->usec_timeout; 7596 7597 while ((tmp_ & (mask)) != (expected_value)) { 7598 if (old_ != tmp_) { 7599 loop = adev->usec_timeout; 7600 old_ = tmp_; 7601 } else 7602 udelay(1); 7603 tmp_ = RREG32(reg_addr); 7604 loop--; 7605 if (!loop) { 7606 dev_warn( 7607 adev->dev, 7608 "Register(%d) [%s] failed to reach value 0x%08x (current 0x%08x)\n", 7609 inst, reg_name, (uint32_t)expected_value, 7610 (uint32_t)(tmp_ & (mask))); 7611 ret = -ETIMEDOUT; 7612 break; 7613 } 7614 } 7615 return ret; 7616 } 7617 7618 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7619 { 7620 ssize_t size = 0; 7621 7622 if (!ring || !ring->adev) 7623 return size; 7624 7625 if (amdgpu_device_should_recover_gpu(ring->adev)) 7626 size |= AMDGPU_RESET_TYPE_FULL; 7627 7628 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7629 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7630 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7631 7632 return size; 7633 } 7634 7635 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7636 { 7637 ssize_t size = 0; 7638 7639 if (supported_reset == 0) { 7640 size += sysfs_emit_at(buf, size, "unsupported"); 7641 size += sysfs_emit_at(buf, size, "\n"); 7642 return size; 7643 7644 } 7645 7646 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7647 size += sysfs_emit_at(buf, size, "soft "); 7648 7649 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7650 size += sysfs_emit_at(buf, size, "queue "); 7651 7652 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7653 size += sysfs_emit_at(buf, size, "pipe "); 7654 7655 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7656 size += sysfs_emit_at(buf, size, "full "); 7657 7658 size += sysfs_emit_at(buf, size, "\n"); 7659 return size; 7660 } 7661 7662 void amdgpu_device_set_uid(struct amdgpu_uid *uid_info, 7663 enum amdgpu_uid_type type, uint8_t inst, 7664 uint64_t uid) 7665 { 7666 if (!uid_info) 7667
return; 7668 7669 if (type >= AMDGPU_UID_TYPE_MAX) { 7670 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7671 type); 7672 return; 7673 } 7674 7675 if (inst >= AMDGPU_UID_INST_MAX) { 7676 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7677 inst); 7678 return; 7679 } 7680 7681 if (uid_info->uid[type][inst] != 0) { 7682 dev_warn_once( 7683 uid_info->adev->dev, 7684 "Overwriting existing UID %llu for type %d instance %d\n", 7685 uid_info->uid[type][inst], type, inst); 7686 } 7687 7688 uid_info->uid[type][inst] = uid; 7689 } 7690 7691 u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info, 7692 enum amdgpu_uid_type type, uint8_t inst) 7693 { 7694 if (!uid_info) 7695 return 0; 7696 7697 if (type >= AMDGPU_UID_TYPE_MAX) { 7698 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7699 type); 7700 return 0; 7701 } 7702 7703 if (inst >= AMDGPU_UID_INST_MAX) { 7704 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7705 inst); 7706 return 0; 7707 } 7708 7709 return uid_info->uid[type][inst]; 7710 } 7711
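
/*
 * Illustrative sketch only, not part of the driver: shows how the UID helpers
 * above are meant to be paired.  The helper name and the pr_debug() message
 * are made up for this example; a return value of 0 from
 * amdgpu_device_get_uid() means the UID was never set or the type/instance
 * arguments were out of range.
 */
static void __maybe_unused amdgpu_device_uid_example(struct amdgpu_uid *uid_info,
						     enum amdgpu_uid_type type,
						     uint8_t inst, uint64_t uid)
{
	u64 stored;

	/* Record the UID once, e.g. as reported by the platform/firmware. */
	amdgpu_device_set_uid(uid_info, type, inst, uid);

	/* Query it back later; 0 signals "unset" or invalid arguments. */
	stored = amdgpu_device_get_uid(uid_info, type, inst);
	if (stored != uid)
		pr_debug("UID for type %d inst %d was not recorded\n", type, inst);
}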