1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 #include <linux/nospec.h> 40 41 #include <drm/drm_atomic_helper.h> 42 #include <drm/drm_client_event.h> 43 #include <drm/drm_crtc_helper.h> 44 #include <drm/drm_probe_helper.h> 45 #include <drm/amdgpu_drm.h> 46 #include <linux/device.h> 47 #include <linux/vgaarb.h> 48 #include <linux/vga_switcheroo.h> 49 #include <linux/efi.h> 50 #include "amdgpu.h" 51 #include "amdgpu_trace.h" 52 #include "amdgpu_i2c.h" 53 #include "atom.h" 54 #include "amdgpu_atombios.h" 55 #include "amdgpu_atomfirmware.h" 56 #include "amd_pcie.h" 57 #ifdef CONFIG_DRM_AMDGPU_SI 58 #include "si.h" 59 #endif 60 #ifdef CONFIG_DRM_AMDGPU_CIK 61 #include "cik.h" 62 #endif 63 #include "vi.h" 64 #include "soc15.h" 65 #include "nv.h" 66 #include "bif/bif_4_1_d.h" 67 #include <linux/firmware.h> 68 #include "amdgpu_vf_error.h" 69 70 #include "amdgpu_amdkfd.h" 71 #include "amdgpu_pm.h" 72 73 #include "amdgpu_xgmi.h" 74 #include "amdgpu_ras.h" 75 #include "amdgpu_ras_mgr.h" 76 #include "amdgpu_pmu.h" 77 #include "amdgpu_fru_eeprom.h" 78 #include "amdgpu_reset.h" 79 #include "amdgpu_virt.h" 80 #include "amdgpu_dev_coredump.h" 81 82 #include <linux/suspend.h> 83 #include <drm/task_barrier.h> 84 #include <linux/pm_runtime.h> 85 86 #include <drm/drm_drv.h> 87 88 #if IS_ENABLED(CONFIG_X86) 89 #include <asm/intel-family.h> 90 #include <asm/cpu_device_id.h> 91 #endif 92 93 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 98 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 99 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 100 MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin"); 101 102 #define AMDGPU_RESUME_MS 2000 103 #define AMDGPU_MAX_RETRY_LIMIT 2 104 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 105 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 106 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 107 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 108 109 #define AMDGPU_VBIOS_SKIP (1U << 0) 110 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 111 112 static const struct drm_driver amdgpu_kms_driver; 113 114 const char *amdgpu_asic_name[] = { 115 "TAHITI", 116 "PITCAIRN", 117 "VERDE", 118 "OLAND", 119 "HAINAN", 120 "BONAIRE", 121 "KAVERI", 122 "KABINI", 123 "HAWAII", 124 "MULLINS", 125 "TOPAZ", 126 "TONGA", 127 "FIJI", 128 "CARRIZO", 129 "STONEY", 130 "POLARIS10", 131 "POLARIS11", 132 "POLARIS12", 133 "VEGAM", 134 "VEGA10", 135 "VEGA12", 136 "VEGA20", 137 "RAVEN", 138 "ARCTURUS", 139 "RENOIR", 140 "ALDEBARAN", 141 "NAVI10", 142 "CYAN_SKILLFISH", 143 "NAVI14", 144 "NAVI12", 145 "SIENNA_CICHLID", 146 "NAVY_FLOUNDER", 147 "VANGOGH", 148 "DIMGREY_CAVEFISH", 149 "BEIGE_GOBY", 150 "YELLOW_CARP", 151 "IP DISCOVERY", 152 "LAST", 153 }; 154 155 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 156 /* 157 * Default init level where all blocks are expected to be initialized. This is 158 * the level of initialization expected by default and also after a full reset 159 * of the device. 160 */ 161 struct amdgpu_init_level amdgpu_init_default = { 162 .level = AMDGPU_INIT_LEVEL_DEFAULT, 163 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 164 }; 165 166 struct amdgpu_init_level amdgpu_init_recovery = { 167 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 168 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 169 }; 170 171 /* 172 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 173 * is used for cases like reset on initialization where the entire hive needs to 174 * be reset before first use. 175 */ 176 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 177 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 178 .hwini_ip_block_mask = 179 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 180 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 181 BIT(AMD_IP_BLOCK_TYPE_PSP) 182 }; 183 184 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev); 185 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev); 186 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev); 187 188 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); 189 190 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 191 enum amd_ip_block_type block) 192 { 193 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 194 } 195 196 void amdgpu_set_init_level(struct amdgpu_device *adev, 197 enum amdgpu_init_lvl_id lvl) 198 { 199 switch (lvl) { 200 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 201 adev->init_lvl = &amdgpu_init_minimal_xgmi; 202 break; 203 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 204 adev->init_lvl = &amdgpu_init_recovery; 205 break; 206 case AMDGPU_INIT_LEVEL_DEFAULT: 207 fallthrough; 208 default: 209 adev->init_lvl = &amdgpu_init_default; 210 break; 211 } 212 } 213 214 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 215 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 216 void *data); 217 218 /** 219 * DOC: pcie_replay_count 220 * 221 * The amdgpu driver provides a sysfs API for reporting the total number 222 * of PCIe replays (NAKs). 223 * The file pcie_replay_count is used for this and returns the total 224 * number of replays as a sum of the NAKs generated and NAKs received. 225 */ 226 227 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 228 struct device_attribute *attr, char *buf) 229 { 230 struct drm_device *ddev = dev_get_drvdata(dev); 231 struct amdgpu_device *adev = drm_to_adev(ddev); 232 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 233 234 return sysfs_emit(buf, "%llu\n", cnt); 235 } 236 237 static DEVICE_ATTR(pcie_replay_count, 0444, 238 amdgpu_device_get_pcie_replay_count, NULL); 239 240 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 241 { 242 int ret = 0; 243 244 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 245 ret = sysfs_create_file(&adev->dev->kobj, 246 &dev_attr_pcie_replay_count.attr); 247 248 return ret; 249 } 250 251 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 252 { 253 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 254 sysfs_remove_file(&adev->dev->kobj, 255 &dev_attr_pcie_replay_count.attr); 256 } 257 258 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 259 const struct bin_attribute *attr, char *buf, 260 loff_t ppos, size_t count) 261 { 262 struct device *dev = kobj_to_dev(kobj); 263 struct drm_device *ddev = dev_get_drvdata(dev); 264 struct amdgpu_device *adev = drm_to_adev(ddev); 265 ssize_t bytes_read; 266 267 switch (ppos) { 268 case AMDGPU_SYS_REG_STATE_XGMI: 269 bytes_read = amdgpu_asic_get_reg_state( 270 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 271 break; 272 case AMDGPU_SYS_REG_STATE_WAFL: 273 bytes_read = amdgpu_asic_get_reg_state( 274 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 275 break; 276 case AMDGPU_SYS_REG_STATE_PCIE: 277 bytes_read = amdgpu_asic_get_reg_state( 278 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 279 break; 280 case AMDGPU_SYS_REG_STATE_USR: 281 bytes_read = amdgpu_asic_get_reg_state( 282 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 283 break; 284 case AMDGPU_SYS_REG_STATE_USR_1: 285 bytes_read = amdgpu_asic_get_reg_state( 286 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 287 break; 288 default: 289 return -EINVAL; 290 } 291 292 return bytes_read; 293 } 294 295 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 296 AMDGPU_SYS_REG_STATE_END); 297 298 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 299 { 300 int ret; 301 302 if (!amdgpu_asic_get_reg_state_supported(adev)) 303 return 0; 304 305 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 306 307 return ret; 308 } 309 310 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 311 { 312 if (!amdgpu_asic_get_reg_state_supported(adev)) 313 return; 314 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 315 } 316 317 /** 318 * DOC: board_info 319 * 320 * The amdgpu driver provides a sysfs API for giving board related information. 321 * It provides the form factor information in the format 322 * 323 * type : form factor 324 * 325 * Possible form factor values 326 * 327 * - "cem" - PCIE CEM card 328 * - "oam" - Open Compute Accelerator Module 329 * - "unknown" - Not known 330 * 331 */ 332 333 static ssize_t amdgpu_device_get_board_info(struct device *dev, 334 struct device_attribute *attr, 335 char *buf) 336 { 337 struct drm_device *ddev = dev_get_drvdata(dev); 338 struct amdgpu_device *adev = drm_to_adev(ddev); 339 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 340 const char *pkg; 341 342 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 343 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 344 345 switch (pkg_type) { 346 case AMDGPU_PKG_TYPE_CEM: 347 pkg = "cem"; 348 break; 349 case AMDGPU_PKG_TYPE_OAM: 350 pkg = "oam"; 351 break; 352 default: 353 pkg = "unknown"; 354 break; 355 } 356 357 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 358 } 359 360 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 361 362 static struct attribute *amdgpu_board_attrs[] = { 363 &dev_attr_board_info.attr, 364 NULL, 365 }; 366 367 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 368 struct attribute *attr, int n) 369 { 370 struct device *dev = kobj_to_dev(kobj); 371 struct drm_device *ddev = dev_get_drvdata(dev); 372 struct amdgpu_device *adev = drm_to_adev(ddev); 373 374 if (adev->flags & AMD_IS_APU) 375 return 0; 376 377 return attr->mode; 378 } 379 380 static const struct attribute_group amdgpu_board_attrs_group = { 381 .attrs = amdgpu_board_attrs, 382 .is_visible = amdgpu_board_attrs_is_visible 383 }; 384 385 /** 386 * DOC: uma/carveout_options 387 * 388 * This is a read-only file that lists all available UMA allocation 389 * options and their corresponding indices. Example output:: 390 * 391 * $ cat uma/carveout_options 392 * 0: Minimum (512 MB) 393 * 1: (1 GB) 394 * 2: (2 GB) 395 * 3: (4 GB) 396 * 4: (6 GB) 397 * 5: (8 GB) 398 * 6: (12 GB) 399 * 7: Medium (16 GB) 400 * 8: (24 GB) 401 * 9: High (32 GB) 402 */ 403 static ssize_t carveout_options_show(struct device *dev, 404 struct device_attribute *attr, 405 char *buf) 406 { 407 struct drm_device *ddev = dev_get_drvdata(dev); 408 struct amdgpu_device *adev = drm_to_adev(ddev); 409 struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info; 410 uint32_t memory_carved; 411 ssize_t size = 0; 412 413 if (!uma_info || !uma_info->num_entries) 414 return -ENODEV; 415 416 for (int i = 0; i < uma_info->num_entries; i++) { 417 memory_carved = uma_info->entries[i].memory_carved_mb; 418 if (memory_carved >= SZ_1G/SZ_1M) { 419 size += sysfs_emit_at(buf, size, "%d: %s (%u GB)\n", 420 i, 421 uma_info->entries[i].name, 422 memory_carved >> 10); 423 } else { 424 size += sysfs_emit_at(buf, size, "%d: %s (%u MB)\n", 425 i, 426 uma_info->entries[i].name, 427 memory_carved); 428 } 429 } 430 431 return size; 432 } 433 static DEVICE_ATTR_RO(carveout_options); 434 435 /** 436 * DOC: uma/carveout 437 * 438 * This file is both readable and writable. When read, it shows the 439 * index of the current setting. Writing a valid index to this file 440 * allows users to change the UMA carveout size to the selected option 441 * on the next boot. 442 * 443 * The available options and their corresponding indices can be read 444 * from the uma/carveout_options file. 445 */ 446 static ssize_t carveout_show(struct device *dev, 447 struct device_attribute *attr, 448 char *buf) 449 { 450 struct drm_device *ddev = dev_get_drvdata(dev); 451 struct amdgpu_device *adev = drm_to_adev(ddev); 452 453 return sysfs_emit(buf, "%u\n", adev->uma_info.uma_option_index); 454 } 455 456 static ssize_t carveout_store(struct device *dev, 457 struct device_attribute *attr, 458 const char *buf, size_t count) 459 { 460 struct drm_device *ddev = dev_get_drvdata(dev); 461 struct amdgpu_device *adev = drm_to_adev(ddev); 462 struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info; 463 struct amdgpu_uma_carveout_option *opt; 464 unsigned long val; 465 uint8_t flags; 466 int r; 467 468 r = kstrtoul(buf, 10, &val); 469 if (r) 470 return r; 471 472 if (val >= uma_info->num_entries) 473 return -EINVAL; 474 475 val = array_index_nospec(val, uma_info->num_entries); 476 opt = &uma_info->entries[val]; 477 478 if (!(opt->flags & AMDGPU_UMA_FLAG_AUTO) && 479 !(opt->flags & AMDGPU_UMA_FLAG_CUSTOM)) { 480 drm_err_once(ddev, "Option %lu not supported due to lack of Custom/Auto flag", val); 481 return -EINVAL; 482 } 483 484 flags = opt->flags; 485 flags &= ~((flags & AMDGPU_UMA_FLAG_AUTO) >> 1); 486 487 guard(mutex)(&uma_info->update_lock); 488 489 r = amdgpu_acpi_set_uma_allocation_size(adev, val, flags); 490 if (r) 491 return r; 492 493 uma_info->uma_option_index = val; 494 495 return count; 496 } 497 static DEVICE_ATTR_RW(carveout); 498 499 static struct attribute *amdgpu_uma_attrs[] = { 500 &dev_attr_carveout.attr, 501 &dev_attr_carveout_options.attr, 502 NULL 503 }; 504 505 const struct attribute_group amdgpu_uma_attr_group = { 506 .name = "uma", 507 .attrs = amdgpu_uma_attrs 508 }; 509 510 static void amdgpu_uma_sysfs_init(struct amdgpu_device *adev) 511 { 512 int rc; 513 514 if (!(adev->flags & AMD_IS_APU)) 515 return; 516 517 if (!amdgpu_acpi_is_set_uma_allocation_size_supported()) 518 return; 519 520 rc = amdgpu_atomfirmware_get_uma_carveout_info(adev, &adev->uma_info); 521 if (rc) { 522 drm_dbg(adev_to_drm(adev), 523 "Failed to parse UMA carveout info from VBIOS: %d\n", rc); 524 goto out_info; 525 } 526 527 mutex_init(&adev->uma_info.update_lock); 528 529 rc = devm_device_add_group(adev->dev, &amdgpu_uma_attr_group); 530 if (rc) { 531 drm_dbg(adev_to_drm(adev), "Failed to add UMA carveout sysfs interfaces %d\n", rc); 532 goto out_attr; 533 } 534 535 return; 536 537 out_attr: 538 mutex_destroy(&adev->uma_info.update_lock); 539 out_info: 540 return; 541 } 542 543 static void amdgpu_uma_sysfs_fini(struct amdgpu_device *adev) 544 { 545 struct amdgpu_uma_carveout_info *uma_info = &adev->uma_info; 546 547 if (!amdgpu_acpi_is_set_uma_allocation_size_supported()) 548 return; 549 550 mutex_destroy(&uma_info->update_lock); 551 uma_info->num_entries = 0; 552 } 553 554 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 555 556 /** 557 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 558 * 559 * @adev: amdgpu device pointer 560 * 561 * Returns true if the device is a dGPU with ATPX power control, 562 * otherwise return false. 563 */ 564 bool amdgpu_device_supports_px(struct amdgpu_device *adev) 565 { 566 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 567 return true; 568 return false; 569 } 570 571 /** 572 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 573 * 574 * @adev: amdgpu device pointer 575 * 576 * Returns true if the device is a dGPU with ACPI power control, 577 * otherwise return false. 578 */ 579 bool amdgpu_device_supports_boco(struct amdgpu_device *adev) 580 { 581 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 582 return false; 583 584 if (adev->has_pr3 || 585 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 586 return true; 587 return false; 588 } 589 590 /** 591 * amdgpu_device_supports_baco - Does the device support BACO 592 * 593 * @adev: amdgpu device pointer 594 * 595 * Return: 596 * 1 if the device supports BACO; 597 * 3 if the device supports MACO (only works if BACO is supported) 598 * otherwise return 0. 599 */ 600 int amdgpu_device_supports_baco(struct amdgpu_device *adev) 601 { 602 return amdgpu_asic_supports_baco(adev); 603 } 604 605 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) 606 { 607 int bamaco_support; 608 609 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; 610 bamaco_support = amdgpu_device_supports_baco(adev); 611 612 switch (amdgpu_runtime_pm) { 613 case 2: 614 if (bamaco_support & MACO_SUPPORT) { 615 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 616 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n"); 617 } else if (bamaco_support == BACO_SUPPORT) { 618 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 619 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n"); 620 } 621 break; 622 case 1: 623 if (bamaco_support & BACO_SUPPORT) { 624 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 625 dev_info(adev->dev, "Forcing BACO for runtime pm\n"); 626 } 627 break; 628 case -1: 629 case -2: 630 if (amdgpu_device_supports_px(adev)) { 631 /* enable PX as runtime mode */ 632 adev->pm.rpm_mode = AMDGPU_RUNPM_PX; 633 dev_info(adev->dev, "Using ATPX for runtime pm\n"); 634 } else if (amdgpu_device_supports_boco(adev)) { 635 /* enable boco as runtime mode */ 636 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; 637 dev_info(adev->dev, "Using BOCO for runtime pm\n"); 638 } else { 639 if (!bamaco_support) 640 goto no_runtime_pm; 641 642 switch (adev->asic_type) { 643 case CHIP_VEGA20: 644 case CHIP_ARCTURUS: 645 /* BACO are not supported on vega20 and arctrus */ 646 break; 647 case CHIP_VEGA10: 648 /* enable BACO as runpm mode if noretry=0 */ 649 if (!adev->gmc.noretry && !amdgpu_passthrough(adev)) 650 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 651 break; 652 default: 653 /* enable BACO as runpm mode on CI+ */ 654 if (!amdgpu_passthrough(adev)) 655 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 656 break; 657 } 658 659 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { 660 if (bamaco_support & MACO_SUPPORT) { 661 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 662 dev_info(adev->dev, "Using BAMACO for runtime pm\n"); 663 } else { 664 dev_info(adev->dev, "Using BACO for runtime pm\n"); 665 } 666 } 667 } 668 break; 669 case 0: 670 dev_info(adev->dev, "runtime pm is manually disabled\n"); 671 break; 672 default: 673 break; 674 } 675 676 no_runtime_pm: 677 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) 678 dev_info(adev->dev, "Runtime PM not available\n"); 679 } 680 /** 681 * amdgpu_device_supports_smart_shift - Is the device dGPU with 682 * smart shift support 683 * 684 * @adev: amdgpu device pointer 685 * 686 * Returns true if the device is a dGPU with Smart Shift support, 687 * otherwise returns false. 688 */ 689 bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev) 690 { 691 return (amdgpu_device_supports_boco(adev) && 692 amdgpu_acpi_is_power_shift_control_supported()); 693 } 694 695 /* 696 * VRAM access helper functions 697 */ 698 699 /** 700 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 701 * 702 * @adev: amdgpu_device pointer 703 * @pos: offset of the buffer in vram 704 * @buf: virtual address of the buffer in system memory 705 * @size: read/write size, sizeof(@buf) must > @size 706 * @write: true - write to vram, otherwise - read from vram 707 */ 708 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 709 void *buf, size_t size, bool write) 710 { 711 unsigned long flags; 712 uint32_t hi = ~0, tmp = 0; 713 uint32_t *data = buf; 714 uint64_t last; 715 int idx; 716 717 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 718 return; 719 720 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 721 722 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 723 for (last = pos + size; pos < last; pos += 4) { 724 tmp = pos >> 31; 725 726 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 727 if (tmp != hi) { 728 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 729 hi = tmp; 730 } 731 if (write) 732 WREG32_NO_KIQ(mmMM_DATA, *data++); 733 else 734 *data++ = RREG32_NO_KIQ(mmMM_DATA); 735 } 736 737 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 738 drm_dev_exit(idx); 739 } 740 741 /** 742 * amdgpu_device_aper_access - access vram by vram aperture 743 * 744 * @adev: amdgpu_device pointer 745 * @pos: offset of the buffer in vram 746 * @buf: virtual address of the buffer in system memory 747 * @size: read/write size, sizeof(@buf) must > @size 748 * @write: true - write to vram, otherwise - read from vram 749 * 750 * The return value means how many bytes have been transferred. 751 */ 752 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 753 void *buf, size_t size, bool write) 754 { 755 #ifdef CONFIG_64BIT 756 void __iomem *addr; 757 size_t count = 0; 758 uint64_t last; 759 760 if (!adev->mman.aper_base_kaddr) 761 return 0; 762 763 last = min(pos + size, adev->gmc.visible_vram_size); 764 if (last > pos) { 765 addr = adev->mman.aper_base_kaddr + pos; 766 count = last - pos; 767 768 if (write) { 769 memcpy_toio(addr, buf, count); 770 /* Make sure HDP write cache flush happens without any reordering 771 * after the system memory contents are sent over PCIe device 772 */ 773 mb(); 774 amdgpu_device_flush_hdp(adev, NULL); 775 } else { 776 amdgpu_device_invalidate_hdp(adev, NULL); 777 /* Make sure HDP read cache is invalidated before issuing a read 778 * to the PCIe device 779 */ 780 mb(); 781 memcpy_fromio(buf, addr, count); 782 } 783 784 } 785 786 return count; 787 #else 788 return 0; 789 #endif 790 } 791 792 /** 793 * amdgpu_device_vram_access - read/write a buffer in vram 794 * 795 * @adev: amdgpu_device pointer 796 * @pos: offset of the buffer in vram 797 * @buf: virtual address of the buffer in system memory 798 * @size: read/write size, sizeof(@buf) must > @size 799 * @write: true - write to vram, otherwise - read from vram 800 */ 801 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 802 void *buf, size_t size, bool write) 803 { 804 size_t count; 805 806 /* try to using vram apreature to access vram first */ 807 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 808 size -= count; 809 if (size) { 810 /* using MM to access rest vram */ 811 pos += count; 812 buf += count; 813 amdgpu_device_mm_access(adev, pos, buf, size, write); 814 } 815 } 816 817 /* 818 * register access helper functions. 819 */ 820 821 /* Check if hw access should be skipped because of hotplug or device error */ 822 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 823 { 824 if (adev->no_hw_access) 825 return true; 826 827 #ifdef CONFIG_LOCKDEP 828 /* 829 * This is a bit complicated to understand, so worth a comment. What we assert 830 * here is that the GPU reset is not running on another thread in parallel. 831 * 832 * For this we trylock the read side of the reset semaphore, if that succeeds 833 * we know that the reset is not running in parallel. 834 * 835 * If the trylock fails we assert that we are either already holding the read 836 * side of the lock or are the reset thread itself and hold the write side of 837 * the lock. 838 */ 839 if (in_task()) { 840 if (down_read_trylock(&adev->reset_domain->sem)) 841 up_read(&adev->reset_domain->sem); 842 else 843 lockdep_assert_held(&adev->reset_domain->sem); 844 } 845 #endif 846 return false; 847 } 848 849 /** 850 * amdgpu_device_get_rev_id - query device rev_id 851 * 852 * @adev: amdgpu_device pointer 853 * 854 * Return device rev_id 855 */ 856 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 857 { 858 return adev->nbio.funcs->get_rev_id(adev); 859 } 860 861 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev) 862 { 863 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 864 return AMDGPU_VBIOS_SKIP; 865 866 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev)) 867 return AMDGPU_VBIOS_OPTIONAL; 868 869 return 0; 870 } 871 872 /** 873 * amdgpu_device_asic_init - Wrapper for atom asic_init 874 * 875 * @adev: amdgpu_device pointer 876 * 877 * Does any asic specific work and then calls atom asic init. 878 */ 879 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 880 { 881 uint32_t flags; 882 bool optional; 883 int ret; 884 885 amdgpu_asic_pre_asic_init(adev); 886 flags = amdgpu_device_get_vbios_flags(adev); 887 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 888 889 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 890 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 891 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 892 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 893 amdgpu_psp_wait_for_bootloader(adev); 894 if (optional && !adev->bios) 895 return 0; 896 897 ret = amdgpu_atomfirmware_asic_init(adev, true); 898 return ret; 899 } else { 900 if (optional && !adev->bios) 901 return 0; 902 903 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 904 } 905 906 return 0; 907 } 908 909 /** 910 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 911 * 912 * @adev: amdgpu_device pointer 913 * 914 * Allocates a scratch page of VRAM for use by various things in the 915 * driver. 916 */ 917 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 918 { 919 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 920 AMDGPU_GEM_DOMAIN_VRAM | 921 AMDGPU_GEM_DOMAIN_GTT, 922 &adev->mem_scratch.robj, 923 &adev->mem_scratch.gpu_addr, 924 (void **)&adev->mem_scratch.ptr); 925 } 926 927 /** 928 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 929 * 930 * @adev: amdgpu_device pointer 931 * 932 * Frees the VRAM scratch page. 933 */ 934 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 935 { 936 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 937 } 938 939 /** 940 * amdgpu_device_program_register_sequence - program an array of registers. 941 * 942 * @adev: amdgpu_device pointer 943 * @registers: pointer to the register array 944 * @array_size: size of the register array 945 * 946 * Programs an array or registers with and or masks. 947 * This is a helper for setting golden registers. 948 */ 949 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 950 const u32 *registers, 951 const u32 array_size) 952 { 953 u32 tmp, reg, and_mask, or_mask; 954 int i; 955 956 if (array_size % 3) 957 return; 958 959 for (i = 0; i < array_size; i += 3) { 960 reg = registers[i + 0]; 961 and_mask = registers[i + 1]; 962 or_mask = registers[i + 2]; 963 964 if (and_mask == 0xffffffff) { 965 tmp = or_mask; 966 } else { 967 tmp = RREG32(reg); 968 tmp &= ~and_mask; 969 if (adev->family >= AMDGPU_FAMILY_AI) 970 tmp |= (or_mask & and_mask); 971 else 972 tmp |= or_mask; 973 } 974 WREG32(reg, tmp); 975 } 976 } 977 978 /** 979 * amdgpu_device_pci_config_reset - reset the GPU 980 * 981 * @adev: amdgpu_device pointer 982 * 983 * Resets the GPU using the pci config reset sequence. 984 * Only applicable to asics prior to vega10. 985 */ 986 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 987 { 988 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 989 } 990 991 /** 992 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 993 * 994 * @adev: amdgpu_device pointer 995 * 996 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 997 */ 998 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 999 { 1000 return pci_reset_function(adev->pdev); 1001 } 1002 1003 /* 1004 * amdgpu_device_wb_*() 1005 * Writeback is the method by which the GPU updates special pages in memory 1006 * with the status of certain GPU events (fences, ring pointers,etc.). 1007 */ 1008 1009 /** 1010 * amdgpu_device_wb_fini - Disable Writeback and free memory 1011 * 1012 * @adev: amdgpu_device pointer 1013 * 1014 * Disables Writeback and frees the Writeback memory (all asics). 1015 * Used at driver shutdown. 1016 */ 1017 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1018 { 1019 if (adev->wb.wb_obj) { 1020 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1021 &adev->wb.gpu_addr, 1022 (void **)&adev->wb.wb); 1023 adev->wb.wb_obj = NULL; 1024 } 1025 } 1026 1027 /** 1028 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1029 * 1030 * @adev: amdgpu_device pointer 1031 * 1032 * Initializes writeback and allocates writeback memory (all asics). 1033 * Used at driver startup. 1034 * Returns 0 on success or an -error on failure. 1035 */ 1036 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1037 { 1038 int r; 1039 1040 if (adev->wb.wb_obj == NULL) { 1041 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1042 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1043 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1044 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1045 (void **)&adev->wb.wb); 1046 if (r) { 1047 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1048 return r; 1049 } 1050 1051 adev->wb.num_wb = AMDGPU_MAX_WB; 1052 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1053 1054 /* clear wb memory */ 1055 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1056 } 1057 1058 return 0; 1059 } 1060 1061 /** 1062 * amdgpu_device_wb_get - Allocate a wb entry 1063 * 1064 * @adev: amdgpu_device pointer 1065 * @wb: wb index 1066 * 1067 * Allocate a wb slot for use by the driver (all asics). 1068 * Returns 0 on success or -EINVAL on failure. 1069 */ 1070 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1071 { 1072 unsigned long flags, offset; 1073 1074 spin_lock_irqsave(&adev->wb.lock, flags); 1075 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1076 if (offset < adev->wb.num_wb) { 1077 __set_bit(offset, adev->wb.used); 1078 spin_unlock_irqrestore(&adev->wb.lock, flags); 1079 *wb = offset << 3; /* convert to dw offset */ 1080 return 0; 1081 } else { 1082 spin_unlock_irqrestore(&adev->wb.lock, flags); 1083 return -EINVAL; 1084 } 1085 } 1086 1087 /** 1088 * amdgpu_device_wb_free - Free a wb entry 1089 * 1090 * @adev: amdgpu_device pointer 1091 * @wb: wb index 1092 * 1093 * Free a wb slot allocated for use by the driver (all asics) 1094 */ 1095 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1096 { 1097 unsigned long flags; 1098 1099 wb >>= 3; 1100 spin_lock_irqsave(&adev->wb.lock, flags); 1101 if (wb < adev->wb.num_wb) 1102 __clear_bit(wb, adev->wb.used); 1103 spin_unlock_irqrestore(&adev->wb.lock, flags); 1104 } 1105 1106 /** 1107 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1108 * 1109 * @adev: amdgpu_device pointer 1110 * 1111 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1112 * to fail, but if any of the BARs is not accessible after the size we abort 1113 * driver loading by returning -ENODEV. 1114 */ 1115 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1116 { 1117 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1118 struct pci_bus *root; 1119 struct resource *res; 1120 int max_size, r; 1121 unsigned int i; 1122 u16 cmd; 1123 1124 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1125 return 0; 1126 1127 /* Bypass for VF */ 1128 if (amdgpu_sriov_vf(adev)) 1129 return 0; 1130 1131 if (!amdgpu_rebar) 1132 return 0; 1133 1134 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1135 if ((amdgpu_runtime_pm != 0) && 1136 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1137 adev->pdev->device == 0x731f && 1138 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1139 return 0; 1140 1141 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1142 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1143 dev_warn( 1144 adev->dev, 1145 "System can't access extended configuration space, please check!!\n"); 1146 1147 /* skip if the bios has already enabled large BAR */ 1148 if (adev->gmc.real_vram_size && 1149 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1150 return 0; 1151 1152 /* Check if the root BUS has 64bit memory resources */ 1153 root = adev->pdev->bus; 1154 while (root->parent) 1155 root = root->parent; 1156 1157 pci_bus_for_each_resource(root, res, i) { 1158 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1159 res->start > 0x100000000ull) 1160 break; 1161 } 1162 1163 /* Trying to resize is pointless without a root hub window above 4GB */ 1164 if (!res) 1165 return 0; 1166 1167 /* Limit the BAR size to what is available */ 1168 max_size = pci_rebar_get_max_size(adev->pdev, 0); 1169 if (max_size < 0) 1170 return 0; 1171 rbar_size = min(max_size, rbar_size); 1172 1173 /* Disable memory decoding while we change the BAR addresses and size */ 1174 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1175 pci_write_config_word(adev->pdev, PCI_COMMAND, 1176 cmd & ~PCI_COMMAND_MEMORY); 1177 1178 /* Tear down doorbell as resizing will release BARs */ 1179 amdgpu_doorbell_fini(adev); 1180 1181 r = pci_resize_resource(adev->pdev, 0, rbar_size, 1182 (adev->asic_type >= CHIP_BONAIRE) ? 1 << 5 1183 : 1 << 2); 1184 if (r == -ENOSPC) 1185 dev_info(adev->dev, 1186 "Not enough PCI address space for a large BAR."); 1187 else if (r && r != -ENOTSUPP) 1188 dev_err(adev->dev, "Problem resizing BAR0 (%d).", r); 1189 1190 /* When the doorbell or fb BAR isn't available we have no chance of 1191 * using the device. 1192 */ 1193 r = amdgpu_doorbell_init(adev); 1194 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1195 return -ENODEV; 1196 1197 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1198 1199 return 0; 1200 } 1201 1202 /* 1203 * GPU helpers function. 1204 */ 1205 /** 1206 * amdgpu_device_need_post - check if the hw need post or not 1207 * 1208 * @adev: amdgpu_device pointer 1209 * 1210 * Check if the asic has been initialized (all asics) at driver startup 1211 * or post is needed if hw reset is performed. 1212 * Returns true if need or false if not. 1213 */ 1214 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1215 { 1216 uint32_t reg, flags; 1217 1218 if (amdgpu_sriov_vf(adev)) 1219 return false; 1220 1221 flags = amdgpu_device_get_vbios_flags(adev); 1222 if (flags & AMDGPU_VBIOS_SKIP) 1223 return false; 1224 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1225 return false; 1226 1227 if (amdgpu_passthrough(adev)) { 1228 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1229 * some old smc fw still need driver do vPost otherwise gpu hang, while 1230 * those smc fw version above 22.15 doesn't have this flaw, so we force 1231 * vpost executed for smc version below 22.15 1232 */ 1233 if (adev->asic_type == CHIP_FIJI) { 1234 int err; 1235 uint32_t fw_ver; 1236 1237 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1238 /* force vPost if error occurred */ 1239 if (err) 1240 return true; 1241 1242 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1243 release_firmware(adev->pm.fw); 1244 if (fw_ver < 0x00160e00) 1245 return true; 1246 } 1247 } 1248 1249 /* Don't post if we need to reset whole hive on init */ 1250 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1251 return false; 1252 1253 if (adev->has_hw_reset) { 1254 adev->has_hw_reset = false; 1255 return true; 1256 } 1257 1258 /* bios scratch used on CIK+ */ 1259 if (adev->asic_type >= CHIP_BONAIRE) 1260 return amdgpu_atombios_scratch_need_asic_init(adev); 1261 1262 /* check MEM_SIZE for older asics */ 1263 reg = amdgpu_asic_get_config_memsize(adev); 1264 1265 if ((reg != 0) && (reg != 0xffffffff)) 1266 return false; 1267 1268 return true; 1269 } 1270 1271 /* 1272 * Check whether seamless boot is supported. 1273 * 1274 * So far we only support seamless boot on DCE 3.0 or later. 1275 * If users report that it works on older ASICS as well, we may 1276 * loosen this. 1277 */ 1278 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1279 { 1280 switch (amdgpu_seamless) { 1281 case -1: 1282 break; 1283 case 1: 1284 return true; 1285 case 0: 1286 return false; 1287 default: 1288 dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n", 1289 amdgpu_seamless); 1290 return false; 1291 } 1292 1293 if (!(adev->flags & AMD_IS_APU)) 1294 return false; 1295 1296 if (adev->mman.keep_stolen_vga_memory) 1297 return false; 1298 1299 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1300 } 1301 1302 /* 1303 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1304 * don't support dynamic speed switching. Until we have confirmation from Intel 1305 * that a specific host supports it, it's safer that we keep it disabled for all. 1306 * 1307 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1308 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1309 */ 1310 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1311 { 1312 #if IS_ENABLED(CONFIG_X86) 1313 struct cpuinfo_x86 *c = &cpu_data(0); 1314 1315 /* eGPU change speeds based on USB4 fabric conditions */ 1316 if (dev_is_removable(adev->dev)) 1317 return true; 1318 1319 if (c->x86_vendor == X86_VENDOR_INTEL) 1320 return false; 1321 #endif 1322 return true; 1323 } 1324 1325 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) 1326 { 1327 /* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4. 1328 * It's unclear if this is a platform-specific or GPU-specific issue. 1329 * Disable ASPM on SI for the time being. 1330 */ 1331 if (adev->family == AMDGPU_FAMILY_SI) 1332 return true; 1333 1334 #if IS_ENABLED(CONFIG_X86) 1335 struct cpuinfo_x86 *c = &cpu_data(0); 1336 1337 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1338 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1339 return false; 1340 1341 if (c->x86 == 6 && 1342 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1343 switch (c->x86_model) { 1344 case VFM_MODEL(INTEL_ALDERLAKE): 1345 case VFM_MODEL(INTEL_ALDERLAKE_L): 1346 case VFM_MODEL(INTEL_RAPTORLAKE): 1347 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1348 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1349 return true; 1350 default: 1351 return false; 1352 } 1353 } else { 1354 return false; 1355 } 1356 #else 1357 return false; 1358 #endif 1359 } 1360 1361 /** 1362 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1363 * 1364 * @adev: amdgpu_device pointer 1365 * 1366 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1367 * be set for this device. 1368 * 1369 * Returns true if it should be used or false if not. 1370 */ 1371 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1372 { 1373 switch (amdgpu_aspm) { 1374 case -1: 1375 break; 1376 case 0: 1377 return false; 1378 case 1: 1379 return true; 1380 default: 1381 return false; 1382 } 1383 if (adev->flags & AMD_IS_APU) 1384 return false; 1385 if (amdgpu_device_aspm_support_quirk(adev)) 1386 return false; 1387 return pcie_aspm_enabled(adev->pdev); 1388 } 1389 1390 /* if we get transitioned to only one device, take VGA back */ 1391 /** 1392 * amdgpu_device_vga_set_decode - enable/disable vga decode 1393 * 1394 * @pdev: PCI device pointer 1395 * @state: enable/disable vga decode 1396 * 1397 * Enable/disable vga decode (all asics). 1398 * Returns VGA resource flags. 1399 */ 1400 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1401 bool state) 1402 { 1403 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1404 1405 amdgpu_asic_set_vga_state(adev, state); 1406 if (state) 1407 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1408 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1409 else 1410 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1411 } 1412 1413 /** 1414 * amdgpu_device_check_block_size - validate the vm block size 1415 * 1416 * @adev: amdgpu_device pointer 1417 * 1418 * Validates the vm block size specified via module parameter. 1419 * The vm block size defines number of bits in page table versus page directory, 1420 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1421 * page table and the remaining bits are in the page directory. 1422 */ 1423 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1424 { 1425 /* defines number of bits in page table versus page directory, 1426 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1427 * page table and the remaining bits are in the page directory 1428 */ 1429 if (amdgpu_vm_block_size == -1) 1430 return; 1431 1432 if (amdgpu_vm_block_size < 9) { 1433 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1434 amdgpu_vm_block_size); 1435 amdgpu_vm_block_size = -1; 1436 } 1437 } 1438 1439 /** 1440 * amdgpu_device_check_vm_size - validate the vm size 1441 * 1442 * @adev: amdgpu_device pointer 1443 * 1444 * Validates the vm size in GB specified via module parameter. 1445 * The VM size is the size of the GPU virtual memory space in GB. 1446 */ 1447 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1448 { 1449 /* no need to check the default value */ 1450 if (amdgpu_vm_size == -1) 1451 return; 1452 1453 if (amdgpu_vm_size < 1) { 1454 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1455 amdgpu_vm_size); 1456 amdgpu_vm_size = -1; 1457 } 1458 } 1459 1460 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1461 { 1462 struct sysinfo si; 1463 bool is_os_64 = (sizeof(void *) == 8); 1464 uint64_t total_memory; 1465 uint64_t dram_size_seven_GB = 0x1B8000000; 1466 uint64_t dram_size_three_GB = 0xB8000000; 1467 1468 if (amdgpu_smu_memory_pool_size == 0) 1469 return; 1470 1471 if (!is_os_64) { 1472 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 1473 goto def_value; 1474 } 1475 si_meminfo(&si); 1476 total_memory = (uint64_t)si.totalram * si.mem_unit; 1477 1478 if ((amdgpu_smu_memory_pool_size == 1) || 1479 (amdgpu_smu_memory_pool_size == 2)) { 1480 if (total_memory < dram_size_three_GB) 1481 goto def_value1; 1482 } else if ((amdgpu_smu_memory_pool_size == 4) || 1483 (amdgpu_smu_memory_pool_size == 8)) { 1484 if (total_memory < dram_size_seven_GB) 1485 goto def_value1; 1486 } else { 1487 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 1488 goto def_value; 1489 } 1490 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1491 1492 return; 1493 1494 def_value1: 1495 dev_warn(adev->dev, "No enough system memory\n"); 1496 def_value: 1497 adev->pm.smu_prv_buffer_size = 0; 1498 } 1499 1500 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1501 { 1502 if (!(adev->flags & AMD_IS_APU) || 1503 adev->asic_type < CHIP_RAVEN) 1504 return 0; 1505 1506 switch (adev->asic_type) { 1507 case CHIP_RAVEN: 1508 if (adev->pdev->device == 0x15dd) 1509 adev->apu_flags |= AMD_APU_IS_RAVEN; 1510 if (adev->pdev->device == 0x15d8) 1511 adev->apu_flags |= AMD_APU_IS_PICASSO; 1512 break; 1513 case CHIP_RENOIR: 1514 if ((adev->pdev->device == 0x1636) || 1515 (adev->pdev->device == 0x164c)) 1516 adev->apu_flags |= AMD_APU_IS_RENOIR; 1517 else 1518 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1519 break; 1520 case CHIP_VANGOGH: 1521 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1522 break; 1523 case CHIP_YELLOW_CARP: 1524 break; 1525 case CHIP_CYAN_SKILLFISH: 1526 if ((adev->pdev->device == 0x13FE) || 1527 (adev->pdev->device == 0x143F)) 1528 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1529 break; 1530 default: 1531 break; 1532 } 1533 1534 return 0; 1535 } 1536 1537 /** 1538 * amdgpu_device_check_arguments - validate module params 1539 * 1540 * @adev: amdgpu_device pointer 1541 * 1542 * Validates certain module parameters and updates 1543 * the associated values used by the driver (all asics). 1544 */ 1545 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1546 { 1547 int i; 1548 1549 if (amdgpu_sched_jobs < 4) { 1550 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1551 amdgpu_sched_jobs); 1552 amdgpu_sched_jobs = 4; 1553 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1554 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1555 amdgpu_sched_jobs); 1556 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1557 } 1558 1559 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1560 /* gart size must be greater or equal to 32M */ 1561 dev_warn(adev->dev, "gart size (%d) too small\n", 1562 amdgpu_gart_size); 1563 amdgpu_gart_size = -1; 1564 } 1565 1566 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1567 /* gtt size must be greater or equal to 32M */ 1568 dev_warn(adev->dev, "gtt size (%d) too small\n", 1569 amdgpu_gtt_size); 1570 amdgpu_gtt_size = -1; 1571 } 1572 1573 /* valid range is between 4 and 9 inclusive */ 1574 if (amdgpu_vm_fragment_size != -1 && 1575 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1576 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1577 amdgpu_vm_fragment_size = -1; 1578 } 1579 1580 if (amdgpu_sched_hw_submission < 2) { 1581 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1582 amdgpu_sched_hw_submission); 1583 amdgpu_sched_hw_submission = 2; 1584 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1585 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1586 amdgpu_sched_hw_submission); 1587 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1588 } 1589 1590 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1591 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1592 amdgpu_reset_method = -1; 1593 } 1594 1595 amdgpu_device_check_smu_prv_buffer_size(adev); 1596 1597 amdgpu_device_check_vm_size(adev); 1598 1599 amdgpu_device_check_block_size(adev); 1600 1601 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1602 1603 for (i = 0; i < MAX_XCP; i++) { 1604 switch (amdgpu_enforce_isolation) { 1605 case -1: 1606 case 0: 1607 default: 1608 /* disable */ 1609 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 1610 break; 1611 case 1: 1612 /* enable */ 1613 adev->enforce_isolation[i] = 1614 AMDGPU_ENFORCE_ISOLATION_ENABLE; 1615 break; 1616 case 2: 1617 /* enable legacy mode */ 1618 adev->enforce_isolation[i] = 1619 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 1620 break; 1621 case 3: 1622 /* enable only process isolation without submitting cleaner shader */ 1623 adev->enforce_isolation[i] = 1624 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 1625 break; 1626 } 1627 } 1628 1629 return 0; 1630 } 1631 1632 /** 1633 * amdgpu_switcheroo_set_state - set switcheroo state 1634 * 1635 * @pdev: pci dev pointer 1636 * @state: vga_switcheroo state 1637 * 1638 * Callback for the switcheroo driver. Suspends or resumes 1639 * the asics before or after it is powered up using ACPI methods. 1640 */ 1641 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1642 enum vga_switcheroo_state state) 1643 { 1644 struct drm_device *dev = pci_get_drvdata(pdev); 1645 int r; 1646 1647 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 1648 state == VGA_SWITCHEROO_OFF) 1649 return; 1650 1651 if (state == VGA_SWITCHEROO_ON) { 1652 pr_info("switched on\n"); 1653 /* don't suspend or resume card normally */ 1654 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1655 1656 pci_set_power_state(pdev, PCI_D0); 1657 amdgpu_device_load_pci_state(pdev); 1658 r = pci_enable_device(pdev); 1659 if (r) 1660 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 1661 r); 1662 amdgpu_device_resume(dev, true); 1663 1664 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1665 } else { 1666 dev_info(&pdev->dev, "switched off\n"); 1667 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1668 amdgpu_device_prepare(dev); 1669 amdgpu_device_suspend(dev, true); 1670 amdgpu_device_cache_pci_state(pdev); 1671 /* Shut down the device */ 1672 pci_disable_device(pdev); 1673 pci_set_power_state(pdev, PCI_D3cold); 1674 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1675 } 1676 } 1677 1678 /** 1679 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1680 * 1681 * @pdev: pci dev pointer 1682 * 1683 * Callback for the switcheroo driver. Check of the switcheroo 1684 * state can be changed. 1685 * Returns true if the state can be changed, false if not. 1686 */ 1687 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1688 { 1689 struct drm_device *dev = pci_get_drvdata(pdev); 1690 1691 /* 1692 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1693 * locking inversion with the driver load path. And the access here is 1694 * completely racy anyway. So don't bother with locking for now. 1695 */ 1696 return atomic_read(&dev->open_count) == 0; 1697 } 1698 1699 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1700 .set_gpu_state = amdgpu_switcheroo_set_state, 1701 .reprobe = NULL, 1702 .can_switch = amdgpu_switcheroo_can_switch, 1703 }; 1704 1705 /** 1706 * amdgpu_device_enable_virtual_display - enable virtual display feature 1707 * 1708 * @adev: amdgpu_device pointer 1709 * 1710 * Enabled the virtual display feature if the user has enabled it via 1711 * the module parameter virtual_display. This feature provides a virtual 1712 * display hardware on headless boards or in virtualized environments. 1713 * This function parses and validates the configuration string specified by 1714 * the user and configures the virtual display configuration (number of 1715 * virtual connectors, crtcs, etc.) specified. 1716 */ 1717 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1718 { 1719 adev->enable_virtual_display = false; 1720 1721 if (amdgpu_virtual_display) { 1722 const char *pci_address_name = pci_name(adev->pdev); 1723 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1724 1725 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1726 pciaddstr_tmp = pciaddstr; 1727 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1728 pciaddname = strsep(&pciaddname_tmp, ","); 1729 if (!strcmp("all", pciaddname) 1730 || !strcmp(pci_address_name, pciaddname)) { 1731 long num_crtc; 1732 int res = -1; 1733 1734 adev->enable_virtual_display = true; 1735 1736 if (pciaddname_tmp) 1737 res = kstrtol(pciaddname_tmp, 10, 1738 &num_crtc); 1739 1740 if (!res) { 1741 if (num_crtc < 1) 1742 num_crtc = 1; 1743 if (num_crtc > 6) 1744 num_crtc = 6; 1745 adev->mode_info.num_crtc = num_crtc; 1746 } else { 1747 adev->mode_info.num_crtc = 1; 1748 } 1749 break; 1750 } 1751 } 1752 1753 dev_info( 1754 adev->dev, 1755 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1756 amdgpu_virtual_display, pci_address_name, 1757 adev->enable_virtual_display, adev->mode_info.num_crtc); 1758 1759 kfree(pciaddstr); 1760 } 1761 } 1762 1763 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 1764 { 1765 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 1766 adev->mode_info.num_crtc = 1; 1767 adev->enable_virtual_display = true; 1768 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 1769 adev->enable_virtual_display, 1770 adev->mode_info.num_crtc); 1771 } 1772 } 1773 1774 /** 1775 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1776 * 1777 * @adev: amdgpu_device pointer 1778 * 1779 * Parses the asic configuration parameters specified in the gpu info 1780 * firmware and makes them available to the driver for use in configuring 1781 * the asic. 1782 * Returns 0 on success, -EINVAL on failure. 1783 */ 1784 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1785 { 1786 const char *chip_name; 1787 int err; 1788 const struct gpu_info_firmware_header_v1_0 *hdr; 1789 1790 adev->firmware.gpu_info_fw = NULL; 1791 1792 switch (adev->asic_type) { 1793 default: 1794 return 0; 1795 case CHIP_VEGA10: 1796 chip_name = "vega10"; 1797 break; 1798 case CHIP_VEGA12: 1799 chip_name = "vega12"; 1800 break; 1801 case CHIP_RAVEN: 1802 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1803 chip_name = "raven2"; 1804 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1805 chip_name = "picasso"; 1806 else 1807 chip_name = "raven"; 1808 break; 1809 case CHIP_ARCTURUS: 1810 chip_name = "arcturus"; 1811 break; 1812 case CHIP_NAVI12: 1813 if (adev->discovery.bin) 1814 return 0; 1815 chip_name = "navi12"; 1816 break; 1817 case CHIP_CYAN_SKILLFISH: 1818 if (adev->discovery.bin) 1819 return 0; 1820 chip_name = "cyan_skillfish"; 1821 break; 1822 } 1823 1824 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 1825 AMDGPU_UCODE_OPTIONAL, 1826 "amdgpu/%s_gpu_info.bin", chip_name); 1827 if (err) { 1828 dev_err(adev->dev, 1829 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 1830 chip_name); 1831 goto out; 1832 } 1833 1834 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 1835 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 1836 1837 switch (hdr->version_major) { 1838 case 1: 1839 { 1840 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 1841 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 1842 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1843 1844 /* 1845 * Should be dropped when DAL no longer needs it. 1846 */ 1847 if (adev->asic_type == CHIP_NAVI12) 1848 goto parse_soc_bounding_box; 1849 1850 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 1851 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 1852 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 1853 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 1854 adev->gfx.config.max_texture_channel_caches = 1855 le32_to_cpu(gpu_info_fw->gc_num_tccs); 1856 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 1857 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 1858 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 1859 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 1860 adev->gfx.config.double_offchip_lds_buf = 1861 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 1862 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 1863 adev->gfx.cu_info.max_waves_per_simd = 1864 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 1865 adev->gfx.cu_info.max_scratch_slots_per_cu = 1866 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 1867 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 1868 if (hdr->version_minor >= 1) { 1869 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 1870 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 1871 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1872 adev->gfx.config.num_sc_per_sh = 1873 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 1874 adev->gfx.config.num_packer_per_sc = 1875 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 1876 } 1877 1878 parse_soc_bounding_box: 1879 /* 1880 * soc bounding box info is not integrated in disocovery table, 1881 * we always need to parse it from gpu info firmware if needed. 1882 */ 1883 if (hdr->version_minor == 2) { 1884 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 1885 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 1886 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 1887 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 1888 } 1889 break; 1890 } 1891 default: 1892 dev_err(adev->dev, 1893 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 1894 err = -EINVAL; 1895 goto out; 1896 } 1897 out: 1898 return err; 1899 } 1900 1901 static void amdgpu_uid_init(struct amdgpu_device *adev) 1902 { 1903 /* Initialize the UID for the device */ 1904 adev->uid_info = kzalloc_obj(struct amdgpu_uid); 1905 if (!adev->uid_info) { 1906 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 1907 return; 1908 } 1909 adev->uid_info->adev = adev; 1910 } 1911 1912 static void amdgpu_uid_fini(struct amdgpu_device *adev) 1913 { 1914 /* Free the UID memory */ 1915 kfree(adev->uid_info); 1916 adev->uid_info = NULL; 1917 } 1918 1919 /** 1920 * amdgpu_device_ip_early_init - run early init for hardware IPs 1921 * 1922 * @adev: amdgpu_device pointer 1923 * 1924 * Early initialization pass for hardware IPs. The hardware IPs that make 1925 * up each asic are discovered each IP's early_init callback is run. This 1926 * is the first stage in initializing the asic. 1927 * Returns 0 on success, negative error code on failure. 1928 */ 1929 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 1930 { 1931 struct amdgpu_ip_block *ip_block; 1932 struct pci_dev *parent; 1933 bool total, skip_bios; 1934 uint32_t bios_flags; 1935 int i, r; 1936 1937 amdgpu_device_enable_virtual_display(adev); 1938 1939 if (amdgpu_sriov_vf(adev)) { 1940 r = amdgpu_virt_request_full_gpu(adev, true); 1941 if (r) 1942 return r; 1943 1944 r = amdgpu_virt_init_critical_region(adev); 1945 if (r) 1946 return r; 1947 } 1948 1949 switch (adev->asic_type) { 1950 #ifdef CONFIG_DRM_AMDGPU_SI 1951 case CHIP_VERDE: 1952 case CHIP_TAHITI: 1953 case CHIP_PITCAIRN: 1954 case CHIP_OLAND: 1955 case CHIP_HAINAN: 1956 adev->family = AMDGPU_FAMILY_SI; 1957 r = si_set_ip_blocks(adev); 1958 if (r) 1959 return r; 1960 break; 1961 #endif 1962 #ifdef CONFIG_DRM_AMDGPU_CIK 1963 case CHIP_BONAIRE: 1964 case CHIP_HAWAII: 1965 case CHIP_KAVERI: 1966 case CHIP_KABINI: 1967 case CHIP_MULLINS: 1968 if (adev->flags & AMD_IS_APU) 1969 adev->family = AMDGPU_FAMILY_KV; 1970 else 1971 adev->family = AMDGPU_FAMILY_CI; 1972 1973 r = cik_set_ip_blocks(adev); 1974 if (r) 1975 return r; 1976 break; 1977 #endif 1978 case CHIP_TOPAZ: 1979 case CHIP_TONGA: 1980 case CHIP_FIJI: 1981 case CHIP_POLARIS10: 1982 case CHIP_POLARIS11: 1983 case CHIP_POLARIS12: 1984 case CHIP_VEGAM: 1985 case CHIP_CARRIZO: 1986 case CHIP_STONEY: 1987 if (adev->flags & AMD_IS_APU) 1988 adev->family = AMDGPU_FAMILY_CZ; 1989 else 1990 adev->family = AMDGPU_FAMILY_VI; 1991 1992 r = vi_set_ip_blocks(adev); 1993 if (r) 1994 return r; 1995 break; 1996 default: 1997 r = amdgpu_discovery_set_ip_blocks(adev); 1998 if (r) 1999 return r; 2000 break; 2001 } 2002 2003 /* Check for IP version 9.4.3 with A0 hardware */ 2004 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2005 !amdgpu_device_get_rev_id(adev)) { 2006 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2007 return -ENODEV; /* device unsupported - no device error */ 2008 } 2009 2010 if (amdgpu_has_atpx() && 2011 (amdgpu_is_atpx_hybrid() || 2012 amdgpu_has_atpx_dgpu_power_cntl()) && 2013 ((adev->flags & AMD_IS_APU) == 0) && 2014 !dev_is_removable(&adev->pdev->dev)) 2015 adev->flags |= AMD_IS_PX; 2016 2017 if (!(adev->flags & AMD_IS_APU)) { 2018 parent = pcie_find_root_port(adev->pdev); 2019 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2020 } 2021 2022 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2023 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2024 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2025 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2026 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2027 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2028 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2029 2030 adev->virt.is_xgmi_node_migrate_enabled = false; 2031 if (amdgpu_sriov_vf(adev)) { 2032 adev->virt.is_xgmi_node_migrate_enabled = 2033 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2034 } 2035 2036 total = true; 2037 for (i = 0; i < adev->num_ip_blocks; i++) { 2038 ip_block = &adev->ip_blocks[i]; 2039 2040 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2041 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2042 adev->ip_blocks[i].version->funcs->name); 2043 adev->ip_blocks[i].status.valid = false; 2044 } else if (ip_block->version->funcs->early_init) { 2045 r = ip_block->version->funcs->early_init(ip_block); 2046 if (r == -ENOENT) { 2047 adev->ip_blocks[i].status.valid = false; 2048 } else if (r) { 2049 dev_err(adev->dev, 2050 "early_init of IP block <%s> failed %d\n", 2051 adev->ip_blocks[i].version->funcs->name, 2052 r); 2053 total = false; 2054 } else { 2055 adev->ip_blocks[i].status.valid = true; 2056 } 2057 } else { 2058 adev->ip_blocks[i].status.valid = true; 2059 } 2060 /* get the vbios after the asic_funcs are set up */ 2061 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2062 r = amdgpu_device_parse_gpu_info_fw(adev); 2063 if (r) 2064 return r; 2065 2066 bios_flags = amdgpu_device_get_vbios_flags(adev); 2067 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2068 /* Read BIOS */ 2069 if (!skip_bios) { 2070 bool optional = 2071 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2072 if (!amdgpu_get_bios(adev) && !optional) 2073 return -EINVAL; 2074 2075 if (optional && !adev->bios) 2076 dev_info( 2077 adev->dev, 2078 "VBIOS image optional, proceeding without VBIOS image"); 2079 2080 if (adev->bios) { 2081 r = amdgpu_atombios_init(adev); 2082 if (r) { 2083 dev_err(adev->dev, 2084 "amdgpu_atombios_init failed\n"); 2085 amdgpu_vf_error_put( 2086 adev, 2087 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2088 0, 0); 2089 return r; 2090 } 2091 } 2092 } 2093 2094 /*get pf2vf msg info at it's earliest time*/ 2095 if (amdgpu_sriov_vf(adev)) 2096 amdgpu_virt_init_data_exchange(adev); 2097 2098 } 2099 } 2100 if (!total) 2101 return -ENODEV; 2102 2103 if (adev->gmc.xgmi.supported) 2104 amdgpu_xgmi_early_init(adev); 2105 2106 if (amdgpu_is_multi_aid(adev)) 2107 amdgpu_uid_init(adev); 2108 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2109 if (ip_block->status.valid != false) 2110 amdgpu_amdkfd_device_probe(adev); 2111 2112 adev->cg_flags &= amdgpu_cg_mask; 2113 adev->pg_flags &= amdgpu_pg_mask; 2114 2115 return 0; 2116 } 2117 2118 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2119 { 2120 int i, r; 2121 2122 for (i = 0; i < adev->num_ip_blocks; i++) { 2123 if (!adev->ip_blocks[i].status.sw) 2124 continue; 2125 if (adev->ip_blocks[i].status.hw) 2126 continue; 2127 if (!amdgpu_ip_member_of_hwini( 2128 adev, adev->ip_blocks[i].version->type)) 2129 continue; 2130 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2131 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2132 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2133 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2134 if (r) { 2135 dev_err(adev->dev, 2136 "hw_init of IP block <%s> failed %d\n", 2137 adev->ip_blocks[i].version->funcs->name, 2138 r); 2139 return r; 2140 } 2141 adev->ip_blocks[i].status.hw = true; 2142 } 2143 } 2144 2145 return 0; 2146 } 2147 2148 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2149 { 2150 int i, r; 2151 2152 for (i = 0; i < adev->num_ip_blocks; i++) { 2153 if (!adev->ip_blocks[i].status.sw) 2154 continue; 2155 if (adev->ip_blocks[i].status.hw) 2156 continue; 2157 if (!amdgpu_ip_member_of_hwini( 2158 adev, adev->ip_blocks[i].version->type)) 2159 continue; 2160 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2161 if (r) { 2162 dev_err(adev->dev, 2163 "hw_init of IP block <%s> failed %d\n", 2164 adev->ip_blocks[i].version->funcs->name, r); 2165 return r; 2166 } 2167 adev->ip_blocks[i].status.hw = true; 2168 } 2169 2170 return 0; 2171 } 2172 2173 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2174 { 2175 int r = 0; 2176 int i; 2177 uint32_t smu_version; 2178 2179 if (adev->asic_type >= CHIP_VEGA10) { 2180 for (i = 0; i < adev->num_ip_blocks; i++) { 2181 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2182 continue; 2183 2184 if (!amdgpu_ip_member_of_hwini(adev, 2185 AMD_IP_BLOCK_TYPE_PSP)) 2186 break; 2187 2188 if (!adev->ip_blocks[i].status.sw) 2189 continue; 2190 2191 /* no need to do the fw loading again if already done*/ 2192 if (adev->ip_blocks[i].status.hw == true) 2193 break; 2194 2195 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2196 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2197 if (r) 2198 return r; 2199 } else { 2200 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2201 if (r) { 2202 dev_err(adev->dev, 2203 "hw_init of IP block <%s> failed %d\n", 2204 adev->ip_blocks[i] 2205 .version->funcs->name, 2206 r); 2207 return r; 2208 } 2209 adev->ip_blocks[i].status.hw = true; 2210 } 2211 break; 2212 } 2213 } 2214 2215 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2216 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2217 2218 return r; 2219 } 2220 2221 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2222 { 2223 struct drm_sched_init_args args = { 2224 .ops = &amdgpu_sched_ops, 2225 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2226 .timeout_wq = adev->reset_domain->wq, 2227 .dev = adev->dev, 2228 }; 2229 long timeout; 2230 int r, i; 2231 2232 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2233 struct amdgpu_ring *ring = adev->rings[i]; 2234 2235 /* No need to setup the GPU scheduler for rings that don't need it */ 2236 if (!ring || ring->no_scheduler) 2237 continue; 2238 2239 switch (ring->funcs->type) { 2240 case AMDGPU_RING_TYPE_GFX: 2241 timeout = adev->gfx_timeout; 2242 break; 2243 case AMDGPU_RING_TYPE_COMPUTE: 2244 timeout = adev->compute_timeout; 2245 break; 2246 case AMDGPU_RING_TYPE_SDMA: 2247 timeout = adev->sdma_timeout; 2248 break; 2249 default: 2250 timeout = adev->video_timeout; 2251 break; 2252 } 2253 2254 args.timeout = timeout; 2255 args.credit_limit = ring->num_hw_submission; 2256 args.score = ring->sched_score; 2257 args.name = ring->name; 2258 2259 r = drm_sched_init(&ring->sched, &args); 2260 if (r) { 2261 dev_err(adev->dev, 2262 "Failed to create scheduler on ring %s.\n", 2263 ring->name); 2264 return r; 2265 } 2266 r = amdgpu_uvd_entity_init(adev, ring); 2267 if (r) { 2268 dev_err(adev->dev, 2269 "Failed to create UVD scheduling entity on ring %s.\n", 2270 ring->name); 2271 return r; 2272 } 2273 r = amdgpu_vce_entity_init(adev, ring); 2274 if (r) { 2275 dev_err(adev->dev, 2276 "Failed to create VCE scheduling entity on ring %s.\n", 2277 ring->name); 2278 return r; 2279 } 2280 } 2281 2282 if (adev->xcp_mgr) 2283 amdgpu_xcp_update_partition_sched_list(adev); 2284 2285 return 0; 2286 } 2287 2288 2289 /** 2290 * amdgpu_device_ip_init - run init for hardware IPs 2291 * 2292 * @adev: amdgpu_device pointer 2293 * 2294 * Main initialization pass for hardware IPs. The list of all the hardware 2295 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2296 * are run. sw_init initializes the software state associated with each IP 2297 * and hw_init initializes the hardware associated with each IP. 2298 * Returns 0 on success, negative error code on failure. 2299 */ 2300 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2301 { 2302 bool init_badpage; 2303 int i, r; 2304 2305 r = amdgpu_ras_init(adev); 2306 if (r) 2307 return r; 2308 2309 for (i = 0; i < adev->num_ip_blocks; i++) { 2310 if (!adev->ip_blocks[i].status.valid) 2311 continue; 2312 if (adev->ip_blocks[i].version->funcs->sw_init) { 2313 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2314 if (r) { 2315 dev_err(adev->dev, 2316 "sw_init of IP block <%s> failed %d\n", 2317 adev->ip_blocks[i].version->funcs->name, 2318 r); 2319 goto init_failed; 2320 } 2321 } 2322 adev->ip_blocks[i].status.sw = true; 2323 2324 if (!amdgpu_ip_member_of_hwini( 2325 adev, adev->ip_blocks[i].version->type)) 2326 continue; 2327 2328 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2329 /* need to do common hw init early so everything is set up for gmc */ 2330 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2331 if (r) { 2332 dev_err(adev->dev, "hw_init %d failed %d\n", i, 2333 r); 2334 goto init_failed; 2335 } 2336 adev->ip_blocks[i].status.hw = true; 2337 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2338 /* need to do gmc hw init early so we can allocate gpu mem */ 2339 /* Try to reserve bad pages early */ 2340 if (amdgpu_sriov_vf(adev)) 2341 amdgpu_virt_exchange_data(adev); 2342 2343 r = amdgpu_device_mem_scratch_init(adev); 2344 if (r) { 2345 dev_err(adev->dev, 2346 "amdgpu_mem_scratch_init failed %d\n", 2347 r); 2348 goto init_failed; 2349 } 2350 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2351 if (r) { 2352 dev_err(adev->dev, "hw_init %d failed %d\n", i, 2353 r); 2354 goto init_failed; 2355 } 2356 r = amdgpu_device_wb_init(adev); 2357 if (r) { 2358 dev_err(adev->dev, 2359 "amdgpu_device_wb_init failed %d\n", r); 2360 goto init_failed; 2361 } 2362 adev->ip_blocks[i].status.hw = true; 2363 2364 /* right after GMC hw init, we create CSA */ 2365 if (adev->gfx.mcbp) { 2366 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2367 AMDGPU_GEM_DOMAIN_VRAM | 2368 AMDGPU_GEM_DOMAIN_GTT, 2369 AMDGPU_CSA_SIZE); 2370 if (r) { 2371 dev_err(adev->dev, 2372 "allocate CSA failed %d\n", r); 2373 goto init_failed; 2374 } 2375 } 2376 2377 r = amdgpu_seq64_init(adev); 2378 if (r) { 2379 dev_err(adev->dev, "allocate seq64 failed %d\n", 2380 r); 2381 goto init_failed; 2382 } 2383 } 2384 } 2385 2386 if (amdgpu_sriov_vf(adev)) 2387 amdgpu_virt_init_data_exchange(adev); 2388 2389 r = amdgpu_ib_pool_init(adev); 2390 if (r) { 2391 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2392 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2393 goto init_failed; 2394 } 2395 2396 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2397 if (r) 2398 goto init_failed; 2399 2400 r = amdgpu_device_ip_hw_init_phase1(adev); 2401 if (r) 2402 goto init_failed; 2403 2404 r = amdgpu_device_fw_loading(adev); 2405 if (r) 2406 goto init_failed; 2407 2408 r = amdgpu_device_ip_hw_init_phase2(adev); 2409 if (r) 2410 goto init_failed; 2411 2412 /* 2413 * retired pages will be loaded from eeprom and reserved here, 2414 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2415 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2416 * for I2C communication which only true at this point. 2417 * 2418 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2419 * failure from bad gpu situation and stop amdgpu init process 2420 * accordingly. For other failed cases, it will still release all 2421 * the resource and print error message, rather than returning one 2422 * negative value to upper level. 2423 * 2424 * Note: theoretically, this should be called before all vram allocations 2425 * to protect retired page from abusing 2426 */ 2427 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 2428 r = amdgpu_ras_recovery_init(adev, init_badpage); 2429 if (r) 2430 goto init_failed; 2431 2432 /** 2433 * In case of XGMI grab extra reference for reset domain for this device 2434 */ 2435 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2436 if (amdgpu_xgmi_add_device(adev) == 0) { 2437 if (!amdgpu_sriov_vf(adev)) { 2438 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2439 2440 if (WARN_ON(!hive)) { 2441 r = -ENOENT; 2442 goto init_failed; 2443 } 2444 2445 if (!hive->reset_domain || 2446 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2447 r = -ENOENT; 2448 amdgpu_put_xgmi_hive(hive); 2449 goto init_failed; 2450 } 2451 2452 /* Drop the early temporary reset domain we created for device */ 2453 amdgpu_reset_put_reset_domain(adev->reset_domain); 2454 adev->reset_domain = hive->reset_domain; 2455 amdgpu_put_xgmi_hive(hive); 2456 } 2457 } 2458 } 2459 2460 r = amdgpu_device_init_schedulers(adev); 2461 if (r) 2462 goto init_failed; 2463 2464 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2465 2466 /* Don't init kfd if whole hive need to be reset during init */ 2467 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 2468 amdgpu_amdkfd_device_init(adev); 2469 } 2470 2471 amdgpu_fru_get_product_info(adev); 2472 2473 r = amdgpu_cper_init(adev); 2474 2475 init_failed: 2476 2477 return r; 2478 } 2479 2480 /** 2481 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2482 * 2483 * @adev: amdgpu_device pointer 2484 * 2485 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2486 * this function before a GPU reset. If the value is retained after a 2487 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2488 */ 2489 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2490 { 2491 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2492 } 2493 2494 /** 2495 * amdgpu_device_check_vram_lost - check if vram is valid 2496 * 2497 * @adev: amdgpu_device pointer 2498 * 2499 * Checks the reset magic value written to the gart pointer in VRAM. 2500 * The driver calls this after a GPU reset to see if the contents of 2501 * VRAM is lost or now. 2502 * returns true if vram is lost, false if not. 2503 */ 2504 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2505 { 2506 if (memcmp(adev->gart.ptr, adev->reset_magic, 2507 AMDGPU_RESET_MAGIC_NUM)) 2508 return true; 2509 2510 if (!amdgpu_in_reset(adev)) 2511 return false; 2512 2513 /* 2514 * For all ASICs with baco/mode1 reset, the VRAM is 2515 * always assumed to be lost. 2516 */ 2517 switch (amdgpu_asic_reset_method(adev)) { 2518 case AMD_RESET_METHOD_LEGACY: 2519 case AMD_RESET_METHOD_LINK: 2520 case AMD_RESET_METHOD_BACO: 2521 case AMD_RESET_METHOD_MODE1: 2522 return true; 2523 default: 2524 return false; 2525 } 2526 } 2527 2528 /** 2529 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2530 * 2531 * @adev: amdgpu_device pointer 2532 * @state: clockgating state (gate or ungate) 2533 * 2534 * The list of all the hardware IPs that make up the asic is walked and the 2535 * set_clockgating_state callbacks are run. 2536 * Late initialization pass enabling clockgating for hardware IPs. 2537 * Fini or suspend, pass disabling clockgating for hardware IPs. 2538 * Returns 0 on success, negative error code on failure. 2539 */ 2540 2541 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2542 enum amd_clockgating_state state) 2543 { 2544 int i, j, r; 2545 2546 if (amdgpu_emu_mode == 1) 2547 return 0; 2548 2549 for (j = 0; j < adev->num_ip_blocks; j++) { 2550 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2551 if (!adev->ip_blocks[i].status.late_initialized) 2552 continue; 2553 /* skip CG for GFX, SDMA on S0ix */ 2554 if (adev->in_s0ix && 2555 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2556 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2557 continue; 2558 /* skip CG for VCE/UVD, it's handled specially */ 2559 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2560 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2561 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2562 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2563 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2564 /* enable clockgating to save power */ 2565 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 2566 state); 2567 if (r) { 2568 dev_err(adev->dev, 2569 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 2570 adev->ip_blocks[i].version->funcs->name, 2571 r); 2572 return r; 2573 } 2574 } 2575 } 2576 2577 return 0; 2578 } 2579 2580 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2581 enum amd_powergating_state state) 2582 { 2583 int i, j, r; 2584 2585 if (amdgpu_emu_mode == 1) 2586 return 0; 2587 2588 for (j = 0; j < adev->num_ip_blocks; j++) { 2589 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2590 if (!adev->ip_blocks[i].status.late_initialized) 2591 continue; 2592 /* skip PG for GFX, SDMA on S0ix */ 2593 if (adev->in_s0ix && 2594 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2595 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2596 continue; 2597 /* skip CG for VCE/UVD, it's handled specially */ 2598 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2599 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2600 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2601 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2602 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2603 /* enable powergating to save power */ 2604 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 2605 state); 2606 if (r) { 2607 dev_err(adev->dev, 2608 "set_powergating_state(gate) of IP block <%s> failed %d\n", 2609 adev->ip_blocks[i].version->funcs->name, 2610 r); 2611 return r; 2612 } 2613 } 2614 } 2615 return 0; 2616 } 2617 2618 static int amdgpu_device_enable_mgpu_fan_boost(void) 2619 { 2620 struct amdgpu_gpu_instance *gpu_ins; 2621 struct amdgpu_device *adev; 2622 int i, ret = 0; 2623 2624 mutex_lock(&mgpu_info.mutex); 2625 2626 /* 2627 * MGPU fan boost feature should be enabled 2628 * only when there are two or more dGPUs in 2629 * the system 2630 */ 2631 if (mgpu_info.num_dgpu < 2) 2632 goto out; 2633 2634 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2635 gpu_ins = &(mgpu_info.gpu_ins[i]); 2636 adev = gpu_ins->adev; 2637 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) && 2638 !gpu_ins->mgpu_fan_enabled) { 2639 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2640 if (ret) 2641 break; 2642 2643 gpu_ins->mgpu_fan_enabled = 1; 2644 } 2645 } 2646 2647 out: 2648 mutex_unlock(&mgpu_info.mutex); 2649 2650 return ret; 2651 } 2652 2653 /** 2654 * amdgpu_device_ip_late_init - run late init for hardware IPs 2655 * 2656 * @adev: amdgpu_device pointer 2657 * 2658 * Late initialization pass for hardware IPs. The list of all the hardware 2659 * IPs that make up the asic is walked and the late_init callbacks are run. 2660 * late_init covers any special initialization that an IP requires 2661 * after all of the have been initialized or something that needs to happen 2662 * late in the init process. 2663 * Returns 0 on success, negative error code on failure. 2664 */ 2665 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2666 { 2667 struct amdgpu_gpu_instance *gpu_instance; 2668 int i = 0, r; 2669 2670 for (i = 0; i < adev->num_ip_blocks; i++) { 2671 if (!adev->ip_blocks[i].status.hw) 2672 continue; 2673 if (adev->ip_blocks[i].version->funcs->late_init) { 2674 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 2675 if (r) { 2676 dev_err(adev->dev, 2677 "late_init of IP block <%s> failed %d\n", 2678 adev->ip_blocks[i].version->funcs->name, 2679 r); 2680 return r; 2681 } 2682 } 2683 adev->ip_blocks[i].status.late_initialized = true; 2684 } 2685 2686 r = amdgpu_ras_late_init(adev); 2687 if (r) { 2688 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 2689 return r; 2690 } 2691 2692 if (!amdgpu_reset_in_recovery(adev)) 2693 amdgpu_ras_set_error_query_ready(adev, true); 2694 2695 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2696 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2697 2698 amdgpu_device_fill_reset_magic(adev); 2699 2700 r = amdgpu_device_enable_mgpu_fan_boost(); 2701 if (r) 2702 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 2703 2704 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 2705 if (amdgpu_passthrough(adev) && 2706 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 2707 adev->asic_type == CHIP_ALDEBARAN)) 2708 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2709 2710 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2711 mutex_lock(&mgpu_info.mutex); 2712 2713 /* 2714 * Reset device p-state to low as this was booted with high. 2715 * 2716 * This should be performed only after all devices from the same 2717 * hive get initialized. 2718 * 2719 * However, it's unknown how many device in the hive in advance. 2720 * As this is counted one by one during devices initializations. 2721 * 2722 * So, we wait for all XGMI interlinked devices initialized. 2723 * This may bring some delays as those devices may come from 2724 * different hives. But that should be OK. 2725 */ 2726 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2727 for (i = 0; i < mgpu_info.num_gpu; i++) { 2728 gpu_instance = &(mgpu_info.gpu_ins[i]); 2729 if (gpu_instance->adev->flags & AMD_IS_APU) 2730 continue; 2731 2732 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2733 AMDGPU_XGMI_PSTATE_MIN); 2734 if (r) { 2735 dev_err(adev->dev, 2736 "pstate setting failed (%d).\n", 2737 r); 2738 break; 2739 } 2740 } 2741 } 2742 2743 mutex_unlock(&mgpu_info.mutex); 2744 } 2745 2746 return 0; 2747 } 2748 2749 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 2750 { 2751 struct amdgpu_device *adev = ip_block->adev; 2752 int r; 2753 2754 if (!ip_block->version->funcs->hw_fini) { 2755 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 2756 ip_block->version->funcs->name); 2757 } else { 2758 r = ip_block->version->funcs->hw_fini(ip_block); 2759 /* XXX handle errors */ 2760 if (r) { 2761 dev_dbg(adev->dev, 2762 "hw_fini of IP block <%s> failed %d\n", 2763 ip_block->version->funcs->name, r); 2764 } 2765 } 2766 2767 ip_block->status.hw = false; 2768 } 2769 2770 /** 2771 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2772 * 2773 * @adev: amdgpu_device pointer 2774 * 2775 * For ASICs need to disable SMC first 2776 */ 2777 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2778 { 2779 int i; 2780 2781 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 2782 return; 2783 2784 for (i = 0; i < adev->num_ip_blocks; i++) { 2785 if (!adev->ip_blocks[i].status.hw) 2786 continue; 2787 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2788 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 2789 break; 2790 } 2791 } 2792 } 2793 2794 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2795 { 2796 int i, r; 2797 2798 for (i = 0; i < adev->num_ip_blocks; i++) { 2799 if (!adev->ip_blocks[i].version->funcs->early_fini) 2800 continue; 2801 2802 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 2803 if (r) { 2804 dev_dbg(adev->dev, 2805 "early_fini of IP block <%s> failed %d\n", 2806 adev->ip_blocks[i].version->funcs->name, r); 2807 } 2808 } 2809 2810 amdgpu_amdkfd_suspend(adev, true); 2811 amdgpu_amdkfd_teardown_processes(adev); 2812 amdgpu_userq_suspend(adev); 2813 2814 /* Workaround for ASICs need to disable SMC first */ 2815 amdgpu_device_smu_fini_early(adev); 2816 2817 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2818 if (!adev->ip_blocks[i].status.hw) 2819 continue; 2820 2821 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 2822 } 2823 2824 if (amdgpu_sriov_vf(adev)) { 2825 if (amdgpu_virt_release_full_gpu(adev, false)) 2826 dev_err(adev->dev, 2827 "failed to release exclusive mode on fini\n"); 2828 } 2829 2830 /* 2831 * Driver reload on the APU can fail due to firmware validation because 2832 * the PSP is always running, as it is shared across the whole SoC. 2833 * This same issue does not occur on dGPU because it has a mechanism 2834 * that checks whether the PSP is running. A solution for those issues 2835 * in the APU is to trigger a GPU reset, but this should be done during 2836 * the unload phase to avoid adding boot latency and screen flicker. 2837 */ 2838 if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) { 2839 r = amdgpu_asic_reset(adev); 2840 if (r) 2841 dev_err(adev->dev, "asic reset on %s failed\n", __func__); 2842 } 2843 2844 return 0; 2845 } 2846 2847 /** 2848 * amdgpu_device_ip_fini - run fini for hardware IPs 2849 * 2850 * @adev: amdgpu_device pointer 2851 * 2852 * Main teardown pass for hardware IPs. The list of all the hardware 2853 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2854 * are run. hw_fini tears down the hardware associated with each IP 2855 * and sw_fini tears down any software state associated with each IP. 2856 * Returns 0 on success, negative error code on failure. 2857 */ 2858 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2859 { 2860 int i, r; 2861 2862 amdgpu_cper_fini(adev); 2863 2864 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2865 amdgpu_virt_release_ras_err_handler_data(adev); 2866 2867 if (adev->gmc.xgmi.num_physical_nodes > 1) 2868 amdgpu_xgmi_remove_device(adev); 2869 2870 amdgpu_amdkfd_device_fini_sw(adev); 2871 2872 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2873 if (!adev->ip_blocks[i].status.sw) 2874 continue; 2875 2876 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2877 amdgpu_ucode_free_bo(adev); 2878 amdgpu_free_static_csa(&adev->virt.csa_obj); 2879 amdgpu_device_wb_fini(adev); 2880 amdgpu_device_mem_scratch_fini(adev); 2881 amdgpu_ib_pool_fini(adev); 2882 amdgpu_seq64_fini(adev); 2883 amdgpu_doorbell_fini(adev); 2884 } 2885 if (adev->ip_blocks[i].version->funcs->sw_fini) { 2886 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 2887 /* XXX handle errors */ 2888 if (r) { 2889 dev_dbg(adev->dev, 2890 "sw_fini of IP block <%s> failed %d\n", 2891 adev->ip_blocks[i].version->funcs->name, 2892 r); 2893 } 2894 } 2895 adev->ip_blocks[i].status.sw = false; 2896 adev->ip_blocks[i].status.valid = false; 2897 } 2898 2899 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2900 if (!adev->ip_blocks[i].status.late_initialized) 2901 continue; 2902 if (adev->ip_blocks[i].version->funcs->late_fini) 2903 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 2904 adev->ip_blocks[i].status.late_initialized = false; 2905 } 2906 2907 amdgpu_ras_fini(adev); 2908 amdgpu_uid_fini(adev); 2909 2910 return 0; 2911 } 2912 2913 /** 2914 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2915 * 2916 * @work: work_struct. 2917 */ 2918 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2919 { 2920 struct amdgpu_device *adev = 2921 container_of(work, struct amdgpu_device, delayed_init_work.work); 2922 int r; 2923 2924 r = amdgpu_ib_ring_tests(adev); 2925 if (r) 2926 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 2927 } 2928 2929 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2930 { 2931 struct amdgpu_device *adev = 2932 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2933 2934 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2935 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2936 2937 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 2938 adev->gfx.gfx_off_state = true; 2939 } 2940 2941 /** 2942 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2943 * 2944 * @adev: amdgpu_device pointer 2945 * 2946 * Main suspend function for hardware IPs. The list of all the hardware 2947 * IPs that make up the asic is walked, clockgating is disabled and the 2948 * suspend callbacks are run. suspend puts the hardware and software state 2949 * in each IP into a state suitable for suspend. 2950 * Returns 0 on success, negative error code on failure. 2951 */ 2952 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2953 { 2954 int i, r, rec; 2955 2956 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2957 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2958 2959 /* 2960 * Per PMFW team's suggestion, driver needs to handle gfxoff 2961 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 2962 * scenario. Add the missing df cstate disablement here. 2963 */ 2964 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 2965 dev_warn(adev->dev, "Failed to disallow df cstate"); 2966 2967 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2968 if (!adev->ip_blocks[i].status.valid) 2969 continue; 2970 2971 /* displays are handled separately */ 2972 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2973 continue; 2974 2975 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 2976 if (r) 2977 goto unwind; 2978 } 2979 2980 return 0; 2981 unwind: 2982 rec = amdgpu_device_ip_resume_phase3(adev); 2983 if (rec) 2984 dev_err(adev->dev, 2985 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 2986 rec); 2987 2988 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 2989 2990 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2991 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2992 2993 return r; 2994 } 2995 2996 /** 2997 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2998 * 2999 * @adev: amdgpu_device pointer 3000 * 3001 * Main suspend function for hardware IPs. The list of all the hardware 3002 * IPs that make up the asic is walked, clockgating is disabled and the 3003 * suspend callbacks are run. suspend puts the hardware and software state 3004 * in each IP into a state suitable for suspend. 3005 * Returns 0 on success, negative error code on failure. 3006 */ 3007 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3008 { 3009 int i, r, rec; 3010 3011 if (adev->in_s0ix) 3012 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3013 3014 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3015 if (!adev->ip_blocks[i].status.valid) 3016 continue; 3017 /* displays are handled in phase1 */ 3018 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3019 continue; 3020 /* PSP lost connection when err_event_athub occurs */ 3021 if (amdgpu_ras_intr_triggered() && 3022 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3023 adev->ip_blocks[i].status.hw = false; 3024 continue; 3025 } 3026 3027 /* skip unnecessary suspend if we do not initialize them yet */ 3028 if (!amdgpu_ip_member_of_hwini( 3029 adev, adev->ip_blocks[i].version->type)) 3030 continue; 3031 3032 /* Since we skip suspend for S0i3, we need to cancel the delayed 3033 * idle work here as the suspend callback never gets called. 3034 */ 3035 if (adev->in_s0ix && 3036 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3037 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3038 cancel_delayed_work_sync(&adev->gfx.idle_work); 3039 /* skip suspend of gfx/mes and psp for S0ix 3040 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3041 * like at runtime. PSP is also part of the always on hardware 3042 * so no need to suspend it. 3043 */ 3044 if (adev->in_s0ix && 3045 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3046 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3047 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3048 continue; 3049 3050 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3051 if (adev->in_s0ix && 3052 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3053 IP_VERSION(5, 0, 0)) && 3054 (adev->ip_blocks[i].version->type == 3055 AMD_IP_BLOCK_TYPE_SDMA)) 3056 continue; 3057 3058 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3059 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3060 * from this location and RLC Autoload automatically also gets loaded 3061 * from here based on PMFW -> PSP message during re-init sequence. 3062 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3063 * the TMR and reload FWs again for IMU enabled APU ASICs. 3064 */ 3065 if (amdgpu_in_reset(adev) && 3066 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3067 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3068 continue; 3069 3070 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3071 if (r) 3072 goto unwind; 3073 3074 /* handle putting the SMC in the appropriate state */ 3075 if (!amdgpu_sriov_vf(adev)) { 3076 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3077 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3078 if (r) { 3079 dev_err(adev->dev, 3080 "SMC failed to set mp1 state %d, %d\n", 3081 adev->mp1_state, r); 3082 goto unwind; 3083 } 3084 } 3085 } 3086 } 3087 3088 return 0; 3089 unwind: 3090 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3091 rec = amdgpu_device_ip_resume_phase1(adev); 3092 if (rec) { 3093 dev_err(adev->dev, 3094 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3095 rec); 3096 return r; 3097 } 3098 3099 rec = amdgpu_device_fw_loading(adev); 3100 if (rec) { 3101 dev_err(adev->dev, 3102 "amdgpu_device_fw_loading failed during unwind: %d\n", 3103 rec); 3104 return r; 3105 } 3106 3107 rec = amdgpu_device_ip_resume_phase2(adev); 3108 if (rec) { 3109 dev_err(adev->dev, 3110 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3111 rec); 3112 return r; 3113 } 3114 3115 return r; 3116 } 3117 3118 /** 3119 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3120 * 3121 * @adev: amdgpu_device pointer 3122 * 3123 * Main suspend function for hardware IPs. The list of all the hardware 3124 * IPs that make up the asic is walked, clockgating is disabled and the 3125 * suspend callbacks are run. suspend puts the hardware and software state 3126 * in each IP into a state suitable for suspend. 3127 * Returns 0 on success, negative error code on failure. 3128 */ 3129 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3130 { 3131 int r; 3132 3133 if (amdgpu_sriov_vf(adev)) { 3134 amdgpu_virt_fini_data_exchange(adev); 3135 amdgpu_virt_request_full_gpu(adev, false); 3136 } 3137 3138 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3139 3140 r = amdgpu_device_ip_suspend_phase1(adev); 3141 if (r) 3142 return r; 3143 r = amdgpu_device_ip_suspend_phase2(adev); 3144 3145 if (amdgpu_sriov_vf(adev)) 3146 amdgpu_virt_release_full_gpu(adev, false); 3147 3148 return r; 3149 } 3150 3151 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3152 { 3153 int i, r; 3154 3155 static enum amd_ip_block_type ip_order[] = { 3156 AMD_IP_BLOCK_TYPE_COMMON, 3157 AMD_IP_BLOCK_TYPE_GMC, 3158 AMD_IP_BLOCK_TYPE_PSP, 3159 AMD_IP_BLOCK_TYPE_IH, 3160 }; 3161 3162 for (i = 0; i < adev->num_ip_blocks; i++) { 3163 int j; 3164 struct amdgpu_ip_block *block; 3165 3166 block = &adev->ip_blocks[i]; 3167 block->status.hw = false; 3168 3169 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3170 3171 if (block->version->type != ip_order[j] || 3172 !block->status.valid) 3173 continue; 3174 3175 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3176 if (r) { 3177 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3178 block->version->funcs->name); 3179 return r; 3180 } 3181 block->status.hw = true; 3182 } 3183 } 3184 3185 return 0; 3186 } 3187 3188 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3189 { 3190 struct amdgpu_ip_block *block; 3191 int i, r = 0; 3192 3193 static enum amd_ip_block_type ip_order[] = { 3194 AMD_IP_BLOCK_TYPE_SMC, 3195 AMD_IP_BLOCK_TYPE_DCE, 3196 AMD_IP_BLOCK_TYPE_GFX, 3197 AMD_IP_BLOCK_TYPE_SDMA, 3198 AMD_IP_BLOCK_TYPE_MES, 3199 AMD_IP_BLOCK_TYPE_UVD, 3200 AMD_IP_BLOCK_TYPE_VCE, 3201 AMD_IP_BLOCK_TYPE_VCN, 3202 AMD_IP_BLOCK_TYPE_JPEG 3203 }; 3204 3205 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3206 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3207 3208 if (!block) 3209 continue; 3210 3211 if (block->status.valid && !block->status.hw) { 3212 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3213 r = amdgpu_ip_block_resume(block); 3214 } else { 3215 r = block->version->funcs->hw_init(block); 3216 } 3217 3218 if (r) { 3219 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3220 block->version->funcs->name); 3221 break; 3222 } 3223 block->status.hw = true; 3224 } 3225 } 3226 3227 return r; 3228 } 3229 3230 /** 3231 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3232 * 3233 * @adev: amdgpu_device pointer 3234 * 3235 * First resume function for hardware IPs. The list of all the hardware 3236 * IPs that make up the asic is walked and the resume callbacks are run for 3237 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3238 * after a suspend and updates the software state as necessary. This 3239 * function is also used for restoring the GPU after a GPU reset. 3240 * Returns 0 on success, negative error code on failure. 3241 */ 3242 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3243 { 3244 int i, r; 3245 3246 for (i = 0; i < adev->num_ip_blocks; i++) { 3247 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3248 continue; 3249 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3250 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3251 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3252 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3253 3254 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3255 if (r) 3256 return r; 3257 } 3258 } 3259 3260 return 0; 3261 } 3262 3263 /** 3264 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3265 * 3266 * @adev: amdgpu_device pointer 3267 * 3268 * Second resume function for hardware IPs. The list of all the hardware 3269 * IPs that make up the asic is walked and the resume callbacks are run for 3270 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3271 * functional state after a suspend and updates the software state as 3272 * necessary. This function is also used for restoring the GPU after a GPU 3273 * reset. 3274 * Returns 0 on success, negative error code on failure. 3275 */ 3276 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3277 { 3278 int i, r; 3279 3280 for (i = 0; i < adev->num_ip_blocks; i++) { 3281 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3282 continue; 3283 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3284 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3285 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3286 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3287 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3288 continue; 3289 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3290 if (r) 3291 return r; 3292 } 3293 3294 return 0; 3295 } 3296 3297 /** 3298 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3299 * 3300 * @adev: amdgpu_device pointer 3301 * 3302 * Third resume function for hardware IPs. The list of all the hardware 3303 * IPs that make up the asic is walked and the resume callbacks are run for 3304 * all DCE. resume puts the hardware into a functional state after a suspend 3305 * and updates the software state as necessary. This function is also used 3306 * for restoring the GPU after a GPU reset. 3307 * 3308 * Returns 0 on success, negative error code on failure. 3309 */ 3310 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3311 { 3312 int i, r; 3313 3314 for (i = 0; i < adev->num_ip_blocks; i++) { 3315 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3316 continue; 3317 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3318 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3319 if (r) 3320 return r; 3321 } 3322 } 3323 3324 return 0; 3325 } 3326 3327 /** 3328 * amdgpu_device_ip_resume - run resume for hardware IPs 3329 * 3330 * @adev: amdgpu_device pointer 3331 * 3332 * Main resume function for hardware IPs. The hardware IPs 3333 * are split into two resume functions because they are 3334 * also used in recovering from a GPU reset and some additional 3335 * steps need to be take between them. In this case (S3/S4) they are 3336 * run sequentially. 3337 * Returns 0 on success, negative error code on failure. 3338 */ 3339 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3340 { 3341 int r; 3342 3343 r = amdgpu_device_ip_resume_phase1(adev); 3344 if (r) 3345 return r; 3346 3347 r = amdgpu_device_fw_loading(adev); 3348 if (r) 3349 return r; 3350 3351 r = amdgpu_device_ip_resume_phase2(adev); 3352 3353 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3354 3355 if (r) 3356 return r; 3357 3358 amdgpu_fence_driver_hw_init(adev); 3359 3360 r = amdgpu_device_ip_resume_phase3(adev); 3361 3362 return r; 3363 } 3364 3365 /** 3366 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3367 * 3368 * @adev: amdgpu_device pointer 3369 * 3370 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3371 */ 3372 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3373 { 3374 if (amdgpu_sriov_vf(adev)) { 3375 if (adev->is_atom_fw) { 3376 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3377 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3378 } else { 3379 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3380 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3381 } 3382 3383 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3384 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3385 } 3386 } 3387 3388 /** 3389 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3390 * 3391 * @pdev : pci device context 3392 * @asic_type: AMD asic type 3393 * 3394 * Check if there is DC (new modesetting infrastructre) support for an asic. 3395 * returns true if DC has support, false if not. 3396 */ 3397 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 3398 enum amd_asic_type asic_type) 3399 { 3400 switch (asic_type) { 3401 #ifdef CONFIG_DRM_AMDGPU_SI 3402 case CHIP_HAINAN: 3403 #endif 3404 case CHIP_TOPAZ: 3405 /* chips with no display hardware */ 3406 return false; 3407 #if defined(CONFIG_DRM_AMD_DC) 3408 case CHIP_TAHITI: 3409 case CHIP_PITCAIRN: 3410 case CHIP_VERDE: 3411 case CHIP_OLAND: 3412 return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI); 3413 default: 3414 return amdgpu_dc != 0; 3415 #else 3416 default: 3417 if (amdgpu_dc > 0) 3418 dev_info_once( 3419 &pdev->dev, 3420 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3421 return false; 3422 #endif 3423 } 3424 } 3425 3426 /** 3427 * amdgpu_device_has_dc_support - check if dc is supported 3428 * 3429 * @adev: amdgpu_device pointer 3430 * 3431 * Returns true for supported, false for not supported 3432 */ 3433 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3434 { 3435 if (adev->enable_virtual_display || 3436 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3437 return false; 3438 3439 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 3440 } 3441 3442 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3443 { 3444 struct amdgpu_device *adev = 3445 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3446 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3447 3448 /* It's a bug to not have a hive within this function */ 3449 if (WARN_ON(!hive)) 3450 return; 3451 3452 /* 3453 * Use task barrier to synchronize all xgmi reset works across the 3454 * hive. task_barrier_enter and task_barrier_exit will block 3455 * until all the threads running the xgmi reset works reach 3456 * those points. task_barrier_full will do both blocks. 3457 */ 3458 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3459 3460 task_barrier_enter(&hive->tb); 3461 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 3462 3463 if (adev->asic_reset_res) 3464 goto fail; 3465 3466 task_barrier_exit(&hive->tb); 3467 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 3468 3469 if (adev->asic_reset_res) 3470 goto fail; 3471 3472 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3473 } else { 3474 3475 task_barrier_full(&hive->tb); 3476 adev->asic_reset_res = amdgpu_asic_reset(adev); 3477 } 3478 3479 fail: 3480 if (adev->asic_reset_res) 3481 dev_warn(adev->dev, 3482 "ASIC reset failed with error, %d for drm dev, %s", 3483 adev->asic_reset_res, adev_to_drm(adev)->unique); 3484 amdgpu_put_xgmi_hive(hive); 3485 } 3486 3487 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3488 { 3489 char *input = amdgpu_lockup_timeout; 3490 char *timeout_setting = NULL; 3491 int index = 0; 3492 long timeout; 3493 int ret = 0; 3494 3495 /* By default timeout for all queues is 2 sec */ 3496 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 3497 adev->video_timeout = msecs_to_jiffies(2000); 3498 3499 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 3500 return 0; 3501 3502 while ((timeout_setting = strsep(&input, ",")) && 3503 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3504 ret = kstrtol(timeout_setting, 0, &timeout); 3505 if (ret) 3506 return ret; 3507 3508 if (timeout == 0) { 3509 index++; 3510 continue; 3511 } else if (timeout < 0) { 3512 timeout = MAX_SCHEDULE_TIMEOUT; 3513 dev_warn(adev->dev, "lockup timeout disabled"); 3514 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3515 } else { 3516 timeout = msecs_to_jiffies(timeout); 3517 } 3518 3519 switch (index++) { 3520 case 0: 3521 adev->gfx_timeout = timeout; 3522 break; 3523 case 1: 3524 adev->compute_timeout = timeout; 3525 break; 3526 case 2: 3527 adev->sdma_timeout = timeout; 3528 break; 3529 case 3: 3530 adev->video_timeout = timeout; 3531 break; 3532 default: 3533 break; 3534 } 3535 } 3536 3537 /* When only one value specified apply it to all queues. */ 3538 if (index == 1) 3539 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 3540 adev->video_timeout = timeout; 3541 3542 return ret; 3543 } 3544 3545 /** 3546 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3547 * 3548 * @adev: amdgpu_device pointer 3549 * 3550 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3551 */ 3552 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3553 { 3554 struct iommu_domain *domain; 3555 3556 domain = iommu_get_domain_for_dev(adev->dev); 3557 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3558 adev->ram_is_direct_mapped = true; 3559 } 3560 3561 #if defined(CONFIG_HSA_AMD_P2P) 3562 /** 3563 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 3564 * 3565 * @adev: amdgpu_device pointer 3566 * 3567 * return if IOMMU remapping bar address 3568 */ 3569 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 3570 { 3571 struct iommu_domain *domain; 3572 3573 domain = iommu_get_domain_for_dev(adev->dev); 3574 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 3575 domain->type == IOMMU_DOMAIN_DMA_FQ)) 3576 return true; 3577 3578 return false; 3579 } 3580 #endif 3581 3582 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3583 { 3584 if (amdgpu_mcbp == 1) 3585 adev->gfx.mcbp = true; 3586 else if (amdgpu_mcbp == 0) 3587 adev->gfx.mcbp = false; 3588 3589 if (amdgpu_sriov_vf(adev)) 3590 adev->gfx.mcbp = true; 3591 3592 if (adev->gfx.mcbp) 3593 dev_info(adev->dev, "MCBP is enabled\n"); 3594 } 3595 3596 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev) 3597 { 3598 int r; 3599 3600 r = amdgpu_atombios_sysfs_init(adev); 3601 if (r) 3602 drm_err(&adev->ddev, 3603 "registering atombios sysfs failed (%d).\n", r); 3604 3605 r = amdgpu_pm_sysfs_init(adev); 3606 if (r) 3607 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 3608 3609 r = amdgpu_ucode_sysfs_init(adev); 3610 if (r) { 3611 adev->ucode_sysfs_en = false; 3612 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 3613 } else 3614 adev->ucode_sysfs_en = true; 3615 3616 r = amdgpu_device_attr_sysfs_init(adev); 3617 if (r) 3618 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3619 3620 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 3621 if (r) 3622 dev_err(adev->dev, 3623 "Could not create amdgpu board attributes\n"); 3624 3625 amdgpu_fru_sysfs_init(adev); 3626 amdgpu_reg_state_sysfs_init(adev); 3627 amdgpu_xcp_sysfs_init(adev); 3628 amdgpu_uma_sysfs_init(adev); 3629 3630 return r; 3631 } 3632 3633 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev) 3634 { 3635 if (adev->pm.sysfs_initialized) 3636 amdgpu_pm_sysfs_fini(adev); 3637 if (adev->ucode_sysfs_en) 3638 amdgpu_ucode_sysfs_fini(adev); 3639 amdgpu_device_attr_sysfs_fini(adev); 3640 amdgpu_fru_sysfs_fini(adev); 3641 3642 amdgpu_reg_state_sysfs_fini(adev); 3643 amdgpu_xcp_sysfs_fini(adev); 3644 amdgpu_uma_sysfs_fini(adev); 3645 } 3646 3647 /** 3648 * amdgpu_device_init - initialize the driver 3649 * 3650 * @adev: amdgpu_device pointer 3651 * @flags: driver flags 3652 * 3653 * Initializes the driver info and hw (all asics). 3654 * Returns 0 for success or an error on failure. 3655 * Called at driver startup. 3656 */ 3657 int amdgpu_device_init(struct amdgpu_device *adev, 3658 uint32_t flags) 3659 { 3660 struct pci_dev *pdev = adev->pdev; 3661 int r, i; 3662 bool px = false; 3663 u32 max_MBps; 3664 int tmp; 3665 3666 adev->shutdown = false; 3667 adev->flags = flags; 3668 3669 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3670 adev->asic_type = amdgpu_force_asic_type; 3671 else 3672 adev->asic_type = flags & AMD_ASIC_MASK; 3673 3674 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3675 if (amdgpu_emu_mode == 1) 3676 adev->usec_timeout *= 10; 3677 adev->gmc.gart_size = 512 * 1024 * 1024; 3678 adev->accel_working = false; 3679 adev->num_rings = 0; 3680 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3681 adev->mman.buffer_funcs = NULL; 3682 adev->mman.buffer_funcs_ring = NULL; 3683 adev->vm_manager.vm_pte_funcs = NULL; 3684 adev->vm_manager.vm_pte_num_scheds = 0; 3685 adev->gmc.gmc_funcs = NULL; 3686 adev->harvest_ip_mask = 0x0; 3687 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3688 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3689 3690 amdgpu_reg_access_init(adev); 3691 3692 dev_info( 3693 adev->dev, 3694 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3695 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3696 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3697 3698 /* mutex initialization are all done here so we 3699 * can recall function without having locking issues 3700 */ 3701 mutex_init(&adev->firmware.mutex); 3702 mutex_init(&adev->pm.mutex); 3703 mutex_init(&adev->gfx.gpu_clock_mutex); 3704 mutex_init(&adev->srbm_mutex); 3705 mutex_init(&adev->gfx.pipe_reserve_mutex); 3706 mutex_init(&adev->gfx.gfx_off_mutex); 3707 mutex_init(&adev->gfx.partition_mutex); 3708 mutex_init(&adev->grbm_idx_mutex); 3709 mutex_init(&adev->mn_lock); 3710 mutex_init(&adev->virt.vf_errors.lock); 3711 hash_init(adev->mn_hash); 3712 mutex_init(&adev->psp.mutex); 3713 mutex_init(&adev->notifier_lock); 3714 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3715 mutex_init(&adev->benchmark_mutex); 3716 mutex_init(&adev->gfx.reset_sem_mutex); 3717 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 3718 mutex_init(&adev->enforce_isolation_mutex); 3719 for (i = 0; i < MAX_XCP; ++i) { 3720 adev->isolation[i].spearhead = dma_fence_get_stub(); 3721 amdgpu_sync_create(&adev->isolation[i].active); 3722 amdgpu_sync_create(&adev->isolation[i].prev); 3723 } 3724 mutex_init(&adev->gfx.userq_sch_mutex); 3725 mutex_init(&adev->gfx.workload_profile_mutex); 3726 mutex_init(&adev->vcn.workload_profile_mutex); 3727 3728 amdgpu_device_init_apu_flags(adev); 3729 3730 r = amdgpu_device_check_arguments(adev); 3731 if (r) 3732 return r; 3733 3734 spin_lock_init(&adev->mmio_idx_lock); 3735 spin_lock_init(&adev->mm_stats.lock); 3736 spin_lock_init(&adev->virt.rlcg_reg_lock); 3737 spin_lock_init(&adev->wb.lock); 3738 3739 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 3740 3741 INIT_LIST_HEAD(&adev->reset_list); 3742 3743 INIT_LIST_HEAD(&adev->ras_list); 3744 3745 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 3746 3747 xa_init(&adev->userq_doorbell_xa); 3748 3749 INIT_DELAYED_WORK(&adev->delayed_init_work, 3750 amdgpu_device_delayed_init_work_handler); 3751 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3752 amdgpu_device_delay_enable_gfx_off); 3753 /* 3754 * Initialize the enforce_isolation work structures for each XCP 3755 * partition. This work handler is responsible for enforcing shader 3756 * isolation on AMD GPUs. It counts the number of emitted fences for 3757 * each GFX and compute ring. If there are any fences, it schedules 3758 * the `enforce_isolation_work` to be run after a delay. If there are 3759 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 3760 * runqueue. 3761 */ 3762 for (i = 0; i < MAX_XCP; i++) { 3763 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 3764 amdgpu_gfx_enforce_isolation_handler); 3765 adev->gfx.enforce_isolation[i].adev = adev; 3766 adev->gfx.enforce_isolation[i].xcp_id = i; 3767 } 3768 3769 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3770 INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); 3771 3772 adev->gfx.gfx_off_req_count = 1; 3773 adev->gfx.gfx_off_residency = 0; 3774 adev->gfx.gfx_off_entrycount = 0; 3775 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3776 3777 atomic_set(&adev->throttling_logging_enabled, 1); 3778 /* 3779 * If throttling continues, logging will be performed every minute 3780 * to avoid log flooding. "-1" is subtracted since the thermal 3781 * throttling interrupt comes every second. Thus, the total logging 3782 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3783 * for throttling interrupt) = 60 seconds. 3784 */ 3785 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3786 3787 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3788 3789 /* Registers mapping */ 3790 /* TODO: block userspace mapping of io register */ 3791 if (adev->asic_type >= CHIP_BONAIRE) { 3792 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3793 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3794 } else { 3795 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3796 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3797 } 3798 3799 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3800 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3801 3802 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3803 if (!adev->rmmio) 3804 return -ENOMEM; 3805 3806 dev_info(adev->dev, "register mmio base: 0x%08X\n", 3807 (uint32_t)adev->rmmio_base); 3808 dev_info(adev->dev, "register mmio size: %u\n", 3809 (unsigned int)adev->rmmio_size); 3810 3811 /* 3812 * Reset domain needs to be present early, before XGMI hive discovered 3813 * (if any) and initialized to use reset sem and in_gpu reset flag 3814 * early on during init and before calling to RREG32. 3815 */ 3816 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 3817 if (!adev->reset_domain) 3818 return -ENOMEM; 3819 3820 /* detect hw virtualization here */ 3821 amdgpu_virt_init(adev); 3822 3823 amdgpu_device_get_pcie_info(adev); 3824 3825 r = amdgpu_device_get_job_timeout_settings(adev); 3826 if (r) { 3827 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3828 return r; 3829 } 3830 3831 amdgpu_device_set_mcbp(adev); 3832 3833 /* 3834 * By default, use default mode where all blocks are expected to be 3835 * initialized. At present a 'swinit' of blocks is required to be 3836 * completed before the need for a different level is detected. 3837 */ 3838 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 3839 /* early init functions */ 3840 r = amdgpu_device_ip_early_init(adev); 3841 if (r) 3842 return r; 3843 3844 /* 3845 * No need to remove conflicting FBs for non-display class devices. 3846 * This prevents the sysfb from being freed accidently. 3847 */ 3848 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 3849 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 3850 /* Get rid of things like offb */ 3851 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 3852 if (r) 3853 return r; 3854 } 3855 3856 /* Enable TMZ based on IP_VERSION */ 3857 amdgpu_gmc_tmz_set(adev); 3858 3859 if (amdgpu_sriov_vf(adev) && 3860 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 3861 /* VF MMIO access (except mailbox range) from CPU 3862 * will be blocked during sriov runtime 3863 */ 3864 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 3865 3866 amdgpu_gmc_noretry_set(adev); 3867 /* Need to get xgmi info early to decide the reset behavior*/ 3868 if (adev->gmc.xgmi.supported) { 3869 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3870 if (r) 3871 return r; 3872 } 3873 3874 /* enable PCIE atomic ops */ 3875 if (amdgpu_sriov_vf(adev)) { 3876 if (adev->virt.fw_reserve.p_pf2vf) 3877 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3878 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 3879 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3880 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 3881 * internal path natively support atomics, set have_atomics_support to true. 3882 */ 3883 } else if ((adev->flags & AMD_IS_APU && 3884 amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) || 3885 (adev->gmc.xgmi.connected_to_cpu && 3886 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 1, 0))) { 3887 adev->have_atomics_support = true; 3888 } else { 3889 adev->have_atomics_support = 3890 !pci_enable_atomic_ops_to_root(adev->pdev, 3891 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3892 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3893 } 3894 3895 if (!adev->have_atomics_support) 3896 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3897 3898 /* doorbell bar mapping and doorbell index init*/ 3899 amdgpu_doorbell_init(adev); 3900 3901 if (amdgpu_emu_mode == 1) { 3902 /* post the asic on emulation mode */ 3903 emu_soc_asic_init(adev); 3904 goto fence_driver_init; 3905 } 3906 3907 amdgpu_reset_init(adev); 3908 3909 /* detect if we are with an SRIOV vbios */ 3910 if (adev->bios) 3911 amdgpu_device_detect_sriov_bios(adev); 3912 3913 /* check if we need to reset the asic 3914 * E.g., driver was not cleanly unloaded previously, etc. 3915 */ 3916 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3917 if (adev->gmc.xgmi.num_physical_nodes) { 3918 dev_info(adev->dev, "Pending hive reset.\n"); 3919 amdgpu_set_init_level(adev, 3920 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3921 } else { 3922 tmp = amdgpu_reset_method; 3923 /* It should do a default reset when loading or reloading the driver, 3924 * regardless of the module parameter reset_method. 3925 */ 3926 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 3927 r = amdgpu_asic_reset(adev); 3928 amdgpu_reset_method = tmp; 3929 } 3930 3931 if (r) { 3932 dev_err(adev->dev, "asic reset on init failed\n"); 3933 goto failed; 3934 } 3935 } 3936 3937 /* Post card if necessary */ 3938 if (amdgpu_device_need_post(adev)) { 3939 if (!adev->bios) { 3940 dev_err(adev->dev, "no vBIOS found\n"); 3941 r = -EINVAL; 3942 goto failed; 3943 } 3944 dev_info(adev->dev, "GPU posting now...\n"); 3945 r = amdgpu_device_asic_init(adev); 3946 if (r) { 3947 dev_err(adev->dev, "gpu post error!\n"); 3948 goto failed; 3949 } 3950 } 3951 3952 if (adev->bios) { 3953 if (adev->is_atom_fw) { 3954 /* Initialize clocks */ 3955 r = amdgpu_atomfirmware_get_clock_info(adev); 3956 if (r) { 3957 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3958 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3959 goto failed; 3960 } 3961 } else { 3962 /* Initialize clocks */ 3963 r = amdgpu_atombios_get_clock_info(adev); 3964 if (r) { 3965 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3966 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3967 goto failed; 3968 } 3969 /* init i2c buses */ 3970 amdgpu_i2c_init(adev); 3971 } 3972 } 3973 3974 fence_driver_init: 3975 /* Fence driver */ 3976 r = amdgpu_fence_driver_sw_init(adev); 3977 if (r) { 3978 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3979 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3980 goto failed; 3981 } 3982 3983 /* init the mode config */ 3984 drm_mode_config_init(adev_to_drm(adev)); 3985 3986 r = amdgpu_device_ip_init(adev); 3987 if (r) { 3988 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3989 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3990 goto release_ras_con; 3991 } 3992 3993 amdgpu_fence_driver_hw_init(adev); 3994 3995 dev_info(adev->dev, 3996 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3997 adev->gfx.config.max_shader_engines, 3998 adev->gfx.config.max_sh_per_se, 3999 adev->gfx.config.max_cu_per_sh, 4000 adev->gfx.cu_info.number); 4001 4002 adev->accel_working = true; 4003 4004 amdgpu_vm_check_compute_bug(adev); 4005 4006 /* Initialize the buffer migration limit. */ 4007 if (amdgpu_moverate >= 0) 4008 max_MBps = amdgpu_moverate; 4009 else 4010 max_MBps = 8; /* Allow 8 MB/s. */ 4011 /* Get a log2 for easy divisions. */ 4012 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4013 4014 /* 4015 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4016 * Otherwise the mgpu fan boost feature will be skipped due to the 4017 * gpu instance is counted less. 4018 */ 4019 amdgpu_register_gpu_instance(adev); 4020 4021 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4022 * explicit gating rather than handling it automatically. 4023 */ 4024 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4025 r = amdgpu_device_ip_late_init(adev); 4026 if (r) { 4027 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4028 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4029 goto release_ras_con; 4030 } 4031 /* must succeed. */ 4032 amdgpu_ras_resume(adev); 4033 queue_delayed_work(system_wq, &adev->delayed_init_work, 4034 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4035 } 4036 4037 if (amdgpu_sriov_vf(adev)) { 4038 amdgpu_virt_release_full_gpu(adev, true); 4039 flush_delayed_work(&adev->delayed_init_work); 4040 } 4041 4042 /* Don't init kfd if whole hive need to be reset during init */ 4043 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4044 kgd2kfd_init_zone_device(adev); 4045 kfd_update_svm_support_properties(adev); 4046 } 4047 4048 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4049 amdgpu_xgmi_reset_on_init(adev); 4050 4051 /* 4052 * Place those sysfs registering after `late_init`. As some of those 4053 * operations performed in `late_init` might affect the sysfs 4054 * interfaces creating. 4055 */ 4056 r = amdgpu_device_sys_interface_init(adev); 4057 4058 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4059 r = amdgpu_pmu_init(adev); 4060 if (r) 4061 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4062 4063 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4064 if (amdgpu_device_cache_pci_state(adev->pdev)) 4065 pci_restore_state(pdev); 4066 4067 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4068 /* this will fail for cards that aren't VGA class devices, just 4069 * ignore it 4070 */ 4071 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4072 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4073 4074 px = amdgpu_device_supports_px(adev); 4075 4076 if (px || (!dev_is_removable(&adev->pdev->dev) && 4077 apple_gmux_detect(NULL, NULL))) 4078 vga_switcheroo_register_client(adev->pdev, 4079 &amdgpu_switcheroo_ops, px); 4080 4081 if (px) 4082 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4083 4084 amdgpu_device_check_iommu_direct_map(adev); 4085 4086 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4087 r = register_pm_notifier(&adev->pm_nb); 4088 if (r) 4089 goto failed; 4090 4091 return 0; 4092 4093 release_ras_con: 4094 if (amdgpu_sriov_vf(adev)) 4095 amdgpu_virt_release_full_gpu(adev, true); 4096 4097 /* failed in exclusive mode due to timeout */ 4098 if (amdgpu_sriov_vf(adev) && 4099 !amdgpu_sriov_runtime(adev) && 4100 amdgpu_virt_mmio_blocked(adev) && 4101 !amdgpu_virt_wait_reset(adev)) { 4102 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4103 /* Don't send request since VF is inactive. */ 4104 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4105 adev->virt.ops = NULL; 4106 r = -EAGAIN; 4107 } 4108 amdgpu_release_ras_context(adev); 4109 4110 failed: 4111 amdgpu_vf_error_trans_all(adev); 4112 4113 return r; 4114 } 4115 4116 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4117 { 4118 4119 /* Clear all CPU mappings pointing to this device */ 4120 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4121 4122 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4123 amdgpu_doorbell_fini(adev); 4124 4125 iounmap(adev->rmmio); 4126 adev->rmmio = NULL; 4127 if (adev->mman.aper_base_kaddr) 4128 iounmap(adev->mman.aper_base_kaddr); 4129 adev->mman.aper_base_kaddr = NULL; 4130 4131 /* Memory manager related */ 4132 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4133 arch_phys_wc_del(adev->gmc.vram_mtrr); 4134 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4135 } 4136 } 4137 4138 /** 4139 * amdgpu_device_fini_hw - tear down the driver 4140 * 4141 * @adev: amdgpu_device pointer 4142 * 4143 * Tear down the driver info (all asics). 4144 * Called at driver shutdown. 4145 */ 4146 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4147 { 4148 dev_info(adev->dev, "finishing device.\n"); 4149 flush_delayed_work(&adev->delayed_init_work); 4150 4151 if (adev->mman.initialized) 4152 drain_workqueue(adev->mman.bdev.wq); 4153 adev->shutdown = true; 4154 4155 unregister_pm_notifier(&adev->pm_nb); 4156 4157 /* make sure IB test finished before entering exclusive mode 4158 * to avoid preemption on IB test 4159 */ 4160 if (amdgpu_sriov_vf(adev)) { 4161 amdgpu_virt_request_full_gpu(adev, false); 4162 amdgpu_virt_fini_data_exchange(adev); 4163 } 4164 4165 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 4166 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 4167 4168 /* disable all interrupts */ 4169 amdgpu_irq_disable_all(adev); 4170 if (adev->mode_info.mode_config_initialized) { 4171 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4172 drm_helper_force_disable_all(adev_to_drm(adev)); 4173 else 4174 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4175 } 4176 amdgpu_fence_driver_hw_fini(adev); 4177 4178 amdgpu_device_sys_interface_fini(adev); 4179 4180 /* disable ras feature must before hw fini */ 4181 amdgpu_ras_pre_fini(adev); 4182 4183 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4184 4185 /* 4186 * device went through surprise hotplug; we need to destroy topology 4187 * before ip_fini_early to prevent kfd locking refcount issues by calling 4188 * amdgpu_amdkfd_suspend() 4189 */ 4190 if (pci_dev_is_disconnected(adev->pdev)) 4191 amdgpu_amdkfd_device_fini_sw(adev); 4192 4193 amdgpu_device_ip_fini_early(adev); 4194 4195 amdgpu_irq_fini_hw(adev); 4196 4197 if (adev->mman.initialized) 4198 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4199 4200 amdgpu_gart_dummy_page_fini(adev); 4201 4202 if (pci_dev_is_disconnected(adev->pdev)) 4203 amdgpu_device_unmap_mmio(adev); 4204 4205 } 4206 4207 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4208 { 4209 int i, idx; 4210 bool px; 4211 4212 amdgpu_device_ip_fini(adev); 4213 amdgpu_fence_driver_sw_fini(adev); 4214 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4215 adev->accel_working = false; 4216 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4217 for (i = 0; i < MAX_XCP; ++i) { 4218 dma_fence_put(adev->isolation[i].spearhead); 4219 amdgpu_sync_free(&adev->isolation[i].active); 4220 amdgpu_sync_free(&adev->isolation[i].prev); 4221 } 4222 4223 amdgpu_reset_fini(adev); 4224 4225 /* free i2c buses */ 4226 amdgpu_i2c_fini(adev); 4227 4228 if (adev->bios) { 4229 if (amdgpu_emu_mode != 1) 4230 amdgpu_atombios_fini(adev); 4231 amdgpu_bios_release(adev); 4232 } 4233 4234 kfree(adev->fru_info); 4235 adev->fru_info = NULL; 4236 4237 kfree(adev->xcp_mgr); 4238 adev->xcp_mgr = NULL; 4239 4240 px = amdgpu_device_supports_px(adev); 4241 4242 if (px || (!dev_is_removable(&adev->pdev->dev) && 4243 apple_gmux_detect(NULL, NULL))) 4244 vga_switcheroo_unregister_client(adev->pdev); 4245 4246 if (px) 4247 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4248 4249 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4250 vga_client_unregister(adev->pdev); 4251 4252 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4253 4254 iounmap(adev->rmmio); 4255 adev->rmmio = NULL; 4256 drm_dev_exit(idx); 4257 } 4258 4259 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4260 amdgpu_pmu_fini(adev); 4261 if (adev->discovery.bin) 4262 amdgpu_discovery_fini(adev); 4263 4264 amdgpu_reset_put_reset_domain(adev->reset_domain); 4265 adev->reset_domain = NULL; 4266 4267 kfree(adev->pci_state); 4268 kfree(adev->pcie_reset_ctx.swds_pcistate); 4269 kfree(adev->pcie_reset_ctx.swus_pcistate); 4270 } 4271 4272 /** 4273 * amdgpu_device_evict_resources - evict device resources 4274 * @adev: amdgpu device object 4275 * 4276 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4277 * of the vram memory type. Mainly used for evicting device resources 4278 * at suspend time. 4279 * 4280 */ 4281 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4282 { 4283 int ret; 4284 4285 /* No need to evict vram on APUs unless going to S4 */ 4286 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4287 return 0; 4288 4289 /* No need to evict when going to S5 through S4 callbacks */ 4290 if (system_state == SYSTEM_POWER_OFF) 4291 return 0; 4292 4293 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4294 if (ret) { 4295 dev_warn(adev->dev, "evicting device resources failed\n"); 4296 return ret; 4297 } 4298 4299 if (adev->in_s4) { 4300 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 4301 if (ret) 4302 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 4303 } 4304 return ret; 4305 } 4306 4307 /* 4308 * Suspend & resume. 4309 */ 4310 /** 4311 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4312 * @nb: notifier block 4313 * @mode: suspend mode 4314 * @data: data 4315 * 4316 * This function is called when the system is about to suspend or hibernate. 4317 * It is used to set the appropriate flags so that eviction can be optimized 4318 * in the pm prepare callback. 4319 */ 4320 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4321 void *data) 4322 { 4323 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4324 4325 switch (mode) { 4326 case PM_HIBERNATION_PREPARE: 4327 adev->in_s4 = true; 4328 break; 4329 case PM_POST_HIBERNATION: 4330 adev->in_s4 = false; 4331 break; 4332 } 4333 4334 return NOTIFY_DONE; 4335 } 4336 4337 /** 4338 * amdgpu_device_prepare - prepare for device suspend 4339 * 4340 * @dev: drm dev pointer 4341 * 4342 * Prepare to put the hw in the suspend state (all asics). 4343 * Returns 0 for success or an error on failure. 4344 * Called at driver suspend. 4345 */ 4346 int amdgpu_device_prepare(struct drm_device *dev) 4347 { 4348 struct amdgpu_device *adev = drm_to_adev(dev); 4349 int i, r; 4350 4351 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4352 return 0; 4353 4354 /* Evict the majority of BOs before starting suspend sequence */ 4355 r = amdgpu_device_evict_resources(adev); 4356 if (r) 4357 return r; 4358 4359 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4360 4361 for (i = 0; i < adev->num_ip_blocks; i++) { 4362 if (!adev->ip_blocks[i].status.valid) 4363 continue; 4364 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4365 continue; 4366 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4367 if (r) 4368 return r; 4369 } 4370 4371 return 0; 4372 } 4373 4374 /** 4375 * amdgpu_device_complete - complete power state transition 4376 * 4377 * @dev: drm dev pointer 4378 * 4379 * Undo the changes from amdgpu_device_prepare. This will be 4380 * called on all resume transitions, including those that failed. 4381 */ 4382 void amdgpu_device_complete(struct drm_device *dev) 4383 { 4384 struct amdgpu_device *adev = drm_to_adev(dev); 4385 int i; 4386 4387 for (i = 0; i < adev->num_ip_blocks; i++) { 4388 if (!adev->ip_blocks[i].status.valid) 4389 continue; 4390 if (!adev->ip_blocks[i].version->funcs->complete) 4391 continue; 4392 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 4393 } 4394 } 4395 4396 /** 4397 * amdgpu_device_suspend - initiate device suspend 4398 * 4399 * @dev: drm dev pointer 4400 * @notify_clients: notify in-kernel DRM clients 4401 * 4402 * Puts the hw in the suspend state (all asics). 4403 * Returns 0 for success or an error on failure. 4404 * Called at driver suspend. 4405 */ 4406 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4407 { 4408 struct amdgpu_device *adev = drm_to_adev(dev); 4409 int r, rec; 4410 4411 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4412 return 0; 4413 4414 adev->in_suspend = true; 4415 4416 if (amdgpu_sriov_vf(adev)) { 4417 if (!adev->in_runpm) 4418 amdgpu_amdkfd_suspend_process(adev); 4419 amdgpu_virt_fini_data_exchange(adev); 4420 r = amdgpu_virt_request_full_gpu(adev, false); 4421 if (r) 4422 return r; 4423 } 4424 4425 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 4426 if (r) 4427 goto unwind_sriov; 4428 4429 if (notify_clients) 4430 drm_client_dev_suspend(adev_to_drm(adev)); 4431 4432 cancel_delayed_work_sync(&adev->delayed_init_work); 4433 4434 amdgpu_ras_suspend(adev); 4435 4436 r = amdgpu_device_ip_suspend_phase1(adev); 4437 if (r) 4438 goto unwind_smartshift; 4439 4440 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 4441 r = amdgpu_userq_suspend(adev); 4442 if (r) 4443 goto unwind_ip_phase1; 4444 4445 r = amdgpu_device_evict_resources(adev); 4446 if (r) 4447 goto unwind_userq; 4448 4449 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4450 4451 amdgpu_fence_driver_hw_fini(adev); 4452 4453 r = amdgpu_device_ip_suspend_phase2(adev); 4454 if (r) 4455 goto unwind_evict; 4456 4457 if (amdgpu_sriov_vf(adev)) 4458 amdgpu_virt_release_full_gpu(adev, false); 4459 4460 return 0; 4461 4462 unwind_evict: 4463 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4464 amdgpu_fence_driver_hw_init(adev); 4465 4466 unwind_userq: 4467 rec = amdgpu_userq_resume(adev); 4468 if (rec) { 4469 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 4470 return r; 4471 } 4472 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 4473 if (rec) { 4474 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 4475 return r; 4476 } 4477 4478 unwind_ip_phase1: 4479 /* suspend phase 1 = resume phase 3 */ 4480 rec = amdgpu_device_ip_resume_phase3(adev); 4481 if (rec) { 4482 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 4483 return r; 4484 } 4485 4486 unwind_smartshift: 4487 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 4488 if (rec) { 4489 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 4490 return r; 4491 } 4492 4493 if (notify_clients) 4494 drm_client_dev_resume(adev_to_drm(adev)); 4495 4496 amdgpu_ras_resume(adev); 4497 4498 unwind_sriov: 4499 if (amdgpu_sriov_vf(adev)) { 4500 rec = amdgpu_virt_request_full_gpu(adev, true); 4501 if (rec) { 4502 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 4503 return r; 4504 } 4505 } 4506 4507 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 4508 4509 return r; 4510 } 4511 4512 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 4513 { 4514 int r; 4515 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 4516 4517 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 4518 * may not work. The access could be blocked by nBIF protection as VF isn't in 4519 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 4520 * so that QEMU reprograms MSIX table. 4521 */ 4522 amdgpu_restore_msix(adev); 4523 4524 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4525 if (r) 4526 return r; 4527 4528 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 4529 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 4530 4531 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 4532 adev->vm_manager.vram_base_offset += 4533 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 4534 4535 return 0; 4536 } 4537 4538 /** 4539 * amdgpu_device_resume - initiate device resume 4540 * 4541 * @dev: drm dev pointer 4542 * @notify_clients: notify in-kernel DRM clients 4543 * 4544 * Bring the hw back to operating state (all asics). 4545 * Returns 0 for success or an error on failure. 4546 * Called at driver resume. 4547 */ 4548 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 4549 { 4550 struct amdgpu_device *adev = drm_to_adev(dev); 4551 int r = 0; 4552 4553 if (amdgpu_sriov_vf(adev)) { 4554 r = amdgpu_virt_request_full_gpu(adev, true); 4555 if (r) 4556 return r; 4557 } 4558 4559 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 4560 r = amdgpu_virt_resume(adev); 4561 if (r) 4562 goto exit; 4563 } 4564 4565 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4566 return 0; 4567 4568 if (adev->in_s0ix) 4569 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4570 4571 /* post card */ 4572 if (amdgpu_device_need_post(adev)) { 4573 r = amdgpu_device_asic_init(adev); 4574 if (r) 4575 dev_err(adev->dev, "amdgpu asic init failed\n"); 4576 } 4577 4578 r = amdgpu_device_ip_resume(adev); 4579 4580 if (r) { 4581 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4582 goto exit; 4583 } 4584 4585 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 4586 if (r) 4587 goto exit; 4588 4589 r = amdgpu_userq_resume(adev); 4590 if (r) 4591 goto exit; 4592 4593 r = amdgpu_device_ip_late_init(adev); 4594 if (r) 4595 goto exit; 4596 4597 queue_delayed_work(system_wq, &adev->delayed_init_work, 4598 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4599 exit: 4600 if (amdgpu_sriov_vf(adev)) { 4601 amdgpu_virt_init_data_exchange(adev); 4602 amdgpu_virt_release_full_gpu(adev, true); 4603 4604 if (!r && !adev->in_runpm) 4605 r = amdgpu_amdkfd_resume_process(adev); 4606 } 4607 4608 if (r) 4609 return r; 4610 4611 /* Make sure IB tests flushed */ 4612 flush_delayed_work(&adev->delayed_init_work); 4613 4614 if (notify_clients) 4615 drm_client_dev_resume(adev_to_drm(adev)); 4616 4617 amdgpu_ras_resume(adev); 4618 4619 if (adev->mode_info.num_crtc) { 4620 /* 4621 * Most of the connector probing functions try to acquire runtime pm 4622 * refs to ensure that the GPU is powered on when connector polling is 4623 * performed. Since we're calling this from a runtime PM callback, 4624 * trying to acquire rpm refs will cause us to deadlock. 4625 * 4626 * Since we're guaranteed to be holding the rpm lock, it's safe to 4627 * temporarily disable the rpm helpers so this doesn't deadlock us. 4628 */ 4629 #ifdef CONFIG_PM 4630 dev->dev->power.disable_depth++; 4631 #endif 4632 if (!adev->dc_enabled) 4633 drm_helper_hpd_irq_event(dev); 4634 else 4635 drm_kms_helper_hotplug_event(dev); 4636 #ifdef CONFIG_PM 4637 dev->dev->power.disable_depth--; 4638 #endif 4639 } 4640 4641 amdgpu_vram_mgr_clear_reset_blocks(adev); 4642 adev->in_suspend = false; 4643 4644 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 4645 dev_warn(adev->dev, "smart shift update failed\n"); 4646 4647 return 0; 4648 } 4649 4650 /** 4651 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4652 * 4653 * @adev: amdgpu_device pointer 4654 * 4655 * The list of all the hardware IPs that make up the asic is walked and 4656 * the check_soft_reset callbacks are run. check_soft_reset determines 4657 * if the asic is still hung or not. 4658 * Returns true if any of the IPs are still in a hung state, false if not. 4659 */ 4660 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4661 { 4662 int i; 4663 bool asic_hang = false; 4664 4665 if (amdgpu_sriov_vf(adev)) 4666 return true; 4667 4668 if (amdgpu_asic_need_full_reset(adev)) 4669 return true; 4670 4671 for (i = 0; i < adev->num_ip_blocks; i++) { 4672 if (!adev->ip_blocks[i].status.valid) 4673 continue; 4674 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4675 adev->ip_blocks[i].status.hang = 4676 adev->ip_blocks[i].version->funcs->check_soft_reset( 4677 &adev->ip_blocks[i]); 4678 if (adev->ip_blocks[i].status.hang) { 4679 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4680 asic_hang = true; 4681 } 4682 } 4683 return asic_hang; 4684 } 4685 4686 /** 4687 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4688 * 4689 * @adev: amdgpu_device pointer 4690 * 4691 * The list of all the hardware IPs that make up the asic is walked and the 4692 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4693 * handles any IP specific hardware or software state changes that are 4694 * necessary for a soft reset to succeed. 4695 * Returns 0 on success, negative error code on failure. 4696 */ 4697 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4698 { 4699 int i, r = 0; 4700 4701 for (i = 0; i < adev->num_ip_blocks; i++) { 4702 if (!adev->ip_blocks[i].status.valid) 4703 continue; 4704 if (adev->ip_blocks[i].status.hang && 4705 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4706 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 4707 if (r) 4708 return r; 4709 } 4710 } 4711 4712 return 0; 4713 } 4714 4715 /** 4716 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4717 * 4718 * @adev: amdgpu_device pointer 4719 * 4720 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4721 * reset is necessary to recover. 4722 * Returns true if a full asic reset is required, false if not. 4723 */ 4724 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4725 { 4726 int i; 4727 4728 if (amdgpu_asic_need_full_reset(adev)) 4729 return true; 4730 4731 for (i = 0; i < adev->num_ip_blocks; i++) { 4732 if (!adev->ip_blocks[i].status.valid) 4733 continue; 4734 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4735 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4736 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4737 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4738 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4739 if (adev->ip_blocks[i].status.hang) { 4740 dev_info(adev->dev, "Some block need full reset!\n"); 4741 return true; 4742 } 4743 } 4744 } 4745 return false; 4746 } 4747 4748 /** 4749 * amdgpu_device_ip_soft_reset - do a soft reset 4750 * 4751 * @adev: amdgpu_device pointer 4752 * 4753 * The list of all the hardware IPs that make up the asic is walked and the 4754 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4755 * IP specific hardware or software state changes that are necessary to soft 4756 * reset the IP. 4757 * Returns 0 on success, negative error code on failure. 4758 */ 4759 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4760 { 4761 int i, r = 0; 4762 4763 for (i = 0; i < adev->num_ip_blocks; i++) { 4764 if (!adev->ip_blocks[i].status.valid) 4765 continue; 4766 if (adev->ip_blocks[i].status.hang && 4767 adev->ip_blocks[i].version->funcs->soft_reset) { 4768 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 4769 if (r) 4770 return r; 4771 } 4772 } 4773 4774 return 0; 4775 } 4776 4777 /** 4778 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4779 * 4780 * @adev: amdgpu_device pointer 4781 * 4782 * The list of all the hardware IPs that make up the asic is walked and the 4783 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4784 * handles any IP specific hardware or software state changes that are 4785 * necessary after the IP has been soft reset. 4786 * Returns 0 on success, negative error code on failure. 4787 */ 4788 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4789 { 4790 int i, r = 0; 4791 4792 for (i = 0; i < adev->num_ip_blocks; i++) { 4793 if (!adev->ip_blocks[i].status.valid) 4794 continue; 4795 if (adev->ip_blocks[i].status.hang && 4796 adev->ip_blocks[i].version->funcs->post_soft_reset) 4797 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 4798 if (r) 4799 return r; 4800 } 4801 4802 return 0; 4803 } 4804 4805 /** 4806 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4807 * 4808 * @adev: amdgpu_device pointer 4809 * @reset_context: amdgpu reset context pointer 4810 * 4811 * do VF FLR and reinitialize Asic 4812 * return 0 means succeeded otherwise failed 4813 */ 4814 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4815 struct amdgpu_reset_context *reset_context) 4816 { 4817 int r; 4818 struct amdgpu_hive_info *hive = NULL; 4819 4820 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 4821 if (!amdgpu_ras_get_fed_status(adev)) 4822 amdgpu_virt_ready_to_reset(adev); 4823 amdgpu_virt_wait_reset(adev); 4824 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 4825 r = amdgpu_virt_request_full_gpu(adev, true); 4826 } else { 4827 r = amdgpu_virt_reset_gpu(adev); 4828 } 4829 if (r) 4830 return r; 4831 4832 amdgpu_ras_clear_err_state(adev); 4833 amdgpu_irq_gpu_reset_resume_helper(adev); 4834 4835 /* some sw clean up VF needs to do before recover */ 4836 amdgpu_virt_post_reset(adev); 4837 4838 /* Resume IP prior to SMC */ 4839 r = amdgpu_device_ip_reinit_early_sriov(adev); 4840 if (r) 4841 return r; 4842 4843 amdgpu_virt_init_data_exchange(adev); 4844 4845 r = amdgpu_device_fw_loading(adev); 4846 if (r) 4847 return r; 4848 4849 /* now we are okay to resume SMC/CP/SDMA */ 4850 r = amdgpu_device_ip_reinit_late_sriov(adev); 4851 if (r) 4852 return r; 4853 4854 hive = amdgpu_get_xgmi_hive(adev); 4855 /* Update PSP FW topology after reset */ 4856 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4857 r = amdgpu_xgmi_update_topology(hive, adev); 4858 if (hive) 4859 amdgpu_put_xgmi_hive(hive); 4860 if (r) 4861 return r; 4862 4863 r = amdgpu_ib_ring_tests(adev); 4864 if (r) 4865 return r; 4866 4867 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 4868 amdgpu_inc_vram_lost(adev); 4869 4870 /* need to be called during full access so we can't do it later like 4871 * bare-metal does. 4872 */ 4873 amdgpu_amdkfd_post_reset(adev); 4874 amdgpu_virt_release_full_gpu(adev, true); 4875 4876 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 4877 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 4878 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 4879 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 4880 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 4881 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 4882 amdgpu_ras_resume(adev); 4883 4884 amdgpu_virt_ras_telemetry_post_reset(adev); 4885 4886 return 0; 4887 } 4888 4889 /** 4890 * amdgpu_device_has_job_running - check if there is any unfinished job 4891 * 4892 * @adev: amdgpu_device pointer 4893 * 4894 * check if there is any job running on the device when guest driver receives 4895 * FLR notification from host driver. If there are still jobs running, then 4896 * the guest driver will not respond the FLR reset. Instead, let the job hit 4897 * the timeout and guest driver then issue the reset request. 4898 */ 4899 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4900 { 4901 int i; 4902 4903 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4904 struct amdgpu_ring *ring = adev->rings[i]; 4905 4906 if (!amdgpu_ring_sched_ready(ring)) 4907 continue; 4908 4909 if (amdgpu_fence_count_emitted(ring)) 4910 return true; 4911 } 4912 return false; 4913 } 4914 4915 /** 4916 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4917 * 4918 * @adev: amdgpu_device pointer 4919 * 4920 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4921 * a hung GPU. 4922 */ 4923 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4924 { 4925 4926 if (amdgpu_gpu_recovery == 0) 4927 goto disabled; 4928 4929 /* Skip soft reset check in fatal error mode */ 4930 if (!amdgpu_ras_is_poison_mode_supported(adev)) 4931 return true; 4932 4933 if (amdgpu_sriov_vf(adev)) 4934 return true; 4935 4936 if (amdgpu_gpu_recovery == -1) { 4937 switch (adev->asic_type) { 4938 #ifdef CONFIG_DRM_AMDGPU_SI 4939 case CHIP_VERDE: 4940 case CHIP_TAHITI: 4941 case CHIP_PITCAIRN: 4942 case CHIP_OLAND: 4943 case CHIP_HAINAN: 4944 #endif 4945 #ifdef CONFIG_DRM_AMDGPU_CIK 4946 case CHIP_KAVERI: 4947 case CHIP_KABINI: 4948 case CHIP_MULLINS: 4949 #endif 4950 case CHIP_CARRIZO: 4951 case CHIP_STONEY: 4952 case CHIP_CYAN_SKILLFISH: 4953 goto disabled; 4954 default: 4955 break; 4956 } 4957 } 4958 4959 return true; 4960 4961 disabled: 4962 dev_info(adev->dev, "GPU recovery disabled.\n"); 4963 return false; 4964 } 4965 4966 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4967 { 4968 u32 i; 4969 int ret = 0; 4970 4971 if (adev->bios) 4972 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4973 4974 dev_info(adev->dev, "GPU mode1 reset\n"); 4975 4976 /* Cache the state before bus master disable. The saved config space 4977 * values are used in other cases like restore after mode-2 reset. 4978 */ 4979 amdgpu_device_cache_pci_state(adev->pdev); 4980 4981 /* disable BM */ 4982 pci_clear_master(adev->pdev); 4983 4984 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4985 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4986 ret = amdgpu_dpm_mode1_reset(adev); 4987 } else { 4988 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4989 ret = psp_gpu_reset(adev); 4990 } 4991 4992 if (ret) 4993 goto mode1_reset_failed; 4994 4995 /* enable mmio access after mode 1 reset completed */ 4996 adev->no_hw_access = false; 4997 4998 /* ensure no_hw_access is updated before we access hw */ 4999 smp_mb(); 5000 5001 amdgpu_device_load_pci_state(adev->pdev); 5002 ret = amdgpu_psp_wait_for_bootloader(adev); 5003 if (ret) 5004 goto mode1_reset_failed; 5005 5006 /* wait for asic to come out of reset */ 5007 for (i = 0; i < adev->usec_timeout; i++) { 5008 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5009 5010 if (memsize != 0xffffffff) 5011 break; 5012 udelay(1); 5013 } 5014 5015 if (i >= adev->usec_timeout) { 5016 ret = -ETIMEDOUT; 5017 goto mode1_reset_failed; 5018 } 5019 5020 if (adev->bios) 5021 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5022 5023 return 0; 5024 5025 mode1_reset_failed: 5026 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5027 return ret; 5028 } 5029 5030 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5031 { 5032 int ret = 0; 5033 5034 dev_info(adev->dev, "GPU link reset\n"); 5035 5036 if (!amdgpu_reset_in_dpc(adev)) 5037 ret = amdgpu_dpm_link_reset(adev); 5038 5039 if (ret) 5040 goto link_reset_failed; 5041 5042 ret = amdgpu_psp_wait_for_bootloader(adev); 5043 if (ret) 5044 goto link_reset_failed; 5045 5046 return 0; 5047 5048 link_reset_failed: 5049 dev_err(adev->dev, "GPU link reset failed\n"); 5050 return ret; 5051 } 5052 5053 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5054 struct amdgpu_reset_context *reset_context) 5055 { 5056 int i, r = 0; 5057 struct amdgpu_job *job = NULL; 5058 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5059 bool need_full_reset = 5060 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5061 5062 if (reset_context->reset_req_dev == adev) 5063 job = reset_context->job; 5064 5065 if (amdgpu_sriov_vf(adev)) 5066 amdgpu_virt_pre_reset(adev); 5067 5068 amdgpu_fence_driver_isr_toggle(adev, true); 5069 5070 /* block all schedulers and reset given job's ring */ 5071 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5072 struct amdgpu_ring *ring = adev->rings[i]; 5073 5074 if (!amdgpu_ring_sched_ready(ring)) 5075 continue; 5076 5077 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5078 amdgpu_fence_driver_force_completion(ring); 5079 } 5080 5081 amdgpu_fence_driver_isr_toggle(adev, false); 5082 5083 if (job && job->vm) 5084 drm_sched_increase_karma(&job->base); 5085 5086 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5087 /* If reset handler not implemented, continue; otherwise return */ 5088 if (r == -EOPNOTSUPP) 5089 r = 0; 5090 else 5091 return r; 5092 5093 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5094 if (!amdgpu_sriov_vf(adev)) { 5095 5096 if (!need_full_reset) 5097 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5098 5099 if (!need_full_reset && amdgpu_gpu_recovery && 5100 amdgpu_device_ip_check_soft_reset(adev)) { 5101 amdgpu_device_ip_pre_soft_reset(adev); 5102 r = amdgpu_device_ip_soft_reset(adev); 5103 amdgpu_device_ip_post_soft_reset(adev); 5104 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5105 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5106 need_full_reset = true; 5107 } 5108 } 5109 5110 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5111 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5112 /* Trigger ip dump before we reset the asic */ 5113 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5114 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5115 tmp_adev->ip_blocks[i].version->funcs 5116 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5117 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5118 } 5119 5120 if (need_full_reset) 5121 r = amdgpu_device_ip_suspend(adev); 5122 if (need_full_reset) 5123 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5124 else 5125 clear_bit(AMDGPU_NEED_FULL_RESET, 5126 &reset_context->flags); 5127 } 5128 5129 return r; 5130 } 5131 5132 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5133 { 5134 struct list_head *device_list_handle; 5135 bool full_reset, vram_lost = false; 5136 struct amdgpu_device *tmp_adev; 5137 int r, init_level; 5138 5139 device_list_handle = reset_context->reset_device_list; 5140 5141 if (!device_list_handle) 5142 return -EINVAL; 5143 5144 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5145 5146 /** 5147 * If it's reset on init, it's default init level, otherwise keep level 5148 * as recovery level. 5149 */ 5150 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5151 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5152 else 5153 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5154 5155 r = 0; 5156 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5157 amdgpu_set_init_level(tmp_adev, init_level); 5158 if (full_reset) { 5159 /* post card */ 5160 amdgpu_reset_set_dpc_status(tmp_adev, false); 5161 amdgpu_ras_clear_err_state(tmp_adev); 5162 r = amdgpu_device_asic_init(tmp_adev); 5163 if (r) { 5164 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5165 } else { 5166 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5167 5168 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5169 if (r) 5170 goto out; 5171 5172 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5173 5174 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5175 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5176 5177 if (vram_lost) { 5178 dev_info( 5179 tmp_adev->dev, 5180 "VRAM is lost due to GPU reset!\n"); 5181 amdgpu_inc_vram_lost(tmp_adev); 5182 } 5183 5184 r = amdgpu_device_fw_loading(tmp_adev); 5185 if (r) 5186 return r; 5187 5188 r = amdgpu_xcp_restore_partition_mode( 5189 tmp_adev->xcp_mgr); 5190 if (r) 5191 goto out; 5192 5193 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5194 if (r) 5195 goto out; 5196 5197 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5198 5199 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5200 if (r) 5201 goto out; 5202 5203 if (vram_lost) 5204 amdgpu_device_fill_reset_magic(tmp_adev); 5205 5206 /* 5207 * Add this ASIC as tracked as reset was already 5208 * complete successfully. 5209 */ 5210 amdgpu_register_gpu_instance(tmp_adev); 5211 5212 if (!reset_context->hive && 5213 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5214 amdgpu_xgmi_add_device(tmp_adev); 5215 5216 r = amdgpu_device_ip_late_init(tmp_adev); 5217 if (r) 5218 goto out; 5219 5220 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 5221 if (r) 5222 goto out; 5223 5224 drm_client_dev_resume(adev_to_drm(tmp_adev)); 5225 5226 /* 5227 * The GPU enters bad state once faulty pages 5228 * by ECC has reached the threshold, and ras 5229 * recovery is scheduled next. So add one check 5230 * here to break recovery if it indeed exceeds 5231 * bad page threshold, and remind user to 5232 * retire this GPU or setting one bigger 5233 * bad_page_threshold value to fix this once 5234 * probing driver again. 5235 */ 5236 if (!amdgpu_ras_is_rma(tmp_adev)) { 5237 /* must succeed. */ 5238 amdgpu_ras_resume(tmp_adev); 5239 } else { 5240 r = -EINVAL; 5241 goto out; 5242 } 5243 5244 /* Update PSP FW topology after reset */ 5245 if (reset_context->hive && 5246 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5247 r = amdgpu_xgmi_update_topology( 5248 reset_context->hive, tmp_adev); 5249 } 5250 } 5251 5252 out: 5253 if (!r) { 5254 /* IP init is complete now, set level as default */ 5255 amdgpu_set_init_level(tmp_adev, 5256 AMDGPU_INIT_LEVEL_DEFAULT); 5257 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5258 r = amdgpu_ib_ring_tests(tmp_adev); 5259 if (r) { 5260 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5261 r = -EAGAIN; 5262 goto end; 5263 } 5264 } 5265 5266 if (r) 5267 tmp_adev->asic_reset_res = r; 5268 } 5269 5270 end: 5271 return r; 5272 } 5273 5274 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5275 struct amdgpu_reset_context *reset_context) 5276 { 5277 struct amdgpu_device *tmp_adev = NULL; 5278 bool need_full_reset, skip_hw_reset; 5279 int r = 0; 5280 5281 /* Try reset handler method first */ 5282 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5283 reset_list); 5284 5285 reset_context->reset_device_list = device_list_handle; 5286 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5287 /* If reset handler not implemented, continue; otherwise return */ 5288 if (r == -EOPNOTSUPP) 5289 r = 0; 5290 else 5291 return r; 5292 5293 /* Reset handler not implemented, use the default method */ 5294 need_full_reset = 5295 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5296 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5297 5298 /* 5299 * ASIC reset has to be done on all XGMI hive nodes ASAP 5300 * to allow proper links negotiation in FW (within 1 sec) 5301 */ 5302 if (!skip_hw_reset && need_full_reset) { 5303 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5304 /* For XGMI run all resets in parallel to speed up the process */ 5305 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5306 if (!queue_work(system_unbound_wq, 5307 &tmp_adev->xgmi_reset_work)) 5308 r = -EALREADY; 5309 } else 5310 r = amdgpu_asic_reset(tmp_adev); 5311 5312 if (r) { 5313 dev_err(tmp_adev->dev, 5314 "ASIC reset failed with error, %d for drm dev, %s", 5315 r, adev_to_drm(tmp_adev)->unique); 5316 goto out; 5317 } 5318 } 5319 5320 /* For XGMI wait for all resets to complete before proceed */ 5321 if (!r) { 5322 list_for_each_entry(tmp_adev, device_list_handle, 5323 reset_list) { 5324 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5325 flush_work(&tmp_adev->xgmi_reset_work); 5326 r = tmp_adev->asic_reset_res; 5327 if (r) 5328 break; 5329 } 5330 } 5331 } 5332 } 5333 5334 if (!r && amdgpu_ras_intr_triggered()) { 5335 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5336 amdgpu_ras_reset_error_count(tmp_adev, 5337 AMDGPU_RAS_BLOCK__MMHUB); 5338 } 5339 5340 amdgpu_ras_intr_cleared(); 5341 } 5342 5343 r = amdgpu_device_reinit_after_reset(reset_context); 5344 if (r == -EAGAIN) 5345 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5346 else 5347 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5348 5349 out: 5350 return r; 5351 } 5352 5353 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5354 { 5355 5356 switch (amdgpu_asic_reset_method(adev)) { 5357 case AMD_RESET_METHOD_MODE1: 5358 case AMD_RESET_METHOD_LINK: 5359 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5360 break; 5361 case AMD_RESET_METHOD_MODE2: 5362 adev->mp1_state = PP_MP1_STATE_RESET; 5363 break; 5364 default: 5365 adev->mp1_state = PP_MP1_STATE_NONE; 5366 break; 5367 } 5368 } 5369 5370 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5371 { 5372 amdgpu_vf_error_trans_all(adev); 5373 adev->mp1_state = PP_MP1_STATE_NONE; 5374 } 5375 5376 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5377 { 5378 struct pci_dev *p = NULL; 5379 5380 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5381 adev->pdev->bus->number, 1); 5382 if (p) { 5383 pm_runtime_enable(&(p->dev)); 5384 pm_runtime_resume(&(p->dev)); 5385 } 5386 5387 pci_dev_put(p); 5388 } 5389 5390 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5391 { 5392 enum amd_reset_method reset_method; 5393 struct pci_dev *p = NULL; 5394 u64 expires; 5395 5396 /* 5397 * For now, only BACO and mode1 reset are confirmed 5398 * to suffer the audio issue without proper suspended. 5399 */ 5400 reset_method = amdgpu_asic_reset_method(adev); 5401 if ((reset_method != AMD_RESET_METHOD_BACO) && 5402 (reset_method != AMD_RESET_METHOD_MODE1)) 5403 return -EINVAL; 5404 5405 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5406 adev->pdev->bus->number, 1); 5407 if (!p) 5408 return -ENODEV; 5409 5410 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5411 if (!expires) 5412 /* 5413 * If we cannot get the audio device autosuspend delay, 5414 * a fixed 4S interval will be used. Considering 3S is 5415 * the audio controller default autosuspend delay setting. 5416 * 4S used here is guaranteed to cover that. 5417 */ 5418 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5419 5420 while (!pm_runtime_status_suspended(&(p->dev))) { 5421 if (!pm_runtime_suspend(&(p->dev))) 5422 break; 5423 5424 if (expires < ktime_get_mono_fast_ns()) { 5425 dev_warn(adev->dev, "failed to suspend display audio\n"); 5426 pci_dev_put(p); 5427 /* TODO: abort the succeeding gpu reset? */ 5428 return -ETIMEDOUT; 5429 } 5430 } 5431 5432 pm_runtime_disable(&(p->dev)); 5433 5434 pci_dev_put(p); 5435 return 0; 5436 } 5437 5438 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5439 { 5440 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5441 5442 #if defined(CONFIG_DEBUG_FS) 5443 if (!amdgpu_sriov_vf(adev)) 5444 cancel_work(&adev->reset_work); 5445 #endif 5446 cancel_work(&adev->userq_reset_work); 5447 5448 if (adev->kfd.dev) 5449 cancel_work(&adev->kfd.reset_work); 5450 5451 if (amdgpu_sriov_vf(adev)) 5452 cancel_work(&adev->virt.flr_work); 5453 5454 if (con && adev->ras_enabled) 5455 cancel_work(&con->recovery_work); 5456 5457 } 5458 5459 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5460 { 5461 struct amdgpu_device *tmp_adev; 5462 int ret = 0; 5463 5464 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5465 ret |= amdgpu_device_bus_status_check(tmp_adev); 5466 } 5467 5468 return ret; 5469 } 5470 5471 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 5472 struct list_head *device_list, 5473 struct amdgpu_hive_info *hive) 5474 { 5475 struct amdgpu_device *tmp_adev = NULL; 5476 5477 /* 5478 * Build list of devices to reset. 5479 * In case we are in XGMI hive mode, resort the device list 5480 * to put adev in the 1st position. 5481 */ 5482 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5483 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5484 list_add_tail(&tmp_adev->reset_list, device_list); 5485 if (adev->shutdown) 5486 tmp_adev->shutdown = true; 5487 if (amdgpu_reset_in_dpc(adev)) 5488 tmp_adev->pcie_reset_ctx.in_link_reset = true; 5489 } 5490 if (!list_is_first(&adev->reset_list, device_list)) 5491 list_rotate_to_front(&adev->reset_list, device_list); 5492 } else { 5493 list_add_tail(&adev->reset_list, device_list); 5494 } 5495 } 5496 5497 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 5498 struct list_head *device_list) 5499 { 5500 struct amdgpu_device *tmp_adev = NULL; 5501 5502 if (list_empty(device_list)) 5503 return; 5504 tmp_adev = 5505 list_first_entry(device_list, struct amdgpu_device, reset_list); 5506 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5507 } 5508 5509 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 5510 struct list_head *device_list) 5511 { 5512 struct amdgpu_device *tmp_adev = NULL; 5513 5514 if (list_empty(device_list)) 5515 return; 5516 tmp_adev = 5517 list_first_entry(device_list, struct amdgpu_device, reset_list); 5518 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5519 } 5520 5521 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 5522 struct amdgpu_job *job, 5523 struct amdgpu_reset_context *reset_context, 5524 struct list_head *device_list, 5525 struct amdgpu_hive_info *hive, 5526 bool need_emergency_restart) 5527 { 5528 struct amdgpu_device *tmp_adev = NULL; 5529 int i; 5530 5531 /* block all schedulers and reset given job's ring */ 5532 list_for_each_entry(tmp_adev, device_list, reset_list) { 5533 amdgpu_device_set_mp1_state(tmp_adev); 5534 5535 /* 5536 * Try to put the audio codec into suspend state 5537 * before gpu reset started. 5538 * 5539 * Due to the power domain of the graphics device 5540 * is shared with AZ power domain. Without this, 5541 * we may change the audio hardware from behind 5542 * the audio driver's back. That will trigger 5543 * some audio codec errors. 5544 */ 5545 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5546 tmp_adev->pcie_reset_ctx.audio_suspended = true; 5547 5548 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5549 5550 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5551 5552 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5553 5554 /* 5555 * Mark these ASICs to be reset as untracked first 5556 * And add them back after reset completed 5557 */ 5558 amdgpu_unregister_gpu_instance(tmp_adev); 5559 5560 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 5561 5562 /* disable ras on ALL IPs */ 5563 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 5564 amdgpu_device_ip_need_full_reset(tmp_adev)) 5565 amdgpu_ras_suspend(tmp_adev); 5566 5567 amdgpu_userq_pre_reset(tmp_adev); 5568 5569 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5570 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5571 5572 if (!amdgpu_ring_sched_ready(ring)) 5573 continue; 5574 5575 drm_sched_wqueue_stop(&ring->sched); 5576 5577 if (need_emergency_restart) 5578 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5579 } 5580 atomic_inc(&tmp_adev->gpu_reset_counter); 5581 } 5582 } 5583 5584 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 5585 struct list_head *device_list, 5586 struct amdgpu_reset_context *reset_context) 5587 { 5588 struct amdgpu_device *tmp_adev = NULL; 5589 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5590 int r = 0; 5591 5592 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5593 list_for_each_entry(tmp_adev, device_list, reset_list) { 5594 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5595 /*TODO Should we stop ?*/ 5596 if (r) { 5597 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5598 r, adev_to_drm(tmp_adev)->unique); 5599 tmp_adev->asic_reset_res = r; 5600 } 5601 } 5602 5603 /* Actual ASIC resets if needed.*/ 5604 /* Host driver will handle XGMI hive reset for SRIOV */ 5605 if (amdgpu_sriov_vf(adev)) { 5606 5607 /* Bail out of reset early */ 5608 if (amdgpu_ras_is_rma(adev)) 5609 return -ENODEV; 5610 5611 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5612 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5613 amdgpu_ras_set_fed(adev, true); 5614 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5615 } 5616 5617 r = amdgpu_device_reset_sriov(adev, reset_context); 5618 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5619 amdgpu_virt_release_full_gpu(adev, true); 5620 goto retry; 5621 } 5622 if (r) 5623 adev->asic_reset_res = r; 5624 } else { 5625 r = amdgpu_do_asic_reset(device_list, reset_context); 5626 if (r && r == -EAGAIN) 5627 goto retry; 5628 } 5629 5630 list_for_each_entry(tmp_adev, device_list, reset_list) { 5631 /* 5632 * Drop any pending non scheduler resets queued before reset is done. 5633 * Any reset scheduled after this point would be valid. Scheduler resets 5634 * were already dropped during drm_sched_stop and no new ones can come 5635 * in before drm_sched_start. 5636 */ 5637 amdgpu_device_stop_pending_resets(tmp_adev); 5638 } 5639 5640 return r; 5641 } 5642 5643 static int amdgpu_device_sched_resume(struct list_head *device_list, 5644 struct amdgpu_reset_context *reset_context, 5645 bool job_signaled) 5646 { 5647 struct amdgpu_device *tmp_adev = NULL; 5648 int i, r = 0; 5649 5650 /* Post ASIC reset for all devs .*/ 5651 list_for_each_entry(tmp_adev, device_list, reset_list) { 5652 5653 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5654 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5655 5656 if (!amdgpu_ring_sched_ready(ring)) 5657 continue; 5658 5659 drm_sched_wqueue_start(&ring->sched); 5660 } 5661 5662 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5663 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5664 5665 if (tmp_adev->asic_reset_res) { 5666 /* bad news, how to tell it to userspace ? 5667 * for ras error, we should report GPU bad status instead of 5668 * reset failure 5669 */ 5670 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 5671 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 5672 dev_info( 5673 tmp_adev->dev, 5674 "GPU reset(%d) failed with error %d\n", 5675 atomic_read( 5676 &tmp_adev->gpu_reset_counter), 5677 tmp_adev->asic_reset_res); 5678 amdgpu_vf_error_put(tmp_adev, 5679 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 5680 tmp_adev->asic_reset_res); 5681 if (!r) 5682 r = tmp_adev->asic_reset_res; 5683 tmp_adev->asic_reset_res = 0; 5684 } else { 5685 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 5686 atomic_read(&tmp_adev->gpu_reset_counter)); 5687 if (amdgpu_acpi_smart_shift_update(tmp_adev, 5688 AMDGPU_SS_DEV_D0)) 5689 dev_warn(tmp_adev->dev, 5690 "smart shift update failed\n"); 5691 } 5692 } 5693 5694 return r; 5695 } 5696 5697 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 5698 struct list_head *device_list, 5699 bool need_emergency_restart) 5700 { 5701 struct amdgpu_device *tmp_adev = NULL; 5702 5703 list_for_each_entry(tmp_adev, device_list, reset_list) { 5704 /* unlock kfd: SRIOV would do it separately */ 5705 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5706 amdgpu_amdkfd_post_reset(tmp_adev); 5707 5708 /* kfd_post_reset will do nothing if kfd device is not initialized, 5709 * need to bring up kfd here if it's not be initialized before 5710 */ 5711 if (!adev->kfd.init_complete) 5712 amdgpu_amdkfd_device_init(adev); 5713 5714 if (tmp_adev->pcie_reset_ctx.audio_suspended) 5715 amdgpu_device_resume_display_audio(tmp_adev); 5716 5717 amdgpu_device_unset_mp1_state(tmp_adev); 5718 5719 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5720 5721 } 5722 } 5723 5724 5725 /** 5726 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5727 * 5728 * @adev: amdgpu_device pointer 5729 * @job: which job trigger hang 5730 * @reset_context: amdgpu reset context pointer 5731 * 5732 * Attempt to reset the GPU if it has hung (all asics). 5733 * Attempt to do soft-reset or full-reset and reinitialize Asic 5734 * Returns 0 for success or an error on failure. 5735 */ 5736 5737 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5738 struct amdgpu_job *job, 5739 struct amdgpu_reset_context *reset_context) 5740 { 5741 struct list_head device_list; 5742 bool job_signaled = false; 5743 struct amdgpu_hive_info *hive = NULL; 5744 int r = 0; 5745 bool need_emergency_restart = false; 5746 /* save the pasid here as the job may be freed before the end of the reset */ 5747 int pasid = job ? job->pasid : -EINVAL; 5748 5749 /* 5750 * If it reaches here because of hang/timeout and a RAS error is 5751 * detected at the same time, let RAS recovery take care of it. 5752 */ 5753 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 5754 !amdgpu_sriov_vf(adev) && 5755 reset_context->src != AMDGPU_RESET_SRC_RAS) { 5756 dev_dbg(adev->dev, 5757 "Gpu recovery from source: %d yielding to RAS error recovery handling", 5758 reset_context->src); 5759 return 0; 5760 } 5761 5762 /* 5763 * Special case: RAS triggered and full reset isn't supported 5764 */ 5765 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5766 5767 /* 5768 * Flush RAM to disk so that after reboot 5769 * the user can read log and see why the system rebooted. 5770 */ 5771 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5772 amdgpu_ras_get_context(adev)->reboot) { 5773 dev_warn(adev->dev, "Emergency reboot."); 5774 5775 ksys_sync_helper(); 5776 emergency_restart(); 5777 } 5778 5779 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 5780 need_emergency_restart ? "jobs stop" : "reset", 5781 reset_context->src); 5782 5783 if (!amdgpu_sriov_vf(adev)) 5784 hive = amdgpu_get_xgmi_hive(adev); 5785 if (hive) 5786 mutex_lock(&hive->hive_lock); 5787 5788 reset_context->job = job; 5789 reset_context->hive = hive; 5790 INIT_LIST_HEAD(&device_list); 5791 5792 amdgpu_device_recovery_prepare(adev, &device_list, hive); 5793 5794 if (!amdgpu_sriov_vf(adev)) { 5795 r = amdgpu_device_health_check(&device_list); 5796 if (r) 5797 goto end_reset; 5798 } 5799 5800 /* Cannot be called after locking reset domain */ 5801 amdgpu_ras_pre_reset(adev, &device_list); 5802 5803 /* We need to lock reset domain only once both for XGMI and single device */ 5804 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 5805 5806 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 5807 hive, need_emergency_restart); 5808 if (need_emergency_restart) 5809 goto skip_sched_resume; 5810 /* 5811 * Must check guilty signal here since after this point all old 5812 * HW fences are force signaled. 5813 * 5814 * job->base holds a reference to parent fence 5815 */ 5816 if (job && (dma_fence_get_status(&job->hw_fence->base) > 0)) { 5817 job_signaled = true; 5818 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5819 goto skip_hw_reset; 5820 } 5821 5822 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 5823 if (r) 5824 goto reset_unlock; 5825 skip_hw_reset: 5826 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 5827 if (r) 5828 goto reset_unlock; 5829 skip_sched_resume: 5830 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 5831 reset_unlock: 5832 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 5833 amdgpu_ras_post_reset(adev, &device_list); 5834 end_reset: 5835 if (hive) { 5836 mutex_unlock(&hive->hive_lock); 5837 amdgpu_put_xgmi_hive(hive); 5838 } 5839 5840 if (r) 5841 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5842 5843 atomic_set(&adev->reset_domain->reset_res, r); 5844 5845 if (!r) { 5846 struct amdgpu_task_info *ti = NULL; 5847 5848 /* 5849 * The job may already be freed at this point via the sched tdr workqueue so 5850 * use the cached pasid. 5851 */ 5852 if (pasid >= 0) 5853 ti = amdgpu_vm_get_task_info_pasid(adev, pasid); 5854 5855 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 5856 ti ? &ti->task : NULL); 5857 5858 amdgpu_vm_put_task_info(ti); 5859 } 5860 5861 return r; 5862 } 5863 5864 /** 5865 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 5866 * 5867 * @adev: amdgpu_device pointer 5868 * @speed: pointer to the speed of the link 5869 * @width: pointer to the width of the link 5870 * 5871 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5872 * first physical partner to an AMD dGPU. 5873 * This will exclude any virtual switches and links. 5874 */ 5875 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 5876 enum pci_bus_speed *speed, 5877 enum pcie_link_width *width) 5878 { 5879 struct pci_dev *parent = adev->pdev; 5880 5881 if (!speed || !width) 5882 return; 5883 5884 *speed = PCI_SPEED_UNKNOWN; 5885 *width = PCIE_LNK_WIDTH_UNKNOWN; 5886 5887 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 5888 while ((parent = pci_upstream_bridge(parent))) { 5889 /* skip upstream/downstream switches internal to dGPU*/ 5890 if (parent->vendor == PCI_VENDOR_ID_ATI) 5891 continue; 5892 *speed = pcie_get_speed_cap(parent); 5893 *width = pcie_get_width_cap(parent); 5894 break; 5895 } 5896 } else { 5897 /* use the current speeds rather than max if switching is not supported */ 5898 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 5899 } 5900 } 5901 5902 /** 5903 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 5904 * 5905 * @adev: amdgpu_device pointer 5906 * @speed: pointer to the speed of the link 5907 * @width: pointer to the width of the link 5908 * 5909 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5910 * AMD dGPU which may be a virtual upstream bridge. 5911 */ 5912 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 5913 enum pci_bus_speed *speed, 5914 enum pcie_link_width *width) 5915 { 5916 struct pci_dev *parent = adev->pdev; 5917 5918 if (!speed || !width) 5919 return; 5920 5921 parent = pci_upstream_bridge(parent); 5922 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 5923 /* use the upstream/downstream switches internal to dGPU */ 5924 *speed = pcie_get_speed_cap(parent); 5925 *width = pcie_get_width_cap(parent); 5926 while ((parent = pci_upstream_bridge(parent))) { 5927 if (parent->vendor == PCI_VENDOR_ID_ATI) { 5928 /* use the upstream/downstream switches internal to dGPU */ 5929 *speed = pcie_get_speed_cap(parent); 5930 *width = pcie_get_width_cap(parent); 5931 } 5932 } 5933 } else { 5934 /* use the device itself */ 5935 *speed = pcie_get_speed_cap(adev->pdev); 5936 *width = pcie_get_width_cap(adev->pdev); 5937 } 5938 } 5939 5940 /** 5941 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5942 * 5943 * @adev: amdgpu_device pointer 5944 * 5945 * Fetches and stores in the driver the PCIE capabilities (gen speed 5946 * and lanes) of the slot the device is in. Handles APUs and 5947 * virtualized environments where PCIE config space may not be available. 5948 */ 5949 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5950 { 5951 enum pci_bus_speed speed_cap, platform_speed_cap; 5952 enum pcie_link_width platform_link_width, link_width; 5953 5954 if (amdgpu_pcie_gen_cap) 5955 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5956 5957 if (amdgpu_pcie_lane_cap) 5958 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5959 5960 /* covers APUs as well */ 5961 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5962 if (adev->pm.pcie_gen_mask == 0) 5963 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5964 if (adev->pm.pcie_mlw_mask == 0) 5965 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5966 return; 5967 } 5968 5969 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5970 return; 5971 5972 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 5973 &platform_link_width); 5974 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 5975 5976 if (adev->pm.pcie_gen_mask == 0) { 5977 /* asic caps */ 5978 if (speed_cap == PCI_SPEED_UNKNOWN) { 5979 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5980 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5981 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5982 } else { 5983 if (speed_cap == PCIE_SPEED_32_0GT) 5984 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5985 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5986 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5987 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5988 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5989 else if (speed_cap == PCIE_SPEED_16_0GT) 5990 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5991 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5992 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5993 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5994 else if (speed_cap == PCIE_SPEED_8_0GT) 5995 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5996 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5997 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5998 else if (speed_cap == PCIE_SPEED_5_0GT) 5999 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6000 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6001 else 6002 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6003 } 6004 /* platform caps */ 6005 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6006 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6007 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6008 } else { 6009 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6010 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6011 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6012 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6013 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6014 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6015 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6016 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6017 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6018 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6019 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6020 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6021 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6022 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6023 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6024 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6025 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6026 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6027 else 6028 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6029 6030 } 6031 } 6032 if (adev->pm.pcie_mlw_mask == 0) { 6033 /* asic caps */ 6034 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6035 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6036 } else { 6037 switch (link_width) { 6038 case PCIE_LNK_X32: 6039 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6040 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6041 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6042 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6043 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6044 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6045 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6046 break; 6047 case PCIE_LNK_X16: 6048 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6049 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6050 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6051 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6052 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6053 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6054 break; 6055 case PCIE_LNK_X12: 6056 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6057 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6058 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6059 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6060 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6061 break; 6062 case PCIE_LNK_X8: 6063 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6064 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6065 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6066 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6067 break; 6068 case PCIE_LNK_X4: 6069 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6070 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6071 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6072 break; 6073 case PCIE_LNK_X2: 6074 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6075 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6076 break; 6077 case PCIE_LNK_X1: 6078 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6079 break; 6080 default: 6081 break; 6082 } 6083 } 6084 /* platform caps */ 6085 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6086 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6087 } else { 6088 switch (platform_link_width) { 6089 case PCIE_LNK_X32: 6090 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6091 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6092 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6093 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6094 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6095 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6096 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6097 break; 6098 case PCIE_LNK_X16: 6099 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6100 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6101 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6102 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6103 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6104 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6105 break; 6106 case PCIE_LNK_X12: 6107 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6108 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6109 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6110 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6111 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6112 break; 6113 case PCIE_LNK_X8: 6114 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6115 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6116 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6117 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6118 break; 6119 case PCIE_LNK_X4: 6120 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6121 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6122 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6123 break; 6124 case PCIE_LNK_X2: 6125 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6126 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6127 break; 6128 case PCIE_LNK_X1: 6129 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6130 break; 6131 default: 6132 break; 6133 } 6134 } 6135 } 6136 } 6137 6138 /** 6139 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6140 * 6141 * @adev: amdgpu_device pointer 6142 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6143 * 6144 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6145 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6146 * @peer_adev. 6147 */ 6148 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6149 struct amdgpu_device *peer_adev) 6150 { 6151 #ifdef CONFIG_HSA_AMD_P2P 6152 bool p2p_access = 6153 !adev->gmc.xgmi.connected_to_cpu && 6154 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6155 if (!p2p_access) 6156 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6157 pci_name(peer_adev->pdev)); 6158 6159 bool is_large_bar = adev->gmc.visible_vram_size && 6160 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6161 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6162 6163 if (!p2p_addressable) { 6164 uint64_t address_mask = peer_adev->dev->dma_mask ? 6165 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6166 resource_size_t aper_limit = 6167 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6168 6169 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6170 aper_limit & address_mask); 6171 } 6172 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6173 #else 6174 return false; 6175 #endif 6176 } 6177 6178 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 6179 { 6180 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6181 6182 if (!amdgpu_device_supports_baco(adev)) 6183 return -ENOTSUPP; 6184 6185 if (ras && adev->ras_enabled && 6186 adev->nbio.funcs->enable_doorbell_interrupt) 6187 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6188 6189 return amdgpu_dpm_baco_enter(adev); 6190 } 6191 6192 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 6193 { 6194 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6195 int ret = 0; 6196 6197 if (!amdgpu_device_supports_baco(adev)) 6198 return -ENOTSUPP; 6199 6200 ret = amdgpu_dpm_baco_exit(adev); 6201 if (ret) 6202 return ret; 6203 6204 if (ras && adev->ras_enabled && 6205 adev->nbio.funcs->enable_doorbell_interrupt) 6206 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6207 6208 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6209 adev->nbio.funcs->clear_doorbell_interrupt) 6210 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6211 6212 return 0; 6213 } 6214 6215 /** 6216 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6217 * @pdev: PCI device struct 6218 * @state: PCI channel state 6219 * 6220 * Description: Called when a PCI error is detected. 6221 * 6222 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 6223 */ 6224 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6225 { 6226 struct drm_device *dev = pci_get_drvdata(pdev); 6227 struct amdgpu_device *adev = drm_to_adev(dev); 6228 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 6229 amdgpu_get_xgmi_hive(adev); 6230 struct amdgpu_reset_context reset_context; 6231 struct list_head device_list; 6232 6233 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6234 6235 adev->pci_channel_state = state; 6236 6237 switch (state) { 6238 case pci_channel_io_normal: 6239 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6240 return PCI_ERS_RESULT_CAN_RECOVER; 6241 case pci_channel_io_frozen: 6242 /* Fatal error, prepare for slot reset */ 6243 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6244 if (hive) { 6245 /* Hive devices should be able to support FW based 6246 * link reset on other devices, if not return. 6247 */ 6248 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6249 dev_warn(adev->dev, 6250 "No support for XGMI hive yet...\n"); 6251 return PCI_ERS_RESULT_DISCONNECT; 6252 } 6253 /* Set dpc status only if device is part of hive 6254 * Non-hive devices should be able to recover after 6255 * link reset. 6256 */ 6257 amdgpu_reset_set_dpc_status(adev, true); 6258 6259 mutex_lock(&hive->hive_lock); 6260 } 6261 memset(&reset_context, 0, sizeof(reset_context)); 6262 INIT_LIST_HEAD(&device_list); 6263 6264 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6265 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6266 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6267 hive, false); 6268 if (hive) 6269 mutex_unlock(&hive->hive_lock); 6270 return PCI_ERS_RESULT_NEED_RESET; 6271 case pci_channel_io_perm_failure: 6272 /* Permanent error, prepare for device removal */ 6273 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6274 return PCI_ERS_RESULT_DISCONNECT; 6275 } 6276 6277 return PCI_ERS_RESULT_NEED_RESET; 6278 } 6279 6280 /** 6281 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6282 * @pdev: pointer to PCI device 6283 */ 6284 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6285 { 6286 struct drm_device *dev = pci_get_drvdata(pdev); 6287 struct amdgpu_device *adev = drm_to_adev(dev); 6288 6289 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6290 6291 /* TODO - dump whatever for debugging purposes */ 6292 6293 /* This called only if amdgpu_pci_error_detected returns 6294 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6295 * works, no need to reset slot. 6296 */ 6297 6298 return PCI_ERS_RESULT_RECOVERED; 6299 } 6300 6301 /** 6302 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6303 * @pdev: PCI device struct 6304 * 6305 * Description: This routine is called by the pci error recovery 6306 * code after the PCI slot has been reset, just before we 6307 * should resume normal operations. 6308 */ 6309 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6310 { 6311 struct drm_device *dev = pci_get_drvdata(pdev); 6312 struct amdgpu_device *adev = drm_to_adev(dev); 6313 struct amdgpu_reset_context reset_context; 6314 struct amdgpu_device *tmp_adev; 6315 struct amdgpu_hive_info *hive; 6316 struct list_head device_list; 6317 struct pci_dev *link_dev; 6318 int r = 0, i, timeout; 6319 u32 memsize; 6320 u16 status; 6321 6322 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6323 6324 memset(&reset_context, 0, sizeof(reset_context)); 6325 INIT_LIST_HEAD(&device_list); 6326 hive = amdgpu_get_xgmi_hive(adev); 6327 if (hive) { 6328 mutex_lock(&hive->hive_lock); 6329 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 6330 list_add_tail(&tmp_adev->reset_list, &device_list); 6331 } else { 6332 list_add_tail(&adev->reset_list, &device_list); 6333 } 6334 6335 if (adev->pcie_reset_ctx.swus) 6336 link_dev = adev->pcie_reset_ctx.swus; 6337 else 6338 link_dev = adev->pdev; 6339 /* wait for asic to come out of reset, timeout = 10s */ 6340 timeout = 10000; 6341 do { 6342 usleep_range(10000, 10500); 6343 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 6344 timeout -= 10; 6345 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 6346 (status != PCI_VENDOR_ID_AMD)); 6347 6348 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 6349 r = -ETIME; 6350 goto out; 6351 } 6352 6353 amdgpu_device_load_switch_state(adev); 6354 /* Restore PCI confspace */ 6355 amdgpu_device_load_pci_state(pdev); 6356 6357 /* confirm ASIC came out of reset */ 6358 for (i = 0; i < adev->usec_timeout; i++) { 6359 memsize = amdgpu_asic_get_config_memsize(adev); 6360 6361 if (memsize != 0xffffffff) 6362 break; 6363 udelay(1); 6364 } 6365 if (memsize == 0xffffffff) { 6366 r = -ETIME; 6367 goto out; 6368 } 6369 6370 reset_context.method = AMD_RESET_METHOD_NONE; 6371 reset_context.reset_req_dev = adev; 6372 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6373 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 6374 6375 if (hive) { 6376 reset_context.hive = hive; 6377 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 6378 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6379 } else { 6380 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6381 } 6382 6383 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 6384 out: 6385 if (!r) { 6386 if (amdgpu_device_cache_pci_state(adev->pdev)) 6387 pci_restore_state(adev->pdev); 6388 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 6389 } else { 6390 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 6391 if (hive) { 6392 list_for_each_entry(tmp_adev, &device_list, reset_list) 6393 amdgpu_device_unset_mp1_state(tmp_adev); 6394 } 6395 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6396 } 6397 6398 if (hive) { 6399 mutex_unlock(&hive->hive_lock); 6400 amdgpu_put_xgmi_hive(hive); 6401 } 6402 6403 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6404 } 6405 6406 /** 6407 * amdgpu_pci_resume() - resume normal ops after PCI reset 6408 * @pdev: pointer to PCI device 6409 * 6410 * Called when the error recovery driver tells us that its 6411 * OK to resume normal operation. 6412 */ 6413 void amdgpu_pci_resume(struct pci_dev *pdev) 6414 { 6415 struct drm_device *dev = pci_get_drvdata(pdev); 6416 struct amdgpu_device *adev = drm_to_adev(dev); 6417 struct list_head device_list; 6418 struct amdgpu_hive_info *hive = NULL; 6419 struct amdgpu_device *tmp_adev = NULL; 6420 6421 dev_info(adev->dev, "PCI error: resume callback!!\n"); 6422 6423 /* Only continue execution for the case of pci_channel_io_frozen */ 6424 if (adev->pci_channel_state != pci_channel_io_frozen) 6425 return; 6426 6427 INIT_LIST_HEAD(&device_list); 6428 6429 hive = amdgpu_get_xgmi_hive(adev); 6430 if (hive) { 6431 mutex_lock(&hive->hive_lock); 6432 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6433 tmp_adev->pcie_reset_ctx.in_link_reset = false; 6434 list_add_tail(&tmp_adev->reset_list, &device_list); 6435 } 6436 } else 6437 list_add_tail(&adev->reset_list, &device_list); 6438 6439 amdgpu_device_sched_resume(&device_list, NULL, NULL); 6440 amdgpu_device_gpu_resume(adev, &device_list, false); 6441 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6442 6443 if (hive) { 6444 mutex_unlock(&hive->hive_lock); 6445 amdgpu_put_xgmi_hive(hive); 6446 } 6447 } 6448 6449 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 6450 { 6451 struct pci_dev *swus, *swds; 6452 int r; 6453 6454 swds = pci_upstream_bridge(adev->pdev); 6455 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 6456 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 6457 return; 6458 swus = pci_upstream_bridge(swds); 6459 if (!swus || 6460 (swus->vendor != PCI_VENDOR_ID_ATI && 6461 swus->vendor != PCI_VENDOR_ID_AMD) || 6462 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 6463 return; 6464 6465 /* If already saved, return */ 6466 if (adev->pcie_reset_ctx.swus) 6467 return; 6468 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 6469 r = pci_save_state(swds); 6470 if (r) 6471 return; 6472 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 6473 6474 r = pci_save_state(swus); 6475 if (r) 6476 return; 6477 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 6478 6479 adev->pcie_reset_ctx.swus = swus; 6480 } 6481 6482 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 6483 { 6484 struct pci_dev *pdev; 6485 int r; 6486 6487 if (!adev->pcie_reset_ctx.swds_pcistate || 6488 !adev->pcie_reset_ctx.swus_pcistate) 6489 return; 6490 6491 pdev = adev->pcie_reset_ctx.swus; 6492 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 6493 if (!r) { 6494 pci_restore_state(pdev); 6495 } else { 6496 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 6497 return; 6498 } 6499 6500 pdev = pci_upstream_bridge(adev->pdev); 6501 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 6502 if (!r) 6503 pci_restore_state(pdev); 6504 else 6505 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 6506 } 6507 6508 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6509 { 6510 struct drm_device *dev = pci_get_drvdata(pdev); 6511 struct amdgpu_device *adev = drm_to_adev(dev); 6512 int r; 6513 6514 if (amdgpu_sriov_vf(adev)) 6515 return false; 6516 6517 r = pci_save_state(pdev); 6518 if (!r) { 6519 kfree(adev->pci_state); 6520 6521 adev->pci_state = pci_store_saved_state(pdev); 6522 6523 if (!adev->pci_state) { 6524 dev_err(adev->dev, "Failed to store PCI saved state"); 6525 return false; 6526 } 6527 } else { 6528 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 6529 return false; 6530 } 6531 6532 amdgpu_device_cache_switch_state(adev); 6533 6534 return true; 6535 } 6536 6537 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6538 { 6539 struct drm_device *dev = pci_get_drvdata(pdev); 6540 struct amdgpu_device *adev = drm_to_adev(dev); 6541 int r; 6542 6543 if (!adev->pci_state) 6544 return false; 6545 6546 r = pci_load_saved_state(pdev, adev->pci_state); 6547 6548 if (!r) { 6549 pci_restore_state(pdev); 6550 } else { 6551 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 6552 return false; 6553 } 6554 6555 return true; 6556 } 6557 6558 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6559 struct amdgpu_ring *ring) 6560 { 6561 #ifdef CONFIG_X86_64 6562 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6563 return; 6564 #endif 6565 if (adev->gmc.xgmi.connected_to_cpu) 6566 return; 6567 6568 if (ring && ring->funcs->emit_hdp_flush) { 6569 amdgpu_ring_emit_hdp_flush(ring); 6570 return; 6571 } 6572 6573 if (!ring && amdgpu_sriov_runtime(adev)) { 6574 if (!amdgpu_kiq_hdp_flush(adev)) 6575 return; 6576 } 6577 6578 amdgpu_hdp_flush(adev, ring); 6579 } 6580 6581 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6582 struct amdgpu_ring *ring) 6583 { 6584 #ifdef CONFIG_X86_64 6585 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6586 return; 6587 #endif 6588 if (adev->gmc.xgmi.connected_to_cpu) 6589 return; 6590 6591 amdgpu_hdp_invalidate(adev, ring); 6592 } 6593 6594 int amdgpu_in_reset(struct amdgpu_device *adev) 6595 { 6596 return atomic_read(&adev->reset_domain->in_gpu_reset); 6597 } 6598 6599 /** 6600 * amdgpu_device_halt() - bring hardware to some kind of halt state 6601 * 6602 * @adev: amdgpu_device pointer 6603 * 6604 * Bring hardware to some kind of halt state so that no one can touch it 6605 * any more. It will help to maintain error context when error occurred. 6606 * Compare to a simple hang, the system will keep stable at least for SSH 6607 * access. Then it should be trivial to inspect the hardware state and 6608 * see what's going on. Implemented as following: 6609 * 6610 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6611 * clears all CPU mappings to device, disallows remappings through page faults 6612 * 2. amdgpu_irq_disable_all() disables all interrupts 6613 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6614 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 6615 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 6616 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6617 * flush any in flight DMA operations 6618 */ 6619 void amdgpu_device_halt(struct amdgpu_device *adev) 6620 { 6621 struct pci_dev *pdev = adev->pdev; 6622 struct drm_device *ddev = adev_to_drm(adev); 6623 6624 amdgpu_xcp_dev_unplug(adev); 6625 drm_dev_unplug(ddev); 6626 6627 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 6628 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 6629 6630 amdgpu_irq_disable_all(adev); 6631 6632 amdgpu_fence_driver_hw_fini(adev); 6633 6634 adev->no_hw_access = true; 6635 6636 amdgpu_device_unmap_mmio(adev); 6637 6638 pci_disable_device(pdev); 6639 pci_wait_for_pending_transaction(pdev); 6640 } 6641 6642 /** 6643 * amdgpu_device_get_gang - return a reference to the current gang 6644 * @adev: amdgpu_device pointer 6645 * 6646 * Returns: A new reference to the current gang leader. 6647 */ 6648 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6649 { 6650 struct dma_fence *fence; 6651 6652 rcu_read_lock(); 6653 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6654 rcu_read_unlock(); 6655 return fence; 6656 } 6657 6658 /** 6659 * amdgpu_device_switch_gang - switch to a new gang 6660 * @adev: amdgpu_device pointer 6661 * @gang: the gang to switch to 6662 * 6663 * Try to switch to a new gang. 6664 * Returns: NULL if we switched to the new gang or a reference to the current 6665 * gang leader. 6666 */ 6667 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6668 struct dma_fence *gang) 6669 { 6670 struct dma_fence *old = NULL; 6671 6672 dma_fence_get(gang); 6673 do { 6674 dma_fence_put(old); 6675 old = amdgpu_device_get_gang(adev); 6676 if (old == gang) 6677 break; 6678 6679 if (!dma_fence_is_signaled(old)) { 6680 dma_fence_put(gang); 6681 return old; 6682 } 6683 6684 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6685 old, gang) != old); 6686 6687 /* 6688 * Drop it once for the exchanged reference in adev and once for the 6689 * thread local reference acquired in amdgpu_device_get_gang(). 6690 */ 6691 dma_fence_put(old); 6692 dma_fence_put(old); 6693 return NULL; 6694 } 6695 6696 /** 6697 * amdgpu_device_enforce_isolation - enforce HW isolation 6698 * @adev: the amdgpu device pointer 6699 * @ring: the HW ring the job is supposed to run on 6700 * @job: the job which is about to be pushed to the HW ring 6701 * 6702 * Makes sure that only one client at a time can use the GFX block. 6703 * Returns: The dependency to wait on before the job can be pushed to the HW. 6704 * The function is called multiple times until NULL is returned. 6705 */ 6706 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 6707 struct amdgpu_ring *ring, 6708 struct amdgpu_job *job) 6709 { 6710 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 6711 struct drm_sched_fence *f = job->base.s_fence; 6712 struct dma_fence *dep; 6713 void *owner; 6714 int r; 6715 6716 /* 6717 * For now enforce isolation only for the GFX block since we only need 6718 * the cleaner shader on those rings. 6719 */ 6720 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 6721 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 6722 return NULL; 6723 6724 /* 6725 * All submissions where enforce isolation is false are handled as if 6726 * they come from a single client. Use ~0l as the owner to distinct it 6727 * from kernel submissions where the owner is NULL. 6728 */ 6729 owner = job->enforce_isolation ? f->owner : (void *)~0l; 6730 6731 mutex_lock(&adev->enforce_isolation_mutex); 6732 6733 /* 6734 * The "spearhead" submission is the first one which changes the 6735 * ownership to its client. We always need to wait for it to be 6736 * pushed to the HW before proceeding with anything. 6737 */ 6738 if (&f->scheduled != isolation->spearhead && 6739 !dma_fence_is_signaled(isolation->spearhead)) { 6740 dep = isolation->spearhead; 6741 goto out_grab_ref; 6742 } 6743 6744 if (isolation->owner != owner) { 6745 6746 /* 6747 * Wait for any gang to be assembled before switching to a 6748 * different owner or otherwise we could deadlock the 6749 * submissions. 6750 */ 6751 if (!job->gang_submit) { 6752 dep = amdgpu_device_get_gang(adev); 6753 if (!dma_fence_is_signaled(dep)) 6754 goto out_return_dep; 6755 dma_fence_put(dep); 6756 } 6757 6758 dma_fence_put(isolation->spearhead); 6759 isolation->spearhead = dma_fence_get(&f->scheduled); 6760 amdgpu_sync_move(&isolation->active, &isolation->prev); 6761 trace_amdgpu_isolation(isolation->owner, owner); 6762 isolation->owner = owner; 6763 } 6764 6765 /* 6766 * Specifying the ring here helps to pipeline submissions even when 6767 * isolation is enabled. If that is not desired for testing NULL can be 6768 * used instead of the ring to enforce a CPU round trip while switching 6769 * between clients. 6770 */ 6771 dep = amdgpu_sync_peek_fence(&isolation->prev, ring); 6772 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); 6773 if (r) 6774 dev_warn(adev->dev, "OOM tracking isolation\n"); 6775 6776 out_grab_ref: 6777 dma_fence_get(dep); 6778 out_return_dep: 6779 mutex_unlock(&adev->enforce_isolation_mutex); 6780 return dep; 6781 } 6782 6783 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6784 { 6785 switch (adev->asic_type) { 6786 #ifdef CONFIG_DRM_AMDGPU_SI 6787 case CHIP_HAINAN: 6788 #endif 6789 case CHIP_TOPAZ: 6790 /* chips with no display hardware */ 6791 return false; 6792 #ifdef CONFIG_DRM_AMDGPU_SI 6793 case CHIP_TAHITI: 6794 case CHIP_PITCAIRN: 6795 case CHIP_VERDE: 6796 case CHIP_OLAND: 6797 #endif 6798 #ifdef CONFIG_DRM_AMDGPU_CIK 6799 case CHIP_BONAIRE: 6800 case CHIP_HAWAII: 6801 case CHIP_KAVERI: 6802 case CHIP_KABINI: 6803 case CHIP_MULLINS: 6804 #endif 6805 case CHIP_TONGA: 6806 case CHIP_FIJI: 6807 case CHIP_POLARIS10: 6808 case CHIP_POLARIS11: 6809 case CHIP_POLARIS12: 6810 case CHIP_VEGAM: 6811 case CHIP_CARRIZO: 6812 case CHIP_STONEY: 6813 /* chips with display hardware */ 6814 return true; 6815 default: 6816 /* IP discovery */ 6817 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6818 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6819 return false; 6820 return true; 6821 } 6822 } 6823 6824 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 6825 { 6826 ssize_t size = 0; 6827 6828 if (!ring || !ring->adev) 6829 return size; 6830 6831 if (amdgpu_device_should_recover_gpu(ring->adev)) 6832 size |= AMDGPU_RESET_TYPE_FULL; 6833 6834 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 6835 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 6836 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 6837 6838 return size; 6839 } 6840 6841 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 6842 { 6843 ssize_t size = 0; 6844 6845 if (supported_reset == 0) { 6846 size += sysfs_emit_at(buf, size, "unsupported"); 6847 size += sysfs_emit_at(buf, size, "\n"); 6848 return size; 6849 6850 } 6851 6852 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 6853 size += sysfs_emit_at(buf, size, "soft "); 6854 6855 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 6856 size += sysfs_emit_at(buf, size, "queue "); 6857 6858 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 6859 size += sysfs_emit_at(buf, size, "pipe "); 6860 6861 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 6862 size += sysfs_emit_at(buf, size, "full "); 6863 6864 size += sysfs_emit_at(buf, size, "\n"); 6865 return size; 6866 } 6867 6868 void amdgpu_device_set_uid(struct amdgpu_uid *uid_info, 6869 enum amdgpu_uid_type type, uint8_t inst, 6870 uint64_t uid) 6871 { 6872 if (!uid_info) 6873 return; 6874 6875 if (type >= AMDGPU_UID_TYPE_MAX) { 6876 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 6877 type); 6878 return; 6879 } 6880 6881 if (inst >= AMDGPU_UID_INST_MAX) { 6882 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 6883 inst); 6884 return; 6885 } 6886 6887 if (uid_info->uid[type][inst] != 0) { 6888 dev_warn_once( 6889 uid_info->adev->dev, 6890 "Overwriting existing UID %llu for type %d instance %d\n", 6891 uid_info->uid[type][inst], type, inst); 6892 } 6893 6894 uid_info->uid[type][inst] = uid; 6895 } 6896 6897 u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info, 6898 enum amdgpu_uid_type type, uint8_t inst) 6899 { 6900 if (!uid_info) 6901 return 0; 6902 6903 if (type >= AMDGPU_UID_TYPE_MAX) { 6904 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 6905 type); 6906 return 0; 6907 } 6908 6909 if (inst >= AMDGPU_UID_INST_MAX) { 6910 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 6911 inst); 6912 return 0; 6913 } 6914 6915 return uid_info->uid[type][inst]; 6916 } 6917