1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28
29 #include <linux/aperture.h>
30 #include <linux/power_supply.h>
31 #include <linux/kthread.h>
32 #include <linux/module.h>
33 #include <linux/console.h>
34 #include <linux/slab.h>
35 #include <linux/iommu.h>
36 #include <linux/pci.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_client_event.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_probe_helper.h>
44 #include <drm/amdgpu_drm.h>
45 #include <linux/device.h>
46 #include <linux/vgaarb.h>
47 #include <linux/vga_switcheroo.h>
48 #include <linux/efi.h>
49 #include "amdgpu.h"
50 #include "amdgpu_trace.h"
51 #include "amdgpu_i2c.h"
52 #include "atom.h"
53 #include "amdgpu_atombios.h"
54 #include "amdgpu_atomfirmware.h"
55 #include "amd_pcie.h"
56 #ifdef CONFIG_DRM_AMDGPU_SI
57 #include "si.h"
58 #endif
59 #ifdef CONFIG_DRM_AMDGPU_CIK
60 #include "cik.h"
61 #endif
62 #include "vi.h"
63 #include "soc15.h"
64 #include "nv.h"
65 #include "bif/bif_4_1_d.h"
66 #include <linux/firmware.h>
67 #include "amdgpu_vf_error.h"
68
69 #include "amdgpu_amdkfd.h"
70 #include "amdgpu_pm.h"
71
72 #include "amdgpu_xgmi.h"
73 #include "amdgpu_ras.h"
74 #include "amdgpu_ras_mgr.h"
75 #include "amdgpu_pmu.h"
76 #include "amdgpu_fru_eeprom.h"
77 #include "amdgpu_reset.h"
78 #include "amdgpu_virt.h"
79 #include "amdgpu_dev_coredump.h"
80
81 #include <linux/suspend.h>
82 #include <drm/task_barrier.h>
83 #include <linux/pm_runtime.h>
84
85 #include <drm/drm_drv.h>
86
87 #if IS_ENABLED(CONFIG_X86)
88 #include <asm/intel-family.h>
89 #include <asm/cpu_device_id.h>
90 #endif
91
92 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
96 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
97 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
98 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
99 MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");
100
101 #define AMDGPU_RESUME_MS 2000
102 #define AMDGPU_MAX_RETRY_LIMIT 2
103 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
104 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
105 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
106 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
107
108 #define AMDGPU_VBIOS_SKIP (1U << 0)
109 #define AMDGPU_VBIOS_OPTIONAL (1U << 1)
110
111 static const struct drm_driver amdgpu_kms_driver;
112
113 const char *amdgpu_asic_name[] = {
114 "TAHITI",
115 "PITCAIRN",
116 "VERDE",
117 "OLAND",
118 "HAINAN",
119 "BONAIRE",
120 "KAVERI",
121 "KABINI",
122 "HAWAII",
123 "MULLINS",
124 "TOPAZ",
125 "TONGA",
126 "FIJI",
127 "CARRIZO",
128 "STONEY",
129 "POLARIS10",
130 "POLARIS11",
131 "POLARIS12",
132 "VEGAM",
133 "VEGA10",
134 "VEGA12",
135 "VEGA20",
136 "RAVEN",
137 "ARCTURUS",
138 "RENOIR",
139 "ALDEBARAN",
140 "NAVI10",
141 "CYAN_SKILLFISH",
142 "NAVI14",
143 "NAVI12",
144 "SIENNA_CICHLID",
145 "NAVY_FLOUNDER",
146 "VANGOGH",
147 "DIMGREY_CAVEFISH",
148 "BEIGE_GOBY",
149 "YELLOW_CARP",
150 "IP DISCOVERY",
151 "LAST",
152 };
153
154 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
155 /*
156 * Default init level where all blocks are expected to be initialized. This is
157 * the level of initialization expected by default and also after a full reset
158 * of the device.
159 */
160 struct amdgpu_init_level amdgpu_init_default = {
161 .level = AMDGPU_INIT_LEVEL_DEFAULT,
162 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
163 };
164
165 struct amdgpu_init_level amdgpu_init_recovery = {
166 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
167 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
168 };
169
170 /*
171 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
172 * is used for cases like reset on initialization where the entire hive needs to
173 * be reset before first use.
174 */
175 struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
176 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
177 .hwini_ip_block_mask =
178 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
179 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
180 BIT(AMD_IP_BLOCK_TYPE_PSP)
181 };
182
183 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
184 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
185 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);
186
187 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);
188
189 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
190 enum amd_ip_block_type block)
191 {
192 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
193 }
194
195 void amdgpu_set_init_level(struct amdgpu_device *adev,
196 enum amdgpu_init_lvl_id lvl)
197 {
198 switch (lvl) {
199 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
200 adev->init_lvl = &amdgpu_init_minimal_xgmi;
201 break;
202 case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
203 adev->init_lvl = &amdgpu_init_recovery;
204 break;
205 case AMDGPU_INIT_LEVEL_DEFAULT:
206 fallthrough;
207 default:
208 adev->init_lvl = &amdgpu_init_default;
209 break;
210 }
211 }
212
213 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
214 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
215 void *data);
216
217 /**
218 * DOC: pcie_replay_count
219 *
220 * The amdgpu driver provides a sysfs API for reporting the total number
221 * of PCIe replays (NAKs).
222 * The file pcie_replay_count is used for this and returns the total
223 * number of replays as a sum of the NAKs generated and NAKs received.
224 */
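/*
 * Illustrative sketch (not part of the driver): reading the attribute from
 * userspace. The sysfs path assumes card0; adjust it for the device in use.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long long replays;
 *		FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *
 *		if (!f)
 *			return 1;
 *		if (fscanf(f, "%llu", &replays) != 1)
 *			replays = 0;
 *		fclose(f);
 *		printf("PCIe replays: %llu\n", replays);
 *		return 0;
 *	}
 */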
225
226 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
227 struct device_attribute *attr, char *buf)
228 {
229 struct drm_device *ddev = dev_get_drvdata(dev);
230 struct amdgpu_device *adev = drm_to_adev(ddev);
231 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
232
233 return sysfs_emit(buf, "%llu\n", cnt);
234 }
235
236 static DEVICE_ATTR(pcie_replay_count, 0444,
237 amdgpu_device_get_pcie_replay_count, NULL);
238
239 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
240 {
241 int ret = 0;
242
243 if (amdgpu_nbio_is_replay_cnt_supported(adev))
244 ret = sysfs_create_file(&adev->dev->kobj,
245 &dev_attr_pcie_replay_count.attr);
246
247 return ret;
248 }
249
250 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
251 {
252 if (amdgpu_nbio_is_replay_cnt_supported(adev))
253 sysfs_remove_file(&adev->dev->kobj,
254 &dev_attr_pcie_replay_count.attr);
255 }
256
257 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
258 const struct bin_attribute *attr, char *buf,
259 loff_t ppos, size_t count)
260 {
261 struct device *dev = kobj_to_dev(kobj);
262 struct drm_device *ddev = dev_get_drvdata(dev);
263 struct amdgpu_device *adev = drm_to_adev(ddev);
264 ssize_t bytes_read;
265
266 switch (ppos) {
267 case AMDGPU_SYS_REG_STATE_XGMI:
268 bytes_read = amdgpu_asic_get_reg_state(
269 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
270 break;
271 case AMDGPU_SYS_REG_STATE_WAFL:
272 bytes_read = amdgpu_asic_get_reg_state(
273 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
274 break;
275 case AMDGPU_SYS_REG_STATE_PCIE:
276 bytes_read = amdgpu_asic_get_reg_state(
277 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
278 break;
279 case AMDGPU_SYS_REG_STATE_USR:
280 bytes_read = amdgpu_asic_get_reg_state(
281 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
282 break;
283 case AMDGPU_SYS_REG_STATE_USR_1:
284 bytes_read = amdgpu_asic_get_reg_state(
285 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
286 break;
287 default:
288 return -EINVAL;
289 }
290
291 return bytes_read;
292 }
293
294 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
295 AMDGPU_SYS_REG_STATE_END);
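/*
 * Illustrative sketch (not part of the driver): the bin_attribute above uses
 * the file offset to select which register-state dump is returned, so a
 * userspace reader seeks to one of the AMDGPU_SYS_REG_STATE_* offsets and
 * reads from there. REG_STATE_OFFSET_PCIE below is a placeholder for the
 * matching offset constant, and the sysfs path assumes card0.
 *
 *	char buf[4096];
 *	ssize_t n;
 *	int fd = open("/sys/class/drm/card0/device/reg_state", O_RDONLY);
 *
 *	if (fd < 0)
 *		return -1;
 *	n = pread(fd, buf, sizeof(buf), REG_STATE_OFFSET_PCIE);
 *	close(fd);
 */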
296
297 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
298 {
299 int ret;
300
301 if (!amdgpu_asic_get_reg_state_supported(adev))
302 return 0;
303
304 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
305
306 return ret;
307 }
308
309 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
310 {
311 if (!amdgpu_asic_get_reg_state_supported(adev))
312 return;
313 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
314 }
315
316 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
317 {
318 int r;
319
320 if (ip_block->version->funcs->suspend) {
321 r = ip_block->version->funcs->suspend(ip_block);
322 if (r) {
323 dev_err(ip_block->adev->dev,
324 "suspend of IP block <%s> failed %d\n",
325 ip_block->version->funcs->name, r);
326 return r;
327 }
328 }
329
330 ip_block->status.hw = false;
331 return 0;
332 }
333
334 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
335 {
336 int r;
337
338 if (ip_block->version->funcs->resume) {
339 r = ip_block->version->funcs->resume(ip_block);
340 if (r) {
341 dev_err(ip_block->adev->dev,
342 "resume of IP block <%s> failed %d\n",
343 ip_block->version->funcs->name, r);
344 return r;
345 }
346 }
347
348 ip_block->status.hw = true;
349 return 0;
350 }
351
352 /**
353 * DOC: board_info
354 *
355 * The amdgpu driver provides a sysfs API for giving board related information.
356 * It provides the form factor information in the format
357 *
358 * type : form factor
359 *
360 * Possible form factor values
361 *
362 * - "cem" - PCIE CEM card
363 * - "oam" - Open Compute Accelerator Module
364 * - "unknown" - Not known
365 *
366 */
367
368 static ssize_t amdgpu_device_get_board_info(struct device *dev,
369 struct device_attribute *attr,
370 char *buf)
371 {
372 struct drm_device *ddev = dev_get_drvdata(dev);
373 struct amdgpu_device *adev = drm_to_adev(ddev);
374 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
375 const char *pkg;
376
377 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
378 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
379
380 switch (pkg_type) {
381 case AMDGPU_PKG_TYPE_CEM:
382 pkg = "cem";
383 break;
384 case AMDGPU_PKG_TYPE_OAM:
385 pkg = "oam";
386 break;
387 default:
388 pkg = "unknown";
389 break;
390 }
391
392 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
393 }
394
395 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
396
397 static struct attribute *amdgpu_board_attrs[] = {
398 &dev_attr_board_info.attr,
399 NULL,
400 };
401
402 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
403 struct attribute *attr, int n)
404 {
405 struct device *dev = kobj_to_dev(kobj);
406 struct drm_device *ddev = dev_get_drvdata(dev);
407 struct amdgpu_device *adev = drm_to_adev(ddev);
408
409 if (adev->flags & AMD_IS_APU)
410 return 0;
411
412 return attr->mode;
413 }
414
415 static const struct attribute_group amdgpu_board_attrs_group = {
416 .attrs = amdgpu_board_attrs,
417 .is_visible = amdgpu_board_attrs_is_visible
418 };
419
420 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
421
422 /**
423 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
424 *
425 * @adev: amdgpu device pointer
426 *
427 * Returns true if the device is a dGPU with ATPX power control,
428 * otherwise return false.
429 */
430 bool amdgpu_device_supports_px(struct amdgpu_device *adev)
431 {
432 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
433 return true;
434 return false;
435 }
436
437 /**
438 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
439 *
440 * @adev: amdgpu device pointer
441 *
442 * Returns true if the device is a dGPU with ACPI power control,
443 * otherwise return false.
444 */
445 bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
446 {
447 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
448 return false;
449
450 if (adev->has_pr3 ||
451 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
452 return true;
453 return false;
454 }
455
456 /**
457 * amdgpu_device_supports_baco - Does the device support BACO
458 *
459 * @adev: amdgpu device pointer
460 *
461 * Return:
462 * 1 if the device supports BACO;
463 * 3 if the device supports MACO (only works if BACO is supported)
464 * otherwise return 0.
465 */
466 int amdgpu_device_supports_baco(struct amdgpu_device *adev)
467 {
468 return amdgpu_asic_supports_baco(adev);
469 }
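/*
 * Illustrative sketch: interpreting the bitmask returned above with the
 * BACO_SUPPORT/MACO_SUPPORT flags used elsewhere in this file. The helper
 * name is hypothetical.
 *
 *	static bool amdgpu_example_has_bamaco(struct amdgpu_device *adev)
 *	{
 *		int support = amdgpu_device_supports_baco(adev);
 *
 *		// MACO is only meaningful when BACO itself is supported
 *		return (support & BACO_SUPPORT) && (support & MACO_SUPPORT);
 *	}
 */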
470
471 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
472 {
473 int bamaco_support;
474
475 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
476 bamaco_support = amdgpu_device_supports_baco(adev);
477
478 switch (amdgpu_runtime_pm) {
479 case 2:
480 if (bamaco_support & MACO_SUPPORT) {
481 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
482 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
483 } else if (bamaco_support == BACO_SUPPORT) {
484 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
485 dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
486 }
487 break;
488 case 1:
489 if (bamaco_support & BACO_SUPPORT) {
490 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
491 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
492 }
493 break;
494 case -1:
495 case -2:
496 if (amdgpu_device_supports_px(adev)) {
497 /* enable PX as runtime mode */
498 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
499 dev_info(adev->dev, "Using ATPX for runtime pm\n");
500 } else if (amdgpu_device_supports_boco(adev)) {
501 /* enable boco as runtime mode */
502 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
503 dev_info(adev->dev, "Using BOCO for runtime pm\n");
504 } else {
505 if (!bamaco_support)
506 goto no_runtime_pm;
507
508 switch (adev->asic_type) {
509 case CHIP_VEGA20:
510 case CHIP_ARCTURUS:
511 /* BACO is not supported on vega20 and arcturus */
512 break;
513 case CHIP_VEGA10:
514 /* enable BACO as runpm mode if noretry=0 */
515 if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
516 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
517 break;
518 default:
519 /* enable BACO as runpm mode on CI+ */
520 if (!amdgpu_passthrough(adev))
521 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
522 break;
523 }
524
525 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
526 if (bamaco_support & MACO_SUPPORT) {
527 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
528 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
529 } else {
530 dev_info(adev->dev, "Using BACO for runtime pm\n");
531 }
532 }
533 }
534 break;
535 case 0:
536 dev_info(adev->dev, "runtime pm is manually disabled\n");
537 break;
538 default:
539 break;
540 }
541
542 no_runtime_pm:
543 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
544 dev_info(adev->dev, "Runtime PM not available\n");
545 }
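/*
 * Illustrative summary of the mapping implemented above. The integer is the
 * amdgpu_runtime_pm module option (exposed as the "runpm" parameter elsewhere
 * in the driver):
 *
 *	 2     -> force BAMACO, fall back to BACO if MACO is unsupported
 *	 1     -> force BACO when BACO is supported
 *	 0     -> runtime pm manually disabled
 *	-1/-2  -> automatic: prefer PX, then BOCO, then BACO/BAMACO depending on
 *	          the ASIC and virtualization state
 */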
546 /**
547 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
548 * smart shift support
549 *
550 * @adev: amdgpu device pointer
551 *
552 * Returns true if the device is a dGPU with Smart Shift support,
553 * otherwise returns false.
554 */
555 bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
556 {
557 return (amdgpu_device_supports_boco(adev) &&
558 amdgpu_acpi_is_power_shift_control_supported());
559 }
560
561 /*
562 * VRAM access helper functions
563 */
564
565 /**
566 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
567 *
568 * @adev: amdgpu_device pointer
569 * @pos: offset of the buffer in vram
570 * @buf: virtual address of the buffer in system memory
571 * @size: read/write size; the buffer at @buf must be at least @size bytes
572 * @write: true - write to vram, otherwise - read from vram
573 */
574 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
575 void *buf, size_t size, bool write)
576 {
577 unsigned long flags;
578 uint32_t hi = ~0, tmp = 0;
579 uint32_t *data = buf;
580 uint64_t last;
581 int idx;
582
583 if (!drm_dev_enter(adev_to_drm(adev), &idx))
584 return;
585
586 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
587
588 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
589 for (last = pos + size; pos < last; pos += 4) {
590 tmp = pos >> 31;
591
592 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
593 if (tmp != hi) {
594 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
595 hi = tmp;
596 }
597 if (write)
598 WREG32_NO_KIQ(mmMM_DATA, *data++);
599 else
600 *data++ = RREG32_NO_KIQ(mmMM_DATA);
601 }
602
603 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
604 drm_dev_exit(idx);
605 }
606
607 /**
608 * amdgpu_device_aper_access - access vram by vram aperture
609 *
610 * @adev: amdgpu_device pointer
611 * @pos: offset of the buffer in vram
612 * @buf: virtual address of the buffer in system memory
613 * @size: read/write size; the buffer at @buf must be at least @size bytes
614 * @write: true - write to vram, otherwise - read from vram
615 *
616 * The return value means how many bytes have been transferred.
617 */
618 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
619 void *buf, size_t size, bool write)
620 {
621 #ifdef CONFIG_64BIT
622 void __iomem *addr;
623 size_t count = 0;
624 uint64_t last;
625
626 if (!adev->mman.aper_base_kaddr)
627 return 0;
628
629 last = min(pos + size, adev->gmc.visible_vram_size);
630 if (last > pos) {
631 addr = adev->mman.aper_base_kaddr + pos;
632 count = last - pos;
633
634 if (write) {
635 memcpy_toio(addr, buf, count);
636 /* Make sure HDP write cache flush happens without any reordering
637 * after the system memory contents are sent over PCIe device
638 */
639 mb();
640 amdgpu_device_flush_hdp(adev, NULL);
641 } else {
642 amdgpu_device_invalidate_hdp(adev, NULL);
643 /* Make sure HDP read cache is invalidated before issuing a read
644 * to the PCIe device
645 */
646 mb();
647 memcpy_fromio(buf, addr, count);
648 }
649
650 }
651
652 return count;
653 #else
654 return 0;
655 #endif
656 }
657
658 /**
659 * amdgpu_device_vram_access - read/write a buffer in vram
660 *
661 * @adev: amdgpu_device pointer
662 * @pos: offset of the buffer in vram
663 * @buf: virtual address of the buffer in system memory
664 * @size: read/write size; the buffer at @buf must be at least @size bytes
665 * @write: true - write to vram, otherwise - read from vram
666 */
667 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
668 void *buf, size_t size, bool write)
669 {
670 size_t count;
671
672 /* try using the VRAM aperture to access VRAM first */
673 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
674 size -= count;
675 if (size) {
676 /* use MM_INDEX/MM_DATA to access the rest of VRAM */
677 pos += count;
678 buf += count;
679 amdgpu_device_mm_access(adev, pos, buf, size, write);
680 }
681 }
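/*
 * Illustrative sketch: reading a few dwords of VRAM through the helper above.
 * vram_offset is a placeholder; both the position and the size must be dword
 * aligned (see the BUG_ON in amdgpu_device_mm_access()).
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, vram_offset, data, sizeof(data), false);
 */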
682
683 /*
684 * register access helper functions.
685 */
686
687 /* Check if hw access should be skipped because of hotplug or device error */
688 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
689 {
690 if (adev->no_hw_access)
691 return true;
692
693 #ifdef CONFIG_LOCKDEP
694 /*
695 * This is a bit complicated to understand, so worth a comment. What we assert
696 * here is that the GPU reset is not running on another thread in parallel.
697 *
698 * For this we trylock the read side of the reset semaphore, if that succeeds
699 * we know that the reset is not running in parallel.
700 *
701 * If the trylock fails we assert that we are either already holding the read
702 * side of the lock or are the reset thread itself and hold the write side of
703 * the lock.
704 */
705 if (in_task()) {
706 if (down_read_trylock(&adev->reset_domain->sem))
707 up_read(&adev->reset_domain->sem);
708 else
709 lockdep_assert_held(&adev->reset_domain->sem);
710 }
711 #endif
712 return false;
713 }
714
715 /**
716 * amdgpu_device_rreg - read a memory mapped IO or indirect register
717 *
718 * @adev: amdgpu_device pointer
719 * @reg: dword aligned register offset
720 * @acc_flags: access flags which require special behavior
721 *
722 * Returns the 32 bit value from the offset specified.
723 */
724 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
725 uint32_t reg, uint32_t acc_flags)
726 {
727 uint32_t ret;
728
729 if (amdgpu_device_skip_hw_access(adev))
730 return 0;
731
732 if ((reg * 4) < adev->rmmio_size) {
733 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
734 amdgpu_sriov_runtime(adev) &&
735 down_read_trylock(&adev->reset_domain->sem)) {
736 ret = amdgpu_kiq_rreg(adev, reg, 0);
737 up_read(&adev->reset_domain->sem);
738 } else {
739 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
740 }
741 } else {
742 ret = adev->pcie_rreg(adev, reg * 4);
743 }
744
745 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
746
747 return ret;
748 }
749
750 /*
751 * MMIO register read with bytes helper functions
752 * @offset: byte offset from MMIO start
753 */
754
755 /**
756 * amdgpu_mm_rreg8 - read a memory mapped IO register
757 *
758 * @adev: amdgpu_device pointer
759 * @offset: byte aligned register offset
760 *
761 * Returns the 8 bit value from the offset specified.
762 */
763 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
764 {
765 if (amdgpu_device_skip_hw_access(adev))
766 return 0;
767
768 if (offset < adev->rmmio_size)
769 return (readb(adev->rmmio + offset));
770 BUG();
771 }
772
773
774 /**
775 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
776 *
777 * @adev: amdgpu_device pointer
778 * @reg: dword aligned register offset
779 * @acc_flags: access flags which require special behavior
780 * @xcc_id: xcc accelerated compute core id
781 *
782 * Returns the 32 bit value from the offset specified.
783 */
784 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
785 uint32_t reg, uint32_t acc_flags,
786 uint32_t xcc_id)
787 {
788 uint32_t ret, rlcg_flag;
789
790 if (amdgpu_device_skip_hw_access(adev))
791 return 0;
792
793 if ((reg * 4) < adev->rmmio_size) {
794 if (amdgpu_sriov_vf(adev) &&
795 !amdgpu_sriov_runtime(adev) &&
796 adev->gfx.rlc.rlcg_reg_access_supported &&
797 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
798 GC_HWIP, false,
799 &rlcg_flag)) {
800 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
801 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
802 amdgpu_sriov_runtime(adev) &&
803 down_read_trylock(&adev->reset_domain->sem)) {
804 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
805 up_read(&adev->reset_domain->sem);
806 } else {
807 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
808 }
809 } else {
810 ret = adev->pcie_rreg(adev, reg * 4);
811 }
812
813 return ret;
814 }
815
816 /*
817 * MMIO register write with bytes helper functions
818 * @offset: byte offset from MMIO start
819 * @value: the value to write to the register
820 */
821
822 /**
823 * amdgpu_mm_wreg8 - write a memory mapped IO register
824 *
825 * @adev: amdgpu_device pointer
826 * @offset: byte aligned register offset
827 * @value: 8 bit value to write
828 *
829 * Writes the value specified to the offset specified.
830 */
831 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
832 {
833 if (amdgpu_device_skip_hw_access(adev))
834 return;
835
836 if (offset < adev->rmmio_size)
837 writeb(value, adev->rmmio + offset);
838 else
839 BUG();
840 }
841
842 /**
843 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
844 *
845 * @adev: amdgpu_device pointer
846 * @reg: dword aligned register offset
847 * @v: 32 bit value to write to the register
848 * @acc_flags: access flags which require special behavior
849 *
850 * Writes the value specified to the offset specified.
851 */
852 void amdgpu_device_wreg(struct amdgpu_device *adev,
853 uint32_t reg, uint32_t v,
854 uint32_t acc_flags)
855 {
856 if (amdgpu_device_skip_hw_access(adev))
857 return;
858
859 if ((reg * 4) < adev->rmmio_size) {
860 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
861 amdgpu_sriov_runtime(adev) &&
862 down_read_trylock(&adev->reset_domain->sem)) {
863 amdgpu_kiq_wreg(adev, reg, v, 0);
864 up_read(&adev->reset_domain->sem);
865 } else {
866 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
867 }
868 } else {
869 adev->pcie_wreg(adev, reg * 4, v);
870 }
871
872 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
873 }
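/*
 * Illustrative sketch: a read-modify-write sequence built from the two helpers
 * above. reg_offset and some_bit_mask are placeholders; in-tree code normally
 * goes through the RREG32()/WREG32() macro wrappers rather than calling these
 * functions directly.
 *
 *	uint32_t tmp;
 *
 *	tmp = amdgpu_device_rreg(adev, reg_offset, 0);
 *	tmp |= some_bit_mask;
 *	amdgpu_device_wreg(adev, reg_offset, tmp, 0);
 */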
874
875 /**
876 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
877 *
878 * @adev: amdgpu_device pointer
879 * @reg: mmio/rlc register
880 * @v: value to write
881 * @xcc_id: xcc accelerated compute core id
882 *
883 * this function is invoked only for the debugfs register access
884 */
885 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
886 uint32_t reg, uint32_t v,
887 uint32_t xcc_id)
888 {
889 if (amdgpu_device_skip_hw_access(adev))
890 return;
891
892 if (amdgpu_sriov_fullaccess(adev) &&
893 adev->gfx.rlc.funcs &&
894 adev->gfx.rlc.funcs->is_rlcg_access_range) {
895 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
896 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
897 } else if ((reg * 4) >= adev->rmmio_size) {
898 adev->pcie_wreg(adev, reg * 4, v);
899 } else {
900 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
901 }
902 }
903
904 /**
905 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
906 *
907 * @adev: amdgpu_device pointer
908 * @reg: dword aligned register offset
909 * @v: 32 bit value to write to the register
910 * @acc_flags: access flags which require special behavior
911 * @xcc_id: xcc accelerated compute core id
912 *
913 * Writes the value specified to the offset specified.
914 */
915 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
916 uint32_t reg, uint32_t v,
917 uint32_t acc_flags, uint32_t xcc_id)
918 {
919 uint32_t rlcg_flag;
920
921 if (amdgpu_device_skip_hw_access(adev))
922 return;
923
924 if ((reg * 4) < adev->rmmio_size) {
925 if (amdgpu_sriov_vf(adev) &&
926 !amdgpu_sriov_runtime(adev) &&
927 adev->gfx.rlc.rlcg_reg_access_supported &&
928 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
929 GC_HWIP, true,
930 &rlcg_flag)) {
931 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
932 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
933 amdgpu_sriov_runtime(adev) &&
934 down_read_trylock(&adev->reset_domain->sem)) {
935 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
936 up_read(&adev->reset_domain->sem);
937 } else {
938 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
939 }
940 } else {
941 adev->pcie_wreg(adev, reg * 4, v);
942 }
943 }
944
945 /**
946 * amdgpu_device_indirect_rreg - read an indirect register
947 *
948 * @adev: amdgpu_device pointer
949 * @reg_addr: indirect register address to read from
950 *
951 * Returns the value of indirect register @reg_addr
952 */
953 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
954 u32 reg_addr)
955 {
956 unsigned long flags, pcie_index, pcie_data;
957 void __iomem *pcie_index_offset;
958 void __iomem *pcie_data_offset;
959 u32 r;
960
961 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
962 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
963
964 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
965 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
966 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
967
968 writel(reg_addr, pcie_index_offset);
969 readl(pcie_index_offset);
970 r = readl(pcie_data_offset);
971 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
972
973 return r;
974 }
975
976 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
977 u64 reg_addr)
978 {
979 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
980 u32 r;
981 void __iomem *pcie_index_offset;
982 void __iomem *pcie_index_hi_offset;
983 void __iomem *pcie_data_offset;
984
985 if (unlikely(!adev->nbio.funcs)) {
986 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
987 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
988 } else {
989 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
990 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
991 }
992
993 if (reg_addr >> 32) {
994 if (unlikely(!adev->nbio.funcs))
995 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
996 else
997 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
998 } else {
999 pcie_index_hi = 0;
1000 }
1001
1002 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1003 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1004 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1005 if (pcie_index_hi != 0)
1006 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1007 pcie_index_hi * 4;
1008
1009 writel(reg_addr, pcie_index_offset);
1010 readl(pcie_index_offset);
1011 if (pcie_index_hi != 0) {
1012 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1013 readl(pcie_index_hi_offset);
1014 }
1015 r = readl(pcie_data_offset);
1016
1017 /* clear the high bits */
1018 if (pcie_index_hi != 0) {
1019 writel(0, pcie_index_hi_offset);
1020 readl(pcie_index_hi_offset);
1021 }
1022
1023 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1024
1025 return r;
1026 }
1027
1028 /**
1029 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
1030 *
1031 * @adev: amdgpu_device pointer
1032 * @reg_addr: indirect register address to read from
1033 *
1034 * Returns the value of indirect register @reg_addr
1035 */
1036 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1037 u32 reg_addr)
1038 {
1039 unsigned long flags, pcie_index, pcie_data;
1040 void __iomem *pcie_index_offset;
1041 void __iomem *pcie_data_offset;
1042 u64 r;
1043
1044 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1045 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1046
1047 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1048 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1049 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1050
1051 /* read low 32 bits */
1052 writel(reg_addr, pcie_index_offset);
1053 readl(pcie_index_offset);
1054 r = readl(pcie_data_offset);
1055 /* read high 32 bits */
1056 writel(reg_addr + 4, pcie_index_offset);
1057 readl(pcie_index_offset);
1058 r |= ((u64)readl(pcie_data_offset) << 32);
1059 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1060
1061 return r;
1062 }
1063
1064 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1065 u64 reg_addr)
1066 {
1067 unsigned long flags, pcie_index, pcie_data;
1068 unsigned long pcie_index_hi = 0;
1069 void __iomem *pcie_index_offset;
1070 void __iomem *pcie_index_hi_offset;
1071 void __iomem *pcie_data_offset;
1072 u64 r;
1073
1074 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1075 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1076 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1077 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1078
1079 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1080 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1081 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1082 if (pcie_index_hi != 0)
1083 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1084 pcie_index_hi * 4;
1085
1086 /* read low 32 bits */
1087 writel(reg_addr, pcie_index_offset);
1088 readl(pcie_index_offset);
1089 if (pcie_index_hi != 0) {
1090 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1091 readl(pcie_index_hi_offset);
1092 }
1093 r = readl(pcie_data_offset);
1094 /* read high 32 bits */
1095 writel(reg_addr + 4, pcie_index_offset);
1096 readl(pcie_index_offset);
1097 if (pcie_index_hi != 0) {
1098 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1099 readl(pcie_index_hi_offset);
1100 }
1101 r |= ((u64)readl(pcie_data_offset) << 32);
1102
1103 /* clear the high bits */
1104 if (pcie_index_hi != 0) {
1105 writel(0, pcie_index_hi_offset);
1106 readl(pcie_index_hi_offset);
1107 }
1108
1109 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1110
1111 return r;
1112 }
1113
1114 /**
1115 * amdgpu_device_indirect_wreg - write to an indirect register
1116 *
1117 * @adev: amdgpu_device pointer
1118 * @reg_addr: indirect register offset
1119 * @reg_data: indirect register data
1120 *
1121 */
1122 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1123 u32 reg_addr, u32 reg_data)
1124 {
1125 unsigned long flags, pcie_index, pcie_data;
1126 void __iomem *pcie_index_offset;
1127 void __iomem *pcie_data_offset;
1128
1129 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1130 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1131
1132 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1133 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1134 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1135
1136 writel(reg_addr, pcie_index_offset);
1137 readl(pcie_index_offset);
1138 writel(reg_data, pcie_data_offset);
1139 readl(pcie_data_offset);
1140 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1141 }
1142
1143 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1144 u64 reg_addr, u32 reg_data)
1145 {
1146 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1147 void __iomem *pcie_index_offset;
1148 void __iomem *pcie_index_hi_offset;
1149 void __iomem *pcie_data_offset;
1150
1151 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1152 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1153 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1154 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1155 else
1156 pcie_index_hi = 0;
1157
1158 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1159 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1160 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1161 if (pcie_index_hi != 0)
1162 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1163 pcie_index_hi * 4;
1164
1165 writel(reg_addr, pcie_index_offset);
1166 readl(pcie_index_offset);
1167 if (pcie_index_hi != 0) {
1168 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1169 readl(pcie_index_hi_offset);
1170 }
1171 writel(reg_data, pcie_data_offset);
1172 readl(pcie_data_offset);
1173
1174 /* clear the high bits */
1175 if (pcie_index_hi != 0) {
1176 writel(0, pcie_index_hi_offset);
1177 readl(pcie_index_hi_offset);
1178 }
1179
1180 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1181 }
1182
1183 /**
1184 * amdgpu_device_indirect_wreg64 - write to a 64 bit indirect register
1185 *
1186 * @adev: amdgpu_device pointer
1187 * @reg_addr: indirect register offset
1188 * @reg_data: indirect register data
1189 *
1190 */
1191 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1192 u32 reg_addr, u64 reg_data)
1193 {
1194 unsigned long flags, pcie_index, pcie_data;
1195 void __iomem *pcie_index_offset;
1196 void __iomem *pcie_data_offset;
1197
1198 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1199 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1200
1201 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1202 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1203 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1204
1205 /* write low 32 bits */
1206 writel(reg_addr, pcie_index_offset);
1207 readl(pcie_index_offset);
1208 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1209 readl(pcie_data_offset);
1210 /* write high 32 bits */
1211 writel(reg_addr + 4, pcie_index_offset);
1212 readl(pcie_index_offset);
1213 writel((u32)(reg_data >> 32), pcie_data_offset);
1214 readl(pcie_data_offset);
1215 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1216 }
1217
1218 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1219 u64 reg_addr, u64 reg_data)
1220 {
1221 unsigned long flags, pcie_index, pcie_data;
1222 unsigned long pcie_index_hi = 0;
1223 void __iomem *pcie_index_offset;
1224 void __iomem *pcie_index_hi_offset;
1225 void __iomem *pcie_data_offset;
1226
1227 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1228 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1229 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1230 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1231
1232 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1233 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1234 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1235 if (pcie_index_hi != 0)
1236 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1237 pcie_index_hi * 4;
1238
1239 /* write low 32 bits */
1240 writel(reg_addr, pcie_index_offset);
1241 readl(pcie_index_offset);
1242 if (pcie_index_hi != 0) {
1243 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1244 readl(pcie_index_hi_offset);
1245 }
1246 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1247 readl(pcie_data_offset);
1248 /* write high 32 bits */
1249 writel(reg_addr + 4, pcie_index_offset);
1250 readl(pcie_index_offset);
1251 if (pcie_index_hi != 0) {
1252 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1253 readl(pcie_index_hi_offset);
1254 }
1255 writel((u32)(reg_data >> 32), pcie_data_offset);
1256 readl(pcie_data_offset);
1257
1258 /* clear the high bits */
1259 if (pcie_index_hi != 0) {
1260 writel(0, pcie_index_hi_offset);
1261 readl(pcie_index_hi_offset);
1262 }
1263
1264 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1265 }
1266
1267 /**
1268 * amdgpu_device_get_rev_id - query device rev_id
1269 *
1270 * @adev: amdgpu_device pointer
1271 *
1272 * Return device rev_id
1273 */
1274 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1275 {
1276 return adev->nbio.funcs->get_rev_id(adev);
1277 }
1278
1279 /**
1280 * amdgpu_invalid_rreg - dummy reg read function
1281 *
1282 * @adev: amdgpu_device pointer
1283 * @reg: offset of register
1284 *
1285 * Dummy register read function. Used for register blocks
1286 * that certain asics don't have (all asics).
1287 * Returns the value in the register.
1288 */
1289 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1290 {
1291 dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
1292 BUG();
1293 return 0;
1294 }
1295
1296 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1297 {
1298 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
1299 BUG();
1300 return 0;
1301 }
1302
1303 /**
1304 * amdgpu_invalid_wreg - dummy reg write function
1305 *
1306 * @adev: amdgpu_device pointer
1307 * @reg: offset of register
1308 * @v: value to write to the register
1309 *
1310 * Dummy register write function. Used for register blocks
1311 * that certain asics don't have (all asics).
1312 */
1313 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1314 {
1315 dev_err(adev->dev,
1316 "Invalid callback to write register 0x%04X with 0x%08X\n", reg,
1317 v);
1318 BUG();
1319 }
1320
1321 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1322 {
1323 dev_err(adev->dev,
1324 "Invalid callback to write register 0x%llX with 0x%08X\n", reg,
1325 v);
1326 BUG();
1327 }
1328
1329 /**
1330 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1331 *
1332 * @adev: amdgpu_device pointer
1333 * @reg: offset of register
1334 *
1335 * Dummy register read function. Used for register blocks
1336 * that certain asics don't have (all asics).
1337 * Returns the value in the register.
1338 */
1339 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1340 {
1341 dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
1342 reg);
1343 BUG();
1344 return 0;
1345 }
1346
1347 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1348 {
1349 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
1350 BUG();
1351 return 0;
1352 }
1353
1354 /**
1355 * amdgpu_invalid_wreg64 - dummy reg write function
1356 *
1357 * @adev: amdgpu_device pointer
1358 * @reg: offset of register
1359 * @v: value to write to the register
1360 *
1361 * Dummy register write function. Used for register blocks
1362 * that certain asics don't have (all asics).
1363 */
1364 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1365 {
1366 dev_err(adev->dev,
1367 "Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1368 reg, v);
1369 BUG();
1370 }
1371
1372 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1373 {
1374 dev_err(adev->dev,
1375 "Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1376 reg, v);
1377 BUG();
1378 }
1379
1380 /**
1381 * amdgpu_block_invalid_rreg - dummy reg read function
1382 *
1383 * @adev: amdgpu_device pointer
1384 * @block: offset of instance
1385 * @reg: offset of register
1386 *
1387 * Dummy register read function. Used for register blocks
1388 * that certain asics don't have (all asics).
1389 * Returns the value in the register.
1390 */
1391 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1392 uint32_t block, uint32_t reg)
1393 {
1394 dev_err(adev->dev,
1395 "Invalid callback to read register 0x%04X in block 0x%04X\n",
1396 reg, block);
1397 BUG();
1398 return 0;
1399 }
1400
1401 /**
1402 * amdgpu_block_invalid_wreg - dummy reg write function
1403 *
1404 * @adev: amdgpu_device pointer
1405 * @block: offset of instance
1406 * @reg: offset of register
1407 * @v: value to write to the register
1408 *
1409 * Dummy register write function. Used for register blocks
1410 * that certain asics don't have (all asics).
1411 */
1412 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1413 uint32_t block,
1414 uint32_t reg, uint32_t v)
1415 {
1416 dev_err(adev->dev,
1417 "Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1418 reg, block, v);
1419 BUG();
1420 }
1421
1422 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
1423 {
1424 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1425 return AMDGPU_VBIOS_SKIP;
1426
1427 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
1428 return AMDGPU_VBIOS_OPTIONAL;
1429
1430 return 0;
1431 }
1432
1433 /**
1434 * amdgpu_device_asic_init - Wrapper for atom asic_init
1435 *
1436 * @adev: amdgpu_device pointer
1437 *
1438 * Does any asic specific work and then calls atom asic init.
1439 */
1440 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1441 {
1442 uint32_t flags;
1443 bool optional;
1444 int ret;
1445
1446 amdgpu_asic_pre_asic_init(adev);
1447 flags = amdgpu_device_get_vbios_flags(adev);
1448 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));
1449
1450 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1451 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1452 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
1453 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1454 amdgpu_psp_wait_for_bootloader(adev);
1455 if (optional && !adev->bios)
1456 return 0;
1457
1458 ret = amdgpu_atomfirmware_asic_init(adev, true);
1459 return ret;
1460 } else {
1461 if (optional && !adev->bios)
1462 return 0;
1463
1464 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1465 }
1466
1467 return 0;
1468 }
1469
1470 /**
1471 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1472 *
1473 * @adev: amdgpu_device pointer
1474 *
1475 * Allocates a scratch page of VRAM for use by various things in the
1476 * driver.
1477 */
1478 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1479 {
1480 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1481 AMDGPU_GEM_DOMAIN_VRAM |
1482 AMDGPU_GEM_DOMAIN_GTT,
1483 &adev->mem_scratch.robj,
1484 &adev->mem_scratch.gpu_addr,
1485 (void **)&adev->mem_scratch.ptr);
1486 }
1487
1488 /**
1489 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1490 *
1491 * @adev: amdgpu_device pointer
1492 *
1493 * Frees the VRAM scratch page.
1494 */
1495 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1496 {
1497 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1498 }
1499
1500 /**
1501 * amdgpu_device_program_register_sequence - program an array of registers.
1502 *
1503 * @adev: amdgpu_device pointer
1504 * @registers: pointer to the register array
1505 * @array_size: size of the register array
1506 *
1507 * Programs an array of registers with and/or masks.
1508 * This is a helper for setting golden registers.
1509 */
1510 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1511 const u32 *registers,
1512 const u32 array_size)
1513 {
1514 u32 tmp, reg, and_mask, or_mask;
1515 int i;
1516
1517 if (array_size % 3)
1518 return;
1519
1520 for (i = 0; i < array_size; i += 3) {
1521 reg = registers[i + 0];
1522 and_mask = registers[i + 1];
1523 or_mask = registers[i + 2];
1524
1525 if (and_mask == 0xffffffff) {
1526 tmp = or_mask;
1527 } else {
1528 tmp = RREG32(reg);
1529 tmp &= ~and_mask;
1530 if (adev->family >= AMDGPU_FAMILY_AI)
1531 tmp |= (or_mask & and_mask);
1532 else
1533 tmp |= or_mask;
1534 }
1535 WREG32(reg, tmp);
1536 }
1537 }
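/*
 * Illustrative sketch: the register array consumed above is a flat list of
 * {offset, and_mask, or_mask} triplets. The register names and masks below
 * are placeholders, not real golden settings.
 *
 *	static const u32 example_golden_settings[] = {
 *		// offset         and_mask      or_mask
 *		mmEXAMPLE_REG_A,  0xffffffff,   0x00000001,  // full overwrite
 *		mmEXAMPLE_REG_B,  0x0000ff00,   0x00003400,  // masked update
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */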
1538
1539 /**
1540 * amdgpu_device_pci_config_reset - reset the GPU
1541 *
1542 * @adev: amdgpu_device pointer
1543 *
1544 * Resets the GPU using the pci config reset sequence.
1545 * Only applicable to asics prior to vega10.
1546 */
1547 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1548 {
1549 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1550 }
1551
1552 /**
1553 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1554 *
1555 * @adev: amdgpu_device pointer
1556 *
1557 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1558 */
1559 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1560 {
1561 return pci_reset_function(adev->pdev);
1562 }
1563
1564 /*
1565 * amdgpu_device_wb_*()
1566 * Writeback is the method by which the GPU updates special pages in memory
1567 * with the status of certain GPU events (fences, ring pointers, etc.).
1568 */
1569
1570 /**
1571 * amdgpu_device_wb_fini - Disable Writeback and free memory
1572 *
1573 * @adev: amdgpu_device pointer
1574 *
1575 * Disables Writeback and frees the Writeback memory (all asics).
1576 * Used at driver shutdown.
1577 */
1578 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1579 {
1580 if (adev->wb.wb_obj) {
1581 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1582 &adev->wb.gpu_addr,
1583 (void **)&adev->wb.wb);
1584 adev->wb.wb_obj = NULL;
1585 }
1586 }
1587
1588 /**
1589 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1590 *
1591 * @adev: amdgpu_device pointer
1592 *
1593 * Initializes writeback and allocates writeback memory (all asics).
1594 * Used at driver startup.
1595 * Returns 0 on success or a negative error code on failure.
1596 */
1597 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1598 {
1599 int r;
1600
1601 if (adev->wb.wb_obj == NULL) {
1602 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1603 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1604 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1605 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1606 (void **)&adev->wb.wb);
1607 if (r) {
1608 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1609 return r;
1610 }
1611
1612 adev->wb.num_wb = AMDGPU_MAX_WB;
1613 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1614
1615 /* clear wb memory */
1616 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1617 }
1618
1619 return 0;
1620 }
1621
1622 /**
1623 * amdgpu_device_wb_get - Allocate a wb entry
1624 *
1625 * @adev: amdgpu_device pointer
1626 * @wb: wb index
1627 *
1628 * Allocate a wb slot for use by the driver (all asics).
1629 * Returns 0 on success or -EINVAL on failure.
1630 */
1631 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1632 {
1633 unsigned long flags, offset;
1634
1635 spin_lock_irqsave(&adev->wb.lock, flags);
1636 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1637 if (offset < adev->wb.num_wb) {
1638 __set_bit(offset, adev->wb.used);
1639 spin_unlock_irqrestore(&adev->wb.lock, flags);
1640 *wb = offset << 3; /* convert to dw offset */
1641 return 0;
1642 } else {
1643 spin_unlock_irqrestore(&adev->wb.lock, flags);
1644 return -EINVAL;
1645 }
1646 }
1647
1648 /**
1649 * amdgpu_device_wb_free - Free a wb entry
1650 *
1651 * @adev: amdgpu_device pointer
1652 * @wb: wb index
1653 *
1654 * Free a wb slot allocated for use by the driver (all asics)
1655 */
1656 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1657 {
1658 unsigned long flags;
1659
1660 wb >>= 3;
1661 spin_lock_irqsave(&adev->wb.lock, flags);
1662 if (wb < adev->wb.num_wb)
1663 __clear_bit(wb, adev->wb.used);
1664 spin_unlock_irqrestore(&adev->wb.lock, flags);
1665 }
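/*
 * Illustrative sketch: typical use of a writeback slot. The value returned by
 * amdgpu_device_wb_get() is a dword offset into the writeback page, so the CPU
 * view is adev->wb.wb[index] and the GPU address is adev->wb.gpu_addr plus
 * index * 4 (this mirrors how the ring/IB tests use writeback slots).
 *
 *	u32 index;
 *	u64 gpu_addr;
 *	int r;
 *
 *	r = amdgpu_device_wb_get(adev, &index);
 *	if (r)
 *		return r;
 *	gpu_addr = adev->wb.gpu_addr + ((u64)index * 4);
 *	adev->wb.wb[index] = cpu_to_le32(0xCAFEDEAD);
 *	... have the GPU write to gpu_addr, then poll adev->wb.wb[index] ...
 *	amdgpu_device_wb_free(adev, index);
 */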
1666
1667 /**
1668 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1669 *
1670 * @adev: amdgpu_device pointer
1671 *
1672 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1673 * to fail, but if any of the BARs is not accessible after the resize we abort
1674 * driver loading by returning -ENODEV.
1675 */
1676 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1677 {
1678 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1679 struct pci_bus *root;
1680 struct resource *res;
1681 int max_size, r;
1682 unsigned int i;
1683 u16 cmd;
1684
1685 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1686 return 0;
1687
1688 /* Bypass for VF */
1689 if (amdgpu_sriov_vf(adev))
1690 return 0;
1691
1692 if (!amdgpu_rebar)
1693 return 0;
1694
1695 /* resizing on Dell G5 SE platforms causes problems with runtime pm */
1696 if ((amdgpu_runtime_pm != 0) &&
1697 adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1698 adev->pdev->device == 0x731f &&
1699 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1700 return 0;
1701
1702 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1703 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1704 dev_warn(
1705 adev->dev,
1706 "System can't access extended configuration space, please check!!\n");
1707
1708 /* skip if the bios has already enabled large BAR */
1709 if (adev->gmc.real_vram_size &&
1710 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1711 return 0;
1712
1713 /* Check if the root BUS has 64bit memory resources */
1714 root = adev->pdev->bus;
1715 while (root->parent)
1716 root = root->parent;
1717
1718 pci_bus_for_each_resource(root, res, i) {
1719 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1720 res->start > 0x100000000ull)
1721 break;
1722 }
1723
1724 /* Trying to resize is pointless without a root hub window above 4GB */
1725 if (!res)
1726 return 0;
1727
1728 /* Limit the BAR size to what is available */
1729 max_size = pci_rebar_get_max_size(adev->pdev, 0);
1730 if (max_size < 0)
1731 return 0;
1732 rbar_size = min(max_size, rbar_size);
1733
1734 /* Disable memory decoding while we change the BAR addresses and size */
1735 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1736 pci_write_config_word(adev->pdev, PCI_COMMAND,
1737 cmd & ~PCI_COMMAND_MEMORY);
1738
1739 /* Tear down doorbell as resizing will release BARs */
1740 amdgpu_doorbell_fini(adev);
1741
1742 r = pci_resize_resource(adev->pdev, 0, rbar_size,
1743 (adev->asic_type >= CHIP_BONAIRE) ? 1 << 5
1744 : 1 << 2);
1745 if (r == -ENOSPC)
1746 dev_info(adev->dev,
1747 "Not enough PCI address space for a large BAR.");
1748 else if (r && r != -ENOTSUPP)
1749 dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);
1750
1751 /* When the doorbell or fb BAR isn't available we have no chance of
1752 * using the device.
1753 */
1754 r = amdgpu_doorbell_init(adev);
1755 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1756 return -ENODEV;
1757
1758 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1759
1760 return 0;
1761 }
1762
1763 /*
1764 * GPU helpers function.
1765 */
1766 /**
1767  * amdgpu_device_need_post - check if the hw needs post or not
1768 *
1769 * @adev: amdgpu_device pointer
1770 *
1771  * Check if the asic has been initialized (all asics) at driver startup,
1772  * or if post is needed because a hw reset was performed.
1773  * Returns true if post is needed or false if not.
1774 */
1775 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1776 {
1777 uint32_t reg, flags;
1778
1779 if (amdgpu_sriov_vf(adev))
1780 return false;
1781
1782 flags = amdgpu_device_get_vbios_flags(adev);
1783 if (flags & AMDGPU_VBIOS_SKIP)
1784 return false;
1785 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
1786 return false;
1787
1788 if (amdgpu_passthrough(adev)) {
1789 		/* for FIJI: In the whole GPU pass-through virtualization case, after a VM reboot
1790 		 * some old smc fw still needs the driver to do vPost, otherwise the gpu hangs.
1791 		 * smc fw versions above 22.15 don't have this flaw, so we force
1792 		 * vpost to be executed for smc versions below 22.15
1793 */
1794 if (adev->asic_type == CHIP_FIJI) {
1795 int err;
1796 uint32_t fw_ver;
1797
1798 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1799 /* force vPost if error occurred */
1800 if (err)
1801 return true;
1802
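			/* the SMC firmware version is stored at dword offset 69 of the image */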
1803 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1804 release_firmware(adev->pm.fw);
1805 if (fw_ver < 0x00160e00)
1806 return true;
1807 }
1808 }
1809
1810 /* Don't post if we need to reset whole hive on init */
1811 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1812 return false;
1813
1814 if (adev->has_hw_reset) {
1815 adev->has_hw_reset = false;
1816 return true;
1817 }
1818
1819 /* bios scratch used on CIK+ */
1820 if (adev->asic_type >= CHIP_BONAIRE)
1821 return amdgpu_atombios_scratch_need_asic_init(adev);
1822
1823 /* check MEM_SIZE for older asics */
1824 reg = amdgpu_asic_get_config_memsize(adev);
1825
1826 if ((reg != 0) && (reg != 0xffffffff))
1827 return false;
1828
1829 return true;
1830 }
1831
1832 /*
1833 * Check whether seamless boot is supported.
1834 *
1835 * So far we only support seamless boot on DCE 3.0 or later.
1836  * If users report that it works on older ASICs as well, we may
1837 * loosen this.
1838 */
1839 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1840 {
1841 switch (amdgpu_seamless) {
1842 case -1:
1843 break;
1844 case 1:
1845 return true;
1846 case 0:
1847 return false;
1848 default:
1849 dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
1850 amdgpu_seamless);
1851 return false;
1852 }
1853
1854 if (!(adev->flags & AMD_IS_APU))
1855 return false;
1856
1857 if (adev->mman.keep_stolen_vga_memory)
1858 return false;
1859
1860 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1861 }
1862
1863 /*
1864 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1865 * don't support dynamic speed switching. Until we have confirmation from Intel
1866 * that a specific host supports it, it's safer that we keep it disabled for all.
1867 *
1868 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1869 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1870 */
1871 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1872 {
1873 #if IS_ENABLED(CONFIG_X86)
1874 struct cpuinfo_x86 *c = &cpu_data(0);
1875
1876 /* eGPU change speeds based on USB4 fabric conditions */
1877 if (dev_is_removable(adev->dev))
1878 return true;
1879
1880 if (c->x86_vendor == X86_VENDOR_INTEL)
1881 return false;
1882 #endif
1883 return true;
1884 }
1885
1886 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
1887 {
1888 	/* Enabling ASPM causes random hangs on Tahiti and Oland on Zen4.
1889 * It's unclear if this is a platform-specific or GPU-specific issue.
1890 * Disable ASPM on SI for the time being.
1891 */
1892 if (adev->family == AMDGPU_FAMILY_SI)
1893 return true;
1894
1895 #if IS_ENABLED(CONFIG_X86)
1896 struct cpuinfo_x86 *c = &cpu_data(0);
1897
1898 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
1899 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
1900 return false;
1901
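	/* disable ASPM for Gen5-capable GC 12.0.x parts on Alder Lake/Raptor Lake hosts */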
1902 if (c->x86 == 6 &&
1903 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
1904 switch (c->x86_model) {
1905 case VFM_MODEL(INTEL_ALDERLAKE):
1906 case VFM_MODEL(INTEL_ALDERLAKE_L):
1907 case VFM_MODEL(INTEL_RAPTORLAKE):
1908 case VFM_MODEL(INTEL_RAPTORLAKE_P):
1909 case VFM_MODEL(INTEL_RAPTORLAKE_S):
1910 return true;
1911 default:
1912 return false;
1913 }
1914 } else {
1915 return false;
1916 }
1917 #else
1918 return false;
1919 #endif
1920 }
1921
1922 /**
1923 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1924 *
1925 * @adev: amdgpu_device pointer
1926 *
1927 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1928 * be set for this device.
1929 *
1930 * Returns true if it should be used or false if not.
1931 */
1932 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1933 {
1934 switch (amdgpu_aspm) {
1935 case -1:
1936 break;
1937 case 0:
1938 return false;
1939 case 1:
1940 return true;
1941 default:
1942 return false;
1943 }
1944 if (adev->flags & AMD_IS_APU)
1945 return false;
1946 if (amdgpu_device_aspm_support_quirk(adev))
1947 return false;
1948 return pcie_aspm_enabled(adev->pdev);
1949 }
1950
1951 /* if we get transitioned to only one device, take VGA back */
1952 /**
1953 * amdgpu_device_vga_set_decode - enable/disable vga decode
1954 *
1955 * @pdev: PCI device pointer
1956 * @state: enable/disable vga decode
1957 *
1958 * Enable/disable vga decode (all asics).
1959 * Returns VGA resource flags.
1960 */
1961 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1962 bool state)
1963 {
1964 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1965
1966 amdgpu_asic_set_vga_state(adev, state);
1967 if (state)
1968 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1969 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1970 else
1971 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1972 }
1973
1974 /**
1975 * amdgpu_device_check_block_size - validate the vm block size
1976 *
1977 * @adev: amdgpu_device pointer
1978 *
1979 * Validates the vm block size specified via module parameter.
1980 * The vm block size defines number of bits in page table versus page directory,
1981 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1982 * page table and the remaining bits are in the page directory.
1983 */
1984 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1985 {
1986 /* defines number of bits in page table versus page directory,
1987 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1988 * page table and the remaining bits are in the page directory
1989 */
1990 if (amdgpu_vm_block_size == -1)
1991 return;
1992
1993 if (amdgpu_vm_block_size < 9) {
1994 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1995 amdgpu_vm_block_size);
1996 amdgpu_vm_block_size = -1;
1997 }
1998 }
1999
2000 /**
2001 * amdgpu_device_check_vm_size - validate the vm size
2002 *
2003 * @adev: amdgpu_device pointer
2004 *
2005 * Validates the vm size in GB specified via module parameter.
2006 * The VM size is the size of the GPU virtual memory space in GB.
2007 */
2008 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
2009 {
2010 /* no need to check the default value */
2011 if (amdgpu_vm_size == -1)
2012 return;
2013
2014 if (amdgpu_vm_size < 1) {
2015 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
2016 amdgpu_vm_size);
2017 amdgpu_vm_size = -1;
2018 }
2019 }
2020
2021 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
2022 {
2023 struct sysinfo si;
2024 bool is_os_64 = (sizeof(void *) == 8);
2025 uint64_t total_memory;
2026 uint64_t dram_size_seven_GB = 0x1B8000000;
2027 uint64_t dram_size_three_GB = 0xB8000000;
2028
2029 if (amdgpu_smu_memory_pool_size == 0)
2030 return;
2031
2032 if (!is_os_64) {
2033 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n");
2034 goto def_value;
2035 }
2036 si_meminfo(&si);
2037 total_memory = (uint64_t)si.totalram * si.mem_unit;
2038
2039 if ((amdgpu_smu_memory_pool_size == 1) ||
2040 (amdgpu_smu_memory_pool_size == 2)) {
2041 if (total_memory < dram_size_three_GB)
2042 goto def_value1;
2043 } else if ((amdgpu_smu_memory_pool_size == 4) ||
2044 (amdgpu_smu_memory_pool_size == 8)) {
2045 if (total_memory < dram_size_seven_GB)
2046 goto def_value1;
2047 } else {
2048 dev_warn(adev->dev, "Smu memory pool size not supported\n");
2049 goto def_value;
2050 }
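	/* the pool size module parameter is in units of 256MB (1 << 28 bytes) */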
2051 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
2052
2053 return;
2054
2055 def_value1:
2056 	dev_warn(adev->dev, "Not enough system memory\n");
2057 def_value:
2058 adev->pm.smu_prv_buffer_size = 0;
2059 }
2060
2061 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
2062 {
2063 if (!(adev->flags & AMD_IS_APU) ||
2064 adev->asic_type < CHIP_RAVEN)
2065 return 0;
2066
2067 switch (adev->asic_type) {
2068 case CHIP_RAVEN:
2069 if (adev->pdev->device == 0x15dd)
2070 adev->apu_flags |= AMD_APU_IS_RAVEN;
2071 if (adev->pdev->device == 0x15d8)
2072 adev->apu_flags |= AMD_APU_IS_PICASSO;
2073 break;
2074 case CHIP_RENOIR:
2075 if ((adev->pdev->device == 0x1636) ||
2076 (adev->pdev->device == 0x164c))
2077 adev->apu_flags |= AMD_APU_IS_RENOIR;
2078 else
2079 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
2080 break;
2081 case CHIP_VANGOGH:
2082 adev->apu_flags |= AMD_APU_IS_VANGOGH;
2083 break;
2084 case CHIP_YELLOW_CARP:
2085 break;
2086 case CHIP_CYAN_SKILLFISH:
2087 if ((adev->pdev->device == 0x13FE) ||
2088 (adev->pdev->device == 0x143F))
2089 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
2090 break;
2091 default:
2092 break;
2093 }
2094
2095 return 0;
2096 }
2097
2098 /**
2099 * amdgpu_device_check_arguments - validate module params
2100 *
2101 * @adev: amdgpu_device pointer
2102 *
2103 * Validates certain module parameters and updates
2104 * the associated values used by the driver (all asics).
2105 */
2106 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2107 {
2108 int i;
2109
2110 if (amdgpu_sched_jobs < 4) {
2111 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2112 amdgpu_sched_jobs);
2113 amdgpu_sched_jobs = 4;
2114 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
2115 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2116 amdgpu_sched_jobs);
2117 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2118 }
2119
2120 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2121 /* gart size must be greater or equal to 32M */
2122 dev_warn(adev->dev, "gart size (%d) too small\n",
2123 amdgpu_gart_size);
2124 amdgpu_gart_size = -1;
2125 }
2126
2127 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2128 /* gtt size must be greater or equal to 32M */
2129 dev_warn(adev->dev, "gtt size (%d) too small\n",
2130 amdgpu_gtt_size);
2131 amdgpu_gtt_size = -1;
2132 }
2133
2134 /* valid range is between 4 and 9 inclusive */
2135 if (amdgpu_vm_fragment_size != -1 &&
2136 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2137 dev_warn(adev->dev, "valid range is between 4 and 9\n");
2138 amdgpu_vm_fragment_size = -1;
2139 }
2140
2141 if (amdgpu_sched_hw_submission < 2) {
2142 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2143 amdgpu_sched_hw_submission);
2144 amdgpu_sched_hw_submission = 2;
2145 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2146 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2147 amdgpu_sched_hw_submission);
2148 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2149 }
2150
2151 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2152 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2153 amdgpu_reset_method = -1;
2154 }
2155
2156 amdgpu_device_check_smu_prv_buffer_size(adev);
2157
2158 amdgpu_device_check_vm_size(adev);
2159
2160 amdgpu_device_check_block_size(adev);
2161
2162 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2163
2164 for (i = 0; i < MAX_XCP; i++) {
2165 switch (amdgpu_enforce_isolation) {
2166 case -1:
2167 case 0:
2168 default:
2169 /* disable */
2170 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
2171 break;
2172 case 1:
2173 /* enable */
2174 adev->enforce_isolation[i] =
2175 AMDGPU_ENFORCE_ISOLATION_ENABLE;
2176 break;
2177 case 2:
2178 /* enable legacy mode */
2179 adev->enforce_isolation[i] =
2180 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
2181 break;
2182 case 3:
2183 /* enable only process isolation without submitting cleaner shader */
2184 adev->enforce_isolation[i] =
2185 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER;
2186 break;
2187 }
2188 }
2189
2190 return 0;
2191 }
2192
2193 /**
2194 * amdgpu_switcheroo_set_state - set switcheroo state
2195 *
2196 * @pdev: pci dev pointer
2197 * @state: vga_switcheroo state
2198 *
2199 * Callback for the switcheroo driver. Suspends or resumes
2200 * the asics before or after it is powered up using ACPI methods.
2201 */
2202 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2203 enum vga_switcheroo_state state)
2204 {
2205 struct drm_device *dev = pci_get_drvdata(pdev);
2206 int r;
2207
2208 if (amdgpu_device_supports_px(drm_to_adev(dev)) &&
2209 state == VGA_SWITCHEROO_OFF)
2210 return;
2211
2212 if (state == VGA_SWITCHEROO_ON) {
2213 pr_info("switched on\n");
2214 /* don't suspend or resume card normally */
2215 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2216
2217 pci_set_power_state(pdev, PCI_D0);
2218 amdgpu_device_load_pci_state(pdev);
2219 r = pci_enable_device(pdev);
2220 if (r)
2221 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n",
2222 r);
2223 amdgpu_device_resume(dev, true);
2224
2225 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2226 } else {
2227 dev_info(&pdev->dev, "switched off\n");
2228 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2229 amdgpu_device_prepare(dev);
2230 amdgpu_device_suspend(dev, true);
2231 amdgpu_device_cache_pci_state(pdev);
2232 /* Shut down the device */
2233 pci_disable_device(pdev);
2234 pci_set_power_state(pdev, PCI_D3cold);
2235 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2236 }
2237 }
2238
2239 /**
2240 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2241 *
2242 * @pdev: pci dev pointer
2243 *
2244  * Callback for the switcheroo driver. Check if the switcheroo
2245  * state can be changed.
2246 * Returns true if the state can be changed, false if not.
2247 */
2248 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2249 {
2250 struct drm_device *dev = pci_get_drvdata(pdev);
2251
2252 /*
2253 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2254 * locking inversion with the driver load path. And the access here is
2255 * completely racy anyway. So don't bother with locking for now.
2256 */
2257 return atomic_read(&dev->open_count) == 0;
2258 }
2259
2260 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2261 .set_gpu_state = amdgpu_switcheroo_set_state,
2262 .reprobe = NULL,
2263 .can_switch = amdgpu_switcheroo_can_switch,
2264 };
2265
2266 /**
2267 * amdgpu_device_ip_set_clockgating_state - set the CG state
2268 *
2269 * @dev: amdgpu_device pointer
2270 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2271 * @state: clockgating state (gate or ungate)
2272 *
2273 * Sets the requested clockgating state for all instances of
2274 * the hardware IP specified.
2275 * Returns the error code from the last instance.
2276 */
2277 int amdgpu_device_ip_set_clockgating_state(void *dev,
2278 enum amd_ip_block_type block_type,
2279 enum amd_clockgating_state state)
2280 {
2281 struct amdgpu_device *adev = dev;
2282 int i, r = 0;
2283
2284 for (i = 0; i < adev->num_ip_blocks; i++) {
2285 if (!adev->ip_blocks[i].status.valid)
2286 continue;
2287 if (adev->ip_blocks[i].version->type != block_type)
2288 continue;
2289 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2290 continue;
2291 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2292 &adev->ip_blocks[i], state);
2293 if (r)
2294 dev_err(adev->dev,
2295 "set_clockgating_state of IP block <%s> failed %d\n",
2296 adev->ip_blocks[i].version->funcs->name, r);
2297 }
2298 return r;
2299 }
2300
2301 /**
2302 * amdgpu_device_ip_set_powergating_state - set the PG state
2303 *
2304 * @dev: amdgpu_device pointer
2305 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2306 * @state: powergating state (gate or ungate)
2307 *
2308 * Sets the requested powergating state for all instances of
2309 * the hardware IP specified.
2310 * Returns the error code from the last instance.
2311 */
2312 int amdgpu_device_ip_set_powergating_state(void *dev,
2313 enum amd_ip_block_type block_type,
2314 enum amd_powergating_state state)
2315 {
2316 struct amdgpu_device *adev = dev;
2317 int i, r = 0;
2318
2319 for (i = 0; i < adev->num_ip_blocks; i++) {
2320 if (!adev->ip_blocks[i].status.valid)
2321 continue;
2322 if (adev->ip_blocks[i].version->type != block_type)
2323 continue;
2324 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2325 continue;
2326 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2327 &adev->ip_blocks[i], state);
2328 if (r)
2329 dev_err(adev->dev,
2330 "set_powergating_state of IP block <%s> failed %d\n",
2331 adev->ip_blocks[i].version->funcs->name, r);
2332 }
2333 return r;
2334 }
2335
2336 /**
2337 * amdgpu_device_ip_get_clockgating_state - get the CG state
2338 *
2339 * @adev: amdgpu_device pointer
2340 * @flags: clockgating feature flags
2341 *
2342 * Walks the list of IPs on the device and updates the clockgating
2343 * flags for each IP.
2344 * Updates @flags with the feature flags for each hardware IP where
2345 * clockgating is enabled.
2346 */
2347 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2348 u64 *flags)
2349 {
2350 int i;
2351
2352 for (i = 0; i < adev->num_ip_blocks; i++) {
2353 if (!adev->ip_blocks[i].status.valid)
2354 continue;
2355 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2356 adev->ip_blocks[i].version->funcs->get_clockgating_state(
2357 &adev->ip_blocks[i], flags);
2358 }
2359 }
2360
2361 /**
2362 * amdgpu_device_ip_wait_for_idle - wait for idle
2363 *
2364 * @adev: amdgpu_device pointer
2365 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2366 *
2367  * Waits for the requested hardware IP to be idle.
2368 * Returns 0 for success or a negative error code on failure.
2369 */
2370 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2371 enum amd_ip_block_type block_type)
2372 {
2373 int i, r;
2374
2375 for (i = 0; i < adev->num_ip_blocks; i++) {
2376 if (!adev->ip_blocks[i].status.valid)
2377 continue;
2378 if (adev->ip_blocks[i].version->type == block_type) {
2379 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2380 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2381 &adev->ip_blocks[i]);
2382 if (r)
2383 return r;
2384 }
2385 break;
2386 }
2387 }
2388 return 0;
2389
2390 }
2391
2392 /**
2393 * amdgpu_device_ip_is_hw - is the hardware IP enabled
2394 *
2395 * @adev: amdgpu_device pointer
2396 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2397 *
2398  * Check if the hardware IP is enabled or not.
2399  * Returns true if the IP is enabled, false if not.
2400 */
2401 bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev,
2402 enum amd_ip_block_type block_type)
2403 {
2404 int i;
2405
2406 for (i = 0; i < adev->num_ip_blocks; i++) {
2407 if (adev->ip_blocks[i].version->type == block_type)
2408 return adev->ip_blocks[i].status.hw;
2409 }
2410 return false;
2411 }
2412
2413 /**
2414 * amdgpu_device_ip_is_valid - is the hardware IP valid
2415 *
2416 * @adev: amdgpu_device pointer
2417 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2418 *
2419 * Check if the hardware IP is valid or not.
2420  * Returns true if the IP is valid, false if not.
2421 */
2422 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2423 enum amd_ip_block_type block_type)
2424 {
2425 int i;
2426
2427 for (i = 0; i < adev->num_ip_blocks; i++) {
2428 if (adev->ip_blocks[i].version->type == block_type)
2429 return adev->ip_blocks[i].status.valid;
2430 }
2431 return false;
2432
2433 }
2434
2435 /**
2436 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2437 *
2438 * @adev: amdgpu_device pointer
2439 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2440 *
2441 * Returns a pointer to the hardware IP block structure
2442 * if it exists for the asic, otherwise NULL.
2443 */
2444 struct amdgpu_ip_block *
2445 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2446 enum amd_ip_block_type type)
2447 {
2448 int i;
2449
2450 for (i = 0; i < adev->num_ip_blocks; i++)
2451 if (adev->ip_blocks[i].version->type == type)
2452 return &adev->ip_blocks[i];
2453
2454 return NULL;
2455 }
2456
2457 /**
2458 * amdgpu_device_ip_block_version_cmp
2459 *
2460 * @adev: amdgpu_device pointer
2461 * @type: enum amd_ip_block_type
2462 * @major: major version
2463 * @minor: minor version
2464 *
2465 * return 0 if equal or greater
2466 * return 1 if smaller or the ip_block doesn't exist
2467 */
2468 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2469 enum amd_ip_block_type type,
2470 u32 major, u32 minor)
2471 {
2472 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2473
2474 if (ip_block && ((ip_block->version->major > major) ||
2475 ((ip_block->version->major == major) &&
2476 (ip_block->version->minor >= minor))))
2477 return 0;
2478
2479 return 1;
2480 }
2481
2482 static const char *ip_block_names[] = {
2483 [AMD_IP_BLOCK_TYPE_COMMON] = "common",
2484 [AMD_IP_BLOCK_TYPE_GMC] = "gmc",
2485 [AMD_IP_BLOCK_TYPE_IH] = "ih",
2486 [AMD_IP_BLOCK_TYPE_SMC] = "smu",
2487 [AMD_IP_BLOCK_TYPE_PSP] = "psp",
2488 [AMD_IP_BLOCK_TYPE_DCE] = "dce",
2489 [AMD_IP_BLOCK_TYPE_GFX] = "gfx",
2490 [AMD_IP_BLOCK_TYPE_SDMA] = "sdma",
2491 [AMD_IP_BLOCK_TYPE_UVD] = "uvd",
2492 [AMD_IP_BLOCK_TYPE_VCE] = "vce",
2493 [AMD_IP_BLOCK_TYPE_ACP] = "acp",
2494 [AMD_IP_BLOCK_TYPE_VCN] = "vcn",
2495 [AMD_IP_BLOCK_TYPE_MES] = "mes",
2496 [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg",
2497 [AMD_IP_BLOCK_TYPE_VPE] = "vpe",
2498 [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm",
2499 [AMD_IP_BLOCK_TYPE_ISP] = "isp",
2500 [AMD_IP_BLOCK_TYPE_RAS] = "ras",
2501 };
2502
2503 static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type)
2504 {
2505 int idx = (int)type;
2506
2507 return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown";
2508 }
2509
2510 /**
2511 * amdgpu_device_ip_block_add
2512 *
2513 * @adev: amdgpu_device pointer
2514 * @ip_block_version: pointer to the IP to add
2515 *
2516 * Adds the IP block driver information to the collection of IPs
2517 * on the asic.
2518 */
2519 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2520 const struct amdgpu_ip_block_version *ip_block_version)
2521 {
2522 if (!ip_block_version)
2523 return -EINVAL;
2524
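	/* don't register VCN/JPEG IP blocks that have been harvested */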
2525 switch (ip_block_version->type) {
2526 case AMD_IP_BLOCK_TYPE_VCN:
2527 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2528 return 0;
2529 break;
2530 case AMD_IP_BLOCK_TYPE_JPEG:
2531 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2532 return 0;
2533 break;
2534 default:
2535 break;
2536 }
2537
2538 dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n",
2539 adev->num_ip_blocks,
2540 ip_block_name(adev, ip_block_version->type),
2541 ip_block_version->major,
2542 ip_block_version->minor,
2543 ip_block_version->rev,
2544 ip_block_version->funcs->name);
2545
2546 adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2547
2548 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2549
2550 return 0;
2551 }
2552
2553 /**
2554 * amdgpu_device_enable_virtual_display - enable virtual display feature
2555 *
2556 * @adev: amdgpu_device pointer
2557 *
2558  * Enables the virtual display feature if the user has enabled it via
2559  * the module parameter virtual_display. This feature provides virtual
2560 * display hardware on headless boards or in virtualized environments.
2561 * This function parses and validates the configuration string specified by
2562 * the user and configures the virtual display configuration (number of
2563 * virtual connectors, crtcs, etc.) specified.
2564 */
2565 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2566 {
2567 adev->enable_virtual_display = false;
2568
2569 if (amdgpu_virtual_display) {
2570 const char *pci_address_name = pci_name(adev->pdev);
2571 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2572
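		/* the parameter is a ';'-separated list of "<pci bus id>[,<num crtc>]" entries; "all" matches every device */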
2573 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2574 pciaddstr_tmp = pciaddstr;
2575 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2576 pciaddname = strsep(&pciaddname_tmp, ",");
2577 if (!strcmp("all", pciaddname)
2578 || !strcmp(pci_address_name, pciaddname)) {
2579 long num_crtc;
2580 int res = -1;
2581
2582 adev->enable_virtual_display = true;
2583
2584 if (pciaddname_tmp)
2585 res = kstrtol(pciaddname_tmp, 10,
2586 &num_crtc);
2587
2588 if (!res) {
2589 if (num_crtc < 1)
2590 num_crtc = 1;
2591 if (num_crtc > 6)
2592 num_crtc = 6;
2593 adev->mode_info.num_crtc = num_crtc;
2594 } else {
2595 adev->mode_info.num_crtc = 1;
2596 }
2597 break;
2598 }
2599 }
2600
2601 dev_info(
2602 adev->dev,
2603 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2604 amdgpu_virtual_display, pci_address_name,
2605 adev->enable_virtual_display, adev->mode_info.num_crtc);
2606
2607 kfree(pciaddstr);
2608 }
2609 }
2610
2611 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2612 {
2613 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2614 adev->mode_info.num_crtc = 1;
2615 adev->enable_virtual_display = true;
2616 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n",
2617 adev->enable_virtual_display,
2618 adev->mode_info.num_crtc);
2619 }
2620 }
2621
2622 /**
2623 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2624 *
2625 * @adev: amdgpu_device pointer
2626 *
2627 * Parses the asic configuration parameters specified in the gpu info
2628 * firmware and makes them available to the driver for use in configuring
2629 * the asic.
2630 * Returns 0 on success, -EINVAL on failure.
2631 */
2632 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2633 {
2634 const char *chip_name;
2635 int err;
2636 const struct gpu_info_firmware_header_v1_0 *hdr;
2637
2638 adev->firmware.gpu_info_fw = NULL;
2639
2640 switch (adev->asic_type) {
2641 default:
2642 return 0;
2643 case CHIP_VEGA10:
2644 chip_name = "vega10";
2645 break;
2646 case CHIP_VEGA12:
2647 chip_name = "vega12";
2648 break;
2649 case CHIP_RAVEN:
2650 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2651 chip_name = "raven2";
2652 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2653 chip_name = "picasso";
2654 else
2655 chip_name = "raven";
2656 break;
2657 case CHIP_ARCTURUS:
2658 chip_name = "arcturus";
2659 break;
2660 case CHIP_NAVI12:
2661 if (adev->discovery.bin)
2662 return 0;
2663 chip_name = "navi12";
2664 break;
2665 case CHIP_CYAN_SKILLFISH:
2666 if (adev->discovery.bin)
2667 return 0;
2668 chip_name = "cyan_skillfish";
2669 break;
2670 }
2671
2672 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2673 AMDGPU_UCODE_OPTIONAL,
2674 "amdgpu/%s_gpu_info.bin", chip_name);
2675 if (err) {
2676 dev_err(adev->dev,
2677 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2678 chip_name);
2679 goto out;
2680 }
2681
2682 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2683 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2684
2685 switch (hdr->version_major) {
2686 case 1:
2687 {
2688 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2689 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2690 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2691
2692 /*
2693 * Should be dropped when DAL no longer needs it.
2694 */
2695 if (adev->asic_type == CHIP_NAVI12)
2696 goto parse_soc_bounding_box;
2697
2698 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2699 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2700 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2701 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2702 adev->gfx.config.max_texture_channel_caches =
2703 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2704 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2705 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2706 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2707 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2708 adev->gfx.config.double_offchip_lds_buf =
2709 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2710 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2711 adev->gfx.cu_info.max_waves_per_simd =
2712 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2713 adev->gfx.cu_info.max_scratch_slots_per_cu =
2714 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2715 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2716 if (hdr->version_minor >= 1) {
2717 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2718 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2719 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2720 adev->gfx.config.num_sc_per_sh =
2721 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2722 adev->gfx.config.num_packer_per_sc =
2723 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2724 }
2725
2726 parse_soc_bounding_box:
2727 /*
2728 		 * soc bounding box info is not integrated into the discovery table,
2729 * we always need to parse it from gpu info firmware if needed.
2730 */
2731 if (hdr->version_minor == 2) {
2732 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2733 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2734 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2735 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2736 }
2737 break;
2738 }
2739 default:
2740 dev_err(adev->dev,
2741 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2742 err = -EINVAL;
2743 goto out;
2744 }
2745 out:
2746 return err;
2747 }
2748
2749 static void amdgpu_uid_init(struct amdgpu_device *adev)
2750 {
2751 /* Initialize the UID for the device */
2752 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL);
2753 if (!adev->uid_info) {
2754 dev_warn(adev->dev, "Failed to allocate memory for UID\n");
2755 return;
2756 }
2757 adev->uid_info->adev = adev;
2758 }
2759
2760 static void amdgpu_uid_fini(struct amdgpu_device *adev)
2761 {
2762 /* Free the UID memory */
2763 kfree(adev->uid_info);
2764 adev->uid_info = NULL;
2765 }
2766
2767 /**
2768 * amdgpu_device_ip_early_init - run early init for hardware IPs
2769 *
2770 * @adev: amdgpu_device pointer
2771 *
2772 * Early initialization pass for hardware IPs. The hardware IPs that make
2773  * up each asic are discovered and each IP's early_init callback is run. This
2774 * is the first stage in initializing the asic.
2775 * Returns 0 on success, negative error code on failure.
2776 */
2777 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2778 {
2779 struct amdgpu_ip_block *ip_block;
2780 struct pci_dev *parent;
2781 bool total, skip_bios;
2782 uint32_t bios_flags;
2783 int i, r;
2784
2785 amdgpu_device_enable_virtual_display(adev);
2786
2787 if (amdgpu_sriov_vf(adev)) {
2788 r = amdgpu_virt_request_full_gpu(adev, true);
2789 if (r)
2790 return r;
2791
2792 r = amdgpu_virt_init_critical_region(adev);
2793 if (r)
2794 return r;
2795 }
2796
2797 switch (adev->asic_type) {
2798 #ifdef CONFIG_DRM_AMDGPU_SI
2799 case CHIP_VERDE:
2800 case CHIP_TAHITI:
2801 case CHIP_PITCAIRN:
2802 case CHIP_OLAND:
2803 case CHIP_HAINAN:
2804 adev->family = AMDGPU_FAMILY_SI;
2805 r = si_set_ip_blocks(adev);
2806 if (r)
2807 return r;
2808 break;
2809 #endif
2810 #ifdef CONFIG_DRM_AMDGPU_CIK
2811 case CHIP_BONAIRE:
2812 case CHIP_HAWAII:
2813 case CHIP_KAVERI:
2814 case CHIP_KABINI:
2815 case CHIP_MULLINS:
2816 if (adev->flags & AMD_IS_APU)
2817 adev->family = AMDGPU_FAMILY_KV;
2818 else
2819 adev->family = AMDGPU_FAMILY_CI;
2820
2821 r = cik_set_ip_blocks(adev);
2822 if (r)
2823 return r;
2824 break;
2825 #endif
2826 case CHIP_TOPAZ:
2827 case CHIP_TONGA:
2828 case CHIP_FIJI:
2829 case CHIP_POLARIS10:
2830 case CHIP_POLARIS11:
2831 case CHIP_POLARIS12:
2832 case CHIP_VEGAM:
2833 case CHIP_CARRIZO:
2834 case CHIP_STONEY:
2835 if (adev->flags & AMD_IS_APU)
2836 adev->family = AMDGPU_FAMILY_CZ;
2837 else
2838 adev->family = AMDGPU_FAMILY_VI;
2839
2840 r = vi_set_ip_blocks(adev);
2841 if (r)
2842 return r;
2843 break;
2844 default:
2845 r = amdgpu_discovery_set_ip_blocks(adev);
2846 if (r)
2847 return r;
2848 break;
2849 }
2850
2851 /* Check for IP version 9.4.3 with A0 hardware */
2852 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
2853 !amdgpu_device_get_rev_id(adev)) {
2854 dev_err(adev->dev, "Unsupported A0 hardware\n");
2855 return -ENODEV; /* device unsupported - no device error */
2856 }
2857
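	/* a non-removable dGPU with ATPX hybrid graphics or dGPU power control is a PX (PowerXpress) platform */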
2858 if (amdgpu_has_atpx() &&
2859 (amdgpu_is_atpx_hybrid() ||
2860 amdgpu_has_atpx_dgpu_power_cntl()) &&
2861 ((adev->flags & AMD_IS_APU) == 0) &&
2862 !dev_is_removable(&adev->pdev->dev))
2863 adev->flags |= AMD_IS_PX;
2864
2865 if (!(adev->flags & AMD_IS_APU)) {
2866 parent = pcie_find_root_port(adev->pdev);
2867 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2868 }
2869
2870 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2871 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2872 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2873 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2874 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2875 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2876 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2877
2878 adev->virt.is_xgmi_node_migrate_enabled = false;
2879 if (amdgpu_sriov_vf(adev)) {
2880 adev->virt.is_xgmi_node_migrate_enabled =
2881 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4);
2882 }
2883
2884 total = true;
2885 for (i = 0; i < adev->num_ip_blocks; i++) {
2886 ip_block = &adev->ip_blocks[i];
2887
2888 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2889 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i,
2890 adev->ip_blocks[i].version->funcs->name);
2891 adev->ip_blocks[i].status.valid = false;
2892 } else if (ip_block->version->funcs->early_init) {
2893 r = ip_block->version->funcs->early_init(ip_block);
2894 if (r == -ENOENT) {
2895 adev->ip_blocks[i].status.valid = false;
2896 } else if (r) {
2897 dev_err(adev->dev,
2898 "early_init of IP block <%s> failed %d\n",
2899 adev->ip_blocks[i].version->funcs->name,
2900 r);
2901 total = false;
2902 } else {
2903 adev->ip_blocks[i].status.valid = true;
2904 }
2905 } else {
2906 adev->ip_blocks[i].status.valid = true;
2907 }
2908 /* get the vbios after the asic_funcs are set up */
2909 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2910 r = amdgpu_device_parse_gpu_info_fw(adev);
2911 if (r)
2912 return r;
2913
2914 bios_flags = amdgpu_device_get_vbios_flags(adev);
2915 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP);
2916 /* Read BIOS */
2917 if (!skip_bios) {
2918 bool optional =
2919 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL);
2920 if (!amdgpu_get_bios(adev) && !optional)
2921 return -EINVAL;
2922
2923 if (optional && !adev->bios)
2924 dev_info(
2925 adev->dev,
2926 "VBIOS image optional, proceeding without VBIOS image");
2927
2928 if (adev->bios) {
2929 r = amdgpu_atombios_init(adev);
2930 if (r) {
2931 dev_err(adev->dev,
2932 "amdgpu_atombios_init failed\n");
2933 amdgpu_vf_error_put(
2934 adev,
2935 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL,
2936 0, 0);
2937 return r;
2938 }
2939 }
2940 }
2941
2942 			/* get pf2vf msg info at its earliest time */
2943 if (amdgpu_sriov_vf(adev))
2944 amdgpu_virt_init_data_exchange(adev);
2945
2946 }
2947 }
2948 if (!total)
2949 return -ENODEV;
2950
2951 if (adev->gmc.xgmi.supported)
2952 amdgpu_xgmi_early_init(adev);
2953
2954 if (amdgpu_is_multi_aid(adev))
2955 amdgpu_uid_init(adev);
2956 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2957 if (ip_block->status.valid != false)
2958 amdgpu_amdkfd_device_probe(adev);
2959
2960 adev->cg_flags &= amdgpu_cg_mask;
2961 adev->pg_flags &= amdgpu_pg_mask;
2962
2963 return 0;
2964 }
2965
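/* hw init phase 1: bring up the blocks needed earliest (COMMON, IH, and PSP when running SR-IOV) */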
2966 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2967 {
2968 int i, r;
2969
2970 for (i = 0; i < adev->num_ip_blocks; i++) {
2971 if (!adev->ip_blocks[i].status.sw)
2972 continue;
2973 if (adev->ip_blocks[i].status.hw)
2974 continue;
2975 if (!amdgpu_ip_member_of_hwini(
2976 adev, adev->ip_blocks[i].version->type))
2977 continue;
2978 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2979 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2980 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2981 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2982 if (r) {
2983 dev_err(adev->dev,
2984 "hw_init of IP block <%s> failed %d\n",
2985 adev->ip_blocks[i].version->funcs->name,
2986 r);
2987 return r;
2988 }
2989 adev->ip_blocks[i].status.hw = true;
2990 }
2991 }
2992
2993 return 0;
2994 }
2995
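/* hw init phase 2: bring up the remaining hw blocks */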
2996 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2997 {
2998 int i, r;
2999
3000 for (i = 0; i < adev->num_ip_blocks; i++) {
3001 if (!adev->ip_blocks[i].status.sw)
3002 continue;
3003 if (adev->ip_blocks[i].status.hw)
3004 continue;
3005 if (!amdgpu_ip_member_of_hwini(
3006 adev, adev->ip_blocks[i].version->type))
3007 continue;
3008 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3009 if (r) {
3010 dev_err(adev->dev,
3011 "hw_init of IP block <%s> failed %d\n",
3012 adev->ip_blocks[i].version->funcs->name, r);
3013 return r;
3014 }
3015 adev->ip_blocks[i].status.hw = true;
3016 }
3017
3018 return 0;
3019 }
3020
3021 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
3022 {
3023 int r = 0;
3024 int i;
3025 uint32_t smu_version;
3026
3027 if (adev->asic_type >= CHIP_VEGA10) {
3028 for (i = 0; i < adev->num_ip_blocks; i++) {
3029 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
3030 continue;
3031
3032 if (!amdgpu_ip_member_of_hwini(adev,
3033 AMD_IP_BLOCK_TYPE_PSP))
3034 break;
3035
3036 if (!adev->ip_blocks[i].status.sw)
3037 continue;
3038
3039 			/* no need to do the fw loading again if already done */
3040 if (adev->ip_blocks[i].status.hw == true)
3041 break;
3042
3043 if (amdgpu_in_reset(adev) || adev->in_suspend) {
3044 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3045 if (r)
3046 return r;
3047 } else {
3048 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3049 if (r) {
3050 dev_err(adev->dev,
3051 "hw_init of IP block <%s> failed %d\n",
3052 adev->ip_blocks[i]
3053 .version->funcs->name,
3054 r);
3055 return r;
3056 }
3057 adev->ip_blocks[i].status.hw = true;
3058 }
3059 break;
3060 }
3061 }
3062
3063 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
3064 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
3065
3066 return r;
3067 }
3068
3069 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
3070 {
3071 struct drm_sched_init_args args = {
3072 .ops = &amdgpu_sched_ops,
3073 .num_rqs = DRM_SCHED_PRIORITY_COUNT,
3074 .timeout_wq = adev->reset_domain->wq,
3075 .dev = adev->dev,
3076 };
3077 long timeout;
3078 int r, i;
3079
3080 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3081 struct amdgpu_ring *ring = adev->rings[i];
3082
3083 /* No need to setup the GPU scheduler for rings that don't need it */
3084 if (!ring || ring->no_scheduler)
3085 continue;
3086
3087 switch (ring->funcs->type) {
3088 case AMDGPU_RING_TYPE_GFX:
3089 timeout = adev->gfx_timeout;
3090 break;
3091 case AMDGPU_RING_TYPE_COMPUTE:
3092 timeout = adev->compute_timeout;
3093 break;
3094 case AMDGPU_RING_TYPE_SDMA:
3095 timeout = adev->sdma_timeout;
3096 break;
3097 default:
3098 timeout = adev->video_timeout;
3099 break;
3100 }
3101
3102 args.timeout = timeout;
3103 args.credit_limit = ring->num_hw_submission;
3104 args.score = ring->sched_score;
3105 args.name = ring->name;
3106
3107 r = drm_sched_init(&ring->sched, &args);
3108 if (r) {
3109 dev_err(adev->dev,
3110 "Failed to create scheduler on ring %s.\n",
3111 ring->name);
3112 return r;
3113 }
3114 r = amdgpu_uvd_entity_init(adev, ring);
3115 if (r) {
3116 dev_err(adev->dev,
3117 "Failed to create UVD scheduling entity on ring %s.\n",
3118 ring->name);
3119 return r;
3120 }
3121 r = amdgpu_vce_entity_init(adev, ring);
3122 if (r) {
3123 dev_err(adev->dev,
3124 "Failed to create VCE scheduling entity on ring %s.\n",
3125 ring->name);
3126 return r;
3127 }
3128 }
3129
3130 if (adev->xcp_mgr)
3131 amdgpu_xcp_update_partition_sched_list(adev);
3132
3133 return 0;
3134 }
3135
3136
3137 /**
3138 * amdgpu_device_ip_init - run init for hardware IPs
3139 *
3140 * @adev: amdgpu_device pointer
3141 *
3142 * Main initialization pass for hardware IPs. The list of all the hardware
3143 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
3144 * are run. sw_init initializes the software state associated with each IP
3145 * and hw_init initializes the hardware associated with each IP.
3146 * Returns 0 on success, negative error code on failure.
3147 */
3148 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
3149 {
3150 bool init_badpage;
3151 int i, r;
3152
3153 r = amdgpu_ras_init(adev);
3154 if (r)
3155 return r;
3156
3157 for (i = 0; i < adev->num_ip_blocks; i++) {
3158 if (!adev->ip_blocks[i].status.valid)
3159 continue;
3160 if (adev->ip_blocks[i].version->funcs->sw_init) {
3161 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
3162 if (r) {
3163 dev_err(adev->dev,
3164 "sw_init of IP block <%s> failed %d\n",
3165 adev->ip_blocks[i].version->funcs->name,
3166 r);
3167 goto init_failed;
3168 }
3169 }
3170 adev->ip_blocks[i].status.sw = true;
3171
3172 if (!amdgpu_ip_member_of_hwini(
3173 adev, adev->ip_blocks[i].version->type))
3174 continue;
3175
3176 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
3177 /* need to do common hw init early so everything is set up for gmc */
3178 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3179 if (r) {
3180 dev_err(adev->dev, "hw_init %d failed %d\n", i,
3181 r);
3182 goto init_failed;
3183 }
3184 adev->ip_blocks[i].status.hw = true;
3185 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3186 /* need to do gmc hw init early so we can allocate gpu mem */
3187 /* Try to reserve bad pages early */
3188 if (amdgpu_sriov_vf(adev))
3189 amdgpu_virt_exchange_data(adev);
3190
3191 r = amdgpu_device_mem_scratch_init(adev);
3192 if (r) {
3193 dev_err(adev->dev,
3194 "amdgpu_mem_scratch_init failed %d\n",
3195 r);
3196 goto init_failed;
3197 }
3198 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3199 if (r) {
3200 dev_err(adev->dev, "hw_init %d failed %d\n", i,
3201 r);
3202 goto init_failed;
3203 }
3204 r = amdgpu_device_wb_init(adev);
3205 if (r) {
3206 dev_err(adev->dev,
3207 "amdgpu_device_wb_init failed %d\n", r);
3208 goto init_failed;
3209 }
3210 adev->ip_blocks[i].status.hw = true;
3211
3212 /* right after GMC hw init, we create CSA */
3213 if (adev->gfx.mcbp) {
3214 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
3215 AMDGPU_GEM_DOMAIN_VRAM |
3216 AMDGPU_GEM_DOMAIN_GTT,
3217 AMDGPU_CSA_SIZE);
3218 if (r) {
3219 dev_err(adev->dev,
3220 "allocate CSA failed %d\n", r);
3221 goto init_failed;
3222 }
3223 }
3224
3225 r = amdgpu_seq64_init(adev);
3226 if (r) {
3227 dev_err(adev->dev, "allocate seq64 failed %d\n",
3228 r);
3229 goto init_failed;
3230 }
3231 }
3232 }
3233
3234 if (amdgpu_sriov_vf(adev))
3235 amdgpu_virt_init_data_exchange(adev);
3236
3237 r = amdgpu_ib_pool_init(adev);
3238 if (r) {
3239 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
3240 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
3241 goto init_failed;
3242 }
3243
3244 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
3245 if (r)
3246 goto init_failed;
3247
3248 r = amdgpu_device_ip_hw_init_phase1(adev);
3249 if (r)
3250 goto init_failed;
3251
3252 r = amdgpu_device_fw_loading(adev);
3253 if (r)
3254 goto init_failed;
3255
3256 r = amdgpu_device_ip_hw_init_phase2(adev);
3257 if (r)
3258 goto init_failed;
3259
3260 /*
3261 * retired pages will be loaded from eeprom and reserved here,
3262 * it should be called after amdgpu_device_ip_hw_init_phase2 since
3263 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
3264 	 * functional for I2C communication, which is only true at this point.
3265 *
3266 	 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
3267 	 * about failures caused by a bad gpu situation and stops the amdgpu
3268 	 * init process accordingly. For other failure cases, it still releases
3269 	 * all the resources and prints an error message, rather than returning
3270 	 * a negative value to the upper level.
3271 *
3272 * Note: theoretically, this should be called before all vram allocations
3273 	 * to protect retired pages from being reused
3274 */
3275 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3276 r = amdgpu_ras_recovery_init(adev, init_badpage);
3277 if (r)
3278 goto init_failed;
3279
3280 /**
3281 * In case of XGMI grab extra reference for reset domain for this device
3282 */
3283 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3284 if (amdgpu_xgmi_add_device(adev) == 0) {
3285 if (!amdgpu_sriov_vf(adev)) {
3286 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3287
3288 if (WARN_ON(!hive)) {
3289 r = -ENOENT;
3290 goto init_failed;
3291 }
3292
3293 if (!hive->reset_domain ||
3294 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3295 r = -ENOENT;
3296 amdgpu_put_xgmi_hive(hive);
3297 goto init_failed;
3298 }
3299
3300 /* Drop the early temporary reset domain we created for device */
3301 amdgpu_reset_put_reset_domain(adev->reset_domain);
3302 adev->reset_domain = hive->reset_domain;
3303 amdgpu_put_xgmi_hive(hive);
3304 }
3305 }
3306 }
3307
3308 r = amdgpu_device_init_schedulers(adev);
3309 if (r)
3310 goto init_failed;
3311
3312 if (adev->mman.buffer_funcs_ring->sched.ready)
3313 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3314
3315 /* Don't init kfd if whole hive need to be reset during init */
3316 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3317 kgd2kfd_init_zone_device(adev);
3318 amdgpu_amdkfd_device_init(adev);
3319 }
3320
3321 amdgpu_fru_get_product_info(adev);
3322
3323 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev))
3324 r = amdgpu_cper_init(adev);
3325
3326 init_failed:
3327
3328 return r;
3329 }
3330
3331 /**
3332 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3333 *
3334 * @adev: amdgpu_device pointer
3335 *
3336 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3337 * this function before a GPU reset. If the value is retained after a
3338 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3339 */
3340 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3341 {
3342 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3343 }
3344
3345 /**
3346 * amdgpu_device_check_vram_lost - check if vram is valid
3347 *
3348 * @adev: amdgpu_device pointer
3349 *
3350 * Checks the reset magic value written to the gart pointer in VRAM.
3351 * The driver calls this after a GPU reset to see if the contents of
3352  * VRAM are lost or not.
3353 * returns true if vram is lost, false if not.
3354 */
3355 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3356 {
3357 if (memcmp(adev->gart.ptr, adev->reset_magic,
3358 AMDGPU_RESET_MAGIC_NUM))
3359 return true;
3360
3361 if (!amdgpu_in_reset(adev))
3362 return false;
3363
3364 /*
3365 * For all ASICs with baco/mode1 reset, the VRAM is
3366 * always assumed to be lost.
3367 */
3368 switch (amdgpu_asic_reset_method(adev)) {
3369 case AMD_RESET_METHOD_LEGACY:
3370 case AMD_RESET_METHOD_LINK:
3371 case AMD_RESET_METHOD_BACO:
3372 case AMD_RESET_METHOD_MODE1:
3373 return true;
3374 default:
3375 return false;
3376 }
3377 }
3378
3379 /**
3380 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3381 *
3382 * @adev: amdgpu_device pointer
3383 * @state: clockgating state (gate or ungate)
3384 *
3385 * The list of all the hardware IPs that make up the asic is walked and the
3386 * set_clockgating_state callbacks are run.
3387  * During late init, this pass enables clockgating for hardware IPs.
3388  * During fini or suspend, this pass disables clockgating for hardware IPs.
3389 * Returns 0 on success, negative error code on failure.
3390 */
3391
3392 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3393 enum amd_clockgating_state state)
3394 {
3395 int i, j, r;
3396
3397 if (amdgpu_emu_mode == 1)
3398 return 0;
3399
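	/* gate in normal IP order, ungate in reverse IP order */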
3400 for (j = 0; j < adev->num_ip_blocks; j++) {
3401 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3402 if (!adev->ip_blocks[i].status.late_initialized)
3403 continue;
3404 /* skip CG for GFX, SDMA on S0ix */
3405 if (adev->in_s0ix &&
3406 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3407 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3408 continue;
3409 /* skip CG for VCE/UVD, it's handled specially */
3410 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3411 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3412 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3413 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3414 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3415 /* enable clockgating to save power */
3416 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
3417 state);
3418 if (r) {
3419 dev_err(adev->dev,
3420 "set_clockgating_state(gate) of IP block <%s> failed %d\n",
3421 adev->ip_blocks[i].version->funcs->name,
3422 r);
3423 return r;
3424 }
3425 }
3426 }
3427
3428 return 0;
3429 }
3430
3431 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3432 enum amd_powergating_state state)
3433 {
3434 int i, j, r;
3435
3436 if (amdgpu_emu_mode == 1)
3437 return 0;
3438
3439 for (j = 0; j < adev->num_ip_blocks; j++) {
3440 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3441 if (!adev->ip_blocks[i].status.late_initialized)
3442 continue;
3443 /* skip PG for GFX, SDMA on S0ix */
3444 if (adev->in_s0ix &&
3445 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3446 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3447 continue;
3448 		/* skip PG for VCE/UVD, it's handled specially */
3449 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3450 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3451 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3452 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3453 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3454 /* enable powergating to save power */
3455 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3456 state);
3457 if (r) {
3458 dev_err(adev->dev,
3459 "set_powergating_state(gate) of IP block <%s> failed %d\n",
3460 adev->ip_blocks[i].version->funcs->name,
3461 r);
3462 return r;
3463 }
3464 }
3465 }
3466 return 0;
3467 }
3468
3469 static int amdgpu_device_enable_mgpu_fan_boost(void)
3470 {
3471 struct amdgpu_gpu_instance *gpu_ins;
3472 struct amdgpu_device *adev;
3473 int i, ret = 0;
3474
3475 mutex_lock(&mgpu_info.mutex);
3476
3477 /*
3478 * MGPU fan boost feature should be enabled
3479 * only when there are two or more dGPUs in
3480 * the system
3481 */
3482 if (mgpu_info.num_dgpu < 2)
3483 goto out;
3484
3485 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3486 gpu_ins = &(mgpu_info.gpu_ins[i]);
3487 adev = gpu_ins->adev;
3488 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
3489 !gpu_ins->mgpu_fan_enabled) {
3490 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3491 if (ret)
3492 break;
3493
3494 gpu_ins->mgpu_fan_enabled = 1;
3495 }
3496 }
3497
3498 out:
3499 mutex_unlock(&mgpu_info.mutex);
3500
3501 return ret;
3502 }
3503
3504 /**
3505 * amdgpu_device_ip_late_init - run late init for hardware IPs
3506 *
3507 * @adev: amdgpu_device pointer
3508 *
3509 * Late initialization pass for hardware IPs. The list of all the hardware
3510 * IPs that make up the asic is walked and the late_init callbacks are run.
3511 * late_init covers any special initialization that an IP requires
3512  * after all of the IPs have been initialized or something that needs to happen
3513 * late in the init process.
3514 * Returns 0 on success, negative error code on failure.
3515 */
3516 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3517 {
3518 struct amdgpu_gpu_instance *gpu_instance;
3519 int i = 0, r;
3520
3521 for (i = 0; i < adev->num_ip_blocks; i++) {
3522 if (!adev->ip_blocks[i].status.hw)
3523 continue;
3524 if (adev->ip_blocks[i].version->funcs->late_init) {
3525 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3526 if (r) {
3527 dev_err(adev->dev,
3528 "late_init of IP block <%s> failed %d\n",
3529 adev->ip_blocks[i].version->funcs->name,
3530 r);
3531 return r;
3532 }
3533 }
3534 adev->ip_blocks[i].status.late_initialized = true;
3535 }
3536
3537 r = amdgpu_ras_late_init(adev);
3538 if (r) {
3539 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
3540 return r;
3541 }
3542
3543 if (!amdgpu_reset_in_recovery(adev))
3544 amdgpu_ras_set_error_query_ready(adev, true);
3545
3546 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3547 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3548
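/* cache the reset magic so VRAM loss can be detected after a reset */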
3549 amdgpu_device_fill_reset_magic(adev);
3550
3551 r = amdgpu_device_enable_mgpu_fan_boost();
3552 if (r)
3553 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);
3554
3555 /* For passthrough configurations on arcturus and aldebaran, enable special handling for SBR */
3556 if (amdgpu_passthrough(adev) &&
3557 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3558 adev->asic_type == CHIP_ALDEBARAN))
3559 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3560
3561 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3562 mutex_lock(&mgpu_info.mutex);
3563
3564 /*
3565 * Reset device p-state to low as this was booted with high.
3566 *
3567 * This should be performed only after all devices from the same
3568 * hive get initialized.
3569 *
3570 * However, the number of devices in a hive is not known in advance;
3571 * it is counted one by one as the devices are initialized.
3572 *
3573 * So, we wait until all XGMI interlinked devices have initialized.
3574 * This may add some delay as those devices may come from
3575 * different hives. But that should be OK.
3576 */
3577 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3578 for (i = 0; i < mgpu_info.num_gpu; i++) {
3579 gpu_instance = &(mgpu_info.gpu_ins[i]);
3580 if (gpu_instance->adev->flags & AMD_IS_APU)
3581 continue;
3582
3583 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3584 AMDGPU_XGMI_PSTATE_MIN);
3585 if (r) {
3586 dev_err(adev->dev,
3587 "pstate setting failed (%d).\n",
3588 r);
3589 break;
3590 }
3591 }
3592 }
3593
3594 mutex_unlock(&mgpu_info.mutex);
3595 }
3596
3597 return 0;
3598 }
3599
3600 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3601 {
3602 struct amdgpu_device *adev = ip_block->adev;
3603 int r;
3604
3605 if (!ip_block->version->funcs->hw_fini) {
3606 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n",
3607 ip_block->version->funcs->name);
3608 } else {
3609 r = ip_block->version->funcs->hw_fini(ip_block);
3610 /* XXX handle errors */
3611 if (r) {
3612 dev_dbg(adev->dev,
3613 "hw_fini of IP block <%s> failed %d\n",
3614 ip_block->version->funcs->name, r);
3615 }
3616 }
3617
3618 ip_block->status.hw = false;
3619 }
3620
3621 /**
3622 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3623 *
3624 * @adev: amdgpu_device pointer
3625 *
3626 * For ASICs that need to disable the SMC first
3627 */
3628 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3629 {
3630 int i;
3631
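/* only older ASICs (GC 9.0.0 and earlier) need the SMC torn down ahead of the other IPs */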
3632 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3633 return;
3634
3635 for (i = 0; i < adev->num_ip_blocks; i++) {
3636 if (!adev->ip_blocks[i].status.hw)
3637 continue;
3638 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3639 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3640 break;
3641 }
3642 }
3643 }
3644
3645 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3646 {
3647 int i, r;
3648
3649 for (i = 0; i < adev->num_ip_blocks; i++) {
3650 if (!adev->ip_blocks[i].version->funcs->early_fini)
3651 continue;
3652
3653 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3654 if (r) {
3655 dev_dbg(adev->dev,
3656 "early_fini of IP block <%s> failed %d\n",
3657 adev->ip_blocks[i].version->funcs->name, r);
3658 }
3659 }
3660
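/* ungate power and clock gating before tearing down the hardware */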
3661 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3662 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3663
3664 amdgpu_amdkfd_suspend(adev, true);
3665 amdgpu_userq_suspend(adev);
3666
3667 /* Workaround for ASICs need to disable SMC first */
3668 amdgpu_device_smu_fini_early(adev);
3669
3670 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3671 if (!adev->ip_blocks[i].status.hw)
3672 continue;
3673
3674 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3675 }
3676
3677 if (amdgpu_sriov_vf(adev)) {
3678 if (amdgpu_virt_release_full_gpu(adev, false))
3679 dev_err(adev->dev,
3680 "failed to release exclusive mode on fini\n");
3681 }
3682
3683 /*
3684 * Driver reload on the APU can fail due to firmware validation because
3685 * the PSP is always running, as it is shared across the whole SoC.
3686 * This same issue does not occur on dGPU because it has a mechanism
3687 * that checks whether the PSP is running. A solution for those issues
3688 * in the APU is to trigger a GPU reset, but this should be done during
3689 * the unload phase to avoid adding boot latency and screen flicker.
3690 */
3691 if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) {
3692 r = amdgpu_asic_reset(adev);
3693 if (r)
3694 dev_err(adev->dev, "asic reset on %s failed\n", __func__);
3695 }
3696
3697 return 0;
3698 }
3699
3700 /**
3701 * amdgpu_device_ip_fini - run fini for hardware IPs
3702 *
3703 * @adev: amdgpu_device pointer
3704 *
3705 * Main teardown pass for hardware IPs. The list of all the hardware
3706 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3707 * are run. hw_fini tears down the hardware associated with each IP
3708 * and sw_fini tears down any software state associated with each IP.
3709 * Returns 0 on success, negative error code on failure.
3710 */
3711 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3712 {
3713 int i, r;
3714
3715 amdgpu_cper_fini(adev);
3716
3717 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3718 amdgpu_virt_release_ras_err_handler_data(adev);
3719
3720 if (adev->gmc.xgmi.num_physical_nodes > 1)
3721 amdgpu_xgmi_remove_device(adev);
3722
3723 amdgpu_amdkfd_device_fini_sw(adev);
3724
3725 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3726 if (!adev->ip_blocks[i].status.sw)
3727 continue;
3728
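/* release the shared helper buffers right before the GMC block is torn down */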
3729 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3730 amdgpu_ucode_free_bo(adev);
3731 amdgpu_free_static_csa(&adev->virt.csa_obj);
3732 amdgpu_device_wb_fini(adev);
3733 amdgpu_device_mem_scratch_fini(adev);
3734 amdgpu_ib_pool_fini(adev);
3735 amdgpu_seq64_fini(adev);
3736 amdgpu_doorbell_fini(adev);
3737 }
3738 if (adev->ip_blocks[i].version->funcs->sw_fini) {
3739 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3740 /* XXX handle errors */
3741 if (r) {
3742 dev_dbg(adev->dev,
3743 "sw_fini of IP block <%s> failed %d\n",
3744 adev->ip_blocks[i].version->funcs->name,
3745 r);
3746 }
3747 }
3748 adev->ip_blocks[i].status.sw = false;
3749 adev->ip_blocks[i].status.valid = false;
3750 }
3751
3752 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3753 if (!adev->ip_blocks[i].status.late_initialized)
3754 continue;
3755 if (adev->ip_blocks[i].version->funcs->late_fini)
3756 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3757 adev->ip_blocks[i].status.late_initialized = false;
3758 }
3759
3760 amdgpu_ras_fini(adev);
3761 amdgpu_uid_fini(adev);
3762
3763 return 0;
3764 }
3765
3766 /**
3767 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3768 *
3769 * @work: work_struct.
3770 */
3771 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3772 {
3773 struct amdgpu_device *adev =
3774 container_of(work, struct amdgpu_device, delayed_init_work.work);
3775 int r;
3776
3777 r = amdgpu_ib_ring_tests(adev);
3778 if (r)
3779 dev_err(adev->dev, "ib ring test failed (%d).\n", r);
3780 }
3781
3782 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3783 {
3784 struct amdgpu_device *adev =
3785 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3786
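/* GFXOFF must not already be enabled and no requests to keep it disabled may be pending */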
3787 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3788 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3789
3790 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
3791 adev->gfx.gfx_off_state = true;
3792 }
3793
3794 /**
3795 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3796 *
3797 * @adev: amdgpu_device pointer
3798 *
3799 * Main suspend function for hardware IPs. The list of all the hardware
3800 * IPs that make up the asic is walked, clockgating is disabled and the
3801 * suspend callbacks are run. suspend puts the hardware and software state
3802 * in each IP into a state suitable for suspend.
3803 * Returns 0 on success, negative error code on failure.
3804 */
3805 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3806 {
3807 int i, r, rec;
3808
3809 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3810 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3811
3812 /*
3813 * Per the PMFW team's suggestion, the driver needs to handle disabling
3814 * the gfxoff and df cstate features for gpu reset (e.g. Mode1Reset)
3815 * scenarios. Add the missing df cstate disablement here.
3816 */
3817 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3818 dev_warn(adev->dev, "Failed to disallow df cstate");
3819
3820 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3821 if (!adev->ip_blocks[i].status.valid)
3822 continue;
3823
3824 /* displays are handled separately */
3825 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3826 continue;
3827
3828 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3829 if (r)
3830 goto unwind;
3831 }
3832
3833 return 0;
3834 unwind:
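/* suspend phase 1 = resume phase 3 (displays) */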
3835 rec = amdgpu_device_ip_resume_phase3(adev);
3836 if (rec)
3837 dev_err(adev->dev,
3838 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n",
3839 rec);
3840
3841 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW);
3842
3843 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3844 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3845
3846 return r;
3847 }
3848
3849 /**
3850 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3851 *
3852 * @adev: amdgpu_device pointer
3853 *
3854 * Main suspend function for hardware IPs. The list of all the hardware
3855 * IPs that make up the asic is walked, clockgating is disabled and the
3856 * suspend callbacks are run. suspend puts the hardware and software state
3857 * in each IP into a state suitable for suspend.
3858 * Returns 0 on success, negative error code on failure.
3859 */
3860 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3861 {
3862 int i, r, rec;
3863
3864 if (adev->in_s0ix)
3865 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3866
3867 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3868 if (!adev->ip_blocks[i].status.valid)
3869 continue;
3870 /* displays are handled in phase1 */
3871 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3872 continue;
3873 /* PSP lost connection when err_event_athub occurs */
3874 if (amdgpu_ras_intr_triggered() &&
3875 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3876 adev->ip_blocks[i].status.hw = false;
3877 continue;
3878 }
3879
3880 /* skip unnecessary suspend if we have not initialized them yet */
3881 if (!amdgpu_ip_member_of_hwini(
3882 adev, adev->ip_blocks[i].version->type))
3883 continue;
3884
3885 /* Since we skip suspend for S0i3, we need to cancel the delayed
3886 * idle work here as the suspend callback never gets called.
3887 */
3888 if (adev->in_s0ix &&
3889 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX &&
3890 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0))
3891 cancel_delayed_work_sync(&adev->gfx.idle_work);
3892 /* skip suspend of gfx/mes and psp for S0ix:
3893 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3894 * like at runtime. PSP is also part of the always-on hardware
3895 * so there is no need to suspend it.
3896 */
3897 if (adev->in_s0ix &&
3898 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3899 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3900 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3901 continue;
3902
3903 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3904 if (adev->in_s0ix &&
3905 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3906 IP_VERSION(5, 0, 0)) &&
3907 (adev->ip_blocks[i].version->type ==
3908 AMD_IP_BLOCK_TYPE_SDMA))
3909 continue;
3910
3911 /* During cold boot, swPSP provides the IMU and RLC FW binaries to the TOS.
3912 * These reside in the TMR and are expected to be reused by the PSP-TOS to
3913 * reload from this location; RLC autoload is also performed from here
3914 * based on the PMFW -> PSP message during the re-init sequence.
3915 * Therefore, PSP suspend & resume should be skipped to avoid destroying
3916 * the TMR and reloading the FWs again on IMU-enabled APU ASICs.
3917 */
3918 if (amdgpu_in_reset(adev) &&
3919 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3920 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3921 continue;
3922
3923 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3924 if (r)
3925 goto unwind;
3926
3927 /* handle putting the SMC in the appropriate state */
3928 if (!amdgpu_sriov_vf(adev)) {
3929 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3930 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3931 if (r) {
3932 dev_err(adev->dev,
3933 "SMC failed to set mp1 state %d, %d\n",
3934 adev->mp1_state, r);
3935 goto unwind;
3936 }
3937 }
3938 }
3939 }
3940
3941 return 0;
3942 unwind:
3943 /* suspend phase 2 = resume phase 1 + resume phase 2 */
3944 rec = amdgpu_device_ip_resume_phase1(adev);
3945 if (rec) {
3946 dev_err(adev->dev,
3947 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n",
3948 rec);
3949 return r;
3950 }
3951
3952 rec = amdgpu_device_fw_loading(adev);
3953 if (rec) {
3954 dev_err(adev->dev,
3955 "amdgpu_device_fw_loading failed during unwind: %d\n",
3956 rec);
3957 return r;
3958 }
3959
3960 rec = amdgpu_device_ip_resume_phase2(adev);
3961 if (rec) {
3962 dev_err(adev->dev,
3963 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n",
3964 rec);
3965 return r;
3966 }
3967
3968 return r;
3969 }
3970
3971 /**
3972 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3973 *
3974 * @adev: amdgpu_device pointer
3975 *
3976 * Main suspend function for hardware IPs. The list of all the hardware
3977 * IPs that make up the asic is walked, clockgating is disabled and the
3978 * suspend callbacks are run. suspend puts the hardware and software state
3979 * in each IP into a state suitable for suspend.
3980 * Returns 0 on success, negative error code on failure.
3981 */
3982 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3983 {
3984 int r;
3985
3986 if (amdgpu_sriov_vf(adev)) {
3987 amdgpu_virt_fini_data_exchange(adev);
3988 amdgpu_virt_request_full_gpu(adev, false);
3989 }
3990
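/* stop SDMA-based buffer moves before suspending the IP blocks */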
3991 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3992
3993 r = amdgpu_device_ip_suspend_phase1(adev);
3994 if (r)
3995 return r;
3996 r = amdgpu_device_ip_suspend_phase2(adev);
3997
3998 if (amdgpu_sriov_vf(adev))
3999 amdgpu_virt_release_full_gpu(adev, false);
4000
4001 return r;
4002 }
4003
4004 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
4005 {
4006 int i, r;
4007
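/* only these basic blocks are brought back up during early SR-IOV re-init */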
4008 static enum amd_ip_block_type ip_order[] = {
4009 AMD_IP_BLOCK_TYPE_COMMON,
4010 AMD_IP_BLOCK_TYPE_GMC,
4011 AMD_IP_BLOCK_TYPE_PSP,
4012 AMD_IP_BLOCK_TYPE_IH,
4013 };
4014
4015 for (i = 0; i < adev->num_ip_blocks; i++) {
4016 int j;
4017 struct amdgpu_ip_block *block;
4018
4019 block = &adev->ip_blocks[i];
4020 block->status.hw = false;
4021
4022 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
4023
4024 if (block->version->type != ip_order[j] ||
4025 !block->status.valid)
4026 continue;
4027
4028 r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
4029 if (r) {
4030 dev_err(adev->dev, "RE-INIT-early: %s failed\n",
4031 block->version->funcs->name);
4032 return r;
4033 }
4034 block->status.hw = true;
4035 }
4036 }
4037
4038 return 0;
4039 }
4040
4041 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
4042 {
4043 struct amdgpu_ip_block *block;
4044 int i, r = 0;
4045
4046 static enum amd_ip_block_type ip_order[] = {
4047 AMD_IP_BLOCK_TYPE_SMC,
4048 AMD_IP_BLOCK_TYPE_DCE,
4049 AMD_IP_BLOCK_TYPE_GFX,
4050 AMD_IP_BLOCK_TYPE_SDMA,
4051 AMD_IP_BLOCK_TYPE_MES,
4052 AMD_IP_BLOCK_TYPE_UVD,
4053 AMD_IP_BLOCK_TYPE_VCE,
4054 AMD_IP_BLOCK_TYPE_VCN,
4055 AMD_IP_BLOCK_TYPE_JPEG
4056 };
4057
4058 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
4059 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
4060
4061 if (!block)
4062 continue;
4063
4064 if (block->status.valid && !block->status.hw) {
4065 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
4066 r = amdgpu_ip_block_resume(block);
4067 } else {
4068 r = block->version->funcs->hw_init(block);
4069 }
4070
4071 if (r) {
4072 dev_err(adev->dev, "RE-INIT-late: %s failed\n",
4073 block->version->funcs->name);
4074 break;
4075 }
4076 block->status.hw = true;
4077 }
4078 }
4079
4080 return r;
4081 }
4082
4083 /**
4084 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
4085 *
4086 * @adev: amdgpu_device pointer
4087 *
4088 * First resume function for hardware IPs. The list of all the hardware
4089 * IPs that make up the asic is walked and the resume callbacks are run for
4090 * COMMON, GMC, and IH. resume puts the hardware into a functional state
4091 * after a suspend and updates the software state as necessary. This
4092 * function is also used for restoring the GPU after a GPU reset.
4093 * Returns 0 on success, negative error code on failure.
4094 */
4095 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
4096 {
4097 int i, r;
4098
4099 for (i = 0; i < adev->num_ip_blocks; i++) {
4100 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
4101 continue;
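/* COMMON, GMC and IH are resumed first; under SR-IOV, PSP comes up in this phase as well */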
4102 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
4103 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
4104 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
4105 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
4106
4107 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
4108 if (r)
4109 return r;
4110 }
4111 }
4112
4113 return 0;
4114 }
4115
4116 /**
4117 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
4118 *
4119 * @adev: amdgpu_device pointer
4120 *
4121 * Second resume function for hardware IPs. The list of all the hardware
4122 * IPs that make up the asic is walked and the resume callbacks are run for
4123 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
4124 * functional state after a suspend and updates the software state as
4125 * necessary. This function is also used for restoring the GPU after a GPU
4126 * reset.
4127 * Returns 0 on success, negative error code on failure.
4128 */
4129 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
4130 {
4131 int i, r;
4132
4133 for (i = 0; i < adev->num_ip_blocks; i++) {
4134 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
4135 continue;
4136 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
4137 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
4138 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
4139 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
4140 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
4141 continue;
4142 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
4143 if (r)
4144 return r;
4145 }
4146
4147 return 0;
4148 }
4149
4150 /**
4151 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
4152 *
4153 * @adev: amdgpu_device pointer
4154 *
4155 * Third resume function for hardware IPs. The list of all the hardware
4156 * IPs that make up the asic is walked and the resume callbacks are run for
4157 * all DCE. resume puts the hardware into a functional state after a suspend
4158 * and updates the software state as necessary. This function is also used
4159 * for restoring the GPU after a GPU reset.
4160 *
4161 * Returns 0 on success, negative error code on failure.
4162 */
4163 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
4164 {
4165 int i, r;
4166
4167 for (i = 0; i < adev->num_ip_blocks; i++) {
4168 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
4169 continue;
4170 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
4171 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
4172 if (r)
4173 return r;
4174 }
4175 }
4176
4177 return 0;
4178 }
4179
4180 /**
4181 * amdgpu_device_ip_resume - run resume for hardware IPs
4182 *
4183 * @adev: amdgpu_device pointer
4184 *
4185 * Main resume function for hardware IPs. The hardware IPs
4186 * are split into multiple resume functions because they are
4187 * also used in recovering from a GPU reset and some additional
4188 * steps need to be taken between them. In this case (S3/S4) they are
4189 * run sequentially.
4190 * Returns 0 on success, negative error code on failure.
4191 */
4192 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
4193 {
4194 int r;
4195
4196 r = amdgpu_device_ip_resume_phase1(adev);
4197 if (r)
4198 return r;
4199
4200 r = amdgpu_device_fw_loading(adev);
4201 if (r)
4202 return r;
4203
4204 r = amdgpu_device_ip_resume_phase2(adev);
4205
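/* re-enable SDMA buffer moves once the buffer funcs ring is ready again */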
4206 if (adev->mman.buffer_funcs_ring->sched.ready)
4207 amdgpu_ttm_set_buffer_funcs_status(adev, true);
4208
4209 if (r)
4210 return r;
4211
4212 amdgpu_fence_driver_hw_init(adev);
4213
4214 r = amdgpu_device_ip_resume_phase3(adev);
4215
4216 return r;
4217 }
4218
4219 /**
4220 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4221 *
4222 * @adev: amdgpu_device pointer
4223 *
4224 * Query the VBIOS data tables to determine if the board supports SR-IOV.
4225 */
4226 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4227 {
4228 if (amdgpu_sriov_vf(adev)) {
4229 if (adev->is_atom_fw) {
4230 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4231 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4232 } else {
4233 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4234 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4235 }
4236
4237 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4238 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4239 }
4240 }
4241
4242 /**
4243 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4244 *
4245 * @pdev : pci device context
4246 * @asic_type: AMD asic type
4247 *
4248 * Check if there is DC (new modesetting infrastructure) support for an asic.
4249 * Returns true if DC is supported, false if not.
4250 */
4251 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
4252 enum amd_asic_type asic_type)
4253 {
4254 switch (asic_type) {
4255 #ifdef CONFIG_DRM_AMDGPU_SI
4256 case CHIP_HAINAN:
4257 #endif
4258 case CHIP_TOPAZ:
4259 /* chips with no display hardware */
4260 return false;
4261 #if defined(CONFIG_DRM_AMD_DC)
4262 case CHIP_TAHITI:
4263 case CHIP_PITCAIRN:
4264 case CHIP_VERDE:
4265 case CHIP_OLAND:
4266 return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI);
4267 case CHIP_KAVERI:
4268 case CHIP_KABINI:
4269 case CHIP_MULLINS:
4270 /*
4271 * We have systems in the wild with these ASICs that require
4272 * TRAVIS and NUTMEG support which is not supported with DC.
4273 *
4274 * Fallback to the non-DC driver here by default so as not to
4275 * cause regressions.
4276 */
4277 return amdgpu_dc > 0;
4278 default:
4279 return amdgpu_dc != 0;
4280 #else
4281 default:
4282 if (amdgpu_dc > 0)
4283 dev_info_once(
4284 &pdev->dev,
4285 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
4286 return false;
4287 #endif
4288 }
4289 }
4290
4291 /**
4292 * amdgpu_device_has_dc_support - check if dc is supported
4293 *
4294 * @adev: amdgpu_device pointer
4295 *
4296 * Returns true for supported, false for not supported
4297 */
4298 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
4299 {
4300 if (adev->enable_virtual_display ||
4301 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
4302 return false;
4303
4304 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type);
4305 }
4306
4307 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
4308 {
4309 struct amdgpu_device *adev =
4310 container_of(__work, struct amdgpu_device, xgmi_reset_work);
4311 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
4312
4313 /* It's a bug to not have a hive within this function */
4314 if (WARN_ON(!hive))
4315 return;
4316
4317 /*
4318 * Use task barrier to synchronize all xgmi reset works across the
4319 * hive. task_barrier_enter and task_barrier_exit will block
4320 * until all the threads running the xgmi reset works reach
4321 * those points. task_barrier_full will do both blocks.
4322 */
4323 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
4324
4325 task_barrier_enter(&hive->tb);
4326 adev->asic_reset_res = amdgpu_device_baco_enter(adev);
4327
4328 if (adev->asic_reset_res)
4329 goto fail;
4330
4331 task_barrier_exit(&hive->tb);
4332 adev->asic_reset_res = amdgpu_device_baco_exit(adev);
4333
4334 if (adev->asic_reset_res)
4335 goto fail;
4336
4337 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
4338 } else {
4339
4340 task_barrier_full(&hive->tb);
4341 adev->asic_reset_res = amdgpu_asic_reset(adev);
4342 }
4343
4344 fail:
4345 if (adev->asic_reset_res)
4346 dev_warn(adev->dev,
4347 "ASIC reset failed with error, %d for drm dev, %s",
4348 adev->asic_reset_res, adev_to_drm(adev)->unique);
4349 amdgpu_put_xgmi_hive(hive);
4350 }
4351
4352 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4353 {
4354 char *input = amdgpu_lockup_timeout;
4355 char *timeout_setting = NULL;
4356 int index = 0;
4357 long timeout;
4358 int ret = 0;
4359
4360 /* By default timeout for all queues is 2 sec */
4361 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
4362 adev->video_timeout = msecs_to_jiffies(2000);
4363
4364 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
4365 return 0;
4366
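/* timeout values apply, in order, to the gfx, compute, sdma and video queues */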
4367 while ((timeout_setting = strsep(&input, ",")) &&
4368 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4369 ret = kstrtol(timeout_setting, 0, &timeout);
4370 if (ret)
4371 return ret;
4372
4373 if (timeout == 0) {
4374 index++;
4375 continue;
4376 } else if (timeout < 0) {
4377 timeout = MAX_SCHEDULE_TIMEOUT;
4378 dev_warn(adev->dev, "lockup timeout disabled");
4379 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4380 } else {
4381 timeout = msecs_to_jiffies(timeout);
4382 }
4383
4384 switch (index++) {
4385 case 0:
4386 adev->gfx_timeout = timeout;
4387 break;
4388 case 1:
4389 adev->compute_timeout = timeout;
4390 break;
4391 case 2:
4392 adev->sdma_timeout = timeout;
4393 break;
4394 case 3:
4395 adev->video_timeout = timeout;
4396 break;
4397 default:
4398 break;
4399 }
4400 }
4401
4402 /* When only one value is specified, apply it to all queues. */
4403 if (index == 1)
4404 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
4405 adev->video_timeout = timeout;
4406
4407 return ret;
4408 }
4409
4410 /**
4411 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4412 *
4413 * @adev: amdgpu_device pointer
4414 *
4415 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
4416 */
4417 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4418 {
4419 struct iommu_domain *domain;
4420
4421 domain = iommu_get_domain_for_dev(adev->dev);
4422 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4423 adev->ram_is_direct_mapped = true;
4424 }
4425
4426 #if defined(CONFIG_HSA_AMD_P2P)
4427 /**
4428 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4429 *
4430 * @adev: amdgpu_device pointer
4431 *
4432 * Returns true if the IOMMU is remapping BAR addresses (DMA remapping enabled)
4433 */
4434 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4435 {
4436 struct iommu_domain *domain;
4437
4438 domain = iommu_get_domain_for_dev(adev->dev);
4439 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4440 domain->type == IOMMU_DOMAIN_DMA_FQ))
4441 return true;
4442
4443 return false;
4444 }
4445 #endif
4446
4447 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4448 {
4449 if (amdgpu_mcbp == 1)
4450 adev->gfx.mcbp = true;
4451 else if (amdgpu_mcbp == 0)
4452 adev->gfx.mcbp = false;
4453
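/* MCBP is always enabled for SR-IOV VFs */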
4454 if (amdgpu_sriov_vf(adev))
4455 adev->gfx.mcbp = true;
4456
4457 if (adev->gfx.mcbp)
4458 dev_info(adev->dev, "MCBP is enabled\n");
4459 }
4460
4461 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev)
4462 {
4463 int r;
4464
4465 r = amdgpu_atombios_sysfs_init(adev);
4466 if (r)
4467 drm_err(&adev->ddev,
4468 "registering atombios sysfs failed (%d).\n", r);
4469
4470 r = amdgpu_pm_sysfs_init(adev);
4471 if (r)
4472 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);
4473
4474 r = amdgpu_ucode_sysfs_init(adev);
4475 if (r) {
4476 adev->ucode_sysfs_en = false;
4477 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
4478 } else
4479 adev->ucode_sysfs_en = true;
4480
4481 r = amdgpu_device_attr_sysfs_init(adev);
4482 if (r)
4483 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4484
4485 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4486 if (r)
4487 dev_err(adev->dev,
4488 "Could not create amdgpu board attributes\n");
4489
4490 amdgpu_fru_sysfs_init(adev);
4491 amdgpu_reg_state_sysfs_init(adev);
4492 amdgpu_xcp_sysfs_init(adev);
4493
4494 return r;
4495 }
4496
4497 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
4498 {
4499 if (adev->pm.sysfs_initialized)
4500 amdgpu_pm_sysfs_fini(adev);
4501 if (adev->ucode_sysfs_en)
4502 amdgpu_ucode_sysfs_fini(adev);
4503 amdgpu_device_attr_sysfs_fini(adev);
4504 amdgpu_fru_sysfs_fini(adev);
4505
4506 amdgpu_reg_state_sysfs_fini(adev);
4507 amdgpu_xcp_sysfs_fini(adev);
4508 }
4509
4510 /**
4511 * amdgpu_device_init - initialize the driver
4512 *
4513 * @adev: amdgpu_device pointer
4514 * @flags: driver flags
4515 *
4516 * Initializes the driver info and hw (all asics).
4517 * Returns 0 for success or an error on failure.
4518 * Called at driver startup.
4519 */
4520 int amdgpu_device_init(struct amdgpu_device *adev,
4521 uint32_t flags)
4522 {
4523 struct pci_dev *pdev = adev->pdev;
4524 int r, i;
4525 bool px = false;
4526 u32 max_MBps;
4527 int tmp;
4528
4529 adev->shutdown = false;
4530 adev->flags = flags;
4531
4532 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4533 adev->asic_type = amdgpu_force_asic_type;
4534 else
4535 adev->asic_type = flags & AMD_ASIC_MASK;
4536
4537 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4538 if (amdgpu_emu_mode == 1)
4539 adev->usec_timeout *= 10;
4540 adev->gmc.gart_size = 512 * 1024 * 1024;
4541 adev->accel_working = false;
4542 adev->num_rings = 0;
4543 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4544 adev->mman.buffer_funcs = NULL;
4545 adev->mman.buffer_funcs_ring = NULL;
4546 adev->vm_manager.vm_pte_funcs = NULL;
4547 adev->vm_manager.vm_pte_num_scheds = 0;
4548 adev->gmc.gmc_funcs = NULL;
4549 adev->harvest_ip_mask = 0x0;
4550 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4551 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4552
4553 adev->smc_rreg = &amdgpu_invalid_rreg;
4554 adev->smc_wreg = &amdgpu_invalid_wreg;
4555 adev->pcie_rreg = &amdgpu_invalid_rreg;
4556 adev->pcie_wreg = &amdgpu_invalid_wreg;
4557 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4558 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4559 adev->pciep_rreg = &amdgpu_invalid_rreg;
4560 adev->pciep_wreg = &amdgpu_invalid_wreg;
4561 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4562 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4563 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4564 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4565 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4566 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4567 adev->didt_rreg = &amdgpu_invalid_rreg;
4568 adev->didt_wreg = &amdgpu_invalid_wreg;
4569 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4570 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4571 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4572 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4573
4574 dev_info(
4575 adev->dev,
4576 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4577 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4578 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4579
4580 /* mutex initialization are all done here so we
4581 * can recall function without having locking issues
4582 */
4583 mutex_init(&adev->firmware.mutex);
4584 mutex_init(&adev->pm.mutex);
4585 mutex_init(&adev->gfx.gpu_clock_mutex);
4586 mutex_init(&adev->srbm_mutex);
4587 mutex_init(&adev->gfx.pipe_reserve_mutex);
4588 mutex_init(&adev->gfx.gfx_off_mutex);
4589 mutex_init(&adev->gfx.partition_mutex);
4590 mutex_init(&adev->grbm_idx_mutex);
4591 mutex_init(&adev->mn_lock);
4592 mutex_init(&adev->virt.vf_errors.lock);
4593 hash_init(adev->mn_hash);
4594 mutex_init(&adev->psp.mutex);
4595 mutex_init(&adev->notifier_lock);
4596 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4597 mutex_init(&adev->benchmark_mutex);
4598 mutex_init(&adev->gfx.reset_sem_mutex);
4599 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4600 mutex_init(&adev->enforce_isolation_mutex);
4601 for (i = 0; i < MAX_XCP; ++i) {
4602 adev->isolation[i].spearhead = dma_fence_get_stub();
4603 amdgpu_sync_create(&adev->isolation[i].active);
4604 amdgpu_sync_create(&adev->isolation[i].prev);
4605 }
4606 mutex_init(&adev->gfx.userq_sch_mutex);
4607 mutex_init(&adev->gfx.workload_profile_mutex);
4608 mutex_init(&adev->vcn.workload_profile_mutex);
4609
4610 amdgpu_device_init_apu_flags(adev);
4611
4612 r = amdgpu_device_check_arguments(adev);
4613 if (r)
4614 return r;
4615
4616 spin_lock_init(&adev->mmio_idx_lock);
4617 spin_lock_init(&adev->smc_idx_lock);
4618 spin_lock_init(&adev->pcie_idx_lock);
4619 spin_lock_init(&adev->uvd_ctx_idx_lock);
4620 spin_lock_init(&adev->didt_idx_lock);
4621 spin_lock_init(&adev->gc_cac_idx_lock);
4622 spin_lock_init(&adev->se_cac_idx_lock);
4623 spin_lock_init(&adev->audio_endpt_idx_lock);
4624 spin_lock_init(&adev->mm_stats.lock);
4625 spin_lock_init(&adev->virt.rlcg_reg_lock);
4626 spin_lock_init(&adev->wb.lock);
4627
4628 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
4629
4630 INIT_LIST_HEAD(&adev->reset_list);
4631
4632 INIT_LIST_HEAD(&adev->ras_list);
4633
4634 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4635
4636 xa_init(&adev->userq_doorbell_xa);
4637
4638 INIT_DELAYED_WORK(&adev->delayed_init_work,
4639 amdgpu_device_delayed_init_work_handler);
4640 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4641 amdgpu_device_delay_enable_gfx_off);
4642 /*
4643 * Initialize the enforce_isolation work structures for each XCP
4644 * partition. This work handler is responsible for enforcing shader
4645 * isolation on AMD GPUs. It counts the number of emitted fences for
4646 * each GFX and compute ring. If there are any fences, it schedules
4647 * the `enforce_isolation_work` to be run after a delay. If there are
4648 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4649 * runqueue.
4650 */
4651 for (i = 0; i < MAX_XCP; i++) {
4652 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4653 amdgpu_gfx_enforce_isolation_handler);
4654 adev->gfx.enforce_isolation[i].adev = adev;
4655 adev->gfx.enforce_isolation[i].xcp_id = i;
4656 }
4657
4658 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4659 INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
4660
4661 adev->gfx.gfx_off_req_count = 1;
4662 adev->gfx.gfx_off_residency = 0;
4663 adev->gfx.gfx_off_entrycount = 0;
4664 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4665
4666 atomic_set(&adev->throttling_logging_enabled, 1);
4667 /*
4668 * If throttling continues, logging will be performed every minute
4669 * to avoid log flooding. "-1" is subtracted since the thermal
4670 * throttling interrupt comes every second. Thus, the total logging
4671 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4672 * for throttling interrupt) = 60 seconds.
4673 */
4674 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4675
4676 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4677
4678 /* Registers mapping */
4679 /* TODO: block userspace mapping of io register */
4680 if (adev->asic_type >= CHIP_BONAIRE) {
4681 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4682 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4683 } else {
4684 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4685 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4686 }
4687
4688 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4689 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4690
4691 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4692 if (!adev->rmmio)
4693 return -ENOMEM;
4694
4695 dev_info(adev->dev, "register mmio base: 0x%08X\n",
4696 (uint32_t)adev->rmmio_base);
4697 dev_info(adev->dev, "register mmio size: %u\n",
4698 (unsigned int)adev->rmmio_size);
4699
4700 /*
4701 * The reset domain needs to be present early, before the XGMI hive is
4702 * discovered (if any) and initialized, so that the reset sem and in_gpu
4703 * reset flag can be used early during init and before calling RREG32.
4704 */
4705 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4706 if (!adev->reset_domain)
4707 return -ENOMEM;
4708
4709 /* detect hw virtualization here */
4710 amdgpu_virt_init(adev);
4711
4712 amdgpu_device_get_pcie_info(adev);
4713
4714 r = amdgpu_device_get_job_timeout_settings(adev);
4715 if (r) {
4716 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4717 return r;
4718 }
4719
4720 amdgpu_device_set_mcbp(adev);
4721
4722 /*
4723 * By default, use default mode where all blocks are expected to be
4724 * initialized. At present, a 'swinit' of the blocks is required to be
4725 * completed before the need for a different level can be detected.
4726 */
4727 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4728 /* early init functions */
4729 r = amdgpu_device_ip_early_init(adev);
4730 if (r)
4731 return r;
4732
4733 /*
4734 * No need to remove conflicting FBs for non-display class devices.
4735 * This prevents the sysfb from being freed accidentally.
4736 */
4737 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
4738 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
4739 /* Get rid of things like offb */
4740 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4741 if (r)
4742 return r;
4743 }
4744
4745 /* Enable TMZ based on IP_VERSION */
4746 amdgpu_gmc_tmz_set(adev);
4747
4748 if (amdgpu_sriov_vf(adev) &&
4749 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4750 /* VF MMIO access (except mailbox range) from CPU
4751 * will be blocked during sriov runtime
4752 */
4753 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4754
4755 amdgpu_gmc_noretry_set(adev);
4756 /* Need to get xgmi info early to decide the reset behavior*/
4757 if (adev->gmc.xgmi.supported) {
4758 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4759 if (r)
4760 return r;
4761 }
4762
4763 /* enable PCIE atomic ops */
4764 if (amdgpu_sriov_vf(adev)) {
4765 if (adev->virt.fw_reserve.p_pf2vf)
4766 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4767 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4768 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4769 /* APUs with gfx9 onwards don't rely on PCIe atomics; rather, an
4770 * internal path natively supports atomics, so set have_atomics_support to true.
4771 */
4772 } else if ((adev->flags & AMD_IS_APU) &&
4773 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4774 IP_VERSION(9, 0, 0))) {
4775 adev->have_atomics_support = true;
4776 } else {
4777 adev->have_atomics_support =
4778 !pci_enable_atomic_ops_to_root(adev->pdev,
4779 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4780 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4781 }
4782
4783 if (!adev->have_atomics_support)
4784 dev_info(adev->dev, "PCIE atomic ops are not supported\n");
4785
4786 /* doorbell bar mapping and doorbell index init*/
4787 amdgpu_doorbell_init(adev);
4788
4789 if (amdgpu_emu_mode == 1) {
4790 /* post the asic on emulation mode */
4791 emu_soc_asic_init(adev);
4792 goto fence_driver_init;
4793 }
4794
4795 amdgpu_reset_init(adev);
4796
4797 /* detect if we are with an SRIOV vbios */
4798 if (adev->bios)
4799 amdgpu_device_detect_sriov_bios(adev);
4800
4801 /* check if we need to reset the asic
4802 * E.g., driver was not cleanly unloaded previously, etc.
4803 */
4804 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4805 if (adev->gmc.xgmi.num_physical_nodes) {
4806 dev_info(adev->dev, "Pending hive reset.\n");
4807 amdgpu_set_init_level(adev,
4808 AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4809 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4810 !amdgpu_device_has_display_hardware(adev)) {
4811 r = psp_gpu_reset(adev);
4812 } else {
4813 tmp = amdgpu_reset_method;
4814 /* It should do a default reset when loading or reloading the driver,
4815 * regardless of the module parameter reset_method.
4816 */
4817 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4818 r = amdgpu_asic_reset(adev);
4819 amdgpu_reset_method = tmp;
4820 }
4821
4822 if (r) {
4823 dev_err(adev->dev, "asic reset on init failed\n");
4824 goto failed;
4825 }
4826 }
4827
4828 /* Post card if necessary */
4829 if (amdgpu_device_need_post(adev)) {
4830 if (!adev->bios) {
4831 dev_err(adev->dev, "no vBIOS found\n");
4832 r = -EINVAL;
4833 goto failed;
4834 }
4835 dev_info(adev->dev, "GPU posting now...\n");
4836 r = amdgpu_device_asic_init(adev);
4837 if (r) {
4838 dev_err(adev->dev, "gpu post error!\n");
4839 goto failed;
4840 }
4841 }
4842
4843 if (adev->bios) {
4844 if (adev->is_atom_fw) {
4845 /* Initialize clocks */
4846 r = amdgpu_atomfirmware_get_clock_info(adev);
4847 if (r) {
4848 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4849 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4850 goto failed;
4851 }
4852 } else {
4853 /* Initialize clocks */
4854 r = amdgpu_atombios_get_clock_info(adev);
4855 if (r) {
4856 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4857 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4858 goto failed;
4859 }
4860 /* init i2c buses */
4861 amdgpu_i2c_init(adev);
4862 }
4863 }
4864
4865 fence_driver_init:
4866 /* Fence driver */
4867 r = amdgpu_fence_driver_sw_init(adev);
4868 if (r) {
4869 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4870 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4871 goto failed;
4872 }
4873
4874 /* init the mode config */
4875 drm_mode_config_init(adev_to_drm(adev));
4876
4877 r = amdgpu_device_ip_init(adev);
4878 if (r) {
4879 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4880 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4881 goto release_ras_con;
4882 }
4883
4884 amdgpu_fence_driver_hw_init(adev);
4885
4886 dev_info(adev->dev,
4887 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4888 adev->gfx.config.max_shader_engines,
4889 adev->gfx.config.max_sh_per_se,
4890 adev->gfx.config.max_cu_per_sh,
4891 adev->gfx.cu_info.number);
4892
4893 adev->accel_working = true;
4894
4895 amdgpu_vm_check_compute_bug(adev);
4896
4897 /* Initialize the buffer migration limit. */
4898 if (amdgpu_moverate >= 0)
4899 max_MBps = amdgpu_moverate;
4900 else
4901 max_MBps = 8; /* Allow 8 MB/s. */
4902 /* Get a log2 for easy divisions. */
4903 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4904
4905 /*
4906 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4907 * Otherwise the mgpu fan boost feature will be skipped because the
4908 * gpu instance count will be too low.
4909 */
4910 amdgpu_register_gpu_instance(adev);
4911
4912 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4913 * explicit gating rather than handling it automatically.
4914 */
4915 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4916 r = amdgpu_device_ip_late_init(adev);
4917 if (r) {
4918 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4919 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4920 goto release_ras_con;
4921 }
4922 /* must succeed. */
4923 amdgpu_ras_resume(adev);
4924 queue_delayed_work(system_wq, &adev->delayed_init_work,
4925 msecs_to_jiffies(AMDGPU_RESUME_MS));
4926 }
4927
4928 if (amdgpu_sriov_vf(adev)) {
4929 amdgpu_virt_release_full_gpu(adev, true);
4930 flush_delayed_work(&adev->delayed_init_work);
4931 }
4932
4933 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4934 amdgpu_xgmi_reset_on_init(adev);
4935 /*
4936 * Register the sysfs interfaces after `late_init`, as some of the
4937 * operations performed in `late_init` might affect the creation of
4938 * those sysfs interfaces.
4939 */
4940 r = amdgpu_device_sys_interface_init(adev);
4941
4942 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4943 r = amdgpu_pmu_init(adev);
4944 if (r)
4945 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4946
4947 /* Have stored pci confspace at hand for restore in sudden PCI error */
4948 if (amdgpu_device_cache_pci_state(adev->pdev))
4949 pci_restore_state(pdev);
4950
4951 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4952 /* this will fail for cards that aren't VGA class devices, just
4953 * ignore it
4954 */
4955 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4956 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4957
4958 px = amdgpu_device_supports_px(adev);
4959
4960 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4961 apple_gmux_detect(NULL, NULL)))
4962 vga_switcheroo_register_client(adev->pdev,
4963 &amdgpu_switcheroo_ops, px);
4964
4965 if (px)
4966 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4967
4968 amdgpu_device_check_iommu_direct_map(adev);
4969
4970 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4971 r = register_pm_notifier(&adev->pm_nb);
4972 if (r)
4973 goto failed;
4974
4975 return 0;
4976
4977 release_ras_con:
4978 if (amdgpu_sriov_vf(adev))
4979 amdgpu_virt_release_full_gpu(adev, true);
4980
4981 /* failed in exclusive mode due to timeout */
4982 if (amdgpu_sriov_vf(adev) &&
4983 !amdgpu_sriov_runtime(adev) &&
4984 amdgpu_virt_mmio_blocked(adev) &&
4985 !amdgpu_virt_wait_reset(adev)) {
4986 dev_err(adev->dev, "VF exclusive mode timeout\n");
4987 /* Don't send request since VF is inactive. */
4988 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4989 adev->virt.ops = NULL;
4990 r = -EAGAIN;
4991 }
4992 amdgpu_release_ras_context(adev);
4993
4994 failed:
4995 amdgpu_vf_error_trans_all(adev);
4996
4997 return r;
4998 }
4999
5000 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
5001 {
5002
5003 /* Clear all CPU mappings pointing to this device */
5004 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
5005
5006 /* Unmap all mapped bars - Doorbell, registers and VRAM */
5007 amdgpu_doorbell_fini(adev);
5008
5009 iounmap(adev->rmmio);
5010 adev->rmmio = NULL;
5011 if (adev->mman.aper_base_kaddr)
5012 iounmap(adev->mman.aper_base_kaddr);
5013 adev->mman.aper_base_kaddr = NULL;
5014
5015 /* Memory manager related */
5016 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
5017 arch_phys_wc_del(adev->gmc.vram_mtrr);
5018 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
5019 }
5020 }
5021
5022 /**
5023 * amdgpu_device_fini_hw - tear down the driver
5024 *
5025 * @adev: amdgpu_device pointer
5026 *
5027 * Tear down the driver info (all asics).
5028 * Called at driver shutdown.
5029 */
5030 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
5031 {
5032 dev_info(adev->dev, "amdgpu: finishing device.\n");
5033 flush_delayed_work(&adev->delayed_init_work);
5034
5035 if (adev->mman.initialized)
5036 drain_workqueue(adev->mman.bdev.wq);
5037 adev->shutdown = true;
5038
5039 unregister_pm_notifier(&adev->pm_nb);
5040
5041 /* make sure IB tests have finished before entering exclusive mode
5042 * to avoid preemption during an IB test
5043 */
5044 if (amdgpu_sriov_vf(adev)) {
5045 amdgpu_virt_request_full_gpu(adev, false);
5046 amdgpu_virt_fini_data_exchange(adev);
5047 }
5048
5049 /* disable all interrupts */
5050 amdgpu_irq_disable_all(adev);
5051 if (adev->mode_info.mode_config_initialized) {
5052 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
5053 drm_helper_force_disable_all(adev_to_drm(adev));
5054 else
5055 drm_atomic_helper_shutdown(adev_to_drm(adev));
5056 }
5057 amdgpu_fence_driver_hw_fini(adev);
5058
5059 amdgpu_device_sys_interface_fini(adev);
5060
5061 /* disable ras feature must before hw fini */
5062 amdgpu_ras_pre_fini(adev);
5063
5064 amdgpu_ttm_set_buffer_funcs_status(adev, false);
5065
5066 /*
5067 * If the device went through surprise hotplug, destroy the kfd topology
5068 * before ip_fini_early to prevent kfd locking/refcount issues when it
5069 * calls amdgpu_amdkfd_suspend()
5070 */
5071 if (drm_dev_is_unplugged(adev_to_drm(adev)))
5072 amdgpu_amdkfd_device_fini_sw(adev);
5073
5074 amdgpu_device_ip_fini_early(adev);
5075
5076 amdgpu_irq_fini_hw(adev);
5077
5078 if (adev->mman.initialized)
5079 ttm_device_clear_dma_mappings(&adev->mman.bdev);
5080
5081 amdgpu_gart_dummy_page_fini(adev);
5082
5083 if (drm_dev_is_unplugged(adev_to_drm(adev)))
5084 amdgpu_device_unmap_mmio(adev);
5085
5086 }
5087
5088 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
5089 {
5090 int i, idx;
5091 bool px;
5092
5093 amdgpu_device_ip_fini(adev);
5094 amdgpu_fence_driver_sw_fini(adev);
5095 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
5096 adev->accel_working = false;
5097 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
5098 for (i = 0; i < MAX_XCP; ++i) {
5099 dma_fence_put(adev->isolation[i].spearhead);
5100 amdgpu_sync_free(&adev->isolation[i].active);
5101 amdgpu_sync_free(&adev->isolation[i].prev);
5102 }
5103
5104 amdgpu_reset_fini(adev);
5105
5106 /* free i2c buses */
5107 amdgpu_i2c_fini(adev);
5108
5109 if (adev->bios) {
5110 if (amdgpu_emu_mode != 1)
5111 amdgpu_atombios_fini(adev);
5112 amdgpu_bios_release(adev);
5113 }
5114
5115 kfree(adev->fru_info);
5116 adev->fru_info = NULL;
5117
5118 kfree(adev->xcp_mgr);
5119 adev->xcp_mgr = NULL;
5120
5121 px = amdgpu_device_supports_px(adev);
5122
5123 if (px || (!dev_is_removable(&adev->pdev->dev) &&
5124 apple_gmux_detect(NULL, NULL)))
5125 vga_switcheroo_unregister_client(adev->pdev);
5126
5127 if (px)
5128 vga_switcheroo_fini_domain_pm_ops(adev->dev);
5129
5130 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
5131 vga_client_unregister(adev->pdev);
5132
5133 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
5134
5135 iounmap(adev->rmmio);
5136 adev->rmmio = NULL;
5137 drm_dev_exit(idx);
5138 }
5139
5140 if (IS_ENABLED(CONFIG_PERF_EVENTS))
5141 amdgpu_pmu_fini(adev);
5142 if (adev->discovery.bin)
5143 amdgpu_discovery_fini(adev);
5144
5145 amdgpu_reset_put_reset_domain(adev->reset_domain);
5146 adev->reset_domain = NULL;
5147
5148 kfree(adev->pci_state);
5149 kfree(adev->pcie_reset_ctx.swds_pcistate);
5150 kfree(adev->pcie_reset_ctx.swus_pcistate);
5151 }
5152
5153 /**
5154 * amdgpu_device_evict_resources - evict device resources
5155 * @adev: amdgpu device object
5156 *
5157 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
5158 * of the vram memory type. Mainly used for evicting device resources
5159 * at suspend time.
5160 *
5161 */
5162 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
5163 {
5164 int ret;
5165
5166 /* No need to evict vram on APUs unless going to S4 */
5167 if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
5168 return 0;
5169
5170 /* No need to evict when going to S5 through S4 callbacks */
5171 if (system_state == SYSTEM_POWER_OFF)
5172 return 0;
5173
5174 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
5175 if (ret) {
5176 dev_warn(adev->dev, "evicting device resources failed\n");
5177 return ret;
5178 }
5179
5180 if (adev->in_s4) {
5181 ret = ttm_device_prepare_hibernation(&adev->mman.bdev);
5182 if (ret)
5183 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret);
5184 }
5185 return ret;
5186 }
5187
5188 /*
5189 * Suspend & resume.
5190 */
5191 /**
5192 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
5193 * @nb: notifier block
5194 * @mode: suspend mode
5195 * @data: data
5196 *
5197 * This function is called when the system is about to suspend or hibernate.
5198 * It is used to set the appropriate flags so that eviction can be optimized
5199 * in the pm prepare callback.
5200 */
5201 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
5202 void *data)
5203 {
5204 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
5205
5206 switch (mode) {
5207 case PM_HIBERNATION_PREPARE:
5208 adev->in_s4 = true;
5209 break;
5210 case PM_POST_HIBERNATION:
5211 adev->in_s4 = false;
5212 break;
5213 }
5214
5215 return NOTIFY_DONE;
5216 }
5217
5218 /**
5219 * amdgpu_device_prepare - prepare for device suspend
5220 *
5221 * @dev: drm dev pointer
5222 *
5223 * Prepare to put the hw in the suspend state (all asics).
5224 * Returns 0 for success or an error on failure.
5225 * Called at driver suspend.
5226 */
5227 int amdgpu_device_prepare(struct drm_device *dev)
5228 {
5229 struct amdgpu_device *adev = drm_to_adev(dev);
5230 int i, r;
5231
5232 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5233 return 0;
5234
5235 /* Evict the majority of BOs before starting suspend sequence */
5236 r = amdgpu_device_evict_resources(adev);
5237 if (r)
5238 return r;
5239
5240 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
5241
5242 for (i = 0; i < adev->num_ip_blocks; i++) {
5243 if (!adev->ip_blocks[i].status.valid)
5244 continue;
5245 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
5246 continue;
5247 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
5248 if (r)
5249 return r;
5250 }
5251
5252 return 0;
5253 }
5254
5255 /**
5256 * amdgpu_device_complete - complete power state transition
5257 *
5258 * @dev: drm dev pointer
5259 *
5260 * Undo the changes from amdgpu_device_prepare. This will be
5261 * called on all resume transitions, including those that failed.
5262 */
5263 void amdgpu_device_complete(struct drm_device *dev)
5264 {
5265 struct amdgpu_device *adev = drm_to_adev(dev);
5266 int i;
5267
5268 for (i = 0; i < adev->num_ip_blocks; i++) {
5269 if (!adev->ip_blocks[i].status.valid)
5270 continue;
5271 if (!adev->ip_blocks[i].version->funcs->complete)
5272 continue;
5273 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]);
5274 }
5275 }
5276
5277 /**
5278 * amdgpu_device_suspend - initiate device suspend
5279 *
5280 * @dev: drm dev pointer
5281 * @notify_clients: notify in-kernel DRM clients
5282 *
5283 * Puts the hw in the suspend state (all asics).
5284 * Returns 0 for success or an error on failure.
5285 * Called at driver suspend.
5286 */
5287 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
5288 {
5289 struct amdgpu_device *adev = drm_to_adev(dev);
5290 int r, rec;
5291
5292 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5293 return 0;
5294
5295 adev->in_suspend = true;
5296
5297 if (amdgpu_sriov_vf(adev)) {
5298 if (!adev->in_runpm)
5299 amdgpu_amdkfd_suspend_process(adev);
5300 amdgpu_virt_fini_data_exchange(adev);
5301 r = amdgpu_virt_request_full_gpu(adev, false);
5302 if (r)
5303 return r;
5304 }
5305
5306 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
5307 if (r)
5308 goto unwind_sriov;
5309
5310 if (notify_clients)
5311 drm_client_dev_suspend(adev_to_drm(adev));
5312
5313 cancel_delayed_work_sync(&adev->delayed_init_work);
5314
5315 amdgpu_ras_suspend(adev);
5316
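/* IP suspend is split in two phases: phase 1 quiesces the display (DCE)
 * block, while phase 2 suspends the remaining IPs after KFD, user queues,
 * buffer eviction and the fence driver have been taken down.
 */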
5317 r = amdgpu_device_ip_suspend_phase1(adev);
5318 if (r)
5319 goto unwind_smartshift;
5320
5321 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
5322 r = amdgpu_userq_suspend(adev);
5323 if (r)
5324 goto unwind_ip_phase1;
5325
5326 r = amdgpu_device_evict_resources(adev);
5327 if (r)
5328 goto unwind_userq;
5329
5330 amdgpu_ttm_set_buffer_funcs_status(adev, false);
5331
5332 amdgpu_fence_driver_hw_fini(adev);
5333
5334 r = amdgpu_device_ip_suspend_phase2(adev);
5335 if (r)
5336 goto unwind_evict;
5337
5338 if (amdgpu_sriov_vf(adev))
5339 amdgpu_virt_release_full_gpu(adev, false);
5340
5341 return 0;
5342
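/* Error unwind: undo the completed suspend steps in reverse order. Each
 * step logs a warning if its own re-init fails, but the original error in
 * r is what gets returned to the caller.
 */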
5343 unwind_evict:
5344 if (adev->mman.buffer_funcs_ring->sched.ready)
5345 amdgpu_ttm_set_buffer_funcs_status(adev, true);
5346 amdgpu_fence_driver_hw_init(adev);
5347
5348 unwind_userq:
5349 rec = amdgpu_userq_resume(adev);
5350 if (rec) {
5351 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
5352 return r;
5353 }
5354 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
5355 if (rec) {
5356 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
5357 return r;
5358 }
5359
5360 unwind_ip_phase1:
5361 /* suspend phase 1 = resume phase 3 */
5362 rec = amdgpu_device_ip_resume_phase3(adev);
5363 if (rec) {
5364 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
5365 return r;
5366 }
5367
5368 unwind_smartshift:
5369 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
5370 if (rec) {
5371 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
5372 return r;
5373 }
5374
5375 if (notify_clients)
5376 drm_client_dev_resume(adev_to_drm(adev));
5377
5378 amdgpu_ras_resume(adev);
5379
5380 unwind_sriov:
5381 if (amdgpu_sriov_vf(adev)) {
5382 rec = amdgpu_virt_request_full_gpu(adev, true);
5383 if (rec) {
5384 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
5385 return r;
5386 }
5387 }
5388
5389 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
5390
5391 return r;
5392 }
5393
5394 static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
5395 {
5396 int r;
5397 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
5398
5399 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO)
5400 * may not work. The access could be blocked by nBIF protection as VF isn't in
5401 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX
5402 * so that QEMU reprograms MSIX table.
5403 */
5404 amdgpu_restore_msix(adev);
5405
5406 r = adev->gfxhub.funcs->get_xgmi_info(adev);
5407 if (r)
5408 return r;
5409
5410 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
5411 prev_physical_node_id, adev->gmc.xgmi.physical_node_id);
5412
5413 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
5414 adev->vm_manager.vram_base_offset +=
5415 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
5416
5417 return 0;
5418 }
5419
5420 /**
5421 * amdgpu_device_resume - initiate device resume
5422 *
5423 * @dev: drm dev pointer
5424 * @notify_clients: notify in-kernel DRM clients
5425 *
5426 * Bring the hw back to operating state (all asics).
5427 * Returns 0 for success or an error on failure.
5428 * Called at driver resume.
5429 */
5430 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
5431 {
5432 struct amdgpu_device *adev = drm_to_adev(dev);
5433 int r = 0;
5434
5435 if (amdgpu_sriov_vf(adev)) {
5436 r = amdgpu_virt_request_full_gpu(adev, true);
5437 if (r)
5438 return r;
5439 }
5440
5441 if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
5442 r = amdgpu_virt_resume(adev);
5443 if (r)
5444 goto exit;
5445 }
5446
5447 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5448 return 0;
5449
5450 if (adev->in_s0ix)
5451 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
5452
5453 /* post card */
5454 if (amdgpu_device_need_post(adev)) {
5455 r = amdgpu_device_asic_init(adev);
5456 if (r)
5457 dev_err(adev->dev, "amdgpu asic init failed\n");
5458 }
5459
5460 r = amdgpu_device_ip_resume(adev);
5461
5462 if (r) {
5463 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
5464 goto exit;
5465 }
5466
5467 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
5468 if (r)
5469 goto exit;
5470
5471 r = amdgpu_userq_resume(adev);
5472 if (r)
5473 goto exit;
5474
5475 r = amdgpu_device_ip_late_init(adev);
5476 if (r)
5477 goto exit;
5478
5479 queue_delayed_work(system_wq, &adev->delayed_init_work,
5480 msecs_to_jiffies(AMDGPU_RESUME_MS));
5481 exit:
5482 if (amdgpu_sriov_vf(adev)) {
5483 amdgpu_virt_init_data_exchange(adev);
5484 amdgpu_virt_release_full_gpu(adev, true);
5485
5486 if (!r && !adev->in_runpm)
5487 r = amdgpu_amdkfd_resume_process(adev);
5488 }
5489
5490 if (r)
5491 return r;
5492
5493 /* Make sure IB tests flushed */
5494 flush_delayed_work(&adev->delayed_init_work);
5495
5496 if (notify_clients)
5497 drm_client_dev_resume(adev_to_drm(adev));
5498
5499 amdgpu_ras_resume(adev);
5500
5501 if (adev->mode_info.num_crtc) {
5502 /*
5503 * Most of the connector probing functions try to acquire runtime pm
5504 * refs to ensure that the GPU is powered on when connector polling is
5505 * performed. Since we're calling this from a runtime PM callback,
5506 * trying to acquire rpm refs will cause us to deadlock.
5507 *
5508 * Since we're guaranteed to be holding the rpm lock, it's safe to
5509 * temporarily disable the rpm helpers so this doesn't deadlock us.
5510 */
5511 #ifdef CONFIG_PM
5512 dev->dev->power.disable_depth++;
5513 #endif
5514 if (!adev->dc_enabled)
5515 drm_helper_hpd_irq_event(dev);
5516 else
5517 drm_kms_helper_hotplug_event(dev);
5518 #ifdef CONFIG_PM
5519 dev->dev->power.disable_depth--;
5520 #endif
5521 }
5522
5523 amdgpu_vram_mgr_clear_reset_blocks(adev);
5524 adev->in_suspend = false;
5525
5526 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0))
5527 dev_warn(adev->dev, "smart shift update failed\n");
5528
5529 return 0;
5530 }
5531
5532 /**
5533 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5534 *
5535 * @adev: amdgpu_device pointer
5536 *
5537 * The list of all the hardware IPs that make up the asic is walked and
5538 * the check_soft_reset callbacks are run. check_soft_reset determines
5539 * if the asic is still hung or not.
5540 * Returns true if any of the IPs are still in a hung state, false if not.
5541 */
5542 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5543 {
5544 int i;
5545 bool asic_hang = false;
5546
5547 if (amdgpu_sriov_vf(adev))
5548 return true;
5549
5550 if (amdgpu_asic_need_full_reset(adev))
5551 return true;
5552
5553 for (i = 0; i < adev->num_ip_blocks; i++) {
5554 if (!adev->ip_blocks[i].status.valid)
5555 continue;
5556 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5557 adev->ip_blocks[i].status.hang =
5558 adev->ip_blocks[i].version->funcs->check_soft_reset(
5559 &adev->ip_blocks[i]);
5560 if (adev->ip_blocks[i].status.hang) {
5561 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5562 asic_hang = true;
5563 }
5564 }
5565 return asic_hang;
5566 }
5567
5568 /**
5569 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5570 *
5571 * @adev: amdgpu_device pointer
5572 *
5573 * The list of all the hardware IPs that make up the asic is walked and the
5574 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5575 * handles any IP specific hardware or software state changes that are
5576 * necessary for a soft reset to succeed.
5577 * Returns 0 on success, negative error code on failure.
5578 */
5579 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5580 {
5581 int i, r = 0;
5582
5583 for (i = 0; i < adev->num_ip_blocks; i++) {
5584 if (!adev->ip_blocks[i].status.valid)
5585 continue;
5586 if (adev->ip_blocks[i].status.hang &&
5587 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5588 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5589 if (r)
5590 return r;
5591 }
5592 }
5593
5594 return 0;
5595 }
5596
5597 /**
5598 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5599 *
5600 * @adev: amdgpu_device pointer
5601 *
5602 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5603 * reset is necessary to recover.
5604 * Returns true if a full asic reset is required, false if not.
5605 */
5606 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5607 {
5608 int i;
5609
5610 if (amdgpu_asic_need_full_reset(adev))
5611 return true;
5612
5613 for (i = 0; i < adev->num_ip_blocks; i++) {
5614 if (!adev->ip_blocks[i].status.valid)
5615 continue;
5616 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5617 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5618 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5619 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5620 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5621 if (adev->ip_blocks[i].status.hang) {
5622 dev_info(adev->dev, "Some block need full reset!\n");
5623 return true;
5624 }
5625 }
5626 }
5627 return false;
5628 }
5629
5630 /**
5631 * amdgpu_device_ip_soft_reset - do a soft reset
5632 *
5633 * @adev: amdgpu_device pointer
5634 *
5635 * The list of all the hardware IPs that make up the asic is walked and the
5636 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5637 * IP specific hardware or software state changes that are necessary to soft
5638 * reset the IP.
5639 * Returns 0 on success, negative error code on failure.
5640 */
5641 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5642 {
5643 int i, r = 0;
5644
5645 for (i = 0; i < adev->num_ip_blocks; i++) {
5646 if (!adev->ip_blocks[i].status.valid)
5647 continue;
5648 if (adev->ip_blocks[i].status.hang &&
5649 adev->ip_blocks[i].version->funcs->soft_reset) {
5650 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5651 if (r)
5652 return r;
5653 }
5654 }
5655
5656 return 0;
5657 }
5658
5659 /**
5660 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5661 *
5662 * @adev: amdgpu_device pointer
5663 *
5664 * The list of all the hardware IPs that make up the asic is walked and the
5665 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5666 * handles any IP specific hardware or software state changes that are
5667 * necessary after the IP has been soft reset.
5668 * Returns 0 on success, negative error code on failure.
5669 */
5670 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5671 {
5672 int i, r = 0;
5673
5674 for (i = 0; i < adev->num_ip_blocks; i++) {
5675 if (!adev->ip_blocks[i].status.valid)
5676 continue;
5677 if (adev->ip_blocks[i].status.hang &&
5678 adev->ip_blocks[i].version->funcs->post_soft_reset)
5679 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5680 if (r)
5681 return r;
5682 }
5683
5684 return 0;
5685 }
5686
5687 /**
5688 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5689 *
5690 * @adev: amdgpu_device pointer
5691 * @reset_context: amdgpu reset context pointer
5692 *
5693 * Do a VF FLR and reinitialize the ASIC.
5694 * Returns 0 on success, a negative error code otherwise.
5695 */
5696 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5697 struct amdgpu_reset_context *reset_context)
5698 {
5699 int r;
5700 struct amdgpu_hive_info *hive = NULL;
5701
5702 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5703 if (!amdgpu_ras_get_fed_status(adev))
5704 amdgpu_virt_ready_to_reset(adev);
5705 amdgpu_virt_wait_reset(adev);
5706 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5707 r = amdgpu_virt_request_full_gpu(adev, true);
5708 } else {
5709 r = amdgpu_virt_reset_gpu(adev);
5710 }
5711 if (r)
5712 return r;
5713
5714 amdgpu_ras_clear_err_state(adev);
5715 amdgpu_irq_gpu_reset_resume_helper(adev);
5716
5717 /* some SW cleanup the VF needs to do before recovery */
5718 amdgpu_virt_post_reset(adev);
5719
5720 /* Resume IP prior to SMC */
5721 r = amdgpu_device_ip_reinit_early_sriov(adev);
5722 if (r)
5723 return r;
5724
5725 amdgpu_virt_init_data_exchange(adev);
5726
5727 r = amdgpu_device_fw_loading(adev);
5728 if (r)
5729 return r;
5730
5731 /* now we are okay to resume SMC/CP/SDMA */
5732 r = amdgpu_device_ip_reinit_late_sriov(adev);
5733 if (r)
5734 return r;
5735
5736 hive = amdgpu_get_xgmi_hive(adev);
5737 /* Update PSP FW topology after reset */
5738 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5739 r = amdgpu_xgmi_update_topology(hive, adev);
5740 if (hive)
5741 amdgpu_put_xgmi_hive(hive);
5742 if (r)
5743 return r;
5744
5745 r = amdgpu_ib_ring_tests(adev);
5746 if (r)
5747 return r;
5748
5749 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5750 amdgpu_inc_vram_lost(adev);
5751
5752 /* This needs to be called while we still have full GPU access, so we
5753 * can't do it later like bare metal does.
5754 */
5755 amdgpu_amdkfd_post_reset(adev);
5756 amdgpu_virt_release_full_gpu(adev, true);
5757
5758 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so we need to resume RAS during reset */
5759 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5760 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5761 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5762 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
5763 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5764 amdgpu_ras_resume(adev);
5765
5766 amdgpu_virt_ras_telemetry_post_reset(adev);
5767
5768 return 0;
5769 }
5770
5771 /**
5772 * amdgpu_device_has_job_running - check if there is any unfinished job
5773 *
5774 * @adev: amdgpu_device pointer
5775 *
5776 * Check if there is any job running on the device when the guest driver
5777 * receives an FLR notification from the host driver. If there are still jobs
5778 * running, the guest driver will not respond to the FLR reset. Instead, it
5779 * lets the job hit the timeout and then issues the reset request itself.
5780 */
5781 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5782 {
5783 int i;
5784
5785 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5786 struct amdgpu_ring *ring = adev->rings[i];
5787
5788 if (!amdgpu_ring_sched_ready(ring))
5789 continue;
5790
5791 if (amdgpu_fence_count_emitted(ring))
5792 return true;
5793 }
5794 return false;
5795 }
5796
5797 /**
5798 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5799 *
5800 * @adev: amdgpu_device pointer
5801 *
5802 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5803 * a hung GPU.
5804 */
5805 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5806 {
5807
5808 if (amdgpu_gpu_recovery == 0)
5809 goto disabled;
5810
5811 /* Skip soft reset check in fatal error mode */
5812 if (!amdgpu_ras_is_poison_mode_supported(adev))
5813 return true;
5814
5815 if (amdgpu_sriov_vf(adev))
5816 return true;
5817
5818 if (amdgpu_gpu_recovery == -1) {
5819 switch (adev->asic_type) {
5820 #ifdef CONFIG_DRM_AMDGPU_SI
5821 case CHIP_VERDE:
5822 case CHIP_TAHITI:
5823 case CHIP_PITCAIRN:
5824 case CHIP_OLAND:
5825 case CHIP_HAINAN:
5826 #endif
5827 #ifdef CONFIG_DRM_AMDGPU_CIK
5828 case CHIP_KAVERI:
5829 case CHIP_KABINI:
5830 case CHIP_MULLINS:
5831 #endif
5832 case CHIP_CARRIZO:
5833 case CHIP_STONEY:
5834 case CHIP_CYAN_SKILLFISH:
5835 goto disabled;
5836 default:
5837 break;
5838 }
5839 }
5840
5841 return true;
5842
5843 disabled:
5844 dev_info(adev->dev, "GPU recovery disabled.\n");
5845 return false;
5846 }
5847
5848 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5849 {
5850 u32 i;
5851 int ret = 0;
5852
5853 if (adev->bios)
5854 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5855
5856 dev_info(adev->dev, "GPU mode1 reset\n");
5857
5858 /* Cache the state before bus master disable. The saved config space
5859 * values are used in other cases like restore after mode-2 reset.
5860 */
5861 amdgpu_device_cache_pci_state(adev->pdev);
5862
5863 /* disable BM */
5864 pci_clear_master(adev->pdev);
5865
5866 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5867 dev_info(adev->dev, "GPU smu mode1 reset\n");
5868 ret = amdgpu_dpm_mode1_reset(adev);
5869 } else {
5870 dev_info(adev->dev, "GPU psp mode1 reset\n");
5871 ret = psp_gpu_reset(adev);
5872 }
5873
5874 if (ret)
5875 goto mode1_reset_failed;
5876
5877 /* enable mmio access after mode 1 reset completed */
5878 adev->no_hw_access = false;
5879
5880 amdgpu_device_load_pci_state(adev->pdev);
5881 ret = amdgpu_psp_wait_for_bootloader(adev);
5882 if (ret)
5883 goto mode1_reset_failed;
5884
5885 /* wait for asic to come out of reset */
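/* While the ASIC is still in reset, MMIO reads come back as all ones, so
 * poll the NBIO memsize register until it returns a sane value again.
 */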
5886 for (i = 0; i < adev->usec_timeout; i++) {
5887 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5888
5889 if (memsize != 0xffffffff)
5890 break;
5891 udelay(1);
5892 }
5893
5894 if (i >= adev->usec_timeout) {
5895 ret = -ETIMEDOUT;
5896 goto mode1_reset_failed;
5897 }
5898
5899 if (adev->bios)
5900 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5901
5902 return 0;
5903
5904 mode1_reset_failed:
5905 dev_err(adev->dev, "GPU mode1 reset failed\n");
5906 return ret;
5907 }
5908
5909 int amdgpu_device_link_reset(struct amdgpu_device *adev)
5910 {
5911 int ret = 0;
5912
5913 dev_info(adev->dev, "GPU link reset\n");
5914
5915 if (!amdgpu_reset_in_dpc(adev))
5916 ret = amdgpu_dpm_link_reset(adev);
5917
5918 if (ret)
5919 goto link_reset_failed;
5920
5921 ret = amdgpu_psp_wait_for_bootloader(adev);
5922 if (ret)
5923 goto link_reset_failed;
5924
5925 return 0;
5926
5927 link_reset_failed:
5928 dev_err(adev->dev, "GPU link reset failed\n");
5929 return ret;
5930 }
5931
5932 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5933 struct amdgpu_reset_context *reset_context)
5934 {
5935 int i, r = 0;
5936 struct amdgpu_job *job = NULL;
5937 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5938 bool need_full_reset =
5939 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5940
5941 if (reset_context->reset_req_dev == adev)
5942 job = reset_context->job;
5943
5944 if (amdgpu_sriov_vf(adev))
5945 amdgpu_virt_pre_reset(adev);
5946
5947 amdgpu_fence_driver_isr_toggle(adev, true);
5948
5949 /* block all schedulers and reset given job's ring */
5950 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5951 struct amdgpu_ring *ring = adev->rings[i];
5952
5953 if (!amdgpu_ring_sched_ready(ring))
5954 continue;
5955
5956 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5957 amdgpu_fence_driver_force_completion(ring);
5958 }
5959
5960 amdgpu_fence_driver_isr_toggle(adev, false);
5961
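/* Bump the karma of the offending job so the scheduler can flag its
 * context as guilty once the karma limit is exceeded.
 */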
5962 if (job && job->vm)
5963 drm_sched_increase_karma(&job->base);
5964
5965 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5966 /* If reset handler not implemented, continue; otherwise return */
5967 if (r == -EOPNOTSUPP)
5968 r = 0;
5969 else
5970 return r;
5971
5972 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5973 if (!amdgpu_sriov_vf(adev)) {
5974
5975 if (!need_full_reset)
5976 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5977
5978 if (!need_full_reset && amdgpu_gpu_recovery &&
5979 amdgpu_device_ip_check_soft_reset(adev)) {
5980 amdgpu_device_ip_pre_soft_reset(adev);
5981 r = amdgpu_device_ip_soft_reset(adev);
5982 amdgpu_device_ip_post_soft_reset(adev);
5983 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5984 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5985 need_full_reset = true;
5986 }
5987 }
5988
5989 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5990 dev_info(tmp_adev->dev, "Dumping IP State\n");
5991 /* Trigger ip dump before we reset the asic */
5992 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5993 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5994 tmp_adev->ip_blocks[i].version->funcs
5995 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5996 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5997 }
5998
5999 if (need_full_reset)
6000 r = amdgpu_device_ip_suspend(adev);
6001 if (need_full_reset)
6002 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
6003 else
6004 clear_bit(AMDGPU_NEED_FULL_RESET,
6005 &reset_context->flags);
6006 }
6007
6008 return r;
6009 }
6010
6011 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
6012 {
6013 struct list_head *device_list_handle;
6014 bool full_reset, vram_lost = false;
6015 struct amdgpu_device *tmp_adev;
6016 int r, init_level;
6017
6018 device_list_handle = reset_context->reset_device_list;
6019
6020 if (!device_list_handle)
6021 return -EINVAL;
6022
6023 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
6024
6025 /*
6026 * If this is a reset on init, use the default init level, otherwise
6027 * keep the level as reset-recovery level.
6028 */
6029 if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
6030 init_level = AMDGPU_INIT_LEVEL_DEFAULT;
6031 else
6032 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
6033
6034 r = 0;
6035 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6036 amdgpu_set_init_level(tmp_adev, init_level);
6037 if (full_reset) {
6038 /* post card */
6039 amdgpu_reset_set_dpc_status(tmp_adev, false);
6040 amdgpu_ras_clear_err_state(tmp_adev);
6041 r = amdgpu_device_asic_init(tmp_adev);
6042 if (r) {
6043 dev_warn(tmp_adev->dev, "asic atom init failed!");
6044 } else {
6045 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
6046
6047 r = amdgpu_device_ip_resume_phase1(tmp_adev);
6048 if (r)
6049 goto out;
6050
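/* check_vram_lost() compares the reset magic cached in adev against the
 * copy stored in VRAM; a mismatch means VRAM contents did not survive
 * the reset.
 */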
6051 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
6052
6053 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
6054 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
6055
6056 if (vram_lost) {
6057 dev_info(
6058 tmp_adev->dev,
6059 "VRAM is lost due to GPU reset!\n");
6060 amdgpu_inc_vram_lost(tmp_adev);
6061 }
6062
6063 r = amdgpu_device_fw_loading(tmp_adev);
6064 if (r)
6065 return r;
6066
6067 r = amdgpu_xcp_restore_partition_mode(
6068 tmp_adev->xcp_mgr);
6069 if (r)
6070 goto out;
6071
6072 r = amdgpu_device_ip_resume_phase2(tmp_adev);
6073 if (r)
6074 goto out;
6075
6076 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
6077 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
6078
6079 r = amdgpu_device_ip_resume_phase3(tmp_adev);
6080 if (r)
6081 goto out;
6082
6083 if (vram_lost)
6084 amdgpu_device_fill_reset_magic(tmp_adev);
6085
6086 /*
6087 * Re-add this ASIC as tracked since the reset has
6088 * already completed successfully.
6089 */
6090 amdgpu_register_gpu_instance(tmp_adev);
6091
6092 if (!reset_context->hive &&
6093 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
6094 amdgpu_xgmi_add_device(tmp_adev);
6095
6096 r = amdgpu_device_ip_late_init(tmp_adev);
6097 if (r)
6098 goto out;
6099
6100 r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
6101 if (r)
6102 goto out;
6103
6104 drm_client_dev_resume(adev_to_drm(tmp_adev));
6105
6106 /*
6107 * The GPU enters a bad state once the number of faulty
6108 * pages detected by ECC reaches the threshold, and RAS
6109 * recovery is scheduled next. So add a check here to
6110 * break recovery if the bad page threshold has indeed
6111 * been exceeded, and remind the user to retire this GPU
6112 * or to set a bigger bad_page_threshold value so the
6113 * issue is fixed the next time the driver is
6114 * probed.
6115 */
6116 if (!amdgpu_ras_is_rma(tmp_adev)) {
6117 /* must succeed. */
6118 amdgpu_ras_resume(tmp_adev);
6119 } else {
6120 r = -EINVAL;
6121 goto out;
6122 }
6123
6124 /* Update PSP FW topology after reset */
6125 if (reset_context->hive &&
6126 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
6127 r = amdgpu_xgmi_update_topology(
6128 reset_context->hive, tmp_adev);
6129 }
6130 }
6131
6132 out:
6133 if (!r) {
6134 /* IP init is complete now, set level as default */
6135 amdgpu_set_init_level(tmp_adev,
6136 AMDGPU_INIT_LEVEL_DEFAULT);
6137 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
6138 r = amdgpu_ib_ring_tests(tmp_adev);
6139 if (r) {
6140 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
6141 r = -EAGAIN;
6142 goto end;
6143 }
6144 }
6145
6146 if (r)
6147 tmp_adev->asic_reset_res = r;
6148 }
6149
6150 end:
6151 return r;
6152 }
6153
6154 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
6155 struct amdgpu_reset_context *reset_context)
6156 {
6157 struct amdgpu_device *tmp_adev = NULL;
6158 bool need_full_reset, skip_hw_reset;
6159 int r = 0;
6160
6161 /* Try reset handler method first */
6162 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6163 reset_list);
6164
6165 reset_context->reset_device_list = device_list_handle;
6166 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
6167 /* If reset handler not implemented, continue; otherwise return */
6168 if (r == -EOPNOTSUPP)
6169 r = 0;
6170 else
6171 return r;
6172
6173 /* Reset handler not implemented, use the default method */
6174 need_full_reset =
6175 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
6176 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
6177
6178 /*
6179 * ASIC reset has to be done on all XGMI hive nodes ASAP
6180 * to allow proper link negotiation in the FW (within 1 sec).
6181 */
6182 if (!skip_hw_reset && need_full_reset) {
6183 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6184 /* For XGMI run all resets in parallel to speed up the process */
6185 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
6186 if (!queue_work(system_unbound_wq,
6187 &tmp_adev->xgmi_reset_work))
6188 r = -EALREADY;
6189 } else
6190 r = amdgpu_asic_reset(tmp_adev);
6191
6192 if (r) {
6193 dev_err(tmp_adev->dev,
6194 "ASIC reset failed with error, %d for drm dev, %s",
6195 r, adev_to_drm(tmp_adev)->unique);
6196 goto out;
6197 }
6198 }
6199
6200 /* For XGMI wait for all resets to complete before proceed */
6201 if (!r) {
6202 list_for_each_entry(tmp_adev, device_list_handle,
6203 reset_list) {
6204 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
6205 flush_work(&tmp_adev->xgmi_reset_work);
6206 r = tmp_adev->asic_reset_res;
6207 if (r)
6208 break;
6209 }
6210 }
6211 }
6212 }
6213
6214 if (!r && amdgpu_ras_intr_triggered()) {
6215 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6216 amdgpu_ras_reset_error_count(tmp_adev,
6217 AMDGPU_RAS_BLOCK__MMHUB);
6218 }
6219
6220 amdgpu_ras_intr_cleared();
6221 }
6222
6223 r = amdgpu_device_reinit_after_reset(reset_context);
6224 if (r == -EAGAIN)
6225 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
6226 else
6227 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
6228
6229 out:
6230 return r;
6231 }
6232
6233 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
6234 {
6235
6236 switch (amdgpu_asic_reset_method(adev)) {
6237 case AMD_RESET_METHOD_MODE1:
6238 case AMD_RESET_METHOD_LINK:
6239 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
6240 break;
6241 case AMD_RESET_METHOD_MODE2:
6242 adev->mp1_state = PP_MP1_STATE_RESET;
6243 break;
6244 default:
6245 adev->mp1_state = PP_MP1_STATE_NONE;
6246 break;
6247 }
6248 }
6249
6250 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
6251 {
6252 amdgpu_vf_error_trans_all(adev);
6253 adev->mp1_state = PP_MP1_STATE_NONE;
6254 }
6255
6256 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
6257 {
6258 struct pci_dev *p = NULL;
6259
6260 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
6261 adev->pdev->bus->number, 1);
6262 if (p) {
6263 pm_runtime_enable(&(p->dev));
6264 pm_runtime_resume(&(p->dev));
6265 }
6266
6267 pci_dev_put(p);
6268 }
6269
6270 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
6271 {
6272 enum amd_reset_method reset_method;
6273 struct pci_dev *p = NULL;
6274 u64 expires;
6275
6276 /*
6277 * For now, only BACO and mode1 reset are confirmed to
6278 * suffer from the audio issue if not properly suspended.
6279 */
6280 reset_method = amdgpu_asic_reset_method(adev);
6281 if ((reset_method != AMD_RESET_METHOD_BACO) &&
6282 (reset_method != AMD_RESET_METHOD_MODE1))
6283 return -EINVAL;
6284
6285 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
6286 adev->pdev->bus->number, 1);
6287 if (!p)
6288 return -ENODEV;
6289
6290 expires = pm_runtime_autosuspend_expiration(&(p->dev));
6291 if (!expires)
6292 /*
6293 * If we cannot get the audio device autosuspend delay,
6294 * a fixed 4s interval is used. Since 3s is the audio
6295 * controller's default autosuspend delay setting, the 4s
6296 * used here is guaranteed to cover it.
6297 */
6298 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
6299
6300 while (!pm_runtime_status_suspended(&(p->dev))) {
6301 if (!pm_runtime_suspend(&(p->dev)))
6302 break;
6303
6304 if (expires < ktime_get_mono_fast_ns()) {
6305 dev_warn(adev->dev, "failed to suspend display audio\n");
6306 pci_dev_put(p);
6307 /* TODO: abort the succeeding gpu reset? */
6308 return -ETIMEDOUT;
6309 }
6310 }
6311
6312 pm_runtime_disable(&(p->dev));
6313
6314 pci_dev_put(p);
6315 return 0;
6316 }
6317
6318 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
6319 {
6320 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
6321
6322 #if defined(CONFIG_DEBUG_FS)
6323 if (!amdgpu_sriov_vf(adev))
6324 cancel_work(&adev->reset_work);
6325 #endif
6326 cancel_work(&adev->userq_reset_work);
6327
6328 if (adev->kfd.dev)
6329 cancel_work(&adev->kfd.reset_work);
6330
6331 if (amdgpu_sriov_vf(adev))
6332 cancel_work(&adev->virt.flr_work);
6333
6334 if (con && adev->ras_enabled)
6335 cancel_work(&con->recovery_work);
6336
6337 }
6338
6339 static int amdgpu_device_health_check(struct list_head *device_list_handle)
6340 {
6341 struct amdgpu_device *tmp_adev;
6342 int ret = 0;
6343
6344 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6345 ret |= amdgpu_device_bus_status_check(tmp_adev);
6346 }
6347
6348 return ret;
6349 }
6350
6351 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
6352 struct list_head *device_list,
6353 struct amdgpu_hive_info *hive)
6354 {
6355 struct amdgpu_device *tmp_adev = NULL;
6356
6357 /*
6358 * Build list of devices to reset.
6359 * In case we are in XGMI hive mode, re-sort the device list
6360 * to put adev in the first position.
6361 */
6362 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
6363 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
6364 list_add_tail(&tmp_adev->reset_list, device_list);
6365 if (adev->shutdown)
6366 tmp_adev->shutdown = true;
6367 if (amdgpu_reset_in_dpc(adev))
6368 tmp_adev->pcie_reset_ctx.in_link_reset = true;
6369 }
6370 if (!list_is_first(&adev->reset_list, device_list))
6371 list_rotate_to_front(&adev->reset_list, device_list);
6372 } else {
6373 list_add_tail(&adev->reset_list, device_list);
6374 }
6375 }
6376
6377 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
6378 struct list_head *device_list)
6379 {
6380 struct amdgpu_device *tmp_adev = NULL;
6381
6382 if (list_empty(device_list))
6383 return;
6384 tmp_adev =
6385 list_first_entry(device_list, struct amdgpu_device, reset_list);
6386 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
6387 }
6388
6389 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
6390 struct list_head *device_list)
6391 {
6392 struct amdgpu_device *tmp_adev = NULL;
6393
6394 if (list_empty(device_list))
6395 return;
6396 tmp_adev =
6397 list_first_entry(device_list, struct amdgpu_device, reset_list);
6398 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6399 }
6400
6401 static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
6402 struct amdgpu_job *job,
6403 struct amdgpu_reset_context *reset_context,
6404 struct list_head *device_list,
6405 struct amdgpu_hive_info *hive,
6406 bool need_emergency_restart)
6407 {
6408 struct amdgpu_device *tmp_adev = NULL;
6409 int i;
6410
6411 /* block all schedulers and reset given job's ring */
6412 list_for_each_entry(tmp_adev, device_list, reset_list) {
6413 amdgpu_device_set_mp1_state(tmp_adev);
6414
6415 /*
6416 * Try to put the audio codec into the suspend state
6417 * before the GPU reset starts.
6418 *
6419 * The power domain of the graphics device is shared
6420 * with the AZ (audio) power domain. Without this, we
6421 * may change the audio hardware behind the audio
6422 * driver's back, which will trigger audio codec
6423 * errors.
6424 */
6425 if (!amdgpu_device_suspend_display_audio(tmp_adev))
6426 tmp_adev->pcie_reset_ctx.audio_suspended = true;
6427
6428 amdgpu_ras_set_error_query_ready(tmp_adev, false);
6429
6430 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
6431
6432 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
6433
6434 /*
6435 * Mark these ASICs to be reset as untracked first,
6436 * and add them back after the reset has completed.
6437 */
6438 amdgpu_unregister_gpu_instance(tmp_adev);
6439
6440 drm_client_dev_suspend(adev_to_drm(tmp_adev));
6441
6442 /* disable ras on ALL IPs */
6443 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) &&
6444 amdgpu_device_ip_need_full_reset(tmp_adev))
6445 amdgpu_ras_suspend(tmp_adev);
6446
6447 amdgpu_userq_pre_reset(tmp_adev);
6448
6449 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6450 struct amdgpu_ring *ring = tmp_adev->rings[i];
6451
6452 if (!amdgpu_ring_sched_ready(ring))
6453 continue;
6454
6455 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
6456
6457 if (need_emergency_restart)
6458 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
6459 }
6460 atomic_inc(&tmp_adev->gpu_reset_counter);
6461 }
6462 }
6463
6464 static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
6465 struct list_head *device_list,
6466 struct amdgpu_reset_context *reset_context)
6467 {
6468 struct amdgpu_device *tmp_adev = NULL;
6469 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
6470 int r = 0;
6471
6472 retry: /* Rest of adevs pre asic reset from XGMI hive. */
6473 list_for_each_entry(tmp_adev, device_list, reset_list) {
6474 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
6475 /*TODO Should we stop ?*/
6476 if (r) {
6477 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
6478 r, adev_to_drm(tmp_adev)->unique);
6479 tmp_adev->asic_reset_res = r;
6480 }
6481 }
6482
6483 /* Actual ASIC resets if needed.*/
6484 /* Host driver will handle XGMI hive reset for SRIOV */
6485 if (amdgpu_sriov_vf(adev)) {
6486
6487 /* Bail out of reset early */
6488 if (amdgpu_ras_is_rma(adev))
6489 return -ENODEV;
6490
6491 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
6492 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
6493 amdgpu_ras_set_fed(adev, true);
6494 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
6495 }
6496
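/* The VF reset below is retried up to AMDGPU_MAX_RETRY_LIMIT times on
 * transient host-side failures; full GPU access is released before
 * each retry.
 */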
6497 r = amdgpu_device_reset_sriov(adev, reset_context);
6498 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
6499 amdgpu_virt_release_full_gpu(adev, true);
6500 goto retry;
6501 }
6502 if (r)
6503 adev->asic_reset_res = r;
6504 } else {
6505 r = amdgpu_do_asic_reset(device_list, reset_context);
6506 if (r && r == -EAGAIN)
6507 goto retry;
6508 }
6509
6510 list_for_each_entry(tmp_adev, device_list, reset_list) {
6511 /*
6512 * Drop any pending non-scheduler resets queued before the reset is done.
6513 * Any reset scheduled after this point would be valid. Scheduler resets
6514 * were already dropped during drm_sched_stop and no new ones can come
6515 * in before drm_sched_start.
6516 */
6517 amdgpu_device_stop_pending_resets(tmp_adev);
6518 }
6519
6520 return r;
6521 }
6522
6523 static int amdgpu_device_sched_resume(struct list_head *device_list,
6524 struct amdgpu_reset_context *reset_context,
6525 bool job_signaled)
6526 {
6527 struct amdgpu_device *tmp_adev = NULL;
6528 int i, r = 0;
6529
6530 /* Post ASIC reset for all devs .*/
6531 list_for_each_entry(tmp_adev, device_list, reset_list) {
6532
6533 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6534 struct amdgpu_ring *ring = tmp_adev->rings[i];
6535
6536 if (!amdgpu_ring_sched_ready(ring))
6537 continue;
6538
6539 drm_sched_start(&ring->sched, 0);
6540 }
6541
6542 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6543 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6544
6545 if (tmp_adev->asic_reset_res) {
6546 /* Bad news, how do we tell userspace?
6547 * For a RAS error, we should report the GPU bad status
6548 * instead of a reset failure.
6549 */
6550 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6551 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6552 dev_info(
6553 tmp_adev->dev,
6554 "GPU reset(%d) failed with error %d \n",
6555 atomic_read(
6556 &tmp_adev->gpu_reset_counter),
6557 tmp_adev->asic_reset_res);
6558 amdgpu_vf_error_put(tmp_adev,
6559 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0,
6560 tmp_adev->asic_reset_res);
6561 if (!r)
6562 r = tmp_adev->asic_reset_res;
6563 tmp_adev->asic_reset_res = 0;
6564 } else {
6565 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
6566 atomic_read(&tmp_adev->gpu_reset_counter));
6567 if (amdgpu_acpi_smart_shift_update(tmp_adev,
6568 AMDGPU_SS_DEV_D0))
6569 dev_warn(tmp_adev->dev,
6570 "smart shift update failed\n");
6571 }
6572 }
6573
6574 return r;
6575 }
6576
6577 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
6578 struct list_head *device_list,
6579 bool need_emergency_restart)
6580 {
6581 struct amdgpu_device *tmp_adev = NULL;
6582
6583 list_for_each_entry(tmp_adev, device_list, reset_list) {
6584 /* unlock kfd: SRIOV would do it separately */
6585 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6586 amdgpu_amdkfd_post_reset(tmp_adev);
6587
6588 /* kfd_post_reset will do nothing if the kfd device is not initialized;
6589 * we need to bring up kfd here if it was not initialized before.
6590 */
6591 if (!adev->kfd.init_complete)
6592 amdgpu_amdkfd_device_init(adev);
6593
6594 if (tmp_adev->pcie_reset_ctx.audio_suspended)
6595 amdgpu_device_resume_display_audio(tmp_adev);
6596
6597 amdgpu_device_unset_mp1_state(tmp_adev);
6598
6599 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6600
6601 }
6602 }
6603
6604
6605 /**
6606 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
6607 *
6608 * @adev: amdgpu_device pointer
6609 * @job: which job trigger hang
6610 * @reset_context: amdgpu reset context pointer
6611 *
6612 * Attempt to reset the GPU if it has hung (all asics).
6613 * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
6614 * Returns 0 for success or an error on failure.
6615 */
6616
6617 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
6618 struct amdgpu_job *job,
6619 struct amdgpu_reset_context *reset_context)
6620 {
6621 struct list_head device_list;
6622 bool job_signaled = false;
6623 struct amdgpu_hive_info *hive = NULL;
6624 int r = 0;
6625 bool need_emergency_restart = false;
6626 /* save the pasid here as the job may be freed before the end of the reset */
6627 int pasid = job ? job->pasid : -EINVAL;
6628
6629 /*
6630 * If it reaches here because of hang/timeout and a RAS error is
6631 * detected at the same time, let RAS recovery take care of it.
6632 */
6633 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
6634 !amdgpu_sriov_vf(adev) &&
6635 reset_context->src != AMDGPU_RESET_SRC_RAS) {
6636 dev_dbg(adev->dev,
6637 "Gpu recovery from source: %d yielding to RAS error recovery handling",
6638 reset_context->src);
6639 return 0;
6640 }
6641
6642 /*
6643 * Special case: RAS triggered and full reset isn't supported
6644 */
6645 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
6646
6647 /*
6648 * Flush RAM to disk so that after reboot
6649 * the user can read the log and see why the system rebooted.
6650 */
6651 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
6652 amdgpu_ras_get_context(adev)->reboot) {
6653 dev_warn(adev->dev, "Emergency reboot.");
6654
6655 ksys_sync_helper();
6656 emergency_restart();
6657 }
6658
6659 dev_info(adev->dev, "GPU %s begin! Source: %d\n",
6660 need_emergency_restart ? "jobs stop" : "reset",
6661 reset_context->src);
6662
6663 if (!amdgpu_sriov_vf(adev))
6664 hive = amdgpu_get_xgmi_hive(adev);
6665 if (hive)
6666 mutex_lock(&hive->hive_lock);
6667
6668 reset_context->job = job;
6669 reset_context->hive = hive;
6670 INIT_LIST_HEAD(&device_list);
6671
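/* From here on the recovery sequence is: build the device list (the whole
 * hive for XGMI), check bus health, lock the reset domain, halt schedulers
 * and dependent clients, perform the ASIC reset, restart the schedulers,
 * then resume KFD/audio and drop the lock.
 */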
6672 amdgpu_device_recovery_prepare(adev, &device_list, hive);
6673
6674 if (!amdgpu_sriov_vf(adev)) {
6675 r = amdgpu_device_health_check(&device_list);
6676 if (r)
6677 goto end_reset;
6678 }
6679
6680 /* Cannot be called after locking reset domain */
6681 amdgpu_ras_pre_reset(adev, &device_list);
6682
6683 /* We need to lock reset domain only once both for XGMI and single device */
6684 amdgpu_device_recovery_get_reset_lock(adev, &device_list);
6685
6686 amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
6687 hive, need_emergency_restart);
6688 if (need_emergency_restart)
6689 goto skip_sched_resume;
6690 /*
6691 * Must check guilty signal here since after this point all old
6692 * HW fences are force signaled.
6693 *
6694 * job->base holds a reference to parent fence
6695 */
6696 if (job && dma_fence_is_signaled(&job->hw_fence->base)) {
6697 job_signaled = true;
6698 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
6699 goto skip_hw_reset;
6700 }
6701
6702 r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
6703 if (r)
6704 goto reset_unlock;
6705 skip_hw_reset:
6706 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
6707 if (r)
6708 goto reset_unlock;
6709 skip_sched_resume:
6710 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
6711 reset_unlock:
6712 amdgpu_device_recovery_put_reset_lock(adev, &device_list);
6713 amdgpu_ras_post_reset(adev, &device_list);
6714 end_reset:
6715 if (hive) {
6716 mutex_unlock(&hive->hive_lock);
6717 amdgpu_put_xgmi_hive(hive);
6718 }
6719
6720 if (r)
6721 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6722
6723 atomic_set(&adev->reset_domain->reset_res, r);
6724
6725 if (!r) {
6726 struct amdgpu_task_info *ti = NULL;
6727
6728 /*
6729 * The job may already be freed at this point via the sched tdr workqueue so
6730 * use the cached pasid.
6731 */
6732 if (pasid >= 0)
6733 ti = amdgpu_vm_get_task_info_pasid(adev, pasid);
6734
6735 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE,
6736 ti ? &ti->task : NULL);
6737
6738 amdgpu_vm_put_task_info(ti);
6739 }
6740
6741 return r;
6742 }
6743
6744 /**
6745 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
6746 *
6747 * @adev: amdgpu_device pointer
6748 * @speed: pointer to the speed of the link
6749 * @width: pointer to the width of the link
6750 *
6751 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6752 * first physical partner to an AMD dGPU.
6753 * This will exclude any virtual switches and links.
6754 */
6755 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6756 enum pci_bus_speed *speed,
6757 enum pcie_link_width *width)
6758 {
6759 struct pci_dev *parent = adev->pdev;
6760
6761 if (!speed || !width)
6762 return;
6763
6764 *speed = PCI_SPEED_UNKNOWN;
6765 *width = PCIE_LNK_WIDTH_UNKNOWN;
6766
6767 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6768 while ((parent = pci_upstream_bridge(parent))) {
6769 /* skip upstream/downstream switches internal to dGPU */
6770 if (parent->vendor == PCI_VENDOR_ID_ATI)
6771 continue;
6772 *speed = pcie_get_speed_cap(parent);
6773 *width = pcie_get_width_cap(parent);
6774 break;
6775 }
6776 } else {
6777 /* use the current speeds rather than max if switching is not supported */
6778 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6779 }
6780 }
6781
6782 /**
6783 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
6784 *
6785 * @adev: amdgpu_device pointer
6786 * @speed: pointer to the speed of the link
6787 * @width: pointer to the width of the link
6788 *
6789 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6790 * AMD dGPU which may be a virtual upstream bridge.
6791 */
6792 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6793 enum pci_bus_speed *speed,
6794 enum pcie_link_width *width)
6795 {
6796 struct pci_dev *parent = adev->pdev;
6797
6798 if (!speed || !width)
6799 return;
6800
6801 parent = pci_upstream_bridge(parent);
6802 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6803 /* use the upstream/downstream switches internal to dGPU */
6804 *speed = pcie_get_speed_cap(parent);
6805 *width = pcie_get_width_cap(parent);
6806 while ((parent = pci_upstream_bridge(parent))) {
6807 if (parent->vendor == PCI_VENDOR_ID_ATI) {
6808 /* use the upstream/downstream switches internal to dGPU */
6809 *speed = pcie_get_speed_cap(parent);
6810 *width = pcie_get_width_cap(parent);
6811 }
6812 }
6813 } else {
6814 /* use the device itself */
6815 *speed = pcie_get_speed_cap(adev->pdev);
6816 *width = pcie_get_width_cap(adev->pdev);
6817 }
6818 }
6819
6820 /**
6821 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
6822 *
6823 * @adev: amdgpu_device pointer
6824 *
6825 * Fetches and stores in the driver the PCIE capabilities (gen speed
6826 * and lanes) of the slot the device is in. Handles APUs and
6827 * virtualized environments where PCIE config space may not be available.
6828 */
6829 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6830 {
6831 enum pci_bus_speed speed_cap, platform_speed_cap;
6832 enum pcie_link_width platform_link_width, link_width;
6833
6834 if (amdgpu_pcie_gen_cap)
6835 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6836
6837 if (amdgpu_pcie_lane_cap)
6838 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6839
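/* The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters act
 * as user overrides; when both masks are already set, the detection below
 * is skipped entirely.
 */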
6840 /* covers APUs as well */
6841 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6842 if (adev->pm.pcie_gen_mask == 0)
6843 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6844 if (adev->pm.pcie_mlw_mask == 0)
6845 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6846 return;
6847 }
6848
6849 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6850 return;
6851
6852 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6853 &platform_link_width);
6854 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);
6855
6856 if (adev->pm.pcie_gen_mask == 0) {
6857 /* asic caps */
6858 if (speed_cap == PCI_SPEED_UNKNOWN) {
6859 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6860 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6861 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6862 } else {
6863 if (speed_cap == PCIE_SPEED_32_0GT)
6864 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6865 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6866 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6867 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6868 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6869 else if (speed_cap == PCIE_SPEED_16_0GT)
6870 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6871 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6872 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6873 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6874 else if (speed_cap == PCIE_SPEED_8_0GT)
6875 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6876 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6877 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6878 else if (speed_cap == PCIE_SPEED_5_0GT)
6879 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6880 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6881 else
6882 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6883 }
6884 /* platform caps */
6885 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6886 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6887 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6888 } else {
6889 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6890 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6891 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6892 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6893 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6894 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6895 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6896 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6897 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6898 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6899 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6900 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6901 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6902 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6903 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6904 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6905 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6906 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6907 else
6908 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6909
6910 }
6911 }
6912 if (adev->pm.pcie_mlw_mask == 0) {
6913 /* asic caps */
6914 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6915 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
6916 } else {
6917 switch (link_width) {
6918 case PCIE_LNK_X32:
6919 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
6920 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6921 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6922 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6923 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6924 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6925 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6926 break;
6927 case PCIE_LNK_X16:
6928 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6929 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6930 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6931 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6932 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6933 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6934 break;
6935 case PCIE_LNK_X12:
6936 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6937 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6938 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6939 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6940 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6941 break;
6942 case PCIE_LNK_X8:
6943 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6944 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6945 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6946 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6947 break;
6948 case PCIE_LNK_X4:
6949 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6950 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6951 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6952 break;
6953 case PCIE_LNK_X2:
6954 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6955 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6956 break;
6957 case PCIE_LNK_X1:
6958 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
6959 break;
6960 default:
6961 break;
6962 }
6963 }
6964 /* platform caps */
6965 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6966 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6967 } else {
6968 switch (platform_link_width) {
6969 case PCIE_LNK_X32:
6970 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6971 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6972 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6973 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6974 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6975 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6976 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6977 break;
6978 case PCIE_LNK_X16:
6979 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6980 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6981 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6982 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6983 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6984 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6985 break;
6986 case PCIE_LNK_X12:
6987 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6988 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6989 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6990 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6991 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6992 break;
6993 case PCIE_LNK_X8:
6994 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6995 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6996 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6997 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6998 break;
6999 case PCIE_LNK_X4:
7000 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
7001 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
7002 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
7003 break;
7004 case PCIE_LNK_X2:
7005 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
7006 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
7007 break;
7008 case PCIE_LNK_X1:
7009 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
7010 break;
7011 default:
7012 break;
7013 }
7014 }
7015 }
7016 }
7017
7018 /**
7019 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
7020 *
7021 * @adev: amdgpu_device pointer
7022 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
7023 *
7024 * Return true if @peer_adev can access (DMA) @adev through the PCIe
7025 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
7026 * @peer_adev.
7027 */
7028 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
7029 struct amdgpu_device *peer_adev)
7030 {
7031 #ifdef CONFIG_HSA_AMD_P2P
7032 bool p2p_access =
7033 !adev->gmc.xgmi.connected_to_cpu &&
7034 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
7035 if (!p2p_access)
7036 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
7037 pci_name(peer_adev->pdev));
7038
7039 bool is_large_bar = adev->gmc.visible_vram_size &&
7040 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
7041 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
7042
7043 if (!p2p_addressable) {
7044 uint64_t address_mask = peer_adev->dev->dma_mask ?
7045 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
7046 resource_size_t aper_limit =
7047 adev->gmc.aper_base + adev->gmc.aper_size - 1;
7048
7049 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
7050 aper_limit & address_mask);
7051 }
7052 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
7053 #else
7054 return false;
7055 #endif
7056 }
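
/*
 * Usage sketch (illustrative only; the decision logic around it is
 * hypothetical): a caller setting up peer-to-peer DMA would typically
 * require the visible BAR to be reachable in both directions.
 *
 *	if (amdgpu_device_is_peer_accessible(adev, peer_adev) &&
 *	    amdgpu_device_is_peer_accessible(peer_adev, adev)) {
 *		// both visible BARs are DMA-addressable, try a P2P mapping
 *	} else {
 *		// fall back to staging through system memory
 *	}
 */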
7057
7058 int amdgpu_device_baco_enter(struct amdgpu_device *adev)
7059 {
7060 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
7061
7062 if (!amdgpu_device_supports_baco(adev))
7063 return -ENOTSUPP;
7064
7065 if (ras && adev->ras_enabled &&
7066 adev->nbio.funcs->enable_doorbell_interrupt)
7067 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
7068
7069 return amdgpu_dpm_baco_enter(adev);
7070 }
7071
7072 int amdgpu_device_baco_exit(struct amdgpu_device *adev)
7073 {
7074 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
7075 int ret = 0;
7076
7077 if (!amdgpu_device_supports_baco(adev))
7078 return -ENOTSUPP;
7079
7080 ret = amdgpu_dpm_baco_exit(adev);
7081 if (ret)
7082 return ret;
7083
7084 if (ras && adev->ras_enabled &&
7085 adev->nbio.funcs->enable_doorbell_interrupt)
7086 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
7087
7088 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
7089 adev->nbio.funcs->clear_doorbell_interrupt)
7090 adev->nbio.funcs->clear_doorbell_interrupt(adev);
7091
7092 return 0;
7093 }
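
/*
 * Usage sketch (illustrative only): BACO enter and exit are expected to be
 * used as a pair, and a failed enter should not be followed by an exit.
 *
 *	r = amdgpu_device_baco_enter(adev);
 *	if (r)
 *		return r;
 *	// ... device sits in BACO (bus active, chip off) ...
 *	return amdgpu_device_baco_exit(adev);
 */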
7094
7095 /**
7096 * amdgpu_pci_error_detected - Called when a PCI error is detected.
7097 * @pdev: PCI device struct
7098 * @state: PCI channel state
7099 *
7100 * Description: Called when a PCI error is detected.
7101 *
7102 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
7103 */
7104 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
7105 {
7106 struct drm_device *dev = pci_get_drvdata(pdev);
7107 struct amdgpu_device *adev = drm_to_adev(dev);
7108 struct amdgpu_hive_info *hive __free(xgmi_put_hive) =
7109 amdgpu_get_xgmi_hive(adev);
7110 struct amdgpu_reset_context reset_context;
7111 struct list_head device_list;
7112
7113 dev_info(adev->dev, "PCI error: detected callback!!\n");
7114
7115 adev->pci_channel_state = state;
7116
7117 switch (state) {
7118 case pci_channel_io_normal:
7119 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state);
7120 return PCI_ERS_RESULT_CAN_RECOVER;
7121 case pci_channel_io_frozen:
7122 /* Fatal error, prepare for slot reset */
7123 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);
7124 if (hive) {
7125 /* Hive devices should be able to support a FW-based
7126 * link reset on other devices; if not, return.
7127 */
7128 if (!amdgpu_dpm_is_link_reset_supported(adev)) {
7129 dev_warn(adev->dev,
7130 "No support for XGMI hive yet...\n");
7131 return PCI_ERS_RESULT_DISCONNECT;
7132 }
7133 /* Set DPC status only if the device is part of a hive.
7134 * Non-hive devices should be able to recover after a
7135 * link reset.
7136 */
7137 amdgpu_reset_set_dpc_status(adev, true);
7138
7139 mutex_lock(&hive->hive_lock);
7140 }
7141 memset(&reset_context, 0, sizeof(reset_context));
7142 INIT_LIST_HEAD(&device_list);
7143
7144 amdgpu_device_recovery_prepare(adev, &device_list, hive);
7145 amdgpu_device_recovery_get_reset_lock(adev, &device_list);
7146 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
7147 hive, false);
7148 if (hive)
7149 mutex_unlock(&hive->hive_lock);
7150 return PCI_ERS_RESULT_NEED_RESET;
7151 case pci_channel_io_perm_failure:
7152 /* Permanent error, prepare for device removal */
7153 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state);
7154 return PCI_ERS_RESULT_DISCONNECT;
7155 }
7156
7157 return PCI_ERS_RESULT_NEED_RESET;
7158 }
7159
7160 /**
7161 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
7162 * @pdev: pointer to PCI device
7163 */
7164 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
7165 {
7166 struct drm_device *dev = pci_get_drvdata(pdev);
7167 struct amdgpu_device *adev = drm_to_adev(dev);
7168
7169 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n");
7170
7171 /* TODO - dump whatever for debugging purposes */
7172
7173 /* This is called only if amdgpu_pci_error_detected() returns
7174 * PCI_ERS_RESULT_CAN_RECOVER. Reads and writes to the device still
7175 * work, so there is no need to reset the slot.
7176 */
7177
7178 return PCI_ERS_RESULT_RECOVERED;
7179 }
7180
7181 /**
7182 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
7183 * @pdev: PCI device struct
7184 *
7185 * Description: This routine is called by the PCI error recovery
7186 * code after the PCI slot has been reset, just before we
7187 * should resume normal operations.
7188 */
7189 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
7190 {
7191 struct drm_device *dev = pci_get_drvdata(pdev);
7192 struct amdgpu_device *adev = drm_to_adev(dev);
7193 struct amdgpu_reset_context reset_context;
7194 struct amdgpu_device *tmp_adev;
7195 struct amdgpu_hive_info *hive;
7196 struct list_head device_list;
7197 struct pci_dev *link_dev;
7198 int r = 0, i, timeout;
7199 u32 memsize;
7200 u16 status;
7201
7202 dev_info(adev->dev, "PCI error: slot reset callback!!\n");
7203
7204 memset(&reset_context, 0, sizeof(reset_context));
7205
7206 if (adev->pcie_reset_ctx.swus)
7207 link_dev = adev->pcie_reset_ctx.swus;
7208 else
7209 link_dev = adev->pdev;
7210 /* wait for asic to come out of reset, timeout = 10s */
7211 timeout = 10000;
7212 do {
7213 usleep_range(10000, 10500);
7214 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status);
7215 timeout -= 10;
7216 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) &&
7217 (status != PCI_VENDOR_ID_AMD));
7218
7219 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) {
7220 r = -ETIME;
7221 goto out;
7222 }
7223
7224 amdgpu_device_load_switch_state(adev);
7225 /* Restore the PCI config space */
7226 amdgpu_device_load_pci_state(pdev);
7227
7228 /* confirm ASIC came out of reset */
7229 for (i = 0; i < adev->usec_timeout; i++) {
7230 memsize = amdgpu_asic_get_config_memsize(adev);
7231
7232 if (memsize != 0xffffffff)
7233 break;
7234 udelay(1);
7235 }
7236 if (memsize == 0xffffffff) {
7237 r = -ETIME;
7238 goto out;
7239 }
7240
7241 reset_context.method = AMD_RESET_METHOD_NONE;
7242 reset_context.reset_req_dev = adev;
7243 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
7244 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
7245 INIT_LIST_HEAD(&device_list);
7246
7247 hive = amdgpu_get_xgmi_hive(adev);
7248 if (hive) {
7249 mutex_lock(&hive->hive_lock);
7250 reset_context.hive = hive;
7251 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
7252 tmp_adev->pcie_reset_ctx.in_link_reset = true;
7253 list_add_tail(&tmp_adev->reset_list, &device_list);
7254 }
7255 } else {
7256 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
7257 list_add_tail(&adev->reset_list, &device_list);
7258 }
7259
7260 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);
7261 out:
7262 if (!r) {
7263 if (amdgpu_device_cache_pci_state(adev->pdev))
7264 pci_restore_state(adev->pdev);
7265 dev_info(adev->dev, "PCIe error recovery succeeded\n");
7266 } else {
7267 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r);
7268 if (hive) {
7269 list_for_each_entry(tmp_adev, &device_list, reset_list)
7270 amdgpu_device_unset_mp1_state(tmp_adev);
7271 }
7272 amdgpu_device_recovery_put_reset_lock(adev, &device_list);
7273 }
7274
7275 if (hive) {
7276 mutex_unlock(&hive->hive_lock);
7277 amdgpu_put_xgmi_hive(hive);
7278 }
7279
7280 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
7281 }
7282
7283 /**
7284 * amdgpu_pci_resume() - resume normal ops after PCI reset
7285 * @pdev: pointer to PCI device
7286 *
7287 * Called when the error recovery driver tells us that it is
7288 * OK to resume normal operation.
7289 */
7290 void amdgpu_pci_resume(struct pci_dev *pdev)
7291 {
7292 struct drm_device *dev = pci_get_drvdata(pdev);
7293 struct amdgpu_device *adev = drm_to_adev(dev);
7294 struct list_head device_list;
7295 struct amdgpu_hive_info *hive = NULL;
7296 struct amdgpu_device *tmp_adev = NULL;
7297
7298 dev_info(adev->dev, "PCI error: resume callback!!\n");
7299
7300 /* Only continue execution for the case of pci_channel_io_frozen */
7301 if (adev->pci_channel_state != pci_channel_io_frozen)
7302 return;
7303
7304 INIT_LIST_HEAD(&device_list);
7305
7306 hive = amdgpu_get_xgmi_hive(adev);
7307 if (hive) {
7308 mutex_lock(&hive->hive_lock);
7309 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
7310 tmp_adev->pcie_reset_ctx.in_link_reset = false;
7311 list_add_tail(&tmp_adev->reset_list, &device_list);
7312 }
7313 } else
7314 list_add_tail(&adev->reset_list, &device_list);
7315
7316 amdgpu_device_sched_resume(&device_list, NULL, NULL);
7317 amdgpu_device_gpu_resume(adev, &device_list, false);
7318 amdgpu_device_recovery_put_reset_lock(adev, &device_list);
7319
7320 if (hive) {
7321 mutex_unlock(&hive->hive_lock);
7322 amdgpu_put_xgmi_hive(hive);
7323 }
7324 }
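
/*
 * The four callbacks above plug into the PCI error-recovery core through a
 * struct pci_error_handlers referenced from the driver's struct pci_driver.
 * A minimal sketch (the instance name is assumed here, not taken from this
 * file):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */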
7325
7326 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev)
7327 {
7328 struct pci_dev *swus, *swds;
7329 int r;
7330
7331 swds = pci_upstream_bridge(adev->pdev);
7332 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI ||
7333 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM)
7334 return;
7335 swus = pci_upstream_bridge(swds);
7336 if (!swus ||
7337 (swus->vendor != PCI_VENDOR_ID_ATI &&
7338 swus->vendor != PCI_VENDOR_ID_AMD) ||
7339 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM)
7340 return;
7341
7342 /* If already saved, return */
7343 if (adev->pcie_reset_ctx.swus)
7344 return;
7345 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */
7346 r = pci_save_state(swds);
7347 if (r)
7348 return;
7349 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds);
7350
7351 r = pci_save_state(swus);
7352 if (r)
7353 return;
7354 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus);
7355
7356 adev->pcie_reset_ctx.swus = swus;
7357 }
7358
7359 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev)
7360 {
7361 struct pci_dev *pdev;
7362 int r;
7363
7364 if (!adev->pcie_reset_ctx.swds_pcistate ||
7365 !adev->pcie_reset_ctx.swus_pcistate)
7366 return;
7367
7368 pdev = adev->pcie_reset_ctx.swus;
7369 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate);
7370 if (!r) {
7371 pci_restore_state(pdev);
7372 } else {
7373 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r);
7374 return;
7375 }
7376
7377 pdev = pci_upstream_bridge(adev->pdev);
7378 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate);
7379 if (!r)
7380 pci_restore_state(pdev);
7381 else
7382 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r);
7383 }
7384
7385 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
7386 {
7387 struct drm_device *dev = pci_get_drvdata(pdev);
7388 struct amdgpu_device *adev = drm_to_adev(dev);
7389 int r;
7390
7391 if (amdgpu_sriov_vf(adev))
7392 return false;
7393
7394 r = pci_save_state(pdev);
7395 if (!r) {
7396 kfree(adev->pci_state);
7397
7398 adev->pci_state = pci_store_saved_state(pdev);
7399
7400 if (!adev->pci_state) {
7401 dev_err(adev->dev, "Failed to store PCI saved state");
7402 return false;
7403 }
7404 } else {
7405 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r);
7406 return false;
7407 }
7408
7409 amdgpu_device_cache_switch_state(adev);
7410
7411 return true;
7412 }
7413
7414 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
7415 {
7416 struct drm_device *dev = pci_get_drvdata(pdev);
7417 struct amdgpu_device *adev = drm_to_adev(dev);
7418 int r;
7419
7420 if (!adev->pci_state)
7421 return false;
7422
7423 r = pci_load_saved_state(pdev, adev->pci_state);
7424
7425 if (!r) {
7426 pci_restore_state(pdev);
7427 } else {
7428 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
7429 return false;
7430 }
7431
7432 return true;
7433 }
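
/*
 * Usage sketch (illustrative only): the cache/load helpers are meant to
 * bracket an ASIC reset so that PCI config space can be restored once the
 * device is accessible again.
 *
 *	amdgpu_device_cache_pci_state(adev->pdev);
 *	// ... perform the reset ...
 *	amdgpu_device_load_pci_state(adev->pdev);
 */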
7434
7435 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
7436 struct amdgpu_ring *ring)
7437 {
7438 #ifdef CONFIG_X86_64
7439 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
7440 return;
7441 #endif
7442 if (adev->gmc.xgmi.connected_to_cpu)
7443 return;
7444
7445 if (ring && ring->funcs->emit_hdp_flush) {
7446 amdgpu_ring_emit_hdp_flush(ring);
7447 return;
7448 }
7449
7450 if (!ring && amdgpu_sriov_runtime(adev)) {
7451 if (!amdgpu_kiq_hdp_flush(adev))
7452 return;
7453 }
7454
7455 amdgpu_hdp_flush(adev, ring);
7456 }
7457
7458 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
7459 struct amdgpu_ring *ring)
7460 {
7461 #ifdef CONFIG_X86_64
7462 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
7463 return;
7464 #endif
7465 if (adev->gmc.xgmi.connected_to_cpu)
7466 return;
7467
7468 amdgpu_hdp_invalidate(adev, ring);
7469 }
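
/*
 * Usage sketch (illustrative only): a CPU-producer/GPU-consumer pattern.
 * Flush the HDP cache after the CPU writes to visible VRAM so the GPU sees
 * the data, and invalidate it before the CPU reads back GPU-written VRAM.
 *
 *	// CPU has just written to a visible-VRAM buffer:
 *	amdgpu_device_flush_hdp(adev, NULL);
 *	// CPU is about to read GPU-written visible VRAM:
 *	amdgpu_device_invalidate_hdp(adev, NULL);
 */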
7470
7471 int amdgpu_in_reset(struct amdgpu_device *adev)
7472 {
7473 return atomic_read(&adev->reset_domain->in_gpu_reset);
7474 }
7475
7476 /**
7477 * amdgpu_device_halt() - bring hardware to some kind of halt state
7478 *
7479 * @adev: amdgpu_device pointer
7480 *
7481 * Bring the hardware to some kind of halt state so that no one can touch it
7482 * any more. This helps preserve the error context when an error occurs.
7483 * Compared to a simple hang, the system stays stable at least for SSH
7484 * access, so it should be trivial to inspect the hardware state and
7485 * see what's going on. Implemented as follows:
7486 *
7487 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
7488 * clears all CPU mappings to the device and disallows remappings through page faults
7489 * 2. amdgpu_irq_disable_all() disables all interrupts
7490 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
7491 * 4. set adev->no_hw_access to avoid potential crashes after step 5
7492 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
7493 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
7494 * flush any in-flight DMA operations
7495 */
7496 void amdgpu_device_halt(struct amdgpu_device *adev)
7497 {
7498 struct pci_dev *pdev = adev->pdev;
7499 struct drm_device *ddev = adev_to_drm(adev);
7500
7501 amdgpu_xcp_dev_unplug(adev);
7502 drm_dev_unplug(ddev);
7503
7504 amdgpu_irq_disable_all(adev);
7505
7506 amdgpu_fence_driver_hw_fini(adev);
7507
7508 adev->no_hw_access = true;
7509
7510 amdgpu_device_unmap_mmio(adev);
7511
7512 pci_disable_device(pdev);
7513 pci_wait_for_pending_transaction(pdev);
7514 }
7515
7516 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
7517 u32 reg)
7518 {
7519 unsigned long flags, address, data;
7520 u32 r;
7521
7522 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7523 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7524
7525 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7526 WREG32(address, reg * 4);
7527 (void)RREG32(address);
7528 r = RREG32(data);
7529 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7530 return r;
7531 }
7532
7533 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
7534 u32 reg, u32 v)
7535 {
7536 unsigned long flags, address, data;
7537
7538 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7539 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7540
7541 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7542 WREG32(address, reg * 4);
7543 (void)RREG32(address);
7544 WREG32(data, v);
7545 (void)RREG32(data);
7546 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7547 }
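
/*
 * Usage sketch (illustrative only; PCIE_EXAMPLE_REG and EXAMPLE_MASK are
 * hypothetical names): the index/data pair above supports a simple
 * read-modify-write of a PCIe port register.
 *
 *	u32 tmp = amdgpu_device_pcie_port_rreg(adev, PCIE_EXAMPLE_REG);
 *	tmp |= EXAMPLE_MASK;
 *	amdgpu_device_pcie_port_wreg(adev, PCIE_EXAMPLE_REG, tmp);
 */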
7548
7549 /**
7550 * amdgpu_device_get_gang - return a reference to the current gang
7551 * @adev: amdgpu_device pointer
7552 *
7553 * Returns: A new reference to the current gang leader.
7554 */
7555 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
7556 {
7557 struct dma_fence *fence;
7558
7559 rcu_read_lock();
7560 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
7561 rcu_read_unlock();
7562 return fence;
7563 }
7564
7565 /**
7566 * amdgpu_device_switch_gang - switch to a new gang
7567 * @adev: amdgpu_device pointer
7568 * @gang: the gang to switch to
7569 *
7570 * Try to switch to a new gang.
7571 * Returns: NULL if we switched to the new gang or a reference to the current
7572 * gang leader.
7573 */
7574 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
7575 struct dma_fence *gang)
7576 {
7577 struct dma_fence *old = NULL;
7578
7579 dma_fence_get(gang);
7580 do {
7581 dma_fence_put(old);
7582 old = amdgpu_device_get_gang(adev);
7583 if (old == gang)
7584 break;
7585
7586 if (!dma_fence_is_signaled(old)) {
7587 dma_fence_put(gang);
7588 return old;
7589 }
7590
7591 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
7592 old, gang) != old);
7593
7594 /*
7595 * Drop it once for the exchanged reference in adev and once for the
7596 * thread local reference acquired in amdgpu_device_get_gang().
7597 */
7598 dma_fence_put(old);
7599 dma_fence_put(old);
7600 return NULL;
7601 }
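
/*
 * Usage sketch (illustrative only): a caller that wants its gang to become
 * the active one must handle the case where the switch is refused because
 * the previous gang has not finished yet.
 *
 *	fence = amdgpu_device_switch_gang(adev, gang_leader);
 *	if (fence) {
 *		// switch refused: treat "fence" as a dependency (or wait
 *		// and retry), then drop the reference
 *		dma_fence_put(fence);
 *	}
 */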
7602
7603 /**
7604 * amdgpu_device_enforce_isolation - enforce HW isolation
7605 * @adev: the amdgpu device pointer
7606 * @ring: the HW ring the job is supposed to run on
7607 * @job: the job which is about to be pushed to the HW ring
7608 *
7609 * Makes sure that only one client at a time can use the GFX block.
7610 * Returns: The dependency to wait on before the job can be pushed to the HW.
7611 * The function is called multiple times until NULL is returned.
7612 */
7613 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
7614 struct amdgpu_ring *ring,
7615 struct amdgpu_job *job)
7616 {
7617 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
7618 struct drm_sched_fence *f = job->base.s_fence;
7619 struct dma_fence *dep;
7620 void *owner;
7621 int r;
7622
7623 /*
7624 * For now enforce isolation only for the GFX block since we only need
7625 * the cleaner shader on those rings.
7626 */
7627 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
7628 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
7629 return NULL;
7630
7631 /*
7632 * All submissions where enforce_isolation is false are handled as if
7633 * they come from a single client. Use ~0l as the owner to distinguish them
7634 * from kernel submissions, where the owner is NULL.
7635 */
7636 owner = job->enforce_isolation ? f->owner : (void *)~0l;
7637
7638 mutex_lock(&adev->enforce_isolation_mutex);
7639
7640 /*
7641 * The "spearhead" submission is the first one which changes the
7642 * ownership to its client. We always need to wait for it to be
7643 * pushed to the HW before proceeding with anything.
7644 */
7645 if (&f->scheduled != isolation->spearhead &&
7646 !dma_fence_is_signaled(isolation->spearhead)) {
7647 dep = isolation->spearhead;
7648 goto out_grab_ref;
7649 }
7650
7651 if (isolation->owner != owner) {
7652
7653 /*
7654 * Wait for any gang to be assembled before switching to a
7655 * different owner, otherwise we could deadlock the
7656 * submissions.
7657 */
7658 if (!job->gang_submit) {
7659 dep = amdgpu_device_get_gang(adev);
7660 if (!dma_fence_is_signaled(dep))
7661 goto out_return_dep;
7662 dma_fence_put(dep);
7663 }
7664
7665 dma_fence_put(isolation->spearhead);
7666 isolation->spearhead = dma_fence_get(&f->scheduled);
7667 amdgpu_sync_move(&isolation->active, &isolation->prev);
7668 trace_amdgpu_isolation(isolation->owner, owner);
7669 isolation->owner = owner;
7670 }
7671
7672 /*
7673 * Specifying the ring here helps to pipeline submissions even when
7674 * isolation is enabled. If that is not desired for testing, NULL can be
7675 * used instead of the ring to enforce a CPU round trip while switching
7676 * between clients.
7677 */
7678 dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
7679 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
7680 if (r)
7681 dev_warn(adev->dev, "OOM tracking isolation\n");
7682
7683 out_grab_ref:
7684 dma_fence_get(dep);
7685 out_return_dep:
7686 mutex_unlock(&adev->enforce_isolation_mutex);
7687 return dep;
7688 }
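
/*
 * Usage sketch (illustrative only): as the kernel-doc above notes, this is
 * meant to be called repeatedly from the job dependency path until it
 * stops returning a fence to wait on.
 *
 *	dep = amdgpu_device_enforce_isolation(adev, ring, job);
 *	if (dep)
 *		return dep;	// scheduler waits on it and calls in again
 *	// NULL: no isolation dependency left, the job may be pushed to the HW
 */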
7689
7690 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
7691 {
7692 switch (adev->asic_type) {
7693 #ifdef CONFIG_DRM_AMDGPU_SI
7694 case CHIP_HAINAN:
7695 #endif
7696 case CHIP_TOPAZ:
7697 /* chips with no display hardware */
7698 return false;
7699 #ifdef CONFIG_DRM_AMDGPU_SI
7700 case CHIP_TAHITI:
7701 case CHIP_PITCAIRN:
7702 case CHIP_VERDE:
7703 case CHIP_OLAND:
7704 #endif
7705 #ifdef CONFIG_DRM_AMDGPU_CIK
7706 case CHIP_BONAIRE:
7707 case CHIP_HAWAII:
7708 case CHIP_KAVERI:
7709 case CHIP_KABINI:
7710 case CHIP_MULLINS:
7711 #endif
7712 case CHIP_TONGA:
7713 case CHIP_FIJI:
7714 case CHIP_POLARIS10:
7715 case CHIP_POLARIS11:
7716 case CHIP_POLARIS12:
7717 case CHIP_VEGAM:
7718 case CHIP_CARRIZO:
7719 case CHIP_STONEY:
7720 /* chips with display hardware */
7721 return true;
7722 default:
7723 /* IP discovery */
7724 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
7725 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
7726 return false;
7727 return true;
7728 }
7729 }
7730
7731 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
7732 uint32_t inst, uint32_t reg_addr, char reg_name[],
7733 uint32_t expected_value, uint32_t mask)
7734 {
7735 uint32_t ret = 0;
7736 uint32_t old_ = 0;
7737 uint32_t tmp_ = RREG32(reg_addr);
7738 uint32_t loop = adev->usec_timeout;
7739
7740 while ((tmp_ & (mask)) != (expected_value)) {
7741 if (old_ != tmp_) {
7742 loop = adev->usec_timeout;
7743 old_ = tmp_;
7744 } else
7745 udelay(1);
7746 tmp_ = RREG32(reg_addr);
7747 loop--;
7748 if (!loop) {
7749 dev_warn(
7750 adev->dev,
7751 "Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
7752 inst, reg_name, (uint32_t)expected_value,
7753 (uint32_t)(tmp_ & (mask)));
7754 ret = -ETIMEDOUT;
7755 break;
7756 }
7757 }
7758 return ret;
7759 }
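
/*
 * Usage sketch (illustrative only; the register name and bit are
 * hypothetical): poll a status register until a READY bit is set.
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, regEXAMPLE_STATUS,
 *				       "EXAMPLE_STATUS",
 *				       EXAMPLE_READY_BIT, EXAMPLE_READY_BIT);
 *	if (r)
 *		return r;	// timed out waiting for the bit
 */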
7760
7761 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
7762 {
7763 ssize_t size = 0;
7764
7765 if (!ring || !ring->adev)
7766 return size;
7767
7768 if (amdgpu_device_should_recover_gpu(ring->adev))
7769 size |= AMDGPU_RESET_TYPE_FULL;
7770
7771 if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
7772 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
7773 size |= AMDGPU_RESET_TYPE_SOFT_RESET;
7774
7775 return size;
7776 }
7777
7778 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
7779 {
7780 ssize_t size = 0;
7781
7782 if (supported_reset == 0) {
7783 size += sysfs_emit_at(buf, size, "unsupported");
7784 size += sysfs_emit_at(buf, size, "\n");
7785 return size;
7786
7787 }
7788
7789 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
7790 size += sysfs_emit_at(buf, size, "soft ");
7791
7792 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
7793 size += sysfs_emit_at(buf, size, "queue ");
7794
7795 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
7796 size += sysfs_emit_at(buf, size, "pipe ");
7797
7798 if (supported_reset & AMDGPU_RESET_TYPE_FULL)
7799 size += sysfs_emit_at(buf, size, "full ");
7800
7801 size += sysfs_emit_at(buf, size, "\n");
7802 return size;
7803 }
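
/*
 * Usage sketch (illustrative only; the show callback and the way the ring
 * is looked up are hypothetical): a sysfs "show" handler can combine the
 * two helpers above to report the reset types a ring supports.
 *
 *	// given a struct amdgpu_ring *ring resolved from the attribute:
 *	return amdgpu_show_reset_mask(buf,
 *			amdgpu_get_soft_full_reset_mask(ring));
 */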
7804
7805 void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
7806 enum amdgpu_uid_type type, uint8_t inst,
7807 uint64_t uid)
7808 {
7809 if (!uid_info)
7810 return;
7811
7812 if (type >= AMDGPU_UID_TYPE_MAX) {
7813 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
7814 type);
7815 return;
7816 }
7817
7818 if (inst >= AMDGPU_UID_INST_MAX) {
7819 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
7820 inst);
7821 return;
7822 }
7823
7824 if (uid_info->uid[type][inst] != 0) {
7825 dev_warn_once(
7826 uid_info->adev->dev,
7827 "Overwriting existing UID %llu for type %d instance %d\n",
7828 uid_info->uid[type][inst], type, inst);
7829 }
7830
7831 uid_info->uid[type][inst] = uid;
7832 }
7833
7834 u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
7835 enum amdgpu_uid_type type, uint8_t inst)
7836 {
7837 if (!uid_info)
7838 return 0;
7839
7840 if (type >= AMDGPU_UID_TYPE_MAX) {
7841 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
7842 type);
7843 return 0;
7844 }
7845
7846 if (inst >= AMDGPU_UID_INST_MAX) {
7847 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
7848 inst);
7849 return 0;
7850 }
7851
7852 return uid_info->uid[type][inst];
7853 }
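
/*
 * Usage sketch (illustrative only; "uid_info", "type" and the instance
 * number are placeholders): a UID recorded once, typically during init,
 * can later be read back from the same slot.
 *
 *	amdgpu_device_set_uid(uid_info, type, 0, uid);
 *	// ... later ...
 *	queried = amdgpu_device_get_uid(uid_info, type, 0);
 */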
7854