1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28
29 #include <linux/aperture.h>
30 #include <linux/power_supply.h>
31 #include <linux/kthread.h>
32 #include <linux/module.h>
33 #include <linux/console.h>
34 #include <linux/slab.h>
35 #include <linux/iommu.h>
36 #include <linux/pci.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_client_event.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_probe_helper.h>
44 #include <drm/amdgpu_drm.h>
45 #include <linux/device.h>
46 #include <linux/vgaarb.h>
47 #include <linux/vga_switcheroo.h>
48 #include <linux/efi.h>
49 #include "amdgpu.h"
50 #include "amdgpu_trace.h"
51 #include "amdgpu_i2c.h"
52 #include "atom.h"
53 #include "amdgpu_atombios.h"
54 #include "amdgpu_atomfirmware.h"
55 #include "amd_pcie.h"
56 #ifdef CONFIG_DRM_AMDGPU_SI
57 #include "si.h"
58 #endif
59 #ifdef CONFIG_DRM_AMDGPU_CIK
60 #include "cik.h"
61 #endif
62 #include "vi.h"
63 #include "soc15.h"
64 #include "nv.h"
65 #include "bif/bif_4_1_d.h"
66 #include <linux/firmware.h>
67 #include "amdgpu_vf_error.h"
68
69 #include "amdgpu_amdkfd.h"
70 #include "amdgpu_pm.h"
71
72 #include "amdgpu_xgmi.h"
73 #include "amdgpu_ras.h"
74 #include "amdgpu_pmu.h"
75 #include "amdgpu_fru_eeprom.h"
76 #include "amdgpu_reset.h"
77 #include "amdgpu_virt.h"
78 #include "amdgpu_dev_coredump.h"
79
80 #include <linux/suspend.h>
81 #include <drm/task_barrier.h>
82 #include <linux/pm_runtime.h>
83
84 #include <drm/drm_drv.h>
85
86 #if IS_ENABLED(CONFIG_X86)
87 #include <asm/intel-family.h>
88 #endif
89
90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
97
98 #define AMDGPU_RESUME_MS 2000
99 #define AMDGPU_MAX_RETRY_LIMIT 2
100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
104
105 static const struct drm_driver amdgpu_kms_driver;
106
107 const char *amdgpu_asic_name[] = {
108 "TAHITI",
109 "PITCAIRN",
110 "VERDE",
111 "OLAND",
112 "HAINAN",
113 "BONAIRE",
114 "KAVERI",
115 "KABINI",
116 "HAWAII",
117 "MULLINS",
118 "TOPAZ",
119 "TONGA",
120 "FIJI",
121 "CARRIZO",
122 "STONEY",
123 "POLARIS10",
124 "POLARIS11",
125 "POLARIS12",
126 "VEGAM",
127 "VEGA10",
128 "VEGA12",
129 "VEGA20",
130 "RAVEN",
131 "ARCTURUS",
132 "RENOIR",
133 "ALDEBARAN",
134 "NAVI10",
135 "CYAN_SKILLFISH",
136 "NAVI14",
137 "NAVI12",
138 "SIENNA_CICHLID",
139 "NAVY_FLOUNDER",
140 "VANGOGH",
141 "DIMGREY_CAVEFISH",
142 "BEIGE_GOBY",
143 "YELLOW_CARP",
144 "IP DISCOVERY",
145 "LAST",
146 };
147
148 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
149 /*
150 * Default init level where all blocks are expected to be initialized. This is
151 * the level of initialization expected by default and also after a full reset
152 * of the device.
153 */
154 struct amdgpu_init_level amdgpu_init_default = {
155 .level = AMDGPU_INIT_LEVEL_DEFAULT,
156 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
157 };
158
159 struct amdgpu_init_level amdgpu_init_recovery = {
160 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
161 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
162 };
163
164 /*
165 * Minimal blocks needed to be initialized before an XGMI hive can be reset. This
166 * is used for cases like reset on initialization where the entire hive needs to
167 * be reset before first use.
168 */
169 struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
170 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
171 .hwini_ip_block_mask =
172 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
173 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
174 BIT(AMD_IP_BLOCK_TYPE_PSP)
175 };
176
177 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
178 enum amd_ip_block_type block)
179 {
180 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
181 }
182
183 void amdgpu_set_init_level(struct amdgpu_device *adev,
184 enum amdgpu_init_lvl_id lvl)
185 {
186 switch (lvl) {
187 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
188 adev->init_lvl = &amdgpu_init_minimal_xgmi;
189 break;
190 case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
191 adev->init_lvl = &amdgpu_init_recovery;
192 break;
193 case AMDGPU_INIT_LEVEL_DEFAULT:
194 fallthrough;
195 default:
196 adev->init_lvl = &amdgpu_init_default;
197 break;
198 }
199 }
200
201 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
202 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
203 void *data);
204
205 /**
206 * DOC: pcie_replay_count
207 *
208 * The amdgpu driver provides a sysfs API for reporting the total number
209 * of PCIe replays (NAKs).
210 * The file pcie_replay_count is used for this and returns the total
211 * number of replays as a sum of the NAKs generated and NAKs received.
212 */
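/*
 * Illustrative sketch (not part of the driver): user space would typically
 * read this attribute through sysfs. The card index in the path below is an
 * assumption and depends on the system.
 *
 *   FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *   unsigned long long replays;
 *
 *   if (f) {
 *           if (fscanf(f, "%llu", &replays) == 1)
 *                   printf("PCIe replays: %llu\n", replays);
 *           fclose(f);
 *   }
 */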
213
214 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
215 struct device_attribute *attr, char *buf)
216 {
217 struct drm_device *ddev = dev_get_drvdata(dev);
218 struct amdgpu_device *adev = drm_to_adev(ddev);
219 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
220
221 return sysfs_emit(buf, "%llu\n", cnt);
222 }
223
224 static DEVICE_ATTR(pcie_replay_count, 0444,
225 amdgpu_device_get_pcie_replay_count, NULL);
226
227 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
228 struct bin_attribute *attr, char *buf,
229 loff_t ppos, size_t count)
230 {
231 struct device *dev = kobj_to_dev(kobj);
232 struct drm_device *ddev = dev_get_drvdata(dev);
233 struct amdgpu_device *adev = drm_to_adev(ddev);
234 ssize_t bytes_read;
235
236 switch (ppos) {
237 case AMDGPU_SYS_REG_STATE_XGMI:
238 bytes_read = amdgpu_asic_get_reg_state(
239 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
240 break;
241 case AMDGPU_SYS_REG_STATE_WAFL:
242 bytes_read = amdgpu_asic_get_reg_state(
243 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
244 break;
245 case AMDGPU_SYS_REG_STATE_PCIE:
246 bytes_read = amdgpu_asic_get_reg_state(
247 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
248 break;
249 case AMDGPU_SYS_REG_STATE_USR:
250 bytes_read = amdgpu_asic_get_reg_state(
251 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
252 break;
253 case AMDGPU_SYS_REG_STATE_USR_1:
254 bytes_read = amdgpu_asic_get_reg_state(
255 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
256 break;
257 default:
258 return -EINVAL;
259 }
260
261 return bytes_read;
262 }
263
264 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
265 AMDGPU_SYS_REG_STATE_END);
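/*
 * Illustrative sketch (not part of the driver): the reg_state binary
 * attribute selects the register-state type by file offset, so a user space
 * reader would issue a pread() at one of the AMDGPU_SYS_REG_STATE_* offsets.
 * The sysfs path and buffer size below are assumptions.
 *
 *   char buf[4096];
 *   int fd = open("/sys/class/drm/card0/device/reg_state", O_RDONLY);
 *   ssize_t n;
 *
 *   if (fd >= 0) {
 *           n = pread(fd, buf, sizeof(buf), AMDGPU_SYS_REG_STATE_XGMI);
 *           close(fd);
 *   }
 */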
266
267 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
268 {
269 int ret;
270
271 if (!amdgpu_asic_get_reg_state_supported(adev))
272 return 0;
273
274 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
275
276 return ret;
277 }
278
279 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
280 {
281 if (!amdgpu_asic_get_reg_state_supported(adev))
282 return;
283 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
284 }
285
286 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
287 {
288 int r;
289
290 if (ip_block->version->funcs->suspend) {
291 r = ip_block->version->funcs->suspend(ip_block);
292 if (r) {
293 dev_err(ip_block->adev->dev,
294 "suspend of IP block <%s> failed %d\n",
295 ip_block->version->funcs->name, r);
296 return r;
297 }
298 }
299
300 ip_block->status.hw = false;
301 return 0;
302 }
303
304 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
305 {
306 int r;
307
308 if (ip_block->version->funcs->resume) {
309 r = ip_block->version->funcs->resume(ip_block);
310 if (r) {
311 dev_err(ip_block->adev->dev,
312 "resume of IP block <%s> failed %d\n",
313 ip_block->version->funcs->name, r);
314 return r;
315 }
316 }
317
318 ip_block->status.hw = true;
319 return 0;
320 }
321
322 /**
323 * DOC: board_info
324 *
325 * The amdgpu driver provides a sysfs API for giving board related information.
326 * It provides the form factor information in the format
327 *
328 * type : form factor
329 *
330 * Possible form factor values
331 *
332 * - "cem" - PCIE CEM card
333 * - "oam" - Open Compute Accelerator Module
334 * - "unknown" - Not known
335 *
336 */
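/*
 * Illustrative sketch (not part of the driver): reading the attribute yields
 * a single "type : <form factor>" line, e.g. "type : oam" on an OAM board.
 * The sysfs path below is an assumption.
 *
 *   char line[32] = "";
 *   FILE *f = fopen("/sys/class/drm/card0/device/board_info", "r");
 *
 *   if (f) {
 *           if (fgets(line, sizeof(line), f))
 *                   printf("%s", line);   // e.g. "type : cem"
 *           fclose(f);
 *   }
 */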
337
338 static ssize_t amdgpu_device_get_board_info(struct device *dev,
339 struct device_attribute *attr,
340 char *buf)
341 {
342 struct drm_device *ddev = dev_get_drvdata(dev);
343 struct amdgpu_device *adev = drm_to_adev(ddev);
344 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
345 const char *pkg;
346
347 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
348 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
349
350 switch (pkg_type) {
351 case AMDGPU_PKG_TYPE_CEM:
352 pkg = "cem";
353 break;
354 case AMDGPU_PKG_TYPE_OAM:
355 pkg = "oam";
356 break;
357 default:
358 pkg = "unknown";
359 break;
360 }
361
362 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
363 }
364
365 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
366
367 static struct attribute *amdgpu_board_attrs[] = {
368 &dev_attr_board_info.attr,
369 NULL,
370 };
371
372 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
373 struct attribute *attr, int n)
374 {
375 struct device *dev = kobj_to_dev(kobj);
376 struct drm_device *ddev = dev_get_drvdata(dev);
377 struct amdgpu_device *adev = drm_to_adev(ddev);
378
379 if (adev->flags & AMD_IS_APU)
380 return 0;
381
382 return attr->mode;
383 }
384
385 static const struct attribute_group amdgpu_board_attrs_group = {
386 .attrs = amdgpu_board_attrs,
387 .is_visible = amdgpu_board_attrs_is_visible
388 };
389
390 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
391
392
393 /**
394 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
395 *
396 * @dev: drm_device pointer
397 *
398 * Returns true if the device is a dGPU with ATPX power control,
399 * otherwise return false.
400 */
401 bool amdgpu_device_supports_px(struct drm_device *dev)
402 {
403 struct amdgpu_device *adev = drm_to_adev(dev);
404
405 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
406 return true;
407 return false;
408 }
409
410 /**
411 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
412 *
413 * @dev: drm_device pointer
414 *
415 * Returns true if the device is a dGPU with ACPI power control,
416 * otherwise return false.
417 */
418 bool amdgpu_device_supports_boco(struct drm_device *dev)
419 {
420 struct amdgpu_device *adev = drm_to_adev(dev);
421
422 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
423 return false;
424
425 if (adev->has_pr3 ||
426 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
427 return true;
428 return false;
429 }
430
431 /**
432 * amdgpu_device_supports_baco - Does the device support BACO
433 *
434 * @dev: drm_device pointer
435 *
436 * Return:
437 * 1 if the device supports BACO;
438 * 3 if the device supports MACO (only works if BACO is supported)
439 * otherwise return 0.
440 */
441 int amdgpu_device_supports_baco(struct drm_device *dev)
442 {
443 struct amdgpu_device *adev = drm_to_adev(dev);
444
445 return amdgpu_asic_supports_baco(adev);
446 }
447
448 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
449 {
450 struct drm_device *dev;
451 int bamaco_support;
452
453 dev = adev_to_drm(adev);
454
455 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
456 bamaco_support = amdgpu_device_supports_baco(dev);
457
458 switch (amdgpu_runtime_pm) {
459 case 2:
460 if (bamaco_support & MACO_SUPPORT) {
461 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
462 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
463 } else if (bamaco_support == BACO_SUPPORT) {
464 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
465 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
466 }
467 break;
468 case 1:
469 if (bamaco_support & BACO_SUPPORT) {
470 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
471 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
472 }
473 break;
474 case -1:
475 case -2:
476 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
477 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
478 dev_info(adev->dev, "Using ATPX for runtime pm\n");
479 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
480 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
481 dev_info(adev->dev, "Using BOCO for runtime pm\n");
482 } else {
483 if (!bamaco_support)
484 goto no_runtime_pm;
485
486 switch (adev->asic_type) {
487 case CHIP_VEGA20:
488 case CHIP_ARCTURUS:
489 /* BACO is not supported on vega20 and arcturus */
490 break;
491 case CHIP_VEGA10:
492 /* enable BACO as runpm mode if noretry=0 */
493 if (!adev->gmc.noretry)
494 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
495 break;
496 default:
497 /* enable BACO as runpm mode on CI+ */
498 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
499 break;
500 }
501
502 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
503 if (bamaco_support & MACO_SUPPORT) {
504 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
505 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
506 } else {
507 dev_info(adev->dev, "Using BACO for runtime pm\n");
508 }
509 }
510 }
511 break;
512 case 0:
513 dev_info(adev->dev, "runtime pm is manually disabled\n");
514 break;
515 default:
516 break;
517 }
518
519 no_runtime_pm:
520 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
521 dev_info(adev->dev, "Runtime PM not available\n");
522 }
523 /**
524 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
525 * Smart Shift support
526 *
527 * @dev: drm_device pointer
528 *
529 * Returns true if the device is a dGPU with Smart Shift support,
530 * otherwise returns false.
531 */
532 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
533 {
534 return (amdgpu_device_supports_boco(dev) &&
535 amdgpu_acpi_is_power_shift_control_supported());
536 }
537
538 /*
539 * VRAM access helper functions
540 */
541
542 /**
543 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
544 *
545 * @adev: amdgpu_device pointer
546 * @pos: offset of the buffer in vram
547 * @buf: virtual address of the buffer in system memory
548 * @size: read/write size; the buffer at @buf must be at least @size bytes
549 * @write: true - write to vram, otherwise - read from vram
550 */
551 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
552 void *buf, size_t size, bool write)
553 {
554 unsigned long flags;
555 uint32_t hi = ~0, tmp = 0;
556 uint32_t *data = buf;
557 uint64_t last;
558 int idx;
559
560 if (!drm_dev_enter(adev_to_drm(adev), &idx))
561 return;
562
563 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
564
565 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
566 for (last = pos + size; pos < last; pos += 4) {
567 tmp = pos >> 31;
568
569 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
570 if (tmp != hi) {
571 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
572 hi = tmp;
573 }
574 if (write)
575 WREG32_NO_KIQ(mmMM_DATA, *data++);
576 else
577 *data++ = RREG32_NO_KIQ(mmMM_DATA);
578 }
579
580 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
581 drm_dev_exit(idx);
582 }
583
584 /**
585 * amdgpu_device_aper_access - access vram by vram aperture
586 *
587 * @adev: amdgpu_device pointer
588 * @pos: offset of the buffer in vram
589 * @buf: virtual address of the buffer in system memory
590 * @size: read/write size; the buffer at @buf must be at least @size bytes
591 * @write: true - write to vram, otherwise - read from vram
592 *
593 * The return value means how many bytes have been transferred.
594 */
595 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
596 void *buf, size_t size, bool write)
597 {
598 #ifdef CONFIG_64BIT
599 void __iomem *addr;
600 size_t count = 0;
601 uint64_t last;
602
603 if (!adev->mman.aper_base_kaddr)
604 return 0;
605
606 last = min(pos + size, adev->gmc.visible_vram_size);
607 if (last > pos) {
608 addr = adev->mman.aper_base_kaddr + pos;
609 count = last - pos;
610
611 if (write) {
612 memcpy_toio(addr, buf, count);
613 /* Make sure the HDP write cache flush happens without any reordering
614 * after the system memory contents are sent over PCIe to the device
615 */
616 mb();
617 amdgpu_device_flush_hdp(adev, NULL);
618 } else {
619 amdgpu_device_invalidate_hdp(adev, NULL);
620 /* Make sure HDP read cache is invalidated before issuing a read
621 * to the PCIe device
622 */
623 mb();
624 memcpy_fromio(buf, addr, count);
625 }
626
627 }
628
629 return count;
630 #else
631 return 0;
632 #endif
633 }
634
635 /**
636 * amdgpu_device_vram_access - read/write a buffer in vram
637 *
638 * @adev: amdgpu_device pointer
639 * @pos: offset of the buffer in vram
640 * @buf: virtual address of the buffer in system memory
641 * @size: read/write size; the buffer at @buf must be at least @size bytes
642 * @write: true - write to vram, otherwise - read from vram
643 */
644 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
645 void *buf, size_t size, bool write)
646 {
647 size_t count;
648
649 /* try using the VRAM aperture to access VRAM first */
650 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
651 size -= count;
652 if (size) {
653 /* use MM_INDEX/MM_DATA to access the rest of VRAM */
654 pos += count;
655 buf += count;
656 amdgpu_device_mm_access(adev, pos, buf, size, write);
657 }
658 }
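/*
 * Illustrative sketch (not part of the driver): a caller that wants to peek
 * at a dword in VRAM would do something like the following; vram_offset is a
 * hypothetical byte offset into VRAM.
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, vram_offset, &val, sizeof(val), false);
 *
 * The helper transparently uses the CPU-visible aperture when the offset is
 * inside visible VRAM and falls back to MM_INDEX/MM_DATA for the remainder.
 */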
659
660 /*
661 * register access helper functions.
662 */
663
664 /* Check if hw access should be skipped because of hotplug or device error */
665 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
666 {
667 if (adev->no_hw_access)
668 return true;
669
670 #ifdef CONFIG_LOCKDEP
671 /*
672 * This is a bit complicated to understand, so worth a comment. What we assert
673 * here is that the GPU reset is not running on another thread in parallel.
674 *
675 * For this we trylock the read side of the reset semaphore, if that succeeds
676 * we know that the reset is not running in parallel.
677 *
678 * If the trylock fails we assert that we are either already holding the read
679 * side of the lock or are the reset thread itself and hold the write side of
680 * the lock.
681 */
682 if (in_task()) {
683 if (down_read_trylock(&adev->reset_domain->sem))
684 up_read(&adev->reset_domain->sem);
685 else
686 lockdep_assert_held(&adev->reset_domain->sem);
687 }
688 #endif
689 return false;
690 }
691
692 /**
693 * amdgpu_device_rreg - read a memory mapped IO or indirect register
694 *
695 * @adev: amdgpu_device pointer
696 * @reg: dword aligned register offset
697 * @acc_flags: access flags which require special behavior
698 *
699 * Returns the 32 bit value from the offset specified.
700 */
701 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
702 uint32_t reg, uint32_t acc_flags)
703 {
704 uint32_t ret;
705
706 if (amdgpu_device_skip_hw_access(adev))
707 return 0;
708
709 if ((reg * 4) < adev->rmmio_size) {
710 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
711 amdgpu_sriov_runtime(adev) &&
712 down_read_trylock(&adev->reset_domain->sem)) {
713 ret = amdgpu_kiq_rreg(adev, reg, 0);
714 up_read(&adev->reset_domain->sem);
715 } else {
716 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
717 }
718 } else {
719 ret = adev->pcie_rreg(adev, reg * 4);
720 }
721
722 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
723
724 return ret;
725 }
726
727 /*
728 * MMIO register byte read helper functions
729 * @offset: byte offset from MMIO start
730 */
731
732 /**
733 * amdgpu_mm_rreg8 - read a memory mapped IO register
734 *
735 * @adev: amdgpu_device pointer
736 * @offset: byte aligned register offset
737 *
738 * Returns the 8 bit value from the offset specified.
739 */
740 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
741 {
742 if (amdgpu_device_skip_hw_access(adev))
743 return 0;
744
745 if (offset < adev->rmmio_size)
746 return (readb(adev->rmmio + offset));
747 BUG();
748 }
749
750
751 /**
752 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
753 *
754 * @adev: amdgpu_device pointer
755 * @reg: dword aligned register offset
756 * @acc_flags: access flags which require special behavior
757 * @xcc_id: xcc accelerated compute core id
758 *
759 * Returns the 32 bit value from the offset specified.
760 */
761 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
762 uint32_t reg, uint32_t acc_flags,
763 uint32_t xcc_id)
764 {
765 uint32_t ret, rlcg_flag;
766
767 if (amdgpu_device_skip_hw_access(adev))
768 return 0;
769
770 if ((reg * 4) < adev->rmmio_size) {
771 if (amdgpu_sriov_vf(adev) &&
772 !amdgpu_sriov_runtime(adev) &&
773 adev->gfx.rlc.rlcg_reg_access_supported &&
774 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
775 GC_HWIP, false,
776 &rlcg_flag)) {
777 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
778 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
779 amdgpu_sriov_runtime(adev) &&
780 down_read_trylock(&adev->reset_domain->sem)) {
781 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
782 up_read(&adev->reset_domain->sem);
783 } else {
784 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
785 }
786 } else {
787 ret = adev->pcie_rreg(adev, reg * 4);
788 }
789
790 return ret;
791 }
792
793 /*
794 * MMIO register byte write helper functions
795 * @offset: byte offset from MMIO start
796 * @value: the value to be written to the register
797 */
798
799 /**
800 * amdgpu_mm_wreg8 - write a memory mapped IO register
801 *
802 * @adev: amdgpu_device pointer
803 * @offset: byte aligned register offset
804 * @value: 8 bit value to write
805 *
806 * Writes the value specified to the offset specified.
807 */
808 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
809 {
810 if (amdgpu_device_skip_hw_access(adev))
811 return;
812
813 if (offset < adev->rmmio_size)
814 writeb(value, adev->rmmio + offset);
815 else
816 BUG();
817 }
818
819 /**
820 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
821 *
822 * @adev: amdgpu_device pointer
823 * @reg: dword aligned register offset
824 * @v: 32 bit value to write to the register
825 * @acc_flags: access flags which require special behavior
826 *
827 * Writes the value specified to the offset specified.
828 */
829 void amdgpu_device_wreg(struct amdgpu_device *adev,
830 uint32_t reg, uint32_t v,
831 uint32_t acc_flags)
832 {
833 if (amdgpu_device_skip_hw_access(adev))
834 return;
835
836 if ((reg * 4) < adev->rmmio_size) {
837 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
838 amdgpu_sriov_runtime(adev) &&
839 down_read_trylock(&adev->reset_domain->sem)) {
840 amdgpu_kiq_wreg(adev, reg, v, 0);
841 up_read(&adev->reset_domain->sem);
842 } else {
843 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
844 }
845 } else {
846 adev->pcie_wreg(adev, reg * 4, v);
847 }
848
849 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
850 }
851
852 /**
853 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
854 *
855 * @adev: amdgpu_device pointer
856 * @reg: mmio/rlc register
857 * @v: value to write
858 * @xcc_id: xcc accelerated compute core id
859 *
860 * This function is invoked only for debugfs register access
861 */
862 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
863 uint32_t reg, uint32_t v,
864 uint32_t xcc_id)
865 {
866 if (amdgpu_device_skip_hw_access(adev))
867 return;
868
869 if (amdgpu_sriov_fullaccess(adev) &&
870 adev->gfx.rlc.funcs &&
871 adev->gfx.rlc.funcs->is_rlcg_access_range) {
872 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
873 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
874 } else if ((reg * 4) >= adev->rmmio_size) {
875 adev->pcie_wreg(adev, reg * 4, v);
876 } else {
877 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
878 }
879 }
880
881 /**
882 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
883 *
884 * @adev: amdgpu_device pointer
885 * @reg: dword aligned register offset
886 * @v: 32 bit value to write to the register
887 * @acc_flags: access flags which require special behavior
888 * @xcc_id: xcc accelerated compute core id
889 *
890 * Writes the value specified to the offset specified.
891 */
892 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
893 uint32_t reg, uint32_t v,
894 uint32_t acc_flags, uint32_t xcc_id)
895 {
896 uint32_t rlcg_flag;
897
898 if (amdgpu_device_skip_hw_access(adev))
899 return;
900
901 if ((reg * 4) < adev->rmmio_size) {
902 if (amdgpu_sriov_vf(adev) &&
903 !amdgpu_sriov_runtime(adev) &&
904 adev->gfx.rlc.rlcg_reg_access_supported &&
905 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
906 GC_HWIP, true,
907 &rlcg_flag)) {
908 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
909 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
910 amdgpu_sriov_runtime(adev) &&
911 down_read_trylock(&adev->reset_domain->sem)) {
912 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
913 up_read(&adev->reset_domain->sem);
914 } else {
915 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
916 }
917 } else {
918 adev->pcie_wreg(adev, reg * 4, v);
919 }
920 }
921
922 /**
923 * amdgpu_device_indirect_rreg - read an indirect register
924 *
925 * @adev: amdgpu_device pointer
926 * @reg_addr: indirect register address to read from
927 *
928 * Returns the value of indirect register @reg_addr
929 */
930 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
931 u32 reg_addr)
932 {
933 unsigned long flags, pcie_index, pcie_data;
934 void __iomem *pcie_index_offset;
935 void __iomem *pcie_data_offset;
936 u32 r;
937
938 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
939 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
940
941 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
942 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
943 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
944
945 writel(reg_addr, pcie_index_offset);
946 readl(pcie_index_offset);
947 r = readl(pcie_data_offset);
948 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
949
950 return r;
951 }
952
953 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
954 u64 reg_addr)
955 {
956 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
957 u32 r;
958 void __iomem *pcie_index_offset;
959 void __iomem *pcie_index_hi_offset;
960 void __iomem *pcie_data_offset;
961
962 if (unlikely(!adev->nbio.funcs)) {
963 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
964 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
965 } else {
966 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
967 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
968 }
969
970 if (reg_addr >> 32) {
971 if (unlikely(!adev->nbio.funcs))
972 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
973 else
974 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
975 } else {
976 pcie_index_hi = 0;
977 }
978
979 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
980 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
981 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
982 if (pcie_index_hi != 0)
983 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
984 pcie_index_hi * 4;
985
986 writel(reg_addr, pcie_index_offset);
987 readl(pcie_index_offset);
988 if (pcie_index_hi != 0) {
989 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
990 readl(pcie_index_hi_offset);
991 }
992 r = readl(pcie_data_offset);
993
994 /* clear the high bits */
995 if (pcie_index_hi != 0) {
996 writel(0, pcie_index_hi_offset);
997 readl(pcie_index_hi_offset);
998 }
999
1000 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1001
1002 return r;
1003 }
1004
1005 /**
1006 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
1007 *
1008 * @adev: amdgpu_device pointer
1009 * @reg_addr: indirect register address to read from
1010 *
1011 * Returns the value of indirect register @reg_addr
1012 */
1013 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1014 u32 reg_addr)
1015 {
1016 unsigned long flags, pcie_index, pcie_data;
1017 void __iomem *pcie_index_offset;
1018 void __iomem *pcie_data_offset;
1019 u64 r;
1020
1021 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1022 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1023
1024 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1025 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1026 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1027
1028 /* read low 32 bits */
1029 writel(reg_addr, pcie_index_offset);
1030 readl(pcie_index_offset);
1031 r = readl(pcie_data_offset);
1032 /* read high 32 bits */
1033 writel(reg_addr + 4, pcie_index_offset);
1034 readl(pcie_index_offset);
1035 r |= ((u64)readl(pcie_data_offset) << 32);
1036 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1037
1038 return r;
1039 }
1040
1041 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1042 u64 reg_addr)
1043 {
1044 unsigned long flags, pcie_index, pcie_data;
1045 unsigned long pcie_index_hi = 0;
1046 void __iomem *pcie_index_offset;
1047 void __iomem *pcie_index_hi_offset;
1048 void __iomem *pcie_data_offset;
1049 u64 r;
1050
1051 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1052 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1053 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1054 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1055
1056 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1057 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1058 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1059 if (pcie_index_hi != 0)
1060 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1061 pcie_index_hi * 4;
1062
1063 /* read low 32 bits */
1064 writel(reg_addr, pcie_index_offset);
1065 readl(pcie_index_offset);
1066 if (pcie_index_hi != 0) {
1067 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1068 readl(pcie_index_hi_offset);
1069 }
1070 r = readl(pcie_data_offset);
1071 /* read high 32 bits */
1072 writel(reg_addr + 4, pcie_index_offset);
1073 readl(pcie_index_offset);
1074 if (pcie_index_hi != 0) {
1075 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1076 readl(pcie_index_hi_offset);
1077 }
1078 r |= ((u64)readl(pcie_data_offset) << 32);
1079
1080 /* clear the high bits */
1081 if (pcie_index_hi != 0) {
1082 writel(0, pcie_index_hi_offset);
1083 readl(pcie_index_hi_offset);
1084 }
1085
1086 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1087
1088 return r;
1089 }
1090
1091 /**
1092 * amdgpu_device_indirect_wreg - write an indirect register
1093 *
1094 * @adev: amdgpu_device pointer
1095 * @reg_addr: indirect register offset
1096 * @reg_data: indirect register data
1097 *
1098 */
1099 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1100 u32 reg_addr, u32 reg_data)
1101 {
1102 unsigned long flags, pcie_index, pcie_data;
1103 void __iomem *pcie_index_offset;
1104 void __iomem *pcie_data_offset;
1105
1106 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1107 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1108
1109 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1110 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1111 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1112
1113 writel(reg_addr, pcie_index_offset);
1114 readl(pcie_index_offset);
1115 writel(reg_data, pcie_data_offset);
1116 readl(pcie_data_offset);
1117 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1118 }
1119
1120 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1121 u64 reg_addr, u32 reg_data)
1122 {
1123 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1124 void __iomem *pcie_index_offset;
1125 void __iomem *pcie_index_hi_offset;
1126 void __iomem *pcie_data_offset;
1127
1128 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1129 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1130 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1131 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1132 else
1133 pcie_index_hi = 0;
1134
1135 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1136 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1137 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1138 if (pcie_index_hi != 0)
1139 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1140 pcie_index_hi * 4;
1141
1142 writel(reg_addr, pcie_index_offset);
1143 readl(pcie_index_offset);
1144 if (pcie_index_hi != 0) {
1145 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1146 readl(pcie_index_hi_offset);
1147 }
1148 writel(reg_data, pcie_data_offset);
1149 readl(pcie_data_offset);
1150
1151 /* clear the high bits */
1152 if (pcie_index_hi != 0) {
1153 writel(0, pcie_index_hi_offset);
1154 readl(pcie_index_hi_offset);
1155 }
1156
1157 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1158 }
1159
1160 /**
1161 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
1162 *
1163 * @adev: amdgpu_device pointer
1164 * @reg_addr: indirect register offset
1165 * @reg_data: indirect register data
1166 *
1167 */
1168 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1169 u32 reg_addr, u64 reg_data)
1170 {
1171 unsigned long flags, pcie_index, pcie_data;
1172 void __iomem *pcie_index_offset;
1173 void __iomem *pcie_data_offset;
1174
1175 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1176 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1177
1178 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1179 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1180 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1181
1182 /* write low 32 bits */
1183 writel(reg_addr, pcie_index_offset);
1184 readl(pcie_index_offset);
1185 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1186 readl(pcie_data_offset);
1187 /* write high 32 bits */
1188 writel(reg_addr + 4, pcie_index_offset);
1189 readl(pcie_index_offset);
1190 writel((u32)(reg_data >> 32), pcie_data_offset);
1191 readl(pcie_data_offset);
1192 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1193 }
1194
1195 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1196 u64 reg_addr, u64 reg_data)
1197 {
1198 unsigned long flags, pcie_index, pcie_data;
1199 unsigned long pcie_index_hi = 0;
1200 void __iomem *pcie_index_offset;
1201 void __iomem *pcie_index_hi_offset;
1202 void __iomem *pcie_data_offset;
1203
1204 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1205 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1206 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1207 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1208
1209 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1210 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1211 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1212 if (pcie_index_hi != 0)
1213 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1214 pcie_index_hi * 4;
1215
1216 /* write low 32 bits */
1217 writel(reg_addr, pcie_index_offset);
1218 readl(pcie_index_offset);
1219 if (pcie_index_hi != 0) {
1220 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1221 readl(pcie_index_hi_offset);
1222 }
1223 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1224 readl(pcie_data_offset);
1225 /* write high 32 bits */
1226 writel(reg_addr + 4, pcie_index_offset);
1227 readl(pcie_index_offset);
1228 if (pcie_index_hi != 0) {
1229 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1230 readl(pcie_index_hi_offset);
1231 }
1232 writel((u32)(reg_data >> 32), pcie_data_offset);
1233 readl(pcie_data_offset);
1234
1235 /* clear the high bits */
1236 if (pcie_index_hi != 0) {
1237 writel(0, pcie_index_hi_offset);
1238 readl(pcie_index_hi_offset);
1239 }
1240
1241 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1242 }
1243
1244 /**
1245 * amdgpu_device_get_rev_id - query device rev_id
1246 *
1247 * @adev: amdgpu_device pointer
1248 *
1249 * Return device rev_id
1250 */
1251 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1252 {
1253 return adev->nbio.funcs->get_rev_id(adev);
1254 }
1255
1256 /**
1257 * amdgpu_invalid_rreg - dummy reg read function
1258 *
1259 * @adev: amdgpu_device pointer
1260 * @reg: offset of register
1261 *
1262 * Dummy register read function. Used for register blocks
1263 * that certain asics don't have (all asics).
1264 * Returns the value in the register.
1265 */
1266 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1267 {
1268 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1269 BUG();
1270 return 0;
1271 }
1272
1273 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1274 {
1275 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1276 BUG();
1277 return 0;
1278 }
1279
1280 /**
1281 * amdgpu_invalid_wreg - dummy reg write function
1282 *
1283 * @adev: amdgpu_device pointer
1284 * @reg: offset of register
1285 * @v: value to write to the register
1286 *
1287 * Dummy register write function. Used for register blocks
1288 * that certain asics don't have (all asics).
1289 */
1290 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1291 {
1292 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1293 reg, v);
1294 BUG();
1295 }
1296
1297 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1298 {
1299 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1300 reg, v);
1301 BUG();
1302 }
1303
1304 /**
1305 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1306 *
1307 * @adev: amdgpu_device pointer
1308 * @reg: offset of register
1309 *
1310 * Dummy register read function. Used for register blocks
1311 * that certain asics don't have (all asics).
1312 * Returns the value in the register.
1313 */
1314 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1315 {
1316 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1317 BUG();
1318 return 0;
1319 }
1320
1321 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1322 {
1323 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1324 BUG();
1325 return 0;
1326 }
1327
1328 /**
1329 * amdgpu_invalid_wreg64 - dummy reg write function
1330 *
1331 * @adev: amdgpu_device pointer
1332 * @reg: offset of register
1333 * @v: value to write to the register
1334 *
1335 * Dummy register write function. Used for register blocks
1336 * that certain asics don't have (all asics).
1337 */
1338 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1339 {
1340 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1341 reg, v);
1342 BUG();
1343 }
1344
1345 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1346 {
1347 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1348 reg, v);
1349 BUG();
1350 }
1351
1352 /**
1353 * amdgpu_block_invalid_rreg - dummy reg read function
1354 *
1355 * @adev: amdgpu_device pointer
1356 * @block: offset of instance
1357 * @reg: offset of register
1358 *
1359 * Dummy register read function. Used for register blocks
1360 * that certain asics don't have (all asics).
1361 * Returns the value in the register.
1362 */
1363 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1364 uint32_t block, uint32_t reg)
1365 {
1366 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1367 reg, block);
1368 BUG();
1369 return 0;
1370 }
1371
1372 /**
1373 * amdgpu_block_invalid_wreg - dummy reg write function
1374 *
1375 * @adev: amdgpu_device pointer
1376 * @block: offset of instance
1377 * @reg: offset of register
1378 * @v: value to write to the register
1379 *
1380 * Dummy register write function. Used for register blocks
1381 * that certain asics don't have (all asics).
1382 */
1383 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1384 uint32_t block,
1385 uint32_t reg, uint32_t v)
1386 {
1387 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1388 reg, block, v);
1389 BUG();
1390 }
1391
1392 /**
1393 * amdgpu_device_asic_init - Wrapper for atom asic_init
1394 *
1395 * @adev: amdgpu_device pointer
1396 *
1397 * Does any asic specific work and then calls atom asic init.
1398 */
1399 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1400 {
1401 int ret;
1402
1403 amdgpu_asic_pre_asic_init(adev);
1404
1405 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1406 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1407 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
1408 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1409 amdgpu_psp_wait_for_bootloader(adev);
1410 ret = amdgpu_atomfirmware_asic_init(adev, true);
1411 return ret;
1412 } else {
1413 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1414 }
1415
1416 return 0;
1417 }
1418
1419 /**
1420 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1421 *
1422 * @adev: amdgpu_device pointer
1423 *
1424 * Allocates a scratch page of VRAM for use by various things in the
1425 * driver.
1426 */
1427 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1428 {
1429 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1430 AMDGPU_GEM_DOMAIN_VRAM |
1431 AMDGPU_GEM_DOMAIN_GTT,
1432 &adev->mem_scratch.robj,
1433 &adev->mem_scratch.gpu_addr,
1434 (void **)&adev->mem_scratch.ptr);
1435 }
1436
1437 /**
1438 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1439 *
1440 * @adev: amdgpu_device pointer
1441 *
1442 * Frees the VRAM scratch page.
1443 */
1444 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1445 {
1446 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1447 }
1448
1449 /**
1450 * amdgpu_device_program_register_sequence - program an array of registers.
1451 *
1452 * @adev: amdgpu_device pointer
1453 * @registers: pointer to the register array
1454 * @array_size: size of the register array
1455 *
1456 * Programs an array of registers with AND/OR masks.
1457 * This is a helper for setting golden registers.
1458 */
1459 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1460 const u32 *registers,
1461 const u32 array_size)
1462 {
1463 u32 tmp, reg, and_mask, or_mask;
1464 int i;
1465
1466 if (array_size % 3)
1467 return;
1468
1469 for (i = 0; i < array_size; i += 3) {
1470 reg = registers[i + 0];
1471 and_mask = registers[i + 1];
1472 or_mask = registers[i + 2];
1473
1474 if (and_mask == 0xffffffff) {
1475 tmp = or_mask;
1476 } else {
1477 tmp = RREG32(reg);
1478 tmp &= ~and_mask;
1479 if (adev->family >= AMDGPU_FAMILY_AI)
1480 tmp |= (or_mask & and_mask);
1481 else
1482 tmp |= or_mask;
1483 }
1484 WREG32(reg, tmp);
1485 }
1486 }
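/*
 * Illustrative sketch (not part of the driver): a golden register table is a
 * flat array of { register, and_mask, or_mask } triples; an and_mask of
 * 0xffffffff writes or_mask directly. The register offsets and masks below
 * are hypothetical.
 *
 *   static const u32 example_golden_settings[] = {
 *           0x1234, 0x0000000f, 0x00000002,
 *           0x5678, 0xffffffff, 0xdeadbeef,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 */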
1487
1488 /**
1489 * amdgpu_device_pci_config_reset - reset the GPU
1490 *
1491 * @adev: amdgpu_device pointer
1492 *
1493 * Resets the GPU using the pci config reset sequence.
1494 * Only applicable to asics prior to vega10.
1495 */
1496 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1497 {
1498 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1499 }
1500
1501 /**
1502 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1503 *
1504 * @adev: amdgpu_device pointer
1505 *
1506 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1507 */
1508 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1509 {
1510 return pci_reset_function(adev->pdev);
1511 }
1512
1513 /*
1514 * amdgpu_device_wb_*()
1515 * Writeback is the method by which the GPU updates special pages in memory
1516 * with the status of certain GPU events (fences, ring pointers, etc.).
1517 */
1518
1519 /**
1520 * amdgpu_device_wb_fini - Disable Writeback and free memory
1521 *
1522 * @adev: amdgpu_device pointer
1523 *
1524 * Disables Writeback and frees the Writeback memory (all asics).
1525 * Used at driver shutdown.
1526 */
1527 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1528 {
1529 if (adev->wb.wb_obj) {
1530 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1531 &adev->wb.gpu_addr,
1532 (void **)&adev->wb.wb);
1533 adev->wb.wb_obj = NULL;
1534 }
1535 }
1536
1537 /**
1538 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1539 *
1540 * @adev: amdgpu_device pointer
1541 *
1542 * Initializes writeback and allocates writeback memory (all asics).
1543 * Used at driver startup.
1544 * Returns 0 on success or a negative error code on failure.
1545 */
1546 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1547 {
1548 int r;
1549
1550 if (adev->wb.wb_obj == NULL) {
1551 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1552 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1553 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1554 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1555 (void **)&adev->wb.wb);
1556 if (r) {
1557 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1558 return r;
1559 }
1560
1561 adev->wb.num_wb = AMDGPU_MAX_WB;
1562 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1563
1564 /* clear wb memory */
1565 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1566 }
1567
1568 return 0;
1569 }
1570
1571 /**
1572 * amdgpu_device_wb_get - Allocate a wb entry
1573 *
1574 * @adev: amdgpu_device pointer
1575 * @wb: wb index
1576 *
1577 * Allocate a wb slot for use by the driver (all asics).
1578 * Returns 0 on success or -EINVAL on failure.
1579 */
1580 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1581 {
1582 unsigned long flags, offset;
1583
1584 spin_lock_irqsave(&adev->wb.lock, flags);
1585 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1586 if (offset < adev->wb.num_wb) {
1587 __set_bit(offset, adev->wb.used);
1588 spin_unlock_irqrestore(&adev->wb.lock, flags);
1589 *wb = offset << 3; /* convert to dw offset */
1590 return 0;
1591 } else {
1592 spin_unlock_irqrestore(&adev->wb.lock, flags);
1593 return -EINVAL;
1594 }
1595 }
1596
1597 /**
1598 * amdgpu_device_wb_free - Free a wb entry
1599 *
1600 * @adev: amdgpu_device pointer
1601 * @wb: wb index
1602 *
1603 * Free a wb slot allocated for use by the driver (all asics)
1604 */
1605 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1606 {
1607 unsigned long flags;
1608
1609 wb >>= 3;
1610 spin_lock_irqsave(&adev->wb.lock, flags);
1611 if (wb < adev->wb.num_wb)
1612 __clear_bit(wb, adev->wb.used);
1613 spin_unlock_irqrestore(&adev->wb.lock, flags);
1614 }
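/*
 * Illustrative sketch (not part of the driver): a ring or IP block that needs
 * a writeback slot allocates one, converts the returned dword offset to a GPU
 * address, and frees it again on teardown. Error handling is elided.
 *
 *   u32 wb;
 *   u64 wb_gpu_addr;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           wb_gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *           adev->wb.wb[wb] = 0;
 *           ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */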
1615
1616 /**
1617 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1618 *
1619 * @adev: amdgpu_device pointer
1620 *
1621 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1622 * to fail, but if any of the BARs is not accessible after the resize we abort
1623 * driver loading by returning -ENODEV.
1624 */
1625 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1626 {
1627 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1628 struct pci_bus *root;
1629 struct resource *res;
1630 unsigned int i;
1631 u16 cmd;
1632 int r;
1633
1634 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1635 return 0;
1636
1637 /* Bypass for VF */
1638 if (amdgpu_sriov_vf(adev))
1639 return 0;
1640
1641 /* resizing on Dell G5 SE platforms causes problems with runtime pm */
1642 if ((amdgpu_runtime_pm != 0) &&
1643 adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1644 adev->pdev->device == 0x731f &&
1645 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1646 return 0;
1647
1648 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1649 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1650 DRM_WARN("System can't access extended configuration space, please check!!\n");
1651
1652 /* skip if the bios has already enabled large BAR */
1653 if (adev->gmc.real_vram_size &&
1654 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1655 return 0;
1656
1657 /* Check if the root BUS has 64bit memory resources */
1658 root = adev->pdev->bus;
1659 while (root->parent)
1660 root = root->parent;
1661
1662 pci_bus_for_each_resource(root, res, i) {
1663 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1664 res->start > 0x100000000ull)
1665 break;
1666 }
1667
1668 /* Trying to resize is pointless without a root hub window above 4GB */
1669 if (!res)
1670 return 0;
1671
1672 /* Limit the BAR size to what is available */
1673 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1674 rbar_size);
1675
1676 /* Disable memory decoding while we change the BAR addresses and size */
1677 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1678 pci_write_config_word(adev->pdev, PCI_COMMAND,
1679 cmd & ~PCI_COMMAND_MEMORY);
1680
1681 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1682 amdgpu_doorbell_fini(adev);
1683 if (adev->asic_type >= CHIP_BONAIRE)
1684 pci_release_resource(adev->pdev, 2);
1685
1686 pci_release_resource(adev->pdev, 0);
1687
1688 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1689 if (r == -ENOSPC)
1690 DRM_INFO("Not enough PCI address space for a large BAR.");
1691 else if (r && r != -ENOTSUPP)
1692 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1693
1694 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1695
1696 /* When the doorbell or fb BAR isn't available we have no chance of
1697 * using the device.
1698 */
1699 r = amdgpu_doorbell_init(adev);
1700 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1701 return -ENODEV;
1702
1703 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1704
1705 return 0;
1706 }
1707
1708 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1709 {
1710 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1711 return false;
1712
1713 return true;
1714 }
1715
1716 /*
1717 * GPU helpers function.
1718 */
1719 /**
1720 * amdgpu_device_need_post - check if the hw need post or not
1721 *
1722 * @adev: amdgpu_device pointer
1723 *
1724 * Check if the asic has been initialized (all asics) at driver startup,
1725 * or if post is needed because a hw reset was performed.
1726 * Returns true if post is needed or false if not.
1727 */
1728 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1729 {
1730 uint32_t reg;
1731
1732 if (amdgpu_sriov_vf(adev))
1733 return false;
1734
1735 if (!amdgpu_device_read_bios(adev))
1736 return false;
1737
1738 if (amdgpu_passthrough(adev)) {
1739 /* for FIJI: in the whole-GPU pass-through virtualization case, after a VM
1740 * reboot some old SMC firmware still needs the driver to do vPost or the
1741 * GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
1742 * force vPost for SMC versions below 22.15
1743 */
1744 if (adev->asic_type == CHIP_FIJI) {
1745 int err;
1746 uint32_t fw_ver;
1747
1748 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1749 /* force vPost if error occurred */
1750 if (err)
1751 return true;
1752
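/* The SMC firmware version is read from a fixed dword offset (69) in the
 * firmware image and compared against the cutoff described above.
 */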
1753 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1754 release_firmware(adev->pm.fw);
1755 if (fw_ver < 0x00160e00)
1756 return true;
1757 }
1758 }
1759
1760 /* Don't post if we need to reset whole hive on init */
1761 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1762 return false;
1763
1764 if (adev->has_hw_reset) {
1765 adev->has_hw_reset = false;
1766 return true;
1767 }
1768
1769 /* bios scratch used on CIK+ */
1770 if (adev->asic_type >= CHIP_BONAIRE)
1771 return amdgpu_atombios_scratch_need_asic_init(adev);
1772
1773 /* check MEM_SIZE for older asics */
1774 reg = amdgpu_asic_get_config_memsize(adev);
1775
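/* A CONFIG_MEMSIZE of 0 or 0xffffffff means the ASIC has not been posted yet */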
1776 if ((reg != 0) && (reg != 0xffffffff))
1777 return false;
1778
1779 return true;
1780 }
1781
1782 /*
1783 * Check whether seamless boot is supported.
1784 *
1785 * So far we only support seamless boot on DCE 3.0 or later.
1786 * If users report that it works on older ASICS as well, we may
1787 * loosen this.
1788 */
1789 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1790 {
1791 switch (amdgpu_seamless) {
1792 case -1:
1793 break;
1794 case 1:
1795 return true;
1796 case 0:
1797 return false;
1798 default:
1799 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1800 amdgpu_seamless);
1801 return false;
1802 }
1803
1804 if (!(adev->flags & AMD_IS_APU))
1805 return false;
1806
1807 if (adev->mman.keep_stolen_vga_memory)
1808 return false;
1809
1810 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1811 }
1812
1813 /*
1814 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1815 * don't support dynamic speed switching. Until we have confirmation from Intel
1816 * that a specific host supports it, it's safer that we keep it disabled for all.
1817 *
1818 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1819 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1820 */
1821 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1822 {
1823 #if IS_ENABLED(CONFIG_X86)
1824 struct cpuinfo_x86 *c = &cpu_data(0);
1825
1826 /* eGPUs change speed based on USB4 fabric conditions */
1827 if (dev_is_removable(adev->dev))
1828 return true;
1829
1830 if (c->x86_vendor == X86_VENDOR_INTEL)
1831 return false;
1832 #endif
1833 return true;
1834 }
1835
1836 /**
1837 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1838 *
1839 * @adev: amdgpu_device pointer
1840 *
1841 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1842 * be set for this device.
1843 *
1844 * Returns true if it should be used or false if not.
1845 */
1846 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1847 {
1848 switch (amdgpu_aspm) {
1849 case -1:
1850 break;
1851 case 0:
1852 return false;
1853 case 1:
1854 return true;
1855 default:
1856 return false;
1857 }
1858 if (adev->flags & AMD_IS_APU)
1859 return false;
1860 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1861 return false;
1862 return pcie_aspm_enabled(adev->pdev);
1863 }
1864
1865 /* if we get transitioned to only one device, take VGA back */
1866 /**
1867 * amdgpu_device_vga_set_decode - enable/disable vga decode
1868 *
1869 * @pdev: PCI device pointer
1870 * @state: enable/disable vga decode
1871 *
1872 * Enable/disable vga decode (all asics).
1873 * Returns VGA resource flags.
1874 */
1875 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1876 bool state)
1877 {
1878 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1879
1880 amdgpu_asic_set_vga_state(adev, state);
1881 if (state)
1882 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1883 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1884 else
1885 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1886 }
1887
1888 /**
1889 * amdgpu_device_check_block_size - validate the vm block size
1890 *
1891 * @adev: amdgpu_device pointer
1892 *
1893 * Validates the vm block size specified via module parameter.
1894 * The vm block size defines the number of bits in the page table versus the
1895 * page directory. A page is 4KB, so we have a 12-bit offset, a minimum of
1896 * 9 bits in the page table, and the remaining bits in the page directory.
1897 */
1898 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1899 {
1900 /* defines number of bits in page table versus page directory,
1901 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1902 * page table and the remaining bits are in the page directory
1903 */
1904 if (amdgpu_vm_block_size == -1)
1905 return;
1906
1907 if (amdgpu_vm_block_size < 9) {
1908 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1909 amdgpu_vm_block_size);
1910 amdgpu_vm_block_size = -1;
1911 }
1912 }
1913
1914 /**
1915 * amdgpu_device_check_vm_size - validate the vm size
1916 *
1917 * @adev: amdgpu_device pointer
1918 *
1919 * Validates the vm size in GB specified via module parameter.
1920 * The VM size is the size of the GPU virtual memory space in GB.
1921 */
1922 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1923 {
1924 /* no need to check the default value */
1925 if (amdgpu_vm_size == -1)
1926 return;
1927
1928 if (amdgpu_vm_size < 1) {
1929 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1930 amdgpu_vm_size);
1931 amdgpu_vm_size = -1;
1932 }
1933 }
1934
1935 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1936 {
1937 struct sysinfo si;
1938 bool is_os_64 = (sizeof(void *) == 8);
1939 uint64_t total_memory;
1940 uint64_t dram_size_seven_GB = 0x1B8000000;
1941 uint64_t dram_size_three_GB = 0xB8000000;
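/* Note: the thresholds above are slightly below 3GB/7GB, presumably to
 * leave headroom for memory reserved by firmware.
 */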
1942
1943 if (amdgpu_smu_memory_pool_size == 0)
1944 return;
1945
1946 if (!is_os_64) {
1947 DRM_WARN("Not 64-bit OS, feature not supported\n");
1948 goto def_value;
1949 }
1950 si_meminfo(&si);
1951 total_memory = (uint64_t)si.totalram * si.mem_unit;
1952
1953 if ((amdgpu_smu_memory_pool_size == 1) ||
1954 (amdgpu_smu_memory_pool_size == 2)) {
1955 if (total_memory < dram_size_three_GB)
1956 goto def_value1;
1957 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1958 (amdgpu_smu_memory_pool_size == 8)) {
1959 if (total_memory < dram_size_seven_GB)
1960 goto def_value1;
1961 } else {
1962 DRM_WARN("Smu memory pool size not supported\n");
1963 goto def_value;
1964 }
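/* amdgpu_smu_memory_pool_size is given in units of 256MB (1 << 28 bytes) */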
1965 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1966
1967 return;
1968
1969 def_value1:
1970 DRM_WARN("Not enough system memory\n");
1971 def_value:
1972 adev->pm.smu_prv_buffer_size = 0;
1973 }
1974
1975 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1976 {
1977 if (!(adev->flags & AMD_IS_APU) ||
1978 adev->asic_type < CHIP_RAVEN)
1979 return 0;
1980
1981 switch (adev->asic_type) {
1982 case CHIP_RAVEN:
1983 if (adev->pdev->device == 0x15dd)
1984 adev->apu_flags |= AMD_APU_IS_RAVEN;
1985 if (adev->pdev->device == 0x15d8)
1986 adev->apu_flags |= AMD_APU_IS_PICASSO;
1987 break;
1988 case CHIP_RENOIR:
1989 if ((adev->pdev->device == 0x1636) ||
1990 (adev->pdev->device == 0x164c))
1991 adev->apu_flags |= AMD_APU_IS_RENOIR;
1992 else
1993 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1994 break;
1995 case CHIP_VANGOGH:
1996 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1997 break;
1998 case CHIP_YELLOW_CARP:
1999 break;
2000 case CHIP_CYAN_SKILLFISH:
2001 if ((adev->pdev->device == 0x13FE) ||
2002 (adev->pdev->device == 0x143F))
2003 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
2004 break;
2005 default:
2006 break;
2007 }
2008
2009 return 0;
2010 }
2011
2012 /**
2013 * amdgpu_device_check_arguments - validate module params
2014 *
2015 * @adev: amdgpu_device pointer
2016 *
2017 * Validates certain module parameters and updates
2018 * the associated values used by the driver (all asics).
2019 */
2020 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2021 {
2022 int i;
2023
2024 if (amdgpu_sched_jobs < 4) {
2025 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2026 amdgpu_sched_jobs);
2027 amdgpu_sched_jobs = 4;
2028 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
2029 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2030 amdgpu_sched_jobs);
2031 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2032 }
2033
2034 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2035 /* gart size must be greater or equal to 32M */
2036 dev_warn(adev->dev, "gart size (%d) too small\n",
2037 amdgpu_gart_size);
2038 amdgpu_gart_size = -1;
2039 }
2040
2041 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2042 /* gtt size must be greater or equal to 32M */
2043 dev_warn(adev->dev, "gtt size (%d) too small\n",
2044 amdgpu_gtt_size);
2045 amdgpu_gtt_size = -1;
2046 }
2047
2048 /* valid range is between 4 and 9 inclusive */
2049 if (amdgpu_vm_fragment_size != -1 &&
2050 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2051 dev_warn(adev->dev, "valid range is between 4 and 9\n");
2052 amdgpu_vm_fragment_size = -1;
2053 }
2054
2055 if (amdgpu_sched_hw_submission < 2) {
2056 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2057 amdgpu_sched_hw_submission);
2058 amdgpu_sched_hw_submission = 2;
2059 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2060 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2061 amdgpu_sched_hw_submission);
2062 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2063 }
2064
2065 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2066 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2067 amdgpu_reset_method = -1;
2068 }
2069
2070 amdgpu_device_check_smu_prv_buffer_size(adev);
2071
2072 amdgpu_device_check_vm_size(adev);
2073
2074 amdgpu_device_check_block_size(adev);
2075
2076 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2077
2078 for (i = 0; i < MAX_XCP; i++)
2079 adev->enforce_isolation[i] = !!enforce_isolation;
2080
2081 return 0;
2082 }
2083
2084 /**
2085 * amdgpu_switcheroo_set_state - set switcheroo state
2086 *
2087 * @pdev: pci dev pointer
2088 * @state: vga_switcheroo state
2089 *
2090 * Callback for the switcheroo driver. Suspends or resumes
2091 * the asic before or after it is powered up using ACPI methods.
2092 */
2093 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2094 enum vga_switcheroo_state state)
2095 {
2096 struct drm_device *dev = pci_get_drvdata(pdev);
2097 int r;
2098
2099 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2100 return;
2101
2102 if (state == VGA_SWITCHEROO_ON) {
2103 pr_info("switched on\n");
2104 /* don't suspend or resume card normally */
2105 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2106
2107 pci_set_power_state(pdev, PCI_D0);
2108 amdgpu_device_load_pci_state(pdev);
2109 r = pci_enable_device(pdev);
2110 if (r)
2111 DRM_WARN("pci_enable_device failed (%d)\n", r);
2112 amdgpu_device_resume(dev, true);
2113
2114 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2115 } else {
2116 pr_info("switched off\n");
2117 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2118 amdgpu_device_prepare(dev);
2119 amdgpu_device_suspend(dev, true);
2120 amdgpu_device_cache_pci_state(pdev);
2121 /* Shut down the device */
2122 pci_disable_device(pdev);
2123 pci_set_power_state(pdev, PCI_D3cold);
2124 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2125 }
2126 }
2127
2128 /**
2129 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2130 *
2131 * @pdev: pci dev pointer
2132 *
2133 * Callback for the switcheroo driver. Checks if the switcheroo
2134 * state can be changed.
2135 * Returns true if the state can be changed, false if not.
2136 */
2137 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2138 {
2139 struct drm_device *dev = pci_get_drvdata(pdev);
2140
2141 /*
2142 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2143 * locking inversion with the driver load path. And the access here is
2144 * completely racy anyway. So don't bother with locking for now.
2145 */
2146 return atomic_read(&dev->open_count) == 0;
2147 }
2148
2149 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2150 .set_gpu_state = amdgpu_switcheroo_set_state,
2151 .reprobe = NULL,
2152 .can_switch = amdgpu_switcheroo_can_switch,
2153 };
2154
2155 /**
2156 * amdgpu_device_ip_set_clockgating_state - set the CG state
2157 *
2158 * @dev: amdgpu_device pointer
2159 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2160 * @state: clockgating state (gate or ungate)
2161 *
2162 * Sets the requested clockgating state for all instances of
2163 * the hardware IP specified.
2164 * Returns the error code from the last instance.
2165 */
2166 int amdgpu_device_ip_set_clockgating_state(void *dev,
2167 enum amd_ip_block_type block_type,
2168 enum amd_clockgating_state state)
2169 {
2170 struct amdgpu_device *adev = dev;
2171 int i, r = 0;
2172
2173 for (i = 0; i < adev->num_ip_blocks; i++) {
2174 if (!adev->ip_blocks[i].status.valid)
2175 continue;
2176 if (adev->ip_blocks[i].version->type != block_type)
2177 continue;
2178 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2179 continue;
2180 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2181 &adev->ip_blocks[i], state);
2182 if (r)
2183 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2184 adev->ip_blocks[i].version->funcs->name, r);
2185 }
2186 return r;
2187 }
2188
2189 /**
2190 * amdgpu_device_ip_set_powergating_state - set the PG state
2191 *
2192 * @dev: amdgpu_device pointer
2193 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2194 * @state: powergating state (gate or ungate)
2195 *
2196 * Sets the requested powergating state for all instances of
2197 * the hardware IP specified.
2198 * Returns the error code from the last instance.
2199 */
2200 int amdgpu_device_ip_set_powergating_state(void *dev,
2201 enum amd_ip_block_type block_type,
2202 enum amd_powergating_state state)
2203 {
2204 struct amdgpu_device *adev = dev;
2205 int i, r = 0;
2206
2207 for (i = 0; i < adev->num_ip_blocks; i++) {
2208 if (!adev->ip_blocks[i].status.valid)
2209 continue;
2210 if (adev->ip_blocks[i].version->type != block_type)
2211 continue;
2212 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2213 continue;
2214 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2215 &adev->ip_blocks[i], state);
2216 if (r)
2217 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2218 adev->ip_blocks[i].version->funcs->name, r);
2219 }
2220 return r;
2221 }
2222
2223 /**
2224 * amdgpu_device_ip_get_clockgating_state - get the CG state
2225 *
2226 * @adev: amdgpu_device pointer
2227 * @flags: clockgating feature flags
2228 *
2229 * Walks the list of IPs on the device and updates the clockgating
2230 * flags for each IP.
2231 * Updates @flags with the feature flags for each hardware IP where
2232 * clockgating is enabled.
2233 */
2234 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2235 u64 *flags)
2236 {
2237 int i;
2238
2239 for (i = 0; i < adev->num_ip_blocks; i++) {
2240 if (!adev->ip_blocks[i].status.valid)
2241 continue;
2242 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2243 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
2244 }
2245 }
2246
2247 /**
2248 * amdgpu_device_ip_wait_for_idle - wait for idle
2249 *
2250 * @adev: amdgpu_device pointer
2251 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2252 *
2253 * Waits for the requested hardware IP to be idle.
2254 * Returns 0 for success or a negative error code on failure.
2255 */
2256 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2257 enum amd_ip_block_type block_type)
2258 {
2259 int i, r;
2260
2261 for (i = 0; i < adev->num_ip_blocks; i++) {
2262 if (!adev->ip_blocks[i].status.valid)
2263 continue;
2264 if (adev->ip_blocks[i].version->type == block_type) {
2265 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2266 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2267 &adev->ip_blocks[i]);
2268 if (r)
2269 return r;
2270 }
2271 break;
2272 }
2273 }
2274 return 0;
2275
2276 }
2277
2278 /**
2279 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2280 *
2281 * @adev: amdgpu_device pointer
2282 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2283 *
2284 * Check if the hardware IP is enabled or not.
2285 * Returns true if the IP is enabled, false if not.
2286 */
2287 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2288 enum amd_ip_block_type block_type)
2289 {
2290 int i;
2291
2292 for (i = 0; i < adev->num_ip_blocks; i++) {
2293 if (adev->ip_blocks[i].version->type == block_type)
2294 return adev->ip_blocks[i].status.valid;
2295 }
2296 return false;
2297
2298 }
2299
2300 /**
2301 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2302 *
2303 * @adev: amdgpu_device pointer
2304 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2305 *
2306 * Returns a pointer to the hardware IP block structure
2307 * if it exists for the asic, otherwise NULL.
2308 */
2309 struct amdgpu_ip_block *
2310 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2311 enum amd_ip_block_type type)
2312 {
2313 int i;
2314
2315 for (i = 0; i < adev->num_ip_blocks; i++)
2316 if (adev->ip_blocks[i].version->type == type)
2317 return &adev->ip_blocks[i];
2318
2319 return NULL;
2320 }
2321
2322 /**
2323 * amdgpu_device_ip_block_version_cmp
2324 *
2325 * @adev: amdgpu_device pointer
2326 * @type: enum amd_ip_block_type
2327 * @major: major version
2328 * @minor: minor version
2329 *
2330 * return 0 if equal or greater
2331 * return 1 if smaller or the ip_block doesn't exist
2332 */
2333 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2334 enum amd_ip_block_type type,
2335 u32 major, u32 minor)
2336 {
2337 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2338
2339 if (ip_block && ((ip_block->version->major > major) ||
2340 ((ip_block->version->major == major) &&
2341 (ip_block->version->minor >= minor))))
2342 return 0;
2343
2344 return 1;
2345 }
2346
2347 /**
2348 * amdgpu_device_ip_block_add
2349 *
2350 * @adev: amdgpu_device pointer
2351 * @ip_block_version: pointer to the IP to add
2352 *
2353 * Adds the IP block driver information to the collection of IPs
2354 * on the asic.
2355 */
2356 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2357 const struct amdgpu_ip_block_version *ip_block_version)
2358 {
2359 if (!ip_block_version)
2360 return -EINVAL;
2361
2362 switch (ip_block_version->type) {
2363 case AMD_IP_BLOCK_TYPE_VCN:
2364 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2365 return 0;
2366 break;
2367 case AMD_IP_BLOCK_TYPE_JPEG:
2368 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2369 return 0;
2370 break;
2371 default:
2372 break;
2373 }
2374
2375 dev_info(adev->dev, "detected ip block number %d <%s>\n",
2376 adev->num_ip_blocks, ip_block_version->funcs->name);
2377
2378 adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2379
2380 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2381
2382 return 0;
2383 }
2384
2385 /**
2386 * amdgpu_device_enable_virtual_display - enable virtual display feature
2387 *
2388 * @adev: amdgpu_device pointer
2389 *
2390 * Enables the virtual display feature if the user has enabled it via
2391 * the module parameter virtual_display. This feature provides a virtual
2392 * display hardware on headless boards or in virtualized environments.
2393 * This function parses and validates the configuration string specified by
2394 * the user and configures the virtual display configuration (number of
2395 * virtual connectors, crtcs, etc.) specified.
2396 */
2397 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2398 {
2399 adev->enable_virtual_display = false;
2400
2401 if (amdgpu_virtual_display) {
2402 const char *pci_address_name = pci_name(adev->pdev);
2403 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2404
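/* The parameter is a semicolon-separated list of entries of the form
 * "<pci bus id>[,<num crtcs>]", where "all" may be used instead of a
 * bus id, e.g. "0000:26:00.0,4;all" (example values only).
 */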
2405 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2406 pciaddstr_tmp = pciaddstr;
2407 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2408 pciaddname = strsep(&pciaddname_tmp, ",");
2409 if (!strcmp("all", pciaddname)
2410 || !strcmp(pci_address_name, pciaddname)) {
2411 long num_crtc;
2412 int res = -1;
2413
2414 adev->enable_virtual_display = true;
2415
2416 if (pciaddname_tmp)
2417 res = kstrtol(pciaddname_tmp, 10,
2418 &num_crtc);
2419
2420 if (!res) {
2421 if (num_crtc < 1)
2422 num_crtc = 1;
2423 if (num_crtc > 6)
2424 num_crtc = 6;
2425 adev->mode_info.num_crtc = num_crtc;
2426 } else {
2427 adev->mode_info.num_crtc = 1;
2428 }
2429 break;
2430 }
2431 }
2432
2433 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2434 amdgpu_virtual_display, pci_address_name,
2435 adev->enable_virtual_display, adev->mode_info.num_crtc);
2436
2437 kfree(pciaddstr);
2438 }
2439 }
2440
2441 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2442 {
2443 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2444 adev->mode_info.num_crtc = 1;
2445 adev->enable_virtual_display = true;
2446 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2447 adev->enable_virtual_display, adev->mode_info.num_crtc);
2448 }
2449 }
2450
2451 /**
2452 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2453 *
2454 * @adev: amdgpu_device pointer
2455 *
2456 * Parses the asic configuration parameters specified in the gpu info
2457 * firmware and makes them available to the driver for use in configuring
2458 * the asic.
2459 * Returns 0 on success, -EINVAL on failure.
2460 */
2461 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2462 {
2463 const char *chip_name;
2464 int err;
2465 const struct gpu_info_firmware_header_v1_0 *hdr;
2466
2467 adev->firmware.gpu_info_fw = NULL;
2468
2469 if (adev->mman.discovery_bin)
2470 return 0;
2471
2472 switch (adev->asic_type) {
2473 default:
2474 return 0;
2475 case CHIP_VEGA10:
2476 chip_name = "vega10";
2477 break;
2478 case CHIP_VEGA12:
2479 chip_name = "vega12";
2480 break;
2481 case CHIP_RAVEN:
2482 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2483 chip_name = "raven2";
2484 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2485 chip_name = "picasso";
2486 else
2487 chip_name = "raven";
2488 break;
2489 case CHIP_ARCTURUS:
2490 chip_name = "arcturus";
2491 break;
2492 case CHIP_NAVI12:
2493 chip_name = "navi12";
2494 break;
2495 }
2496
2497 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2498 AMDGPU_UCODE_OPTIONAL,
2499 "amdgpu/%s_gpu_info.bin", chip_name);
2500 if (err) {
2501 dev_err(adev->dev,
2502 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2503 chip_name);
2504 goto out;
2505 }
2506
2507 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2508 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2509
2510 switch (hdr->version_major) {
2511 case 1:
2512 {
2513 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2514 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2515 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2516
2517 /*
2518 * Should be dropped when DAL no longer needs it.
2519 */
2520 if (adev->asic_type == CHIP_NAVI12)
2521 goto parse_soc_bounding_box;
2522
2523 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2524 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2525 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2526 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2527 adev->gfx.config.max_texture_channel_caches =
2528 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2529 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2530 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2531 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2532 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2533 adev->gfx.config.double_offchip_lds_buf =
2534 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2535 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2536 adev->gfx.cu_info.max_waves_per_simd =
2537 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2538 adev->gfx.cu_info.max_scratch_slots_per_cu =
2539 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2540 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2541 if (hdr->version_minor >= 1) {
2542 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2543 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2544 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2545 adev->gfx.config.num_sc_per_sh =
2546 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2547 adev->gfx.config.num_packer_per_sc =
2548 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2549 }
2550
2551 parse_soc_bounding_box:
2552 /*
2553 * soc bounding box info is not integrated into the discovery table,
2554 * so we always need to parse it from the gpu info firmware if needed.
2555 */
2556 if (hdr->version_minor == 2) {
2557 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2558 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2559 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2560 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2561 }
2562 break;
2563 }
2564 default:
2565 dev_err(adev->dev,
2566 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2567 err = -EINVAL;
2568 goto out;
2569 }
2570 out:
2571 return err;
2572 }
2573
2574 /**
2575 * amdgpu_device_ip_early_init - run early init for hardware IPs
2576 *
2577 * @adev: amdgpu_device pointer
2578 *
2579 * Early initialization pass for hardware IPs. The hardware IPs that make
2580 * up each asic are discovered and each IP's early_init callback is run. This
2581 * is the first stage in initializing the asic.
2582 * Returns 0 on success, negative error code on failure.
2583 */
2584 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2585 {
2586 struct amdgpu_ip_block *ip_block;
2587 struct pci_dev *parent;
2588 int i, r;
2589 bool total;
2590
2591 amdgpu_device_enable_virtual_display(adev);
2592
2593 if (amdgpu_sriov_vf(adev)) {
2594 r = amdgpu_virt_request_full_gpu(adev, true);
2595 if (r)
2596 return r;
2597 }
2598
2599 switch (adev->asic_type) {
2600 #ifdef CONFIG_DRM_AMDGPU_SI
2601 case CHIP_VERDE:
2602 case CHIP_TAHITI:
2603 case CHIP_PITCAIRN:
2604 case CHIP_OLAND:
2605 case CHIP_HAINAN:
2606 adev->family = AMDGPU_FAMILY_SI;
2607 r = si_set_ip_blocks(adev);
2608 if (r)
2609 return r;
2610 break;
2611 #endif
2612 #ifdef CONFIG_DRM_AMDGPU_CIK
2613 case CHIP_BONAIRE:
2614 case CHIP_HAWAII:
2615 case CHIP_KAVERI:
2616 case CHIP_KABINI:
2617 case CHIP_MULLINS:
2618 if (adev->flags & AMD_IS_APU)
2619 adev->family = AMDGPU_FAMILY_KV;
2620 else
2621 adev->family = AMDGPU_FAMILY_CI;
2622
2623 r = cik_set_ip_blocks(adev);
2624 if (r)
2625 return r;
2626 break;
2627 #endif
2628 case CHIP_TOPAZ:
2629 case CHIP_TONGA:
2630 case CHIP_FIJI:
2631 case CHIP_POLARIS10:
2632 case CHIP_POLARIS11:
2633 case CHIP_POLARIS12:
2634 case CHIP_VEGAM:
2635 case CHIP_CARRIZO:
2636 case CHIP_STONEY:
2637 if (adev->flags & AMD_IS_APU)
2638 adev->family = AMDGPU_FAMILY_CZ;
2639 else
2640 adev->family = AMDGPU_FAMILY_VI;
2641
2642 r = vi_set_ip_blocks(adev);
2643 if (r)
2644 return r;
2645 break;
2646 default:
2647 r = amdgpu_discovery_set_ip_blocks(adev);
2648 if (r)
2649 return r;
2650 break;
2651 }
2652
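/* Mark the dGPU as PX (hybrid graphics) when ATPX power control is
 * available, the device is not an APU and it is not a removable eGPU.
 */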
2653 if (amdgpu_has_atpx() &&
2654 (amdgpu_is_atpx_hybrid() ||
2655 amdgpu_has_atpx_dgpu_power_cntl()) &&
2656 ((adev->flags & AMD_IS_APU) == 0) &&
2657 !dev_is_removable(&adev->pdev->dev))
2658 adev->flags |= AMD_IS_PX;
2659
2660 if (!(adev->flags & AMD_IS_APU)) {
2661 parent = pcie_find_root_port(adev->pdev);
2662 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2663 }
2664
2665
2666 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2667 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2668 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2669 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2670 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2671 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2672 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2673
2674 total = true;
2675 for (i = 0; i < adev->num_ip_blocks; i++) {
2676 ip_block = &adev->ip_blocks[i];
2677
2678 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2679 DRM_WARN("disabled ip block: %d <%s>\n",
2680 i, adev->ip_blocks[i].version->funcs->name);
2681 adev->ip_blocks[i].status.valid = false;
2682 } else if (ip_block->version->funcs->early_init) {
2683 r = ip_block->version->funcs->early_init(ip_block);
2684 if (r == -ENOENT) {
2685 adev->ip_blocks[i].status.valid = false;
2686 } else if (r) {
2687 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2688 adev->ip_blocks[i].version->funcs->name, r);
2689 total = false;
2690 } else {
2691 adev->ip_blocks[i].status.valid = true;
2692 }
2693 } else {
2694 adev->ip_blocks[i].status.valid = true;
2695 }
2696 /* get the vbios after the asic_funcs are set up */
2697 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2698 r = amdgpu_device_parse_gpu_info_fw(adev);
2699 if (r)
2700 return r;
2701
2702 /* Read BIOS */
2703 if (amdgpu_device_read_bios(adev)) {
2704 if (!amdgpu_get_bios(adev))
2705 return -EINVAL;
2706
2707 r = amdgpu_atombios_init(adev);
2708 if (r) {
2709 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2710 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2711 return r;
2712 }
2713 }
2714
2715 /* get pf2vf msg info at its earliest time */
2716 if (amdgpu_sriov_vf(adev))
2717 amdgpu_virt_init_data_exchange(adev);
2718
2719 }
2720 }
2721 if (!total)
2722 return -ENODEV;
2723
2724 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2725 if (ip_block->status.valid != false)
2726 amdgpu_amdkfd_device_probe(adev);
2727
2728 adev->cg_flags &= amdgpu_cg_mask;
2729 adev->pg_flags &= amdgpu_pg_mask;
2730
2731 return 0;
2732 }
2733
2734 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2735 {
2736 int i, r;
2737
2738 for (i = 0; i < adev->num_ip_blocks; i++) {
2739 if (!adev->ip_blocks[i].status.sw)
2740 continue;
2741 if (adev->ip_blocks[i].status.hw)
2742 continue;
2743 if (!amdgpu_ip_member_of_hwini(
2744 adev, adev->ip_blocks[i].version->type))
2745 continue;
2746 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2747 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2748 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2749 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2750 if (r) {
2751 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2752 adev->ip_blocks[i].version->funcs->name, r);
2753 return r;
2754 }
2755 adev->ip_blocks[i].status.hw = true;
2756 }
2757 }
2758
2759 return 0;
2760 }
2761
2762 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2763 {
2764 int i, r;
2765
2766 for (i = 0; i < adev->num_ip_blocks; i++) {
2767 if (!adev->ip_blocks[i].status.sw)
2768 continue;
2769 if (adev->ip_blocks[i].status.hw)
2770 continue;
2771 if (!amdgpu_ip_member_of_hwini(
2772 adev, adev->ip_blocks[i].version->type))
2773 continue;
2774 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2775 if (r) {
2776 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2777 adev->ip_blocks[i].version->funcs->name, r);
2778 return r;
2779 }
2780 adev->ip_blocks[i].status.hw = true;
2781 }
2782
2783 return 0;
2784 }
2785
2786 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2787 {
2788 int r = 0;
2789 int i;
2790 uint32_t smu_version;
2791
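/* On VEGA10 and newer parts the PSP block is brought up (or resumed)
 * first, since it handles loading firmware for the other IPs there.
 */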
2792 if (adev->asic_type >= CHIP_VEGA10) {
2793 for (i = 0; i < adev->num_ip_blocks; i++) {
2794 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2795 continue;
2796
2797 if (!amdgpu_ip_member_of_hwini(adev,
2798 AMD_IP_BLOCK_TYPE_PSP))
2799 break;
2800
2801 if (!adev->ip_blocks[i].status.sw)
2802 continue;
2803
2804 /* no need to do the fw loading again if already done */
2805 if (adev->ip_blocks[i].status.hw == true)
2806 break;
2807
2808 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2809 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
2810 if (r)
2811 return r;
2812 } else {
2813 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2814 if (r) {
2815 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2816 adev->ip_blocks[i].version->funcs->name, r);
2817 return r;
2818 }
2819 adev->ip_blocks[i].status.hw = true;
2820 }
2821 break;
2822 }
2823 }
2824
2825 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2826 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2827
2828 return r;
2829 }
2830
2831 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2832 {
2833 long timeout;
2834 int r, i;
2835
2836 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2837 struct amdgpu_ring *ring = adev->rings[i];
2838
2839 /* No need to set up the GPU scheduler for rings that don't need it */
2840 if (!ring || ring->no_scheduler)
2841 continue;
2842
2843 switch (ring->funcs->type) {
2844 case AMDGPU_RING_TYPE_GFX:
2845 timeout = adev->gfx_timeout;
2846 break;
2847 case AMDGPU_RING_TYPE_COMPUTE:
2848 timeout = adev->compute_timeout;
2849 break;
2850 case AMDGPU_RING_TYPE_SDMA:
2851 timeout = adev->sdma_timeout;
2852 break;
2853 default:
2854 timeout = adev->video_timeout;
2855 break;
2856 }
2857
2858 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2859 DRM_SCHED_PRIORITY_COUNT,
2860 ring->num_hw_submission, 0,
2861 timeout, adev->reset_domain->wq,
2862 ring->sched_score, ring->name,
2863 adev->dev);
2864 if (r) {
2865 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2866 ring->name);
2867 return r;
2868 }
2869 r = amdgpu_uvd_entity_init(adev, ring);
2870 if (r) {
2871 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2872 ring->name);
2873 return r;
2874 }
2875 r = amdgpu_vce_entity_init(adev, ring);
2876 if (r) {
2877 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2878 ring->name);
2879 return r;
2880 }
2881 }
2882
2883 amdgpu_xcp_update_partition_sched_list(adev);
2884
2885 return 0;
2886 }
2887
2888
2889 /**
2890 * amdgpu_device_ip_init - run init for hardware IPs
2891 *
2892 * @adev: amdgpu_device pointer
2893 *
2894 * Main initialization pass for hardware IPs. The list of all the hardware
2895 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2896 * are run. sw_init initializes the software state associated with each IP
2897 * and hw_init initializes the hardware associated with each IP.
2898 * Returns 0 on success, negative error code on failure.
2899 */
2900 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2901 {
2902 bool init_badpage;
2903 int i, r;
2904
2905 r = amdgpu_ras_init(adev);
2906 if (r)
2907 return r;
2908
2909 for (i = 0; i < adev->num_ip_blocks; i++) {
2910 if (!adev->ip_blocks[i].status.valid)
2911 continue;
2912 if (adev->ip_blocks[i].version->funcs->sw_init) {
2913 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
2914 if (r) {
2915 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2916 adev->ip_blocks[i].version->funcs->name, r);
2917 goto init_failed;
2918 }
2919 }
2920 adev->ip_blocks[i].status.sw = true;
2921
2922 if (!amdgpu_ip_member_of_hwini(
2923 adev, adev->ip_blocks[i].version->type))
2924 continue;
2925
2926 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2927 /* need to do common hw init early so everything is set up for gmc */
2928 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2929 if (r) {
2930 DRM_ERROR("hw_init %d failed %d\n", i, r);
2931 goto init_failed;
2932 }
2933 adev->ip_blocks[i].status.hw = true;
2934 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2935 /* need to do gmc hw init early so we can allocate gpu mem */
2936 /* Try to reserve bad pages early */
2937 if (amdgpu_sriov_vf(adev))
2938 amdgpu_virt_exchange_data(adev);
2939
2940 r = amdgpu_device_mem_scratch_init(adev);
2941 if (r) {
2942 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2943 goto init_failed;
2944 }
2945 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2946 if (r) {
2947 DRM_ERROR("hw_init %d failed %d\n", i, r);
2948 goto init_failed;
2949 }
2950 r = amdgpu_device_wb_init(adev);
2951 if (r) {
2952 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2953 goto init_failed;
2954 }
2955 adev->ip_blocks[i].status.hw = true;
2956
2957 /* right after GMC hw init, we create CSA */
2958 if (adev->gfx.mcbp) {
2959 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2960 AMDGPU_GEM_DOMAIN_VRAM |
2961 AMDGPU_GEM_DOMAIN_GTT,
2962 AMDGPU_CSA_SIZE);
2963 if (r) {
2964 DRM_ERROR("allocate CSA failed %d\n", r);
2965 goto init_failed;
2966 }
2967 }
2968
2969 r = amdgpu_seq64_init(adev);
2970 if (r) {
2971 DRM_ERROR("allocate seq64 failed %d\n", r);
2972 goto init_failed;
2973 }
2974 }
2975 }
2976
2977 if (amdgpu_sriov_vf(adev))
2978 amdgpu_virt_init_data_exchange(adev);
2979
2980 r = amdgpu_ib_pool_init(adev);
2981 if (r) {
2982 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2983 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2984 goto init_failed;
2985 }
2986
2987 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2988 if (r)
2989 goto init_failed;
2990
2991 r = amdgpu_device_ip_hw_init_phase1(adev);
2992 if (r)
2993 goto init_failed;
2994
2995 r = amdgpu_device_fw_loading(adev);
2996 if (r)
2997 goto init_failed;
2998
2999 r = amdgpu_device_ip_hw_init_phase2(adev);
3000 if (r)
3001 goto init_failed;
3002
3003 /*
3004 * Retired pages will be loaded from eeprom and reserved here.
3005 * This should be called after amdgpu_device_ip_hw_init_phase2, since for
3006 * some ASICs the RAS EEPROM code relies on the SMU being fully functional
3007 * for I2C communication, which is only true at this point.
3008 *
3009 * amdgpu_ras_recovery_init may fail, but the upper level only cares about
3010 * failures caused by a bad gpu situation and stops the amdgpu init process
3011 * accordingly. For other failures it still releases all the resources and
3012 * prints an error message, rather than returning a negative value to the
3013 * upper level.
3014 *
3015 * Note: theoretically, this should be called before all vram allocations
3016 * to protect retired pages from being abused.
3017 */
3018 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3019 r = amdgpu_ras_recovery_init(adev, init_badpage);
3020 if (r)
3021 goto init_failed;
3022
3023 /**
3024 * In case of XGMI grab extra reference for reset domain for this device
3025 */
3026 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3027 if (amdgpu_xgmi_add_device(adev) == 0) {
3028 if (!amdgpu_sriov_vf(adev)) {
3029 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3030
3031 if (WARN_ON(!hive)) {
3032 r = -ENOENT;
3033 goto init_failed;
3034 }
3035
3036 if (!hive->reset_domain ||
3037 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3038 r = -ENOENT;
3039 amdgpu_put_xgmi_hive(hive);
3040 goto init_failed;
3041 }
3042
3043 /* Drop the early temporary reset domain we created for device */
3044 amdgpu_reset_put_reset_domain(adev->reset_domain);
3045 adev->reset_domain = hive->reset_domain;
3046 amdgpu_put_xgmi_hive(hive);
3047 }
3048 }
3049 }
3050
3051 r = amdgpu_device_init_schedulers(adev);
3052 if (r)
3053 goto init_failed;
3054
3055 if (adev->mman.buffer_funcs_ring->sched.ready)
3056 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3057
3058 /* Don't init kfd if whole hive need to be reset during init */
3059 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3060 kgd2kfd_init_zone_device(adev);
3061 amdgpu_amdkfd_device_init(adev);
3062 }
3063
3064 amdgpu_fru_get_product_info(adev);
3065
3066 init_failed:
3067
3068 return r;
3069 }
3070
3071 /**
3072 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3073 *
3074 * @adev: amdgpu_device pointer
3075 *
3076 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3077 * this function before a GPU reset. If the value is retained after a
3078 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3079 */
3080 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3081 {
3082 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3083 }
3084
3085 /**
3086 * amdgpu_device_check_vram_lost - check if vram is valid
3087 *
3088 * @adev: amdgpu_device pointer
3089 *
3090 * Checks the reset magic value written to the gart pointer in VRAM.
3091 * The driver calls this after a GPU reset to see if the contents of
3092 * VRAM are lost or not.
3093 * Returns true if vram is lost, false if not.
3094 */
3095 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3096 {
3097 if (memcmp(adev->gart.ptr, adev->reset_magic,
3098 AMDGPU_RESET_MAGIC_NUM))
3099 return true;
3100
3101 if (!amdgpu_in_reset(adev))
3102 return false;
3103
3104 /*
3105 * For all ASICs with baco/mode1 reset, the VRAM is
3106 * always assumed to be lost.
3107 */
3108 switch (amdgpu_asic_reset_method(adev)) {
3109 case AMD_RESET_METHOD_BACO:
3110 case AMD_RESET_METHOD_MODE1:
3111 return true;
3112 default:
3113 return false;
3114 }
3115 }
3116
3117 /**
3118 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3119 *
3120 * @adev: amdgpu_device pointer
3121 * @state: clockgating state (gate or ungate)
3122 *
3123 * The list of all the hardware IPs that make up the asic is walked and the
3124 * set_clockgating_state callbacks are run.
3125 * During the late init pass this enables clockgating for hardware IPs;
3126 * during fini or suspend it disables clockgating for hardware IPs.
3127 * Returns 0 on success, negative error code on failure.
3128 */
3129
3130 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3131 enum amd_clockgating_state state)
3132 {
3133 int i, j, r;
3134
3135 if (amdgpu_emu_mode == 1)
3136 return 0;
3137
3138 for (j = 0; j < adev->num_ip_blocks; j++) {
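/* gate in normal IP order, ungate in reverse order */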
3139 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3140 if (!adev->ip_blocks[i].status.late_initialized)
3141 continue;
3142 /* skip CG for GFX, SDMA on S0ix */
3143 if (adev->in_s0ix &&
3144 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3145 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3146 continue;
3147 /* skip CG for VCE/UVD, it's handled specially */
3148 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3149 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3150 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3151 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3152 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3153 /* enable clockgating to save power */
3154 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
3155 state);
3156 if (r) {
3157 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3158 adev->ip_blocks[i].version->funcs->name, r);
3159 return r;
3160 }
3161 }
3162 }
3163
3164 return 0;
3165 }
3166
3167 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3168 enum amd_powergating_state state)
3169 {
3170 int i, j, r;
3171
3172 if (amdgpu_emu_mode == 1)
3173 return 0;
3174
3175 for (j = 0; j < adev->num_ip_blocks; j++) {
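/* gate in normal IP order, ungate in reverse order */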
3176 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3177 if (!adev->ip_blocks[i].status.late_initialized)
3178 continue;
3179 /* skip PG for GFX, SDMA on S0ix */
3180 if (adev->in_s0ix &&
3181 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3182 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3183 continue;
3184 /* skip PG for VCE/UVD, it's handled specially */
3185 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3186 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3187 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3188 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3189 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3190 /* enable powergating to save power */
3191 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3192 state);
3193 if (r) {
3194 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3195 adev->ip_blocks[i].version->funcs->name, r);
3196 return r;
3197 }
3198 }
3199 }
3200 return 0;
3201 }
3202
3203 static int amdgpu_device_enable_mgpu_fan_boost(void)
3204 {
3205 struct amdgpu_gpu_instance *gpu_ins;
3206 struct amdgpu_device *adev;
3207 int i, ret = 0;
3208
3209 mutex_lock(&mgpu_info.mutex);
3210
3211 /*
3212 * MGPU fan boost feature should be enabled
3213 * only when there are two or more dGPUs in
3214 * the system
3215 */
3216 if (mgpu_info.num_dgpu < 2)
3217 goto out;
3218
3219 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3220 gpu_ins = &(mgpu_info.gpu_ins[i]);
3221 adev = gpu_ins->adev;
3222 if (!(adev->flags & AMD_IS_APU) &&
3223 !gpu_ins->mgpu_fan_enabled) {
3224 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3225 if (ret)
3226 break;
3227
3228 gpu_ins->mgpu_fan_enabled = 1;
3229 }
3230 }
3231
3232 out:
3233 mutex_unlock(&mgpu_info.mutex);
3234
3235 return ret;
3236 }
3237
3238 /**
3239 * amdgpu_device_ip_late_init - run late init for hardware IPs
3240 *
3241 * @adev: amdgpu_device pointer
3242 *
3243 * Late initialization pass for hardware IPs. The list of all the hardware
3244 * IPs that make up the asic is walked and the late_init callbacks are run.
3245 * late_init covers any special initialization that an IP requires
3246 * after all of the IPs have been initialized or something that needs to happen
3247 * late in the init process.
3248 * Returns 0 on success, negative error code on failure.
3249 */
3250 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3251 {
3252 struct amdgpu_gpu_instance *gpu_instance;
3253 int i = 0, r;
3254
3255 for (i = 0; i < adev->num_ip_blocks; i++) {
3256 if (!adev->ip_blocks[i].status.hw)
3257 continue;
3258 if (adev->ip_blocks[i].version->funcs->late_init) {
3259 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3260 if (r) {
3261 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3262 adev->ip_blocks[i].version->funcs->name, r);
3263 return r;
3264 }
3265 }
3266 adev->ip_blocks[i].status.late_initialized = true;
3267 }
3268
3269 r = amdgpu_ras_late_init(adev);
3270 if (r) {
3271 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3272 return r;
3273 }
3274
3275 if (!amdgpu_reset_in_recovery(adev))
3276 amdgpu_ras_set_error_query_ready(adev, true);
3277
3278 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3279 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3280
3281 amdgpu_device_fill_reset_magic(adev);
3282
3283 r = amdgpu_device_enable_mgpu_fan_boost();
3284 if (r)
3285 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3286
3287 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
3288 if (amdgpu_passthrough(adev) &&
3289 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3290 adev->asic_type == CHIP_ALDEBARAN))
3291 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3292
3293 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3294 mutex_lock(&mgpu_info.mutex);
3295
3296 /*
3297 * Reset the device p-state to low, as it was booted with high.
3298 *
3299 * This should be performed only after all devices from the same
3300 * hive get initialized.
3301 *
3302 * However, the number of devices in the hive is not known in advance;
3303 * it is counted one by one as the devices are initialized.
3304 *
3305 * So, we wait until all XGMI interlinked devices are initialized.
3306 * This may bring some delays as those devices may come from
3307 * different hives. But that should be OK.
3308 */
3309 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3310 for (i = 0; i < mgpu_info.num_gpu; i++) {
3311 gpu_instance = &(mgpu_info.gpu_ins[i]);
3312 if (gpu_instance->adev->flags & AMD_IS_APU)
3313 continue;
3314
3315 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3316 AMDGPU_XGMI_PSTATE_MIN);
3317 if (r) {
3318 DRM_ERROR("pstate setting failed (%d).\n", r);
3319 break;
3320 }
3321 }
3322 }
3323
3324 mutex_unlock(&mgpu_info.mutex);
3325 }
3326
3327 return 0;
3328 }
3329
3330 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3331 {
3332 int r;
3333
3334 if (!ip_block->version->funcs->hw_fini) {
3335 DRM_ERROR("hw_fini of IP block <%s> not defined\n",
3336 ip_block->version->funcs->name);
3337 } else {
3338 r = ip_block->version->funcs->hw_fini(ip_block);
3339 /* XXX handle errors */
3340 if (r) {
3341 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3342 ip_block->version->funcs->name, r);
3343 }
3344 }
3345
3346 ip_block->status.hw = false;
3347 }
3348
3349 /**
3350 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3351 *
3352 * @adev: amdgpu_device pointer
3353 *
3354 * For ASICs that need to disable SMC first
3355 */
3356 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3357 {
3358 int i;
3359
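/* ASICs with a GC IP version newer than 9.0.0 do not need the SMC
 * block brought down ahead of the other IPs.
 */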
3360 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3361 return;
3362
3363 for (i = 0; i < adev->num_ip_blocks; i++) {
3364 if (!adev->ip_blocks[i].status.hw)
3365 continue;
3366 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3367 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3368 break;
3369 }
3370 }
3371 }
3372
3373 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3374 {
3375 int i, r;
3376
3377 for (i = 0; i < adev->num_ip_blocks; i++) {
3378 if (!adev->ip_blocks[i].version->funcs->early_fini)
3379 continue;
3380
3381 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3382 if (r) {
3383 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3384 adev->ip_blocks[i].version->funcs->name, r);
3385 }
3386 }
3387
3388 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3389 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3390
3391 amdgpu_amdkfd_suspend(adev, false);
3392
3393 /* Workaround for ASICs that need to disable SMC first */
3394 amdgpu_device_smu_fini_early(adev);
3395
3396 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3397 if (!adev->ip_blocks[i].status.hw)
3398 continue;
3399
3400 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3401 }
3402
3403 if (amdgpu_sriov_vf(adev)) {
3404 if (amdgpu_virt_release_full_gpu(adev, false))
3405 DRM_ERROR("failed to release exclusive mode on fini\n");
3406 }
3407
3408 return 0;
3409 }
3410
3411 /**
3412 * amdgpu_device_ip_fini - run fini for hardware IPs
3413 *
3414 * @adev: amdgpu_device pointer
3415 *
3416 * Main teardown pass for hardware IPs. The list of all the hardware
3417 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3418 * are run. hw_fini tears down the hardware associated with each IP
3419 * and sw_fini tears down any software state associated with each IP.
3420 * Returns 0 on success, negative error code on failure.
3421 */
3422 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3423 {
3424 int i, r;
3425
3426 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3427 amdgpu_virt_release_ras_err_handler_data(adev);
3428
3429 if (adev->gmc.xgmi.num_physical_nodes > 1)
3430 amdgpu_xgmi_remove_device(adev);
3431
3432 amdgpu_amdkfd_device_fini_sw(adev);
3433
3434 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3435 if (!adev->ip_blocks[i].status.sw)
3436 continue;
3437
3438 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3439 amdgpu_ucode_free_bo(adev);
3440 amdgpu_free_static_csa(&adev->virt.csa_obj);
3441 amdgpu_device_wb_fini(adev);
3442 amdgpu_device_mem_scratch_fini(adev);
3443 amdgpu_ib_pool_fini(adev);
3444 amdgpu_seq64_fini(adev);
3445 }
3446 if (adev->ip_blocks[i].version->funcs->sw_fini) {
3447 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3448 /* XXX handle errors */
3449 if (r) {
3450 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3451 adev->ip_blocks[i].version->funcs->name, r);
3452 }
3453 }
3454 adev->ip_blocks[i].status.sw = false;
3455 adev->ip_blocks[i].status.valid = false;
3456 }
3457
3458 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3459 if (!adev->ip_blocks[i].status.late_initialized)
3460 continue;
3461 if (adev->ip_blocks[i].version->funcs->late_fini)
3462 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3463 adev->ip_blocks[i].status.late_initialized = false;
3464 }
3465
3466 amdgpu_ras_fini(adev);
3467
3468 return 0;
3469 }
3470
3471 /**
3472 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3473 *
3474 * @work: work_struct.
3475 */
3476 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3477 {
3478 struct amdgpu_device *adev =
3479 container_of(work, struct amdgpu_device, delayed_init_work.work);
3480 int r;
3481
3482 r = amdgpu_ib_ring_tests(adev);
3483 if (r)
3484 DRM_ERROR("ib ring test failed (%d).\n", r);
3485 }
3486
3487 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3488 {
3489 struct amdgpu_device *adev =
3490 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3491
3492 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3493 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3494
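/* No gfx-off requests arrived while the delayed work was pending, so it
 * is safe to let the SMU power down the GFX block now.
 */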
3495 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
3496 adev->gfx.gfx_off_state = true;
3497 }
3498
3499 /**
3500 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3501 *
3502 * @adev: amdgpu_device pointer
3503 *
3504 * Main suspend function for hardware IPs (phase 1). Clockgating and
3505 * powergating are ungated and the suspend callbacks are run for the
3506 * display (DCE) blocks only. suspend puts the hardware and software state
3507 * in each IP into a state suitable for suspend.
3508 * Returns 0 on success, negative error code on failure.
3509 */
3510 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3511 {
3512 int i, r;
3513
3514 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3515 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3516
3517 /*
3518 * Per the PMFW team's suggestion, the driver needs to disable the gfxoff
3519 * and df cstate features for the gpu reset (e.g. Mode1Reset) scenario.
3520 * Add the missing df cstate disablement here.
3521 */
3522 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3523 dev_warn(adev->dev, "Failed to disallow df cstate");
3524
3525 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3526 if (!adev->ip_blocks[i].status.valid)
3527 continue;
3528
3529 /* displays are handled separately */
3530 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3531 continue;
3532
3533 /* XXX handle errors */
3534 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3535 if (r)
3536 return r;
3537 }
3538
3539 return 0;
3540 }
3541
3542 /**
3543 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3544 *
3545 * @adev: amdgpu_device pointer
3546 *
3547 * Main suspend function for hardware IPs (phase 2). The list of all the
3548 * hardware IPs that make up the asic is walked and the suspend callbacks
3549 * are run for every block except the display (DCE) blocks, which were
3550 * already handled in phase 1.
3551 * Returns 0 on success, negative error code on failure.
3552 */
3553 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3554 {
3555 int i, r;
3556
3557 if (adev->in_s0ix)
3558 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3559
3560 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3561 if (!adev->ip_blocks[i].status.valid)
3562 continue;
3563 /* displays are handled in phase1 */
3564 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3565 continue;
3566 /* PSP lost connection when err_event_athub occurs */
3567 if (amdgpu_ras_intr_triggered() &&
3568 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3569 adev->ip_blocks[i].status.hw = false;
3570 continue;
3571 }
3572
3573 /* skip suspending blocks that were not initialized during hw init */
3574 if (!amdgpu_ip_member_of_hwini(
3575 adev, adev->ip_blocks[i].version->type))
3576 continue;
3577
3578 /* skip suspend of gfx/mes and psp for S0ix.
3579 * gfx is in the gfxoff state, so on resume it will exit gfxoff just
3580 * like at runtime. PSP is also part of the always-on hardware,
3581 * so there is no need to suspend it.
3582 */
3583 if (adev->in_s0ix &&
3584 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3585 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3586 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3587 continue;
3588
3589 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3590 if (adev->in_s0ix &&
3591 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3592 IP_VERSION(5, 0, 0)) &&
3593 (adev->ip_blocks[i].version->type ==
3594 AMD_IP_BLOCK_TYPE_SDMA))
3595 continue;
3596
3597 /* During cold boot, PSP provides the IMU and RLC FW binaries to the TOS.
3598 * These live in the TMR, so PSP-TOS is expected to reuse them from that
3599 * location on reload, and RLC autoload is likewise driven from there via
3600 * the PMFW -> PSP message during the re-init sequence.
3601 * Therefore skip PSP suspend & resume to avoid destroying the TMR and
3602 * reloading the firmware again on IMU-enabled APU ASICs.
3603 */
3604 if (amdgpu_in_reset(adev) &&
3605 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3606 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3607 continue;
3608
3609 /* XXX handle errors */
3610 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3611 adev->ip_blocks[i].status.hw = false;
3612
3613 /* handle putting the SMC in the appropriate state */
3614 if (!amdgpu_sriov_vf(adev)) {
3615 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3616 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3617 if (r) {
3618 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3619 adev->mp1_state, r);
3620 return r;
3621 }
3622 }
3623 }
3624 }
3625
3626 return 0;
3627 }
3628
3629 /**
3630 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3631 *
3632 * @adev: amdgpu_device pointer
3633 *
3634 * Main suspend function for hardware IPs. The list of all the hardware
3635 * IPs that make up the asic is walked, clockgating is disabled and the
3636 * suspend callbacks are run. suspend puts the hardware and software state
3637 * in each IP into a state suitable for suspend.
3638 * Returns 0 on success, negative error code on failure.
3639 */
3640 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3641 {
3642 int r;
3643
3644 if (amdgpu_sriov_vf(adev)) {
3645 amdgpu_virt_fini_data_exchange(adev);
3646 amdgpu_virt_request_full_gpu(adev, false);
3647 }
3648
3649 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3650
3651 r = amdgpu_device_ip_suspend_phase1(adev);
3652 if (r)
3653 return r;
3654 r = amdgpu_device_ip_suspend_phase2(adev);
3655
3656 if (amdgpu_sriov_vf(adev))
3657 amdgpu_virt_release_full_gpu(adev, false);
3658
3659 return r;
3660 }
3661
3662 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3663 {
3664 int i, r;
3665
3666 static enum amd_ip_block_type ip_order[] = {
3667 AMD_IP_BLOCK_TYPE_COMMON,
3668 AMD_IP_BLOCK_TYPE_GMC,
3669 AMD_IP_BLOCK_TYPE_PSP,
3670 AMD_IP_BLOCK_TYPE_IH,
3671 };
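
/*
 * Note (illustrative, describing the fixed order above rather than code in
 * this file): COMMON brings up the base soc hardware, GMC makes memory/GART
 * usable, PSP allows firmware to be reloaded, and IH restores interrupt
 * handling. The remaining blocks are reinitialized later in
 * amdgpu_device_ip_reinit_late_sriov().
 */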
3672
3673 for (i = 0; i < adev->num_ip_blocks; i++) {
3674 int j;
3675 struct amdgpu_ip_block *block;
3676
3677 block = &adev->ip_blocks[i];
3678 block->status.hw = false;
3679
3680 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3681
3682 if (block->version->type != ip_order[j] ||
3683 !block->status.valid)
3684 continue;
3685
3686 r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
3687 if (r) {
3688 dev_err(adev->dev, "RE-INIT-early: %s failed\n",
3689 block->version->funcs->name);
3690 return r;
3691 }
3692 block->status.hw = true;
3693 }
3694 }
3695
3696 return 0;
3697 }
3698
3699 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3700 {
3701 struct amdgpu_ip_block *block;
3702 int i, r = 0;
3703
3704 static enum amd_ip_block_type ip_order[] = {
3705 AMD_IP_BLOCK_TYPE_SMC,
3706 AMD_IP_BLOCK_TYPE_DCE,
3707 AMD_IP_BLOCK_TYPE_GFX,
3708 AMD_IP_BLOCK_TYPE_SDMA,
3709 AMD_IP_BLOCK_TYPE_MES,
3710 AMD_IP_BLOCK_TYPE_UVD,
3711 AMD_IP_BLOCK_TYPE_VCE,
3712 AMD_IP_BLOCK_TYPE_VCN,
3713 AMD_IP_BLOCK_TYPE_JPEG
3714 };
3715
3716 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3717 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
3718
3719 if (!block)
3720 continue;
3721
3722 if (block->status.valid && !block->status.hw) {
3723 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
3724 r = amdgpu_ip_block_resume(block);
3725 } else {
3726 r = block->version->funcs->hw_init(block);
3727 }
3728
3729 if (r) {
3730 dev_err(adev->dev, "RE-INIT-late: %s failed\n",
3731 block->version->funcs->name);
3732 break;
3733 }
3734 block->status.hw = true;
3735 }
3736 }
3737
3738 return r;
3739 }
3740
3741 /**
3742 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3743 *
3744 * @adev: amdgpu_device pointer
3745 *
3746 * First resume function for hardware IPs. The list of all the hardware
3747 * IPs that make up the asic is walked and the resume callbacks are run for
3748 * COMMON, GMC, IH and (under SR-IOV) PSP. resume puts the hardware into a functional state
3749 * after a suspend and updates the software state as necessary. This
3750 * function is also used for restoring the GPU after a GPU reset.
3751 * Returns 0 on success, negative error code on failure.
3752 */
3753 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3754 {
3755 int i, r;
3756
3757 for (i = 0; i < adev->num_ip_blocks; i++) {
3758 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3759 continue;
3760 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3761 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3762 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3763 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3764
3765 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3766 if (r)
3767 return r;
3768 }
3769 }
3770
3771 return 0;
3772 }
3773
3774 /**
3775 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3776 *
3777 * @adev: amdgpu_device pointer
3778 *
3779 * Second resume function for hardware IPs. The list of all the hardware
3780 * IPs that make up the asic is walked and the resume callbacks are run for
3781 * all blocks except COMMON, GMC, IH, DCE and PSP. resume puts the hardware into a
3782 * functional state after a suspend and updates the software state as
3783 * necessary. This function is also used for restoring the GPU after a GPU
3784 * reset.
3785 * Returns 0 on success, negative error code on failure.
3786 */
3787 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3788 {
3789 int i, r;
3790
3791 for (i = 0; i < adev->num_ip_blocks; i++) {
3792 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3793 continue;
3794 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3795 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3796 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3797 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3798 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3799 continue;
3800 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3801 if (r)
3802 return r;
3803 }
3804
3805 return 0;
3806 }
3807
3808 /**
3809 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3810 *
3811 * @adev: amdgpu_device pointer
3812 *
3813 * Third resume function for hardware IPs. The list of all the hardware
3814 * IPs that make up the asic is walked and the resume callbacks are run for
3815 * all DCE. resume puts the hardware into a functional state after a suspend
3816 * and updates the software state as necessary. This function is also used
3817 * for restoring the GPU after a GPU reset.
3818 *
3819 * Returns 0 on success, negative error code on failure.
3820 */
3821 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3822 {
3823 int i, r;
3824
3825 for (i = 0; i < adev->num_ip_blocks; i++) {
3826 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3827 continue;
3828 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3829 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3830 if (r)
3831 return r;
3832 }
3833 }
3834
3835 return 0;
3836 }
3837
3838 /**
3839 * amdgpu_device_ip_resume - run resume for hardware IPs
3840 *
3841 * @adev: amdgpu_device pointer
3842 *
3843 * Main resume function for hardware IPs. The hardware IPs
3844 * are split into multiple resume phases because they are
3845 * also used in recovering from a GPU reset and some additional
3846 * steps need to be taken between them. In this case (S3/S4) they are
3847 * run sequentially.
3848 * Returns 0 on success, negative error code on failure.
3849 */
3850 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3851 {
3852 int r;
3853
3854 r = amdgpu_device_ip_resume_phase1(adev);
3855 if (r)
3856 return r;
3857
3858 r = amdgpu_device_fw_loading(adev);
3859 if (r)
3860 return r;
3861
3862 r = amdgpu_device_ip_resume_phase2(adev);
3863
3864 if (adev->mman.buffer_funcs_ring->sched.ready)
3865 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3866
3867 if (r)
3868 return r;
3869
3870 amdgpu_fence_driver_hw_init(adev);
3871
3872 r = amdgpu_device_ip_resume_phase3(adev);
3873
3874 return r;
3875 }
3876
3877 /**
3878 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3879 *
3880 * @adev: amdgpu_device pointer
3881 *
3882 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3883 */
3884 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3885 {
3886 if (amdgpu_sriov_vf(adev)) {
3887 if (adev->is_atom_fw) {
3888 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3889 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3890 } else {
3891 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3892 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3893 }
3894
3895 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3896 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3897 }
3898 }
3899
3900 /**
3901 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3902 *
3903 * @asic_type: AMD asic type
3904 *
3905 * Check if there is DC (new modesetting infrastructure) support for an asic.
3906 * Returns true if DC has support, false if not.
3907 */
3908 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3909 {
3910 switch (asic_type) {
3911 #ifdef CONFIG_DRM_AMDGPU_SI
3912 case CHIP_HAINAN:
3913 #endif
3914 case CHIP_TOPAZ:
3915 /* chips with no display hardware */
3916 return false;
3917 #if defined(CONFIG_DRM_AMD_DC)
3918 case CHIP_TAHITI:
3919 case CHIP_PITCAIRN:
3920 case CHIP_VERDE:
3921 case CHIP_OLAND:
3922 /*
3923 * We have systems in the wild with these ASICs that require
3924 * LVDS and VGA support which is not supported with DC.
3925 *
3926 * Fallback to the non-DC driver here by default so as not to
3927 * cause regressions.
3928 */
3929 #if defined(CONFIG_DRM_AMD_DC_SI)
3930 return amdgpu_dc > 0;
3931 #else
3932 return false;
3933 #endif
3934 case CHIP_BONAIRE:
3935 case CHIP_KAVERI:
3936 case CHIP_KABINI:
3937 case CHIP_MULLINS:
3938 /*
3939 * We have systems in the wild with these ASICs that require
3940 * VGA support which is not supported with DC.
3941 *
3942 * Fallback to the non-DC driver here by default so as not to
3943 * cause regressions.
3944 */
3945 return amdgpu_dc > 0;
3946 default:
3947 return amdgpu_dc != 0;
3948 #else
3949 default:
3950 if (amdgpu_dc > 0)
3951 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3952 return false;
3953 #endif
3954 }
3955 }
3956
3957 /**
3958 * amdgpu_device_has_dc_support - check if dc is supported
3959 *
3960 * @adev: amdgpu_device pointer
3961 *
3962 * Returns true for supported, false for not supported
3963 */
3964 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3965 {
3966 if (adev->enable_virtual_display ||
3967 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3968 return false;
3969
3970 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3971 }
3972
3973 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3974 {
3975 struct amdgpu_device *adev =
3976 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3977 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3978
3979 /* It's a bug to not have a hive within this function */
3980 if (WARN_ON(!hive))
3981 return;
3982
3983 /*
3984 * Use task barrier to synchronize all xgmi reset works across the
3985 * hive. task_barrier_enter and task_barrier_exit will block
3986 * until all the threads running the xgmi reset works reach
3987 * those points. task_barrier_full will do both blocks.
3988 */
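/*
 * Sketch of the flow described above (assuming an N-device hive): every
 * device's xgmi_reset_work calls task_barrier_enter(), so no device enters
 * BACO until all N workers have reached that point; task_barrier_exit() is
 * the matching rendezvous before BACO exit, and task_barrier_full() performs
 * both waits back to back for the non-BACO reset path below.
 */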
3989 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3990
3991 task_barrier_enter(&hive->tb);
3992 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3993
3994 if (adev->asic_reset_res)
3995 goto fail;
3996
3997 task_barrier_exit(&hive->tb);
3998 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3999
4000 if (adev->asic_reset_res)
4001 goto fail;
4002
4003 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
4004 } else {
4005
4006 task_barrier_full(&hive->tb);
4007 adev->asic_reset_res = amdgpu_asic_reset(adev);
4008 }
4009
4010 fail:
4011 if (adev->asic_reset_res)
4012 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4013 adev->asic_reset_res, adev_to_drm(adev)->unique);
4014 amdgpu_put_xgmi_hive(hive);
4015 }
4016
4017 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4018 {
4019 char *input = amdgpu_lockup_timeout;
4020 char *timeout_setting = NULL;
4021 int index = 0;
4022 long timeout;
4023 int ret = 0;
4024
4025 /*
4026 * By default the timeout for non-compute jobs is 10000 ms
4027 * and 60000 ms for compute jobs.
4028 * In SR-IOV or passthrough mode, the timeout for compute
4029 * jobs is 60000 ms by default.
4030 */
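/*
 * Example (illustrative, based on the parsing below): passing
 * amdgpu.lockup_timeout=10000,60000,10000,10000 sets the GFX, compute,
 * SDMA and video timeouts in that order (values in ms; 0 keeps the
 * default and a negative value disables the timeout). A single value
 * applies to all non-compute queues.
 */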
4031 adev->gfx_timeout = msecs_to_jiffies(10000);
4032 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4033 if (amdgpu_sriov_vf(adev))
4034 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
4035 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
4036 else
4037 adev->compute_timeout = msecs_to_jiffies(60000);
4038
4039 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4040 while ((timeout_setting = strsep(&input, ",")) &&
4041 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4042 ret = kstrtol(timeout_setting, 0, &timeout);
4043 if (ret)
4044 return ret;
4045
4046 if (timeout == 0) {
4047 index++;
4048 continue;
4049 } else if (timeout < 0) {
4050 timeout = MAX_SCHEDULE_TIMEOUT;
4051 dev_warn(adev->dev, "lockup timeout disabled");
4052 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4053 } else {
4054 timeout = msecs_to_jiffies(timeout);
4055 }
4056
4057 switch (index++) {
4058 case 0:
4059 adev->gfx_timeout = timeout;
4060 break;
4061 case 1:
4062 adev->compute_timeout = timeout;
4063 break;
4064 case 2:
4065 adev->sdma_timeout = timeout;
4066 break;
4067 case 3:
4068 adev->video_timeout = timeout;
4069 break;
4070 default:
4071 break;
4072 }
4073 }
4074 /*
4075 * There is only one value specified and
4076 * it should apply to all non-compute jobs.
4077 */
4078 if (index == 1) {
4079 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4080 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4081 adev->compute_timeout = adev->gfx_timeout;
4082 }
4083 }
4084
4085 return ret;
4086 }
4087
4088 /**
4089 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4090 *
4091 * @adev: amdgpu_device pointer
4092 *
4093 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
4094 */
4095 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4096 {
4097 struct iommu_domain *domain;
4098
4099 domain = iommu_get_domain_for_dev(adev->dev);
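/* No domain or an identity (passthrough) domain means DMA addresses
 * equal physical addresses, i.e. system RAM is effectively direct
 * mapped for the device.
 */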
4100 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4101 adev->ram_is_direct_mapped = true;
4102 }
4103
4104 #if defined(CONFIG_HSA_AMD_P2P)
4105 /**
4106 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4107 *
4108 * @adev: amdgpu_device pointer
4109 *
4110 * Returns true if the IOMMU is remapping (translating) BAR addresses, false otherwise.
4111 */
4112 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4113 {
4114 struct iommu_domain *domain;
4115
4116 domain = iommu_get_domain_for_dev(adev->dev);
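/* A DMA or DMA_FQ domain means the IOMMU is actively translating bus
 * addresses, which the P2P (CONFIG_HSA_AMD_P2P) paths need to know when
 * deciding how peer BARs are reached.
 */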
4117 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4118 domain->type == IOMMU_DOMAIN_DMA_FQ))
4119 return true;
4120
4121 return false;
4122 }
4123 #endif
4124
4125 static const struct attribute *amdgpu_dev_attributes[] = {
4126 &dev_attr_pcie_replay_count.attr,
4127 NULL
4128 };
4129
4130 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4131 {
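/* amdgpu_mcbp module parameter: 1 forces mid-command-buffer preemption
 * on, 0 forces it off; any other value (auto is assumed to be the
 * default) defers to the SR-IOV check below.
 */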
4132 if (amdgpu_mcbp == 1)
4133 adev->gfx.mcbp = true;
4134 else if (amdgpu_mcbp == 0)
4135 adev->gfx.mcbp = false;
4136
4137 if (amdgpu_sriov_vf(adev))
4138 adev->gfx.mcbp = true;
4139
4140 if (adev->gfx.mcbp)
4141 DRM_INFO("MCBP is enabled\n");
4142 }
4143
4144 /**
4145 * amdgpu_device_init - initialize the driver
4146 *
4147 * @adev: amdgpu_device pointer
4148 * @flags: driver flags
4149 *
4150 * Initializes the driver info and hw (all asics).
4151 * Returns 0 for success or an error on failure.
4152 * Called at driver startup.
4153 */
4154 int amdgpu_device_init(struct amdgpu_device *adev,
4155 uint32_t flags)
4156 {
4157 struct drm_device *ddev = adev_to_drm(adev);
4158 struct pci_dev *pdev = adev->pdev;
4159 int r, i;
4160 bool px = false;
4161 u32 max_MBps;
4162 int tmp;
4163
4164 adev->shutdown = false;
4165 adev->flags = flags;
4166
4167 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4168 adev->asic_type = amdgpu_force_asic_type;
4169 else
4170 adev->asic_type = flags & AMD_ASIC_MASK;
4171
4172 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4173 if (amdgpu_emu_mode == 1)
4174 adev->usec_timeout *= 10;
4175 adev->gmc.gart_size = 512 * 1024 * 1024;
4176 adev->accel_working = false;
4177 adev->num_rings = 0;
4178 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4179 adev->mman.buffer_funcs = NULL;
4180 adev->mman.buffer_funcs_ring = NULL;
4181 adev->vm_manager.vm_pte_funcs = NULL;
4182 adev->vm_manager.vm_pte_num_scheds = 0;
4183 adev->gmc.gmc_funcs = NULL;
4184 adev->harvest_ip_mask = 0x0;
4185 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4186 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4187
4188 adev->smc_rreg = &amdgpu_invalid_rreg;
4189 adev->smc_wreg = &amdgpu_invalid_wreg;
4190 adev->pcie_rreg = &amdgpu_invalid_rreg;
4191 adev->pcie_wreg = &amdgpu_invalid_wreg;
4192 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4193 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4194 adev->pciep_rreg = &amdgpu_invalid_rreg;
4195 adev->pciep_wreg = &amdgpu_invalid_wreg;
4196 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4197 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4198 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4199 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4200 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4201 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4202 adev->didt_rreg = &amdgpu_invalid_rreg;
4203 adev->didt_wreg = &amdgpu_invalid_wreg;
4204 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4205 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4206 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4207 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4208
4209 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4210 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4211 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4212
4213 /* mutex initializations are all done here so we
4214 * can recall functions without running into locking issues
4215 */
4216 mutex_init(&adev->firmware.mutex);
4217 mutex_init(&adev->pm.mutex);
4218 mutex_init(&adev->gfx.gpu_clock_mutex);
4219 mutex_init(&adev->srbm_mutex);
4220 mutex_init(&adev->gfx.pipe_reserve_mutex);
4221 mutex_init(&adev->gfx.gfx_off_mutex);
4222 mutex_init(&adev->gfx.partition_mutex);
4223 mutex_init(&adev->grbm_idx_mutex);
4224 mutex_init(&adev->mn_lock);
4225 mutex_init(&adev->virt.vf_errors.lock);
4226 mutex_init(&adev->virt.rlcg_reg_lock);
4227 hash_init(adev->mn_hash);
4228 mutex_init(&adev->psp.mutex);
4229 mutex_init(&adev->notifier_lock);
4230 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4231 mutex_init(&adev->benchmark_mutex);
4232 mutex_init(&adev->gfx.reset_sem_mutex);
4233 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4234 mutex_init(&adev->enforce_isolation_mutex);
4235 mutex_init(&adev->gfx.kfd_sch_mutex);
4236
4237 amdgpu_device_init_apu_flags(adev);
4238
4239 r = amdgpu_device_check_arguments(adev);
4240 if (r)
4241 return r;
4242
4243 spin_lock_init(&adev->mmio_idx_lock);
4244 spin_lock_init(&adev->smc_idx_lock);
4245 spin_lock_init(&adev->pcie_idx_lock);
4246 spin_lock_init(&adev->uvd_ctx_idx_lock);
4247 spin_lock_init(&adev->didt_idx_lock);
4248 spin_lock_init(&adev->gc_cac_idx_lock);
4249 spin_lock_init(&adev->se_cac_idx_lock);
4250 spin_lock_init(&adev->audio_endpt_idx_lock);
4251 spin_lock_init(&adev->mm_stats.lock);
4252 spin_lock_init(&adev->wb.lock);
4253
4254 INIT_LIST_HEAD(&adev->reset_list);
4255
4256 INIT_LIST_HEAD(&adev->ras_list);
4257
4258 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4259
4260 INIT_DELAYED_WORK(&adev->delayed_init_work,
4261 amdgpu_device_delayed_init_work_handler);
4262 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4263 amdgpu_device_delay_enable_gfx_off);
4264 /*
4265 * Initialize the enforce_isolation work structures for each XCP
4266 * partition. This work handler is responsible for enforcing shader
4267 * isolation on AMD GPUs. It counts the number of emitted fences for
4268 * each GFX and compute ring. If there are any fences, it schedules
4269 * the `enforce_isolation_work` to be run after a delay. If there are
4270 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4271 * runqueue.
4272 */
4273 for (i = 0; i < MAX_XCP; i++) {
4274 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4275 amdgpu_gfx_enforce_isolation_handler);
4276 adev->gfx.enforce_isolation[i].adev = adev;
4277 adev->gfx.enforce_isolation[i].xcp_id = i;
4278 }
4279
4280 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4281
4282 adev->gfx.gfx_off_req_count = 1;
4283 adev->gfx.gfx_off_residency = 0;
4284 adev->gfx.gfx_off_entrycount = 0;
4285 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4286
4287 atomic_set(&adev->throttling_logging_enabled, 1);
4288 /*
4289 * If throttling continues, logging will be performed every minute
4290 * to avoid log flooding. "-1" is subtracted since the thermal
4291 * throttling interrupt comes every second. Thus, the total logging
4292 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4293 * for throttling interrupt) = 60 seconds.
4294 */
4295 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4296 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);
4297
4298 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4299 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);
4300
4301 /* Registers mapping */
4302 /* TODO: block userspace mapping of io register */
4303 if (adev->asic_type >= CHIP_BONAIRE) {
4304 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4305 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4306 } else {
4307 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4308 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4309 }
4310
4311 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4312 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4313
4314 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4315 if (!adev->rmmio)
4316 return -ENOMEM;
4317
4318 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4319 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4320
4321 /*
4322 * The reset domain needs to be present early, before the XGMI hive is
4323 * discovered (if any) and initialized, so that the reset semaphore and
4324 * in-gpu-reset flag can be used during early init and before any RREG32 call.
4325 */
4326 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4327 if (!adev->reset_domain)
4328 return -ENOMEM;
4329
4330 /* detect hw virtualization here */
4331 amdgpu_detect_virtualization(adev);
4332
4333 amdgpu_device_get_pcie_info(adev);
4334
4335 r = amdgpu_device_get_job_timeout_settings(adev);
4336 if (r) {
4337 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4338 return r;
4339 }
4340
4341 amdgpu_device_set_mcbp(adev);
4342
4343 /*
4344 * By default, use the default init level where all blocks are expected to
4345 * be initialized. At present, the sw init of the blocks has to complete
4346 * before the need for a different init level can be detected.
4347 */
4348 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4349 /* early init functions */
4350 r = amdgpu_device_ip_early_init(adev);
4351 if (r)
4352 return r;
4353
4354 /* Get rid of things like offb */
4355 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4356 if (r)
4357 return r;
4358
4359 /* Enable TMZ based on IP_VERSION */
4360 amdgpu_gmc_tmz_set(adev);
4361
4362 if (amdgpu_sriov_vf(adev) &&
4363 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4364 /* VF MMIO access (except mailbox range) from CPU
4365 * will be blocked during sriov runtime
4366 */
4367 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4368
4369 amdgpu_gmc_noretry_set(adev);
4370 /* Need to get xgmi info early to decide the reset behavior */
4371 if (adev->gmc.xgmi.supported) {
4372 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4373 if (r)
4374 return r;
4375 }
4376
4377 /* enable PCIE atomic ops */
4378 if (amdgpu_sriov_vf(adev)) {
4379 if (adev->virt.fw_reserve.p_pf2vf)
4380 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4381 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4382 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4383 /* APUs with gfx9 and newer do not rely on PCIe atomics; their
4384 * internal path natively supports atomics, so set have_atomics_support to true.
4385 */
4386 } else if ((adev->flags & AMD_IS_APU) &&
4387 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4388 IP_VERSION(9, 0, 0))) {
4389 adev->have_atomics_support = true;
4390 } else {
4391 adev->have_atomics_support =
4392 !pci_enable_atomic_ops_to_root(adev->pdev,
4393 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4394 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4395 }
4396
4397 if (!adev->have_atomics_support)
4398 dev_info(adev->dev, "PCIe atomic ops are not supported\n");
4399
4400 /* doorbell bar mapping and doorbell index init */
4401 amdgpu_doorbell_init(adev);
4402
4403 if (amdgpu_emu_mode == 1) {
4404 /* post the asic on emulation mode */
4405 emu_soc_asic_init(adev);
4406 goto fence_driver_init;
4407 }
4408
4409 amdgpu_reset_init(adev);
4410
4411 /* detect if we are with an SRIOV vbios */
4412 if (adev->bios)
4413 amdgpu_device_detect_sriov_bios(adev);
4414
4415 /* check if we need to reset the asic
4416 * E.g., driver was not cleanly unloaded previously, etc.
4417 */
4418 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4419 if (adev->gmc.xgmi.num_physical_nodes) {
4420 dev_info(adev->dev, "Pending hive reset.\n");
4421 amdgpu_set_init_level(adev,
4422 AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4423 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4424 !amdgpu_device_has_display_hardware(adev)) {
4425 r = psp_gpu_reset(adev);
4426 } else {
4427 tmp = amdgpu_reset_method;
4428 /* It should do a default reset when loading or reloading the driver,
4429 * regardless of the module parameter reset_method.
4430 */
4431 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4432 r = amdgpu_asic_reset(adev);
4433 amdgpu_reset_method = tmp;
4434 }
4435
4436 if (r) {
4437 dev_err(adev->dev, "asic reset on init failed\n");
4438 goto failed;
4439 }
4440 }
4441
4442 /* Post card if necessary */
4443 if (amdgpu_device_need_post(adev)) {
4444 if (!adev->bios) {
4445 dev_err(adev->dev, "no vBIOS found\n");
4446 r = -EINVAL;
4447 goto failed;
4448 }
4449 DRM_INFO("GPU posting now...\n");
4450 r = amdgpu_device_asic_init(adev);
4451 if (r) {
4452 dev_err(adev->dev, "gpu post error!\n");
4453 goto failed;
4454 }
4455 }
4456
4457 if (adev->bios) {
4458 if (adev->is_atom_fw) {
4459 /* Initialize clocks */
4460 r = amdgpu_atomfirmware_get_clock_info(adev);
4461 if (r) {
4462 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4463 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4464 goto failed;
4465 }
4466 } else {
4467 /* Initialize clocks */
4468 r = amdgpu_atombios_get_clock_info(adev);
4469 if (r) {
4470 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4471 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4472 goto failed;
4473 }
4474 /* init i2c buses */
4475 if (!amdgpu_device_has_dc_support(adev))
4476 amdgpu_atombios_i2c_init(adev);
4477 }
4478 }
4479
4480 fence_driver_init:
4481 /* Fence driver */
4482 r = amdgpu_fence_driver_sw_init(adev);
4483 if (r) {
4484 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4485 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4486 goto failed;
4487 }
4488
4489 /* init the mode config */
4490 drm_mode_config_init(adev_to_drm(adev));
4491
4492 r = amdgpu_device_ip_init(adev);
4493 if (r) {
4494 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4495 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4496 goto release_ras_con;
4497 }
4498
4499 amdgpu_fence_driver_hw_init(adev);
4500
4501 dev_info(adev->dev,
4502 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4503 adev->gfx.config.max_shader_engines,
4504 adev->gfx.config.max_sh_per_se,
4505 adev->gfx.config.max_cu_per_sh,
4506 adev->gfx.cu_info.number);
4507
4508 adev->accel_working = true;
4509
4510 amdgpu_vm_check_compute_bug(adev);
4511
4512 /* Initialize the buffer migration limit. */
4513 if (amdgpu_moverate >= 0)
4514 max_MBps = amdgpu_moverate;
4515 else
4516 max_MBps = 8; /* Allow 8 MB/s. */
4517 /* Get a log2 for easy divisions. */
4518 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
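/* e.g. with the default max_MBps of 8, log2_max_MBps is 3, so byte
 * counts can later be converted with a shift instead of a division.
 */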
4519
4520 /*
4521 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4522 * Otherwise the mgpu fan boost feature will be skipped because the
4523 * gpu instance count would be too low.
4524 */
4525 amdgpu_register_gpu_instance(adev);
4526
4527 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4528 * explicit gating rather than handling it automatically.
4529 */
4530 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4531 r = amdgpu_device_ip_late_init(adev);
4532 if (r) {
4533 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4534 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4535 goto release_ras_con;
4536 }
4537 /* must succeed. */
4538 amdgpu_ras_resume(adev);
4539 queue_delayed_work(system_wq, &adev->delayed_init_work,
4540 msecs_to_jiffies(AMDGPU_RESUME_MS));
4541 }
4542
4543 if (amdgpu_sriov_vf(adev)) {
4544 amdgpu_virt_release_full_gpu(adev, true);
4545 flush_delayed_work(&adev->delayed_init_work);
4546 }
4547
4548 /*
4549 * Register these sysfs interfaces after `late_init`, as some of the
4550 * operations performed in `late_init` might affect how the sysfs
4551 * interfaces are created.
4552 */
4553 r = amdgpu_atombios_sysfs_init(adev);
4554 if (r)
4555 drm_err(&adev->ddev,
4556 "registering atombios sysfs failed (%d).\n", r);
4557
4558 r = amdgpu_pm_sysfs_init(adev);
4559 if (r)
4560 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4561
4562 r = amdgpu_ucode_sysfs_init(adev);
4563 if (r) {
4564 adev->ucode_sysfs_en = false;
4565 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4566 } else
4567 adev->ucode_sysfs_en = true;
4568
4569 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4570 if (r)
4571 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4572
4573 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4574 if (r)
4575 dev_err(adev->dev,
4576 "Could not create amdgpu board attributes\n");
4577
4578 amdgpu_fru_sysfs_init(adev);
4579 amdgpu_reg_state_sysfs_init(adev);
4580 amdgpu_xcp_cfg_sysfs_init(adev);
4581
4582 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4583 r = amdgpu_pmu_init(adev);
4584 if (r)
4585 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4586
4587 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4588 if (amdgpu_device_cache_pci_state(adev->pdev))
4589 pci_restore_state(pdev);
4590
4591 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4592 /* this will fail for cards that aren't VGA class devices, just
4593 * ignore it
4594 */
4595 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4596 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4597
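/* On PX (PowerXpress/ATPX) platforms, and on muxed laptops detected via
 * apple-gmux, register with vga_switcheroo so the discrete GPU can be
 * powered down or switched away when idle.
 */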
4598 px = amdgpu_device_supports_px(ddev);
4599
4600 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4601 apple_gmux_detect(NULL, NULL)))
4602 vga_switcheroo_register_client(adev->pdev,
4603 &amdgpu_switcheroo_ops, px);
4604
4605 if (px)
4606 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4607
4608 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4609 amdgpu_xgmi_reset_on_init(adev);
4610
4611 amdgpu_device_check_iommu_direct_map(adev);
4612
4613 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4614 r = register_pm_notifier(&adev->pm_nb);
4615 if (r)
4616 goto failed;
4617
4618 return 0;
4619
4620 release_ras_con:
4621 if (amdgpu_sriov_vf(adev))
4622 amdgpu_virt_release_full_gpu(adev, true);
4623
4624 /* failed in exclusive mode due to timeout */
4625 if (amdgpu_sriov_vf(adev) &&
4626 !amdgpu_sriov_runtime(adev) &&
4627 amdgpu_virt_mmio_blocked(adev) &&
4628 !amdgpu_virt_wait_reset(adev)) {
4629 dev_err(adev->dev, "VF exclusive mode timeout\n");
4630 /* Don't send request since VF is inactive. */
4631 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4632 adev->virt.ops = NULL;
4633 r = -EAGAIN;
4634 }
4635 amdgpu_release_ras_context(adev);
4636
4637 failed:
4638 amdgpu_vf_error_trans_all(adev);
4639
4640 return r;
4641 }
4642
4643 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4644 {
4645
4646 /* Clear all CPU mappings pointing to this device */
4647 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4648
4649 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4650 amdgpu_doorbell_fini(adev);
4651
4652 iounmap(adev->rmmio);
4653 adev->rmmio = NULL;
4654 if (adev->mman.aper_base_kaddr)
4655 iounmap(adev->mman.aper_base_kaddr);
4656 adev->mman.aper_base_kaddr = NULL;
4657
4658 /* Memory manager related */
4659 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4660 arch_phys_wc_del(adev->gmc.vram_mtrr);
4661 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4662 }
4663 }
4664
4665 /**
4666 * amdgpu_device_fini_hw - tear down the driver
4667 *
4668 * @adev: amdgpu_device pointer
4669 *
4670 * Tear down the driver info (all asics).
4671 * Called at driver shutdown.
4672 */
4673 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4674 {
4675 dev_info(adev->dev, "amdgpu: finishing device.\n");
4676 flush_delayed_work(&adev->delayed_init_work);
4677
4678 if (adev->mman.initialized)
4679 drain_workqueue(adev->mman.bdev.wq);
4680 adev->shutdown = true;
4681
4682 unregister_pm_notifier(&adev->pm_nb);
4683
4684 /* make sure the IB tests have finished before entering exclusive mode
4685 * to avoid preemption during the IB tests
4686 */
4687 if (amdgpu_sriov_vf(adev)) {
4688 amdgpu_virt_request_full_gpu(adev, false);
4689 amdgpu_virt_fini_data_exchange(adev);
4690 }
4691
4692 /* disable all interrupts */
4693 amdgpu_irq_disable_all(adev);
4694 if (adev->mode_info.mode_config_initialized) {
4695 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4696 drm_helper_force_disable_all(adev_to_drm(adev));
4697 else
4698 drm_atomic_helper_shutdown(adev_to_drm(adev));
4699 }
4700 amdgpu_fence_driver_hw_fini(adev);
4701
4702 if (adev->pm.sysfs_initialized)
4703 amdgpu_pm_sysfs_fini(adev);
4704 if (adev->ucode_sysfs_en)
4705 amdgpu_ucode_sysfs_fini(adev);
4706 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4707 amdgpu_fru_sysfs_fini(adev);
4708
4709 amdgpu_reg_state_sysfs_fini(adev);
4710 amdgpu_xcp_cfg_sysfs_fini(adev);
4711
4712 /* disable ras feature must before hw fini */
4713 amdgpu_ras_pre_fini(adev);
4714
4715 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4716
4717 amdgpu_device_ip_fini_early(adev);
4718
4719 amdgpu_irq_fini_hw(adev);
4720
4721 if (adev->mman.initialized)
4722 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4723
4724 amdgpu_gart_dummy_page_fini(adev);
4725
4726 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4727 amdgpu_device_unmap_mmio(adev);
4728
4729 }
4730
4731 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4732 {
4733 int idx;
4734 bool px;
4735
4736 amdgpu_device_ip_fini(adev);
4737 amdgpu_fence_driver_sw_fini(adev);
4738 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4739 adev->accel_working = false;
4740 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4741
4742 amdgpu_reset_fini(adev);
4743
4744 /* free i2c buses */
4745 if (!amdgpu_device_has_dc_support(adev))
4746 amdgpu_i2c_fini(adev);
4747
4748 if (amdgpu_emu_mode != 1)
4749 amdgpu_atombios_fini(adev);
4750
4751 kfree(adev->bios);
4752 adev->bios = NULL;
4753
4754 kfree(adev->fru_info);
4755 adev->fru_info = NULL;
4756
4757 px = amdgpu_device_supports_px(adev_to_drm(adev));
4758
4759 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4760 apple_gmux_detect(NULL, NULL)))
4761 vga_switcheroo_unregister_client(adev->pdev);
4762
4763 if (px)
4764 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4765
4766 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4767 vga_client_unregister(adev->pdev);
4768
4769 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4770
4771 iounmap(adev->rmmio);
4772 adev->rmmio = NULL;
4773 amdgpu_doorbell_fini(adev);
4774 drm_dev_exit(idx);
4775 }
4776
4777 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4778 amdgpu_pmu_fini(adev);
4779 if (adev->mman.discovery_bin)
4780 amdgpu_discovery_fini(adev);
4781
4782 amdgpu_reset_put_reset_domain(adev->reset_domain);
4783 adev->reset_domain = NULL;
4784
4785 kfree(adev->pci_state);
4786
4787 }
4788
4789 /**
4790 * amdgpu_device_evict_resources - evict device resources
4791 * @adev: amdgpu device object
4792 *
4793 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4794 * of the vram memory type. Mainly used for evicting device resources
4795 * at suspend time.
4796 *
4797 */
4798 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4799 {
4800 int ret;
4801
4802 /* No need to evict vram on APUs unless going to S4 */
4803 if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
4804 return 0;
4805
4806 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4807 if (ret)
4808 DRM_WARN("evicting device resources failed\n");
4809 return ret;
4810 }
4811
4812 /*
4813 * Suspend & resume.
4814 */
4815 /**
4816 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
4817 * @nb: notifier block
4818 * @mode: suspend mode
4819 * @data: data
4820 *
4821 * This function is called when the system is about to suspend or hibernate.
4822 * It is used to evict resources from the device before the system goes to
4823 * sleep while there is still access to swap.
4824 */
4825 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
4826 void *data)
4827 {
4828 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
4829 int r;
4830
4831 switch (mode) {
4832 case PM_HIBERNATION_PREPARE:
4833 adev->in_s4 = true;
4834 fallthrough;
4835 case PM_SUSPEND_PREPARE:
4836 r = amdgpu_device_evict_resources(adev);
4837 /*
4838 * This is considered non-fatal at this time because
4839 * amdgpu_device_prepare() also evicts resources and treats any failure there as fatal.
4840 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781
4841 */
4842 if (r)
4843 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r);
4844 break;
4845 }
4846
4847 return NOTIFY_DONE;
4848 }
4849
4850 /**
4851 * amdgpu_device_prepare - prepare for device suspend
4852 *
4853 * @dev: drm dev pointer
4854 *
4855 * Prepare to put the hw in the suspend state (all asics).
4856 * Returns 0 for success or an error on failure.
4857 * Called at driver suspend.
4858 */
4859 int amdgpu_device_prepare(struct drm_device *dev)
4860 {
4861 struct amdgpu_device *adev = drm_to_adev(dev);
4862 int i, r;
4863
4864 amdgpu_choose_low_power_state(adev);
4865
4866 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4867 return 0;
4868
4869 /* Evict the majority of BOs before starting suspend sequence */
4870 r = amdgpu_device_evict_resources(adev);
4871 if (r)
4872 goto unprepare;
4873
4874 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4875
4876 for (i = 0; i < adev->num_ip_blocks; i++) {
4877 if (!adev->ip_blocks[i].status.valid)
4878 continue;
4879 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4880 continue;
4881 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
4882 if (r)
4883 goto unprepare;
4884 }
4885
4886 return 0;
4887
4888 unprepare:
4889 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false;
4890
4891 return r;
4892 }
4893
4894 /**
4895 * amdgpu_device_suspend - initiate device suspend
4896 *
4897 * @dev: drm dev pointer
4898 * @notify_clients: notify in-kernel DRM clients
4899 *
4900 * Puts the hw in the suspend state (all asics).
4901 * Returns 0 for success or an error on failure.
4902 * Called at driver suspend.
4903 */
4904 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
4905 {
4906 struct amdgpu_device *adev = drm_to_adev(dev);
4907 int r = 0;
4908
4909 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4910 return 0;
4911
4912 adev->in_suspend = true;
4913
4914 if (amdgpu_sriov_vf(adev)) {
4915 amdgpu_virt_fini_data_exchange(adev);
4916 r = amdgpu_virt_request_full_gpu(adev, false);
4917 if (r)
4918 return r;
4919 }
4920
4921 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4922 DRM_WARN("smart shift update failed\n");
4923
4924 if (notify_clients)
4925 drm_client_dev_suspend(adev_to_drm(adev), false);
4926
4927 cancel_delayed_work_sync(&adev->delayed_init_work);
4928
4929 amdgpu_ras_suspend(adev);
4930
4931 amdgpu_device_ip_suspend_phase1(adev);
4932
4933 if (!adev->in_s0ix)
4934 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4935
4936 r = amdgpu_device_evict_resources(adev);
4937 if (r)
4938 return r;
4939
4940 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4941
4942 amdgpu_fence_driver_hw_fini(adev);
4943
4944 amdgpu_device_ip_suspend_phase2(adev);
4945
4946 if (amdgpu_sriov_vf(adev))
4947 amdgpu_virt_release_full_gpu(adev, false);
4948
4949 r = amdgpu_dpm_notify_rlc_state(adev, false);
4950 if (r)
4951 return r;
4952
4953 return 0;
4954 }
4955
4956 /**
4957 * amdgpu_device_resume - initiate device resume
4958 *
4959 * @dev: drm dev pointer
4960 * @notify_clients: notify in-kernel DRM clients
4961 *
4962 * Bring the hw back to operating state (all asics).
4963 * Returns 0 for success or an error on failure.
4964 * Called at driver resume.
4965 */
4966 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
4967 {
4968 struct amdgpu_device *adev = drm_to_adev(dev);
4969 int r = 0;
4970
4971 if (amdgpu_sriov_vf(adev)) {
4972 r = amdgpu_virt_request_full_gpu(adev, true);
4973 if (r)
4974 return r;
4975 }
4976
4977 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4978 return 0;
4979
4980 if (adev->in_s0ix)
4981 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4982
4983 /* post card */
4984 if (amdgpu_device_need_post(adev)) {
4985 r = amdgpu_device_asic_init(adev);
4986 if (r)
4987 dev_err(adev->dev, "amdgpu asic init failed\n");
4988 }
4989
4990 r = amdgpu_device_ip_resume(adev);
4991
4992 if (r) {
4993 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4994 goto exit;
4995 }
4996
4997 if (!adev->in_s0ix) {
4998 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4999 if (r)
5000 goto exit;
5001 }
5002
5003 r = amdgpu_device_ip_late_init(adev);
5004 if (r)
5005 goto exit;
5006
5007 queue_delayed_work(system_wq, &adev->delayed_init_work,
5008 msecs_to_jiffies(AMDGPU_RESUME_MS));
5009 exit:
5010 if (amdgpu_sriov_vf(adev)) {
5011 amdgpu_virt_init_data_exchange(adev);
5012 amdgpu_virt_release_full_gpu(adev, true);
5013 }
5014
5015 if (r)
5016 return r;
5017
5018 /* Make sure IB tests flushed */
5019 flush_delayed_work(&adev->delayed_init_work);
5020
5021 if (notify_clients)
5022 drm_client_dev_resume(adev_to_drm(adev), false);
5023
5024 amdgpu_ras_resume(adev);
5025
5026 if (adev->mode_info.num_crtc) {
5027 /*
5028 * Most of the connector probing functions try to acquire runtime pm
5029 * refs to ensure that the GPU is powered on when connector polling is
5030 * performed. Since we're calling this from a runtime PM callback,
5031 * trying to acquire rpm refs will cause us to deadlock.
5032 *
5033 * Since we're guaranteed to be holding the rpm lock, it's safe to
5034 * temporarily disable the rpm helpers so this doesn't deadlock us.
5035 */
5036 #ifdef CONFIG_PM
5037 dev->dev->power.disable_depth++;
5038 #endif
5039 if (!adev->dc_enabled)
5040 drm_helper_hpd_irq_event(dev);
5041 else
5042 drm_kms_helper_hotplug_event(dev);
5043 #ifdef CONFIG_PM
5044 dev->dev->power.disable_depth--;
5045 #endif
5046 }
5047 adev->in_suspend = false;
5048
5049 if (adev->enable_mes)
5050 amdgpu_mes_self_test(adev);
5051
5052 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
5053 DRM_WARN("smart shift update failed\n");
5054
5055 return 0;
5056 }
5057
5058 /**
5059 * amdgpu_device_ip_check_soft_reset - check if any IP block is hung
5060 *
5061 * @adev: amdgpu_device pointer
5062 *
5063 * The list of all the hardware IPs that make up the asic is walked and
5064 * the check_soft_reset callbacks are run. check_soft_reset determines
5065 * if the asic is still hung or not.
5066 * Returns true if any of the IPs are still in a hung state, false if not.
5067 */
5068 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5069 {
5070 int i;
5071 bool asic_hang = false;
5072
5073 if (amdgpu_sriov_vf(adev))
5074 return true;
5075
5076 if (amdgpu_asic_need_full_reset(adev))
5077 return true;
5078
5079 for (i = 0; i < adev->num_ip_blocks; i++) {
5080 if (!adev->ip_blocks[i].status.valid)
5081 continue;
5082 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5083 adev->ip_blocks[i].status.hang =
5084 adev->ip_blocks[i].version->funcs->check_soft_reset(
5085 &adev->ip_blocks[i]);
5086 if (adev->ip_blocks[i].status.hang) {
5087 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5088 asic_hang = true;
5089 }
5090 }
5091 return asic_hang;
5092 }
5093
5094 /**
5095 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5096 *
5097 * @adev: amdgpu_device pointer
5098 *
5099 * The list of all the hardware IPs that make up the asic is walked and the
5100 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5101 * handles any IP specific hardware or software state changes that are
5102 * necessary for a soft reset to succeed.
5103 * Returns 0 on success, negative error code on failure.
5104 */
5105 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5106 {
5107 int i, r = 0;
5108
5109 for (i = 0; i < adev->num_ip_blocks; i++) {
5110 if (!adev->ip_blocks[i].status.valid)
5111 continue;
5112 if (adev->ip_blocks[i].status.hang &&
5113 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5114 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5115 if (r)
5116 return r;
5117 }
5118 }
5119
5120 return 0;
5121 }
5122
5123 /**
5124 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5125 *
5126 * @adev: amdgpu_device pointer
5127 *
5128 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5129 * reset is necessary to recover.
5130 * Returns true if a full asic reset is required, false if not.
5131 */
5132 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5133 {
5134 int i;
5135
5136 if (amdgpu_asic_need_full_reset(adev))
5137 return true;
5138
5139 for (i = 0; i < adev->num_ip_blocks; i++) {
5140 if (!adev->ip_blocks[i].status.valid)
5141 continue;
5142 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5143 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5144 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5145 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5146 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5147 if (adev->ip_blocks[i].status.hang) {
5148 dev_info(adev->dev, "Some block need full reset!\n");
5149 return true;
5150 }
5151 }
5152 }
5153 return false;
5154 }
5155
5156 /**
5157 * amdgpu_device_ip_soft_reset - do a soft reset
5158 *
5159 * @adev: amdgpu_device pointer
5160 *
5161 * The list of all the hardware IPs that make up the asic is walked and the
5162 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5163 * IP specific hardware or software state changes that are necessary to soft
5164 * reset the IP.
5165 * Returns 0 on success, negative error code on failure.
5166 */
5167 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5168 {
5169 int i, r = 0;
5170
5171 for (i = 0; i < adev->num_ip_blocks; i++) {
5172 if (!adev->ip_blocks[i].status.valid)
5173 continue;
5174 if (adev->ip_blocks[i].status.hang &&
5175 adev->ip_blocks[i].version->funcs->soft_reset) {
5176 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5177 if (r)
5178 return r;
5179 }
5180 }
5181
5182 return 0;
5183 }
5184
5185 /**
5186 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5187 *
5188 * @adev: amdgpu_device pointer
5189 *
5190 * The list of all the hardware IPs that make up the asic is walked and the
5191 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5192 * handles any IP specific hardware or software state changes that are
5193 * necessary after the IP has been soft reset.
5194 * Returns 0 on success, negative error code on failure.
5195 */
5196 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5197 {
5198 int i, r = 0;
5199
5200 for (i = 0; i < adev->num_ip_blocks; i++) {
5201 if (!adev->ip_blocks[i].status.valid)
5202 continue;
5203 if (adev->ip_blocks[i].status.hang &&
5204 adev->ip_blocks[i].version->funcs->post_soft_reset)
5205 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5206 if (r)
5207 return r;
5208 }
5209
5210 return 0;
5211 }
5212
5213 /**
5214 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5215 *
5216 * @adev: amdgpu_device pointer
5217 * @reset_context: amdgpu reset context pointer
5218 *
5219 * Do a VF FLR and reinitialize the ASIC.
5220 * Returns 0 on success, otherwise a negative error code.
5221 */
5222 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5223 struct amdgpu_reset_context *reset_context)
5224 {
5225 int r;
5226 struct amdgpu_hive_info *hive = NULL;
5227
5228 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5229 if (!amdgpu_ras_get_fed_status(adev))
5230 amdgpu_virt_ready_to_reset(adev);
5231 amdgpu_virt_wait_reset(adev);
5232 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5233 r = amdgpu_virt_request_full_gpu(adev, true);
5234 } else {
5235 r = amdgpu_virt_reset_gpu(adev);
5236 }
5237 if (r)
5238 return r;
5239
5240 amdgpu_ras_clear_err_state(adev);
5241 amdgpu_irq_gpu_reset_resume_helper(adev);
5242
5243 /* some sw clean up VF needs to do before recover */
5244 amdgpu_virt_post_reset(adev);
5245
5246 /* Resume IP prior to SMC */
5247 r = amdgpu_device_ip_reinit_early_sriov(adev);
5248 if (r)
5249 return r;
5250
5251 amdgpu_virt_init_data_exchange(adev);
5252
5253 r = amdgpu_device_fw_loading(adev);
5254 if (r)
5255 return r;
5256
5257 /* now we are okay to resume SMC/CP/SDMA */
5258 r = amdgpu_device_ip_reinit_late_sriov(adev);
5259 if (r)
5260 return r;
5261
5262 hive = amdgpu_get_xgmi_hive(adev);
5263 /* Update PSP FW topology after reset */
5264 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5265 r = amdgpu_xgmi_update_topology(hive, adev);
5266 if (hive)
5267 amdgpu_put_xgmi_hive(hive);
5268 if (r)
5269 return r;
5270
5271 r = amdgpu_ib_ring_tests(adev);
5272 if (r)
5273 return r;
5274
5275 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5276 amdgpu_inc_vram_lost(adev);
5277
5278 /* need to be called during full access so we can't do it later like
5279 * bare-metal does.
5280 */
5281 amdgpu_amdkfd_post_reset(adev);
5282 amdgpu_virt_release_full_gpu(adev, true);
5283
5284 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5285 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5286 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5287 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5288 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5289 amdgpu_ras_resume(adev);
5290
5291 amdgpu_virt_ras_telemetry_post_reset(adev);
5292
5293 return 0;
5294 }
5295
5296 /**
5297 * amdgpu_device_has_job_running - check if there is any unfinished job
5298 *
5299 * @adev: amdgpu_device pointer
5300 *
5301 * Check if there is any job running on the device when the guest driver
5302 * receives an FLR notification from the host driver. If there are still jobs
5303 * running, the guest driver will not respond to the FLR reset. Instead, let
5304 * the job hit the timeout and have the guest driver issue the reset request.
5305 */
5306 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5307 {
5308 int i;
5309
5310 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5311 struct amdgpu_ring *ring = adev->rings[i];
5312
5313 if (!amdgpu_ring_sched_ready(ring))
5314 continue;
5315
5316 if (amdgpu_fence_count_emitted(ring))
5317 return true;
5318 }
5319 return false;
5320 }
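
/*
 * Illustrative sketch only (not part of the driver): a hypothetical VF FLR
 * notification path could consult amdgpu_device_has_job_running() to decide
 * whether to acknowledge the host right away or defer to the job-timeout
 * handler, as the comment above describes. The helper name below is invented
 * for this example.
 */
static inline bool amdgpu_example_can_ack_flr_now(struct amdgpu_device *adev)
{
	/* Only ack immediately when no ring still has emitted, unsignaled fences */
	return !amdgpu_device_has_job_running(adev);
}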
5321
5322 /**
5323 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5324 *
5325 * @adev: amdgpu_device pointer
5326 *
5327 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5328 * a hung GPU.
5329 */
5330 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5331 {
5332
5333 if (amdgpu_gpu_recovery == 0)
5334 goto disabled;
5335
5336 /* Skip soft reset check in fatal error mode */
5337 if (!amdgpu_ras_is_poison_mode_supported(adev))
5338 return true;
5339
5340 if (amdgpu_sriov_vf(adev))
5341 return true;
5342
5343 if (amdgpu_gpu_recovery == -1) {
5344 switch (adev->asic_type) {
5345 #ifdef CONFIG_DRM_AMDGPU_SI
5346 case CHIP_VERDE:
5347 case CHIP_TAHITI:
5348 case CHIP_PITCAIRN:
5349 case CHIP_OLAND:
5350 case CHIP_HAINAN:
5351 #endif
5352 #ifdef CONFIG_DRM_AMDGPU_CIK
5353 case CHIP_KAVERI:
5354 case CHIP_KABINI:
5355 case CHIP_MULLINS:
5356 #endif
5357 case CHIP_CARRIZO:
5358 case CHIP_STONEY:
5359 case CHIP_CYAN_SKILLFISH:
5360 goto disabled;
5361 default:
5362 break;
5363 }
5364 }
5365
5366 return true;
5367
5368 disabled:
5369 dev_info(adev->dev, "GPU recovery disabled.\n");
5370 return false;
5371 }
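
/*
 * Illustrative sketch only: a hypothetical job-timeout path would gate the
 * heavyweight recovery on the policy check above. Everything except
 * amdgpu_device_should_recover_gpu() and amdgpu_device_gpu_recover() is an
 * assumption made for this example.
 */
static inline bool amdgpu_example_try_recover(struct amdgpu_device *adev,
					      struct amdgpu_job *job,
					      struct amdgpu_reset_context *ctx)
{
	/* Recovery may be disabled by module parameter or unsupported on this ASIC */
	if (!amdgpu_device_should_recover_gpu(adev))
		return false;

	return amdgpu_device_gpu_recover(adev, job, ctx) == 0;
}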
5372
5373 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5374 {
5375 u32 i;
5376 int ret = 0;
5377
5378 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5379
5380 dev_info(adev->dev, "GPU mode1 reset\n");
5381
5382 /* Cache the state before bus master disable. The saved config space
5383 * values are used in other cases like restore after mode-2 reset.
5384 */
5385 amdgpu_device_cache_pci_state(adev->pdev);
5386
5387 /* disable BM */
5388 pci_clear_master(adev->pdev);
5389
5390 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5391 dev_info(adev->dev, "GPU smu mode1 reset\n");
5392 ret = amdgpu_dpm_mode1_reset(adev);
5393 } else {
5394 dev_info(adev->dev, "GPU psp mode1 reset\n");
5395 ret = psp_gpu_reset(adev);
5396 }
5397
5398 if (ret)
5399 goto mode1_reset_failed;
5400
5401 amdgpu_device_load_pci_state(adev->pdev);
5402 ret = amdgpu_psp_wait_for_bootloader(adev);
5403 if (ret)
5404 goto mode1_reset_failed;
5405
5406 /* wait for asic to come out of reset */
5407 for (i = 0; i < adev->usec_timeout; i++) {
5408 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5409
5410 if (memsize != 0xffffffff)
5411 break;
5412 udelay(1);
5413 }
5414
5415 if (i >= adev->usec_timeout) {
5416 ret = -ETIMEDOUT;
5417 goto mode1_reset_failed;
5418 }
5419
5420 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5421
5422 return 0;
5423
5424 mode1_reset_failed:
5425 dev_err(adev->dev, "GPU mode1 reset failed\n");
5426 return ret;
5427 }
5428
5429 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5430 struct amdgpu_reset_context *reset_context)
5431 {
5432 int i, r = 0;
5433 struct amdgpu_job *job = NULL;
5434 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5435 bool need_full_reset =
5436 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5437
5438 if (reset_context->reset_req_dev == adev)
5439 job = reset_context->job;
5440
5441 if (amdgpu_sriov_vf(adev))
5442 amdgpu_virt_pre_reset(adev);
5443
5444 amdgpu_fence_driver_isr_toggle(adev, true);
5445
5446 /* block all schedulers and reset given job's ring */
5447 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5448 struct amdgpu_ring *ring = adev->rings[i];
5449
5450 if (!amdgpu_ring_sched_ready(ring))
5451 continue;
5452
5453 /* Clear the job fences from the fence driver to avoid force_completion
5454 * leaving NULL and VM flush fences in the fence driver.
5455 */
5456 amdgpu_fence_driver_clear_job_fences(ring);
5457
5458 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5459 amdgpu_fence_driver_force_completion(ring);
5460 }
5461
5462 amdgpu_fence_driver_isr_toggle(adev, false);
5463
5464 if (job && job->vm)
5465 drm_sched_increase_karma(&job->base);
5466
5467 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5468 /* If reset handler not implemented, continue; otherwise return */
5469 if (r == -EOPNOTSUPP)
5470 r = 0;
5471 else
5472 return r;
5473
5474 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5475 if (!amdgpu_sriov_vf(adev)) {
5476
5477 if (!need_full_reset)
5478 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5479
5480 if (!need_full_reset && amdgpu_gpu_recovery &&
5481 amdgpu_device_ip_check_soft_reset(adev)) {
5482 amdgpu_device_ip_pre_soft_reset(adev);
5483 r = amdgpu_device_ip_soft_reset(adev);
5484 amdgpu_device_ip_post_soft_reset(adev);
5485 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5486 dev_info(adev->dev, "soft reset failed, will fall back to full reset!\n");
5487 need_full_reset = true;
5488 }
5489 }
5490
5491 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5492 dev_info(tmp_adev->dev, "Dumping IP State\n");
5493 /* Trigger ip dump before we reset the asic */
5494 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5495 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5496 tmp_adev->ip_blocks[i].version->funcs
5497 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5498 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5499 }
5500
5501 if (need_full_reset)
5502 r = amdgpu_device_ip_suspend(adev);
5503 if (need_full_reset)
5504 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5505 else
5506 clear_bit(AMDGPU_NEED_FULL_RESET,
5507 &reset_context->flags);
5508 }
5509
5510 return r;
5511 }
5512
5513 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
5514 {
5515 struct list_head *device_list_handle;
5516 bool full_reset, vram_lost = false;
5517 struct amdgpu_device *tmp_adev;
5518 int r, init_level;
5519
5520 device_list_handle = reset_context->reset_device_list;
5521
5522 if (!device_list_handle)
5523 return -EINVAL;
5524
5525 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5526
5527 /*
5528 * If this is a reset on init, use the default init level; otherwise keep
5529 * the level as the recovery level.
5530 */
5531 if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
5532 init_level = AMDGPU_INIT_LEVEL_DEFAULT;
5533 else
5534 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
5535
5536 r = 0;
5537 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5538 amdgpu_set_init_level(tmp_adev, init_level);
5539 if (full_reset) {
5540 /* post card */
5541 amdgpu_ras_clear_err_state(tmp_adev);
5542 r = amdgpu_device_asic_init(tmp_adev);
5543 if (r) {
5544 dev_warn(tmp_adev->dev, "asic atom init failed!");
5545 } else {
5546 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5547
5548 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5549 if (r)
5550 goto out;
5551
5552 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5553
5554 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5555 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5556
5557 if (vram_lost) {
5558 DRM_INFO("VRAM is lost due to GPU reset!\n");
5559 amdgpu_inc_vram_lost(tmp_adev);
5560 }
5561
5562 r = amdgpu_device_fw_loading(tmp_adev);
5563 if (r)
5564 return r;
5565
5566 r = amdgpu_xcp_restore_partition_mode(
5567 tmp_adev->xcp_mgr);
5568 if (r)
5569 goto out;
5570
5571 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5572 if (r)
5573 goto out;
5574
5575 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5576 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5577
5578 r = amdgpu_device_ip_resume_phase3(tmp_adev);
5579 if (r)
5580 goto out;
5581
5582 if (vram_lost)
5583 amdgpu_device_fill_reset_magic(tmp_adev);
5584
5585 /*
5586 * Add this ASIC back as tracked since the reset has
5587 * already completed successfully.
5588 */
5589 amdgpu_register_gpu_instance(tmp_adev);
5590
5591 if (!reset_context->hive &&
5592 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5593 amdgpu_xgmi_add_device(tmp_adev);
5594
5595 r = amdgpu_device_ip_late_init(tmp_adev);
5596 if (r)
5597 goto out;
5598
5599 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
5600
5601 /*
5602 * The GPU enters a bad state once the number of
5603 * faulty pages reported by ECC reaches the
5604 * threshold, and RAS recovery is scheduled next.
5605 * So add a check here to break recovery if the
5606 * bad page threshold is indeed exceeded, and
5607 * remind the user to retire this GPU or set a
5608 * bigger bad_page_threshold value to fix this
5609 * the next time the driver is probed.
5610 */
5611 if (!amdgpu_ras_is_rma(tmp_adev)) {
5612 /* must succeed. */
5613 amdgpu_ras_resume(tmp_adev);
5614 } else {
5615 r = -EINVAL;
5616 goto out;
5617 }
5618
5619 /* Update PSP FW topology after reset */
5620 if (reset_context->hive &&
5621 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5622 r = amdgpu_xgmi_update_topology(
5623 reset_context->hive, tmp_adev);
5624 }
5625 }
5626
5627 out:
5628 if (!r) {
5629 /* IP init is complete now, set level as default */
5630 amdgpu_set_init_level(tmp_adev,
5631 AMDGPU_INIT_LEVEL_DEFAULT);
5632 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5633 r = amdgpu_ib_ring_tests(tmp_adev);
5634 if (r) {
5635 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5636 r = -EAGAIN;
5637 goto end;
5638 }
5639 }
5640
5641 if (r)
5642 tmp_adev->asic_reset_res = r;
5643 }
5644
5645 end:
5646 return r;
5647 }
5648
5649 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5650 struct amdgpu_reset_context *reset_context)
5651 {
5652 struct amdgpu_device *tmp_adev = NULL;
5653 bool need_full_reset, skip_hw_reset;
5654 int r = 0;
5655
5656 /* Try reset handler method first */
5657 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5658 reset_list);
5659
5660 reset_context->reset_device_list = device_list_handle;
5661 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5662 /* If reset handler not implemented, continue; otherwise return */
5663 if (r == -EOPNOTSUPP)
5664 r = 0;
5665 else
5666 return r;
5667
5668 /* Reset handler not implemented, use the default method */
5669 need_full_reset =
5670 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5671 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5672
5673 /*
5674 * ASIC reset has to be done on all XGMI hive nodes ASAP
5675 * to allow proper link negotiation in the FW (within 1 sec)
5676 */
5677 if (!skip_hw_reset && need_full_reset) {
5678 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5679 /* For XGMI run all resets in parallel to speed up the process */
5680 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5681 if (!queue_work(system_unbound_wq,
5682 &tmp_adev->xgmi_reset_work))
5683 r = -EALREADY;
5684 } else
5685 r = amdgpu_asic_reset(tmp_adev);
5686
5687 if (r) {
5688 dev_err(tmp_adev->dev,
5689 "ASIC reset failed with error, %d for drm dev, %s",
5690 r, adev_to_drm(tmp_adev)->unique);
5691 goto out;
5692 }
5693 }
5694
5695 /* For XGMI wait for all resets to complete before proceed */
5696 if (!r) {
5697 list_for_each_entry(tmp_adev, device_list_handle,
5698 reset_list) {
5699 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5700 flush_work(&tmp_adev->xgmi_reset_work);
5701 r = tmp_adev->asic_reset_res;
5702 if (r)
5703 break;
5704 }
5705 }
5706 }
5707 }
5708
5709 if (!r && amdgpu_ras_intr_triggered()) {
5710 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5711 amdgpu_ras_reset_error_count(tmp_adev,
5712 AMDGPU_RAS_BLOCK__MMHUB);
5713 }
5714
5715 amdgpu_ras_intr_cleared();
5716 }
5717
5718 r = amdgpu_device_reinit_after_reset(reset_context);
5719 if (r == -EAGAIN)
5720 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5721 else
5722 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5723
5724 out:
5725 return r;
5726 }
5727
5728 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5729 {
5730
5731 switch (amdgpu_asic_reset_method(adev)) {
5732 case AMD_RESET_METHOD_MODE1:
5733 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5734 break;
5735 case AMD_RESET_METHOD_MODE2:
5736 adev->mp1_state = PP_MP1_STATE_RESET;
5737 break;
5738 default:
5739 adev->mp1_state = PP_MP1_STATE_NONE;
5740 break;
5741 }
5742 }
5743
5744 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5745 {
5746 amdgpu_vf_error_trans_all(adev);
5747 adev->mp1_state = PP_MP1_STATE_NONE;
5748 }
5749
5750 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5751 {
5752 struct pci_dev *p = NULL;
5753
5754 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5755 adev->pdev->bus->number, 1);
5756 if (p) {
5757 pm_runtime_enable(&(p->dev));
5758 pm_runtime_resume(&(p->dev));
5759 }
5760
5761 pci_dev_put(p);
5762 }
5763
5764 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5765 {
5766 enum amd_reset_method reset_method;
5767 struct pci_dev *p = NULL;
5768 u64 expires;
5769
5770 /*
5771 * For now, only BACO and mode1 reset are confirmed
5772 * to suffer from the audio issue without a proper suspend.
5773 */
5774 reset_method = amdgpu_asic_reset_method(adev);
5775 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5776 (reset_method != AMD_RESET_METHOD_MODE1))
5777 return -EINVAL;
5778
5779 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5780 adev->pdev->bus->number, 1);
5781 if (!p)
5782 return -ENODEV;
5783
5784 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5785 if (!expires)
5786 /*
5787 * If we cannot get the audio device autosuspend delay,
5788 * a fixed 4s interval will be used. Since 3s is the
5789 * audio controller's default autosuspend delay setting,
5790 * the 4s used here is guaranteed to cover it.
5791 */
5792 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5793
5794 while (!pm_runtime_status_suspended(&(p->dev))) {
5795 if (!pm_runtime_suspend(&(p->dev)))
5796 break;
5797
5798 if (expires < ktime_get_mono_fast_ns()) {
5799 dev_warn(adev->dev, "failed to suspend display audio\n");
5800 pci_dev_put(p);
5801 /* TODO: abort the succeeding gpu reset? */
5802 return -ETIMEDOUT;
5803 }
5804 }
5805
5806 pm_runtime_disable(&(p->dev));
5807
5808 pci_dev_put(p);
5809 return 0;
5810 }
5811
5812 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5813 {
5814 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5815
5816 #if defined(CONFIG_DEBUG_FS)
5817 if (!amdgpu_sriov_vf(adev))
5818 cancel_work(&adev->reset_work);
5819 #endif
5820
5821 if (adev->kfd.dev)
5822 cancel_work(&adev->kfd.reset_work);
5823
5824 if (amdgpu_sriov_vf(adev))
5825 cancel_work(&adev->virt.flr_work);
5826
5827 if (con && adev->ras_enabled)
5828 cancel_work(&con->recovery_work);
5829
5830 }
5831
5832 static int amdgpu_device_health_check(struct list_head *device_list_handle)
5833 {
5834 struct amdgpu_device *tmp_adev;
5835 int ret = 0;
5836 u32 status;
5837
5838 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5839 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5840 if (PCI_POSSIBLE_ERROR(status)) {
5841 dev_err(tmp_adev->dev, "device lost from bus!");
5842 ret = -ENODEV;
5843 }
5844 }
5845
5846 return ret;
5847 }
5848
5849 /**
5850 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5851 *
5852 * @adev: amdgpu_device pointer
5853 * @job: which job trigger hang
5854 * @reset_context: amdgpu reset context pointer
5855 *
5856 * Attempt to reset the GPU if it has hung (all ASICs).
5857 * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5858 * Returns 0 for success or an error on failure.
5859 */
5860
5861 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5862 struct amdgpu_job *job,
5863 struct amdgpu_reset_context *reset_context)
5864 {
5865 struct list_head device_list, *device_list_handle = NULL;
5866 bool job_signaled = false;
5867 struct amdgpu_hive_info *hive = NULL;
5868 struct amdgpu_device *tmp_adev = NULL;
5869 int i, r = 0;
5870 bool need_emergency_restart = false;
5871 bool audio_suspended = false;
5872 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5873
5874 /*
5875 * If it reaches here because of hang/timeout and a RAS error is
5876 * detected at the same time, let RAS recovery take care of it.
5877 */
5878 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
5879 !amdgpu_sriov_vf(adev) &&
5880 reset_context->src != AMDGPU_RESET_SRC_RAS) {
5881 dev_dbg(adev->dev,
5882 "Gpu recovery from source: %d yielding to RAS error recovery handling",
5883 reset_context->src);
5884 return 0;
5885 }
5886 /*
5887 * Special case: RAS triggered and full reset isn't supported
5888 */
5889 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5890
5891 /*
5892 * Flush RAM to disk so that after reboot
5893 * the user can read the log and see why the system rebooted.
5894 */
5895 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5896 amdgpu_ras_get_context(adev)->reboot) {
5897 DRM_WARN("Emergency reboot.");
5898
5899 ksys_sync_helper();
5900 emergency_restart();
5901 }
5902
5903 dev_info(adev->dev, "GPU %s begin!\n",
5904 need_emergency_restart ? "jobs stop":"reset");
5905
5906 if (!amdgpu_sriov_vf(adev))
5907 hive = amdgpu_get_xgmi_hive(adev);
5908 if (hive)
5909 mutex_lock(&hive->hive_lock);
5910
5911 reset_context->job = job;
5912 reset_context->hive = hive;
5913 /*
5914 * Build list of devices to reset.
5915 * In case we are in XGMI hive mode, resort the device list
5916 * to put adev in the 1st position.
5917 */
5918 INIT_LIST_HEAD(&device_list);
5919 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5920 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5921 list_add_tail(&tmp_adev->reset_list, &device_list);
5922 if (adev->shutdown)
5923 tmp_adev->shutdown = true;
5924 }
5925 if (!list_is_first(&adev->reset_list, &device_list))
5926 list_rotate_to_front(&adev->reset_list, &device_list);
5927 device_list_handle = &device_list;
5928 } else {
5929 list_add_tail(&adev->reset_list, &device_list);
5930 device_list_handle = &device_list;
5931 }
5932
5933 if (!amdgpu_sriov_vf(adev)) {
5934 r = amdgpu_device_health_check(device_list_handle);
5935 if (r)
5936 goto end_reset;
5937 }
5938
5939 /* We need to lock reset domain only once both for XGMI and single device */
5940 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5941 reset_list);
5942 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5943
5944 /* block all schedulers and reset given job's ring */
5945 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5946
5947 amdgpu_device_set_mp1_state(tmp_adev);
5948
5949 /*
5950 * Try to put the audio codec into a suspended state
5951 * before the GPU reset starts.
5952 *
5953 * Because the power domain of the graphics device
5954 * is shared with the AZ power domain, without this
5955 * we may change the audio hardware behind the
5956 * audio driver's back, which will trigger audio
5957 * codec errors.
5958 */
5959 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5960 audio_suspended = true;
5961
5962 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5963
5964 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5965
5966 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5967
5968 /*
5969 * Mark these ASICs to be reset as untracked first,
5970 * and add them back after the reset has completed.
5971 */
5972 amdgpu_unregister_gpu_instance(tmp_adev);
5973
5974 drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
5975
5976 /* disable ras on ALL IPs */
5977 if (!need_emergency_restart &&
5978 amdgpu_device_ip_need_full_reset(tmp_adev))
5979 amdgpu_ras_suspend(tmp_adev);
5980
5981 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5982 struct amdgpu_ring *ring = tmp_adev->rings[i];
5983
5984 if (!amdgpu_ring_sched_ready(ring))
5985 continue;
5986
5987 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5988
5989 if (need_emergency_restart)
5990 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5991 }
5992 atomic_inc(&tmp_adev->gpu_reset_counter);
5993 }
5994
5995 if (need_emergency_restart)
5996 goto skip_sched_resume;
5997
5998 /*
5999 * Must check guilty signal here since after this point all old
6000 * HW fences are force signaled.
6001 *
6002 * job->base holds a reference to parent fence
6003 */
6004 if (job && dma_fence_is_signaled(&job->hw_fence)) {
6005 job_signaled = true;
6006 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
6007 goto skip_hw_reset;
6008 }
6009
6010 retry: /* Rest of adevs pre asic reset from XGMI hive. */
6011 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6012 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
6013 /* TODO: Should we stop? */
6014 if (r) {
6015 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
6016 r, adev_to_drm(tmp_adev)->unique);
6017 tmp_adev->asic_reset_res = r;
6018 }
6019 }
6020
6021 /* Actual ASIC resets if needed.*/
6022 /* Host driver will handle XGMI hive reset for SRIOV */
6023 if (amdgpu_sriov_vf(adev)) {
6024 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
6025 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
6026 amdgpu_ras_set_fed(adev, true);
6027 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
6028 }
6029
6030 r = amdgpu_device_reset_sriov(adev, reset_context);
6031 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
6032 amdgpu_virt_release_full_gpu(adev, true);
6033 goto retry;
6034 }
6035 if (r)
6036 adev->asic_reset_res = r;
6037 } else {
6038 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
6039 if (r && r == -EAGAIN)
6040 goto retry;
6041 }
6042
6043 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6044 /*
6045 * Drop any pending non scheduler resets queued before reset is done.
6046 * Any reset scheduled after this point would be valid. Scheduler resets
6047 * were already dropped during drm_sched_stop and no new ones can come
6048 * in before drm_sched_start.
6049 */
6050 amdgpu_device_stop_pending_resets(tmp_adev);
6051 }
6052
6053 skip_hw_reset:
6054
6055 /* Post ASIC reset for all devs .*/
6056 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6057
6058 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6059 struct amdgpu_ring *ring = tmp_adev->rings[i];
6060
6061 if (!amdgpu_ring_sched_ready(ring))
6062 continue;
6063
6064 drm_sched_start(&ring->sched, 0);
6065 }
6066
6067 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6068 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6069
6070 if (tmp_adev->asic_reset_res)
6071 r = tmp_adev->asic_reset_res;
6072
6073 tmp_adev->asic_reset_res = 0;
6074
6075 if (r) {
6076 /* Bad news: how do we tell userspace? For a RAS
6077 * error, we should report a bad GPU status instead
6078 * of a reset failure.
6079 */
6080 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6081 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6082 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6083 atomic_read(&tmp_adev->gpu_reset_counter));
6084 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6085 } else {
6086 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6087 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6088 DRM_WARN("smart shift update failed\n");
6089 }
6090 }
6091
6092 skip_sched_resume:
6093 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6094 /* unlock kfd: SRIOV would do it separately */
6095 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6096 amdgpu_amdkfd_post_reset(tmp_adev);
6097
6098 /* kfd_post_reset will do nothing if the kfd device is not initialized,
6099 * so bring up kfd here if it was not initialized before.
6100 */
6101 if (!adev->kfd.init_complete)
6102 amdgpu_amdkfd_device_init(adev);
6103
6104 if (audio_suspended)
6105 amdgpu_device_resume_display_audio(tmp_adev);
6106
6107 amdgpu_device_unset_mp1_state(tmp_adev);
6108
6109 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6110 }
6111
6112 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6113 reset_list);
6114 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6115
6116 end_reset:
6117 if (hive) {
6118 mutex_unlock(&hive->hive_lock);
6119 amdgpu_put_xgmi_hive(hive);
6120 }
6121
6122 if (r)
6123 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6124
6125 atomic_set(&adev->reset_domain->reset_res, r);
6126 return r;
6127 }
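
/*
 * Illustrative sketch only: minimal reset-context setup for a full recovery,
 * mirroring the pattern used by amdgpu_pci_slot_reset() later in this file.
 * The AMDGPU_RESET_SRC_JOB source value is an assumption for this example.
 */
static inline int amdgpu_example_recover_from_job(struct amdgpu_device *adev,
						  struct amdgpu_job *job)
{
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));
	reset_context.method = AMD_RESET_METHOD_NONE;	/* let the handlers pick */
	reset_context.reset_req_dev = adev;
	reset_context.src = AMDGPU_RESET_SRC_JOB;	/* assumed enum value */
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	return amdgpu_device_gpu_recover(adev, job, &reset_context);
}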
6128
6129 /**
6130 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
6131 *
6132 * @adev: amdgpu_device pointer
6133 * @speed: pointer to the speed of the link
6134 * @width: pointer to the width of the link
6135 *
6136 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6137 * first physical partner to an AMD dGPU.
6138 * This will exclude any virtual switches and links.
6139 */
6140 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6141 enum pci_bus_speed *speed,
6142 enum pcie_link_width *width)
6143 {
6144 struct pci_dev *parent = adev->pdev;
6145
6146 if (!speed || !width)
6147 return;
6148
6149 *speed = PCI_SPEED_UNKNOWN;
6150 *width = PCIE_LNK_WIDTH_UNKNOWN;
6151
6152 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6153 while ((parent = pci_upstream_bridge(parent))) {
6154 /* skip upstream/downstream switches internal to dGPU */
6155 if (parent->vendor == PCI_VENDOR_ID_ATI)
6156 continue;
6157 *speed = pcie_get_speed_cap(parent);
6158 *width = pcie_get_width_cap(parent);
6159 break;
6160 }
6161 } else {
6162 /* use the current speeds rather than max if switching is not supported */
6163 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6164 }
6165 }
6166
6167 /**
6168 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
6169 *
6170 * @adev: amdgpu_device pointer
6171 * @speed: pointer to the speed of the link
6172 * @width: pointer to the width of the link
6173 *
6174 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6175 * AMD dGPU which may be a virtual upstream bridge.
6176 */
6177 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6178 enum pci_bus_speed *speed,
6179 enum pcie_link_width *width)
6180 {
6181 struct pci_dev *parent = adev->pdev;
6182
6183 if (!speed || !width)
6184 return;
6185
6186 parent = pci_upstream_bridge(parent);
6187 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6188 /* use the upstream/downstream switches internal to dGPU */
6189 *speed = pcie_get_speed_cap(parent);
6190 *width = pcie_get_width_cap(parent);
6191 while ((parent = pci_upstream_bridge(parent))) {
6192 if (parent->vendor == PCI_VENDOR_ID_ATI) {
6193 /* use the upstream/downstream switches internal to dGPU */
6194 *speed = pcie_get_speed_cap(parent);
6195 *width = pcie_get_width_cap(parent);
6196 }
6197 }
6198 } else {
6199 /* use the device itself */
6200 *speed = pcie_get_speed_cap(adev->pdev);
6201 *width = pcie_get_width_cap(adev->pdev);
6202 }
6203 }
6204
6205 /**
6206 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
6207 *
6208 * @adev: amdgpu_device pointer
6209 *
6210 * Fetches and stores in the driver the PCIE capabilities (gen speed
6211 * and lanes) of the slot the device is in. Handles APUs and
6212 * virtualized environments where PCIE config space may not be available.
6213 */
6214 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6215 {
6216 enum pci_bus_speed speed_cap, platform_speed_cap;
6217 enum pcie_link_width platform_link_width, link_width;
6218
6219 if (amdgpu_pcie_gen_cap)
6220 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6221
6222 if (amdgpu_pcie_lane_cap)
6223 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6224
6225 /* covers APUs as well */
6226 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6227 if (adev->pm.pcie_gen_mask == 0)
6228 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6229 if (adev->pm.pcie_mlw_mask == 0)
6230 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6231 return;
6232 }
6233
6234 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6235 return;
6236
6237 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6238 &platform_link_width);
6239 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);
6240
6241 if (adev->pm.pcie_gen_mask == 0) {
6242 /* asic caps */
6243 if (speed_cap == PCI_SPEED_UNKNOWN) {
6244 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6245 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6246 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6247 } else {
6248 if (speed_cap == PCIE_SPEED_32_0GT)
6249 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6250 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6251 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6252 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6253 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6254 else if (speed_cap == PCIE_SPEED_16_0GT)
6255 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6256 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6257 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6258 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6259 else if (speed_cap == PCIE_SPEED_8_0GT)
6260 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6261 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6262 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6263 else if (speed_cap == PCIE_SPEED_5_0GT)
6264 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6265 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6266 else
6267 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6268 }
6269 /* platform caps */
6270 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6271 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6272 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6273 } else {
6274 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6275 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6276 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6277 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6278 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6279 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6280 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6281 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6282 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6283 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6284 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6285 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6286 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6287 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6288 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6289 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6290 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6291 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6292 else
6293 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6294
6295 }
6296 }
6297 if (adev->pm.pcie_mlw_mask == 0) {
6298 /* asic caps */
6299 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6300 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
6301 } else {
6302 switch (link_width) {
6303 case PCIE_LNK_X32:
6304 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
6305 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6306 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6307 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6308 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6309 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6310 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6311 break;
6312 case PCIE_LNK_X16:
6313 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6314 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6315 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6316 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6317 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6318 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6319 break;
6320 case PCIE_LNK_X12:
6321 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6322 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6323 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6324 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6325 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6326 break;
6327 case PCIE_LNK_X8:
6328 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6329 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6330 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6331 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6332 break;
6333 case PCIE_LNK_X4:
6334 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6335 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6336 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6337 break;
6338 case PCIE_LNK_X2:
6339 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6340 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6341 break;
6342 case PCIE_LNK_X1:
6343 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
6344 break;
6345 default:
6346 break;
6347 }
6348 }
6349 /* platform caps */
6350 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6351 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6352 } else {
6353 switch (platform_link_width) {
6354 case PCIE_LNK_X32:
6355 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6356 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6357 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6358 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6359 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6360 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6361 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6362 break;
6363 case PCIE_LNK_X16:
6364 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6365 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6366 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6367 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6368 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6369 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6370 break;
6371 case PCIE_LNK_X12:
6372 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6373 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6374 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6375 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6376 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6377 break;
6378 case PCIE_LNK_X8:
6379 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6380 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6381 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6382 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6383 break;
6384 case PCIE_LNK_X4:
6385 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6386 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6387 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6388 break;
6389 case PCIE_LNK_X2:
6390 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6391 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6392 break;
6393 case PCIE_LNK_X1:
6394 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6395 break;
6396 default:
6397 break;
6398 }
6399 }
6400 }
6401 }
6402
6403 /**
6404 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6405 *
6406 * @adev: amdgpu_device pointer
6407 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6408 *
6409 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6410 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6411 * @peer_adev.
6412 */
6413 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6414 struct amdgpu_device *peer_adev)
6415 {
6416 #ifdef CONFIG_HSA_AMD_P2P
6417 bool p2p_access =
6418 !adev->gmc.xgmi.connected_to_cpu &&
6419 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6420 if (!p2p_access)
6421 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
6422 pci_name(peer_adev->pdev));
6423
6424 bool is_large_bar = adev->gmc.visible_vram_size &&
6425 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6426 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6427
6428 if (!p2p_addressable) {
6429 uint64_t address_mask = peer_adev->dev->dma_mask ?
6430 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6431 resource_size_t aper_limit =
6432 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6433
6434 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6435 aper_limit & address_mask);
6436 }
6437 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6438 #else
6439 return false;
6440 #endif
6441 }
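
/*
 * Illustrative sketch only: peer-to-peer DMA is normally only usable when each
 * device can reach the other's BAR, so a hypothetical caller would check both
 * directions with the helper above before enabling P2P transfers.
 */
static inline bool amdgpu_example_p2p_both_ways(struct amdgpu_device *a,
						struct amdgpu_device *b)
{
	return amdgpu_device_is_peer_accessible(a, b) &&
	       amdgpu_device_is_peer_accessible(b, a);
}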
6442
6443 int amdgpu_device_baco_enter(struct drm_device *dev)
6444 {
6445 struct amdgpu_device *adev = drm_to_adev(dev);
6446 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6447
6448 if (!amdgpu_device_supports_baco(dev))
6449 return -ENOTSUPP;
6450
6451 if (ras && adev->ras_enabled &&
6452 adev->nbio.funcs->enable_doorbell_interrupt)
6453 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6454
6455 return amdgpu_dpm_baco_enter(adev);
6456 }
6457
6458 int amdgpu_device_baco_exit(struct drm_device *dev)
6459 {
6460 struct amdgpu_device *adev = drm_to_adev(dev);
6461 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6462 int ret = 0;
6463
6464 if (!amdgpu_device_supports_baco(dev))
6465 return -ENOTSUPP;
6466
6467 ret = amdgpu_dpm_baco_exit(adev);
6468 if (ret)
6469 return ret;
6470
6471 if (ras && adev->ras_enabled &&
6472 adev->nbio.funcs->enable_doorbell_interrupt)
6473 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6474
6475 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6476 adev->nbio.funcs->clear_doorbell_interrupt)
6477 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6478
6479 return 0;
6480 }
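
/*
 * Illustrative sketch only: BACO entry and exit are meant to be used as a pair
 * around a low-power window, e.g. from runtime PM. The error handling here is
 * simplified for the example.
 */
static inline int amdgpu_example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);
	if (r)
		return r;

	/* ... the device sits in BACO until a wake event ... */

	return amdgpu_device_baco_exit(dev);
}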
6481
6482 /**
6483 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6484 * @pdev: PCI device struct
6485 * @state: PCI channel state
6486 *
6487 * Description: Called when a PCI error is detected.
6488 *
6489 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6490 */
6491 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6492 {
6493 struct drm_device *dev = pci_get_drvdata(pdev);
6494 struct amdgpu_device *adev = drm_to_adev(dev);
6495 int i;
6496
6497 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6498
6499 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6500 DRM_WARN("No support for XGMI hive yet...");
6501 return PCI_ERS_RESULT_DISCONNECT;
6502 }
6503
6504 adev->pci_channel_state = state;
6505
6506 switch (state) {
6507 case pci_channel_io_normal:
6508 return PCI_ERS_RESULT_CAN_RECOVER;
6509 /* Fatal error, prepare for slot reset */
6510 case pci_channel_io_frozen:
6511 /*
6512 * Locking adev->reset_domain->sem will prevent any external access
6513 * to GPU during PCI error recovery
6514 */
6515 amdgpu_device_lock_reset_domain(adev->reset_domain);
6516 amdgpu_device_set_mp1_state(adev);
6517
6518 /*
6519 * Block any work scheduling as we do for regular GPU reset
6520 * for the duration of the recovery
6521 */
6522 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6523 struct amdgpu_ring *ring = adev->rings[i];
6524
6525 if (!amdgpu_ring_sched_ready(ring))
6526 continue;
6527
6528 drm_sched_stop(&ring->sched, NULL);
6529 }
6530 atomic_inc(&adev->gpu_reset_counter);
6531 return PCI_ERS_RESULT_NEED_RESET;
6532 case pci_channel_io_perm_failure:
6533 /* Permanent error, prepare for device removal */
6534 return PCI_ERS_RESULT_DISCONNECT;
6535 }
6536
6537 return PCI_ERS_RESULT_NEED_RESET;
6538 }
6539
6540 /**
6541 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6542 * @pdev: pointer to PCI device
6543 */
6544 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6545 {
6546
6547 DRM_INFO("PCI error: mmio enabled callback!!\n");
6548
6549 /* TODO - dump whatever for debugging purposes */
6550
6551 /* This is called only if amdgpu_pci_error_detected returns
6552 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
6553 * work, so there is no need to reset the slot.
6554 */
6555
6556 return PCI_ERS_RESULT_RECOVERED;
6557 }
6558
6559 /**
6560 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6561 * @pdev: PCI device struct
6562 *
6563 * Description: This routine is called by the pci error recovery
6564 * code after the PCI slot has been reset, just before we
6565 * should resume normal operations.
6566 */
6567 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6568 {
6569 struct drm_device *dev = pci_get_drvdata(pdev);
6570 struct amdgpu_device *adev = drm_to_adev(dev);
6571 int r, i;
6572 struct amdgpu_reset_context reset_context;
6573 u32 memsize;
6574 struct list_head device_list;
6575
6576 /* PCI error slot reset should be skipped during RAS recovery */
6577 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6578 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6579 amdgpu_ras_in_recovery(adev))
6580 return PCI_ERS_RESULT_RECOVERED;
6581
6582 DRM_INFO("PCI error: slot reset callback!!\n");
6583
6584 memset(&reset_context, 0, sizeof(reset_context));
6585
6586 INIT_LIST_HEAD(&device_list);
6587 list_add_tail(&adev->reset_list, &device_list);
6588
6589 /* wait for asic to come out of reset */
6590 msleep(500);
6591
6592 /* Restore PCI confspace */
6593 amdgpu_device_load_pci_state(pdev);
6594
6595 /* confirm ASIC came out of reset */
6596 for (i = 0; i < adev->usec_timeout; i++) {
6597 memsize = amdgpu_asic_get_config_memsize(adev);
6598
6599 if (memsize != 0xffffffff)
6600 break;
6601 udelay(1);
6602 }
6603 if (memsize == 0xffffffff) {
6604 r = -ETIME;
6605 goto out;
6606 }
6607
6608 reset_context.method = AMD_RESET_METHOD_NONE;
6609 reset_context.reset_req_dev = adev;
6610 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6611 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6612
6613 adev->no_hw_access = true;
6614 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6615 adev->no_hw_access = false;
6616 if (r)
6617 goto out;
6618
6619 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6620
6621 out:
6622 if (!r) {
6623 if (amdgpu_device_cache_pci_state(adev->pdev))
6624 pci_restore_state(adev->pdev);
6625
6626 DRM_INFO("PCIe error recovery succeeded\n");
6627 } else {
6628 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6629 amdgpu_device_unset_mp1_state(adev);
6630 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6631 }
6632
6633 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6634 }
6635
6636 /**
6637 * amdgpu_pci_resume() - resume normal ops after PCI reset
6638 * @pdev: pointer to PCI device
6639 *
6640 * Called when the error recovery driver tells us that it's
6641 * OK to resume normal operation.
6642 */
6643 void amdgpu_pci_resume(struct pci_dev *pdev)
6644 {
6645 struct drm_device *dev = pci_get_drvdata(pdev);
6646 struct amdgpu_device *adev = drm_to_adev(dev);
6647 int i;
6648
6649
6650 DRM_INFO("PCI error: resume callback!!\n");
6651
6652 /* Only continue execution for the case of pci_channel_io_frozen */
6653 if (adev->pci_channel_state != pci_channel_io_frozen)
6654 return;
6655
6656 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6657 struct amdgpu_ring *ring = adev->rings[i];
6658
6659 if (!amdgpu_ring_sched_ready(ring))
6660 continue;
6661
6662 drm_sched_start(&ring->sched, 0);
6663 }
6664
6665 amdgpu_device_unset_mp1_state(adev);
6666 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6667 }
6668
6669 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6670 {
6671 struct drm_device *dev = pci_get_drvdata(pdev);
6672 struct amdgpu_device *adev = drm_to_adev(dev);
6673 int r;
6674
6675 if (amdgpu_sriov_vf(adev))
6676 return false;
6677
6678 r = pci_save_state(pdev);
6679 if (!r) {
6680 kfree(adev->pci_state);
6681
6682 adev->pci_state = pci_store_saved_state(pdev);
6683
6684 if (!adev->pci_state) {
6685 DRM_ERROR("Failed to store PCI saved state");
6686 return false;
6687 }
6688 } else {
6689 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6690 return false;
6691 }
6692
6693 return true;
6694 }
6695
6696 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6697 {
6698 struct drm_device *dev = pci_get_drvdata(pdev);
6699 struct amdgpu_device *adev = drm_to_adev(dev);
6700 int r;
6701
6702 if (!adev->pci_state)
6703 return false;
6704
6705 r = pci_load_saved_state(pdev, adev->pci_state);
6706
6707 if (!r) {
6708 pci_restore_state(pdev);
6709 } else {
6710 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6711 return false;
6712 }
6713
6714 return true;
6715 }
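
/*
 * Illustrative sketch only: the cache/load helpers above are used as a pair
 * around a reset that clobbers PCI config space, as amdgpu_device_mode1_reset()
 * does earlier in this file. The do_reset callback and the error codes chosen
 * here are assumptions for the example.
 */
static inline int amdgpu_example_reset_with_pci_state(struct amdgpu_device *adev,
						      int (*do_reset)(struct amdgpu_device *))
{
	int r;

	if (!amdgpu_device_cache_pci_state(adev->pdev))
		return -EINVAL;

	r = do_reset(adev);
	if (r)
		return r;

	return amdgpu_device_load_pci_state(adev->pdev) ? 0 : -EIO;
}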
6716
6717 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6718 struct amdgpu_ring *ring)
6719 {
6720 #ifdef CONFIG_X86_64
6721 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6722 return;
6723 #endif
6724 if (adev->gmc.xgmi.connected_to_cpu)
6725 return;
6726
6727 if (ring && ring->funcs->emit_hdp_flush)
6728 amdgpu_ring_emit_hdp_flush(ring);
6729 else
6730 amdgpu_asic_flush_hdp(adev, ring);
6731 }
6732
6733 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6734 struct amdgpu_ring *ring)
6735 {
6736 #ifdef CONFIG_X86_64
6737 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6738 return;
6739 #endif
6740 if (adev->gmc.xgmi.connected_to_cpu)
6741 return;
6742
6743 amdgpu_asic_invalidate_hdp(adev, ring);
6744 }
6745
6746 int amdgpu_in_reset(struct amdgpu_device *adev)
6747 {
6748 return atomic_read(&adev->reset_domain->in_gpu_reset);
6749 }
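
/*
 * Illustrative sketch only: code that must not race with a reset in progress
 * can bail out early using the helper above; the -EAGAIN return value is an
 * assumption made for this example.
 */
static inline int amdgpu_example_checked_access(struct amdgpu_device *adev)
{
	if (amdgpu_in_reset(adev))
		return -EAGAIN;	/* caller should retry after recovery completes */

	/* ... safe to touch the hardware here ... */
	return 0;
}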
6750
6751 /**
6752 * amdgpu_device_halt() - bring hardware to some kind of halt state
6753 *
6754 * @adev: amdgpu_device pointer
6755 *
6756 * Bring the hardware to some kind of halt state so that no one can touch it
6757 * any more. This helps to maintain the error context when an error occurs.
6758 * Compared to a simple hang, the system will stay stable at least for SSH
6759 * access. Then it should be trivial to inspect the hardware state and
6760 * see what's going on. Implemented as follows:
6761 *
6762 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
6763 * clears all CPU mappings to device, disallows remappings through page faults
6764 * 2. amdgpu_irq_disable_all() disables all interrupts
6765 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6766 * 4. set adev->no_hw_access to avoid potential crashes after step 5
6767 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6768 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6769 * flush any in-flight DMA operations
6770 */
6771 void amdgpu_device_halt(struct amdgpu_device *adev)
6772 {
6773 struct pci_dev *pdev = adev->pdev;
6774 struct drm_device *ddev = adev_to_drm(adev);
6775
6776 amdgpu_xcp_dev_unplug(adev);
6777 drm_dev_unplug(ddev);
6778
6779 amdgpu_irq_disable_all(adev);
6780
6781 amdgpu_fence_driver_hw_fini(adev);
6782
6783 adev->no_hw_access = true;
6784
6785 amdgpu_device_unmap_mmio(adev);
6786
6787 pci_disable_device(pdev);
6788 pci_wait_for_pending_transaction(pdev);
6789 }
6790
6791 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6792 u32 reg)
6793 {
6794 unsigned long flags, address, data;
6795 u32 r;
6796
6797 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6798 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6799
6800 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6801 WREG32(address, reg * 4);
6802 (void)RREG32(address);
6803 r = RREG32(data);
6804 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6805 return r;
6806 }
6807
6808 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6809 u32 reg, u32 v)
6810 {
6811 unsigned long flags, address, data;
6812
6813 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6814 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6815
6816 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6817 WREG32(address, reg * 4);
6818 (void)RREG32(address);
6819 WREG32(data, v);
6820 (void)RREG32(data);
6821 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6822 }
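
/*
 * Illustrative sketch only: a read-modify-write on a PCIe port register using
 * the indexed accessors above. The clear/set masks are placeholders, not real
 * hardware definitions.
 */
static inline void amdgpu_example_pcie_port_rmw(struct amdgpu_device *adev,
						u32 reg, u32 clr, u32 set)
{
	u32 tmp = amdgpu_device_pcie_port_rreg(adev, reg);

	tmp &= ~clr;
	tmp |= set;
	amdgpu_device_pcie_port_wreg(adev, reg, tmp);
}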
6823
6824 /**
6825 * amdgpu_device_get_gang - return a reference to the current gang
6826 * @adev: amdgpu_device pointer
6827 *
6828 * Returns: A new reference to the current gang leader.
6829 */
6830 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
6831 {
6832 struct dma_fence *fence;
6833
6834 rcu_read_lock();
6835 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
6836 rcu_read_unlock();
6837 return fence;
6838 }
6839
6840 /**
6841 * amdgpu_device_switch_gang - switch to a new gang
6842 * @adev: amdgpu_device pointer
6843 * @gang: the gang to switch to
6844 *
6845 * Try to switch to a new gang.
6846 * Returns: NULL if we switched to the new gang or a reference to the current
6847 * gang leader.
6848 */
6849 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6850 struct dma_fence *gang)
6851 {
6852 struct dma_fence *old = NULL;
6853
6854 do {
6855 dma_fence_put(old);
6856 old = amdgpu_device_get_gang(adev);
6857 if (old == gang)
6858 break;
6859
6860 if (!dma_fence_is_signaled(old))
6861 return old;
6862
6863 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6864 old, gang) != old);
6865
6866 dma_fence_put(old);
6867 return NULL;
6868 }
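
/*
 * Illustrative sketch only: a submission path would retry the switch until it
 * succeeds, waiting for the previous gang leader to signal in between. This is
 * a simplified sketch, not the actual command-submission logic.
 */
static inline int amdgpu_example_become_gang(struct amdgpu_device *adev,
					     struct dma_fence *gang)
{
	struct dma_fence *old;
	int r = 0;

	while ((old = amdgpu_device_switch_gang(adev, gang))) {
		r = dma_fence_wait(old, true);	/* interruptible wait */
		dma_fence_put(old);
		if (r)
			break;
	}

	return r;
}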
6869
6870 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6871 {
6872 switch (adev->asic_type) {
6873 #ifdef CONFIG_DRM_AMDGPU_SI
6874 case CHIP_HAINAN:
6875 #endif
6876 case CHIP_TOPAZ:
6877 /* chips with no display hardware */
6878 return false;
6879 #ifdef CONFIG_DRM_AMDGPU_SI
6880 case CHIP_TAHITI:
6881 case CHIP_PITCAIRN:
6882 case CHIP_VERDE:
6883 case CHIP_OLAND:
6884 #endif
6885 #ifdef CONFIG_DRM_AMDGPU_CIK
6886 case CHIP_BONAIRE:
6887 case CHIP_HAWAII:
6888 case CHIP_KAVERI:
6889 case CHIP_KABINI:
6890 case CHIP_MULLINS:
6891 #endif
6892 case CHIP_TONGA:
6893 case CHIP_FIJI:
6894 case CHIP_POLARIS10:
6895 case CHIP_POLARIS11:
6896 case CHIP_POLARIS12:
6897 case CHIP_VEGAM:
6898 case CHIP_CARRIZO:
6899 case CHIP_STONEY:
6900 /* chips with display hardware */
6901 return true;
6902 default:
6903 /* IP discovery */
6904 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
6905 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6906 return false;
6907 return true;
6908 }
6909 }
6910
6911 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6912 uint32_t inst, uint32_t reg_addr, char reg_name[],
6913 uint32_t expected_value, uint32_t mask)
6914 {
6915 uint32_t ret = 0;
6916 uint32_t old_ = 0;
6917 uint32_t tmp_ = RREG32(reg_addr);
6918 uint32_t loop = adev->usec_timeout;
6919
6920 while ((tmp_ & (mask)) != (expected_value)) {
6921 if (old_ != tmp_) {
6922 loop = adev->usec_timeout;
6923 old_ = tmp_;
6924 } else
6925 udelay(1);
6926 tmp_ = RREG32(reg_addr);
6927 loop--;
6928 if (!loop) {
6929 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6930 inst, reg_name, (uint32_t)expected_value,
6931 (uint32_t)(tmp_ & (mask)));
6932 ret = -ETIMEDOUT;
6933 break;
6934 }
6935 }
6936 return ret;
6937 }
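
/*
 * Illustrative sketch only: polling a status register until a READY bit is set
 * using the helper above. The register offset, name string and bit mask are
 * placeholders for this example.
 */
static inline uint32_t amdgpu_example_wait_ready(struct amdgpu_device *adev,
						 uint32_t status_reg_offset,
						 uint32_t ready_bit_mask)
{
	return amdgpu_device_wait_on_rreg(adev, 0, status_reg_offset,
					  "EXAMPLE_STATUS",
					  ready_bit_mask, ready_bit_mask);
}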
6938
6939 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
6940 {
6941 ssize_t size = 0;
6942
6943 if (!ring || !ring->adev)
6944 return size;
6945
6946 if (amdgpu_device_should_recover_gpu(ring->adev))
6947 size |= AMDGPU_RESET_TYPE_FULL;
6948
6949 if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
6950 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
6951 size |= AMDGPU_RESET_TYPE_SOFT_RESET;
6952
6953 return size;
6954 }
6955
6956 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
6957 {
6958 ssize_t size = 0;
6959
6960 if (supported_reset == 0) {
6961 size += sysfs_emit_at(buf, size, "unsupported");
6962 size += sysfs_emit_at(buf, size, "\n");
6963 return size;
6964
6965 }
6966
6967 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
6968 size += sysfs_emit_at(buf, size, "soft ");
6969
6970 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
6971 size += sysfs_emit_at(buf, size, "queue ");
6972
6973 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
6974 size += sysfs_emit_at(buf, size, "pipe ");
6975
6976 if (supported_reset & AMDGPU_RESET_TYPE_FULL)
6977 size += sysfs_emit_at(buf, size, "full ");
6978
6979 size += sysfs_emit_at(buf, size, "\n");
6980 return size;
6981 }
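
/*
 * Illustrative sketch only: a sysfs "show" callback that reports the reset
 * types supported by one ring, combining the two helpers above. The attribute
 * wiring and the choice of adev->rings[0] are assumptions for this example.
 */
static inline ssize_t amdgpu_example_reset_mask_show(struct device *dev,
						     struct device_attribute *attr,
						     char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint32_t mask = (uint32_t)amdgpu_get_soft_full_reset_mask(adev->rings[0]);

	return amdgpu_show_reset_mask(buf, mask);
}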
6982