1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28
29 #include <linux/aperture.h>
30 #include <linux/power_supply.h>
31 #include <linux/kthread.h>
32 #include <linux/module.h>
33 #include <linux/console.h>
34 #include <linux/slab.h>
35 #include <linux/iommu.h>
36 #include <linux/pci.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_client_event.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_probe_helper.h>
44 #include <drm/amdgpu_drm.h>
45 #include <linux/device.h>
46 #include <linux/vgaarb.h>
47 #include <linux/vga_switcheroo.h>
48 #include <linux/efi.h>
49 #include "amdgpu.h"
50 #include "amdgpu_trace.h"
51 #include "amdgpu_i2c.h"
52 #include "atom.h"
53 #include "amdgpu_atombios.h"
54 #include "amdgpu_atomfirmware.h"
55 #include "amd_pcie.h"
56 #ifdef CONFIG_DRM_AMDGPU_SI
57 #include "si.h"
58 #endif
59 #ifdef CONFIG_DRM_AMDGPU_CIK
60 #include "cik.h"
61 #endif
62 #include "vi.h"
63 #include "soc15.h"
64 #include "nv.h"
65 #include "bif/bif_4_1_d.h"
66 #include <linux/firmware.h>
67 #include "amdgpu_vf_error.h"
68
69 #include "amdgpu_amdkfd.h"
70 #include "amdgpu_pm.h"
71
72 #include "amdgpu_xgmi.h"
73 #include "amdgpu_ras.h"
74 #include "amdgpu_pmu.h"
75 #include "amdgpu_fru_eeprom.h"
76 #include "amdgpu_reset.h"
77 #include "amdgpu_virt.h"
78 #include "amdgpu_dev_coredump.h"
79
80 #include <linux/suspend.h>
81 #include <drm/task_barrier.h>
82 #include <linux/pm_runtime.h>
83
84 #include <drm/drm_drv.h>
85
86 #if IS_ENABLED(CONFIG_X86)
87 #include <asm/intel-family.h>
88 #include <asm/cpu_device_id.h>
89 #endif
90
91 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
96 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
97 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
98
99 #define AMDGPU_RESUME_MS 2000
100 #define AMDGPU_MAX_RETRY_LIMIT 2
101 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
102 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
103 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
104 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
105
106 #define AMDGPU_VBIOS_SKIP (1U << 0)
107 #define AMDGPU_VBIOS_OPTIONAL (1U << 1)
108
109 static const struct drm_driver amdgpu_kms_driver;
110
111 const char *amdgpu_asic_name[] = {
112 "TAHITI",
113 "PITCAIRN",
114 "VERDE",
115 "OLAND",
116 "HAINAN",
117 "BONAIRE",
118 "KAVERI",
119 "KABINI",
120 "HAWAII",
121 "MULLINS",
122 "TOPAZ",
123 "TONGA",
124 "FIJI",
125 "CARRIZO",
126 "STONEY",
127 "POLARIS10",
128 "POLARIS11",
129 "POLARIS12",
130 "VEGAM",
131 "VEGA10",
132 "VEGA12",
133 "VEGA20",
134 "RAVEN",
135 "ARCTURUS",
136 "RENOIR",
137 "ALDEBARAN",
138 "NAVI10",
139 "CYAN_SKILLFISH",
140 "NAVI14",
141 "NAVI12",
142 "SIENNA_CICHLID",
143 "NAVY_FLOUNDER",
144 "VANGOGH",
145 "DIMGREY_CAVEFISH",
146 "BEIGE_GOBY",
147 "YELLOW_CARP",
148 "IP DISCOVERY",
149 "LAST",
150 };
151
152 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
153 /*
154 * Default init level where all blocks are expected to be initialized. This is
155 * the level of initialization expected by default and also after a full reset
156 * of the device.
157 */
158 struct amdgpu_init_level amdgpu_init_default = {
159 .level = AMDGPU_INIT_LEVEL_DEFAULT,
160 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
161 };
162
163 struct amdgpu_init_level amdgpu_init_recovery = {
164 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
165 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
166 };
167
168 /*
169 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
170 * is used for cases like reset on initialization where the entire hive needs to
171 * be reset before first use.
172 */
173 struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
174 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
175 .hwini_ip_block_mask =
176 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
177 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
178 BIT(AMD_IP_BLOCK_TYPE_PSP)
179 };
180
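/**
 * amdgpu_ip_member_of_hwini - check if an IP block is part of the current HW init level
 *
 * @adev: amdgpu_device pointer
 * @block: IP block type to check
 *
 * Returns true if @block is included in the hardware init mask of the
 * currently selected init level.
 */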
181 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
182 enum amd_ip_block_type block)
183 {
184 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
185 }
186
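/**
 * amdgpu_set_init_level - select the device init level
 *
 * @adev: amdgpu_device pointer
 * @lvl: init level to select
 *
 * Selects which set of IP blocks will be hardware initialized: the default
 * full set, the reset recovery set, or the minimal set needed for an XGMI
 * hive reset.
 */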
187 void amdgpu_set_init_level(struct amdgpu_device *adev,
188 enum amdgpu_init_lvl_id lvl)
189 {
190 switch (lvl) {
191 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
192 adev->init_lvl = &amdgpu_init_minimal_xgmi;
193 break;
194 case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
195 adev->init_lvl = &amdgpu_init_recovery;
196 break;
197 case AMDGPU_INIT_LEVEL_DEFAULT:
198 fallthrough;
199 default:
200 adev->init_lvl = &amdgpu_init_default;
201 break;
202 }
203 }
204
205 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
206 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
207 void *data);
208
209 /**
210 * DOC: pcie_replay_count
211 *
212 * The amdgpu driver provides a sysfs API for reporting the total number
213 * of PCIe replays (NAKs).
214 * The file pcie_replay_count is used for this and returns the total
215 * number of replays as a sum of the NAKs generated and NAKs received.
216 */
217
218 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
219 struct device_attribute *attr, char *buf)
220 {
221 struct drm_device *ddev = dev_get_drvdata(dev);
222 struct amdgpu_device *adev = drm_to_adev(ddev);
223 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
224
225 return sysfs_emit(buf, "%llu\n", cnt);
226 }
227
228 static DEVICE_ATTR(pcie_replay_count, 0444,
229 amdgpu_device_get_pcie_replay_count, NULL);
230
231 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
232 {
233 int ret = 0;
234
235 if (!amdgpu_sriov_vf(adev))
236 ret = sysfs_create_file(&adev->dev->kobj,
237 &dev_attr_pcie_replay_count.attr);
238
239 return ret;
240 }
241
242 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
243 {
244 if (!amdgpu_sriov_vf(adev))
245 sysfs_remove_file(&adev->dev->kobj,
246 &dev_attr_pcie_replay_count.attr);
247 }
248
249 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
250 const struct bin_attribute *attr, char *buf,
251 loff_t ppos, size_t count)
252 {
253 struct device *dev = kobj_to_dev(kobj);
254 struct drm_device *ddev = dev_get_drvdata(dev);
255 struct amdgpu_device *adev = drm_to_adev(ddev);
256 ssize_t bytes_read;
257
258 switch (ppos) {
259 case AMDGPU_SYS_REG_STATE_XGMI:
260 bytes_read = amdgpu_asic_get_reg_state(
261 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
262 break;
263 case AMDGPU_SYS_REG_STATE_WAFL:
264 bytes_read = amdgpu_asic_get_reg_state(
265 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
266 break;
267 case AMDGPU_SYS_REG_STATE_PCIE:
268 bytes_read = amdgpu_asic_get_reg_state(
269 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
270 break;
271 case AMDGPU_SYS_REG_STATE_USR:
272 bytes_read = amdgpu_asic_get_reg_state(
273 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
274 break;
275 case AMDGPU_SYS_REG_STATE_USR_1:
276 bytes_read = amdgpu_asic_get_reg_state(
277 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
278 break;
279 default:
280 return -EINVAL;
281 }
282
283 return bytes_read;
284 }
285
286 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
287 AMDGPU_SYS_REG_STATE_END);
288
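/**
 * amdgpu_reg_state_sysfs_init - create the reg_state sysfs binary attribute
 *
 * @adev: amdgpu_device pointer
 *
 * Creates the reg_state binary file if the ASIC supports register state
 * reporting. Returns 0 on success or a negative error code on failure.
 */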
289 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
290 {
291 int ret;
292
293 if (!amdgpu_asic_get_reg_state_supported(adev))
294 return 0;
295
296 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
297
298 return ret;
299 }
300
301 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
302 {
303 if (!amdgpu_asic_get_reg_state_supported(adev))
304 return;
305 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
306 }
307
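/**
 * amdgpu_ip_block_suspend - suspend a single IP block
 *
 * @ip_block: pointer to the IP block to suspend
 *
 * Calls the block's suspend callback, if one is provided, and marks the
 * block's hardware as no longer initialized.
 * Returns 0 on success or a negative error code on failure.
 */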
308 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
309 {
310 int r;
311
312 if (ip_block->version->funcs->suspend) {
313 r = ip_block->version->funcs->suspend(ip_block);
314 if (r) {
315 dev_err(ip_block->adev->dev,
316 "suspend of IP block <%s> failed %d\n",
317 ip_block->version->funcs->name, r);
318 return r;
319 }
320 }
321
322 ip_block->status.hw = false;
323 return 0;
324 }
325
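/**
 * amdgpu_ip_block_resume - resume a single IP block
 *
 * @ip_block: pointer to the IP block to resume
 *
 * Calls the block's resume callback, if one is provided, and marks the
 * block's hardware as initialized.
 * Returns 0 on success or a negative error code on failure.
 */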
326 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
327 {
328 int r;
329
330 if (ip_block->version->funcs->resume) {
331 r = ip_block->version->funcs->resume(ip_block);
332 if (r) {
333 dev_err(ip_block->adev->dev,
334 "resume of IP block <%s> failed %d\n",
335 ip_block->version->funcs->name, r);
336 return r;
337 }
338 }
339
340 ip_block->status.hw = true;
341 return 0;
342 }
343
344 /**
345 * DOC: board_info
346 *
347 * The amdgpu driver provides a sysfs API for giving board related information.
348 * It provides the form factor information in the format
349 *
350 * type : form factor
351 *
352 * Possible form factor values
353 *
354 * - "cem" - PCIE CEM card
355 * - "oam" - Open Compute Accelerator Module
356 * - "unknown" - Not known
357 *
358 */
359
360 static ssize_t amdgpu_device_get_board_info(struct device *dev,
361 struct device_attribute *attr,
362 char *buf)
363 {
364 struct drm_device *ddev = dev_get_drvdata(dev);
365 struct amdgpu_device *adev = drm_to_adev(ddev);
366 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
367 const char *pkg;
368
369 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
370 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
371
372 switch (pkg_type) {
373 case AMDGPU_PKG_TYPE_CEM:
374 pkg = "cem";
375 break;
376 case AMDGPU_PKG_TYPE_OAM:
377 pkg = "oam";
378 break;
379 default:
380 pkg = "unknown";
381 break;
382 }
383
384 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
385 }
386
387 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
388
389 static struct attribute *amdgpu_board_attrs[] = {
390 &dev_attr_board_info.attr,
391 NULL,
392 };
393
394 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
395 struct attribute *attr, int n)
396 {
397 struct device *dev = kobj_to_dev(kobj);
398 struct drm_device *ddev = dev_get_drvdata(dev);
399 struct amdgpu_device *adev = drm_to_adev(ddev);
400
401 if (adev->flags & AMD_IS_APU)
402 return 0;
403
404 return attr->mode;
405 }
406
407 static const struct attribute_group amdgpu_board_attrs_group = {
408 .attrs = amdgpu_board_attrs,
409 .is_visible = amdgpu_board_attrs_is_visible
410 };
411
412 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
413
414
415 /**
416 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
417 *
418 * @dev: drm_device pointer
419 *
420 * Returns true if the device is a dGPU with ATPX power control,
421 * otherwise returns false.
422 */
423 bool amdgpu_device_supports_px(struct drm_device *dev)
424 {
425 struct amdgpu_device *adev = drm_to_adev(dev);
426
427 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
428 return true;
429 return false;
430 }
431
432 /**
433 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
434 *
435 * @dev: drm_device pointer
436 *
437 * Returns true if the device is a dGPU with ACPI power control,
438 * otherwise returns false.
439 */
440 bool amdgpu_device_supports_boco(struct drm_device *dev)
441 {
442 struct amdgpu_device *adev = drm_to_adev(dev);
443
444 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
445 return false;
446
447 if (adev->has_pr3 ||
448 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
449 return true;
450 return false;
451 }
452
453 /**
454 * amdgpu_device_supports_baco - Does the device support BACO
455 *
456 * @dev: drm_device pointer
457 *
458 * Return:
459 * 1 if the device supports BACO;
460 * 3 if the device supports MACO (only works if BACO is supported);
461 * otherwise returns 0.
462 */
463 int amdgpu_device_supports_baco(struct drm_device *dev)
464 {
465 struct amdgpu_device *adev = drm_to_adev(dev);
466
467 return amdgpu_asic_supports_baco(adev);
468 }
469
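/**
 * amdgpu_device_detect_runtime_pm_mode - detect the runtime power management mode
 *
 * @adev: amdgpu_device pointer
 *
 * Selects the runtime PM mode (PX, BOCO, BACO, BAMACO or none) based on the
 * amdgpu_runtime_pm module parameter and the capabilities reported by the
 * device.
 */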
470 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
471 {
472 struct drm_device *dev;
473 int bamaco_support;
474
475 dev = adev_to_drm(adev);
476
477 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
478 bamaco_support = amdgpu_device_supports_baco(dev);
479
480 switch (amdgpu_runtime_pm) {
481 case 2:
482 if (bamaco_support & MACO_SUPPORT) {
483 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
484 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
485 } else if (bamaco_support == BACO_SUPPORT) {
486 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
487 dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
488 }
489 break;
490 case 1:
491 if (bamaco_support & BACO_SUPPORT) {
492 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
493 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
494 }
495 break;
496 case -1:
497 case -2:
498 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
499 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
500 dev_info(adev->dev, "Using ATPX for runtime pm\n");
501 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
502 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
503 dev_info(adev->dev, "Using BOCO for runtime pm\n");
504 } else {
505 if (!bamaco_support)
506 goto no_runtime_pm;
507
508 switch (adev->asic_type) {
509 case CHIP_VEGA20:
510 case CHIP_ARCTURUS:
511 /* BACO is not supported on vega20 and arcturus */
512 break;
513 case CHIP_VEGA10:
514 /* enable BACO as runpm mode if noretry=0 */
515 if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
516 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
517 break;
518 default:
519 /* enable BACO as runpm mode on CI+ */
520 if (!amdgpu_passthrough(adev))
521 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
522 break;
523 }
524
525 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
526 if (bamaco_support & MACO_SUPPORT) {
527 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
528 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
529 } else {
530 dev_info(adev->dev, "Using BACO for runtime pm\n");
531 }
532 }
533 }
534 break;
535 case 0:
536 dev_info(adev->dev, "runtime pm is manually disabled\n");
537 break;
538 default:
539 break;
540 }
541
542 no_runtime_pm:
543 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
544 dev_info(adev->dev, "Runtime PM not available\n");
545 }
546 /**
547 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
548 * smart shift support
549 *
550 * @dev: drm_device pointer
551 *
552 * Returns true if the device is a dGPU with Smart Shift support,
553 * otherwise returns false.
554 */
555 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
556 {
557 return (amdgpu_device_supports_boco(dev) &&
558 amdgpu_acpi_is_power_shift_control_supported());
559 }
560
561 /*
562 * VRAM access helper functions
563 */
564
565 /**
566 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
567 *
568 * @adev: amdgpu_device pointer
569 * @pos: offset of the buffer in vram
570 * @buf: virtual address of the buffer in system memory
571 * @size: read/write size, sizeof(@buf) must be at least @size
572 * @write: true - write to vram, otherwise - read from vram
573 */
574 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
575 void *buf, size_t size, bool write)
576 {
577 unsigned long flags;
578 uint32_t hi = ~0, tmp = 0;
579 uint32_t *data = buf;
580 uint64_t last;
581 int idx;
582
583 if (!drm_dev_enter(adev_to_drm(adev), &idx))
584 return;
585
586 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
587
588 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
589 for (last = pos + size; pos < last; pos += 4) {
590 tmp = pos >> 31;
591
592 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
593 if (tmp != hi) {
594 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
595 hi = tmp;
596 }
597 if (write)
598 WREG32_NO_KIQ(mmMM_DATA, *data++);
599 else
600 *data++ = RREG32_NO_KIQ(mmMM_DATA);
601 }
602
603 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
604 drm_dev_exit(idx);
605 }
606
607 /**
608 * amdgpu_device_aper_access - access vram by vram aperture
609 *
610 * @adev: amdgpu_device pointer
611 * @pos: offset of the buffer in vram
612 * @buf: virtual address of the buffer in system memory
613 * @size: read/write size, sizeof(@buf) must be at least @size
614 * @write: true - write to vram, otherwise - read from vram
615 *
616 * The return value means how many bytes have been transferred.
617 */
618 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
619 void *buf, size_t size, bool write)
620 {
621 #ifdef CONFIG_64BIT
622 void __iomem *addr;
623 size_t count = 0;
624 uint64_t last;
625
626 if (!adev->mman.aper_base_kaddr)
627 return 0;
628
629 last = min(pos + size, adev->gmc.visible_vram_size);
630 if (last > pos) {
631 addr = adev->mman.aper_base_kaddr + pos;
632 count = last - pos;
633
634 if (write) {
635 memcpy_toio(addr, buf, count);
636 /* Make sure HDP write cache flush happens without any reordering
637 * after the system memory contents are sent over PCIe to the device
638 */
639 mb();
640 amdgpu_device_flush_hdp(adev, NULL);
641 } else {
642 amdgpu_device_invalidate_hdp(adev, NULL);
643 /* Make sure HDP read cache is invalidated before issuing a read
644 * to the PCIe device
645 */
646 mb();
647 memcpy_fromio(buf, addr, count);
648 }
649
650 }
651
652 return count;
653 #else
654 return 0;
655 #endif
656 }
657
658 /**
659 * amdgpu_device_vram_access - read/write a buffer in vram
660 *
661 * @adev: amdgpu_device pointer
662 * @pos: offset of the buffer in vram
663 * @buf: virtual address of the buffer in system memory
664 * @size: read/write size, sizeof(@buf) must be at least @size
665 * @write: true - write to vram, otherwise - read from vram
666 */
667 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
668 void *buf, size_t size, bool write)
669 {
670 size_t count;
671
672 /* try using the vram aperture to access vram first */
673 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
674 size -= count;
675 if (size) {
676 /* use MM_INDEX/MM_DATA to access the rest of vram */
677 pos += count;
678 buf += count;
679 amdgpu_device_mm_access(adev, pos, buf, size, write);
680 }
681 }
682
683 /*
684 * register access helper functions.
685 */
686
687 /* Check if hw access should be skipped because of hotplug or device error */
688 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
689 {
690 if (adev->no_hw_access)
691 return true;
692
693 #ifdef CONFIG_LOCKDEP
694 /*
695 * This is a bit complicated to understand, so worth a comment. What we assert
696 * here is that the GPU reset is not running on another thread in parallel.
697 *
698 * For this we trylock the read side of the reset semaphore, if that succeeds
699 * we know that the reset is not running in parallel.
700 *
701 * If the trylock fails we assert that we are either already holding the read
702 * side of the lock or are the reset thread itself and hold the write side of
703 * the lock.
704 */
705 if (in_task()) {
706 if (down_read_trylock(&adev->reset_domain->sem))
707 up_read(&adev->reset_domain->sem);
708 else
709 lockdep_assert_held(&adev->reset_domain->sem);
710 }
711 #endif
712 return false;
713 }
714
715 /**
716 * amdgpu_device_rreg - read a memory mapped IO or indirect register
717 *
718 * @adev: amdgpu_device pointer
719 * @reg: dword aligned register offset
720 * @acc_flags: access flags which require special behavior
721 *
722 * Returns the 32 bit value from the offset specified.
723 */
724 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
725 uint32_t reg, uint32_t acc_flags)
726 {
727 uint32_t ret;
728
729 if (amdgpu_device_skip_hw_access(adev))
730 return 0;
731
732 if ((reg * 4) < adev->rmmio_size) {
733 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
734 amdgpu_sriov_runtime(adev) &&
735 down_read_trylock(&adev->reset_domain->sem)) {
736 ret = amdgpu_kiq_rreg(adev, reg, 0);
737 up_read(&adev->reset_domain->sem);
738 } else {
739 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
740 }
741 } else {
742 ret = adev->pcie_rreg(adev, reg * 4);
743 }
744
745 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
746
747 return ret;
748 }
749
750 /*
751 * MMIO register read with bytes helper functions
752 * @offset: byte offset from MMIO start
753 */
754
755 /**
756 * amdgpu_mm_rreg8 - read a memory mapped IO register
757 *
758 * @adev: amdgpu_device pointer
759 * @offset: byte aligned register offset
760 *
761 * Returns the 8 bit value from the offset specified.
762 */
763 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
764 {
765 if (amdgpu_device_skip_hw_access(adev))
766 return 0;
767
768 if (offset < adev->rmmio_size)
769 return (readb(adev->rmmio + offset));
770 BUG();
771 }
772
773
774 /**
775 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
776 *
777 * @adev: amdgpu_device pointer
778 * @reg: dword aligned register offset
779 * @acc_flags: access flags which require special behavior
780 * @xcc_id: xcc accelerated compute core id
781 *
782 * Returns the 32 bit value from the offset specified.
783 */
784 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
785 uint32_t reg, uint32_t acc_flags,
786 uint32_t xcc_id)
787 {
788 uint32_t ret, rlcg_flag;
789
790 if (amdgpu_device_skip_hw_access(adev))
791 return 0;
792
793 if ((reg * 4) < adev->rmmio_size) {
794 if (amdgpu_sriov_vf(adev) &&
795 !amdgpu_sriov_runtime(adev) &&
796 adev->gfx.rlc.rlcg_reg_access_supported &&
797 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
798 GC_HWIP, false,
799 &rlcg_flag)) {
800 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
801 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
802 amdgpu_sriov_runtime(adev) &&
803 down_read_trylock(&adev->reset_domain->sem)) {
804 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
805 up_read(&adev->reset_domain->sem);
806 } else {
807 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
808 }
809 } else {
810 ret = adev->pcie_rreg(adev, reg * 4);
811 }
812
813 return ret;
814 }
815
816 /*
817 * MMIO register write with bytes helper functions
818 * @offset: byte offset from MMIO start
819 * @value: the value to be written to the register
820 */
821
822 /**
823 * amdgpu_mm_wreg8 - write a memory mapped IO register
824 *
825 * @adev: amdgpu_device pointer
826 * @offset: byte aligned register offset
827 * @value: 8 bit value to write
828 *
829 * Writes the value specified to the offset specified.
830 */
831 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
832 {
833 if (amdgpu_device_skip_hw_access(adev))
834 return;
835
836 if (offset < adev->rmmio_size)
837 writeb(value, adev->rmmio + offset);
838 else
839 BUG();
840 }
841
842 /**
843 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
844 *
845 * @adev: amdgpu_device pointer
846 * @reg: dword aligned register offset
847 * @v: 32 bit value to write to the register
848 * @acc_flags: access flags which require special behavior
849 *
850 * Writes the value specified to the offset specified.
851 */
852 void amdgpu_device_wreg(struct amdgpu_device *adev,
853 uint32_t reg, uint32_t v,
854 uint32_t acc_flags)
855 {
856 if (amdgpu_device_skip_hw_access(adev))
857 return;
858
859 if ((reg * 4) < adev->rmmio_size) {
860 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
861 amdgpu_sriov_runtime(adev) &&
862 down_read_trylock(&adev->reset_domain->sem)) {
863 amdgpu_kiq_wreg(adev, reg, v, 0);
864 up_read(&adev->reset_domain->sem);
865 } else {
866 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
867 }
868 } else {
869 adev->pcie_wreg(adev, reg * 4, v);
870 }
871
872 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
873 }
874
875 /**
876 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
877 *
878 * @adev: amdgpu_device pointer
879 * @reg: mmio/rlc register
880 * @v: value to write
881 * @xcc_id: xcc accelerated compute core id
882 *
883 * This function is invoked only for debugfs register access.
884 */
885 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
886 uint32_t reg, uint32_t v,
887 uint32_t xcc_id)
888 {
889 if (amdgpu_device_skip_hw_access(adev))
890 return;
891
892 if (amdgpu_sriov_fullaccess(adev) &&
893 adev->gfx.rlc.funcs &&
894 adev->gfx.rlc.funcs->is_rlcg_access_range) {
895 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
896 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
897 } else if ((reg * 4) >= adev->rmmio_size) {
898 adev->pcie_wreg(adev, reg * 4, v);
899 } else {
900 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
901 }
902 }
903
904 /**
905 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
906 *
907 * @adev: amdgpu_device pointer
908 * @reg: dword aligned register offset
909 * @v: 32 bit value to write to the register
910 * @acc_flags: access flags which require special behavior
911 * @xcc_id: xcc accelerated compute core id
912 *
913 * Writes the value specified to the offset specified.
914 */
915 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
916 uint32_t reg, uint32_t v,
917 uint32_t acc_flags, uint32_t xcc_id)
918 {
919 uint32_t rlcg_flag;
920
921 if (amdgpu_device_skip_hw_access(adev))
922 return;
923
924 if ((reg * 4) < adev->rmmio_size) {
925 if (amdgpu_sriov_vf(adev) &&
926 !amdgpu_sriov_runtime(adev) &&
927 adev->gfx.rlc.rlcg_reg_access_supported &&
928 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
929 GC_HWIP, true,
930 &rlcg_flag)) {
931 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
932 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
933 amdgpu_sriov_runtime(adev) &&
934 down_read_trylock(&adev->reset_domain->sem)) {
935 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
936 up_read(&adev->reset_domain->sem);
937 } else {
938 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
939 }
940 } else {
941 adev->pcie_wreg(adev, reg * 4, v);
942 }
943 }
944
945 /**
946 * amdgpu_device_indirect_rreg - read an indirect register
947 *
948 * @adev: amdgpu_device pointer
949 * @reg_addr: indirect register address to read from
950 *
951 * Returns the value of indirect register @reg_addr
952 */
953 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
954 u32 reg_addr)
955 {
956 unsigned long flags, pcie_index, pcie_data;
957 void __iomem *pcie_index_offset;
958 void __iomem *pcie_data_offset;
959 u32 r;
960
961 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
962 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
963
964 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
965 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
966 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
967
968 writel(reg_addr, pcie_index_offset);
969 readl(pcie_index_offset);
970 r = readl(pcie_data_offset);
971 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
972
973 return r;
974 }
975
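/**
 * amdgpu_device_indirect_rreg_ext - read an indirect register with a 64-bit address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */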
976 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
977 u64 reg_addr)
978 {
979 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
980 u32 r;
981 void __iomem *pcie_index_offset;
982 void __iomem *pcie_index_hi_offset;
983 void __iomem *pcie_data_offset;
984
985 if (unlikely(!adev->nbio.funcs)) {
986 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
987 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
988 } else {
989 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
990 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
991 }
992
993 if (reg_addr >> 32) {
994 if (unlikely(!adev->nbio.funcs))
995 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
996 else
997 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
998 } else {
999 pcie_index_hi = 0;
1000 }
1001
1002 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1003 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1004 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1005 if (pcie_index_hi != 0)
1006 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1007 pcie_index_hi * 4;
1008
1009 writel(reg_addr, pcie_index_offset);
1010 readl(pcie_index_offset);
1011 if (pcie_index_hi != 0) {
1012 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1013 readl(pcie_index_hi_offset);
1014 }
1015 r = readl(pcie_data_offset);
1016
1017 /* clear the high bits */
1018 if (pcie_index_hi != 0) {
1019 writel(0, pcie_index_hi_offset);
1020 readl(pcie_index_hi_offset);
1021 }
1022
1023 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1024
1025 return r;
1026 }
1027
1028 /**
1029 * amdgpu_device_indirect_rreg64 - read a 64-bit indirect register
1030 *
1031 * @adev: amdgpu_device pointer
1032 * @reg_addr: indirect register address to read from
1033 *
1034 * Returns the value of indirect register @reg_addr
1035 */
1036 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1037 u32 reg_addr)
1038 {
1039 unsigned long flags, pcie_index, pcie_data;
1040 void __iomem *pcie_index_offset;
1041 void __iomem *pcie_data_offset;
1042 u64 r;
1043
1044 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1045 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1046
1047 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1048 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1049 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1050
1051 /* read low 32 bits */
1052 writel(reg_addr, pcie_index_offset);
1053 readl(pcie_index_offset);
1054 r = readl(pcie_data_offset);
1055 /* read high 32 bits */
1056 writel(reg_addr + 4, pcie_index_offset);
1057 readl(pcie_index_offset);
1058 r |= ((u64)readl(pcie_data_offset) << 32);
1059 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1060
1061 return r;
1062 }
1063
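/**
 * amdgpu_device_indirect_rreg64_ext - read a 64-bit indirect register with a 64-bit address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */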
1064 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1065 u64 reg_addr)
1066 {
1067 unsigned long flags, pcie_index, pcie_data;
1068 unsigned long pcie_index_hi = 0;
1069 void __iomem *pcie_index_offset;
1070 void __iomem *pcie_index_hi_offset;
1071 void __iomem *pcie_data_offset;
1072 u64 r;
1073
1074 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1075 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1076 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1077 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1078
1079 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1080 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1081 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1082 if (pcie_index_hi != 0)
1083 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1084 pcie_index_hi * 4;
1085
1086 /* read low 32 bits */
1087 writel(reg_addr, pcie_index_offset);
1088 readl(pcie_index_offset);
1089 if (pcie_index_hi != 0) {
1090 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1091 readl(pcie_index_hi_offset);
1092 }
1093 r = readl(pcie_data_offset);
1094 /* read high 32 bits */
1095 writel(reg_addr + 4, pcie_index_offset);
1096 readl(pcie_index_offset);
1097 if (pcie_index_hi != 0) {
1098 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1099 readl(pcie_index_hi_offset);
1100 }
1101 r |= ((u64)readl(pcie_data_offset) << 32);
1102
1103 /* clear the high bits */
1104 if (pcie_index_hi != 0) {
1105 writel(0, pcie_index_hi_offset);
1106 readl(pcie_index_hi_offset);
1107 }
1108
1109 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1110
1111 return r;
1112 }
1113
1114 /**
1115 * amdgpu_device_indirect_wreg - write an indirect register address
1116 *
1117 * @adev: amdgpu_device pointer
1118 * @reg_addr: indirect register offset
1119 * @reg_data: indirect register data
1120 *
1121 */
1122 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1123 u32 reg_addr, u32 reg_data)
1124 {
1125 unsigned long flags, pcie_index, pcie_data;
1126 void __iomem *pcie_index_offset;
1127 void __iomem *pcie_data_offset;
1128
1129 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1130 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1131
1132 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1133 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1134 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1135
1136 writel(reg_addr, pcie_index_offset);
1137 readl(pcie_index_offset);
1138 writel(reg_data, pcie_data_offset);
1139 readl(pcie_data_offset);
1140 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1141 }
1142
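/**
 * amdgpu_device_indirect_wreg_ext - write an indirect register with a 64-bit address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */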
1143 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1144 u64 reg_addr, u32 reg_data)
1145 {
1146 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1147 void __iomem *pcie_index_offset;
1148 void __iomem *pcie_index_hi_offset;
1149 void __iomem *pcie_data_offset;
1150
1151 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1152 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1153 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1154 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1155 else
1156 pcie_index_hi = 0;
1157
1158 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1159 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1160 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1161 if (pcie_index_hi != 0)
1162 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1163 pcie_index_hi * 4;
1164
1165 writel(reg_addr, pcie_index_offset);
1166 readl(pcie_index_offset);
1167 if (pcie_index_hi != 0) {
1168 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1169 readl(pcie_index_hi_offset);
1170 }
1171 writel(reg_data, pcie_data_offset);
1172 readl(pcie_data_offset);
1173
1174 /* clear the high bits */
1175 if (pcie_index_hi != 0) {
1176 writel(0, pcie_index_hi_offset);
1177 readl(pcie_index_hi_offset);
1178 }
1179
1180 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1181 }
1182
1183 /**
1184 * amdgpu_device_indirect_wreg64 - write a 64-bit indirect register
1185 *
1186 * @adev: amdgpu_device pointer
1187 * @reg_addr: indirect register offset
1188 * @reg_data: indirect register data
1189 *
1190 */
1191 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1192 u32 reg_addr, u64 reg_data)
1193 {
1194 unsigned long flags, pcie_index, pcie_data;
1195 void __iomem *pcie_index_offset;
1196 void __iomem *pcie_data_offset;
1197
1198 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1199 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1200
1201 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1202 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1203 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1204
1205 /* write low 32 bits */
1206 writel(reg_addr, pcie_index_offset);
1207 readl(pcie_index_offset);
1208 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1209 readl(pcie_data_offset);
1210 /* write high 32 bits */
1211 writel(reg_addr + 4, pcie_index_offset);
1212 readl(pcie_index_offset);
1213 writel((u32)(reg_data >> 32), pcie_data_offset);
1214 readl(pcie_data_offset);
1215 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1216 }
1217
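/**
 * amdgpu_device_indirect_wreg64_ext - write a 64-bit indirect register with a 64-bit address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */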
1218 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1219 u64 reg_addr, u64 reg_data)
1220 {
1221 unsigned long flags, pcie_index, pcie_data;
1222 unsigned long pcie_index_hi = 0;
1223 void __iomem *pcie_index_offset;
1224 void __iomem *pcie_index_hi_offset;
1225 void __iomem *pcie_data_offset;
1226
1227 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1228 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1229 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1230 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1231
1232 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1233 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1234 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1235 if (pcie_index_hi != 0)
1236 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1237 pcie_index_hi * 4;
1238
1239 /* write low 32 bits */
1240 writel(reg_addr, pcie_index_offset);
1241 readl(pcie_index_offset);
1242 if (pcie_index_hi != 0) {
1243 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1244 readl(pcie_index_hi_offset);
1245 }
1246 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1247 readl(pcie_data_offset);
1248 /* write high 32 bits */
1249 writel(reg_addr + 4, pcie_index_offset);
1250 readl(pcie_index_offset);
1251 if (pcie_index_hi != 0) {
1252 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1253 readl(pcie_index_hi_offset);
1254 }
1255 writel((u32)(reg_data >> 32), pcie_data_offset);
1256 readl(pcie_data_offset);
1257
1258 /* clear the high bits */
1259 if (pcie_index_hi != 0) {
1260 writel(0, pcie_index_hi_offset);
1261 readl(pcie_index_hi_offset);
1262 }
1263
1264 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1265 }
1266
1267 /**
1268 * amdgpu_device_get_rev_id - query device rev_id
1269 *
1270 * @adev: amdgpu_device pointer
1271 *
1272 * Return device rev_id
1273 */
1274 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1275 {
1276 return adev->nbio.funcs->get_rev_id(adev);
1277 }
1278
1279 /**
1280 * amdgpu_invalid_rreg - dummy reg read function
1281 *
1282 * @adev: amdgpu_device pointer
1283 * @reg: offset of register
1284 *
1285 * Dummy register read function. Used for register blocks
1286 * that certain asics don't have (all asics).
1287 * Returns the value in the register.
1288 */
1289 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1290 {
1291 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1292 BUG();
1293 return 0;
1294 }
1295
1296 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1297 {
1298 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1299 BUG();
1300 return 0;
1301 }
1302
1303 /**
1304 * amdgpu_invalid_wreg - dummy reg write function
1305 *
1306 * @adev: amdgpu_device pointer
1307 * @reg: offset of register
1308 * @v: value to write to the register
1309 *
1310 * Dummy register write function. Used for register blocks
1311 * that certain asics don't have (all asics).
1312 */
1313 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1314 {
1315 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1316 reg, v);
1317 BUG();
1318 }
1319
1320 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1321 {
1322 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1323 reg, v);
1324 BUG();
1325 }
1326
1327 /**
1328 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1329 *
1330 * @adev: amdgpu_device pointer
1331 * @reg: offset of register
1332 *
1333 * Dummy register read function. Used for register blocks
1334 * that certain asics don't have (all asics).
1335 * Returns the value in the register.
1336 */
1337 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1338 {
1339 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1340 BUG();
1341 return 0;
1342 }
1343
1344 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1345 {
1346 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1347 BUG();
1348 return 0;
1349 }
1350
1351 /**
1352 * amdgpu_invalid_wreg64 - dummy reg write function
1353 *
1354 * @adev: amdgpu_device pointer
1355 * @reg: offset of register
1356 * @v: value to write to the register
1357 *
1358 * Dummy register write function. Used for register blocks
1359 * that certain asics don't have (all asics).
1360 */
1361 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1362 {
1363 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1364 reg, v);
1365 BUG();
1366 }
1367
1368 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1369 {
1370 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1371 reg, v);
1372 BUG();
1373 }
1374
1375 /**
1376 * amdgpu_block_invalid_rreg - dummy reg read function
1377 *
1378 * @adev: amdgpu_device pointer
1379 * @block: offset of instance
1380 * @reg: offset of register
1381 *
1382 * Dummy register read function. Used for register blocks
1383 * that certain asics don't have (all asics).
1384 * Returns the value in the register.
1385 */
1386 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1387 uint32_t block, uint32_t reg)
1388 {
1389 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1390 reg, block);
1391 BUG();
1392 return 0;
1393 }
1394
1395 /**
1396 * amdgpu_block_invalid_wreg - dummy reg write function
1397 *
1398 * @adev: amdgpu_device pointer
1399 * @block: offset of instance
1400 * @reg: offset of register
1401 * @v: value to write to the register
1402 *
1403 * Dummy register write function. Used for register blocks
1404 * that certain asics don't have (all asics).
1405 */
1406 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1407 uint32_t block,
1408 uint32_t reg, uint32_t v)
1409 {
1410 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1411 reg, block, v);
1412 BUG();
1413 }
1414
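/**
 * amdgpu_device_get_vbios_flags - get VBIOS handling flags for this device
 *
 * @adev: amdgpu_device pointer
 *
 * Returns AMDGPU_VBIOS_SKIP if the VBIOS should not be used at all,
 * AMDGPU_VBIOS_OPTIONAL if the driver can initialize without one, or 0 if a
 * VBIOS is required.
 */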
1415 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
1416 {
1417 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1418 return AMDGPU_VBIOS_SKIP;
1419
1420 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
1421 return AMDGPU_VBIOS_OPTIONAL;
1422
1423 return 0;
1424 }
1425
1426 /**
1427 * amdgpu_device_asic_init - Wrapper for atom asic_init
1428 *
1429 * @adev: amdgpu_device pointer
1430 *
1431 * Does any asic specific work and then calls atom asic init.
1432 */
1433 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1434 {
1435 uint32_t flags;
1436 bool optional;
1437 int ret;
1438
1439 amdgpu_asic_pre_asic_init(adev);
1440 flags = amdgpu_device_get_vbios_flags(adev);
1441 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));
1442
1443 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1444 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1445 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
1446 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1447 amdgpu_psp_wait_for_bootloader(adev);
1448 if (optional && !adev->bios)
1449 return 0;
1450
1451 ret = amdgpu_atomfirmware_asic_init(adev, true);
1452 return ret;
1453 } else {
1454 if (optional && !adev->bios)
1455 return 0;
1456
1457 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1458 }
1459
1460 return 0;
1461 }
1462
1463 /**
1464 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1465 *
1466 * @adev: amdgpu_device pointer
1467 *
1468 * Allocates a scratch page of VRAM for use by various things in the
1469 * driver.
1470 */
1471 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1472 {
1473 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1474 AMDGPU_GEM_DOMAIN_VRAM |
1475 AMDGPU_GEM_DOMAIN_GTT,
1476 &adev->mem_scratch.robj,
1477 &adev->mem_scratch.gpu_addr,
1478 (void **)&adev->mem_scratch.ptr);
1479 }
1480
1481 /**
1482 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1483 *
1484 * @adev: amdgpu_device pointer
1485 *
1486 * Frees the VRAM scratch page.
1487 */
1488 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1489 {
1490 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1491 }
1492
1493 /**
1494 * amdgpu_device_program_register_sequence - program an array of registers.
1495 *
1496 * @adev: amdgpu_device pointer
1497 * @registers: pointer to the register array
1498 * @array_size: size of the register array
1499 *
1500 * Programs an array of registers with AND and OR masks.
1501 * This is a helper for setting golden registers.
1502 */
1503 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1504 const u32 *registers,
1505 const u32 array_size)
1506 {
1507 u32 tmp, reg, and_mask, or_mask;
1508 int i;
1509
1510 if (array_size % 3)
1511 return;
1512
1513 for (i = 0; i < array_size; i += 3) {
1514 reg = registers[i + 0];
1515 and_mask = registers[i + 1];
1516 or_mask = registers[i + 2];
1517
1518 if (and_mask == 0xffffffff) {
1519 tmp = or_mask;
1520 } else {
1521 tmp = RREG32(reg);
1522 tmp &= ~and_mask;
1523 if (adev->family >= AMDGPU_FAMILY_AI)
1524 tmp |= (or_mask & and_mask);
1525 else
1526 tmp |= or_mask;
1527 }
1528 WREG32(reg, tmp);
1529 }
1530 }
1531
1532 /**
1533 * amdgpu_device_pci_config_reset - reset the GPU
1534 *
1535 * @adev: amdgpu_device pointer
1536 *
1537 * Resets the GPU using the pci config reset sequence.
1538 * Only applicable to asics prior to vega10.
1539 */
1540 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1541 {
1542 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1543 }
1544
1545 /**
1546 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1547 *
1548 * @adev: amdgpu_device pointer
1549 *
1550 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1551 */
1552 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1553 {
1554 return pci_reset_function(adev->pdev);
1555 }
1556
1557 /*
1558 * amdgpu_device_wb_*()
1559 * Writeback is the method by which the GPU updates special pages in memory
1560 * with the status of certain GPU events (fences, ring pointers, etc.).
1561 */
1562
1563 /**
1564 * amdgpu_device_wb_fini - Disable Writeback and free memory
1565 *
1566 * @adev: amdgpu_device pointer
1567 *
1568 * Disables Writeback and frees the Writeback memory (all asics).
1569 * Used at driver shutdown.
1570 */
1571 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1572 {
1573 if (adev->wb.wb_obj) {
1574 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1575 &adev->wb.gpu_addr,
1576 (void **)&adev->wb.wb);
1577 adev->wb.wb_obj = NULL;
1578 }
1579 }
1580
1581 /**
1582 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1583 *
1584 * @adev: amdgpu_device pointer
1585 *
1586 * Initializes writeback and allocates writeback memory (all asics).
1587 * Used at driver startup.
1588 * Returns 0 on success or a negative error code on failure.
1589 */
1590 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1591 {
1592 int r;
1593
1594 if (adev->wb.wb_obj == NULL) {
1595 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1596 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1597 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1598 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1599 (void **)&adev->wb.wb);
1600 if (r) {
1601 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1602 return r;
1603 }
1604
1605 adev->wb.num_wb = AMDGPU_MAX_WB;
1606 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1607
1608 /* clear wb memory */
1609 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1610 }
1611
1612 return 0;
1613 }
1614
1615 /**
1616 * amdgpu_device_wb_get - Allocate a wb entry
1617 *
1618 * @adev: amdgpu_device pointer
1619 * @wb: wb index
1620 *
1621 * Allocate a wb slot for use by the driver (all asics).
1622 * Returns 0 on success or -EINVAL on failure.
1623 */
1624 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1625 {
1626 unsigned long flags, offset;
1627
1628 spin_lock_irqsave(&adev->wb.lock, flags);
1629 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1630 if (offset < adev->wb.num_wb) {
1631 __set_bit(offset, adev->wb.used);
1632 spin_unlock_irqrestore(&adev->wb.lock, flags);
1633 *wb = offset << 3; /* convert to dw offset */
1634 return 0;
1635 } else {
1636 spin_unlock_irqrestore(&adev->wb.lock, flags);
1637 return -EINVAL;
1638 }
1639 }
1640
1641 /**
1642 * amdgpu_device_wb_free - Free a wb entry
1643 *
1644 * @adev: amdgpu_device pointer
1645 * @wb: wb index
1646 *
1647 * Free a wb slot allocated for use by the driver (all asics)
1648 */
1649 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1650 {
1651 unsigned long flags;
1652
1653 wb >>= 3;
1654 spin_lock_irqsave(&adev->wb.lock, flags);
1655 if (wb < adev->wb.num_wb)
1656 __clear_bit(wb, adev->wb.used);
1657 spin_unlock_irqrestore(&adev->wb.lock, flags);
1658 }
1659
1660 /**
1661 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1662 *
1663 * @adev: amdgpu_device pointer
1664 *
1665 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1666 * to fail, but if any of the BARs is not accessible after the resize we abort
1667 * driver loading by returning -ENODEV.
1668 */
1669 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1670 {
1671 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1672 struct pci_bus *root;
1673 struct resource *res;
1674 unsigned int i;
1675 u16 cmd;
1676 int r;
1677
1678 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1679 return 0;
1680
1681 /* Bypass for VF */
1682 if (amdgpu_sriov_vf(adev))
1683 return 0;
1684
1685 if (!amdgpu_rebar)
1686 return 0;
1687
1688 /* resizing on Dell G5 SE platforms causes problems with runtime pm */
1689 if ((amdgpu_runtime_pm != 0) &&
1690 adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1691 adev->pdev->device == 0x731f &&
1692 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1693 return 0;
1694
1695 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1696 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1697 DRM_WARN("System can't access extended configuration space, please check!!\n");
1698
1699 /* skip if the bios has already enabled large BAR */
1700 if (adev->gmc.real_vram_size &&
1701 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1702 return 0;
1703
1704 /* Check if the root BUS has 64bit memory resources */
1705 root = adev->pdev->bus;
1706 while (root->parent)
1707 root = root->parent;
1708
1709 pci_bus_for_each_resource(root, res, i) {
1710 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1711 res->start > 0x100000000ull)
1712 break;
1713 }
1714
1715 /* Trying to resize is pointless without a root hub window above 4GB */
1716 if (!res)
1717 return 0;
1718
1719 /* Limit the BAR size to what is available */
1720 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1721 rbar_size);
1722
1723 /* Disable memory decoding while we change the BAR addresses and size */
1724 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1725 pci_write_config_word(adev->pdev, PCI_COMMAND,
1726 cmd & ~PCI_COMMAND_MEMORY);
1727
1728 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1729 amdgpu_doorbell_fini(adev);
1730 if (adev->asic_type >= CHIP_BONAIRE)
1731 pci_release_resource(adev->pdev, 2);
1732
1733 pci_release_resource(adev->pdev, 0);
1734
1735 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1736 if (r == -ENOSPC)
1737 DRM_INFO("Not enough PCI address space for a large BAR.");
1738 else if (r && r != -ENOTSUPP)
1739 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1740
1741 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1742
1743 /* When the doorbell or fb BAR isn't available we have no chance of
1744 * using the device.
1745 */
1746 r = amdgpu_doorbell_init(adev);
1747 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1748 return -ENODEV;
1749
1750 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1751
1752 return 0;
1753 }
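
/*
 * For reference (assuming the standard PCI ReBAR encoding implemented by
 * pci_rebar_bytes_to_size()): the encoded size is log2(bytes) - 20, e.g.
 * 256 MiB -> 8, 4 GiB -> 12, 16 GiB -> 14. rbar_size above is therefore the
 * smallest encoding that covers real_vram_size, clamped to the sizes that
 * pci_rebar_get_possible_sizes() reports for BAR0.
 */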
1754
1755 /*
1756 * GPU helpers function.
1757 */
1758 /**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check whether the asic has been initialized (all asics) at driver startup,
 * or whether posting is needed because a hw reset was performed.
 * Returns true if posting is needed, false if not.
1766 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
1768 {
1769 uint32_t reg, flags;
1770
1771 if (amdgpu_sriov_vf(adev))
1772 return false;
1773
1774 flags = amdgpu_device_get_vbios_flags(adev);
1775 if (flags & AMDGPU_VBIOS_SKIP)
1776 return false;
1777 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
1778 return false;
1779
1780 if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole-GPU pass-through virtualization case, after a
		 * VM reboot some old SMC firmware still needs the driver to do vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so force vPost for SMC versions below 22.15.
		 */
1786 if (adev->asic_type == CHIP_FIJI) {
1787 int err;
1788 uint32_t fw_ver;
1789
1790 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1791 /* force vPost if error occurred */
1792 if (err)
1793 return true;
1794
1795 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1796 release_firmware(adev->pm.fw);
1797 if (fw_ver < 0x00160e00)
1798 return true;
1799 }
1800 }
1801
1802 /* Don't post if we need to reset whole hive on init */
1803 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1804 return false;
1805
1806 if (adev->has_hw_reset) {
1807 adev->has_hw_reset = false;
1808 return true;
1809 }
1810
1811 /* bios scratch used on CIK+ */
1812 if (adev->asic_type >= CHIP_BONAIRE)
1813 return amdgpu_atombios_scratch_need_asic_init(adev);
1814
1815 /* check MEM_SIZE for older asics */
1816 reg = amdgpu_asic_get_config_memsize(adev);
1817
1818 if ((reg != 0) && (reg != 0xffffffff))
1819 return false;
1820
1821 return true;
1822 }
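
/*
 * Illustrative sketch (not part of the driver): device init code typically
 * uses this check to decide whether to execute the vbios posting tables,
 * roughly along the lines of:
 *
 *	if (amdgpu_device_need_post(adev)) {
 *		r = amdgpu_device_asic_init(adev);
 *		if (r)
 *			dev_err(adev->dev, "gpu post error!\n");
 *	}
 */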
1823
1824 /*
1825 * Check whether seamless boot is supported.
1826 *
1827 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICs as well, we may
1829 * loosen this.
1830 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1832 {
1833 switch (amdgpu_seamless) {
1834 case -1:
1835 break;
1836 case 1:
1837 return true;
1838 case 0:
1839 return false;
1840 default:
1841 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1842 amdgpu_seamless);
1843 return false;
1844 }
1845
1846 if (!(adev->flags & AMD_IS_APU))
1847 return false;
1848
1849 if (adev->mman.keep_stolen_vga_memory)
1850 return false;
1851
1852 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1853 }
1854
1855 /*
1856 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1857 * don't support dynamic speed switching. Until we have confirmation from Intel
1858 * that a specific host supports it, it's safer that we keep it disabled for all.
1859 *
1860 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1861 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1862 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1864 {
1865 #if IS_ENABLED(CONFIG_X86)
1866 struct cpuinfo_x86 *c = &cpu_data(0);
1867
1868 /* eGPU change speeds based on USB4 fabric conditions */
1869 if (dev_is_removable(adev->dev))
1870 return true;
1871
1872 if (c->x86_vendor == X86_VENDOR_INTEL)
1873 return false;
1874 #endif
1875 return true;
1876 }
1877
static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
1879 {
1880 #if IS_ENABLED(CONFIG_X86)
1881 struct cpuinfo_x86 *c = &cpu_data(0);
1882
1883 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
1884 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
1885 return false;
1886
1887 if (c->x86 == 6 &&
1888 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
1889 switch (c->x86_model) {
1890 case VFM_MODEL(INTEL_ALDERLAKE):
1891 case VFM_MODEL(INTEL_ALDERLAKE_L):
1892 case VFM_MODEL(INTEL_RAPTORLAKE):
1893 case VFM_MODEL(INTEL_RAPTORLAKE_P):
1894 case VFM_MODEL(INTEL_RAPTORLAKE_S):
1895 return true;
1896 default:
1897 return false;
1898 }
1899 } else {
1900 return false;
1901 }
1902 #else
1903 return false;
1904 #endif
1905 }
1906
1907 /**
1908 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1909 *
1910 * @adev: amdgpu_device pointer
1911 *
1912 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1913 * be set for this device.
1914 *
1915 * Returns true if it should be used or false if not.
1916 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1918 {
1919 switch (amdgpu_aspm) {
1920 case -1:
1921 break;
1922 case 0:
1923 return false;
1924 case 1:
1925 return true;
1926 default:
1927 return false;
1928 }
1929 if (adev->flags & AMD_IS_APU)
1930 return false;
1931 if (amdgpu_device_aspm_support_quirk(adev))
1932 return false;
1933 return pcie_aspm_enabled(adev->pdev);
1934 }
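
/*
 * Illustrative sketch (not part of the driver): ASIC-level ASPM programming
 * typically gates itself on this helper, for example:
 *
 *	static void xxx_program_aspm(struct amdgpu_device *adev)	// hypothetical
 *	{
 *		if (!amdgpu_device_should_use_aspm(adev))
 *			return;
 *		// program the PCIe ASPM registers for this ASIC
 *	}
 */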
1935
1936 /* if we get transitioned to only one device, take VGA back */
1937 /**
1938 * amdgpu_device_vga_set_decode - enable/disable vga decode
1939 *
1940 * @pdev: PCI device pointer
1941 * @state: enable/disable vga decode
1942 *
1943 * Enable/disable vga decode (all asics).
1944 * Returns VGA resource flags.
1945 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1947 bool state)
1948 {
1949 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1950
1951 amdgpu_asic_set_vga_state(adev, state);
1952 if (state)
1953 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1954 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1955 else
1956 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1957 }
1958
1959 /**
1960 * amdgpu_device_check_block_size - validate the vm block size
1961 *
1962 * @adev: amdgpu_device pointer
1963 *
1964 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory. A page is 4KB, so we have a 12-bit offset; a minimum of 9
 * bits goes to the page table and the remaining bits go to the page directory.
1968 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1970 {
1971 /* defines number of bits in page table versus page directory,
1972 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1973 * page table and the remaining bits are in the page directory
1974 */
1975 if (amdgpu_vm_block_size == -1)
1976 return;
1977
1978 if (amdgpu_vm_block_size < 9) {
1979 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1980 amdgpu_vm_block_size);
1981 amdgpu_vm_block_size = -1;
1982 }
1983 }
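
/*
 * Worked example for the block size math above: with 4KB pages (a 12-bit
 * offset) and amdgpu_vm_block_size = 9, one page-table block covers
 * 2^(12 + 9) = 2 MiB of address space; larger values move more address bits
 * from the page directory into the page table.
 */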
1984
1985 /**
1986 * amdgpu_device_check_vm_size - validate the vm size
1987 *
1988 * @adev: amdgpu_device pointer
1989 *
1990 * Validates the vm size in GB specified via module parameter.
1991 * The VM size is the size of the GPU virtual memory space in GB.
1992 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1994 {
1995 /* no need to check the default value */
1996 if (amdgpu_vm_size == -1)
1997 return;
1998
1999 if (amdgpu_vm_size < 1) {
2000 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
2001 amdgpu_vm_size);
2002 amdgpu_vm_size = -1;
2003 }
2004 }
2005
static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
2007 {
2008 struct sysinfo si;
2009 bool is_os_64 = (sizeof(void *) == 8);
2010 uint64_t total_memory;
2011 uint64_t dram_size_seven_GB = 0x1B8000000;
2012 uint64_t dram_size_three_GB = 0xB8000000;
2013
2014 if (amdgpu_smu_memory_pool_size == 0)
2015 return;
2016
2017 if (!is_os_64) {
2018 DRM_WARN("Not 64-bit OS, feature not supported\n");
2019 goto def_value;
2020 }
2021 si_meminfo(&si);
2022 total_memory = (uint64_t)si.totalram * si.mem_unit;
2023
2024 if ((amdgpu_smu_memory_pool_size == 1) ||
2025 (amdgpu_smu_memory_pool_size == 2)) {
2026 if (total_memory < dram_size_three_GB)
2027 goto def_value1;
2028 } else if ((amdgpu_smu_memory_pool_size == 4) ||
2029 (amdgpu_smu_memory_pool_size == 8)) {
2030 if (total_memory < dram_size_seven_GB)
2031 goto def_value1;
2032 } else {
2033 DRM_WARN("Smu memory pool size not supported\n");
2034 goto def_value;
2035 }
2036 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
2037
2038 return;
2039
2040 def_value1:
	DRM_WARN("Not enough system memory\n");
2042 def_value:
2043 adev->pm.smu_prv_buffer_size = 0;
2044 }
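
/*
 * For reference: smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28,
 * i.e. the module parameter is expressed in units of 256 MiB (2^28 bytes):
 * 1 -> 256 MiB, 2 -> 512 MiB, 4 -> 1 GiB, 8 -> 2 GiB. The 0xB8000000 and
 * 0x1B8000000 thresholds above correspond to the ~3 GB and ~7 GB system
 * memory minimums checked for the smaller and larger pool sizes.
 */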
2045
static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
2047 {
2048 if (!(adev->flags & AMD_IS_APU) ||
2049 adev->asic_type < CHIP_RAVEN)
2050 return 0;
2051
2052 switch (adev->asic_type) {
2053 case CHIP_RAVEN:
2054 if (adev->pdev->device == 0x15dd)
2055 adev->apu_flags |= AMD_APU_IS_RAVEN;
2056 if (adev->pdev->device == 0x15d8)
2057 adev->apu_flags |= AMD_APU_IS_PICASSO;
2058 break;
2059 case CHIP_RENOIR:
2060 if ((adev->pdev->device == 0x1636) ||
2061 (adev->pdev->device == 0x164c))
2062 adev->apu_flags |= AMD_APU_IS_RENOIR;
2063 else
2064 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
2065 break;
2066 case CHIP_VANGOGH:
2067 adev->apu_flags |= AMD_APU_IS_VANGOGH;
2068 break;
2069 case CHIP_YELLOW_CARP:
2070 break;
2071 case CHIP_CYAN_SKILLFISH:
2072 if ((adev->pdev->device == 0x13FE) ||
2073 (adev->pdev->device == 0x143F))
2074 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
2075 break;
2076 default:
2077 break;
2078 }
2079
2080 return 0;
2081 }
2082
2083 /**
2084 * amdgpu_device_check_arguments - validate module params
2085 *
2086 * @adev: amdgpu_device pointer
2087 *
2088 * Validates certain module parameters and updates
2089 * the associated values used by the driver (all asics).
2090 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2092 {
2093 int i;
2094
2095 if (amdgpu_sched_jobs < 4) {
2096 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2097 amdgpu_sched_jobs);
2098 amdgpu_sched_jobs = 4;
2099 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
2100 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2101 amdgpu_sched_jobs);
2102 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2103 }
2104
2105 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2106 /* gart size must be greater or equal to 32M */
2107 dev_warn(adev->dev, "gart size (%d) too small\n",
2108 amdgpu_gart_size);
2109 amdgpu_gart_size = -1;
2110 }
2111
2112 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2113 /* gtt size must be greater or equal to 32M */
2114 dev_warn(adev->dev, "gtt size (%d) too small\n",
2115 amdgpu_gtt_size);
2116 amdgpu_gtt_size = -1;
2117 }
2118
2119 /* valid range is between 4 and 9 inclusive */
2120 if (amdgpu_vm_fragment_size != -1 &&
2121 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2122 dev_warn(adev->dev, "valid range is between 4 and 9\n");
2123 amdgpu_vm_fragment_size = -1;
2124 }
2125
2126 if (amdgpu_sched_hw_submission < 2) {
2127 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2128 amdgpu_sched_hw_submission);
2129 amdgpu_sched_hw_submission = 2;
2130 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2131 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2132 amdgpu_sched_hw_submission);
2133 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2134 }
2135
2136 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2137 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2138 amdgpu_reset_method = -1;
2139 }
2140
2141 amdgpu_device_check_smu_prv_buffer_size(adev);
2142
2143 amdgpu_device_check_vm_size(adev);
2144
2145 amdgpu_device_check_block_size(adev);
2146
2147 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2148
2149 for (i = 0; i < MAX_XCP; i++) {
2150 switch (amdgpu_enforce_isolation) {
2151 case -1:
2152 case 0:
2153 default:
2154 /* disable */
2155 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
2156 break;
2157 case 1:
2158 /* enable */
2159 adev->enforce_isolation[i] =
2160 AMDGPU_ENFORCE_ISOLATION_ENABLE;
2161 break;
2162 case 2:
2163 /* enable legacy mode */
2164 adev->enforce_isolation[i] =
2165 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
2166 break;
2167 case 3:
2168 /* enable only process isolation without submitting cleaner shader */
2169 adev->enforce_isolation[i] =
2170 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER;
2171 break;
2172 }
2173 }
2174
2175 return 0;
2176 }
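
/*
 * Illustrative example (parameter values are hypothetical): the options
 * validated above are the usual amdgpu module parameters, e.g. on the kernel
 * command line:
 *
 *	amdgpu.sched_jobs=64 amdgpu.gart_size=512 amdgpu.vm_fragment_size=9
 *
 * Out-of-range values are clamped or reset to -1 (auto) as shown above.
 */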
2177
2178 /**
2179 * amdgpu_switcheroo_set_state - set switcheroo state
2180 *
2181 * @pdev: pci dev pointer
2182 * @state: vga_switcheroo state
2183 *
2184 * Callback for the switcheroo driver. Suspends or resumes
2185 * the asics before or after it is powered up using ACPI methods.
2186 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2188 enum vga_switcheroo_state state)
2189 {
2190 struct drm_device *dev = pci_get_drvdata(pdev);
2191 int r;
2192
2193 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2194 return;
2195
2196 if (state == VGA_SWITCHEROO_ON) {
2197 pr_info("switched on\n");
2198 /* don't suspend or resume card normally */
2199 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2200
2201 pci_set_power_state(pdev, PCI_D0);
2202 amdgpu_device_load_pci_state(pdev);
2203 r = pci_enable_device(pdev);
2204 if (r)
2205 DRM_WARN("pci_enable_device failed (%d)\n", r);
2206 amdgpu_device_resume(dev, true);
2207
2208 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2209 } else {
2210 pr_info("switched off\n");
2211 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2212 amdgpu_device_prepare(dev);
2213 amdgpu_device_suspend(dev, true);
2214 amdgpu_device_cache_pci_state(pdev);
2215 /* Shut down the device */
2216 pci_disable_device(pdev);
2217 pci_set_power_state(pdev, PCI_D3cold);
2218 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2219 }
2220 }
2221
2222 /**
2223 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2224 *
2225 * @pdev: pci dev pointer
2226 *
 * Callback for the switcheroo driver. Checks whether the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
2230 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2232 {
2233 struct drm_device *dev = pci_get_drvdata(pdev);
2234
2235 /*
2236 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2237 * locking inversion with the driver load path. And the access here is
2238 * completely racy anyway. So don't bother with locking for now.
2239 */
2240 return atomic_read(&dev->open_count) == 0;
2241 }
2242
2243 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2244 .set_gpu_state = amdgpu_switcheroo_set_state,
2245 .reprobe = NULL,
2246 .can_switch = amdgpu_switcheroo_can_switch,
2247 };
2248
2249 /**
2250 * amdgpu_device_ip_set_clockgating_state - set the CG state
2251 *
2252 * @dev: amdgpu_device pointer
2253 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2254 * @state: clockgating state (gate or ungate)
2255 *
2256 * Sets the requested clockgating state for all instances of
2257 * the hardware IP specified.
2258 * Returns the error code from the last instance.
2259 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
2261 enum amd_ip_block_type block_type,
2262 enum amd_clockgating_state state)
2263 {
2264 struct amdgpu_device *adev = dev;
2265 int i, r = 0;
2266
2267 for (i = 0; i < adev->num_ip_blocks; i++) {
2268 if (!adev->ip_blocks[i].status.valid)
2269 continue;
2270 if (adev->ip_blocks[i].version->type != block_type)
2271 continue;
2272 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2273 continue;
2274 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2275 &adev->ip_blocks[i], state);
2276 if (r)
2277 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2278 adev->ip_blocks[i].version->funcs->name, r);
2279 }
2280 return r;
2281 }
2282
2283 /**
2284 * amdgpu_device_ip_set_powergating_state - set the PG state
2285 *
2286 * @dev: amdgpu_device pointer
2287 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2288 * @state: powergating state (gate or ungate)
2289 *
2290 * Sets the requested powergating state for all instances of
2291 * the hardware IP specified.
2292 * Returns the error code from the last instance.
2293 */
int amdgpu_device_ip_set_powergating_state(void *dev,
2295 enum amd_ip_block_type block_type,
2296 enum amd_powergating_state state)
2297 {
2298 struct amdgpu_device *adev = dev;
2299 int i, r = 0;
2300
2301 for (i = 0; i < adev->num_ip_blocks; i++) {
2302 if (!adev->ip_blocks[i].status.valid)
2303 continue;
2304 if (adev->ip_blocks[i].version->type != block_type)
2305 continue;
2306 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2307 continue;
2308 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2309 &adev->ip_blocks[i], state);
2310 if (r)
2311 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2312 adev->ip_blocks[i].version->funcs->name, r);
2313 }
2314 return r;
2315 }
2316
2317 /**
2318 * amdgpu_device_ip_get_clockgating_state - get the CG state
2319 *
2320 * @adev: amdgpu_device pointer
2321 * @flags: clockgating feature flags
2322 *
2323 * Walks the list of IPs on the device and updates the clockgating
2324 * flags for each IP.
2325 * Updates @flags with the feature flags for each hardware IP where
2326 * clockgating is enabled.
2327 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2329 u64 *flags)
2330 {
2331 int i;
2332
2333 for (i = 0; i < adev->num_ip_blocks; i++) {
2334 if (!adev->ip_blocks[i].status.valid)
2335 continue;
2336 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2337 adev->ip_blocks[i].version->funcs->get_clockgating_state(
2338 &adev->ip_blocks[i], flags);
2339 }
2340 }
2341
2342 /**
2343 * amdgpu_device_ip_wait_for_idle - wait for idle
2344 *
2345 * @adev: amdgpu_device pointer
2346 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2347 *
 * Waits for the requested hardware IP to be idle.
2349 * Returns 0 for success or a negative error code on failure.
2350 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2352 enum amd_ip_block_type block_type)
2353 {
2354 int i, r;
2355
2356 for (i = 0; i < adev->num_ip_blocks; i++) {
2357 if (!adev->ip_blocks[i].status.valid)
2358 continue;
2359 if (adev->ip_blocks[i].version->type == block_type) {
2360 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2361 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2362 &adev->ip_blocks[i]);
2363 if (r)
2364 return r;
2365 }
2366 break;
2367 }
2368 }
2369 return 0;
2370
2371 }
2372
2373 /**
2374 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2375 *
2376 * @adev: amdgpu_device pointer
2377 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2378 *
 * Check if the hardware IP is enabled or not.
 * Returns true if the IP is enabled, false if not.
2381 */
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2383 enum amd_ip_block_type block_type)
2384 {
2385 int i;
2386
2387 for (i = 0; i < adev->num_ip_blocks; i++) {
2388 if (adev->ip_blocks[i].version->type == block_type)
2389 return adev->ip_blocks[i].status.valid;
2390 }
2391 return false;
2392
2393 }
2394
2395 /**
2396 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2397 *
2398 * @adev: amdgpu_device pointer
2399 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2400 *
2401 * Returns a pointer to the hardware IP block structure
2402 * if it exists for the asic, otherwise NULL.
2403 */
2404 struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2406 enum amd_ip_block_type type)
2407 {
2408 int i;
2409
2410 for (i = 0; i < adev->num_ip_blocks; i++)
2411 if (adev->ip_blocks[i].version->type == type)
2412 return &adev->ip_blocks[i];
2413
2414 return NULL;
2415 }
2416
2417 /**
2418 * amdgpu_device_ip_block_version_cmp
2419 *
2420 * @adev: amdgpu_device pointer
2421 * @type: enum amd_ip_block_type
2422 * @major: major version
2423 * @minor: minor version
2424 *
 * Returns 0 if the IP block's version is equal to or greater than the
 * requested (major, minor) version, 1 if it is smaller or the ip_block
 * doesn't exist.
2427 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2429 enum amd_ip_block_type type,
2430 u32 major, u32 minor)
2431 {
2432 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2433
2434 if (ip_block && ((ip_block->version->major > major) ||
2435 ((ip_block->version->major == major) &&
2436 (ip_block->version->minor >= minor))))
2437 return 0;
2438
2439 return 1;
2440 }
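
/*
 * Illustrative sketch (not part of the driver): a caller checking for a
 * minimum IP block version might do, for example:
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *					       7, 1) == 0) {
 *		// SMC 7.1 or newer is present on this asic
 *	}
 */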
2441
2442 /**
2443 * amdgpu_device_ip_block_add
2444 *
2445 * @adev: amdgpu_device pointer
2446 * @ip_block_version: pointer to the IP to add
2447 *
2448 * Adds the IP block driver information to the collection of IPs
2449 * on the asic.
2450 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2452 const struct amdgpu_ip_block_version *ip_block_version)
2453 {
2454 if (!ip_block_version)
2455 return -EINVAL;
2456
2457 switch (ip_block_version->type) {
2458 case AMD_IP_BLOCK_TYPE_VCN:
2459 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2460 return 0;
2461 break;
2462 case AMD_IP_BLOCK_TYPE_JPEG:
2463 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2464 return 0;
2465 break;
2466 default:
2467 break;
2468 }
2469
2470 dev_info(adev->dev, "detected ip block number %d <%s>\n",
2471 adev->num_ip_blocks, ip_block_version->funcs->name);
2472
2473 adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2474
2475 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2476
2477 return 0;
2478 }
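
/*
 * Illustrative sketch (not part of the driver): the per-ASIC set_ip_blocks
 * helpers register their blocks with calls such as (vi_common_ip_block is
 * used here as an assumed example from the VI code path):
 *
 *	r = amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	if (r)
 *		return r;
 */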
2479
2480 /**
2481 * amdgpu_device_enable_virtual_display - enable virtual display feature
2482 *
2483 * @adev: amdgpu_device pointer
2484 *
 * Enables the virtual display feature if the user has enabled it via
2486 * the module parameter virtual_display. This feature provides a virtual
2487 * display hardware on headless boards or in virtualized environments.
2488 * This function parses and validates the configuration string specified by
2489 * the user and configures the virtual display configuration (number of
2490 * virtual connectors, crtcs, etc.) specified.
2491 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2493 {
2494 adev->enable_virtual_display = false;
2495
2496 if (amdgpu_virtual_display) {
2497 const char *pci_address_name = pci_name(adev->pdev);
2498 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2499
2500 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2501 pciaddstr_tmp = pciaddstr;
2502 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2503 pciaddname = strsep(&pciaddname_tmp, ",");
2504 if (!strcmp("all", pciaddname)
2505 || !strcmp(pci_address_name, pciaddname)) {
2506 long num_crtc;
2507 int res = -1;
2508
2509 adev->enable_virtual_display = true;
2510
2511 if (pciaddname_tmp)
2512 res = kstrtol(pciaddname_tmp, 10,
2513 &num_crtc);
2514
2515 if (!res) {
2516 if (num_crtc < 1)
2517 num_crtc = 1;
2518 if (num_crtc > 6)
2519 num_crtc = 6;
2520 adev->mode_info.num_crtc = num_crtc;
2521 } else {
2522 adev->mode_info.num_crtc = 1;
2523 }
2524 break;
2525 }
2526 }
2527
2528 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2529 amdgpu_virtual_display, pci_address_name,
2530 adev->enable_virtual_display, adev->mode_info.num_crtc);
2531
2532 kfree(pciaddstr);
2533 }
2534 }
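
/*
 * Example of the parameter format parsed above (the PCI address is
 * illustrative):
 *
 *	amdgpu.virtual_display=0000:26:00.0,2
 *
 * enables virtual display with two crtcs on the device at 0000:26:00.0;
 * "all" may be used instead of a PCI address, and multiple entries are
 * separated by ';'.
 */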
2535
void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2537 {
2538 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2539 adev->mode_info.num_crtc = 1;
2540 adev->enable_virtual_display = true;
2541 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2542 adev->enable_virtual_display, adev->mode_info.num_crtc);
2543 }
2544 }
2545
2546 /**
2547 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2548 *
2549 * @adev: amdgpu_device pointer
2550 *
2551 * Parses the asic configuration parameters specified in the gpu info
2552 * firmware and makes them available to the driver for use in configuring
2553 * the asic.
2554 * Returns 0 on success, -EINVAL on failure.
2555 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2557 {
2558 const char *chip_name;
2559 int err;
2560 const struct gpu_info_firmware_header_v1_0 *hdr;
2561
2562 adev->firmware.gpu_info_fw = NULL;
2563
2564 if (adev->mman.discovery_bin)
2565 return 0;
2566
2567 switch (adev->asic_type) {
2568 default:
2569 return 0;
2570 case CHIP_VEGA10:
2571 chip_name = "vega10";
2572 break;
2573 case CHIP_VEGA12:
2574 chip_name = "vega12";
2575 break;
2576 case CHIP_RAVEN:
2577 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2578 chip_name = "raven2";
2579 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2580 chip_name = "picasso";
2581 else
2582 chip_name = "raven";
2583 break;
2584 case CHIP_ARCTURUS:
2585 chip_name = "arcturus";
2586 break;
2587 case CHIP_NAVI12:
2588 chip_name = "navi12";
2589 break;
2590 }
2591
2592 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2593 AMDGPU_UCODE_OPTIONAL,
2594 "amdgpu/%s_gpu_info.bin", chip_name);
2595 if (err) {
2596 dev_err(adev->dev,
2597 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2598 chip_name);
2599 goto out;
2600 }
2601
2602 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2603 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2604
2605 switch (hdr->version_major) {
2606 case 1:
2607 {
2608 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2609 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2610 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2611
2612 /*
2613 * Should be dropped when DAL no longer needs it.
2614 */
2615 if (adev->asic_type == CHIP_NAVI12)
2616 goto parse_soc_bounding_box;
2617
2618 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2619 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2620 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2621 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2622 adev->gfx.config.max_texture_channel_caches =
2623 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2624 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2625 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2626 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2627 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2628 adev->gfx.config.double_offchip_lds_buf =
2629 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2630 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2631 adev->gfx.cu_info.max_waves_per_simd =
2632 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2633 adev->gfx.cu_info.max_scratch_slots_per_cu =
2634 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2635 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2636 if (hdr->version_minor >= 1) {
2637 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2638 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2639 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2640 adev->gfx.config.num_sc_per_sh =
2641 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2642 adev->gfx.config.num_packer_per_sc =
2643 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2644 }
2645
2646 parse_soc_bounding_box:
2647 /*
		 * soc bounding box info is not integrated into the discovery table;
		 * we always need to parse it from the gpu info firmware if needed.
2650 */
2651 if (hdr->version_minor == 2) {
2652 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2653 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2654 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2655 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2656 }
2657 break;
2658 }
2659 default:
2660 dev_err(adev->dev,
2661 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2662 err = -EINVAL;
2663 goto out;
2664 }
2665 out:
2666 return err;
2667 }
2668
2669 /**
2670 * amdgpu_device_ip_early_init - run early init for hardware IPs
2671 *
2672 * @adev: amdgpu_device pointer
2673 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
2677 * Returns 0 on success, negative error code on failure.
2678 */
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2680 {
2681 struct amdgpu_ip_block *ip_block;
2682 struct pci_dev *parent;
2683 bool total, skip_bios;
2684 uint32_t bios_flags;
2685 int i, r;
2686
2687 amdgpu_device_enable_virtual_display(adev);
2688
2689 if (amdgpu_sriov_vf(adev)) {
2690 r = amdgpu_virt_request_full_gpu(adev, true);
2691 if (r)
2692 return r;
2693 }
2694
2695 switch (adev->asic_type) {
2696 #ifdef CONFIG_DRM_AMDGPU_SI
2697 case CHIP_VERDE:
2698 case CHIP_TAHITI:
2699 case CHIP_PITCAIRN:
2700 case CHIP_OLAND:
2701 case CHIP_HAINAN:
2702 adev->family = AMDGPU_FAMILY_SI;
2703 r = si_set_ip_blocks(adev);
2704 if (r)
2705 return r;
2706 break;
2707 #endif
2708 #ifdef CONFIG_DRM_AMDGPU_CIK
2709 case CHIP_BONAIRE:
2710 case CHIP_HAWAII:
2711 case CHIP_KAVERI:
2712 case CHIP_KABINI:
2713 case CHIP_MULLINS:
2714 if (adev->flags & AMD_IS_APU)
2715 adev->family = AMDGPU_FAMILY_KV;
2716 else
2717 adev->family = AMDGPU_FAMILY_CI;
2718
2719 r = cik_set_ip_blocks(adev);
2720 if (r)
2721 return r;
2722 break;
2723 #endif
2724 case CHIP_TOPAZ:
2725 case CHIP_TONGA:
2726 case CHIP_FIJI:
2727 case CHIP_POLARIS10:
2728 case CHIP_POLARIS11:
2729 case CHIP_POLARIS12:
2730 case CHIP_VEGAM:
2731 case CHIP_CARRIZO:
2732 case CHIP_STONEY:
2733 if (adev->flags & AMD_IS_APU)
2734 adev->family = AMDGPU_FAMILY_CZ;
2735 else
2736 adev->family = AMDGPU_FAMILY_VI;
2737
2738 r = vi_set_ip_blocks(adev);
2739 if (r)
2740 return r;
2741 break;
2742 default:
2743 r = amdgpu_discovery_set_ip_blocks(adev);
2744 if (r)
2745 return r;
2746 break;
2747 }
2748
2749 /* Check for IP version 9.4.3 with A0 hardware */
2750 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
2751 !amdgpu_device_get_rev_id(adev)) {
2752 dev_err(adev->dev, "Unsupported A0 hardware\n");
2753 return -ENODEV; /* device unsupported - no device error */
2754 }
2755
2756 if (amdgpu_has_atpx() &&
2757 (amdgpu_is_atpx_hybrid() ||
2758 amdgpu_has_atpx_dgpu_power_cntl()) &&
2759 ((adev->flags & AMD_IS_APU) == 0) &&
2760 !dev_is_removable(&adev->pdev->dev))
2761 adev->flags |= AMD_IS_PX;
2762
2763 if (!(adev->flags & AMD_IS_APU)) {
2764 parent = pcie_find_root_port(adev->pdev);
2765 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2766 }
2767
2768 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2769 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2770 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2771 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2772 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2773 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2774 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2775
2776 total = true;
2777 for (i = 0; i < adev->num_ip_blocks; i++) {
2778 ip_block = &adev->ip_blocks[i];
2779
2780 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2781 DRM_WARN("disabled ip block: %d <%s>\n",
2782 i, adev->ip_blocks[i].version->funcs->name);
2783 adev->ip_blocks[i].status.valid = false;
2784 } else if (ip_block->version->funcs->early_init) {
2785 r = ip_block->version->funcs->early_init(ip_block);
2786 if (r == -ENOENT) {
2787 adev->ip_blocks[i].status.valid = false;
2788 } else if (r) {
2789 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2790 adev->ip_blocks[i].version->funcs->name, r);
2791 total = false;
2792 } else {
2793 adev->ip_blocks[i].status.valid = true;
2794 }
2795 } else {
2796 adev->ip_blocks[i].status.valid = true;
2797 }
2798 /* get the vbios after the asic_funcs are set up */
2799 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2800 r = amdgpu_device_parse_gpu_info_fw(adev);
2801 if (r)
2802 return r;
2803
2804 bios_flags = amdgpu_device_get_vbios_flags(adev);
2805 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP);
2806 /* Read BIOS */
2807 if (!skip_bios) {
2808 bool optional =
2809 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL);
2810 if (!amdgpu_get_bios(adev) && !optional)
2811 return -EINVAL;
2812
2813 if (optional && !adev->bios)
2814 dev_info(
2815 adev->dev,
2816 "VBIOS image optional, proceeding without VBIOS image");
2817
2818 if (adev->bios) {
2819 r = amdgpu_atombios_init(adev);
2820 if (r) {
2821 dev_err(adev->dev,
2822 "amdgpu_atombios_init failed\n");
2823 amdgpu_vf_error_put(
2824 adev,
2825 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL,
2826 0, 0);
2827 return r;
2828 }
2829 }
2830 }
2831
			/* get pf2vf msg info at its earliest time */
2833 if (amdgpu_sriov_vf(adev))
2834 amdgpu_virt_init_data_exchange(adev);
2835
2836 }
2837 }
2838 if (!total)
2839 return -ENODEV;
2840
2841 if (adev->gmc.xgmi.supported)
2842 amdgpu_xgmi_early_init(adev);
2843
2844 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2845 if (ip_block->status.valid != false)
2846 amdgpu_amdkfd_device_probe(adev);
2847
2848 adev->cg_flags &= amdgpu_cg_mask;
2849 adev->pg_flags &= amdgpu_pg_mask;
2850
2851 return 0;
2852 }
2853
static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2855 {
2856 int i, r;
2857
2858 for (i = 0; i < adev->num_ip_blocks; i++) {
2859 if (!adev->ip_blocks[i].status.sw)
2860 continue;
2861 if (adev->ip_blocks[i].status.hw)
2862 continue;
2863 if (!amdgpu_ip_member_of_hwini(
2864 adev, adev->ip_blocks[i].version->type))
2865 continue;
2866 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2867 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2868 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2869 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2870 if (r) {
2871 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2872 adev->ip_blocks[i].version->funcs->name, r);
2873 return r;
2874 }
2875 adev->ip_blocks[i].status.hw = true;
2876 }
2877 }
2878
2879 return 0;
2880 }
2881
static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2883 {
2884 int i, r;
2885
2886 for (i = 0; i < adev->num_ip_blocks; i++) {
2887 if (!adev->ip_blocks[i].status.sw)
2888 continue;
2889 if (adev->ip_blocks[i].status.hw)
2890 continue;
2891 if (!amdgpu_ip_member_of_hwini(
2892 adev, adev->ip_blocks[i].version->type))
2893 continue;
2894 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2895 if (r) {
2896 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2897 adev->ip_blocks[i].version->funcs->name, r);
2898 return r;
2899 }
2900 adev->ip_blocks[i].status.hw = true;
2901 }
2902
2903 return 0;
2904 }
2905
static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2907 {
2908 int r = 0;
2909 int i;
2910 uint32_t smu_version;
2911
2912 if (adev->asic_type >= CHIP_VEGA10) {
2913 for (i = 0; i < adev->num_ip_blocks; i++) {
2914 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2915 continue;
2916
2917 if (!amdgpu_ip_member_of_hwini(adev,
2918 AMD_IP_BLOCK_TYPE_PSP))
2919 break;
2920
2921 if (!adev->ip_blocks[i].status.sw)
2922 continue;
2923
2924 /* no need to do the fw loading again if already done*/
2925 if (adev->ip_blocks[i].status.hw == true)
2926 break;
2927
2928 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2929 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
2930 if (r)
2931 return r;
2932 } else {
2933 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2934 if (r) {
2935 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2936 adev->ip_blocks[i].version->funcs->name, r);
2937 return r;
2938 }
2939 adev->ip_blocks[i].status.hw = true;
2940 }
2941 break;
2942 }
2943 }
2944
2945 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2946 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2947
2948 return r;
2949 }
2950
static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2952 {
2953 struct drm_sched_init_args args = {
2954 .ops = &amdgpu_sched_ops,
2955 .num_rqs = DRM_SCHED_PRIORITY_COUNT,
2956 .timeout_wq = adev->reset_domain->wq,
2957 .dev = adev->dev,
2958 };
2959 long timeout;
2960 int r, i;
2961
2962 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2963 struct amdgpu_ring *ring = adev->rings[i];
2964
2965 /* No need to setup the GPU scheduler for rings that don't need it */
2966 if (!ring || ring->no_scheduler)
2967 continue;
2968
2969 switch (ring->funcs->type) {
2970 case AMDGPU_RING_TYPE_GFX:
2971 timeout = adev->gfx_timeout;
2972 break;
2973 case AMDGPU_RING_TYPE_COMPUTE:
2974 timeout = adev->compute_timeout;
2975 break;
2976 case AMDGPU_RING_TYPE_SDMA:
2977 timeout = adev->sdma_timeout;
2978 break;
2979 default:
2980 timeout = adev->video_timeout;
2981 break;
2982 }
2983
2984 args.timeout = timeout;
2985 args.credit_limit = ring->num_hw_submission;
2986 args.score = ring->sched_score;
2987 args.name = ring->name;
2988
2989 r = drm_sched_init(&ring->sched, &args);
2990 if (r) {
2991 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2992 ring->name);
2993 return r;
2994 }
2995 r = amdgpu_uvd_entity_init(adev, ring);
2996 if (r) {
2997 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2998 ring->name);
2999 return r;
3000 }
3001 r = amdgpu_vce_entity_init(adev, ring);
3002 if (r) {
3003 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
3004 ring->name);
3005 return r;
3006 }
3007 }
3008
3009 amdgpu_xcp_update_partition_sched_list(adev);
3010
3011 return 0;
3012 }
3013
3014
3015 /**
3016 * amdgpu_device_ip_init - run init for hardware IPs
3017 *
3018 * @adev: amdgpu_device pointer
3019 *
3020 * Main initialization pass for hardware IPs. The list of all the hardware
3021 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
3022 * are run. sw_init initializes the software state associated with each IP
3023 * and hw_init initializes the hardware associated with each IP.
3024 * Returns 0 on success, negative error code on failure.
3025 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
3027 {
3028 bool init_badpage;
3029 int i, r;
3030
3031 r = amdgpu_ras_init(adev);
3032 if (r)
3033 return r;
3034
3035 for (i = 0; i < adev->num_ip_blocks; i++) {
3036 if (!adev->ip_blocks[i].status.valid)
3037 continue;
3038 if (adev->ip_blocks[i].version->funcs->sw_init) {
3039 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
3040 if (r) {
3041 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
3042 adev->ip_blocks[i].version->funcs->name, r);
3043 goto init_failed;
3044 }
3045 }
3046 adev->ip_blocks[i].status.sw = true;
3047
3048 if (!amdgpu_ip_member_of_hwini(
3049 adev, adev->ip_blocks[i].version->type))
3050 continue;
3051
3052 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
3053 /* need to do common hw init early so everything is set up for gmc */
3054 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3055 if (r) {
3056 DRM_ERROR("hw_init %d failed %d\n", i, r);
3057 goto init_failed;
3058 }
3059 adev->ip_blocks[i].status.hw = true;
3060 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3061 /* need to do gmc hw init early so we can allocate gpu mem */
3062 /* Try to reserve bad pages early */
3063 if (amdgpu_sriov_vf(adev))
3064 amdgpu_virt_exchange_data(adev);
3065
3066 r = amdgpu_device_mem_scratch_init(adev);
3067 if (r) {
3068 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
3069 goto init_failed;
3070 }
3071 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
3072 if (r) {
3073 DRM_ERROR("hw_init %d failed %d\n", i, r);
3074 goto init_failed;
3075 }
3076 r = amdgpu_device_wb_init(adev);
3077 if (r) {
3078 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
3079 goto init_failed;
3080 }
3081 adev->ip_blocks[i].status.hw = true;
3082
3083 /* right after GMC hw init, we create CSA */
3084 if (adev->gfx.mcbp) {
3085 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
3086 AMDGPU_GEM_DOMAIN_VRAM |
3087 AMDGPU_GEM_DOMAIN_GTT,
3088 AMDGPU_CSA_SIZE);
3089 if (r) {
3090 DRM_ERROR("allocate CSA failed %d\n", r);
3091 goto init_failed;
3092 }
3093 }
3094
3095 r = amdgpu_seq64_init(adev);
3096 if (r) {
3097 DRM_ERROR("allocate seq64 failed %d\n", r);
3098 goto init_failed;
3099 }
3100 }
3101 }
3102
3103 if (amdgpu_sriov_vf(adev))
3104 amdgpu_virt_init_data_exchange(adev);
3105
3106 r = amdgpu_ib_pool_init(adev);
3107 if (r) {
3108 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
3109 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
3110 goto init_failed;
3111 }
3112
3113 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
3114 if (r)
3115 goto init_failed;
3116
3117 r = amdgpu_device_ip_hw_init_phase1(adev);
3118 if (r)
3119 goto init_failed;
3120
3121 r = amdgpu_device_fw_loading(adev);
3122 if (r)
3123 goto init_failed;
3124
3125 r = amdgpu_device_ip_hw_init_phase2(adev);
3126 if (r)
3127 goto init_failed;
3128
3129 /*
	 * Retired pages will be loaded from eeprom and reserved here;
	 * this should be called after amdgpu_device_ip_hw_init_phase2, since
	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
	 * functional for I2C communication, which is only true at this point.
	 *
	 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
	 * about failures caused by a bad gpu situation and stops the amdgpu
	 * init process accordingly. For other failures it still releases all
	 * the resources and prints an error message rather than returning a
	 * negative value to the upper level.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired pages from being abused.
3143 */
3144 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3145 r = amdgpu_ras_recovery_init(adev, init_badpage);
3146 if (r)
3147 goto init_failed;
3148
	/*
	 * In case of XGMI, grab an extra reference on the reset domain for
	 * this device.
	 */
3152 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3153 if (amdgpu_xgmi_add_device(adev) == 0) {
3154 if (!amdgpu_sriov_vf(adev)) {
3155 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3156
3157 if (WARN_ON(!hive)) {
3158 r = -ENOENT;
3159 goto init_failed;
3160 }
3161
3162 if (!hive->reset_domain ||
3163 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3164 r = -ENOENT;
3165 amdgpu_put_xgmi_hive(hive);
3166 goto init_failed;
3167 }
3168
3169 /* Drop the early temporary reset domain we created for device */
3170 amdgpu_reset_put_reset_domain(adev->reset_domain);
3171 adev->reset_domain = hive->reset_domain;
3172 amdgpu_put_xgmi_hive(hive);
3173 }
3174 }
3175 }
3176
3177 r = amdgpu_device_init_schedulers(adev);
3178 if (r)
3179 goto init_failed;
3180
3181 if (adev->mman.buffer_funcs_ring->sched.ready)
3182 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3183
3184 /* Don't init kfd if whole hive need to be reset during init */
3185 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3186 kgd2kfd_init_zone_device(adev);
3187 amdgpu_amdkfd_device_init(adev);
3188 }
3189
3190 amdgpu_fru_get_product_info(adev);
3191
3192 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev))
3193 r = amdgpu_cper_init(adev);
3194
3195 init_failed:
3196
3197 return r;
3198 }
3199
3200 /**
3201 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3202 *
3203 * @adev: amdgpu_device pointer
3204 *
3205 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3206 * this function before a GPU reset. If the value is retained after a
3207 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3208 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3210 {
3211 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3212 }
3213
3214 /**
3215 * amdgpu_device_check_vram_lost - check if vram is valid
3216 *
3217 * @adev: amdgpu_device pointer
3218 *
3219 * Checks the reset magic value written to the gart pointer in VRAM.
3220 * The driver calls this after a GPU reset to see if the contents of
 * VRAM has been lost or not.
 * Returns true if vram is lost, false if not.
3223 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3225 {
3226 if (memcmp(adev->gart.ptr, adev->reset_magic,
3227 AMDGPU_RESET_MAGIC_NUM))
3228 return true;
3229
3230 if (!amdgpu_in_reset(adev))
3231 return false;
3232
3233 /*
3234 * For all ASICs with baco/mode1 reset, the VRAM is
3235 * always assumed to be lost.
3236 */
3237 switch (amdgpu_asic_reset_method(adev)) {
3238 case AMD_RESET_METHOD_LINK:
3239 case AMD_RESET_METHOD_BACO:
3240 case AMD_RESET_METHOD_MODE1:
3241 return true;
3242 default:
3243 return false;
3244 }
3245 }
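
/*
 * Illustrative sketch (not part of the driver): reset handling conceptually
 * pairs the two helpers above around an ASIC reset:
 *
 *	amdgpu_device_fill_reset_magic(adev);	// done once at late init
 *	// ... GPU reset happens ...
 *	if (amdgpu_device_check_vram_lost(adev))
 *		// re-upload VRAM contents (firmware, page tables, buffers)
 */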
3246
3247 /**
3248 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3249 *
3250 * @adev: amdgpu_device pointer
3251 * @state: clockgating state (gate or ungate)
3252 *
 * Late initialization pass enabling clockgating for hardware IPs, or the
 * fini/suspend pass disabling it.
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run with the requested state.
 * Returns 0 on success, negative error code on failure.
3258 */
3259
int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3261 enum amd_clockgating_state state)
3262 {
3263 int i, j, r;
3264
3265 if (amdgpu_emu_mode == 1)
3266 return 0;
3267
3268 for (j = 0; j < adev->num_ip_blocks; j++) {
3269 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3270 if (!adev->ip_blocks[i].status.late_initialized)
3271 continue;
3272 /* skip CG for GFX, SDMA on S0ix */
3273 if (adev->in_s0ix &&
3274 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3275 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3276 continue;
3277 /* skip CG for VCE/UVD, it's handled specially */
3278 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3279 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3280 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3281 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3282 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3283 /* enable clockgating to save power */
3284 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
3285 state);
3286 if (r) {
3287 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3288 adev->ip_blocks[i].version->funcs->name, r);
3289 return r;
3290 }
3291 }
3292 }
3293
3294 return 0;
3295 }
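
/*
 * Note on the traversal above: when gating (AMD_CG_STATE_GATE) the IP list is
 * walked front to back, and when ungating it is walked back to front, so that
 * clockgating is torn down in roughly the reverse order it was enabled. The
 * powergating helper below uses the same pattern.
 */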
3296
int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3298 enum amd_powergating_state state)
3299 {
3300 int i, j, r;
3301
3302 if (amdgpu_emu_mode == 1)
3303 return 0;
3304
3305 for (j = 0; j < adev->num_ip_blocks; j++) {
3306 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3307 if (!adev->ip_blocks[i].status.late_initialized)
3308 continue;
3309 /* skip PG for GFX, SDMA on S0ix */
3310 if (adev->in_s0ix &&
3311 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3312 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3313 continue;
		/* skip PG for VCE/UVD, it's handled specially */
3315 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3316 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3317 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3318 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3319 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3320 /* enable powergating to save power */
3321 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3322 state);
3323 if (r) {
3324 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3325 adev->ip_blocks[i].version->funcs->name, r);
3326 return r;
3327 }
3328 }
3329 }
3330 return 0;
3331 }
3332
static int amdgpu_device_enable_mgpu_fan_boost(void)
3334 {
3335 struct amdgpu_gpu_instance *gpu_ins;
3336 struct amdgpu_device *adev;
3337 int i, ret = 0;
3338
3339 mutex_lock(&mgpu_info.mutex);
3340
3341 /*
3342 * MGPU fan boost feature should be enabled
3343 * only when there are two or more dGPUs in
3344 * the system
3345 */
3346 if (mgpu_info.num_dgpu < 2)
3347 goto out;
3348
3349 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3350 gpu_ins = &(mgpu_info.gpu_ins[i]);
3351 adev = gpu_ins->adev;
3352 if (!(adev->flags & AMD_IS_APU) &&
3353 !gpu_ins->mgpu_fan_enabled) {
3354 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3355 if (ret)
3356 break;
3357
3358 gpu_ins->mgpu_fan_enabled = 1;
3359 }
3360 }
3361
3362 out:
3363 mutex_unlock(&mgpu_info.mutex);
3364
3365 return ret;
3366 }
3367
3368 /**
3369 * amdgpu_device_ip_late_init - run late init for hardware IPs
3370 *
3371 * @adev: amdgpu_device pointer
3372 *
3373 * Late initialization pass for hardware IPs. The list of all the hardware
3374 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IP blocks have been initialized, or anything that
 * needs to happen late in the init process.
3378 * Returns 0 on success, negative error code on failure.
3379 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3381 {
3382 struct amdgpu_gpu_instance *gpu_instance;
3383 int i = 0, r;
3384
3385 for (i = 0; i < adev->num_ip_blocks; i++) {
3386 if (!adev->ip_blocks[i].status.hw)
3387 continue;
3388 if (adev->ip_blocks[i].version->funcs->late_init) {
3389 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3390 if (r) {
3391 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3392 adev->ip_blocks[i].version->funcs->name, r);
3393 return r;
3394 }
3395 }
3396 adev->ip_blocks[i].status.late_initialized = true;
3397 }
3398
3399 r = amdgpu_ras_late_init(adev);
3400 if (r) {
3401 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3402 return r;
3403 }
3404
3405 if (!amdgpu_reset_in_recovery(adev))
3406 amdgpu_ras_set_error_query_ready(adev, true);
3407
3408 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3409 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3410
3411 amdgpu_device_fill_reset_magic(adev);
3412
3413 r = amdgpu_device_enable_mgpu_fan_boost();
3414 if (r)
3415 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3416
3417 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
3418 if (amdgpu_passthrough(adev) &&
3419 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3420 adev->asic_type == CHIP_ALDEBARAN))
3421 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3422
3423 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3424 mutex_lock(&mgpu_info.mutex);
3425
3426 /*
		 * Reset the device p-state to low, as it was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive have been initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance; they are counted one by one as each device initializes.
		 *
		 * So we wait until all XGMI interlinked devices are initialized.
		 * This may add some delay, as those devices may come from
		 * different hives, but that should be OK.
3438 */
3439 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3440 for (i = 0; i < mgpu_info.num_gpu; i++) {
3441 gpu_instance = &(mgpu_info.gpu_ins[i]);
3442 if (gpu_instance->adev->flags & AMD_IS_APU)
3443 continue;
3444
3445 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3446 AMDGPU_XGMI_PSTATE_MIN);
3447 if (r) {
3448 DRM_ERROR("pstate setting failed (%d).\n", r);
3449 break;
3450 }
3451 }
3452 }
3453
3454 mutex_unlock(&mgpu_info.mutex);
3455 }
3456
3457 return 0;
3458 }
3459
static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3461 {
3462 int r;
3463
3464 if (!ip_block->version->funcs->hw_fini) {
3465 DRM_ERROR("hw_fini of IP block <%s> not defined\n",
3466 ip_block->version->funcs->name);
3467 } else {
3468 r = ip_block->version->funcs->hw_fini(ip_block);
3469 /* XXX handle errors */
3470 if (r) {
3471 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3472 ip_block->version->funcs->name, r);
3473 }
3474 }
3475
3476 ip_block->status.hw = false;
3477 }
3478
3479 /**
3480 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3481 *
3482 * @adev: amdgpu_device pointer
3483 *
 * For ASICs that need to disable the SMC first
3485 */
static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3487 {
3488 int i;
3489
3490 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3491 return;
3492
3493 for (i = 0; i < adev->num_ip_blocks; i++) {
3494 if (!adev->ip_blocks[i].status.hw)
3495 continue;
3496 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3497 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3498 break;
3499 }
3500 }
3501 }
3502
static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3504 {
3505 int i, r;
3506
3507 for (i = 0; i < adev->num_ip_blocks; i++) {
3508 if (!adev->ip_blocks[i].version->funcs->early_fini)
3509 continue;
3510
3511 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3512 if (r) {
3513 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3514 adev->ip_blocks[i].version->funcs->name, r);
3515 }
3516 }
3517
3518 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3519 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3520
3521 amdgpu_amdkfd_suspend(adev, false);
3522 amdgpu_userq_suspend(adev);
3523
3524 /* Workaround for ASICs that need to disable the SMC first */
3525 amdgpu_device_smu_fini_early(adev);
3526
3527 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3528 if (!adev->ip_blocks[i].status.hw)
3529 continue;
3530
3531 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3532 }
3533
3534 if (amdgpu_sriov_vf(adev)) {
3535 if (amdgpu_virt_release_full_gpu(adev, false))
3536 DRM_ERROR("failed to release exclusive mode on fini\n");
3537 }
3538
3539 return 0;
3540 }
3541
3542 /**
3543 * amdgpu_device_ip_fini - run fini for hardware IPs
3544 *
3545 * @adev: amdgpu_device pointer
3546 *
3547 * Main teardown pass for hardware IPs. The list of all the hardware
3548 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3549 * are run. hw_fini tears down the hardware associated with each IP
3550 * and sw_fini tears down any software state associated with each IP.
3551 * Returns 0 on success, negative error code on failure.
3552 */
3553 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3554 {
3555 int i, r;
3556
3557 amdgpu_cper_fini(adev);
3558
3559 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3560 amdgpu_virt_release_ras_err_handler_data(adev);
3561
3562 if (adev->gmc.xgmi.num_physical_nodes > 1)
3563 amdgpu_xgmi_remove_device(adev);
3564
3565 amdgpu_amdkfd_device_fini_sw(adev);
3566
3567 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3568 if (!adev->ip_blocks[i].status.sw)
3569 continue;
3570
3571 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3572 amdgpu_ucode_free_bo(adev);
3573 amdgpu_free_static_csa(&adev->virt.csa_obj);
3574 amdgpu_device_wb_fini(adev);
3575 amdgpu_device_mem_scratch_fini(adev);
3576 amdgpu_ib_pool_fini(adev);
3577 amdgpu_seq64_fini(adev);
3578 amdgpu_doorbell_fini(adev);
3579 }
3580 if (adev->ip_blocks[i].version->funcs->sw_fini) {
3581 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3582 /* XXX handle errors */
3583 if (r) {
3584 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3585 adev->ip_blocks[i].version->funcs->name, r);
3586 }
3587 }
3588 adev->ip_blocks[i].status.sw = false;
3589 adev->ip_blocks[i].status.valid = false;
3590 }
3591
3592 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3593 if (!adev->ip_blocks[i].status.late_initialized)
3594 continue;
3595 if (adev->ip_blocks[i].version->funcs->late_fini)
3596 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3597 adev->ip_blocks[i].status.late_initialized = false;
3598 }
3599
3600 amdgpu_ras_fini(adev);
3601
3602 return 0;
3603 }
3604
3605 /**
3606 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3607 *
3608 * @work: work_struct.
3609 */
3610 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3611 {
3612 struct amdgpu_device *adev =
3613 container_of(work, struct amdgpu_device, delayed_init_work.work);
3614 int r;
3615
3616 r = amdgpu_ib_ring_tests(adev);
3617 if (r)
3618 DRM_ERROR("ib ring test failed (%d).\n", r);
3619 }
3620
3621 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3622 {
3623 struct amdgpu_device *adev =
3624 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3625
3626 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3627 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3628
3629 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0))
3630 adev->gfx.gfx_off_state = true;
3631 }
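
/*
 * Illustrative sketch (not compiled into the driver): how the delayed GFXOFF
 * work above is normally driven.  Callers bracket direct GC register access
 * with amdgpu_gfx_off_ctrl() (see amdgpu_gfx.c); disabling raises
 * gfx_off_req_count and cancels the delayed work, while re-enabling drops the
 * count and reschedules gfx.gfx_off_delay_work, which eventually lands in
 * amdgpu_device_delay_enable_gfx_off() above.  The helper name and register
 * offset below are placeholders.
 */
#if 0
static u32 amdgpu_example_read_gc_reg(struct amdgpu_device *adev, u32 reg)
{
	u32 val;

	amdgpu_gfx_off_ctrl(adev, false);	/* keep GFX out of GFXOFF */
	val = RREG32(reg);
	amdgpu_gfx_off_ctrl(adev, true);	/* allow GFXOFF again (delayed) */

	return val;
}
#endif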
3632
3633 /**
3634 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3635 *
3636 * @adev: amdgpu_device pointer
3637 *
3638 * First suspend phase for hardware IPs. Clockgating and powergating are
3639 * ungated and the suspend callbacks are run for the display (DCE) IP
3640 * blocks only; all other blocks are suspended in phase 2. suspend puts
3641 * each IP into a state suitable for suspend.
3642 * Returns 0 on success, negative error code on failure.
3643 */
3644 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3645 {
3646 int i, r;
3647
3648 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3649 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3650
3651 /*
3652 * Per PMFW team's suggestion, driver needs to handle gfxoff
3653 * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
3654 * scenario. Add the missing df cstate disablement here.
3655 */
3656 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3657 dev_warn(adev->dev, "Failed to disallow df cstate");
3658
3659 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3660 if (!adev->ip_blocks[i].status.valid)
3661 continue;
3662
3663 /* displays are handled separately */
3664 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3665 continue;
3666
3667 /* XXX handle errors */
3668 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3669 if (r)
3670 return r;
3671 }
3672
3673 return 0;
3674 }
3675
3676 /**
3677 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3678 *
3679 * @adev: amdgpu_device pointer
3680 *
3681 * Second suspend phase for hardware IPs. The list of all the hardware
3682 * IPs that make up the asic is walked and the suspend callbacks are run
3683 * for all blocks except the display (DCE) blocks, which were suspended
3684 * in phase 1. suspend puts each IP into a state suitable for suspend.
3685 * Returns 0 on success, negative error code on failure.
3686 */
3687 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3688 {
3689 int i, r;
3690
3691 if (adev->in_s0ix)
3692 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3693
3694 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3695 if (!adev->ip_blocks[i].status.valid)
3696 continue;
3697 /* displays are handled in phase1 */
3698 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3699 continue;
3700 /* PSP lost connection when err_event_athub occurs */
3701 if (amdgpu_ras_intr_triggered() &&
3702 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3703 adev->ip_blocks[i].status.hw = false;
3704 continue;
3705 }
3706
3707 /* skip unnecessary suspend if we have not initialized them yet */
3708 if (!amdgpu_ip_member_of_hwini(
3709 adev, adev->ip_blocks[i].version->type))
3710 continue;
3711
3712 /* Since we skip suspend for S0i3, we need to cancel the delayed
3713 * idle work here as the suspend callback never gets called.
3714 */
3715 if (adev->in_s0ix &&
3716 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX &&
3717 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0))
3718 cancel_delayed_work_sync(&adev->gfx.idle_work);
3719 /* skip suspend of gfx/mes and psp for S0ix
3720 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3721 * like at runtime. PSP is also part of the always on hardware
3722 * so no need to suspend it.
3723 */
3724 if (adev->in_s0ix &&
3725 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3726 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3727 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3728 continue;
3729
3730 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3731 if (adev->in_s0ix &&
3732 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3733 IP_VERSION(5, 0, 0)) &&
3734 (adev->ip_blocks[i].version->type ==
3735 AMD_IP_BLOCK_TYPE_SDMA))
3736 continue;
3737
3738 /* During cold boot, swPSP provides the IMU and RLC FW binaries to the TOS.
3739 * These live in the TMR and are expected to be reused by PSP-TOS to reload
3740 * from that location; RLC autoload is also performed from there, based on
3741 * the PMFW -> PSP message during the re-init sequence.
3742 * Therefore, PSP suspend & resume should be skipped for IMU-enabled APU
3743 * ASICs to avoid destroying the TMR and reloading the firmware again.
3744 */
3745 if (amdgpu_in_reset(adev) &&
3746 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3747 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3748 continue;
3749
3750 /* XXX handle errors */
3751 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3752 adev->ip_blocks[i].status.hw = false;
3753
3754 /* handle putting the SMC in the appropriate state */
3755 if (!amdgpu_sriov_vf(adev)) {
3756 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3757 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3758 if (r) {
3759 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3760 adev->mp1_state, r);
3761 return r;
3762 }
3763 }
3764 }
3765 }
3766
3767 return 0;
3768 }
3769
3770 /**
3771 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3772 *
3773 * @adev: amdgpu_device pointer
3774 *
3775 * Main suspend function for hardware IPs. The list of all the hardware
3776 * IPs that make up the asic is walked, clockgating is disabled and the
3777 * suspend callbacks are run. suspend puts the hardware and software state
3778 * in each IP into a state suitable for suspend.
3779 * Returns 0 on success, negative error code on failure.
3780 */
3781 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3782 {
3783 int r;
3784
3785 if (amdgpu_sriov_vf(adev)) {
3786 amdgpu_virt_fini_data_exchange(adev);
3787 amdgpu_virt_request_full_gpu(adev, false);
3788 }
3789
3790 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3791
3792 r = amdgpu_device_ip_suspend_phase1(adev);
3793 if (r)
3794 return r;
3795 r = amdgpu_device_ip_suspend_phase2(adev);
3796
3797 if (amdgpu_sriov_vf(adev))
3798 amdgpu_virt_release_full_gpu(adev, false);
3799
3800 return r;
3801 }
3802
3803 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3804 {
3805 int i, r;
3806
3807 static enum amd_ip_block_type ip_order[] = {
3808 AMD_IP_BLOCK_TYPE_COMMON,
3809 AMD_IP_BLOCK_TYPE_GMC,
3810 AMD_IP_BLOCK_TYPE_PSP,
3811 AMD_IP_BLOCK_TYPE_IH,
3812 };
3813
3814 for (i = 0; i < adev->num_ip_blocks; i++) {
3815 int j;
3816 struct amdgpu_ip_block *block;
3817
3818 block = &adev->ip_blocks[i];
3819 block->status.hw = false;
3820
3821 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3822
3823 if (block->version->type != ip_order[j] ||
3824 !block->status.valid)
3825 continue;
3826
3827 r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
3828 if (r) {
3829 dev_err(adev->dev, "RE-INIT-early: %s failed\n",
3830 block->version->funcs->name);
3831 return r;
3832 }
3833 block->status.hw = true;
3834 }
3835 }
3836
3837 return 0;
3838 }
3839
3840 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3841 {
3842 struct amdgpu_ip_block *block;
3843 int i, r = 0;
3844
3845 static enum amd_ip_block_type ip_order[] = {
3846 AMD_IP_BLOCK_TYPE_SMC,
3847 AMD_IP_BLOCK_TYPE_DCE,
3848 AMD_IP_BLOCK_TYPE_GFX,
3849 AMD_IP_BLOCK_TYPE_SDMA,
3850 AMD_IP_BLOCK_TYPE_MES,
3851 AMD_IP_BLOCK_TYPE_UVD,
3852 AMD_IP_BLOCK_TYPE_VCE,
3853 AMD_IP_BLOCK_TYPE_VCN,
3854 AMD_IP_BLOCK_TYPE_JPEG
3855 };
3856
3857 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3858 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
3859
3860 if (!block)
3861 continue;
3862
3863 if (block->status.valid && !block->status.hw) {
3864 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
3865 r = amdgpu_ip_block_resume(block);
3866 } else {
3867 r = block->version->funcs->hw_init(block);
3868 }
3869
3870 if (r) {
3871 dev_err(adev->dev, "RE-INIT-late: %s failed\n",
3872 block->version->funcs->name);
3873 break;
3874 }
3875 block->status.hw = true;
3876 }
3877 }
3878
3879 return r;
3880 }
3881
3882 /**
3883 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3884 *
3885 * @adev: amdgpu_device pointer
3886 *
3887 * First resume function for hardware IPs. The list of all the hardware
3888 * IPs that make up the asic is walked and the resume callbacks are run for
3889 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3890 * after a suspend and updates the software state as necessary. This
3891 * function is also used for restoring the GPU after a GPU reset.
3892 * Returns 0 on success, negative error code on failure.
3893 */
3894 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3895 {
3896 int i, r;
3897
3898 for (i = 0; i < adev->num_ip_blocks; i++) {
3899 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3900 continue;
3901 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3902 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3903 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3904 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3905
3906 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3907 if (r)
3908 return r;
3909 }
3910 }
3911
3912 return 0;
3913 }
3914
3915 /**
3916 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3917 *
3918 * @adev: amdgpu_device pointer
3919 *
3920 * Second resume function for hardware IPs. The list of all the hardware
3921 * IPs that make up the asic is walked and the resume callbacks are run for
3922 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3923 * functional state after a suspend and updates the software state as
3924 * necessary. This function is also used for restoring the GPU after a GPU
3925 * reset.
3926 * Returns 0 on success, negative error code on failure.
3927 */
3928 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3929 {
3930 int i, r;
3931
3932 for (i = 0; i < adev->num_ip_blocks; i++) {
3933 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3934 continue;
3935 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3936 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3937 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3938 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3939 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3940 continue;
3941 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3942 if (r)
3943 return r;
3944 }
3945
3946 return 0;
3947 }
3948
3949 /**
3950 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3951 *
3952 * @adev: amdgpu_device pointer
3953 *
3954 * Third resume function for hardware IPs. The list of all the hardware
3955 * IPs that make up the asic is walked and the resume callbacks are run for
3956 * all DCE. resume puts the hardware into a functional state after a suspend
3957 * and updates the software state as necessary. This function is also used
3958 * for restoring the GPU after a GPU reset.
3959 *
3960 * Returns 0 on success, negative error code on failure.
3961 */
3962 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3963 {
3964 int i, r;
3965
3966 for (i = 0; i < adev->num_ip_blocks; i++) {
3967 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3968 continue;
3969 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3970 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3971 if (r)
3972 return r;
3973 }
3974 }
3975
3976 return 0;
3977 }
3978
3979 /**
3980 * amdgpu_device_ip_resume - run resume for hardware IPs
3981 *
3982 * @adev: amdgpu_device pointer
3983 *
3984 * Main resume function for hardware IPs. The hardware IPs
3985 * are split into multiple resume phases because they are
3986 * also used in recovering from a GPU reset and some additional
3987 * steps need to be taken between them. In this case (S3/S4) they are
3988 * run sequentially.
3989 * Returns 0 on success, negative error code on failure.
3990 */
3991 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3992 {
3993 int r;
3994
3995 r = amdgpu_device_ip_resume_phase1(adev);
3996 if (r)
3997 return r;
3998
3999 r = amdgpu_device_fw_loading(adev);
4000 if (r)
4001 return r;
4002
4003 r = amdgpu_device_ip_resume_phase2(adev);
4004
4005 if (adev->mman.buffer_funcs_ring->sched.ready)
4006 amdgpu_ttm_set_buffer_funcs_status(adev, true);
4007
4008 if (r)
4009 return r;
4010
4011 amdgpu_fence_driver_hw_init(adev);
4012
4013 r = amdgpu_device_ip_resume_phase3(adev);
4014
4015 return r;
4016 }
4017
4018 /**
4019 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4020 *
4021 * @adev: amdgpu_device pointer
4022 *
4023 * Query the VBIOS data tables to determine if the board supports SR-IOV.
4024 */
4025 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4026 {
4027 if (amdgpu_sriov_vf(adev)) {
4028 if (adev->is_atom_fw) {
4029 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4030 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4031 } else {
4032 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4033 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4034 }
4035
4036 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4037 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4038 }
4039 }
4040
4041 /**
4042 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4043 *
4044 * @asic_type: AMD asic type
4045 *
4046 * Check if there is DC (new modesetting infrastructure) support for an asic.
4047 * returns true if DC has support, false if not.
4048 */
4049 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
4050 {
4051 switch (asic_type) {
4052 #ifdef CONFIG_DRM_AMDGPU_SI
4053 case CHIP_HAINAN:
4054 #endif
4055 case CHIP_TOPAZ:
4056 /* chips with no display hardware */
4057 return false;
4058 #if defined(CONFIG_DRM_AMD_DC)
4059 case CHIP_TAHITI:
4060 case CHIP_PITCAIRN:
4061 case CHIP_VERDE:
4062 case CHIP_OLAND:
4063 /*
4064 * We have systems in the wild with these ASICs that require
4065 * LVDS and VGA support which is not supported with DC.
4066 *
4067 * Fall back to the non-DC driver here by default so as not to
4068 * cause regressions.
4069 */
4070 #if defined(CONFIG_DRM_AMD_DC_SI)
4071 return amdgpu_dc > 0;
4072 #else
4073 return false;
4074 #endif
4075 case CHIP_BONAIRE:
4076 case CHIP_KAVERI:
4077 case CHIP_KABINI:
4078 case CHIP_MULLINS:
4079 /*
4080 * We have systems in the wild with these ASICs that require
4081 * VGA support which is not supported with DC.
4082 *
4083 * Fall back to the non-DC driver here by default so as not to
4084 * cause regressions.
4085 */
4086 return amdgpu_dc > 0;
4087 default:
4088 return amdgpu_dc != 0;
4089 #else
4090 default:
4091 if (amdgpu_dc > 0)
4092 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
4093 return false;
4094 #endif
4095 }
4096 }
4097
4098 /**
4099 * amdgpu_device_has_dc_support - check if dc is supported
4100 *
4101 * @adev: amdgpu_device pointer
4102 *
4103 * Returns true for supported, false for not supported
4104 */
4105 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
4106 {
4107 if (adev->enable_virtual_display ||
4108 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
4109 return false;
4110
4111 return amdgpu_device_asic_has_dc_support(adev->asic_type);
4112 }
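
/*
 * Illustrative sketch (not compiled into the driver): a typical consumer of
 * the DC support checks above when assembling the IP block list for an ASIC.
 * The dm_ip_block name and the helper are assumptions for illustration; the
 * real selection happens in the per-ASIC/discovery code.
 */
#if 0
static int amdgpu_example_add_display_block(struct amdgpu_device *adev)
{
	/* Virtual display and a harvested DMU are already handled above. */
	if (!amdgpu_device_has_dc_support(adev))
		return 0;	/* fall back to the non-DC display path */

	return amdgpu_device_ip_block_add(adev, &dm_ip_block);
}
#endif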
4113
4114 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
4115 {
4116 struct amdgpu_device *adev =
4117 container_of(__work, struct amdgpu_device, xgmi_reset_work);
4118 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
4119
4120 /* It's a bug to not have a hive within this function */
4121 if (WARN_ON(!hive))
4122 return;
4123
4124 /*
4125 * Use task barrier to synchronize all xgmi reset works across the
4126 * hive. task_barrier_enter and task_barrier_exit will block
4127 * until all the threads running the xgmi reset works reach
4128 * those points. task_barrier_full will do both blocks.
4129 */
4130 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
4131
4132 task_barrier_enter(&hive->tb);
4133 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
4134
4135 if (adev->asic_reset_res)
4136 goto fail;
4137
4138 task_barrier_exit(&hive->tb);
4139 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
4140
4141 if (adev->asic_reset_res)
4142 goto fail;
4143
4144 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
4145 } else {
4146
4147 task_barrier_full(&hive->tb);
4148 adev->asic_reset_res = amdgpu_asic_reset(adev);
4149 }
4150
4151 fail:
4152 if (adev->asic_reset_res)
4153 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4154 adev->asic_reset_res, adev_to_drm(adev)->unique);
4155 amdgpu_put_xgmi_hive(hive);
4156 }
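
/*
 * Illustrative sketch (not compiled into the driver): the task_barrier
 * pattern used above.  Every device in the hive queues its own
 * xgmi_reset_work; task_barrier_enter() blocks until all workers arrive, so
 * BACO entry happens on every node before task_barrier_exit() lets any node
 * proceed to BACO exit.  Error handling is omitted and the helper name is an
 * assumption.
 */
#if 0
static void amdgpu_example_baco_reset_step(struct amdgpu_device *adev,
					   struct amdgpu_hive_info *hive)
{
	/* Phase 1: nobody enters BACO until all hive members are here. */
	task_barrier_enter(&hive->tb);
	amdgpu_device_baco_enter(adev_to_drm(adev));

	/* Phase 2: nobody leaves BACO until all hive members have entered. */
	task_barrier_exit(&hive->tb);
	amdgpu_device_baco_exit(adev_to_drm(adev));
}
#endif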
4157
4158 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4159 {
4160 char *input = amdgpu_lockup_timeout;
4161 char *timeout_setting = NULL;
4162 int index = 0;
4163 long timeout;
4164 int ret = 0;
4165
4166 /*
4167 * By default the timeout for non-compute jobs is 10000 ms
4168 * and 60000 ms for compute jobs.
4169 * In SR-IOV or passthrough mode, the timeout for compute
4170 * jobs is 60000 ms by default.
4171 */
4172 adev->gfx_timeout = msecs_to_jiffies(10000);
4173 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4174 if (amdgpu_sriov_vf(adev))
4175 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
4176 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
4177 else
4178 adev->compute_timeout = msecs_to_jiffies(60000);
4179
4180 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4181 while ((timeout_setting = strsep(&input, ",")) &&
4182 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4183 ret = kstrtol(timeout_setting, 0, &timeout);
4184 if (ret)
4185 return ret;
4186
4187 if (timeout == 0) {
4188 index++;
4189 continue;
4190 } else if (timeout < 0) {
4191 timeout = MAX_SCHEDULE_TIMEOUT;
4192 dev_warn(adev->dev, "lockup timeout disabled");
4193 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4194 } else {
4195 timeout = msecs_to_jiffies(timeout);
4196 }
4197
4198 switch (index++) {
4199 case 0:
4200 adev->gfx_timeout = timeout;
4201 break;
4202 case 1:
4203 adev->compute_timeout = timeout;
4204 break;
4205 case 2:
4206 adev->sdma_timeout = timeout;
4207 break;
4208 case 3:
4209 adev->video_timeout = timeout;
4210 break;
4211 default:
4212 break;
4213 }
4214 }
4215 /*
4216 * There is only one value specified and
4217 * it should apply to all non-compute jobs.
4218 */
4219 if (index == 1) {
4220 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4221 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4222 adev->compute_timeout = adev->gfx_timeout;
4223 }
4224 }
4225
4226 return ret;
4227 }
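
/*
 * Illustrative sketch (not compiled into the driver): how an
 * amdgpu.lockup_timeout string maps onto the per-engine timeouts parsed
 * above.  Values are in milliseconds; 0 keeps the default and a negative
 * value disables the timeout.  The example strings and the helper name are
 * for illustration only.
 */
#if 0
static void amdgpu_example_lockup_timeout_usage(struct amdgpu_device *adev)
{
	/*
	 * "10000"                -> gfx/sdma/video = 10s; compute keeps its
	 *                           default unless SR-IOV/passthrough.
	 * "10000,60000,10000,-1" -> gfx = 10s, compute = 60s, sdma = 10s,
	 *                           video timeout disabled.
	 */
	dev_info(adev->dev, "timeouts (jiffies): gfx %ld compute %ld sdma %ld video %ld\n",
		 adev->gfx_timeout, adev->compute_timeout,
		 adev->sdma_timeout, adev->video_timeout);
}
#endif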
4228
4229 /**
4230 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4231 *
4232 * @adev: amdgpu_device pointer
4233 *
4234 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
4235 */
4236 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4237 {
4238 struct iommu_domain *domain;
4239
4240 domain = iommu_get_domain_for_dev(adev->dev);
4241 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4242 adev->ram_is_direct_mapped = true;
4243 }
4244
4245 #if defined(CONFIG_HSA_AMD_P2P)
4246 /**
4247 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4248 *
4249 * @adev: amdgpu_device pointer
4250 *
4251 * Returns true if the IOMMU is remapping BAR addresses (DMA/DMA-FQ domain), false otherwise.
4252 */
4253 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4254 {
4255 struct iommu_domain *domain;
4256
4257 domain = iommu_get_domain_for_dev(adev->dev);
4258 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4259 domain->type == IOMMU_DOMAIN_DMA_FQ))
4260 return true;
4261
4262 return false;
4263 }
4264 #endif
4265
4266 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4267 {
4268 if (amdgpu_mcbp == 1)
4269 adev->gfx.mcbp = true;
4270 else if (amdgpu_mcbp == 0)
4271 adev->gfx.mcbp = false;
4272
4273 if (amdgpu_sriov_vf(adev))
4274 adev->gfx.mcbp = true;
4275
4276 if (adev->gfx.mcbp)
4277 DRM_INFO("MCBP is enabled\n");
4278 }
4279
4280 /**
4281 * amdgpu_device_init - initialize the driver
4282 *
4283 * @adev: amdgpu_device pointer
4284 * @flags: driver flags
4285 *
4286 * Initializes the driver info and hw (all asics).
4287 * Returns 0 for success or an error on failure.
4288 * Called at driver startup.
4289 */
4290 int amdgpu_device_init(struct amdgpu_device *adev,
4291 uint32_t flags)
4292 {
4293 struct drm_device *ddev = adev_to_drm(adev);
4294 struct pci_dev *pdev = adev->pdev;
4295 int r, i;
4296 bool px = false;
4297 u32 max_MBps;
4298 int tmp;
4299
4300 adev->shutdown = false;
4301 adev->flags = flags;
4302
4303 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4304 adev->asic_type = amdgpu_force_asic_type;
4305 else
4306 adev->asic_type = flags & AMD_ASIC_MASK;
4307
4308 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4309 if (amdgpu_emu_mode == 1)
4310 adev->usec_timeout *= 10;
4311 adev->gmc.gart_size = 512 * 1024 * 1024;
4312 adev->accel_working = false;
4313 adev->num_rings = 0;
4314 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4315 adev->mman.buffer_funcs = NULL;
4316 adev->mman.buffer_funcs_ring = NULL;
4317 adev->vm_manager.vm_pte_funcs = NULL;
4318 adev->vm_manager.vm_pte_num_scheds = 0;
4319 adev->gmc.gmc_funcs = NULL;
4320 adev->harvest_ip_mask = 0x0;
4321 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4322 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4323
4324 adev->smc_rreg = &amdgpu_invalid_rreg;
4325 adev->smc_wreg = &amdgpu_invalid_wreg;
4326 adev->pcie_rreg = &amdgpu_invalid_rreg;
4327 adev->pcie_wreg = &amdgpu_invalid_wreg;
4328 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4329 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4330 adev->pciep_rreg = &amdgpu_invalid_rreg;
4331 adev->pciep_wreg = &amdgpu_invalid_wreg;
4332 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4333 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4334 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4335 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4336 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4337 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4338 adev->didt_rreg = &amdgpu_invalid_rreg;
4339 adev->didt_wreg = &amdgpu_invalid_wreg;
4340 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4341 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4342 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4343 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4344
4345 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4346 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4347 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4348
4349 /* mutex initialization are all done here so we
4350 * can recall function without having locking issues
4351 */
4352 mutex_init(&adev->firmware.mutex);
4353 mutex_init(&adev->pm.mutex);
4354 mutex_init(&adev->gfx.gpu_clock_mutex);
4355 mutex_init(&adev->srbm_mutex);
4356 mutex_init(&adev->gfx.pipe_reserve_mutex);
4357 mutex_init(&adev->gfx.gfx_off_mutex);
4358 mutex_init(&adev->gfx.partition_mutex);
4359 mutex_init(&adev->grbm_idx_mutex);
4360 mutex_init(&adev->mn_lock);
4361 mutex_init(&adev->virt.vf_errors.lock);
4362 hash_init(adev->mn_hash);
4363 mutex_init(&adev->psp.mutex);
4364 mutex_init(&adev->notifier_lock);
4365 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4366 mutex_init(&adev->benchmark_mutex);
4367 mutex_init(&adev->gfx.reset_sem_mutex);
4368 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4369 mutex_init(&adev->enforce_isolation_mutex);
4370 for (i = 0; i < MAX_XCP; ++i) {
4371 adev->isolation[i].spearhead = dma_fence_get_stub();
4372 amdgpu_sync_create(&adev->isolation[i].active);
4373 amdgpu_sync_create(&adev->isolation[i].prev);
4374 }
4375 mutex_init(&adev->gfx.userq_sch_mutex);
4376 mutex_init(&adev->gfx.workload_profile_mutex);
4377 mutex_init(&adev->vcn.workload_profile_mutex);
4378 mutex_init(&adev->userq_mutex);
4379
4380 amdgpu_device_init_apu_flags(adev);
4381
4382 r = amdgpu_device_check_arguments(adev);
4383 if (r)
4384 return r;
4385
4386 spin_lock_init(&adev->mmio_idx_lock);
4387 spin_lock_init(&adev->smc_idx_lock);
4388 spin_lock_init(&adev->pcie_idx_lock);
4389 spin_lock_init(&adev->uvd_ctx_idx_lock);
4390 spin_lock_init(&adev->didt_idx_lock);
4391 spin_lock_init(&adev->gc_cac_idx_lock);
4392 spin_lock_init(&adev->se_cac_idx_lock);
4393 spin_lock_init(&adev->audio_endpt_idx_lock);
4394 spin_lock_init(&adev->mm_stats.lock);
4395 spin_lock_init(&adev->virt.rlcg_reg_lock);
4396 spin_lock_init(&adev->wb.lock);
4397
4398 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
4399
4400 INIT_LIST_HEAD(&adev->reset_list);
4401
4402 INIT_LIST_HEAD(&adev->ras_list);
4403
4404 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4405
4406 INIT_LIST_HEAD(&adev->userq_mgr_list);
4407
4408 INIT_DELAYED_WORK(&adev->delayed_init_work,
4409 amdgpu_device_delayed_init_work_handler);
4410 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4411 amdgpu_device_delay_enable_gfx_off);
4412 /*
4413 * Initialize the enforce_isolation work structures for each XCP
4414 * partition. This work handler is responsible for enforcing shader
4415 * isolation on AMD GPUs. It counts the number of emitted fences for
4416 * each GFX and compute ring. If there are any fences, it schedules
4417 * the `enforce_isolation_work` to be run after a delay. If there are
4418 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4419 * runqueue.
4420 */
4421 for (i = 0; i < MAX_XCP; i++) {
4422 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4423 amdgpu_gfx_enforce_isolation_handler);
4424 adev->gfx.enforce_isolation[i].adev = adev;
4425 adev->gfx.enforce_isolation[i].xcp_id = i;
4426 }
4427
4428 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4429
4430 adev->gfx.gfx_off_req_count = 1;
4431 adev->gfx.gfx_off_residency = 0;
4432 adev->gfx.gfx_off_entrycount = 0;
4433 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4434
4435 atomic_set(&adev->throttling_logging_enabled, 1);
4436 /*
4437 * If throttling continues, logging will be performed every minute
4438 * to avoid log flooding. "-1" is subtracted since the thermal
4439 * throttling interrupt comes every second. Thus, the total logging
4440 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4441 * for throttling interrupt) = 60 seconds.
4442 */
4443 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4444
4445 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4446
4447 /* Registers mapping */
4448 /* TODO: block userspace mapping of io register */
4449 if (adev->asic_type >= CHIP_BONAIRE) {
4450 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4451 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4452 } else {
4453 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4454 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4455 }
4456
4457 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4458 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4459
4460 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4461 if (!adev->rmmio)
4462 return -ENOMEM;
4463
4464 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4465 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4466
4467 /*
4468 * Reset domain needs to be present early, before XGMI hive discovered
4469 * (if any) and initialized to use reset sem and in_gpu reset flag
4470 * early on during init and before calling to RREG32.
4471 */
4472 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4473 if (!adev->reset_domain)
4474 return -ENOMEM;
4475
4476 /* detect hw virtualization here */
4477 amdgpu_virt_init(adev);
4478
4479 amdgpu_device_get_pcie_info(adev);
4480
4481 r = amdgpu_device_get_job_timeout_settings(adev);
4482 if (r) {
4483 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4484 return r;
4485 }
4486
4487 amdgpu_device_set_mcbp(adev);
4488
4489 /*
4490 * By default, use default mode where all blocks are expected to be
4491 * initialized. At present, 'swinit' of all blocks must complete
4492 * before the need for a different level can be detected.
4493 */
4494 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4495 /* early init functions */
4496 r = amdgpu_device_ip_early_init(adev);
4497 if (r)
4498 return r;
4499
4500 /*
4501 * No need to remove conflicting FBs for non-display class devices.
4502 * This prevents the sysfb from being freed accidentally.
4503 */
4504 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
4505 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
4506 /* Get rid of things like offb */
4507 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4508 if (r)
4509 return r;
4510 }
4511
4512 /* Enable TMZ based on IP_VERSION */
4513 amdgpu_gmc_tmz_set(adev);
4514
4515 if (amdgpu_sriov_vf(adev) &&
4516 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4517 /* VF MMIO access (except mailbox range) from CPU
4518 * will be blocked during sriov runtime
4519 */
4520 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4521
4522 amdgpu_gmc_noretry_set(adev);
4523 /* Need to get xgmi info early to decide the reset behavior*/
4524 if (adev->gmc.xgmi.supported) {
4525 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4526 if (r)
4527 return r;
4528 }
4529
4530 /* enable PCIE atomic ops */
4531 if (amdgpu_sriov_vf(adev)) {
4532 if (adev->virt.fw_reserve.p_pf2vf)
4533 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4534 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4535 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4536 /* APUs with gfx9 onwards don't rely on PCIe atomics; their
4537 * internal path natively supports atomics, so set have_atomics_support to true.
4538 */
4539 } else if ((adev->flags & AMD_IS_APU) &&
4540 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4541 IP_VERSION(9, 0, 0))) {
4542 adev->have_atomics_support = true;
4543 } else {
4544 adev->have_atomics_support =
4545 !pci_enable_atomic_ops_to_root(adev->pdev,
4546 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4547 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4548 }
4549
4550 if (!adev->have_atomics_support)
4551 dev_info(adev->dev, "PCIe atomic ops are not supported\n");
4552
4553 /* doorbell bar mapping and doorbell index init*/
4554 amdgpu_doorbell_init(adev);
4555
4556 if (amdgpu_emu_mode == 1) {
4557 /* post the asic on emulation mode */
4558 emu_soc_asic_init(adev);
4559 goto fence_driver_init;
4560 }
4561
4562 amdgpu_reset_init(adev);
4563
4564 /* detect if we are with an SRIOV vbios */
4565 if (adev->bios)
4566 amdgpu_device_detect_sriov_bios(adev);
4567
4568 /* check if we need to reset the asic
4569 * E.g., driver was not cleanly unloaded previously, etc.
4570 */
4571 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4572 if (adev->gmc.xgmi.num_physical_nodes) {
4573 dev_info(adev->dev, "Pending hive reset.\n");
4574 amdgpu_set_init_level(adev,
4575 AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4576 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4577 !amdgpu_device_has_display_hardware(adev)) {
4578 r = psp_gpu_reset(adev);
4579 } else {
4580 tmp = amdgpu_reset_method;
4581 /* It should do a default reset when loading or reloading the driver,
4582 * regardless of the module parameter reset_method.
4583 */
4584 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4585 r = amdgpu_asic_reset(adev);
4586 amdgpu_reset_method = tmp;
4587 }
4588
4589 if (r) {
4590 dev_err(adev->dev, "asic reset on init failed\n");
4591 goto failed;
4592 }
4593 }
4594
4595 /* Post card if necessary */
4596 if (amdgpu_device_need_post(adev)) {
4597 if (!adev->bios) {
4598 dev_err(adev->dev, "no vBIOS found\n");
4599 r = -EINVAL;
4600 goto failed;
4601 }
4602 DRM_INFO("GPU posting now...\n");
4603 r = amdgpu_device_asic_init(adev);
4604 if (r) {
4605 dev_err(adev->dev, "gpu post error!\n");
4606 goto failed;
4607 }
4608 }
4609
4610 if (adev->bios) {
4611 if (adev->is_atom_fw) {
4612 /* Initialize clocks */
4613 r = amdgpu_atomfirmware_get_clock_info(adev);
4614 if (r) {
4615 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4616 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4617 goto failed;
4618 }
4619 } else {
4620 /* Initialize clocks */
4621 r = amdgpu_atombios_get_clock_info(adev);
4622 if (r) {
4623 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4624 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4625 goto failed;
4626 }
4627 /* init i2c buses */
4628 amdgpu_i2c_init(adev);
4629 }
4630 }
4631
4632 fence_driver_init:
4633 /* Fence driver */
4634 r = amdgpu_fence_driver_sw_init(adev);
4635 if (r) {
4636 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4637 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4638 goto failed;
4639 }
4640
4641 /* init the mode config */
4642 drm_mode_config_init(adev_to_drm(adev));
4643
4644 r = amdgpu_device_ip_init(adev);
4645 if (r) {
4646 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4647 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4648 goto release_ras_con;
4649 }
4650
4651 amdgpu_fence_driver_hw_init(adev);
4652
4653 dev_info(adev->dev,
4654 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4655 adev->gfx.config.max_shader_engines,
4656 adev->gfx.config.max_sh_per_se,
4657 adev->gfx.config.max_cu_per_sh,
4658 adev->gfx.cu_info.number);
4659
4660 adev->accel_working = true;
4661
4662 amdgpu_vm_check_compute_bug(adev);
4663
4664 /* Initialize the buffer migration limit. */
4665 if (amdgpu_moverate >= 0)
4666 max_MBps = amdgpu_moverate;
4667 else
4668 max_MBps = 8; /* Allow 8 MB/s. */
4669 /* Get a log2 for easy divisions. */
4670 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4671
4672 /*
4673 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4674 * Otherwise the mgpu fan boost feature will be skipped because the
4675 * gpu instance count would be too low.
4676 */
4677 amdgpu_register_gpu_instance(adev);
4678
4679 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4680 * explicit gating rather than handling it automatically.
4681 */
4682 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4683 r = amdgpu_device_ip_late_init(adev);
4684 if (r) {
4685 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4686 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4687 goto release_ras_con;
4688 }
4689 /* must succeed. */
4690 amdgpu_ras_resume(adev);
4691 queue_delayed_work(system_wq, &adev->delayed_init_work,
4692 msecs_to_jiffies(AMDGPU_RESUME_MS));
4693 }
4694
4695 if (amdgpu_sriov_vf(adev)) {
4696 amdgpu_virt_release_full_gpu(adev, true);
4697 flush_delayed_work(&adev->delayed_init_work);
4698 }
4699
4700 /*
4701 * Place the sysfs registration after `late_init`, as some of the
4702 * operations performed in `late_init` might affect the creation of
4703 * the sysfs interfaces.
4704 */
4705 r = amdgpu_atombios_sysfs_init(adev);
4706 if (r)
4707 drm_err(&adev->ddev,
4708 "registering atombios sysfs failed (%d).\n", r);
4709
4710 r = amdgpu_pm_sysfs_init(adev);
4711 if (r)
4712 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4713
4714 r = amdgpu_ucode_sysfs_init(adev);
4715 if (r) {
4716 adev->ucode_sysfs_en = false;
4717 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4718 } else
4719 adev->ucode_sysfs_en = true;
4720
4721 r = amdgpu_device_attr_sysfs_init(adev);
4722 if (r)
4723 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4724
4725 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4726 if (r)
4727 dev_err(adev->dev,
4728 "Could not create amdgpu board attributes\n");
4729
4730 amdgpu_fru_sysfs_init(adev);
4731 amdgpu_reg_state_sysfs_init(adev);
4732 amdgpu_xcp_sysfs_init(adev);
4733
4734 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
4735 r = amdgpu_pmu_init(adev);
4736 if (r)
4737 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
}
4738
4739 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4740 if (amdgpu_device_cache_pci_state(adev->pdev))
4741 pci_restore_state(pdev);
4742
4743 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4744 /* this will fail for cards that aren't VGA class devices, just
4745 * ignore it
4746 */
4747 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4748 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4749
4750 px = amdgpu_device_supports_px(ddev);
4751
4752 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4753 apple_gmux_detect(NULL, NULL)))
4754 vga_switcheroo_register_client(adev->pdev,
4755 &amdgpu_switcheroo_ops, px);
4756
4757 if (px)
4758 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4759
4760 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4761 amdgpu_xgmi_reset_on_init(adev);
4762
4763 amdgpu_device_check_iommu_direct_map(adev);
4764
4765 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4766 r = register_pm_notifier(&adev->pm_nb);
4767 if (r)
4768 goto failed;
4769
4770 return 0;
4771
4772 release_ras_con:
4773 if (amdgpu_sriov_vf(adev))
4774 amdgpu_virt_release_full_gpu(adev, true);
4775
4776 /* failed in exclusive mode due to timeout */
4777 if (amdgpu_sriov_vf(adev) &&
4778 !amdgpu_sriov_runtime(adev) &&
4779 amdgpu_virt_mmio_blocked(adev) &&
4780 !amdgpu_virt_wait_reset(adev)) {
4781 dev_err(adev->dev, "VF exclusive mode timeout\n");
4782 /* Don't send request since VF is inactive. */
4783 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4784 adev->virt.ops = NULL;
4785 r = -EAGAIN;
4786 }
4787 amdgpu_release_ras_context(adev);
4788
4789 failed:
4790 amdgpu_vf_error_trans_all(adev);
4791
4792 return r;
4793 }
4794
4795 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4796 {
4797
4798 /* Clear all CPU mappings pointing to this device */
4799 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4800
4801 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4802 amdgpu_doorbell_fini(adev);
4803
4804 iounmap(adev->rmmio);
4805 adev->rmmio = NULL;
4806 if (adev->mman.aper_base_kaddr)
4807 iounmap(adev->mman.aper_base_kaddr);
4808 adev->mman.aper_base_kaddr = NULL;
4809
4810 /* Memory manager related */
4811 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4812 arch_phys_wc_del(adev->gmc.vram_mtrr);
4813 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4814 }
4815 }
4816
4817 /**
4818 * amdgpu_device_fini_hw - tear down the driver
4819 *
4820 * @adev: amdgpu_device pointer
4821 *
4822 * Tear down the driver info (all asics).
4823 * Called at driver shutdown.
4824 */
4825 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4826 {
4827 dev_info(adev->dev, "amdgpu: finishing device.\n");
4828 flush_delayed_work(&adev->delayed_init_work);
4829
4830 if (adev->mman.initialized)
4831 drain_workqueue(adev->mman.bdev.wq);
4832 adev->shutdown = true;
4833
4834 unregister_pm_notifier(&adev->pm_nb);
4835
4836 /* make sure the IB tests are finished before entering exclusive mode
4837 * to avoid preemption of the IB tests
4838 */
4839 if (amdgpu_sriov_vf(adev)) {
4840 amdgpu_virt_request_full_gpu(adev, false);
4841 amdgpu_virt_fini_data_exchange(adev);
4842 }
4843
4844 /* disable all interrupts */
4845 amdgpu_irq_disable_all(adev);
4846 if (adev->mode_info.mode_config_initialized) {
4847 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4848 drm_helper_force_disable_all(adev_to_drm(adev));
4849 else
4850 drm_atomic_helper_shutdown(adev_to_drm(adev));
4851 }
4852 amdgpu_fence_driver_hw_fini(adev);
4853
4854 if (adev->pm.sysfs_initialized)
4855 amdgpu_pm_sysfs_fini(adev);
4856 if (adev->ucode_sysfs_en)
4857 amdgpu_ucode_sysfs_fini(adev);
4858 amdgpu_device_attr_sysfs_fini(adev);
4859 amdgpu_fru_sysfs_fini(adev);
4860
4861 amdgpu_reg_state_sysfs_fini(adev);
4862 amdgpu_xcp_sysfs_fini(adev);
4863
4864 /* RAS features must be disabled before hw fini */
4865 amdgpu_ras_pre_fini(adev);
4866
4867 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4868
4869 amdgpu_device_ip_fini_early(adev);
4870
4871 amdgpu_irq_fini_hw(adev);
4872
4873 if (adev->mman.initialized)
4874 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4875
4876 amdgpu_gart_dummy_page_fini(adev);
4877
4878 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4879 amdgpu_device_unmap_mmio(adev);
4880
4881 }
4882
4883 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4884 {
4885 int i, idx;
4886 bool px;
4887
4888 amdgpu_device_ip_fini(adev);
4889 amdgpu_fence_driver_sw_fini(adev);
4890 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4891 adev->accel_working = false;
4892 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4893 for (i = 0; i < MAX_XCP; ++i) {
4894 dma_fence_put(adev->isolation[i].spearhead);
4895 amdgpu_sync_free(&adev->isolation[i].active);
4896 amdgpu_sync_free(&adev->isolation[i].prev);
4897 }
4898
4899 amdgpu_reset_fini(adev);
4900
4901 /* free i2c buses */
4902 amdgpu_i2c_fini(adev);
4903
4904 if (adev->bios) {
4905 if (amdgpu_emu_mode != 1)
4906 amdgpu_atombios_fini(adev);
4907 amdgpu_bios_release(adev);
4908 }
4909
4910 kfree(adev->fru_info);
4911 adev->fru_info = NULL;
4912
4913 kfree(adev->xcp_mgr);
4914 adev->xcp_mgr = NULL;
4915
4916 px = amdgpu_device_supports_px(adev_to_drm(adev));
4917
4918 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4919 apple_gmux_detect(NULL, NULL)))
4920 vga_switcheroo_unregister_client(adev->pdev);
4921
4922 if (px)
4923 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4924
4925 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4926 vga_client_unregister(adev->pdev);
4927
4928 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4929
4930 iounmap(adev->rmmio);
4931 adev->rmmio = NULL;
4932 drm_dev_exit(idx);
4933 }
4934
4935 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4936 amdgpu_pmu_fini(adev);
4937 if (adev->mman.discovery_bin)
4938 amdgpu_discovery_fini(adev);
4939
4940 amdgpu_reset_put_reset_domain(adev->reset_domain);
4941 adev->reset_domain = NULL;
4942
4943 kfree(adev->pci_state);
4944
4945 }
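
/*
 * Illustrative sketch (not compiled into the driver): the expected pairing of
 * the init and teardown entry points in this file.  The real callers live in
 * the KMS load/unload and PCI probe/remove paths; the flags value and helper
 * name are placeholders.
 */
#if 0
static int amdgpu_example_bring_up_and_down(struct amdgpu_device *adev,
					    uint32_t flags)
{
	int r;

	r = amdgpu_device_init(adev, flags);
	if (r)
		return r;

	/* ... the device is usable here ... */

	amdgpu_device_fini_hw(adev);	/* quiesce hw, unmap MMIO if unplugged */
	amdgpu_device_fini_sw(adev);	/* free remaining software state */

	return 0;
}
#endif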
4946
4947 /**
4948 * amdgpu_device_evict_resources - evict device resources
4949 * @adev: amdgpu device object
4950 *
4951 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4952 * of the vram memory type. Mainly used for evicting device resources
4953 * at suspend time.
4954 *
4955 */
4956 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4957 {
4958 int ret;
4959
4960 /* No need to evict vram on APUs unless going to S4 */
4961 if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
4962 return 0;
4963
4964 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4965 if (ret)
4966 DRM_WARN("evicting device resources failed\n");
4967 return ret;
4968 }
4969
4970 /*
4971 * Suspend & resume.
4972 */
4973 /**
4974 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
4975 * @nb: notifier block
4976 * @mode: suspend mode
4977 * @data: data
4978 *
4979 * This function is called when the system is about to suspend or hibernate.
4980 * It is used to set the appropriate flags so that eviction can be optimized
4981 * in the pm prepare callback.
4982 */
4983 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
4984 void *data)
4985 {
4986 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
4987
4988 switch (mode) {
4989 case PM_HIBERNATION_PREPARE:
4990 adev->in_s4 = true;
4991 break;
4992 case PM_POST_HIBERNATION:
4993 adev->in_s4 = false;
4994 break;
4995 }
4996
4997 return NOTIFY_DONE;
4998 }
4999
5000 /**
5001 * amdgpu_device_prepare - prepare for device suspend
5002 *
5003 * @dev: drm dev pointer
5004 *
5005 * Prepare to put the hw in the suspend state (all asics).
5006 * Returns 0 for success or an error on failure.
5007 * Called at driver suspend.
5008 */
5009 int amdgpu_device_prepare(struct drm_device *dev)
5010 {
5011 struct amdgpu_device *adev = drm_to_adev(dev);
5012 int i, r;
5013
5014 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5015 return 0;
5016
5017 /* Evict the majority of BOs before starting suspend sequence */
5018 r = amdgpu_device_evict_resources(adev);
5019 if (r)
5020 return r;
5021
5022 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
5023
5024 for (i = 0; i < adev->num_ip_blocks; i++) {
5025 if (!adev->ip_blocks[i].status.valid)
5026 continue;
5027 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
5028 continue;
5029 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
5030 if (r)
5031 return r;
5032 }
5033
5034 return 0;
5035 }
5036
5037 /**
5038 * amdgpu_device_suspend - initiate device suspend
5039 *
5040 * @dev: drm dev pointer
5041 * @notify_clients: notify in-kernel DRM clients
5042 *
5043 * Puts the hw in the suspend state (all asics).
5044 * Returns 0 for success or an error on failure.
5045 * Called at driver suspend.
5046 */
5047 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
5048 {
5049 struct amdgpu_device *adev = drm_to_adev(dev);
5050 int r = 0;
5051
5052 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5053 return 0;
5054
5055 adev->in_suspend = true;
5056
5057 if (amdgpu_sriov_vf(adev)) {
5058 amdgpu_virt_fini_data_exchange(adev);
5059 r = amdgpu_virt_request_full_gpu(adev, false);
5060 if (r)
5061 return r;
5062 }
5063
5064 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
5065 DRM_WARN("smart shift update failed\n");
5066
5067 if (notify_clients)
5068 drm_client_dev_suspend(adev_to_drm(adev), false);
5069
5070 cancel_delayed_work_sync(&adev->delayed_init_work);
5071
5072 amdgpu_ras_suspend(adev);
5073
5074 amdgpu_device_ip_suspend_phase1(adev);
5075
5076 if (!adev->in_s0ix) {
5077 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
5078 amdgpu_userq_suspend(adev);
5079 }
5080
5081 r = amdgpu_device_evict_resources(adev);
5082 if (r)
5083 return r;
5084
5085 amdgpu_ttm_set_buffer_funcs_status(adev, false);
5086
5087 amdgpu_fence_driver_hw_fini(adev);
5088
5089 amdgpu_device_ip_suspend_phase2(adev);
5090
5091 if (amdgpu_sriov_vf(adev))
5092 amdgpu_virt_release_full_gpu(adev, false);
5093
5094 r = amdgpu_dpm_notify_rlc_state(adev, false);
5095 if (r)
5096 return r;
5097
5098 return 0;
5099 }
5100
5101 /**
5102 * amdgpu_device_resume - initiate device resume
5103 *
5104 * @dev: drm dev pointer
5105 * @notify_clients: notify in-kernel DRM clients
5106 *
5107 * Bring the hw back to operating state (all asics).
5108 * Returns 0 for success or an error on failure.
5109 * Called at driver resume.
5110 */
5111 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
5112 {
5113 struct amdgpu_device *adev = drm_to_adev(dev);
5114 int r = 0;
5115
5116 if (amdgpu_sriov_vf(adev)) {
5117 r = amdgpu_virt_request_full_gpu(adev, true);
5118 if (r)
5119 return r;
5120 }
5121
5122 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
5123 return 0;
5124
5125 if (adev->in_s0ix)
5126 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
5127
5128 /* post card */
5129 if (amdgpu_device_need_post(adev)) {
5130 r = amdgpu_device_asic_init(adev);
5131 if (r)
5132 dev_err(adev->dev, "amdgpu asic init failed\n");
5133 }
5134
5135 r = amdgpu_device_ip_resume(adev);
5136
5137 if (r) {
5138 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
5139 goto exit;
5140 }
5141
5142 if (!adev->in_s0ix) {
5143 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
5144 if (r)
5145 goto exit;
5146
5147 r = amdgpu_userq_resume(adev);
5148 if (r)
5149 goto exit;
5150 }
5151
5152 r = amdgpu_device_ip_late_init(adev);
5153 if (r)
5154 goto exit;
5155
5156 queue_delayed_work(system_wq, &adev->delayed_init_work,
5157 msecs_to_jiffies(AMDGPU_RESUME_MS));
5158 exit:
5159 if (amdgpu_sriov_vf(adev)) {
5160 amdgpu_virt_init_data_exchange(adev);
5161 amdgpu_virt_release_full_gpu(adev, true);
5162 }
5163
5164 if (r)
5165 return r;
5166
5167 /* Make sure IB tests flushed */
5168 flush_delayed_work(&adev->delayed_init_work);
5169
5170 if (notify_clients)
5171 drm_client_dev_resume(adev_to_drm(adev), false);
5172
5173 amdgpu_ras_resume(adev);
5174
5175 if (adev->mode_info.num_crtc) {
5176 /*
5177 * Most of the connector probing functions try to acquire runtime pm
5178 * refs to ensure that the GPU is powered on when connector polling is
5179 * performed. Since we're calling this from a runtime PM callback,
5180 * trying to acquire rpm refs will cause us to deadlock.
5181 *
5182 * Since we're guaranteed to be holding the rpm lock, it's safe to
5183 * temporarily disable the rpm helpers so this doesn't deadlock us.
5184 */
5185 #ifdef CONFIG_PM
5186 dev->dev->power.disable_depth++;
5187 #endif
5188 if (!adev->dc_enabled)
5189 drm_helper_hpd_irq_event(dev);
5190 else
5191 drm_kms_helper_hotplug_event(dev);
5192 #ifdef CONFIG_PM
5193 dev->dev->power.disable_depth--;
5194 #endif
5195 }
5196 adev->in_suspend = false;
5197
5198 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
5199 DRM_WARN("smart shift update failed\n");
5200
5201 return 0;
5202 }
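
/*
 * Illustrative sketch (not compiled into the driver): how the three
 * system-sleep entry points in this file are expected to be paired by the PM
 * callbacks (the real dev_pm_ops live in the driver module code).
 * notify_clients is true for ordinary S3/S4 flows so in-kernel DRM clients
 * are suspended and resumed as well.  The helper name is an assumption.
 */
#if 0
static int amdgpu_example_system_sleep_cycle(struct drm_device *ddev)
{
	int r;

	r = amdgpu_device_prepare(ddev);	/* evict BOs, prepare_suspend */
	if (r)
		return r;

	r = amdgpu_device_suspend(ddev, true);	/* phase1 + phase2 suspend */
	if (r)
		return r;

	/* ... platform firmware puts the system to sleep and wakes it ... */

	return amdgpu_device_resume(ddev, true);
}
#endif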
5203
5204 /**
5205 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5206 *
5207 * @adev: amdgpu_device pointer
5208 *
5209 * The list of all the hardware IPs that make up the asic is walked and
5210 * the check_soft_reset callbacks are run. check_soft_reset determines
5211 * if the asic is still hung or not.
5212 * Returns true if any of the IPs are still in a hung state, false if not.
5213 */
5214 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5215 {
5216 int i;
5217 bool asic_hang = false;
5218
5219 if (amdgpu_sriov_vf(adev))
5220 return true;
5221
5222 if (amdgpu_asic_need_full_reset(adev))
5223 return true;
5224
5225 for (i = 0; i < adev->num_ip_blocks; i++) {
5226 if (!adev->ip_blocks[i].status.valid)
5227 continue;
5228 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5229 adev->ip_blocks[i].status.hang =
5230 adev->ip_blocks[i].version->funcs->check_soft_reset(
5231 &adev->ip_blocks[i]);
5232 if (adev->ip_blocks[i].status.hang) {
5233 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5234 asic_hang = true;
5235 }
5236 }
5237 return asic_hang;
5238 }
5239
5240 /**
5241 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5242 *
5243 * @adev: amdgpu_device pointer
5244 *
5245 * The list of all the hardware IPs that make up the asic is walked and the
5246 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5247 * handles any IP specific hardware or software state changes that are
5248 * necessary for a soft reset to succeed.
5249 * Returns 0 on success, negative error code on failure.
5250 */
5251 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5252 {
5253 int i, r = 0;
5254
5255 for (i = 0; i < adev->num_ip_blocks; i++) {
5256 if (!adev->ip_blocks[i].status.valid)
5257 continue;
5258 if (adev->ip_blocks[i].status.hang &&
5259 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5260 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5261 if (r)
5262 return r;
5263 }
5264 }
5265
5266 return 0;
5267 }
5268
5269 /**
5270 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5271 *
5272 * @adev: amdgpu_device pointer
5273 *
5274 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5275 * reset is necessary to recover.
5276 * Returns true if a full asic reset is required, false if not.
5277 */
5278 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5279 {
5280 int i;
5281
5282 if (amdgpu_asic_need_full_reset(adev))
5283 return true;
5284
5285 for (i = 0; i < adev->num_ip_blocks; i++) {
5286 if (!adev->ip_blocks[i].status.valid)
5287 continue;
5288 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5289 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5290 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5291 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5292 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5293 if (adev->ip_blocks[i].status.hang) {
5294 dev_info(adev->dev, "Some blocks need a full reset!\n");
5295 return true;
5296 }
5297 }
5298 }
5299 return false;
5300 }
5301
5302 /**
5303 * amdgpu_device_ip_soft_reset - do a soft reset
5304 *
5305 * @adev: amdgpu_device pointer
5306 *
5307 * The list of all the hardware IPs that make up the asic is walked and the
5308 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5309 * IP specific hardware or software state changes that are necessary to soft
5310 * reset the IP.
5311 * Returns 0 on success, negative error code on failure.
5312 */
5313 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5314 {
5315 int i, r = 0;
5316
5317 for (i = 0; i < adev->num_ip_blocks; i++) {
5318 if (!adev->ip_blocks[i].status.valid)
5319 continue;
5320 if (adev->ip_blocks[i].status.hang &&
5321 adev->ip_blocks[i].version->funcs->soft_reset) {
5322 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5323 if (r)
5324 return r;
5325 }
5326 }
5327
5328 return 0;
5329 }
5330
5331 /**
5332 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5333 *
5334 * @adev: amdgpu_device pointer
5335 *
5336 * The list of all the hardware IPs that make up the asic is walked and the
5337 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5338 * handles any IP specific hardware or software state changes that are
5339 * necessary after the IP has been soft reset.
5340 * Returns 0 on success, negative error code on failure.
5341 */
5342 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5343 {
5344 int i, r = 0;
5345
5346 for (i = 0; i < adev->num_ip_blocks; i++) {
5347 if (!adev->ip_blocks[i].status.valid)
5348 continue;
5349 if (adev->ip_blocks[i].status.hang &&
5350 adev->ip_blocks[i].version->funcs->post_soft_reset)
5351 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5352 if (r)
5353 return r;
5354 }
5355
5356 return 0;
5357 }
5358
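/*
 * The soft-reset helpers above are only meaningful as a sequence. A minimal
 * sketch of how a caller might chain them, modeled on the soft-reset path in
 * amdgpu_device_pre_asic_reset() further below (the fallback to a full reset
 * is simplified here):
 *
 *	if (amdgpu_gpu_recovery && amdgpu_device_ip_check_soft_reset(adev)) {
 *		amdgpu_device_ip_pre_soft_reset(adev);
 *		r = amdgpu_device_ip_soft_reset(adev);
 *		amdgpu_device_ip_post_soft_reset(adev);
 *		if (r || amdgpu_device_ip_check_soft_reset(adev))
 *			need_full_reset = true;
 *	}
 */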
5359 /**
5360 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5361 *
5362 * @adev: amdgpu_device pointer
5363 * @reset_context: amdgpu reset context pointer
5364 *
5365 * Do VF FLR and reinitialize the ASIC.
5366 * Returns 0 on success, negative error code on failure.
5367 */
5368 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5369 struct amdgpu_reset_context *reset_context)
5370 {
5371 int r;
5372 struct amdgpu_hive_info *hive = NULL;
5373
5374 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5375 if (!amdgpu_ras_get_fed_status(adev))
5376 amdgpu_virt_ready_to_reset(adev);
5377 amdgpu_virt_wait_reset(adev);
5378 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5379 r = amdgpu_virt_request_full_gpu(adev, true);
5380 } else {
5381 r = amdgpu_virt_reset_gpu(adev);
5382 }
5383 if (r)
5384 return r;
5385
5386 amdgpu_ras_clear_err_state(adev);
5387 amdgpu_irq_gpu_reset_resume_helper(adev);
5388
5389 /* some SW cleanup the VF needs to do before recovery */
5390 amdgpu_virt_post_reset(adev);
5391
5392 /* Resume IP prior to SMC */
5393 r = amdgpu_device_ip_reinit_early_sriov(adev);
5394 if (r)
5395 return r;
5396
5397 amdgpu_virt_init_data_exchange(adev);
5398
5399 r = amdgpu_device_fw_loading(adev);
5400 if (r)
5401 return r;
5402
5403 /* now we are okay to resume SMC/CP/SDMA */
5404 r = amdgpu_device_ip_reinit_late_sriov(adev);
5405 if (r)
5406 return r;
5407
5408 hive = amdgpu_get_xgmi_hive(adev);
5409 /* Update PSP FW topology after reset */
5410 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5411 r = amdgpu_xgmi_update_topology(hive, adev);
5412 if (hive)
5413 amdgpu_put_xgmi_hive(hive);
5414 if (r)
5415 return r;
5416
5417 r = amdgpu_ib_ring_tests(adev);
5418 if (r)
5419 return r;
5420
5421 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5422 amdgpu_inc_vram_lost(adev);
5423
5424 /* This needs to be called while we have full GPU access, so we can't
5425 * do it later like bare metal does.
5426 */
5427 amdgpu_amdkfd_post_reset(adev);
5428 amdgpu_virt_release_full_gpu(adev, true);
5429
5430 /* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so we need to resume RAS during reset */
5431 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5432 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5433 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5434 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
5435 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5436 amdgpu_ras_resume(adev);
5437
5438 amdgpu_virt_ras_telemetry_post_reset(adev);
5439
5440 return 0;
5441 }
5442
5443 /**
5444 * amdgpu_device_has_job_running - check if there is any unfinished job
5445 *
5446 * @adev: amdgpu_device pointer
5447 *
5448 * Check if there is any job running on the device when the guest driver
5449 * receives an FLR notification from the host driver. If there are still jobs
5450 * running, the guest driver will not respond to the FLR reset. Instead, it
5451 * lets the job hit the timeout, and the guest driver then issues the reset request.
5452 */
5453 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5454 {
5455 int i;
5456
5457 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5458 struct amdgpu_ring *ring = adev->rings[i];
5459
5460 if (!amdgpu_ring_sched_ready(ring))
5461 continue;
5462
5463 if (amdgpu_fence_count_emitted(ring))
5464 return true;
5465 }
5466 return false;
5467 }
5468
5469 /**
5470 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5471 *
5472 * @adev: amdgpu_device pointer
5473 *
5474 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5475 * a hung GPU.
5476 */
5477 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5478 {
5479
5480 if (amdgpu_gpu_recovery == 0)
5481 goto disabled;
5482
5483 /* Skip soft reset check in fatal error mode */
5484 if (!amdgpu_ras_is_poison_mode_supported(adev))
5485 return true;
5486
5487 if (amdgpu_sriov_vf(adev))
5488 return true;
5489
5490 if (amdgpu_gpu_recovery == -1) {
5491 switch (adev->asic_type) {
5492 #ifdef CONFIG_DRM_AMDGPU_SI
5493 case CHIP_VERDE:
5494 case CHIP_TAHITI:
5495 case CHIP_PITCAIRN:
5496 case CHIP_OLAND:
5497 case CHIP_HAINAN:
5498 #endif
5499 #ifdef CONFIG_DRM_AMDGPU_CIK
5500 case CHIP_KAVERI:
5501 case CHIP_KABINI:
5502 case CHIP_MULLINS:
5503 #endif
5504 case CHIP_CARRIZO:
5505 case CHIP_STONEY:
5506 case CHIP_CYAN_SKILLFISH:
5507 goto disabled;
5508 default:
5509 break;
5510 }
5511 }
5512
5513 return true;
5514
5515 disabled:
5516 dev_info(adev->dev, "GPU recovery disabled.\n");
5517 return false;
5518 }
5519
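/*
 * A minimal sketch of how the two checks above might gate a recovery attempt,
 * for example from an SR-IOV FLR notification handler (the real handler lives
 * elsewhere in the driver and does considerably more than this):
 *
 *	if (amdgpu_device_has_job_running(adev))
 *		return;
 *	if (!amdgpu_device_should_recover_gpu(adev))
 *		return;
 *	r = amdgpu_device_gpu_recover(adev, NULL, &reset_context);
 *
 * In the first case the job is left to hit its timeout so that the timeout
 * handler issues the reset request instead.
 */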
5520 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5521 {
5522 u32 i;
5523 int ret = 0;
5524
5525 if (adev->bios)
5526 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5527
5528 dev_info(adev->dev, "GPU mode1 reset\n");
5529
5530 /* Cache the state before bus master disable. The saved config space
5531 * values are used in other cases like restore after mode-2 reset.
5532 */
5533 amdgpu_device_cache_pci_state(adev->pdev);
5534
5535 /* disable BM */
5536 pci_clear_master(adev->pdev);
5537
5538 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5539 dev_info(adev->dev, "GPU smu mode1 reset\n");
5540 ret = amdgpu_dpm_mode1_reset(adev);
5541 } else {
5542 dev_info(adev->dev, "GPU psp mode1 reset\n");
5543 ret = psp_gpu_reset(adev);
5544 }
5545
5546 if (ret)
5547 goto mode1_reset_failed;
5548
5549 amdgpu_device_load_pci_state(adev->pdev);
5550 ret = amdgpu_psp_wait_for_bootloader(adev);
5551 if (ret)
5552 goto mode1_reset_failed;
5553
5554 /* wait for asic to come out of reset */
5555 for (i = 0; i < adev->usec_timeout; i++) {
5556 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5557
5558 if (memsize != 0xffffffff)
5559 break;
5560 udelay(1);
5561 }
5562
5563 if (i >= adev->usec_timeout) {
5564 ret = -ETIMEDOUT;
5565 goto mode1_reset_failed;
5566 }
5567
5568 if (adev->bios)
5569 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5570
5571 return 0;
5572
5573 mode1_reset_failed:
5574 dev_err(adev->dev, "GPU mode1 reset failed\n");
5575 return ret;
5576 }
5577
5578 int amdgpu_device_link_reset(struct amdgpu_device *adev)
5579 {
5580 int ret = 0;
5581
5582 dev_info(adev->dev, "GPU link reset\n");
5583
5584 if (!adev->pcie_reset_ctx.occurs_dpc)
5585 ret = amdgpu_dpm_link_reset(adev);
5586
5587 if (ret)
5588 goto link_reset_failed;
5589
5590 ret = amdgpu_psp_wait_for_bootloader(adev);
5591 if (ret)
5592 goto link_reset_failed;
5593
5594 return 0;
5595
5596 link_reset_failed:
5597 dev_err(adev->dev, "GPU link reset failed\n");
5598 return ret;
5599 }
5600
5601 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5602 struct amdgpu_reset_context *reset_context)
5603 {
5604 int i, r = 0;
5605 struct amdgpu_job *job = NULL;
5606 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5607 bool need_full_reset =
5608 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5609
5610 if (reset_context->reset_req_dev == adev)
5611 job = reset_context->job;
5612
5613 if (amdgpu_sriov_vf(adev))
5614 amdgpu_virt_pre_reset(adev);
5615
5616 amdgpu_fence_driver_isr_toggle(adev, true);
5617
5618 /* block all schedulers and reset given job's ring */
5619 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5620 struct amdgpu_ring *ring = adev->rings[i];
5621
5622 if (!amdgpu_ring_sched_ready(ring))
5623 continue;
5624
5625 /* Clear the job fence from the fence drv so that force_completion
5626 * does not leave NULL and vm flush fences in the fence drv.
5627 */
5628 amdgpu_fence_driver_clear_job_fences(ring);
5629
5630 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5631 amdgpu_fence_driver_force_completion(ring);
5632 }
5633
5634 amdgpu_fence_driver_isr_toggle(adev, false);
5635
5636 if (job && job->vm)
5637 drm_sched_increase_karma(&job->base);
5638
5639 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5640 /* If reset handler not implemented, continue; otherwise return */
5641 if (r == -EOPNOTSUPP)
5642 r = 0;
5643 else
5644 return r;
5645
5646 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5647 if (!amdgpu_sriov_vf(adev)) {
5648
5649 if (!need_full_reset)
5650 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5651
5652 if (!need_full_reset && amdgpu_gpu_recovery &&
5653 amdgpu_device_ip_check_soft_reset(adev)) {
5654 amdgpu_device_ip_pre_soft_reset(adev);
5655 r = amdgpu_device_ip_soft_reset(adev);
5656 amdgpu_device_ip_post_soft_reset(adev);
5657 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5658 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5659 need_full_reset = true;
5660 }
5661 }
5662
5663 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5664 dev_info(tmp_adev->dev, "Dumping IP State\n");
5665 /* Trigger ip dump before we reset the asic */
5666 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5667 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5668 tmp_adev->ip_blocks[i].version->funcs
5669 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5670 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5671 }
5672
5673 if (need_full_reset)
5674 r = amdgpu_device_ip_suspend(adev);
5675 if (need_full_reset)
5676 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5677 else
5678 clear_bit(AMDGPU_NEED_FULL_RESET,
5679 &reset_context->flags);
5680 }
5681
5682 return r;
5683 }
5684
5685 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
5686 {
5687 struct list_head *device_list_handle;
5688 bool full_reset, vram_lost = false;
5689 struct amdgpu_device *tmp_adev;
5690 int r, init_level;
5691
5692 device_list_handle = reset_context->reset_device_list;
5693
5694 if (!device_list_handle)
5695 return -EINVAL;
5696
5697 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5698
5699 /*
5700 * If this is a reset on init, use the default init level; otherwise
5701 * keep the level at recovery level.
5702 */
5703 if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
5704 init_level = AMDGPU_INIT_LEVEL_DEFAULT;
5705 else
5706 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
5707
5708 r = 0;
5709 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5710 amdgpu_set_init_level(tmp_adev, init_level);
5711 if (full_reset) {
5712 /* post card */
5713 amdgpu_ras_clear_err_state(tmp_adev);
5714 r = amdgpu_device_asic_init(tmp_adev);
5715 if (r) {
5716 dev_warn(tmp_adev->dev, "asic atom init failed!");
5717 } else {
5718 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5719
5720 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5721 if (r)
5722 goto out;
5723
5724 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5725
5726 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5727 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5728
5729 if (vram_lost) {
5730 DRM_INFO("VRAM is lost due to GPU reset!\n");
5731 amdgpu_inc_vram_lost(tmp_adev);
5732 }
5733
5734 r = amdgpu_device_fw_loading(tmp_adev);
5735 if (r)
5736 return r;
5737
5738 r = amdgpu_xcp_restore_partition_mode(
5739 tmp_adev->xcp_mgr);
5740 if (r)
5741 goto out;
5742
5743 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5744 if (r)
5745 goto out;
5746
5747 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5748 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5749
5750 r = amdgpu_device_ip_resume_phase3(tmp_adev);
5751 if (r)
5752 goto out;
5753
5754 if (vram_lost)
5755 amdgpu_device_fill_reset_magic(tmp_adev);
5756
5757 /*
5758 * Add this ASIC back as tracked now that the reset
5759 * has completed successfully.
5760 */
5761 amdgpu_register_gpu_instance(tmp_adev);
5762
5763 if (!reset_context->hive &&
5764 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5765 amdgpu_xgmi_add_device(tmp_adev);
5766
5767 r = amdgpu_device_ip_late_init(tmp_adev);
5768 if (r)
5769 goto out;
5770
5771 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
5772
5773 /*
5774 * The GPU enters a bad state once the number of faulty
5775 * pages detected by ECC reaches the threshold, and RAS
5776 * recovery is scheduled next. So add a check here to
5777 * break recovery if the bad page threshold has indeed
5778 * been exceeded, and remind the user to retire this GPU
5779 * or set a bigger bad_page_threshold value to fix this
5780 * the next time the driver is probed.
5782 */
5783 if (!amdgpu_ras_is_rma(tmp_adev)) {
5784 /* must succeed. */
5785 amdgpu_ras_resume(tmp_adev);
5786 } else {
5787 r = -EINVAL;
5788 goto out;
5789 }
5790
5791 /* Update PSP FW topology after reset */
5792 if (reset_context->hive &&
5793 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5794 r = amdgpu_xgmi_update_topology(
5795 reset_context->hive, tmp_adev);
5796 }
5797 }
5798
5799 out:
5800 if (!r) {
5801 /* IP init is complete now, set level as default */
5802 amdgpu_set_init_level(tmp_adev,
5803 AMDGPU_INIT_LEVEL_DEFAULT);
5804 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5805 r = amdgpu_ib_ring_tests(tmp_adev);
5806 if (r) {
5807 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5808 r = -EAGAIN;
5809 goto end;
5810 }
5811 }
5812
5813 if (r)
5814 tmp_adev->asic_reset_res = r;
5815 }
5816
5817 end:
5818 return r;
5819 }
5820
5821 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5822 struct amdgpu_reset_context *reset_context)
5823 {
5824 struct amdgpu_device *tmp_adev = NULL;
5825 bool need_full_reset, skip_hw_reset;
5826 int r = 0;
5827
5828 /* Try reset handler method first */
5829 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5830 reset_list);
5831
5832 reset_context->reset_device_list = device_list_handle;
5833 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5834 /* If reset handler not implemented, continue; otherwise return */
5835 if (r == -EOPNOTSUPP)
5836 r = 0;
5837 else
5838 return r;
5839
5840 /* Reset handler not implemented, use the default method */
5841 need_full_reset =
5842 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5843 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5844
5845 /*
5846 * ASIC reset has to be done on all XGMI hive nodes ASAP
5847 * to allow proper link negotiation in FW (within 1 sec)
5848 */
5849 if (!skip_hw_reset && need_full_reset) {
5850 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5851 /* For XGMI run all resets in parallel to speed up the process */
5852 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5853 if (!queue_work(system_unbound_wq,
5854 &tmp_adev->xgmi_reset_work))
5855 r = -EALREADY;
5856 } else
5857 r = amdgpu_asic_reset(tmp_adev);
5858
5859 if (r) {
5860 dev_err(tmp_adev->dev,
5861 "ASIC reset failed with error, %d for drm dev, %s",
5862 r, adev_to_drm(tmp_adev)->unique);
5863 goto out;
5864 }
5865 }
5866
5867 /* For XGMI wait for all resets to complete before proceed */
5868 if (!r) {
5869 list_for_each_entry(tmp_adev, device_list_handle,
5870 reset_list) {
5871 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5872 flush_work(&tmp_adev->xgmi_reset_work);
5873 r = tmp_adev->asic_reset_res;
5874 if (r)
5875 break;
5876 }
5877 }
5878 }
5879 }
5880
5881 if (!r && amdgpu_ras_intr_triggered()) {
5882 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5883 amdgpu_ras_reset_error_count(tmp_adev,
5884 AMDGPU_RAS_BLOCK__MMHUB);
5885 }
5886
5887 amdgpu_ras_intr_cleared();
5888 }
5889
5890 r = amdgpu_device_reinit_after_reset(reset_context);
5891 if (r == -EAGAIN)
5892 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5893 else
5894 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5895
5896 out:
5897 return r;
5898 }
5899
5900 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5901 {
5902
5903 switch (amdgpu_asic_reset_method(adev)) {
5904 case AMD_RESET_METHOD_MODE1:
5905 case AMD_RESET_METHOD_LINK:
5906 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5907 break;
5908 case AMD_RESET_METHOD_MODE2:
5909 adev->mp1_state = PP_MP1_STATE_RESET;
5910 break;
5911 default:
5912 adev->mp1_state = PP_MP1_STATE_NONE;
5913 break;
5914 }
5915 }
5916
5917 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5918 {
5919 amdgpu_vf_error_trans_all(adev);
5920 adev->mp1_state = PP_MP1_STATE_NONE;
5921 }
5922
5923 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5924 {
5925 struct pci_dev *p = NULL;
5926
5927 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5928 adev->pdev->bus->number, 1);
5929 if (p) {
5930 pm_runtime_enable(&(p->dev));
5931 pm_runtime_resume(&(p->dev));
5932 }
5933
5934 pci_dev_put(p);
5935 }
5936
5937 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5938 {
5939 enum amd_reset_method reset_method;
5940 struct pci_dev *p = NULL;
5941 u64 expires;
5942
5943 /*
5944 * For now, only BACO and mode1 reset are confirmed to suffer
5945 * the audio issue if the audio device is not properly suspended.
5946 */
5947 reset_method = amdgpu_asic_reset_method(adev);
5948 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5949 (reset_method != AMD_RESET_METHOD_MODE1))
5950 return -EINVAL;
5951
5952 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5953 adev->pdev->bus->number, 1);
5954 if (!p)
5955 return -ENODEV;
5956
5957 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5958 if (!expires)
5959 /*
5960 * If we cannot get the audio device autosuspend delay,
5961 * fall back to a fixed 4s interval. The audio controller's
5962 * default autosuspend delay is 3s, so 4s is guaranteed
5963 * to cover it.
5964 */
5965 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5966
5967 while (!pm_runtime_status_suspended(&(p->dev))) {
5968 if (!pm_runtime_suspend(&(p->dev)))
5969 break;
5970
5971 if (expires < ktime_get_mono_fast_ns()) {
5972 dev_warn(adev->dev, "failed to suspend display audio\n");
5973 pci_dev_put(p);
5974 /* TODO: abort the succeeding gpu reset? */
5975 return -ETIMEDOUT;
5976 }
5977 }
5978
5979 pm_runtime_disable(&(p->dev));
5980
5981 pci_dev_put(p);
5982 return 0;
5983 }
5984
5985 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5986 {
5987 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5988
5989 #if defined(CONFIG_DEBUG_FS)
5990 if (!amdgpu_sriov_vf(adev))
5991 cancel_work(&adev->reset_work);
5992 #endif
5993
5994 if (adev->kfd.dev)
5995 cancel_work(&adev->kfd.reset_work);
5996
5997 if (amdgpu_sriov_vf(adev))
5998 cancel_work(&adev->virt.flr_work);
5999
6000 if (con && adev->ras_enabled)
6001 cancel_work(&con->recovery_work);
6002
6003 }
6004
6005 static int amdgpu_device_health_check(struct list_head *device_list_handle)
6006 {
6007 struct amdgpu_device *tmp_adev;
6008 int ret = 0;
6009 u32 status;
6010
6011 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6012 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
6013 if (PCI_POSSIBLE_ERROR(status)) {
6014 dev_err(tmp_adev->dev, "device lost from bus!");
6015 ret = -ENODEV;
6016 }
6017 }
6018
6019 return ret;
6020 }
6021
6022 static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
6023 struct list_head *device_list,
6024 struct amdgpu_hive_info *hive)
6025 {
6026 struct amdgpu_device *tmp_adev = NULL;
6027 int r;
6028
6029 /*
6030 * Build list of devices to reset.
6031 * In case we are in XGMI hive mode, resort the device list
6032 * to put adev in the 1st position.
6033 */
6034 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
6035 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
6036 list_add_tail(&tmp_adev->reset_list, device_list);
6037 if (adev->shutdown)
6038 tmp_adev->shutdown = true;
6039 if (adev->pcie_reset_ctx.occurs_dpc)
6040 tmp_adev->pcie_reset_ctx.in_link_reset = true;
6041 }
6042 if (!list_is_first(&adev->reset_list, device_list))
6043 list_rotate_to_front(&adev->reset_list, device_list);
6044 } else {
6045 list_add_tail(&adev->reset_list, device_list);
6046 }
6047
6048 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
6049 r = amdgpu_device_health_check(device_list);
6050 if (r)
6051 return r;
6052 }
6053
6054 return 0;
6055 }
6056
6057 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
6058 struct list_head *device_list)
6059 {
6060 struct amdgpu_device *tmp_adev = NULL;
6061
6062 if (list_empty(device_list))
6063 return;
6064 tmp_adev =
6065 list_first_entry(device_list, struct amdgpu_device, reset_list);
6066 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
6067 }
6068
6069 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
6070 struct list_head *device_list)
6071 {
6072 struct amdgpu_device *tmp_adev = NULL;
6073
6074 if (list_empty(device_list))
6075 return;
6076 tmp_adev =
6077 list_first_entry(device_list, struct amdgpu_device, reset_list);
6078 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6079 }
6080
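/*
 * The two helpers above bracket the whole recovery sequence; a minimal sketch
 * of the pairing, matching what amdgpu_device_gpu_recover() does below:
 *
 *	amdgpu_device_recovery_get_reset_lock(adev, &device_list);
 *	... halt activities, reset the ASIC(s), resume the schedulers ...
 *	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
 */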
6081 static int amdgpu_device_halt_activities(
6082 struct amdgpu_device *adev, struct amdgpu_job *job,
6083 struct amdgpu_reset_context *reset_context,
6084 struct list_head *device_list, struct amdgpu_hive_info *hive,
6085 bool need_emergency_restart)
6086 {
6087 struct amdgpu_device *tmp_adev = NULL;
6088 int i, r = 0;
6089
6090 /* block all schedulers and reset given job's ring */
6091 list_for_each_entry(tmp_adev, device_list, reset_list) {
6092 amdgpu_device_set_mp1_state(tmp_adev);
6093
6094 /*
6095 * Try to put the audio codec into suspend state
6096 * before the gpu reset is started.
6097 *
6098 * Because the power domain of the graphics device
6099 * is shared with the AZ power domain, without this
6100 * we may change the audio hardware behind the audio
6101 * driver's back, which would trigger audio codec
6102 * errors.
6103 */
6104 if (!amdgpu_device_suspend_display_audio(tmp_adev))
6105 tmp_adev->pcie_reset_ctx.audio_suspended = true;
6106
6107 amdgpu_ras_set_error_query_ready(tmp_adev, false);
6108
6109 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
6110
6111 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
6112
6113 /*
6114 * Mark these ASICs to be reset as untracked first,
6115 * and add them back after the reset has completed.
6116 */
6117 amdgpu_unregister_gpu_instance(tmp_adev);
6118
6119 drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
6120
6121 /* disable ras on ALL IPs */
6122 if (!need_emergency_restart &&
6123 (!adev->pcie_reset_ctx.occurs_dpc) &&
6124 amdgpu_device_ip_need_full_reset(tmp_adev))
6125 amdgpu_ras_suspend(tmp_adev);
6126
6127 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6128 struct amdgpu_ring *ring = tmp_adev->rings[i];
6129
6130 if (!amdgpu_ring_sched_ready(ring))
6131 continue;
6132
6133 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
6134
6135 if (need_emergency_restart)
6136 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
6137 }
6138 atomic_inc(&tmp_adev->gpu_reset_counter);
6139 }
6140
6141 return r;
6142 }
6143
6144 static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
6145 struct list_head *device_list,
6146 struct amdgpu_reset_context *reset_context)
6147 {
6148 struct amdgpu_device *tmp_adev = NULL;
6149 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
6150 int r = 0;
6151
6152 retry: /* Rest of adevs pre asic reset from XGMI hive. */
6153 list_for_each_entry(tmp_adev, device_list, reset_list) {
6154 if (adev->pcie_reset_ctx.occurs_dpc)
6155 tmp_adev->no_hw_access = true;
6156 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
6157 if (adev->pcie_reset_ctx.occurs_dpc)
6158 tmp_adev->no_hw_access = false;
6159 /* TODO: Should we stop? */
6160 if (r) {
6161 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
6162 r, adev_to_drm(tmp_adev)->unique);
6163 tmp_adev->asic_reset_res = r;
6164 }
6165 }
6166
6167 /* Actual ASIC resets if needed.*/
6168 /* Host driver will handle XGMI hive reset for SRIOV */
6169 if (amdgpu_sriov_vf(adev)) {
6170
6171 /* Bail out of reset early */
6172 if (amdgpu_ras_is_rma(adev))
6173 return -ENODEV;
6174
6175 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
6176 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
6177 amdgpu_ras_set_fed(adev, true);
6178 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
6179 }
6180
6181 r = amdgpu_device_reset_sriov(adev, reset_context);
6182 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
6183 amdgpu_virt_release_full_gpu(adev, true);
6184 goto retry;
6185 }
6186 if (r)
6187 adev->asic_reset_res = r;
6188 } else {
6189 r = amdgpu_do_asic_reset(device_list, reset_context);
6190 if (r && r == -EAGAIN)
6191 goto retry;
6192 }
6193
6194 list_for_each_entry(tmp_adev, device_list, reset_list) {
6195 /*
6196 * Drop any pending non-scheduler resets queued before the reset is done.
6197 * Any reset scheduled after this point would be valid. Scheduler resets
6198 * were already dropped during drm_sched_stop and no new ones can come
6199 * in before drm_sched_start.
6200 */
6201 amdgpu_device_stop_pending_resets(tmp_adev);
6202 }
6203
6204 return r;
6205 }
6206
6207 static int amdgpu_device_sched_resume(struct list_head *device_list,
6208 struct amdgpu_reset_context *reset_context,
6209 bool job_signaled)
6210 {
6211 struct amdgpu_device *tmp_adev = NULL;
6212 int i, r = 0;
6213
6214 /* Post ASIC reset for all devs .*/
6215 list_for_each_entry(tmp_adev, device_list, reset_list) {
6216
6217 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6218 struct amdgpu_ring *ring = tmp_adev->rings[i];
6219
6220 if (!amdgpu_ring_sched_ready(ring))
6221 continue;
6222
6223 drm_sched_start(&ring->sched, 0);
6224 }
6225
6226 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6227 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6228
6229 if (tmp_adev->asic_reset_res)
6230 r = tmp_adev->asic_reset_res;
6231
6232 tmp_adev->asic_reset_res = 0;
6233
6234 if (r) {
6235 /* Bad news: how do we tell userspace?
6236 * For a RAS error, we should report the GPU bad status
6237 * instead of a reset failure.
6238 */
6239 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6240 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6241 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6242 atomic_read(&tmp_adev->gpu_reset_counter));
6243 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6244 } else {
6245 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6246 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6247 DRM_WARN("smart shift update failed\n");
6248 }
6249 }
6250
6251 return r;
6252 }
6253
6254 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
6255 struct list_head *device_list,
6256 bool need_emergency_restart)
6257 {
6258 struct amdgpu_device *tmp_adev = NULL;
6259
6260 list_for_each_entry(tmp_adev, device_list, reset_list) {
6261 /* unlock kfd: SRIOV would do it separately */
6262 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6263 amdgpu_amdkfd_post_reset(tmp_adev);
6264
6265 /* kfd_post_reset will do nothing if the kfd device is not initialized,
6266 * so bring up kfd here if it was not initialized before.
6267 */
6268 if (!adev->kfd.init_complete)
6269 amdgpu_amdkfd_device_init(adev);
6270
6271 if (tmp_adev->pcie_reset_ctx.audio_suspended)
6272 amdgpu_device_resume_display_audio(tmp_adev);
6273
6274 amdgpu_device_unset_mp1_state(tmp_adev);
6275
6276 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6277
6278 }
6279 }
6280
6281
6282 /**
6283 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
6284 *
6285 * @adev: amdgpu_device pointer
6286 * @job: the job that triggered the hang
6287 * @reset_context: amdgpu reset context pointer
6288 *
6289 * Attempt to reset the GPU if it has hung (all asics).
6290 * Attempt to do a soft reset or full reset and reinitialize the ASIC.
6291 * Returns 0 for success or an error on failure.
6292 */
6293
6294 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
6295 struct amdgpu_job *job,
6296 struct amdgpu_reset_context *reset_context)
6297 {
6298 struct list_head device_list;
6299 bool job_signaled = false;
6300 struct amdgpu_hive_info *hive = NULL;
6301 int r = 0;
6302 bool need_emergency_restart = false;
6303
6304 /*
6305 * If it reaches here because of hang/timeout and a RAS error is
6306 * detected at the same time, let RAS recovery take care of it.
6307 */
6308 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
6309 !amdgpu_sriov_vf(adev) &&
6310 reset_context->src != AMDGPU_RESET_SRC_RAS) {
6311 dev_dbg(adev->dev,
6312 "Gpu recovery from source: %d yielding to RAS error recovery handling",
6313 reset_context->src);
6314 return 0;
6315 }
6316
6317 /*
6318 * Special case: RAS triggered and full reset isn't supported
6319 */
6320 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
6321
6322 /*
6323 * Flush RAM to disk so that after reboot
6324 * the user can read log and see why the system rebooted.
6325 */
6326 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
6327 amdgpu_ras_get_context(adev)->reboot) {
6328 DRM_WARN("Emergency reboot.");
6329
6330 ksys_sync_helper();
6331 emergency_restart();
6332 }
6333
6334 dev_info(adev->dev, "GPU %s begin!\n",
6335 need_emergency_restart ? "jobs stop":"reset");
6336
6337 if (!amdgpu_sriov_vf(adev))
6338 hive = amdgpu_get_xgmi_hive(adev);
6339 if (hive)
6340 mutex_lock(&hive->hive_lock);
6341
6342 reset_context->job = job;
6343 reset_context->hive = hive;
6344 INIT_LIST_HEAD(&device_list);
6345
6346 if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
6347 goto end_reset;
6348
6349 /* We need to lock reset domain only once both for XGMI and single device */
6350 amdgpu_device_recovery_get_reset_lock(adev, &device_list);
6351
6352 r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
6353 hive, need_emergency_restart);
6354 if (r)
6355 goto reset_unlock;
6356
6357 if (need_emergency_restart)
6358 goto skip_sched_resume;
6359 /*
6360 * Must check guilty signal here since after this point all old
6361 * HW fences are force signaled.
6362 *
6363 * job->base holds a reference to parent fence
6364 */
6365 if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
6366 job_signaled = true;
6367 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
6368 goto skip_hw_reset;
6369 }
6370
6371 r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
6372 if (r)
6373 goto reset_unlock;
6374 skip_hw_reset:
6375 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
6376 if (r)
6377 goto reset_unlock;
6378 skip_sched_resume:
6379 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
6380 reset_unlock:
6381 amdgpu_device_recovery_put_reset_lock(adev, &device_list);
6382 end_reset:
6383 if (hive) {
6384 mutex_unlock(&hive->hive_lock);
6385 amdgpu_put_xgmi_hive(hive);
6386 }
6387
6388 if (r)
6389 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6390
6391 atomic_set(&adev->reset_domain->reset_res, r);
6392
6393 if (!r)
6394 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
6395
6396 return r;
6397 }
6398
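/*
 * A minimal sketch of how a hang handler might invoke the recovery entry
 * point above, assuming it already holds a pointer to the guilty job; the
 * reset_context setup mirrors what amdgpu_pci_slot_reset() below uses:
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 */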
6399 /**
6400 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
6401 *
6402 * @adev: amdgpu_device pointer
6403 * @speed: pointer to the speed of the link
6404 * @width: pointer to the width of the link
6405 *
6406 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6407 * first physical partner to an AMD dGPU.
6408 * This will exclude any virtual switches and links.
6409 */
6410 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6411 enum pci_bus_speed *speed,
6412 enum pcie_link_width *width)
6413 {
6414 struct pci_dev *parent = adev->pdev;
6415
6416 if (!speed || !width)
6417 return;
6418
6419 *speed = PCI_SPEED_UNKNOWN;
6420 *width = PCIE_LNK_WIDTH_UNKNOWN;
6421
6422 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6423 while ((parent = pci_upstream_bridge(parent))) {
6424 /* skip upstream/downstream switches internal to dGPU*/
6425 if (parent->vendor == PCI_VENDOR_ID_ATI)
6426 continue;
6427 *speed = pcie_get_speed_cap(parent);
6428 *width = pcie_get_width_cap(parent);
6429 break;
6430 }
6431 } else {
6432 /* use the current speeds rather than max if switching is not supported */
6433 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6434 }
6435 }
6436
6437 /**
6438 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
6439 *
6440 * @adev: amdgpu_device pointer
6441 * @speed: pointer to the speed of the link
6442 * @width: pointer to the width of the link
6443 *
6444 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6445 * AMD dGPU which may be a virtual upstream bridge.
6446 */
6447 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6448 enum pci_bus_speed *speed,
6449 enum pcie_link_width *width)
6450 {
6451 struct pci_dev *parent = adev->pdev;
6452
6453 if (!speed || !width)
6454 return;
6455
6456 parent = pci_upstream_bridge(parent);
6457 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6458 /* use the upstream/downstream switches internal to dGPU */
6459 *speed = pcie_get_speed_cap(parent);
6460 *width = pcie_get_width_cap(parent);
6461 while ((parent = pci_upstream_bridge(parent))) {
6462 if (parent->vendor == PCI_VENDOR_ID_ATI) {
6463 /* use the upstream/downstream switches internal to dGPU */
6464 *speed = pcie_get_speed_cap(parent);
6465 *width = pcie_get_width_cap(parent);
6466 }
6467 }
6468 } else {
6469 /* use the device itself */
6470 *speed = pcie_get_speed_cap(adev->pdev);
6471 *width = pcie_get_width_cap(adev->pdev);
6472 }
6473 }
6474
6475 /**
6476 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
6477 *
6478 * @adev: amdgpu_device pointer
6479 *
6480 * Fetches and stores in the driver the PCIE capabilities (gen speed
6481 * and lanes) of the slot the device is in. Handles APUs and
6482 * virtualized environments where PCIE config space may not be available.
6483 */
6484 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6485 {
6486 enum pci_bus_speed speed_cap, platform_speed_cap;
6487 enum pcie_link_width platform_link_width, link_width;
6488
6489 if (amdgpu_pcie_gen_cap)
6490 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6491
6492 if (amdgpu_pcie_lane_cap)
6493 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6494
6495 /* covers APUs as well */
6496 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6497 if (adev->pm.pcie_gen_mask == 0)
6498 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6499 if (adev->pm.pcie_mlw_mask == 0)
6500 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6501 return;
6502 }
6503
6504 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6505 return;
6506
6507 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6508 &platform_link_width);
6509 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);
6510
6511 if (adev->pm.pcie_gen_mask == 0) {
6512 /* asic caps */
6513 if (speed_cap == PCI_SPEED_UNKNOWN) {
6514 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6515 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6516 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6517 } else {
6518 if (speed_cap == PCIE_SPEED_32_0GT)
6519 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6520 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6521 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6522 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6523 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6524 else if (speed_cap == PCIE_SPEED_16_0GT)
6525 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6526 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6527 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6528 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6529 else if (speed_cap == PCIE_SPEED_8_0GT)
6530 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6531 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6532 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6533 else if (speed_cap == PCIE_SPEED_5_0GT)
6534 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6535 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6536 else
6537 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6538 }
6539 /* platform caps */
6540 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6541 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6542 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6543 } else {
6544 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6545 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6546 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6547 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6548 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6549 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6550 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6551 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6552 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6553 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6554 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6555 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6556 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6557 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6558 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6559 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6560 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6561 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6562 else
6563 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6564
6565 }
6566 }
6567 if (adev->pm.pcie_mlw_mask == 0) {
6568 /* asic caps */
6569 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6570 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK;
6571 } else {
6572 switch (link_width) {
6573 case PCIE_LNK_X32:
6574 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 |
6575 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6576 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6577 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6578 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6579 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6580 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6581 break;
6582 case PCIE_LNK_X16:
6583 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 |
6584 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6585 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6586 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6587 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6588 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6589 break;
6590 case PCIE_LNK_X12:
6591 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 |
6592 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6593 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6594 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6595 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6596 break;
6597 case PCIE_LNK_X8:
6598 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 |
6599 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6600 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6601 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6602 break;
6603 case PCIE_LNK_X4:
6604 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 |
6605 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6606 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6607 break;
6608 case PCIE_LNK_X2:
6609 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 |
6610 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1);
6611 break;
6612 case PCIE_LNK_X1:
6613 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1;
6614 break;
6615 default:
6616 break;
6617 }
6618 }
6619 /* platform caps */
6620 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6621 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6622 } else {
6623 switch (platform_link_width) {
6624 case PCIE_LNK_X32:
6625 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6626 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6627 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6628 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6629 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6630 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6631 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6632 break;
6633 case PCIE_LNK_X16:
6634 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6635 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6636 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6637 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6638 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6639 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6640 break;
6641 case PCIE_LNK_X12:
6642 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6643 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6644 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6645 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6646 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6647 break;
6648 case PCIE_LNK_X8:
6649 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6650 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6651 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6652 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6653 break;
6654 case PCIE_LNK_X4:
6655 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6656 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6657 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6658 break;
6659 case PCIE_LNK_X2:
6660 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6661 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6662 break;
6663 case PCIE_LNK_X1:
6664 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6665 break;
6666 default:
6667 break;
6668 }
6669 }
6670 }
6671 }
6672
6673 /**
6674 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6675 *
6676 * @adev: amdgpu_device pointer
6677 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6678 *
6679 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6680 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6681 * @peer_adev.
6682 */
6683 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6684 struct amdgpu_device *peer_adev)
6685 {
6686 #ifdef CONFIG_HSA_AMD_P2P
6687 bool p2p_access =
6688 !adev->gmc.xgmi.connected_to_cpu &&
6689 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6690 if (!p2p_access)
6691 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
6692 pci_name(peer_adev->pdev));
6693
6694 bool is_large_bar = adev->gmc.visible_vram_size &&
6695 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6696 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6697
6698 if (!p2p_addressable) {
6699 uint64_t address_mask = peer_adev->dev->dma_mask ?
6700 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6701 resource_size_t aper_limit =
6702 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6703
6704 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6705 aper_limit & address_mask);
6706 }
6707 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6708 #else
6709 return false;
6710 #endif
6711 }
6712
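/*
 * A minimal sketch of how the check above might gate P2P DMA setup between
 * two devices; accessibility is directional, so a symmetric mapping would
 * check both directions:
 *
 *	if (amdgpu_device_is_peer_accessible(adev, peer_adev) &&
 *	    amdgpu_device_is_peer_accessible(peer_adev, adev)) {
 *		... map peer VRAM through the PCIe BAR ...
 *	} else {
 *		... fall back to staging through system memory ...
 *	}
 */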
6713 int amdgpu_device_baco_enter(struct drm_device *dev)
6714 {
6715 struct amdgpu_device *adev = drm_to_adev(dev);
6716 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6717
6718 if (!amdgpu_device_supports_baco(dev))
6719 return -ENOTSUPP;
6720
6721 if (ras && adev->ras_enabled &&
6722 adev->nbio.funcs->enable_doorbell_interrupt)
6723 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6724
6725 return amdgpu_dpm_baco_enter(adev);
6726 }
6727
6728 int amdgpu_device_baco_exit(struct drm_device *dev)
6729 {
6730 struct amdgpu_device *adev = drm_to_adev(dev);
6731 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6732 int ret = 0;
6733
6734 if (!amdgpu_device_supports_baco(dev))
6735 return -ENOTSUPP;
6736
6737 ret = amdgpu_dpm_baco_exit(adev);
6738 if (ret)
6739 return ret;
6740
6741 if (ras && adev->ras_enabled &&
6742 adev->nbio.funcs->enable_doorbell_interrupt)
6743 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6744
6745 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6746 adev->nbio.funcs->clear_doorbell_interrupt)
6747 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6748
6749 return 0;
6750 }
6751
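/*
 * BACO entry and exit are expected to be used as a pair, for example around
 * a runtime-suspend cycle; a minimal sketch (the driver's runtime-PM hooks
 * handle more cases than this):
 *
 *	r = amdgpu_device_baco_enter(dev);
 *	if (r)
 *		return r;
 *	... device sits in BACO while idle ...
 *	r = amdgpu_device_baco_exit(dev);
 */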
6752 /**
6753 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6754 * @pdev: PCI device struct
6755 * @state: PCI channel state
6756 *
6757 * Description: Called when a PCI error is detected.
6758 *
6759 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6760 */
6761 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6762 {
6763 struct drm_device *dev = pci_get_drvdata(pdev);
6764 struct amdgpu_device *adev = drm_to_adev(dev);
6765 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
6766 struct amdgpu_reset_context reset_context;
6767 struct list_head device_list;
6768 int r = 0;
6769
6770 dev_info(adev->dev, "PCI error: detected callback!!\n");
6771
6772 if (!amdgpu_dpm_is_link_reset_supported(adev)) {
6773 dev_warn(adev->dev, "No support for XGMI hive yet...\n");
6774 return PCI_ERS_RESULT_DISCONNECT;
6775 }
6776
6777 adev->pci_channel_state = state;
6778
6779 switch (state) {
6780 case pci_channel_io_normal:
6781 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state);
6782 return PCI_ERS_RESULT_CAN_RECOVER;
6783 case pci_channel_io_frozen:
6784 /* Fatal error, prepare for slot reset */
6785 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);
6786
6787 if (hive)
6788 mutex_lock(&hive->hive_lock);
6789 adev->pcie_reset_ctx.occurs_dpc = true;
6790 memset(&reset_context, 0, sizeof(reset_context));
6791 INIT_LIST_HEAD(&device_list);
6792
6793 amdgpu_device_recovery_prepare(adev, &device_list, hive);
6794 amdgpu_device_recovery_get_reset_lock(adev, &device_list);
6795 r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
6796 hive, false);
6797 if (hive) {
6798 mutex_unlock(&hive->hive_lock);
6799 amdgpu_put_xgmi_hive(hive);
6800 }
6801 if (r)
6802 return PCI_ERS_RESULT_DISCONNECT;
6803 return PCI_ERS_RESULT_NEED_RESET;
6804 case pci_channel_io_perm_failure:
6805 /* Permanent error, prepare for device removal */
6806 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state);
6807 return PCI_ERS_RESULT_DISCONNECT;
6808 }
6809
6810 return PCI_ERS_RESULT_NEED_RESET;
6811 }
6812
6813 /**
6814 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6815 * @pdev: pointer to PCI device
6816 */
6817 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6818 {
6819 struct drm_device *dev = pci_get_drvdata(pdev);
6820 struct amdgpu_device *adev = drm_to_adev(dev);
6821
6822 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n");
6823
6824 /* TODO - dump whatever for debugging purposes */
6825
6826 /* This is called only if amdgpu_pci_error_detected returns
6827 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6828 * works, so there is no need to reset the slot.
6829 */
6830
6831 return PCI_ERS_RESULT_RECOVERED;
6832 }
6833
6834 /**
6835 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6836 * @pdev: PCI device struct
6837 *
6838 * Description: This routine is called by the pci error recovery
6839 * code after the PCI slot has been reset, just before we
6840 * should resume normal operations.
6841 */
6842 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6843 {
6844 struct drm_device *dev = pci_get_drvdata(pdev);
6845 struct amdgpu_device *adev = drm_to_adev(dev);
6846 struct amdgpu_reset_context reset_context;
6847 struct amdgpu_device *tmp_adev;
6848 struct amdgpu_hive_info *hive;
6849 struct list_head device_list;
6850 int r = 0, i;
6851 u32 memsize;
6852
6853 /* PCI error slot reset should be skipped during RAS recovery */
6854 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6855 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6856 amdgpu_ras_in_recovery(adev))
6857 return PCI_ERS_RESULT_RECOVERED;
6858
6859 dev_info(adev->dev, "PCI error: slot reset callback!!\n");
6860
6861 memset(&reset_context, 0, sizeof(reset_context));
6862
6863 /* wait for asic to come out of reset */
6864 msleep(700);
6865
6866 /* Restore PCI confspace */
6867 amdgpu_device_load_pci_state(pdev);
6868
6869 /* confirm ASIC came out of reset */
6870 for (i = 0; i < adev->usec_timeout; i++) {
6871 memsize = amdgpu_asic_get_config_memsize(adev);
6872
6873 if (memsize != 0xffffffff)
6874 break;
6875 udelay(1);
6876 }
6877 if (memsize == 0xffffffff) {
6878 r = -ETIME;
6879 goto out;
6880 }
6881
6882 reset_context.method = AMD_RESET_METHOD_NONE;
6883 reset_context.reset_req_dev = adev;
6884 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6885 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
6886 INIT_LIST_HEAD(&device_list);
6887
6888 hive = amdgpu_get_xgmi_hive(adev);
6889 if (hive) {
6890 mutex_lock(&hive->hive_lock);
6891 reset_context.hive = hive;
6892 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
6893 tmp_adev->pcie_reset_ctx.in_link_reset = true;
6894 list_add_tail(&tmp_adev->reset_list, &device_list);
6895 }
6896 } else {
6897 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6898 list_add_tail(&adev->reset_list, &device_list);
6899 }
6900
6901 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);
6902 out:
6903 if (!r) {
6904 if (amdgpu_device_cache_pci_state(adev->pdev))
6905 pci_restore_state(adev->pdev);
6906 dev_info(adev->dev, "PCIe error recovery succeeded\n");
6907 } else {
6908 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r);
6909 if (hive) {
6910 list_for_each_entry(tmp_adev, &device_list, reset_list)
6911 amdgpu_device_unset_mp1_state(tmp_adev);
6912 }
6913 amdgpu_device_recovery_put_reset_lock(adev, &device_list);
6914 }
6915
6916 if (hive) {
6917 mutex_unlock(&hive->hive_lock);
6918 amdgpu_put_xgmi_hive(hive);
6919 }
6920
6921 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6922 }
6923
6924 /**
6925 * amdgpu_pci_resume() - resume normal ops after PCI reset
6926 * @pdev: pointer to PCI device
6927 *
6928 * Called when the error recovery driver tells us that it's
6929 * OK to resume normal operation.
6930 */
6931 void amdgpu_pci_resume(struct pci_dev *pdev)
6932 {
6933 struct drm_device *dev = pci_get_drvdata(pdev);
6934 struct amdgpu_device *adev = drm_to_adev(dev);
6935 struct list_head device_list;
6936 struct amdgpu_hive_info *hive = NULL;
6937 struct amdgpu_device *tmp_adev = NULL;
6938
6939 dev_info(adev->dev, "PCI error: resume callback!!\n");
6940
6941 /* Only continue execution for the case of pci_channel_io_frozen */
6942 if (adev->pci_channel_state != pci_channel_io_frozen)
6943 return;
6944
6945 INIT_LIST_HEAD(&device_list);
6946
6947 hive = amdgpu_get_xgmi_hive(adev);
6948 if (hive) {
6949 mutex_lock(&hive->hive_lock);
6950 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
6951 tmp_adev->pcie_reset_ctx.in_link_reset = false;
6952 list_add_tail(&tmp_adev->reset_list, &device_list);
6953 }
6954 } else
6955 list_add_tail(&adev->reset_list, &device_list);
6956
6957 amdgpu_device_sched_resume(&device_list, NULL, false);
6958 amdgpu_device_gpu_resume(adev, &device_list, false);
6959 amdgpu_device_recovery_put_reset_lock(adev, &device_list);
6960 adev->pcie_reset_ctx.occurs_dpc = false;
6961
6962 if (hive) {
6963 mutex_unlock(&hive->hive_lock);
6964 amdgpu_put_xgmi_hive(hive);
6965 }
6966 }
6967
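/*
 * The PCI error callbacks above are expected to be wired into the driver's
 * pci_driver registration through the standard struct pci_error_handlers;
 * a minimal sketch (the driver-side struct names here are illustrative):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 *	static struct pci_driver amdgpu_kms_pci_driver = {
 *		...
 *		.err_handler	= &amdgpu_pci_err_handler,
 *	};
 */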
6968 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6969 {
6970 struct drm_device *dev = pci_get_drvdata(pdev);
6971 struct amdgpu_device *adev = drm_to_adev(dev);
6972 int r;
6973
6974 if (amdgpu_sriov_vf(adev))
6975 return false;
6976
6977 r = pci_save_state(pdev);
6978 if (!r) {
6979 kfree(adev->pci_state);
6980
6981 adev->pci_state = pci_store_saved_state(pdev);
6982
6983 if (!adev->pci_state) {
6984 DRM_ERROR("Failed to store PCI saved state");
6985 return false;
6986 }
6987 } else {
6988 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6989 return false;
6990 }
6991
6992 return true;
6993 }
6994
6995 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6996 {
6997 struct drm_device *dev = pci_get_drvdata(pdev);
6998 struct amdgpu_device *adev = drm_to_adev(dev);
6999 int r;
7000
7001 if (!adev->pci_state)
7002 return false;
7003
7004 r = pci_load_saved_state(pdev, adev->pci_state);
7005
7006 if (!r) {
7007 pci_restore_state(pdev);
7008 } else {
7009 DRM_WARN("Failed to load PCI state, err:%d\n", r);
7010 return false;
7011 }
7012
7013 return true;
7014 }
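/*
 * Illustrative sketch (not compiled): the two helpers above are meant to be
 * used as a pair - cache the config space while the device is known good,
 * then load and restore it after an ASIC reset, as the error recovery path
 * above does.  A hypothetical caller:
 */
#if 0
static void example_reset_with_pci_state(struct amdgpu_device *adev)
{
	/* snapshot the config space while the device is healthy */
	if (!amdgpu_device_cache_pci_state(adev->pdev))
		dev_warn(adev->dev, "no cached PCI state available\n");

	/* ... perform the ASIC reset here ... */

	/* write the cached config space back to the device */
	if (!amdgpu_device_load_pci_state(adev->pdev))
		dev_warn(adev->dev, "failed to restore cached PCI state\n");
}
#endif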
7015
7016 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
7017 struct amdgpu_ring *ring)
7018 {
7019 #ifdef CONFIG_X86_64
7020 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
7021 return;
7022 #endif
7023 if (adev->gmc.xgmi.connected_to_cpu)
7024 return;
7025
7026 if (ring && ring->funcs->emit_hdp_flush)
7027 amdgpu_ring_emit_hdp_flush(ring);
7028 else
7029 amdgpu_asic_flush_hdp(adev, ring);
7030 }
7031
7032 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
7033 struct amdgpu_ring *ring)
7034 {
7035 #ifdef CONFIG_X86_64
7036 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
7037 return;
7038 #endif
7039 if (adev->gmc.xgmi.connected_to_cpu)
7040 return;
7041
7042 amdgpu_asic_invalidate_hdp(adev, ring);
7043 }
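/*
 * Illustrative sketch (not compiled): roughly speaking, flush_hdp() is used
 * after the CPU has written data the GPU is about to read (push the HDP
 * write path), while invalidate_hdp() is used before the CPU reads data the
 * GPU has just written (drop stale HDP read data).  A hypothetical caller:
 */
#if 0
static void example_hdp_usage(struct amdgpu_device *adev,
			      struct amdgpu_ring *ring)
{
	/* ... CPU fills a buffer the GPU will consume ... */
	amdgpu_device_flush_hdp(adev, ring);

	/* ... GPU produces results, fence signals ... */
	amdgpu_device_invalidate_hdp(adev, ring);
	/* ... CPU can now safely read the results ... */
}
#endif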
7044
7045 int amdgpu_in_reset(struct amdgpu_device *adev)
7046 {
7047 return atomic_read(&adev->reset_domain->in_gpu_reset);
7048 }
7049
7050 /**
7051 * amdgpu_device_halt() - bring hardware to some kind of halt state
7052 *
7053 * @adev: amdgpu_device pointer
7054 *
7055 * Bring the hardware to some kind of halt state so that no one can touch it
7056 * any more. This helps to maintain the error context when an error occurs.
7057 * Compared to a simple hang, the system stays stable at least for SSH
7058 * access. It should then be trivial to inspect the hardware state and
7059 * see what's going on. Implemented as follows:
7060 *
7061 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
7062 * clears all CPU mappings to the device and disallows remappings through page faults
7063 * 2. amdgpu_irq_disable_all() disables all interrupts
7064 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
7065 * 4. set adev->no_hw_access to avoid potential crashes after step 5
7066 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
7067 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
7068 * flush any in-flight DMA operations
7069 */
7070 void amdgpu_device_halt(struct amdgpu_device *adev)
7071 {
7072 struct pci_dev *pdev = adev->pdev;
7073 struct drm_device *ddev = adev_to_drm(adev);
7074
7075 amdgpu_xcp_dev_unplug(adev);
7076 drm_dev_unplug(ddev);
7077
7078 amdgpu_irq_disable_all(adev);
7079
7080 amdgpu_fence_driver_hw_fini(adev);
7081
7082 adev->no_hw_access = true;
7083
7084 amdgpu_device_unmap_mmio(adev);
7085
7086 pci_disable_device(pdev);
7087 pci_wait_for_pending_transaction(pdev);
7088 }
7089
7090 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
7091 u32 reg)
7092 {
7093 unsigned long flags, address, data;
7094 u32 r;
7095
7096 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7097 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7098
7099 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7100 WREG32(address, reg * 4);
7101 (void)RREG32(address);
7102 r = RREG32(data);
7103 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7104 return r;
7105 }
7106
7107 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
7108 u32 reg, u32 v)
7109 {
7110 unsigned long flags, address, data;
7111
7112 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7113 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7114
7115 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7116 WREG32(address, reg * 4);
7117 (void)RREG32(address);
7118 WREG32(data, v);
7119 (void)RREG32(data);
7120 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7121 }
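/*
 * Illustrative sketch (not compiled): the indirect PCIe port accessors above
 * are commonly combined into a read-modify-write sequence like the
 * hypothetical helper below; reg, mask and value are caller supplied.
 */
#if 0
static void example_pcie_port_rmw(struct amdgpu_device *adev,
				  u32 reg, u32 mask, u32 value)
{
	u32 tmp = amdgpu_device_pcie_port_rreg(adev, reg);

	tmp = (tmp & ~mask) | (value & mask);
	amdgpu_device_pcie_port_wreg(adev, reg, tmp);
}
#endif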
7122
7123 /**
7124 * amdgpu_device_get_gang - return a reference to the current gang
7125 * @adev: amdgpu_device pointer
7126 *
7127 * Returns: A new reference to the current gang leader.
7128 */
7129 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
7130 {
7131 struct dma_fence *fence;
7132
7133 rcu_read_lock();
7134 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
7135 rcu_read_unlock();
7136 return fence;
7137 }
7138
7139 /**
7140 * amdgpu_device_switch_gang - switch to a new gang
7141 * @adev: amdgpu_device pointer
7142 * @gang: the gang to switch to
7143 *
7144 * Try to switch to a new gang.
7145 * Returns: NULL if we switched to the new gang or a reference to the current
7146 * gang leader.
7147 */
7148 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
7149 struct dma_fence *gang)
7150 {
7151 struct dma_fence *old = NULL;
7152
7153 dma_fence_get(gang);
7154 do {
7155 dma_fence_put(old);
7156 old = amdgpu_device_get_gang(adev);
7157 if (old == gang)
7158 break;
7159
7160 if (!dma_fence_is_signaled(old)) {
7161 dma_fence_put(gang);
7162 return old;
7163 }
7164
7165 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
7166 old, gang) != old);
7167
7168 /*
7169 * Drop it once for the exchanged reference in adev and once for the
7170 * thread local reference acquired in amdgpu_device_get_gang().
7171 */
7172 dma_fence_put(old);
7173 dma_fence_put(old);
7174 return NULL;
7175 }
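/*
 * Illustrative sketch (not compiled): a caller treats a non-NULL return as
 * one more dependency for its submission - wait for the previous gang
 * leader (or hand the fence to the scheduler) and retry later; NULL means
 * the switch already happened.  Hypothetical helper:
 */
#if 0
static struct dma_fence *example_gang_dependency(struct amdgpu_device *adev,
						 struct amdgpu_job *job)
{
	if (!job->gang_submit)
		return NULL;

	/* non-NULL: the previous leader must finish before we may run */
	return amdgpu_device_switch_gang(adev, job->gang_submit);
}
#endif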
7176
7177 /**
7178 * amdgpu_device_enforce_isolation - enforce HW isolation
7179 * @adev: the amdgpu device pointer
7180 * @ring: the HW ring the job is supposed to run on
7181 * @job: the job which is about to be pushed to the HW ring
7182 *
7183 * Makes sure that only one client at a time can use the GFX block.
7184 * Returns: The dependency to wait on before the job can be pushed to the HW.
7185 * The function is called multiple times until NULL is returned.
7186 */
7187 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
7188 struct amdgpu_ring *ring,
7189 struct amdgpu_job *job)
7190 {
7191 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
7192 struct drm_sched_fence *f = job->base.s_fence;
7193 struct dma_fence *dep;
7194 void *owner;
7195 int r;
7196
7197 /*
7198 * For now enforce isolation only for the GFX and compute blocks since
7199 * we only need the cleaner shader on those rings.
7200 */
7201 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
7202 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
7203 return NULL;
7204
7205 /*
7206 * All submissions where enforce isolation is false are handled as if
7207 * they come from a single client. Use ~0l as the owner to distinguish it
7208 * from kernel submissions where the owner is NULL.
7209 */
7210 owner = job->enforce_isolation ? f->owner : (void *)~0l;
7211
7212 mutex_lock(&adev->enforce_isolation_mutex);
7213
7214 /*
7215 * The "spearhead" submission is the first one which changes the
7216 * ownership to its client. We always need to wait for it to be
7217 * pushed to the HW before proceeding with anything.
7218 */
7219 if (&f->scheduled != isolation->spearhead &&
7220 !dma_fence_is_signaled(isolation->spearhead)) {
7221 dep = isolation->spearhead;
7222 goto out_grab_ref;
7223 }
7224
7225 if (isolation->owner != owner) {
7226
7227 /*
7228 * Wait for any gang to be assembled before switching to a
7229 * different owner or otherwise we could deadlock the
7230 * submissions.
7231 */
7232 if (!job->gang_submit) {
7233 dep = amdgpu_device_get_gang(adev);
7234 if (!dma_fence_is_signaled(dep))
7235 goto out_return_dep;
7236 dma_fence_put(dep);
7237 }
7238
7239 dma_fence_put(isolation->spearhead);
7240 isolation->spearhead = dma_fence_get(&f->scheduled);
7241 amdgpu_sync_move(&isolation->active, &isolation->prev);
7242 trace_amdgpu_isolation(isolation->owner, owner);
7243 isolation->owner = owner;
7244 }
7245
7246 /*
7247 * Specifying the ring here helps to pipeline submissions even when
7248 * isolation is enabled. If that is not desired for testing, NULL can be
7249 * used instead of the ring to enforce a CPU round trip while switching
7250 * between clients.
7251 */
7252 dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
7253 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
7254 if (r)
7255 DRM_WARN("OOM tracking isolation\n");
7256
7257 out_grab_ref:
7258 dma_fence_get(dep);
7259 out_return_dep:
7260 mutex_unlock(&adev->enforce_isolation_mutex);
7261 return dep;
7262 }
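/*
 * Illustrative sketch (not compiled): as the kerneldoc above notes, the
 * function is called repeatedly until it returns NULL, with each returned
 * fence treated as one more dependency before the job may run.  A
 * hypothetical caller loop:
 */
#if 0
static void example_collect_isolation_deps(struct amdgpu_device *adev,
					   struct amdgpu_ring *ring,
					   struct amdgpu_job *job)
{
	struct dma_fence *dep;

	while ((dep = amdgpu_device_enforce_isolation(adev, ring, job))) {
		/* or hand the fence to the scheduler as a dependency */
		dma_fence_wait(dep, false);
		dma_fence_put(dep);
	}
}
#endif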
7263
7264 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
7265 {
7266 switch (adev->asic_type) {
7267 #ifdef CONFIG_DRM_AMDGPU_SI
7268 case CHIP_HAINAN:
7269 #endif
7270 case CHIP_TOPAZ:
7271 /* chips with no display hardware */
7272 return false;
7273 #ifdef CONFIG_DRM_AMDGPU_SI
7274 case CHIP_TAHITI:
7275 case CHIP_PITCAIRN:
7276 case CHIP_VERDE:
7277 case CHIP_OLAND:
7278 #endif
7279 #ifdef CONFIG_DRM_AMDGPU_CIK
7280 case CHIP_BONAIRE:
7281 case CHIP_HAWAII:
7282 case CHIP_KAVERI:
7283 case CHIP_KABINI:
7284 case CHIP_MULLINS:
7285 #endif
7286 case CHIP_TONGA:
7287 case CHIP_FIJI:
7288 case CHIP_POLARIS10:
7289 case CHIP_POLARIS11:
7290 case CHIP_POLARIS12:
7291 case CHIP_VEGAM:
7292 case CHIP_CARRIZO:
7293 case CHIP_STONEY:
7294 /* chips with display hardware */
7295 return true;
7296 default:
7297 /* IP discovery */
7298 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
7299 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
7300 return false;
7301 return true;
7302 }
7303 }
7304
7305 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
7306 uint32_t inst, uint32_t reg_addr, char reg_name[],
7307 uint32_t expected_value, uint32_t mask)
7308 {
7309 uint32_t ret = 0;
7310 uint32_t old_ = 0;
7311 uint32_t tmp_ = RREG32(reg_addr);
7312 uint32_t loop = adev->usec_timeout;
7313
7314 while ((tmp_ & (mask)) != (expected_value)) {
7315 if (old_ != tmp_) {
7316 loop = adev->usec_timeout;
7317 old_ = tmp_;
7318 } else
7319 udelay(1);
7320 tmp_ = RREG32(reg_addr);
7321 loop--;
7322 if (!loop) {
7323 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
7324 inst, reg_name, (uint32_t)expected_value,
7325 (uint32_t)(tmp_ & (mask)));
7326 ret = -ETIMEDOUT;
7327 break;
7328 }
7329 }
7330 return ret;
7331 }
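/*
 * Illustrative sketch (not compiled): a typical use of the helper above is
 * polling a status register until a ready bit is set.  The register offset,
 * name and mask below are made up for the example.
 */
#if 0
static int example_wait_for_ready(struct amdgpu_device *adev)
{
	/* wait until bit 0 of the (hypothetical) status register reads 1 */
	return amdgpu_device_wait_on_rreg(adev, 0, 0x1234 /* fake offset */,
					  "EXAMPLE_STATUS", 0x1, 0x1);
}
#endif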
7332
7333 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
7334 {
7335 ssize_t size = 0;
7336
7337 if (!ring || !ring->adev)
7338 return size;
7339
7340 if (amdgpu_device_should_recover_gpu(ring->adev))
7341 size |= AMDGPU_RESET_TYPE_FULL;
7342
7343 if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
7344 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
7345 size |= AMDGPU_RESET_TYPE_SOFT_RESET;
7346
7347 return size;
7348 }
7349
7350 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
7351 {
7352 ssize_t size = 0;
7353
7354 if (supported_reset == 0) {
7355 size += sysfs_emit_at(buf, size, "unsupported");
7356 size += sysfs_emit_at(buf, size, "\n");
7357 return size;
7358
7359 }
7360
7361 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
7362 size += sysfs_emit_at(buf, size, "soft ");
7363
7364 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
7365 size += sysfs_emit_at(buf, size, "queue ");
7366
7367 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
7368 size += sysfs_emit_at(buf, size, "pipe ");
7369
7370 if (supported_reset & AMDGPU_RESET_TYPE_FULL)
7371 size += sysfs_emit_at(buf, size, "full ");
7372
7373 size += sysfs_emit_at(buf, size, "\n");
7374 return size;
7375 }
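/*
 * Illustrative sketch (not compiled): amdgpu_show_reset_mask() is meant to
 * back a sysfs "show" callback, with the supported_reset bitmask typically
 * built up from helpers such as amdgpu_get_soft_full_reset_mask().  The
 * attribute below and the choice of ring are made up for the example.
 */
#if 0
static ssize_t example_reset_mask_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint32_t mask;

	/* start from whatever the first GFX ring supports */
	mask = (uint32_t)amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);

	return amdgpu_show_reset_mask(buf, mask);
}
#endif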
7376