1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28
29 #include <linux/aperture.h>
30 #include <linux/power_supply.h>
31 #include <linux/kthread.h>
32 #include <linux/module.h>
33 #include <linux/console.h>
34 #include <linux/slab.h>
35 #include <linux/iommu.h>
36 #include <linux/pci.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_atomic_helper.h>
41 #include <drm/drm_client_event.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_probe_helper.h>
44 #include <drm/amdgpu_drm.h>
45 #include <linux/device.h>
46 #include <linux/vgaarb.h>
47 #include <linux/vga_switcheroo.h>
48 #include <linux/efi.h>
49 #include "amdgpu.h"
50 #include "amdgpu_trace.h"
51 #include "amdgpu_i2c.h"
52 #include "atom.h"
53 #include "amdgpu_atombios.h"
54 #include "amdgpu_atomfirmware.h"
55 #include "amd_pcie.h"
56 #ifdef CONFIG_DRM_AMDGPU_SI
57 #include "si.h"
58 #endif
59 #ifdef CONFIG_DRM_AMDGPU_CIK
60 #include "cik.h"
61 #endif
62 #include "vi.h"
63 #include "soc15.h"
64 #include "nv.h"
65 #include "bif/bif_4_1_d.h"
66 #include <linux/firmware.h>
67 #include "amdgpu_vf_error.h"
68
69 #include "amdgpu_amdkfd.h"
70 #include "amdgpu_pm.h"
71
72 #include "amdgpu_xgmi.h"
73 #include "amdgpu_ras.h"
74 #include "amdgpu_pmu.h"
75 #include "amdgpu_fru_eeprom.h"
76 #include "amdgpu_reset.h"
77 #include "amdgpu_virt.h"
78 #include "amdgpu_dev_coredump.h"
79
80 #include <linux/suspend.h>
81 #include <drm/task_barrier.h>
82 #include <linux/pm_runtime.h>
83
84 #include <drm/drm_drv.h>
85
86 #if IS_ENABLED(CONFIG_X86)
87 #include <asm/intel-family.h>
88 #endif
89
90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
97
98 #define AMDGPU_RESUME_MS 2000
99 #define AMDGPU_MAX_RETRY_LIMIT 2
100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
104
105 static const struct drm_driver amdgpu_kms_driver;
106
107 const char *amdgpu_asic_name[] = {
108 "TAHITI",
109 "PITCAIRN",
110 "VERDE",
111 "OLAND",
112 "HAINAN",
113 "BONAIRE",
114 "KAVERI",
115 "KABINI",
116 "HAWAII",
117 "MULLINS",
118 "TOPAZ",
119 "TONGA",
120 "FIJI",
121 "CARRIZO",
122 "STONEY",
123 "POLARIS10",
124 "POLARIS11",
125 "POLARIS12",
126 "VEGAM",
127 "VEGA10",
128 "VEGA12",
129 "VEGA20",
130 "RAVEN",
131 "ARCTURUS",
132 "RENOIR",
133 "ALDEBARAN",
134 "NAVI10",
135 "CYAN_SKILLFISH",
136 "NAVI14",
137 "NAVI12",
138 "SIENNA_CICHLID",
139 "NAVY_FLOUNDER",
140 "VANGOGH",
141 "DIMGREY_CAVEFISH",
142 "BEIGE_GOBY",
143 "YELLOW_CARP",
144 "IP DISCOVERY",
145 "LAST",
146 };
147
148 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
149 /*
150 * Default init level where all blocks are expected to be initialized. This is
151 * the level of initialization expected by default and also after a full reset
152 * of the device.
153 */
154 struct amdgpu_init_level amdgpu_init_default = {
155 .level = AMDGPU_INIT_LEVEL_DEFAULT,
156 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
157 };
158
159 struct amdgpu_init_level amdgpu_init_recovery = {
160 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
161 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
162 };
163
164 /*
165 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
166 * is used for cases like reset on initialization where the entire hive needs to
167 * be reset before first use.
168 */
169 struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
170 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
171 .hwini_ip_block_mask =
172 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
173 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
174 BIT(AMD_IP_BLOCK_TYPE_PSP)
175 };
176
177 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
178 enum amd_ip_block_type block)
179 {
180 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
181 }
182
183 void amdgpu_set_init_level(struct amdgpu_device *adev,
184 enum amdgpu_init_lvl_id lvl)
185 {
186 switch (lvl) {
187 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
188 adev->init_lvl = &amdgpu_init_minimal_xgmi;
189 break;
190 case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
191 adev->init_lvl = &amdgpu_init_recovery;
192 break;
193 case AMDGPU_INIT_LEVEL_DEFAULT:
194 fallthrough;
195 default:
196 adev->init_lvl = &amdgpu_init_default;
197 break;
198 }
199 }
200
201 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
202
203 /**
204 * DOC: pcie_replay_count
205 *
206 * The amdgpu driver provides a sysfs API for reporting the total number
207 * of PCIe replays (NAKs).
208 * The file pcie_replay_count is used for this and returns the total
209 * number of replays as the sum of the NAKs generated and NAKs received.
210 */
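/*
 * Illustrative usage sketch (not part of the driver): the attribute is read
 * from user space through sysfs. The card index below is hypothetical and
 * depends on the system, e.g.:
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 */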
211
212 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
213 struct device_attribute *attr, char *buf)
214 {
215 struct drm_device *ddev = dev_get_drvdata(dev);
216 struct amdgpu_device *adev = drm_to_adev(ddev);
217 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
218
219 return sysfs_emit(buf, "%llu\n", cnt);
220 }
221
222 static DEVICE_ATTR(pcie_replay_count, 0444,
223 amdgpu_device_get_pcie_replay_count, NULL);
224
225 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
226 struct bin_attribute *attr, char *buf,
227 loff_t ppos, size_t count)
228 {
229 struct device *dev = kobj_to_dev(kobj);
230 struct drm_device *ddev = dev_get_drvdata(dev);
231 struct amdgpu_device *adev = drm_to_adev(ddev);
232 ssize_t bytes_read;
233
234 switch (ppos) {
235 case AMDGPU_SYS_REG_STATE_XGMI:
236 bytes_read = amdgpu_asic_get_reg_state(
237 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
238 break;
239 case AMDGPU_SYS_REG_STATE_WAFL:
240 bytes_read = amdgpu_asic_get_reg_state(
241 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
242 break;
243 case AMDGPU_SYS_REG_STATE_PCIE:
244 bytes_read = amdgpu_asic_get_reg_state(
245 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
246 break;
247 case AMDGPU_SYS_REG_STATE_USR:
248 bytes_read = amdgpu_asic_get_reg_state(
249 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
250 break;
251 case AMDGPU_SYS_REG_STATE_USR_1:
252 bytes_read = amdgpu_asic_get_reg_state(
253 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
254 break;
255 default:
256 return -EINVAL;
257 }
258
259 return bytes_read;
260 }
261
262 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
263 AMDGPU_SYS_REG_STATE_END);
264
265 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
266 {
267 int ret;
268
269 if (!amdgpu_asic_get_reg_state_supported(adev))
270 return 0;
271
272 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
273
274 return ret;
275 }
276
277 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
278 {
279 if (!amdgpu_asic_get_reg_state_supported(adev))
280 return;
281 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
282 }
283
284 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
285 {
286 int r;
287
288 if (ip_block->version->funcs->suspend) {
289 r = ip_block->version->funcs->suspend(ip_block);
290 if (r) {
291 dev_err(ip_block->adev->dev,
292 "suspend of IP block <%s> failed %d\n",
293 ip_block->version->funcs->name, r);
294 return r;
295 }
296 }
297
298 ip_block->status.hw = false;
299 return 0;
300 }
301
302 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
303 {
304 int r;
305
306 if (ip_block->version->funcs->resume) {
307 r = ip_block->version->funcs->resume(ip_block);
308 if (r) {
309 dev_err(ip_block->adev->dev,
310 "resume of IP block <%s> failed %d\n",
311 ip_block->version->funcs->name, r);
312 return r;
313 }
314 }
315
316 ip_block->status.hw = true;
317 return 0;
318 }
319
320 /**
321 * DOC: board_info
322 *
323 * The amdgpu driver provides a sysfs API for reporting board-related information.
324 * It provides the form factor information in the format
325 *
326 * type : form factor
327 *
328 * Possible form factor values
329 *
330 * - "cem" - PCIE CEM card
331 * - "oam" - Open Compute Accelerator Module
332 * - "unknown" - Not known
333 *
334 */
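/*
 * Illustrative example (hypothetical card index; the value depends on the
 * board actually installed):
 *
 *   $ cat /sys/class/drm/card0/device/board_info
 *   type : oam
 */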
335
336 static ssize_t amdgpu_device_get_board_info(struct device *dev,
337 struct device_attribute *attr,
338 char *buf)
339 {
340 struct drm_device *ddev = dev_get_drvdata(dev);
341 struct amdgpu_device *adev = drm_to_adev(ddev);
342 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
343 const char *pkg;
344
345 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
346 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
347
348 switch (pkg_type) {
349 case AMDGPU_PKG_TYPE_CEM:
350 pkg = "cem";
351 break;
352 case AMDGPU_PKG_TYPE_OAM:
353 pkg = "oam";
354 break;
355 default:
356 pkg = "unknown";
357 break;
358 }
359
360 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
361 }
362
363 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
364
365 static struct attribute *amdgpu_board_attrs[] = {
366 &dev_attr_board_info.attr,
367 NULL,
368 };
369
370 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
371 struct attribute *attr, int n)
372 {
373 struct device *dev = kobj_to_dev(kobj);
374 struct drm_device *ddev = dev_get_drvdata(dev);
375 struct amdgpu_device *adev = drm_to_adev(ddev);
376
377 if (adev->flags & AMD_IS_APU)
378 return 0;
379
380 return attr->mode;
381 }
382
383 static const struct attribute_group amdgpu_board_attrs_group = {
384 .attrs = amdgpu_board_attrs,
385 .is_visible = amdgpu_board_attrs_is_visible
386 };
387
388 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
389
390
391 /**
392 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
393 *
394 * @dev: drm_device pointer
395 *
396 * Returns true if the device is a dGPU with ATPX power control,
397 * otherwise return false.
398 */
399 bool amdgpu_device_supports_px(struct drm_device *dev)
400 {
401 struct amdgpu_device *adev = drm_to_adev(dev);
402
403 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
404 return true;
405 return false;
406 }
407
408 /**
409 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
410 *
411 * @dev: drm_device pointer
412 *
413 * Returns true if the device is a dGPU with ACPI power control,
414 * otherwise return false.
415 */
416 bool amdgpu_device_supports_boco(struct drm_device *dev)
417 {
418 struct amdgpu_device *adev = drm_to_adev(dev);
419
420 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
421 return false;
422
423 if (adev->has_pr3 ||
424 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
425 return true;
426 return false;
427 }
428
429 /**
430 * amdgpu_device_supports_baco - Does the device support BACO
431 *
432 * @dev: drm_device pointer
433 *
434 * Return:
435 * 1 if the device supports BACO;
436 * 3 if the device supports MACO (only works if BACO is supported);
437 * otherwise return 0.
438 */
439 int amdgpu_device_supports_baco(struct drm_device *dev)
440 {
441 struct amdgpu_device *adev = drm_to_adev(dev);
442
443 return amdgpu_asic_supports_baco(adev);
444 }
445
446 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
447 {
448 struct drm_device *dev;
449 int bamaco_support;
450
451 dev = adev_to_drm(adev);
452
453 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
454 bamaco_support = amdgpu_device_supports_baco(dev);
455
456 switch (amdgpu_runtime_pm) {
457 case 2:
458 if (bamaco_support & MACO_SUPPORT) {
459 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
460 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
461 } else if (bamaco_support == BACO_SUPPORT) {
462 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
463 dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
464 }
465 break;
466 case 1:
467 if (bamaco_support & BACO_SUPPORT) {
468 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
469 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
470 }
471 break;
472 case -1:
473 case -2:
474 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
475 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
476 dev_info(adev->dev, "Using ATPX for runtime pm\n");
477 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
478 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
479 dev_info(adev->dev, "Using BOCO for runtime pm\n");
480 } else {
481 if (!bamaco_support)
482 goto no_runtime_pm;
483
484 switch (adev->asic_type) {
485 case CHIP_VEGA20:
486 case CHIP_ARCTURUS:
487 /* BACO is not supported on vega20 and arcturus */
488 break;
489 case CHIP_VEGA10:
490 /* enable BACO as runpm mode if noretry=0 */
491 if (!adev->gmc.noretry)
492 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
493 break;
494 default:
495 /* enable BACO as runpm mode on CI+ */
496 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
497 break;
498 }
499
500 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
501 if (bamaco_support & MACO_SUPPORT) {
502 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
503 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
504 } else {
505 dev_info(adev->dev, "Using BACO for runtime pm\n");
506 }
507 }
508 }
509 break;
510 case 0:
511 dev_info(adev->dev, "runtime pm is manually disabled\n");
512 break;
513 default:
514 break;
515 }
516
517 no_runtime_pm:
518 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
519 dev_info(adev->dev, "Runtime PM not available\n");
520 }
521 /**
522 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
523 * smart shift support
524 *
525 * @dev: drm_device pointer
526 *
527 * Returns true if the device is a dGPU with Smart Shift support,
528 * otherwise returns false.
529 */
530 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
531 {
532 return (amdgpu_device_supports_boco(dev) &&
533 amdgpu_acpi_is_power_shift_control_supported());
534 }
535
536 /*
537 * VRAM access helper functions
538 */
539
540 /**
541 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
542 *
543 * @adev: amdgpu_device pointer
544 * @pos: offset of the buffer in vram
545 * @buf: virtual address of the buffer in system memory
546 * @size: read/write size, @buf must be at least @size bytes
547 * @write: true - write to vram, otherwise - read from vram
548 */
549 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
550 void *buf, size_t size, bool write)
551 {
552 unsigned long flags;
553 uint32_t hi = ~0, tmp = 0;
554 uint32_t *data = buf;
555 uint64_t last;
556 int idx;
557
558 if (!drm_dev_enter(adev_to_drm(adev), &idx))
559 return;
560
561 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
562
563 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
564 for (last = pos + size; pos < last; pos += 4) {
565 tmp = pos >> 31;
566
567 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
568 if (tmp != hi) {
569 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
570 hi = tmp;
571 }
572 if (write)
573 WREG32_NO_KIQ(mmMM_DATA, *data++);
574 else
575 *data++ = RREG32_NO_KIQ(mmMM_DATA);
576 }
577
578 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
579 drm_dev_exit(idx);
580 }
581
582 /**
583 * amdgpu_device_aper_access - access vram by vram aperture
584 *
585 * @adev: amdgpu_device pointer
586 * @pos: offset of the buffer in vram
587 * @buf: virtual address of the buffer in system memory
588 * @size: read/write size, @buf must be at least @size bytes
589 * @write: true - write to vram, otherwise - read from vram
590 *
591 * Returns the number of bytes that have been transferred.
592 */
593 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
594 void *buf, size_t size, bool write)
595 {
596 #ifdef CONFIG_64BIT
597 void __iomem *addr;
598 size_t count = 0;
599 uint64_t last;
600
601 if (!adev->mman.aper_base_kaddr)
602 return 0;
603
604 last = min(pos + size, adev->gmc.visible_vram_size);
605 if (last > pos) {
606 addr = adev->mman.aper_base_kaddr + pos;
607 count = last - pos;
608
609 if (write) {
610 memcpy_toio(addr, buf, count);
611 /* Make sure HDP write cache flush happens without any reordering
612 * after the system memory contents are sent over PCIe device
613 */
614 mb();
615 amdgpu_device_flush_hdp(adev, NULL);
616 } else {
617 amdgpu_device_invalidate_hdp(adev, NULL);
618 /* Make sure HDP read cache is invalidated before issuing a read
619 * to the PCIe device
620 */
621 mb();
622 memcpy_fromio(buf, addr, count);
623 }
624
625 }
626
627 return count;
628 #else
629 return 0;
630 #endif
631 }
632
633 /**
634 * amdgpu_device_vram_access - read/write a buffer in vram
635 *
636 * @adev: amdgpu_device pointer
637 * @pos: offset of the buffer in vram
638 * @buf: virtual address of the buffer in system memory
639 * @size: read/write size, @buf must be at least @size bytes
640 * @write: true - write to vram, otherwise - read from vram
641 */
642 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
643 void *buf, size_t size, bool write)
644 {
645 size_t count;
646
647 /* try using the vram aperture to access vram first */
648 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
649 size -= count;
650 if (size) {
651 /* use MM_INDEX/MM_DATA to access the rest of vram */
652 pos += count;
653 buf += count;
654 amdgpu_device_mm_access(adev, pos, buf, size, write);
655 }
656 }
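/*
 * Illustrative sketch (not called anywhere in the driver) of how a caller
 * might use the helper above to read a single dword from VRAM; the offset
 * and variable names are hypothetical:
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, vram_offset, &val, sizeof(val), false);
 */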
657
658 /*
659 * register access helper functions.
660 */
661
662 /* Check if hw access should be skipped because of hotplug or device error */
663 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
664 {
665 if (adev->no_hw_access)
666 return true;
667
668 #ifdef CONFIG_LOCKDEP
669 /*
670 * This is a bit complicated to understand, so worth a comment. What we assert
671 * here is that the GPU reset is not running on another thread in parallel.
672 *
673 * For this we trylock the read side of the reset semaphore; if that succeeds,
674 * we know that the reset is not running in parallel.
675 *
676 * If the trylock fails we assert that we are either already holding the read
677 * side of the lock or are the reset thread itself and hold the write side of
678 * the lock.
679 */
680 if (in_task()) {
681 if (down_read_trylock(&adev->reset_domain->sem))
682 up_read(&adev->reset_domain->sem);
683 else
684 lockdep_assert_held(&adev->reset_domain->sem);
685 }
686 #endif
687 return false;
688 }
689
690 /**
691 * amdgpu_device_rreg - read a memory mapped IO or indirect register
692 *
693 * @adev: amdgpu_device pointer
694 * @reg: dword aligned register offset
695 * @acc_flags: access flags which require special behavior
696 *
697 * Returns the 32 bit value from the offset specified.
698 */
699 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
700 uint32_t reg, uint32_t acc_flags)
701 {
702 uint32_t ret;
703
704 if (amdgpu_device_skip_hw_access(adev))
705 return 0;
706
707 if ((reg * 4) < adev->rmmio_size) {
708 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
709 amdgpu_sriov_runtime(adev) &&
710 down_read_trylock(&adev->reset_domain->sem)) {
711 ret = amdgpu_kiq_rreg(adev, reg, 0);
712 up_read(&adev->reset_domain->sem);
713 } else {
714 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
715 }
716 } else {
717 ret = adev->pcie_rreg(adev, reg * 4);
718 }
719
720 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
721
722 return ret;
723 }
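/*
 * Sketch of a typical caller (hypothetical register offset): most code goes
 * through the RREG32()/WREG32() family of macros, which end up in helpers
 * like the one above, e.g.:
 *
 *   uint32_t v = amdgpu_device_rreg(adev, reg_offset, AMDGPU_REGS_NO_KIQ);
 */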
724
725 /*
726 * MMIO register read with byte offset helper functions
727 * @offset: byte offset from MMIO start
728 */
729
730 /**
731 * amdgpu_mm_rreg8 - read a memory mapped IO register
732 *
733 * @adev: amdgpu_device pointer
734 * @offset: byte aligned register offset
735 *
736 * Returns the 8 bit value from the offset specified.
737 */
738 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
739 {
740 if (amdgpu_device_skip_hw_access(adev))
741 return 0;
742
743 if (offset < adev->rmmio_size)
744 return (readb(adev->rmmio + offset));
745 BUG();
746 }
747
748
749 /**
750 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
751 *
752 * @adev: amdgpu_device pointer
753 * @reg: dword aligned register offset
754 * @acc_flags: access flags which require special behavior
755 * @xcc_id: xcc accelerated compute core id
756 *
757 * Returns the 32 bit value from the offset specified.
758 */
759 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
760 uint32_t reg, uint32_t acc_flags,
761 uint32_t xcc_id)
762 {
763 uint32_t ret, rlcg_flag;
764
765 if (amdgpu_device_skip_hw_access(adev))
766 return 0;
767
768 if ((reg * 4) < adev->rmmio_size) {
769 if (amdgpu_sriov_vf(adev) &&
770 !amdgpu_sriov_runtime(adev) &&
771 adev->gfx.rlc.rlcg_reg_access_supported &&
772 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
773 GC_HWIP, false,
774 &rlcg_flag)) {
775 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
776 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
777 amdgpu_sriov_runtime(adev) &&
778 down_read_trylock(&adev->reset_domain->sem)) {
779 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
780 up_read(&adev->reset_domain->sem);
781 } else {
782 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
783 }
784 } else {
785 ret = adev->pcie_rreg(adev, reg * 4);
786 }
787
788 return ret;
789 }
790
791 /*
792 * MMIO register write with byte offset helper functions
793 * @offset: byte offset from MMIO start
794 * @value: the value to be written to the register
795 */
796
797 /**
798 * amdgpu_mm_wreg8 - write a memory mapped IO register
799 *
800 * @adev: amdgpu_device pointer
801 * @offset: byte aligned register offset
802 * @value: 8 bit value to write
803 *
804 * Writes the value specified to the offset specified.
805 */
806 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
807 {
808 if (amdgpu_device_skip_hw_access(adev))
809 return;
810
811 if (offset < adev->rmmio_size)
812 writeb(value, adev->rmmio + offset);
813 else
814 BUG();
815 }
816
817 /**
818 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
819 *
820 * @adev: amdgpu_device pointer
821 * @reg: dword aligned register offset
822 * @v: 32 bit value to write to the register
823 * @acc_flags: access flags which require special behavior
824 *
825 * Writes the value specified to the offset specified.
826 */
827 void amdgpu_device_wreg(struct amdgpu_device *adev,
828 uint32_t reg, uint32_t v,
829 uint32_t acc_flags)
830 {
831 if (amdgpu_device_skip_hw_access(adev))
832 return;
833
834 if ((reg * 4) < adev->rmmio_size) {
835 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
836 amdgpu_sriov_runtime(adev) &&
837 down_read_trylock(&adev->reset_domain->sem)) {
838 amdgpu_kiq_wreg(adev, reg, v, 0);
839 up_read(&adev->reset_domain->sem);
840 } else {
841 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
842 }
843 } else {
844 adev->pcie_wreg(adev, reg * 4, v);
845 }
846
847 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
848 }
849
850 /**
851 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
852 *
853 * @adev: amdgpu_device pointer
854 * @reg: mmio/rlc register
855 * @v: value to write
856 * @xcc_id: xcc accelerated compute core id
857 *
858 * This function is invoked only for debugfs register access.
859 */
860 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
861 uint32_t reg, uint32_t v,
862 uint32_t xcc_id)
863 {
864 if (amdgpu_device_skip_hw_access(adev))
865 return;
866
867 if (amdgpu_sriov_fullaccess(adev) &&
868 adev->gfx.rlc.funcs &&
869 adev->gfx.rlc.funcs->is_rlcg_access_range) {
870 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
871 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
872 } else if ((reg * 4) >= adev->rmmio_size) {
873 adev->pcie_wreg(adev, reg * 4, v);
874 } else {
875 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
876 }
877 }
878
879 /**
880 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
881 *
882 * @adev: amdgpu_device pointer
883 * @reg: dword aligned register offset
884 * @v: 32 bit value to write to the register
885 * @acc_flags: access flags which require special behavior
886 * @xcc_id: xcc accelerated compute core id
887 *
888 * Writes the value specified to the offset specified.
889 */
890 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
891 uint32_t reg, uint32_t v,
892 uint32_t acc_flags, uint32_t xcc_id)
893 {
894 uint32_t rlcg_flag;
895
896 if (amdgpu_device_skip_hw_access(adev))
897 return;
898
899 if ((reg * 4) < adev->rmmio_size) {
900 if (amdgpu_sriov_vf(adev) &&
901 !amdgpu_sriov_runtime(adev) &&
902 adev->gfx.rlc.rlcg_reg_access_supported &&
903 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
904 GC_HWIP, true,
905 &rlcg_flag)) {
906 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
907 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
908 amdgpu_sriov_runtime(adev) &&
909 down_read_trylock(&adev->reset_domain->sem)) {
910 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
911 up_read(&adev->reset_domain->sem);
912 } else {
913 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
914 }
915 } else {
916 adev->pcie_wreg(adev, reg * 4, v);
917 }
918 }
919
920 /**
921 * amdgpu_device_indirect_rreg - read an indirect register
922 *
923 * @adev: amdgpu_device pointer
924 * @reg_addr: indirect register address to read from
925 *
926 * Returns the value of indirect register @reg_addr
927 */
928 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
929 u32 reg_addr)
930 {
931 unsigned long flags, pcie_index, pcie_data;
932 void __iomem *pcie_index_offset;
933 void __iomem *pcie_data_offset;
934 u32 r;
935
936 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
937 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
938
939 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
940 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
941 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
942
943 writel(reg_addr, pcie_index_offset);
944 readl(pcie_index_offset);
945 r = readl(pcie_data_offset);
946 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
947
948 return r;
949 }
950
951 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
952 u64 reg_addr)
953 {
954 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
955 u32 r;
956 void __iomem *pcie_index_offset;
957 void __iomem *pcie_index_hi_offset;
958 void __iomem *pcie_data_offset;
959
960 if (unlikely(!adev->nbio.funcs)) {
961 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
962 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
963 } else {
964 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
965 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
966 }
967
968 if (reg_addr >> 32) {
969 if (unlikely(!adev->nbio.funcs))
970 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
971 else
972 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
973 } else {
974 pcie_index_hi = 0;
975 }
976
977 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
978 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
979 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
980 if (pcie_index_hi != 0)
981 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
982 pcie_index_hi * 4;
983
984 writel(reg_addr, pcie_index_offset);
985 readl(pcie_index_offset);
986 if (pcie_index_hi != 0) {
987 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
988 readl(pcie_index_hi_offset);
989 }
990 r = readl(pcie_data_offset);
991
992 /* clear the high bits */
993 if (pcie_index_hi != 0) {
994 writel(0, pcie_index_hi_offset);
995 readl(pcie_index_hi_offset);
996 }
997
998 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
999
1000 return r;
1001 }
1002
1003 /**
1004 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
1005 *
1006 * @adev: amdgpu_device pointer
1007 * @reg_addr: indirect register address to read from
1008 *
1009 * Returns the value of indirect register @reg_addr
1010 */
1011 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
1012 u32 reg_addr)
1013 {
1014 unsigned long flags, pcie_index, pcie_data;
1015 void __iomem *pcie_index_offset;
1016 void __iomem *pcie_data_offset;
1017 u64 r;
1018
1019 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1020 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1021
1022 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1023 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1024 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1025
1026 /* read low 32 bits */
1027 writel(reg_addr, pcie_index_offset);
1028 readl(pcie_index_offset);
1029 r = readl(pcie_data_offset);
1030 /* read high 32 bits */
1031 writel(reg_addr + 4, pcie_index_offset);
1032 readl(pcie_index_offset);
1033 r |= ((u64)readl(pcie_data_offset) << 32);
1034 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1035
1036 return r;
1037 }
1038
1039 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
1040 u64 reg_addr)
1041 {
1042 unsigned long flags, pcie_index, pcie_data;
1043 unsigned long pcie_index_hi = 0;
1044 void __iomem *pcie_index_offset;
1045 void __iomem *pcie_index_hi_offset;
1046 void __iomem *pcie_data_offset;
1047 u64 r;
1048
1049 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1050 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1051 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1052 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1053
1054 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1055 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1056 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1057 if (pcie_index_hi != 0)
1058 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1059 pcie_index_hi * 4;
1060
1061 /* read low 32 bits */
1062 writel(reg_addr, pcie_index_offset);
1063 readl(pcie_index_offset);
1064 if (pcie_index_hi != 0) {
1065 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1066 readl(pcie_index_hi_offset);
1067 }
1068 r = readl(pcie_data_offset);
1069 /* read high 32 bits */
1070 writel(reg_addr + 4, pcie_index_offset);
1071 readl(pcie_index_offset);
1072 if (pcie_index_hi != 0) {
1073 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1074 readl(pcie_index_hi_offset);
1075 }
1076 r |= ((u64)readl(pcie_data_offset) << 32);
1077
1078 /* clear the high bits */
1079 if (pcie_index_hi != 0) {
1080 writel(0, pcie_index_hi_offset);
1081 readl(pcie_index_hi_offset);
1082 }
1083
1084 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1085
1086 return r;
1087 }
1088
1089 /**
1090 * amdgpu_device_indirect_wreg - write to an indirect register
1091 *
1092 * @adev: amdgpu_device pointer
1093 * @reg_addr: indirect register offset
1094 * @reg_data: indirect register data
1095 *
1096 */
1097 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1098 u32 reg_addr, u32 reg_data)
1099 {
1100 unsigned long flags, pcie_index, pcie_data;
1101 void __iomem *pcie_index_offset;
1102 void __iomem *pcie_data_offset;
1103
1104 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1105 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1106
1107 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1108 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1109 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1110
1111 writel(reg_addr, pcie_index_offset);
1112 readl(pcie_index_offset);
1113 writel(reg_data, pcie_data_offset);
1114 readl(pcie_data_offset);
1115 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1116 }
1117
1118 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1119 u64 reg_addr, u32 reg_data)
1120 {
1121 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1122 void __iomem *pcie_index_offset;
1123 void __iomem *pcie_index_hi_offset;
1124 void __iomem *pcie_data_offset;
1125
1126 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1127 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1128 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1129 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1130 else
1131 pcie_index_hi = 0;
1132
1133 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1134 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1135 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1136 if (pcie_index_hi != 0)
1137 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1138 pcie_index_hi * 4;
1139
1140 writel(reg_addr, pcie_index_offset);
1141 readl(pcie_index_offset);
1142 if (pcie_index_hi != 0) {
1143 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1144 readl(pcie_index_hi_offset);
1145 }
1146 writel(reg_data, pcie_data_offset);
1147 readl(pcie_data_offset);
1148
1149 /* clear the high bits */
1150 if (pcie_index_hi != 0) {
1151 writel(0, pcie_index_hi_offset);
1152 readl(pcie_index_hi_offset);
1153 }
1154
1155 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1156 }
1157
1158 /**
1159 * amdgpu_device_indirect_wreg64 - write a 64 bit value to an indirect register
1160 *
1161 * @adev: amdgpu_device pointer
1162 * @reg_addr: indirect register offset
1163 * @reg_data: indirect register data
1164 *
1165 */
1166 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1167 u32 reg_addr, u64 reg_data)
1168 {
1169 unsigned long flags, pcie_index, pcie_data;
1170 void __iomem *pcie_index_offset;
1171 void __iomem *pcie_data_offset;
1172
1173 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1174 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1175
1176 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1177 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1178 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1179
1180 /* write low 32 bits */
1181 writel(reg_addr, pcie_index_offset);
1182 readl(pcie_index_offset);
1183 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1184 readl(pcie_data_offset);
1185 /* write high 32 bits */
1186 writel(reg_addr + 4, pcie_index_offset);
1187 readl(pcie_index_offset);
1188 writel((u32)(reg_data >> 32), pcie_data_offset);
1189 readl(pcie_data_offset);
1190 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1191 }
1192
1193 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1194 u64 reg_addr, u64 reg_data)
1195 {
1196 unsigned long flags, pcie_index, pcie_data;
1197 unsigned long pcie_index_hi = 0;
1198 void __iomem *pcie_index_offset;
1199 void __iomem *pcie_index_hi_offset;
1200 void __iomem *pcie_data_offset;
1201
1202 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1203 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1204 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1205 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1206
1207 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1208 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1209 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1210 if (pcie_index_hi != 0)
1211 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1212 pcie_index_hi * 4;
1213
1214 /* write low 32 bits */
1215 writel(reg_addr, pcie_index_offset);
1216 readl(pcie_index_offset);
1217 if (pcie_index_hi != 0) {
1218 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1219 readl(pcie_index_hi_offset);
1220 }
1221 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1222 readl(pcie_data_offset);
1223 /* write high 32 bits */
1224 writel(reg_addr + 4, pcie_index_offset);
1225 readl(pcie_index_offset);
1226 if (pcie_index_hi != 0) {
1227 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1228 readl(pcie_index_hi_offset);
1229 }
1230 writel((u32)(reg_data >> 32), pcie_data_offset);
1231 readl(pcie_data_offset);
1232
1233 /* clear the high bits */
1234 if (pcie_index_hi != 0) {
1235 writel(0, pcie_index_hi_offset);
1236 readl(pcie_index_hi_offset);
1237 }
1238
1239 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1240 }
1241
1242 /**
1243 * amdgpu_device_get_rev_id - query device rev_id
1244 *
1245 * @adev: amdgpu_device pointer
1246 *
1247 * Return device rev_id
1248 */
1249 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1250 {
1251 return adev->nbio.funcs->get_rev_id(adev);
1252 }
1253
1254 /**
1255 * amdgpu_invalid_rreg - dummy reg read function
1256 *
1257 * @adev: amdgpu_device pointer
1258 * @reg: offset of register
1259 *
1260 * Dummy register read function. Used for register blocks
1261 * that certain asics don't have (all asics).
1262 * Returns the value in the register.
1263 */
1264 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1265 {
1266 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1267 BUG();
1268 return 0;
1269 }
1270
1271 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1272 {
1273 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1274 BUG();
1275 return 0;
1276 }
1277
1278 /**
1279 * amdgpu_invalid_wreg - dummy reg write function
1280 *
1281 * @adev: amdgpu_device pointer
1282 * @reg: offset of register
1283 * @v: value to write to the register
1284 *
1285 * Dummy register write function. Used for register blocks
1286 * that certain asics don't have (all asics).
1287 */
1288 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1289 {
1290 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1291 reg, v);
1292 BUG();
1293 }
1294
1295 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1296 {
1297 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1298 reg, v);
1299 BUG();
1300 }
1301
1302 /**
1303 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1304 *
1305 * @adev: amdgpu_device pointer
1306 * @reg: offset of register
1307 *
1308 * Dummy register read function. Used for register blocks
1309 * that certain asics don't have (all asics).
1310 * Returns the value in the register.
1311 */
1312 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1313 {
1314 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1315 BUG();
1316 return 0;
1317 }
1318
1319 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1320 {
1321 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1322 BUG();
1323 return 0;
1324 }
1325
1326 /**
1327 * amdgpu_invalid_wreg64 - dummy reg write function
1328 *
1329 * @adev: amdgpu_device pointer
1330 * @reg: offset of register
1331 * @v: value to write to the register
1332 *
1333 * Dummy register write function. Used for register blocks
1334 * that certain asics don't have (all asics).
1335 */
1336 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1337 {
1338 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1339 reg, v);
1340 BUG();
1341 }
1342
1343 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1344 {
1345 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1346 reg, v);
1347 BUG();
1348 }
1349
1350 /**
1351 * amdgpu_block_invalid_rreg - dummy reg read function
1352 *
1353 * @adev: amdgpu_device pointer
1354 * @block: offset of instance
1355 * @reg: offset of register
1356 *
1357 * Dummy register read function. Used for register blocks
1358 * that certain asics don't have (all asics).
1359 * Returns the value in the register.
1360 */
1361 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1362 uint32_t block, uint32_t reg)
1363 {
1364 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1365 reg, block);
1366 BUG();
1367 return 0;
1368 }
1369
1370 /**
1371 * amdgpu_block_invalid_wreg - dummy reg write function
1372 *
1373 * @adev: amdgpu_device pointer
1374 * @block: offset of instance
1375 * @reg: offset of register
1376 * @v: value to write to the register
1377 *
1378 * Dummy register write function. Used for register blocks
1379 * that certain asics don't have (all asics).
1380 */
1381 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1382 uint32_t block,
1383 uint32_t reg, uint32_t v)
1384 {
1385 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1386 reg, block, v);
1387 BUG();
1388 }
1389
1390 /**
1391 * amdgpu_device_asic_init - Wrapper for atom asic_init
1392 *
1393 * @adev: amdgpu_device pointer
1394 *
1395 * Does any asic specific work and then calls atom asic init.
1396 */
1397 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1398 {
1399 int ret;
1400
1401 amdgpu_asic_pre_asic_init(adev);
1402
1403 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1404 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1405 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1406 amdgpu_psp_wait_for_bootloader(adev);
1407 ret = amdgpu_atomfirmware_asic_init(adev, true);
1408 return ret;
1409 } else {
1410 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1411 }
1412
1413 return 0;
1414 }
1415
1416 /**
1417 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1418 *
1419 * @adev: amdgpu_device pointer
1420 *
1421 * Allocates a scratch page of VRAM for use by various things in the
1422 * driver.
1423 */
1424 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1425 {
1426 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1427 AMDGPU_GEM_DOMAIN_VRAM |
1428 AMDGPU_GEM_DOMAIN_GTT,
1429 &adev->mem_scratch.robj,
1430 &adev->mem_scratch.gpu_addr,
1431 (void **)&adev->mem_scratch.ptr);
1432 }
1433
1434 /**
1435 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1436 *
1437 * @adev: amdgpu_device pointer
1438 *
1439 * Frees the VRAM scratch page.
1440 */
1441 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1442 {
1443 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1444 }
1445
1446 /**
1447 * amdgpu_device_program_register_sequence - program an array of registers.
1448 *
1449 * @adev: amdgpu_device pointer
1450 * @registers: pointer to the register array
1451 * @array_size: size of the register array
1452 *
1453 * Programs an array of registers with and/or masks.
1454 * This is a helper for setting golden registers.
1455 */
1456 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1457 const u32 *registers,
1458 const u32 array_size)
1459 {
1460 u32 tmp, reg, and_mask, or_mask;
1461 int i;
1462
1463 if (array_size % 3)
1464 return;
1465
1466 for (i = 0; i < array_size; i += 3) {
1467 reg = registers[i + 0];
1468 and_mask = registers[i + 1];
1469 or_mask = registers[i + 2];
1470
1471 if (and_mask == 0xffffffff) {
1472 tmp = or_mask;
1473 } else {
1474 tmp = RREG32(reg);
1475 tmp &= ~and_mask;
1476 if (adev->family >= AMDGPU_FAMILY_AI)
1477 tmp |= (or_mask & and_mask);
1478 else
1479 tmp |= or_mask;
1480 }
1481 WREG32(reg, tmp);
1482 }
1483 }
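/*
 * Illustrative sketch of the expected register array layout (offsets and
 * masks below are made up): entries come in {offset, and_mask, or_mask}
 * triplets.
 *
 *   static const u32 example_golden_regs[] = {
 *           0x1234, 0xffffffff, 0x00000001,  // and_mask of all 1s: write or_mask directly
 *           0x5678, 0x0000000f, 0x00000002,  // read-modify-write: clear the low nibble, set 0x2
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *                                           ARRAY_SIZE(example_golden_regs));
 */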
1484
1485 /**
1486 * amdgpu_device_pci_config_reset - reset the GPU
1487 *
1488 * @adev: amdgpu_device pointer
1489 *
1490 * Resets the GPU using the pci config reset sequence.
1491 * Only applicable to asics prior to vega10.
1492 */
1493 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1494 {
1495 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1496 }
1497
1498 /**
1499 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1500 *
1501 * @adev: amdgpu_device pointer
1502 *
1503 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1504 */
1505 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1506 {
1507 return pci_reset_function(adev->pdev);
1508 }
1509
1510 /*
1511 * amdgpu_device_wb_*()
1512 * Writeback is the method by which the GPU updates special pages in memory
1513 * with the status of certain GPU events (fences, ring pointers, etc.).
1514 */
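/*
 * Illustrative sketch of how a writeback slot is typically consumed (variable
 * names are hypothetical, error handling trimmed): the index returned by
 * amdgpu_device_wb_get() is a dword offset into the writeback page.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;  // address the GPU writes to
 *           u32 cpu_val = adev->wb.wb[wb];              // CPU-side readback
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */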
1515
1516 /**
1517 * amdgpu_device_wb_fini - Disable Writeback and free memory
1518 *
1519 * @adev: amdgpu_device pointer
1520 *
1521 * Disables Writeback and frees the Writeback memory (all asics).
1522 * Used at driver shutdown.
1523 */
1524 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1525 {
1526 if (adev->wb.wb_obj) {
1527 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1528 &adev->wb.gpu_addr,
1529 (void **)&adev->wb.wb);
1530 adev->wb.wb_obj = NULL;
1531 }
1532 }
1533
1534 /**
1535 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1536 *
1537 * @adev: amdgpu_device pointer
1538 *
1539 * Initializes writeback and allocates writeback memory (all asics).
1540 * Used at driver startup.
1541 * Returns 0 on success or a negative error code on failure.
1542 */
1543 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1544 {
1545 int r;
1546
1547 if (adev->wb.wb_obj == NULL) {
1548 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1549 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1550 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1551 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1552 (void **)&adev->wb.wb);
1553 if (r) {
1554 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1555 return r;
1556 }
1557
1558 adev->wb.num_wb = AMDGPU_MAX_WB;
1559 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1560
1561 /* clear wb memory */
1562 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1563 }
1564
1565 return 0;
1566 }
1567
1568 /**
1569 * amdgpu_device_wb_get - Allocate a wb entry
1570 *
1571 * @adev: amdgpu_device pointer
1572 * @wb: wb index
1573 *
1574 * Allocate a wb slot for use by the driver (all asics).
1575 * Returns 0 on success or -EINVAL on failure.
1576 */
1577 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1578 {
1579 unsigned long flags, offset;
1580
1581 spin_lock_irqsave(&adev->wb.lock, flags);
1582 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1583 if (offset < adev->wb.num_wb) {
1584 __set_bit(offset, adev->wb.used);
1585 spin_unlock_irqrestore(&adev->wb.lock, flags);
1586 *wb = offset << 3; /* convert to dw offset */
1587 return 0;
1588 } else {
1589 spin_unlock_irqrestore(&adev->wb.lock, flags);
1590 return -EINVAL;
1591 }
1592 }
1593
1594 /**
1595 * amdgpu_device_wb_free - Free a wb entry
1596 *
1597 * @adev: amdgpu_device pointer
1598 * @wb: wb index
1599 *
1600 * Free a wb slot allocated for use by the driver (all asics)
1601 */
1602 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1603 {
1604 unsigned long flags;
1605
1606 wb >>= 3;
1607 spin_lock_irqsave(&adev->wb.lock, flags);
1608 if (wb < adev->wb.num_wb)
1609 __clear_bit(wb, adev->wb.used);
1610 spin_unlock_irqrestore(&adev->wb.lock, flags);
1611 }
1612
1613 /**
1614 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1615 *
1616 * @adev: amdgpu_device pointer
1617 *
1618 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1619 * to fail, but if any of the BARs are not accessible after the resize we abort
1620 * driver loading by returning -ENODEV.
1621 */
1622 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1623 {
1624 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1625 struct pci_bus *root;
1626 struct resource *res;
1627 unsigned int i;
1628 u16 cmd;
1629 int r;
1630
1631 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1632 return 0;
1633
1634 /* Bypass for VF */
1635 if (amdgpu_sriov_vf(adev))
1636 return 0;
1637
1638 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1639 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1640 DRM_WARN("System can't access extended configuration space, please check!!\n");
1641
1642 /* skip if the bios has already enabled large BAR */
1643 if (adev->gmc.real_vram_size &&
1644 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1645 return 0;
1646
1647 /* Check if the root BUS has 64bit memory resources */
1648 root = adev->pdev->bus;
1649 while (root->parent)
1650 root = root->parent;
1651
1652 pci_bus_for_each_resource(root, res, i) {
1653 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1654 res->start > 0x100000000ull)
1655 break;
1656 }
1657
1658 /* Trying to resize is pointless without a root hub window above 4GB */
1659 if (!res)
1660 return 0;
1661
1662 /* Limit the BAR size to what is available */
1663 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1664 rbar_size);
1665
1666 /* Disable memory decoding while we change the BAR addresses and size */
1667 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1668 pci_write_config_word(adev->pdev, PCI_COMMAND,
1669 cmd & ~PCI_COMMAND_MEMORY);
1670
1671 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1672 amdgpu_doorbell_fini(adev);
1673 if (adev->asic_type >= CHIP_BONAIRE)
1674 pci_release_resource(adev->pdev, 2);
1675
1676 pci_release_resource(adev->pdev, 0);
1677
1678 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1679 if (r == -ENOSPC)
1680 DRM_INFO("Not enough PCI address space for a large BAR.");
1681 else if (r && r != -ENOTSUPP)
1682 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1683
1684 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1685
1686 /* When the doorbell or fb BAR isn't available we have no chance of
1687 * using the device.
1688 */
1689 r = amdgpu_doorbell_init(adev);
1690 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1691 return -ENODEV;
1692
1693 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1694
1695 return 0;
1696 }
1697
1698 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1699 {
1700 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1701 return false;
1702
1703 return true;
1704 }
1705
1706 /*
1707 * GPU helpers function.
1708 */
1709 /**
1710 * amdgpu_device_need_post - check if the hw needs to be posted or not
1711 *
1712 * @adev: amdgpu_device pointer
1713 *
1714 * Check if the asic has been initialized (all asics) at driver startup
1715 * or if post is needed because a hw reset was performed.
1716 * Returns true if post is needed or false if not.
1717 */
1718 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1719 {
1720 uint32_t reg;
1721
1722 if (amdgpu_sriov_vf(adev))
1723 return false;
1724
1725 if (!amdgpu_device_read_bios(adev))
1726 return false;
1727
1728 if (amdgpu_passthrough(adev)) {
1729 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1730 * some old smc fw still needs the driver to do a vPost, otherwise the gpu hangs;
1731 * smc fw versions above 22.15 don't have this flaw, so we force
1732 * vPost to be executed for smc versions below 22.15
1733 */
1734 if (adev->asic_type == CHIP_FIJI) {
1735 int err;
1736 uint32_t fw_ver;
1737
1738 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1739 /* force vPost if an error occurred */
1740 if (err)
1741 return true;
1742
1743 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1744 release_firmware(adev->pm.fw);
1745 if (fw_ver < 0x00160e00)
1746 return true;
1747 }
1748 }
1749
1750 /* Don't post if we need to reset whole hive on init */
1751 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
1752 return false;
1753
1754 if (adev->has_hw_reset) {
1755 adev->has_hw_reset = false;
1756 return true;
1757 }
1758
1759 /* bios scratch used on CIK+ */
1760 if (adev->asic_type >= CHIP_BONAIRE)
1761 return amdgpu_atombios_scratch_need_asic_init(adev);
1762
1763 /* check MEM_SIZE for older asics */
1764 reg = amdgpu_asic_get_config_memsize(adev);
1765
1766 if ((reg != 0) && (reg != 0xffffffff))
1767 return false;
1768
1769 return true;
1770 }
1771
1772 /*
1773 * Check whether seamless boot is supported.
1774 *
1775 * So far we only support seamless boot on DCE 3.0 or later.
1776 * If users report that it works on older ASICS as well, we may
1777 * loosen this.
1778 */
1779 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1780 {
1781 switch (amdgpu_seamless) {
1782 case -1:
1783 break;
1784 case 1:
1785 return true;
1786 case 0:
1787 return false;
1788 default:
1789 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1790 amdgpu_seamless);
1791 return false;
1792 }
1793
1794 if (!(adev->flags & AMD_IS_APU))
1795 return false;
1796
1797 if (adev->mman.keep_stolen_vga_memory)
1798 return false;
1799
1800 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1801 }
1802
1803 /*
1804 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1805 * don't support dynamic speed switching. Until we have confirmation from Intel
1806 * that a specific host supports it, it's safer that we keep it disabled for all.
1807 *
1808 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1809 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1810 */
1811 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1812 {
1813 #if IS_ENABLED(CONFIG_X86)
1814 struct cpuinfo_x86 *c = &cpu_data(0);
1815
1816 /* eGPU change speeds based on USB4 fabric conditions */
1817 if (dev_is_removable(adev->dev))
1818 return true;
1819
1820 if (c->x86_vendor == X86_VENDOR_INTEL)
1821 return false;
1822 #endif
1823 return true;
1824 }
1825
1826 /**
1827 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1828 *
1829 * @adev: amdgpu_device pointer
1830 *
1831 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1832 * be set for this device.
1833 *
1834 * Returns true if it should be used or false if not.
1835 */
1836 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1837 {
1838 switch (amdgpu_aspm) {
1839 case -1:
1840 break;
1841 case 0:
1842 return false;
1843 case 1:
1844 return true;
1845 default:
1846 return false;
1847 }
1848 if (adev->flags & AMD_IS_APU)
1849 return false;
1850 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1851 return false;
1852 return pcie_aspm_enabled(adev->pdev);
1853 }
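
/*
 * Illustrative usage sketch (hypothetical caller, not taken from this file):
 * ASIC-specific code is expected to consult this helper before touching any
 * ASPM registers, e.g.:
 *
 *	if (amdgpu_device_should_use_aspm(adev))
 *		example_asic_program_aspm(adev);
 *
 * where example_asic_program_aspm() stands in for the per-ASIC routine that
 * actually writes the link-control registers.
 */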
1854
1855 /* if we get transitioned to only one device, take VGA back */
1856 /**
1857 * amdgpu_device_vga_set_decode - enable/disable vga decode
1858 *
1859 * @pdev: PCI device pointer
1860 * @state: enable/disable vga decode
1861 *
1862 * Enable/disable vga decode (all asics).
1863 * Returns VGA resource flags.
1864 */
1865 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1866 bool state)
1867 {
1868 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1869
1870 amdgpu_asic_set_vga_state(adev, state);
1871 if (state)
1872 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1873 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1874 else
1875 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1876 }
1877
1878 /**
1879 * amdgpu_device_check_block_size - validate the vm block size
1880 *
1881 * @adev: amdgpu_device pointer
1882 *
1883 * Validates the vm block size specified via module parameter.
1884 * The vm block size defines number of bits in page table versus page directory,
1885 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1886 * page table and the remaining bits are in the page directory.
1887 */
1888 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1889 {
1890 /* defines number of bits in page table versus page directory,
1891 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1892 * page table and the remaining bits are in the page directory
1893 */
1894 if (amdgpu_vm_block_size == -1)
1895 return;
1896
1897 if (amdgpu_vm_block_size < 9) {
1898 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1899 amdgpu_vm_block_size);
1900 amdgpu_vm_block_size = -1;
1901 }
1902 }
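
/*
 * Worked example (illustrative, based on the layout described above): with a
 * 4KB page there are 12 offset bits, so amdgpu_vm_block_size = 9 means one
 * page-table block spans 2^(12 + 9) bytes = 2 MB of GPU virtual address
 * space; the remaining address bits are resolved by the page directory.
 */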
1903
1904 /**
1905 * amdgpu_device_check_vm_size - validate the vm size
1906 *
1907 * @adev: amdgpu_device pointer
1908 *
1909 * Validates the vm size in GB specified via module parameter.
1910 * The VM size is the size of the GPU virtual memory space in GB.
1911 */
1912 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1913 {
1914 /* no need to check the default value */
1915 if (amdgpu_vm_size == -1)
1916 return;
1917
1918 if (amdgpu_vm_size < 1) {
1919 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1920 amdgpu_vm_size);
1921 amdgpu_vm_size = -1;
1922 }
1923 }
1924
1925 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1926 {
1927 struct sysinfo si;
1928 bool is_os_64 = (sizeof(void *) == 8);
1929 uint64_t total_memory;
1930 uint64_t dram_size_seven_GB = 0x1B8000000;
1931 uint64_t dram_size_three_GB = 0xB8000000;
1932
1933 if (amdgpu_smu_memory_pool_size == 0)
1934 return;
1935
1936 if (!is_os_64) {
1937 DRM_WARN("Not 64-bit OS, feature not supported\n");
1938 goto def_value;
1939 }
1940 si_meminfo(&si);
1941 total_memory = (uint64_t)si.totalram * si.mem_unit;
1942
1943 if ((amdgpu_smu_memory_pool_size == 1) ||
1944 (amdgpu_smu_memory_pool_size == 2)) {
1945 if (total_memory < dram_size_three_GB)
1946 goto def_value1;
1947 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1948 (amdgpu_smu_memory_pool_size == 8)) {
1949 if (total_memory < dram_size_seven_GB)
1950 goto def_value1;
1951 } else {
1952 DRM_WARN("Smu memory pool size not supported\n");
1953 goto def_value;
1954 }
1955 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1956
1957 return;
1958
1959 def_value1:
1960 DRM_WARN("No enough system memory\n");
1961 def_value:
1962 adev->pm.smu_prv_buffer_size = 0;
1963 }
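
/*
 * Worked example (illustrative): the "<< 28" above means the pool size is
 * given in 256 MB units, so smu_memory_pool_size=1 reserves 256 MB and a
 * value of 8 reserves 2 GB, subject to the 3 GB / 7 GB total system memory
 * checks performed above.
 */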
1964
1965 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1966 {
1967 if (!(adev->flags & AMD_IS_APU) ||
1968 adev->asic_type < CHIP_RAVEN)
1969 return 0;
1970
1971 switch (adev->asic_type) {
1972 case CHIP_RAVEN:
1973 if (adev->pdev->device == 0x15dd)
1974 adev->apu_flags |= AMD_APU_IS_RAVEN;
1975 if (adev->pdev->device == 0x15d8)
1976 adev->apu_flags |= AMD_APU_IS_PICASSO;
1977 break;
1978 case CHIP_RENOIR:
1979 if ((adev->pdev->device == 0x1636) ||
1980 (adev->pdev->device == 0x164c))
1981 adev->apu_flags |= AMD_APU_IS_RENOIR;
1982 else
1983 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1984 break;
1985 case CHIP_VANGOGH:
1986 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1987 break;
1988 case CHIP_YELLOW_CARP:
1989 break;
1990 case CHIP_CYAN_SKILLFISH:
1991 if ((adev->pdev->device == 0x13FE) ||
1992 (adev->pdev->device == 0x143F))
1993 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1994 break;
1995 default:
1996 break;
1997 }
1998
1999 return 0;
2000 }
2001
2002 /**
2003 * amdgpu_device_check_arguments - validate module params
2004 *
2005 * @adev: amdgpu_device pointer
2006 *
2007 * Validates certain module parameters and updates
2008 * the associated values used by the driver (all asics).
2009 */
2010 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
2011 {
2012 int i;
2013
2014 if (amdgpu_sched_jobs < 4) {
2015 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
2016 amdgpu_sched_jobs);
2017 amdgpu_sched_jobs = 4;
2018 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
2019 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
2020 amdgpu_sched_jobs);
2021 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
2022 }
2023
2024 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
2025 /* gart size must be greater or equal to 32M */
2026 dev_warn(adev->dev, "gart size (%d) too small\n",
2027 amdgpu_gart_size);
2028 amdgpu_gart_size = -1;
2029 }
2030
2031 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
2032 /* gtt size must be greater or equal to 32M */
2033 dev_warn(adev->dev, "gtt size (%d) too small\n",
2034 amdgpu_gtt_size);
2035 amdgpu_gtt_size = -1;
2036 }
2037
2038 /* valid range is between 4 and 9 inclusive */
2039 if (amdgpu_vm_fragment_size != -1 &&
2040 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
2041 dev_warn(adev->dev, "valid range is between 4 and 9\n");
2042 amdgpu_vm_fragment_size = -1;
2043 }
2044
2045 if (amdgpu_sched_hw_submission < 2) {
2046 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
2047 amdgpu_sched_hw_submission);
2048 amdgpu_sched_hw_submission = 2;
2049 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
2050 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
2051 amdgpu_sched_hw_submission);
2052 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
2053 }
2054
2055 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
2056 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
2057 amdgpu_reset_method = -1;
2058 }
2059
2060 amdgpu_device_check_smu_prv_buffer_size(adev);
2061
2062 amdgpu_device_check_vm_size(adev);
2063
2064 amdgpu_device_check_block_size(adev);
2065
2066 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2067
2068 for (i = 0; i < MAX_XCP; i++)
2069 adev->enforce_isolation[i] = !!enforce_isolation;
2070
2071 return 0;
2072 }
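
/*
 * Illustrative example of the rounding above (hypothetical values): loading
 * with amdgpu.sched_jobs=5 is not a power of two, so it is rounded up to 8;
 * amdgpu.sched_jobs=3 is below the minimum and is raised to 4 instead.
 */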
2073
2074 /**
2075 * amdgpu_switcheroo_set_state - set switcheroo state
2076 *
2077 * @pdev: pci dev pointer
2078 * @state: vga_switcheroo state
2079 *
2080 * Callback for the switcheroo driver. Suspends or resumes
2081 * the asics before or after it is powered up using ACPI methods.
2082 */
2083 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2084 enum vga_switcheroo_state state)
2085 {
2086 struct drm_device *dev = pci_get_drvdata(pdev);
2087 int r;
2088
2089 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2090 return;
2091
2092 if (state == VGA_SWITCHEROO_ON) {
2093 pr_info("switched on\n");
2094 /* don't suspend or resume card normally */
2095 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2096
2097 pci_set_power_state(pdev, PCI_D0);
2098 amdgpu_device_load_pci_state(pdev);
2099 r = pci_enable_device(pdev);
2100 if (r)
2101 DRM_WARN("pci_enable_device failed (%d)\n", r);
2102 amdgpu_device_resume(dev, true);
2103
2104 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2105 } else {
2106 pr_info("switched off\n");
2107 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2108 amdgpu_device_prepare(dev);
2109 amdgpu_device_suspend(dev, true);
2110 amdgpu_device_cache_pci_state(pdev);
2111 /* Shut down the device */
2112 pci_disable_device(pdev);
2113 pci_set_power_state(pdev, PCI_D3cold);
2114 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2115 }
2116 }
2117
2118 /**
2119 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2120 *
2121 * @pdev: pci dev pointer
2122 *
2123 * Callback for the switcheroo driver. Check if the switcheroo
2124 * state can be changed.
2125 * Returns true if the state can be changed, false if not.
2126 */
2127 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2128 {
2129 struct drm_device *dev = pci_get_drvdata(pdev);
2130
2131 /*
2132 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2133 * locking inversion with the driver load path. And the access here is
2134 * completely racy anyway. So don't bother with locking for now.
2135 */
2136 return atomic_read(&dev->open_count) == 0;
2137 }
2138
2139 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2140 .set_gpu_state = amdgpu_switcheroo_set_state,
2141 .reprobe = NULL,
2142 .can_switch = amdgpu_switcheroo_can_switch,
2143 };
2144
2145 /**
2146 * amdgpu_device_ip_set_clockgating_state - set the CG state
2147 *
2148 * @dev: amdgpu_device pointer
2149 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2150 * @state: clockgating state (gate or ungate)
2151 *
2152 * Sets the requested clockgating state for all instances of
2153 * the hardware IP specified.
2154 * Returns the error code from the last instance.
2155 */
2156 int amdgpu_device_ip_set_clockgating_state(void *dev,
2157 enum amd_ip_block_type block_type,
2158 enum amd_clockgating_state state)
2159 {
2160 struct amdgpu_device *adev = dev;
2161 int i, r = 0;
2162
2163 for (i = 0; i < adev->num_ip_blocks; i++) {
2164 if (!adev->ip_blocks[i].status.valid)
2165 continue;
2166 if (adev->ip_blocks[i].version->type != block_type)
2167 continue;
2168 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2169 continue;
2170 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2171 (void *)adev, state);
2172 if (r)
2173 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2174 adev->ip_blocks[i].version->funcs->name, r);
2175 }
2176 return r;
2177 }
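
/*
 * Illustrative usage sketch (hypothetical caller, not taken from this file):
 * gate the clocks of every GFX IP instance on a device:
 *
 *	r = amdgpu_device_ip_set_clockgating_state(adev,
 *						   AMD_IP_BLOCK_TYPE_GFX,
 *						   AMD_CG_STATE_GATE);
 *	if (r)
 *		dev_warn(adev->dev, "GFX clockgating failed (%d)\n", r);
 */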
2178
2179 /**
2180 * amdgpu_device_ip_set_powergating_state - set the PG state
2181 *
2182 * @dev: amdgpu_device pointer
2183 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2184 * @state: powergating state (gate or ungate)
2185 *
2186 * Sets the requested powergating state for all instances of
2187 * the hardware IP specified.
2188 * Returns the error code from the last instance.
2189 */
2190 int amdgpu_device_ip_set_powergating_state(void *dev,
2191 enum amd_ip_block_type block_type,
2192 enum amd_powergating_state state)
2193 {
2194 struct amdgpu_device *adev = dev;
2195 int i, r = 0;
2196
2197 for (i = 0; i < adev->num_ip_blocks; i++) {
2198 if (!adev->ip_blocks[i].status.valid)
2199 continue;
2200 if (adev->ip_blocks[i].version->type != block_type)
2201 continue;
2202 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2203 continue;
2204 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2205 (void *)adev, state);
2206 if (r)
2207 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2208 adev->ip_blocks[i].version->funcs->name, r);
2209 }
2210 return r;
2211 }
2212
2213 /**
2214 * amdgpu_device_ip_get_clockgating_state - get the CG state
2215 *
2216 * @adev: amdgpu_device pointer
2217 * @flags: clockgating feature flags
2218 *
2219 * Walks the list of IPs on the device and updates the clockgating
2220 * flags for each IP.
2221 * Updates @flags with the feature flags for each hardware IP where
2222 * clockgating is enabled.
2223 */
2224 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2225 u64 *flags)
2226 {
2227 int i;
2228
2229 for (i = 0; i < adev->num_ip_blocks; i++) {
2230 if (!adev->ip_blocks[i].status.valid)
2231 continue;
2232 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2233 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
2234 }
2235 }
2236
2237 /**
2238 * amdgpu_device_ip_wait_for_idle - wait for idle
2239 *
2240 * @adev: amdgpu_device pointer
2241 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2242 *
2243 * Waits for the requested hardware IP to be idle.
2244 * Returns 0 for success or a negative error code on failure.
2245 */
2246 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2247 enum amd_ip_block_type block_type)
2248 {
2249 int i, r;
2250
2251 for (i = 0; i < adev->num_ip_blocks; i++) {
2252 if (!adev->ip_blocks[i].status.valid)
2253 continue;
2254 if (adev->ip_blocks[i].version->type == block_type) {
2255 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2256 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2257 &adev->ip_blocks[i]);
2258 if (r)
2259 return r;
2260 }
2261 break;
2262 }
2263 }
2264 return 0;
2265
2266 }
2267
2268 /**
2269 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2270 *
2271 * @adev: amdgpu_device pointer
2272 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2273 *
2274 * Check if the hardware IP is enabled or not.
2275 * Returns true if the IP is enabled, false if not.
2276 */
2277 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2278 enum amd_ip_block_type block_type)
2279 {
2280 int i;
2281
2282 for (i = 0; i < adev->num_ip_blocks; i++) {
2283 if (adev->ip_blocks[i].version->type == block_type)
2284 return adev->ip_blocks[i].status.valid;
2285 }
2286 return false;
2287
2288 }
2289
2290 /**
2291 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2292 *
2293 * @adev: amdgpu_device pointer
2294 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2295 *
2296 * Returns a pointer to the hardware IP block structure
2297 * if it exists for the asic, otherwise NULL.
2298 */
2299 struct amdgpu_ip_block *
2300 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2301 enum amd_ip_block_type type)
2302 {
2303 int i;
2304
2305 for (i = 0; i < adev->num_ip_blocks; i++)
2306 if (adev->ip_blocks[i].version->type == type)
2307 return &adev->ip_blocks[i];
2308
2309 return NULL;
2310 }
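
/*
 * Illustrative usage sketch (hypothetical caller, not taken from this file):
 *
 *	struct amdgpu_ip_block *psp_block =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP);
 *
 *	if (psp_block && psp_block->status.hw)
 *		... the PSP IP block exists and its hardware is initialized ...
 */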
2311
2312 /**
2313 * amdgpu_device_ip_block_version_cmp
2314 *
2315 * @adev: amdgpu_device pointer
2316 * @type: enum amd_ip_block_type
2317 * @major: major version
2318 * @minor: minor version
2319 *
2320 * return 0 if equal or greater
2321 * return 1 if smaller or the ip_block doesn't exist
2322 */
2323 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2324 enum amd_ip_block_type type,
2325 u32 major, u32 minor)
2326 {
2327 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2328
2329 if (ip_block && ((ip_block->version->major > major) ||
2330 ((ip_block->version->major == major) &&
2331 (ip_block->version->minor >= minor))))
2332 return 0;
2333
2334 return 1;
2335 }
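
/*
 * Illustrative usage sketch (hypothetical check, not taken from this file):
 * require at least version 7.1 of the SMC IP block:
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *					       7, 1) == 0)
 *		... SMC 7.1 or newer is present ...
 */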
2336
2337 /**
2338 * amdgpu_device_ip_block_add
2339 *
2340 * @adev: amdgpu_device pointer
2341 * @ip_block_version: pointer to the IP to add
2342 *
2343 * Adds the IP block driver information to the collection of IPs
2344 * on the asic.
2345 */
2346 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2347 const struct amdgpu_ip_block_version *ip_block_version)
2348 {
2349 if (!ip_block_version)
2350 return -EINVAL;
2351
2352 switch (ip_block_version->type) {
2353 case AMD_IP_BLOCK_TYPE_VCN:
2354 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2355 return 0;
2356 break;
2357 case AMD_IP_BLOCK_TYPE_JPEG:
2358 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2359 return 0;
2360 break;
2361 default:
2362 break;
2363 }
2364
2365 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2366 ip_block_version->funcs->name);
2367
2368 adev->ip_blocks[adev->num_ip_blocks].adev = adev;
2369
2370 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2371
2372 return 0;
2373 }
2374
2375 /**
2376 * amdgpu_device_enable_virtual_display - enable virtual display feature
2377 *
2378 * @adev: amdgpu_device pointer
2379 *
2380 * Enables the virtual display feature if the user has enabled it via
2381 * the module parameter virtual_display. This feature provides a virtual
2382 * display hardware on headless boards or in virtualized environments.
2383 * This function parses and validates the configuration string specified by
2384 * the user and configures the virtual display configuration (number of
2385 * virtual connectors, crtcs, etc.) specified.
2386 */
2387 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2388 {
2389 adev->enable_virtual_display = false;
2390
2391 if (amdgpu_virtual_display) {
2392 const char *pci_address_name = pci_name(adev->pdev);
2393 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2394
2395 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2396 pciaddstr_tmp = pciaddstr;
2397 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2398 pciaddname = strsep(&pciaddname_tmp, ",");
2399 if (!strcmp("all", pciaddname)
2400 || !strcmp(pci_address_name, pciaddname)) {
2401 long num_crtc;
2402 int res = -1;
2403
2404 adev->enable_virtual_display = true;
2405
2406 if (pciaddname_tmp)
2407 res = kstrtol(pciaddname_tmp, 10,
2408 &num_crtc);
2409
2410 if (!res) {
2411 if (num_crtc < 1)
2412 num_crtc = 1;
2413 if (num_crtc > 6)
2414 num_crtc = 6;
2415 adev->mode_info.num_crtc = num_crtc;
2416 } else {
2417 adev->mode_info.num_crtc = 1;
2418 }
2419 break;
2420 }
2421 }
2422
2423 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2424 amdgpu_virtual_display, pci_address_name,
2425 adev->enable_virtual_display, adev->mode_info.num_crtc);
2426
2427 kfree(pciaddstr);
2428 }
2429 }
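
/*
 * Illustrative parameter format (derived from the parsing above): the
 * virtual_display string is a semicolon-separated list of
 * "<pci address>[,<num_crtc>]" entries, where the address may also be
 * "all". A hypothetical invocation:
 *
 *	modprobe amdgpu virtual_display=0000:03:00.0,2
 *
 * enables two virtual CRTCs on the device at PCI address 0000:03:00.0.
 */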
2430
2431 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2432 {
2433 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2434 adev->mode_info.num_crtc = 1;
2435 adev->enable_virtual_display = true;
2436 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2437 adev->enable_virtual_display, adev->mode_info.num_crtc);
2438 }
2439 }
2440
2441 /**
2442 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2443 *
2444 * @adev: amdgpu_device pointer
2445 *
2446 * Parses the asic configuration parameters specified in the gpu info
2447 * firmware and makes them available to the driver for use in configuring
2448 * the asic.
2449 * Returns 0 on success, -EINVAL on failure.
2450 */
2451 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2452 {
2453 const char *chip_name;
2454 int err;
2455 const struct gpu_info_firmware_header_v1_0 *hdr;
2456
2457 adev->firmware.gpu_info_fw = NULL;
2458
2459 if (adev->mman.discovery_bin)
2460 return 0;
2461
2462 switch (adev->asic_type) {
2463 default:
2464 return 0;
2465 case CHIP_VEGA10:
2466 chip_name = "vega10";
2467 break;
2468 case CHIP_VEGA12:
2469 chip_name = "vega12";
2470 break;
2471 case CHIP_RAVEN:
2472 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2473 chip_name = "raven2";
2474 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2475 chip_name = "picasso";
2476 else
2477 chip_name = "raven";
2478 break;
2479 case CHIP_ARCTURUS:
2480 chip_name = "arcturus";
2481 break;
2482 case CHIP_NAVI12:
2483 chip_name = "navi12";
2484 break;
2485 }
2486
2487 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2488 "amdgpu/%s_gpu_info.bin", chip_name);
2489 if (err) {
2490 dev_err(adev->dev,
2491 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2492 chip_name);
2493 goto out;
2494 }
2495
2496 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2497 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2498
2499 switch (hdr->version_major) {
2500 case 1:
2501 {
2502 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2503 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2504 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2505
2506 /*
2507 * Should be dropped when DAL no longer needs it.
2508 */
2509 if (adev->asic_type == CHIP_NAVI12)
2510 goto parse_soc_bounding_box;
2511
2512 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2513 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2514 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2515 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2516 adev->gfx.config.max_texture_channel_caches =
2517 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2518 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2519 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2520 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2521 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2522 adev->gfx.config.double_offchip_lds_buf =
2523 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2524 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2525 adev->gfx.cu_info.max_waves_per_simd =
2526 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2527 adev->gfx.cu_info.max_scratch_slots_per_cu =
2528 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2529 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2530 if (hdr->version_minor >= 1) {
2531 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2532 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2533 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2534 adev->gfx.config.num_sc_per_sh =
2535 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2536 adev->gfx.config.num_packer_per_sc =
2537 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2538 }
2539
2540 parse_soc_bounding_box:
2541 /*
2542 * soc bounding box info is not integrated in the discovery table,
2543 * we always need to parse it from gpu info firmware if needed.
2544 */
2545 if (hdr->version_minor == 2) {
2546 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2547 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2548 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2549 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2550 }
2551 break;
2552 }
2553 default:
2554 dev_err(adev->dev,
2555 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2556 err = -EINVAL;
2557 goto out;
2558 }
2559 out:
2560 return err;
2561 }
2562
2563 /**
2564 * amdgpu_device_ip_early_init - run early init for hardware IPs
2565 *
2566 * @adev: amdgpu_device pointer
2567 *
2568 * Early initialization pass for hardware IPs. The hardware IPs that make
2569 * up each asic are discovered and each IP's early_init callback is run. This
2570 * is the first stage in initializing the asic.
2571 * Returns 0 on success, negative error code on failure.
2572 */
2573 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2574 {
2575 struct amdgpu_ip_block *ip_block;
2576 struct pci_dev *parent;
2577 int i, r;
2578 bool total;
2579
2580 amdgpu_device_enable_virtual_display(adev);
2581
2582 if (amdgpu_sriov_vf(adev)) {
2583 r = amdgpu_virt_request_full_gpu(adev, true);
2584 if (r)
2585 return r;
2586 }
2587
2588 switch (adev->asic_type) {
2589 #ifdef CONFIG_DRM_AMDGPU_SI
2590 case CHIP_VERDE:
2591 case CHIP_TAHITI:
2592 case CHIP_PITCAIRN:
2593 case CHIP_OLAND:
2594 case CHIP_HAINAN:
2595 adev->family = AMDGPU_FAMILY_SI;
2596 r = si_set_ip_blocks(adev);
2597 if (r)
2598 return r;
2599 break;
2600 #endif
2601 #ifdef CONFIG_DRM_AMDGPU_CIK
2602 case CHIP_BONAIRE:
2603 case CHIP_HAWAII:
2604 case CHIP_KAVERI:
2605 case CHIP_KABINI:
2606 case CHIP_MULLINS:
2607 if (adev->flags & AMD_IS_APU)
2608 adev->family = AMDGPU_FAMILY_KV;
2609 else
2610 adev->family = AMDGPU_FAMILY_CI;
2611
2612 r = cik_set_ip_blocks(adev);
2613 if (r)
2614 return r;
2615 break;
2616 #endif
2617 case CHIP_TOPAZ:
2618 case CHIP_TONGA:
2619 case CHIP_FIJI:
2620 case CHIP_POLARIS10:
2621 case CHIP_POLARIS11:
2622 case CHIP_POLARIS12:
2623 case CHIP_VEGAM:
2624 case CHIP_CARRIZO:
2625 case CHIP_STONEY:
2626 if (adev->flags & AMD_IS_APU)
2627 adev->family = AMDGPU_FAMILY_CZ;
2628 else
2629 adev->family = AMDGPU_FAMILY_VI;
2630
2631 r = vi_set_ip_blocks(adev);
2632 if (r)
2633 return r;
2634 break;
2635 default:
2636 r = amdgpu_discovery_set_ip_blocks(adev);
2637 if (r)
2638 return r;
2639 break;
2640 }
2641
2642 if (amdgpu_has_atpx() &&
2643 (amdgpu_is_atpx_hybrid() ||
2644 amdgpu_has_atpx_dgpu_power_cntl()) &&
2645 ((adev->flags & AMD_IS_APU) == 0) &&
2646 !dev_is_removable(&adev->pdev->dev))
2647 adev->flags |= AMD_IS_PX;
2648
2649 if (!(adev->flags & AMD_IS_APU)) {
2650 parent = pcie_find_root_port(adev->pdev);
2651 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2652 }
2653
2654
2655 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2656 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2657 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2658 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2659 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2660 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2661 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2662
2663 total = true;
2664 for (i = 0; i < adev->num_ip_blocks; i++) {
2665 ip_block = &adev->ip_blocks[i];
2666
2667 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2668 DRM_WARN("disabled ip block: %d <%s>\n",
2669 i, adev->ip_blocks[i].version->funcs->name);
2670 adev->ip_blocks[i].status.valid = false;
2671 } else if (ip_block->version->funcs->early_init) {
2672 r = ip_block->version->funcs->early_init(ip_block);
2673 if (r == -ENOENT) {
2674 adev->ip_blocks[i].status.valid = false;
2675 } else if (r) {
2676 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2677 adev->ip_blocks[i].version->funcs->name, r);
2678 total = false;
2679 } else {
2680 adev->ip_blocks[i].status.valid = true;
2681 }
2682 } else {
2683 adev->ip_blocks[i].status.valid = true;
2684 }
2685 /* get the vbios after the asic_funcs are set up */
2686 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2687 r = amdgpu_device_parse_gpu_info_fw(adev);
2688 if (r)
2689 return r;
2690
2691 /* Read BIOS */
2692 if (amdgpu_device_read_bios(adev)) {
2693 if (!amdgpu_get_bios(adev))
2694 return -EINVAL;
2695
2696 r = amdgpu_atombios_init(adev);
2697 if (r) {
2698 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2699 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2700 return r;
2701 }
2702 }
2703
2704 /* get pf2vf msg info at its earliest time */
2705 if (amdgpu_sriov_vf(adev))
2706 amdgpu_virt_init_data_exchange(adev);
2707
2708 }
2709 }
2710 if (!total)
2711 return -ENODEV;
2712
2713 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2714 if (ip_block->status.valid != false)
2715 amdgpu_amdkfd_device_probe(adev);
2716
2717 adev->cg_flags &= amdgpu_cg_mask;
2718 adev->pg_flags &= amdgpu_pg_mask;
2719
2720 return 0;
2721 }
2722
2723 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2724 {
2725 int i, r;
2726
2727 for (i = 0; i < adev->num_ip_blocks; i++) {
2728 if (!adev->ip_blocks[i].status.sw)
2729 continue;
2730 if (adev->ip_blocks[i].status.hw)
2731 continue;
2732 if (!amdgpu_ip_member_of_hwini(
2733 adev, adev->ip_blocks[i].version->type))
2734 continue;
2735 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2736 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2737 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2738 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2739 if (r) {
2740 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2741 adev->ip_blocks[i].version->funcs->name, r);
2742 return r;
2743 }
2744 adev->ip_blocks[i].status.hw = true;
2745 }
2746 }
2747
2748 return 0;
2749 }
2750
2751 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2752 {
2753 int i, r;
2754
2755 for (i = 0; i < adev->num_ip_blocks; i++) {
2756 if (!adev->ip_blocks[i].status.sw)
2757 continue;
2758 if (adev->ip_blocks[i].status.hw)
2759 continue;
2760 if (!amdgpu_ip_member_of_hwini(
2761 adev, adev->ip_blocks[i].version->type))
2762 continue;
2763 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2764 if (r) {
2765 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2766 adev->ip_blocks[i].version->funcs->name, r);
2767 return r;
2768 }
2769 adev->ip_blocks[i].status.hw = true;
2770 }
2771
2772 return 0;
2773 }
2774
2775 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2776 {
2777 int r = 0;
2778 int i;
2779 uint32_t smu_version;
2780
2781 if (adev->asic_type >= CHIP_VEGA10) {
2782 for (i = 0; i < adev->num_ip_blocks; i++) {
2783 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2784 continue;
2785
2786 if (!amdgpu_ip_member_of_hwini(adev,
2787 AMD_IP_BLOCK_TYPE_PSP))
2788 break;
2789
2790 if (!adev->ip_blocks[i].status.sw)
2791 continue;
2792
2793 /* no need to do the fw loading again if already done*/
2794 if (adev->ip_blocks[i].status.hw == true)
2795 break;
2796
2797 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2798 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
2799 if (r)
2800 return r;
2801 } else {
2802 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2803 if (r) {
2804 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2805 adev->ip_blocks[i].version->funcs->name, r);
2806 return r;
2807 }
2808 adev->ip_blocks[i].status.hw = true;
2809 }
2810 break;
2811 }
2812 }
2813
2814 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2815 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2816
2817 return r;
2818 }
2819
2820 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2821 {
2822 long timeout;
2823 int r, i;
2824
2825 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2826 struct amdgpu_ring *ring = adev->rings[i];
2827
2828 /* No need to setup the GPU scheduler for rings that don't need it */
2829 if (!ring || ring->no_scheduler)
2830 continue;
2831
2832 switch (ring->funcs->type) {
2833 case AMDGPU_RING_TYPE_GFX:
2834 timeout = adev->gfx_timeout;
2835 break;
2836 case AMDGPU_RING_TYPE_COMPUTE:
2837 timeout = adev->compute_timeout;
2838 break;
2839 case AMDGPU_RING_TYPE_SDMA:
2840 timeout = adev->sdma_timeout;
2841 break;
2842 default:
2843 timeout = adev->video_timeout;
2844 break;
2845 }
2846
2847 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2848 DRM_SCHED_PRIORITY_COUNT,
2849 ring->num_hw_submission, 0,
2850 timeout, adev->reset_domain->wq,
2851 ring->sched_score, ring->name,
2852 adev->dev);
2853 if (r) {
2854 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2855 ring->name);
2856 return r;
2857 }
2858 r = amdgpu_uvd_entity_init(adev, ring);
2859 if (r) {
2860 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2861 ring->name);
2862 return r;
2863 }
2864 r = amdgpu_vce_entity_init(adev, ring);
2865 if (r) {
2866 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2867 ring->name);
2868 return r;
2869 }
2870 }
2871
2872 amdgpu_xcp_update_partition_sched_list(adev);
2873
2874 return 0;
2875 }
2876
2877
2878 /**
2879 * amdgpu_device_ip_init - run init for hardware IPs
2880 *
2881 * @adev: amdgpu_device pointer
2882 *
2883 * Main initialization pass for hardware IPs. The list of all the hardware
2884 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2885 * are run. sw_init initializes the software state associated with each IP
2886 * and hw_init initializes the hardware associated with each IP.
2887 * Returns 0 on success, negative error code on failure.
2888 */
2889 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2890 {
2891 bool init_badpage;
2892 int i, r;
2893
2894 r = amdgpu_ras_init(adev);
2895 if (r)
2896 return r;
2897
2898 for (i = 0; i < adev->num_ip_blocks; i++) {
2899 if (!adev->ip_blocks[i].status.valid)
2900 continue;
2901 if (adev->ip_blocks[i].version->funcs->sw_init) {
2902 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
2903 if (r) {
2904 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2905 adev->ip_blocks[i].version->funcs->name, r);
2906 goto init_failed;
2907 }
2908 }
2909 adev->ip_blocks[i].status.sw = true;
2910
2911 if (!amdgpu_ip_member_of_hwini(
2912 adev, adev->ip_blocks[i].version->type))
2913 continue;
2914
2915 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2916 /* need to do common hw init early so everything is set up for gmc */
2917 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2918 if (r) {
2919 DRM_ERROR("hw_init %d failed %d\n", i, r);
2920 goto init_failed;
2921 }
2922 adev->ip_blocks[i].status.hw = true;
2923 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2924 /* need to do gmc hw init early so we can allocate gpu mem */
2925 /* Try to reserve bad pages early */
2926 if (amdgpu_sriov_vf(adev))
2927 amdgpu_virt_exchange_data(adev);
2928
2929 r = amdgpu_device_mem_scratch_init(adev);
2930 if (r) {
2931 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2932 goto init_failed;
2933 }
2934 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
2935 if (r) {
2936 DRM_ERROR("hw_init %d failed %d\n", i, r);
2937 goto init_failed;
2938 }
2939 r = amdgpu_device_wb_init(adev);
2940 if (r) {
2941 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2942 goto init_failed;
2943 }
2944 adev->ip_blocks[i].status.hw = true;
2945
2946 /* right after GMC hw init, we create CSA */
2947 if (adev->gfx.mcbp) {
2948 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2949 AMDGPU_GEM_DOMAIN_VRAM |
2950 AMDGPU_GEM_DOMAIN_GTT,
2951 AMDGPU_CSA_SIZE);
2952 if (r) {
2953 DRM_ERROR("allocate CSA failed %d\n", r);
2954 goto init_failed;
2955 }
2956 }
2957
2958 r = amdgpu_seq64_init(adev);
2959 if (r) {
2960 DRM_ERROR("allocate seq64 failed %d\n", r);
2961 goto init_failed;
2962 }
2963 }
2964 }
2965
2966 if (amdgpu_sriov_vf(adev))
2967 amdgpu_virt_init_data_exchange(adev);
2968
2969 r = amdgpu_ib_pool_init(adev);
2970 if (r) {
2971 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2972 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2973 goto init_failed;
2974 }
2975
2976 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2977 if (r)
2978 goto init_failed;
2979
2980 r = amdgpu_device_ip_hw_init_phase1(adev);
2981 if (r)
2982 goto init_failed;
2983
2984 r = amdgpu_device_fw_loading(adev);
2985 if (r)
2986 goto init_failed;
2987
2988 r = amdgpu_device_ip_hw_init_phase2(adev);
2989 if (r)
2990 goto init_failed;
2991
2992 /*
2993 * retired pages will be loaded from eeprom and reserved here,
2994 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2995 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2996 * for I2C communication, which is only true at this point.
2997 *
2998 * amdgpu_ras_recovery_init may fail, but the upper layer only cares about
2999 * failures caused by a bad gpu situation and stops the amdgpu init process
3000 * accordingly. For other failure cases, it will still release all
3001 * the resources and print an error message, rather than returning a
3002 * negative value to the upper level.
3003 *
3004 * Note: theoretically, this should be called before all vram allocations
3005 * to protect retired pages from being allocated to other users
3006 */
3007 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3008 r = amdgpu_ras_recovery_init(adev, init_badpage);
3009 if (r)
3010 goto init_failed;
3011
3012 /*
3013 * In case of XGMI, grab an extra reference on the reset domain for this device
3014 */
3015 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3016 if (amdgpu_xgmi_add_device(adev) == 0) {
3017 if (!amdgpu_sriov_vf(adev)) {
3018 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3019
3020 if (WARN_ON(!hive)) {
3021 r = -ENOENT;
3022 goto init_failed;
3023 }
3024
3025 if (!hive->reset_domain ||
3026 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3027 r = -ENOENT;
3028 amdgpu_put_xgmi_hive(hive);
3029 goto init_failed;
3030 }
3031
3032 /* Drop the early temporary reset domain we created for device */
3033 amdgpu_reset_put_reset_domain(adev->reset_domain);
3034 adev->reset_domain = hive->reset_domain;
3035 amdgpu_put_xgmi_hive(hive);
3036 }
3037 }
3038 }
3039
3040 r = amdgpu_device_init_schedulers(adev);
3041 if (r)
3042 goto init_failed;
3043
3044 if (adev->mman.buffer_funcs_ring->sched.ready)
3045 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3046
3047 /* Don't init kfd if whole hive need to be reset during init */
3048 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3049 kgd2kfd_init_zone_device(adev);
3050 amdgpu_amdkfd_device_init(adev);
3051 }
3052
3053 amdgpu_fru_get_product_info(adev);
3054
3055 init_failed:
3056
3057 return r;
3058 }
3059
3060 /**
3061 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3062 *
3063 * @adev: amdgpu_device pointer
3064 *
3065 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3066 * this function before a GPU reset. If the value is retained after a
3067 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3068 */
3069 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3070 {
3071 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3072 }
3073
3074 /**
3075 * amdgpu_device_check_vram_lost - check if vram is valid
3076 *
3077 * @adev: amdgpu_device pointer
3078 *
3079 * Checks the reset magic value written to the gart pointer in VRAM.
3080 * The driver calls this after a GPU reset to see if the contents of
3081 * VRAM are lost or not.
3082 * returns true if vram is lost, false if not.
3083 */
3084 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3085 {
3086 if (memcmp(adev->gart.ptr, adev->reset_magic,
3087 AMDGPU_RESET_MAGIC_NUM))
3088 return true;
3089
3090 if (!amdgpu_in_reset(adev))
3091 return false;
3092
3093 /*
3094 * For all ASICs with baco/mode1 reset, the VRAM is
3095 * always assumed to be lost.
3096 */
3097 switch (amdgpu_asic_reset_method(adev)) {
3098 case AMD_RESET_METHOD_BACO:
3099 case AMD_RESET_METHOD_MODE1:
3100 return true;
3101 default:
3102 return false;
3103 }
3104 }
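
/*
 * Illustrative sketch of how the two helpers above cooperate around a reset
 * (hypothetical reset path, not taken from this file):
 *
 *	amdgpu_device_fill_reset_magic(adev);
 *	... perform the GPU reset ...
 *	if (amdgpu_device_check_vram_lost(adev))
 *		... VRAM contents must be treated as lost and restored ...
 */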
3105
3106 /**
3107 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3108 *
3109 * @adev: amdgpu_device pointer
3110 * @state: clockgating state (gate or ungate)
3111 *
3112 * The list of all the hardware IPs that make up the asic is walked and the
3113 * set_clockgating_state callbacks are run.
3114 * On the late-init pass this enables clockgating for hardware IPs;
3115 * on the fini or suspend pass it disables clockgating for hardware IPs.
3116 * Returns 0 on success, negative error code on failure.
3117 */
3118
3119 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3120 enum amd_clockgating_state state)
3121 {
3122 int i, j, r;
3123
3124 if (amdgpu_emu_mode == 1)
3125 return 0;
3126
3127 for (j = 0; j < adev->num_ip_blocks; j++) {
3128 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3129 if (!adev->ip_blocks[i].status.late_initialized)
3130 continue;
3131 /* skip CG for GFX, SDMA on S0ix */
3132 if (adev->in_s0ix &&
3133 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3134 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3135 continue;
3136 /* skip CG for VCE/UVD, it's handled specially */
3137 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3138 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3139 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3140 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3141 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3142 /* enable clockgating to save power */
3143 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
3144 state);
3145 if (r) {
3146 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3147 adev->ip_blocks[i].version->funcs->name, r);
3148 return r;
3149 }
3150 }
3151 }
3152
3153 return 0;
3154 }
3155
3156 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3157 enum amd_powergating_state state)
3158 {
3159 int i, j, r;
3160
3161 if (amdgpu_emu_mode == 1)
3162 return 0;
3163
3164 for (j = 0; j < adev->num_ip_blocks; j++) {
3165 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3166 if (!adev->ip_blocks[i].status.late_initialized)
3167 continue;
3168 /* skip PG for GFX, SDMA on S0ix */
3169 if (adev->in_s0ix &&
3170 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3171 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3172 continue;
3173 /* skip PG for VCE/UVD, it's handled specially */
3174 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3175 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3176 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3177 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3178 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3179 /* enable powergating to save power */
3180 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
3181 state);
3182 if (r) {
3183 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3184 adev->ip_blocks[i].version->funcs->name, r);
3185 return r;
3186 }
3187 }
3188 }
3189 return 0;
3190 }
3191
3192 static int amdgpu_device_enable_mgpu_fan_boost(void)
3193 {
3194 struct amdgpu_gpu_instance *gpu_ins;
3195 struct amdgpu_device *adev;
3196 int i, ret = 0;
3197
3198 mutex_lock(&mgpu_info.mutex);
3199
3200 /*
3201 * MGPU fan boost feature should be enabled
3202 * only when there are two or more dGPUs in
3203 * the system
3204 */
3205 if (mgpu_info.num_dgpu < 2)
3206 goto out;
3207
3208 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3209 gpu_ins = &(mgpu_info.gpu_ins[i]);
3210 adev = gpu_ins->adev;
3211 if (!(adev->flags & AMD_IS_APU) &&
3212 !gpu_ins->mgpu_fan_enabled) {
3213 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3214 if (ret)
3215 break;
3216
3217 gpu_ins->mgpu_fan_enabled = 1;
3218 }
3219 }
3220
3221 out:
3222 mutex_unlock(&mgpu_info.mutex);
3223
3224 return ret;
3225 }
3226
3227 /**
3228 * amdgpu_device_ip_late_init - run late init for hardware IPs
3229 *
3230 * @adev: amdgpu_device pointer
3231 *
3232 * Late initialization pass for hardware IPs. The list of all the hardware
3233 * IPs that make up the asic is walked and the late_init callbacks are run.
3234 * late_init covers any special initialization that an IP requires
3235 * after all of the IPs have been initialized or something that needs to happen
3236 * late in the init process.
3237 * Returns 0 on success, negative error code on failure.
3238 */
3239 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3240 {
3241 struct amdgpu_gpu_instance *gpu_instance;
3242 int i = 0, r;
3243
3244 for (i = 0; i < adev->num_ip_blocks; i++) {
3245 if (!adev->ip_blocks[i].status.hw)
3246 continue;
3247 if (adev->ip_blocks[i].version->funcs->late_init) {
3248 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3249 if (r) {
3250 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3251 adev->ip_blocks[i].version->funcs->name, r);
3252 return r;
3253 }
3254 }
3255 adev->ip_blocks[i].status.late_initialized = true;
3256 }
3257
3258 r = amdgpu_ras_late_init(adev);
3259 if (r) {
3260 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3261 return r;
3262 }
3263
3264 if (!amdgpu_reset_in_recovery(adev))
3265 amdgpu_ras_set_error_query_ready(adev, true);
3266
3267 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3268 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3269
3270 amdgpu_device_fill_reset_magic(adev);
3271
3272 r = amdgpu_device_enable_mgpu_fan_boost();
3273 if (r)
3274 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3275
3276 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
3277 if (amdgpu_passthrough(adev) &&
3278 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3279 adev->asic_type == CHIP_ALDEBARAN))
3280 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3281
3282 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3283 mutex_lock(&mgpu_info.mutex);
3284
3285 /*
3286 * Reset device p-state to low as this was booted with high.
3287 *
3288 * This should be performed only after all devices from the same
3289 * hive get initialized.
3290 *
3291 * However, the number of devices in the hive is not known in advance;
3292 * it is counted one by one as the devices are initialized.
3293 *
3294 * So, we wait for all XGMI interlinked devices initialized.
3295 * This may bring some delays as those devices may come from
3296 * different hives. But that should be OK.
3297 */
3298 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3299 for (i = 0; i < mgpu_info.num_gpu; i++) {
3300 gpu_instance = &(mgpu_info.gpu_ins[i]);
3301 if (gpu_instance->adev->flags & AMD_IS_APU)
3302 continue;
3303
3304 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3305 AMDGPU_XGMI_PSTATE_MIN);
3306 if (r) {
3307 DRM_ERROR("pstate setting failed (%d).\n", r);
3308 break;
3309 }
3310 }
3311 }
3312
3313 mutex_unlock(&mgpu_info.mutex);
3314 }
3315
3316 return 0;
3317 }
3318
3319 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
3320 {
3321 int r;
3322
3323 if (!ip_block->version->funcs->hw_fini) {
3324 DRM_ERROR("hw_fini of IP block <%s> not defined\n",
3325 ip_block->version->funcs->name);
3326 } else {
3327 r = ip_block->version->funcs->hw_fini(ip_block);
3328 /* XXX handle errors */
3329 if (r) {
3330 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3331 ip_block->version->funcs->name, r);
3332 }
3333 }
3334
3335 ip_block->status.hw = false;
3336 }
3337
3338 /**
3339 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3340 *
3341 * @adev: amdgpu_device pointer
3342 *
3343 * For ASICs that need to disable SMC first
3344 */
3345 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3346 {
3347 int i;
3348
3349 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3350 return;
3351
3352 for (i = 0; i < adev->num_ip_blocks; i++) {
3353 if (!adev->ip_blocks[i].status.hw)
3354 continue;
3355 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3356 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3357 break;
3358 }
3359 }
3360 }
3361
3362 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3363 {
3364 int i, r;
3365
3366 for (i = 0; i < adev->num_ip_blocks; i++) {
3367 if (!adev->ip_blocks[i].version->funcs->early_fini)
3368 continue;
3369
3370 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
3371 if (r) {
3372 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3373 adev->ip_blocks[i].version->funcs->name, r);
3374 }
3375 }
3376
3377 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3378 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3379
3380 amdgpu_amdkfd_suspend(adev, false);
3381
3382 /* Workaround for ASICs that need to disable SMC first */
3383 amdgpu_device_smu_fini_early(adev);
3384
3385 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3386 if (!adev->ip_blocks[i].status.hw)
3387 continue;
3388
3389 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
3390 }
3391
3392 if (amdgpu_sriov_vf(adev)) {
3393 if (amdgpu_virt_release_full_gpu(adev, false))
3394 DRM_ERROR("failed to release exclusive mode on fini\n");
3395 }
3396
3397 return 0;
3398 }
3399
3400 /**
3401 * amdgpu_device_ip_fini - run fini for hardware IPs
3402 *
3403 * @adev: amdgpu_device pointer
3404 *
3405 * Main teardown pass for hardware IPs. The list of all the hardware
3406 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3407 * are run. hw_fini tears down the hardware associated with each IP
3408 * and sw_fini tears down any software state associated with each IP.
3409 * Returns 0 on success, negative error code on failure.
3410 */
3411 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3412 {
3413 int i, r;
3414
3415 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3416 amdgpu_virt_release_ras_err_handler_data(adev);
3417
3418 if (adev->gmc.xgmi.num_physical_nodes > 1)
3419 amdgpu_xgmi_remove_device(adev);
3420
3421 amdgpu_amdkfd_device_fini_sw(adev);
3422
3423 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3424 if (!adev->ip_blocks[i].status.sw)
3425 continue;
3426
3427 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3428 amdgpu_ucode_free_bo(adev);
3429 amdgpu_free_static_csa(&adev->virt.csa_obj);
3430 amdgpu_device_wb_fini(adev);
3431 amdgpu_device_mem_scratch_fini(adev);
3432 amdgpu_ib_pool_fini(adev);
3433 amdgpu_seq64_fini(adev);
3434 }
3435 if (adev->ip_blocks[i].version->funcs->sw_fini) {
3436 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
3437 /* XXX handle errors */
3438 if (r) {
3439 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3440 adev->ip_blocks[i].version->funcs->name, r);
3441 }
3442 }
3443 adev->ip_blocks[i].status.sw = false;
3444 adev->ip_blocks[i].status.valid = false;
3445 }
3446
3447 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3448 if (!adev->ip_blocks[i].status.late_initialized)
3449 continue;
3450 if (adev->ip_blocks[i].version->funcs->late_fini)
3451 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
3452 adev->ip_blocks[i].status.late_initialized = false;
3453 }
3454
3455 amdgpu_ras_fini(adev);
3456
3457 return 0;
3458 }
3459
3460 /**
3461 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3462 *
3463 * @work: work_struct.
3464 */
3465 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3466 {
3467 struct amdgpu_device *adev =
3468 container_of(work, struct amdgpu_device, delayed_init_work.work);
3469 int r;
3470
3471 r = amdgpu_ib_ring_tests(adev);
3472 if (r)
3473 DRM_ERROR("ib ring test failed (%d).\n", r);
3474 }
3475
3476 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3477 {
3478 struct amdgpu_device *adev =
3479 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3480
3481 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3482 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3483
3484 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
3485 adev->gfx.gfx_off_state = true;
3486 }
3487
3488 /**
3489 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3490 *
3491 * @adev: amdgpu_device pointer
3492 *
3493 * Main suspend function for hardware IPs. The list of all the hardware
3494 * IPs that make up the asic is walked, clockgating is disabled and the
3495 * suspend callbacks are run. suspend puts the hardware and software state
3496 * in each IP into a state suitable for suspend.
3497 * Returns 0 on success, negative error code on failure.
3498 */
3499 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3500 {
3501 int i, r;
3502
3503 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3504 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3505
3506 /*
3507 * Per the PMFW team's suggestion, the driver needs to disable the
3508 * gfxoff and DF C-state features for GPU reset scenarios (e.g. Mode1
3509 * reset). Add the missing DF C-state disablement here.
3510 */
3511 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3512 dev_warn(adev->dev, "Failed to disallow df cstate");
3513
3514 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3515 if (!adev->ip_blocks[i].status.valid)
3516 continue;
3517
3518 /* displays are handled separately */
3519 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3520 continue;
3521
3522 /* XXX handle errors */
3523 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3524 if (r)
3525 return r;
3526 }
3527
3528 return 0;
3529 }
3530
3531 /**
3532 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3533 *
3534 * @adev: amdgpu_device pointer
3535 *
3536 * Second suspend phase for hardware IPs. The list of all the hardware
3537 * IPs that make up the asic is walked and the suspend callbacks are run
3538 * for every IP except the display (DCE) IPs handled in phase 1. suspend
3539 * puts each IP into a state suitable for suspend.
3540 * Returns 0 on success, negative error code on failure.
3541 */
3542 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3543 {
3544 int i, r;
3545
3546 if (adev->in_s0ix)
3547 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3548
3549 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3550 if (!adev->ip_blocks[i].status.valid)
3551 continue;
3552 /* displays are handled in phase1 */
3553 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3554 continue;
3555 /* PSP lost connection when err_event_athub occurs */
3556 if (amdgpu_ras_intr_triggered() &&
3557 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3558 adev->ip_blocks[i].status.hw = false;
3559 continue;
3560 }
3561
3562 /* skip unnecessary suspend if the IP has not been initialized yet */
3563 if (!amdgpu_ip_member_of_hwini(
3564 adev, adev->ip_blocks[i].version->type))
3565 continue;
3566
3567 /* skip suspend of gfx/mes and psp for S0ix.
3568 * gfx is in the gfxoff state, so on resume it will exit gfxoff just
3569 * like at runtime. PSP is also part of the always-on hardware,
3570 * so there is no need to suspend it.
3571 */
3572 if (adev->in_s0ix &&
3573 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3574 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3575 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3576 continue;
3577
3578 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3579 if (adev->in_s0ix &&
3580 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3581 IP_VERSION(5, 0, 0)) &&
3582 (adev->ip_blocks[i].version->type ==
3583 AMD_IP_BLOCK_TYPE_SDMA))
3584 continue;
3585
3586 /* During cold boot the PSP provides the IMU and RLC FW binaries to the
3587 * TOS. These reside in the TMR and are expected to be reused by PSP-TOS
3588 * on reload; RLC autoload is also restored from this location based on
3589 * the PMFW -> PSP message during the re-init sequence.
3590 * Therefore, PSP suspend & resume should be skipped to avoid destroying
3591 * the TMR and reloading the FWs again on IMU-enabled APU ASICs.
3592 */
3593 if (amdgpu_in_reset(adev) &&
3594 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3595 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3596 continue;
3597
3598 /* XXX handle errors */
3599 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
3600 adev->ip_blocks[i].status.hw = false;
3601
3602 /* handle putting the SMC in the appropriate state */
3603 if (!amdgpu_sriov_vf(adev)) {
3604 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3605 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3606 if (r) {
3607 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3608 adev->mp1_state, r);
3609 return r;
3610 }
3611 }
3612 }
3613 }
3614
3615 return 0;
3616 }
3617
3618 /**
3619 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3620 *
3621 * @adev: amdgpu_device pointer
3622 *
3623 * Main suspend function for hardware IPs. The list of all the hardware
3624 * IPs that make up the asic is walked, clockgating is disabled and the
3625 * suspend callbacks are run. suspend puts the hardware and software state
3626 * in each IP into a state suitable for suspend.
3627 * Returns 0 on success, negative error code on failure.
3628 */
3629 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3630 {
3631 int r;
3632
3633 if (amdgpu_sriov_vf(adev)) {
3634 amdgpu_virt_fini_data_exchange(adev);
3635 amdgpu_virt_request_full_gpu(adev, false);
3636 }
3637
3638 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3639
3640 r = amdgpu_device_ip_suspend_phase1(adev);
3641 if (r)
3642 return r;
3643 r = amdgpu_device_ip_suspend_phase2(adev);
3644
3645 if (amdgpu_sriov_vf(adev))
3646 amdgpu_virt_release_full_gpu(adev, false);
3647
3648 return r;
3649 }
3650
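/**
 * amdgpu_device_ip_reinit_early_sriov - reinit early hw IPs after VF reset
 *
 * @adev: amdgpu_device pointer
 *
 * Re-run hw_init for the COMMON, GMC, PSP and IH blocks, in that order,
 * as part of the SR-IOV VF reset sequence.
 * Returns 0 on success, negative error code on failure.
 */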
3651 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3652 {
3653 int i, r;
3654
3655 static enum amd_ip_block_type ip_order[] = {
3656 AMD_IP_BLOCK_TYPE_COMMON,
3657 AMD_IP_BLOCK_TYPE_GMC,
3658 AMD_IP_BLOCK_TYPE_PSP,
3659 AMD_IP_BLOCK_TYPE_IH,
3660 };
3661
3662 for (i = 0; i < adev->num_ip_blocks; i++) {
3663 int j;
3664 struct amdgpu_ip_block *block;
3665
3666 block = &adev->ip_blocks[i];
3667 block->status.hw = false;
3668
3669 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3670
3671 if (block->version->type != ip_order[j] ||
3672 !block->status.valid)
3673 continue;
3674
3675 r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
3676 if (r) {
3677 dev_err(adev->dev, "RE-INIT-early: %s failed\n",
3678 block->version->funcs->name);
3679 return r;
3680 }
3681 block->status.hw = true;
3682 }
3683 }
3684
3685 return 0;
3686 }
3687
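/**
 * amdgpu_device_ip_reinit_late_sriov - reinit remaining hw IPs after VF reset
 *
 * @adev: amdgpu_device pointer
 *
 * Re-initialize the remaining hardware blocks (SMC, DCE, GFX, SDMA, MES,
 * UVD, VCE, VCN, JPEG) in a fixed order as part of the SR-IOV VF reset
 * sequence. The SMC block is resumed, the other blocks go through hw_init.
 * Returns 0 on success, negative error code on failure.
 */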
3688 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3689 {
3690 struct amdgpu_ip_block *block;
3691 int i, r = 0;
3692
3693 static enum amd_ip_block_type ip_order[] = {
3694 AMD_IP_BLOCK_TYPE_SMC,
3695 AMD_IP_BLOCK_TYPE_DCE,
3696 AMD_IP_BLOCK_TYPE_GFX,
3697 AMD_IP_BLOCK_TYPE_SDMA,
3698 AMD_IP_BLOCK_TYPE_MES,
3699 AMD_IP_BLOCK_TYPE_UVD,
3700 AMD_IP_BLOCK_TYPE_VCE,
3701 AMD_IP_BLOCK_TYPE_VCN,
3702 AMD_IP_BLOCK_TYPE_JPEG
3703 };
3704
3705 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3706 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]);
3707
3708 if (!block)
3709 continue;
3710
3711 if (block->status.valid && !block->status.hw) {
3712 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
3713 r = amdgpu_ip_block_resume(block);
3714 } else {
3715 r = block->version->funcs->hw_init(block);
3716 }
3717
3718 if (r) {
3719 dev_err(adev->dev, "RE-INIT-late: %s failed\n",
3720 block->version->funcs->name);
3721 break;
3722 }
3723 block->status.hw = true;
3724 }
3725 }
3726
3727 return r;
3728 }
3729
3730 /**
3731 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3732 *
3733 * @adev: amdgpu_device pointer
3734 *
3735 * First resume function for hardware IPs. The list of all the hardware
3736 * IPs that make up the asic is walked and the resume callbacks are run for
3737 * COMMON, GMC, IH and, for SR-IOV, PSP. resume puts the hardware into a functional state
3738 * after a suspend and updates the software state as necessary. This
3739 * function is also used for restoring the GPU after a GPU reset.
3740 * Returns 0 on success, negative error code on failure.
3741 */
3742 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3743 {
3744 int i, r;
3745
3746 for (i = 0; i < adev->num_ip_blocks; i++) {
3747 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3748 continue;
3749 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3750 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3751 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3752 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3753
3754 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3755 if (r)
3756 return r;
3757 }
3758 }
3759
3760 return 0;
3761 }
3762
3763 /**
3764 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3765 *
3766 * @adev: amdgpu_device pointer
3767 *
3768 * Second resume function for hardware IPs. The list of all the hardware
3769 * IPs that make up the asic is walked and the resume callbacks are run for
3770 * all blocks except COMMON, GMC, IH, DCE and PSP. resume puts the hardware into a
3771 * functional state after a suspend and updates the software state as
3772 * necessary. This function is also used for restoring the GPU after a GPU
3773 * reset.
3774 * Returns 0 on success, negative error code on failure.
3775 */
3776 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3777 {
3778 int i, r;
3779
3780 for (i = 0; i < adev->num_ip_blocks; i++) {
3781 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3782 continue;
3783 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3784 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3785 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3786 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3787 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3788 continue;
3789 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3790 if (r)
3791 return r;
3792 }
3793
3794 return 0;
3795 }
3796
3797 /**
3798 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3799 *
3800 * @adev: amdgpu_device pointer
3801 *
3802 * Third resume function for hardware IPs. The list of all the hardware
3803 * IPs that make up the asic is walked and the resume callbacks are run for
3804 * all DCE. resume puts the hardware into a functional state after a suspend
3805 * and updates the software state as necessary. This function is also used
3806 * for restoring the GPU after a GPU reset.
3807 *
3808 * Returns 0 on success, negative error code on failure.
3809 */
3810 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3811 {
3812 int i, r;
3813
3814 for (i = 0; i < adev->num_ip_blocks; i++) {
3815 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3816 continue;
3817 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3818 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3819 if (r)
3820 return r;
3821 }
3822 }
3823
3824 return 0;
3825 }
3826
3827 /**
3828 * amdgpu_device_ip_resume - run resume for hardware IPs
3829 *
3830 * @adev: amdgpu_device pointer
3831 *
3832 * Main resume function for hardware IPs. The hardware IPs
3833 * are split into several resume phases because they are
3834 * also used in recovering from a GPU reset and some additional
3835 * steps need to be taken between them. In this case (S3/S4) they are
3836 * run sequentially.
3837 * Returns 0 on success, negative error code on failure.
3838 */
3839 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3840 {
3841 int r;
3842
3843 r = amdgpu_device_ip_resume_phase1(adev);
3844 if (r)
3845 return r;
3846
3847 r = amdgpu_device_fw_loading(adev);
3848 if (r)
3849 return r;
3850
3851 r = amdgpu_device_ip_resume_phase2(adev);
3852
3853 if (adev->mman.buffer_funcs_ring->sched.ready)
3854 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3855
3856 if (r)
3857 return r;
3858
3859 amdgpu_fence_driver_hw_init(adev);
3860
3861 r = amdgpu_device_ip_resume_phase3(adev);
3862
3863 return r;
3864 }
3865
3866 /**
3867 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3868 *
3869 * @adev: amdgpu_device pointer
3870 *
3871 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3872 */
3873 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3874 {
3875 if (amdgpu_sriov_vf(adev)) {
3876 if (adev->is_atom_fw) {
3877 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3878 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3879 } else {
3880 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3881 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3882 }
3883
3884 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3885 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3886 }
3887 }
3888
3889 /**
3890 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3891 *
3892 * @asic_type: AMD asic type
3893 *
3894 * Check if there is DC (new modesetting infrastructure) support for an asic.
3895 * returns true if DC has support, false if not.
3896 */
3897 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3898 {
3899 switch (asic_type) {
3900 #ifdef CONFIG_DRM_AMDGPU_SI
3901 case CHIP_HAINAN:
3902 #endif
3903 case CHIP_TOPAZ:
3904 /* chips with no display hardware */
3905 return false;
3906 #if defined(CONFIG_DRM_AMD_DC)
3907 case CHIP_TAHITI:
3908 case CHIP_PITCAIRN:
3909 case CHIP_VERDE:
3910 case CHIP_OLAND:
3911 /*
3912 * We have systems in the wild with these ASICs that require
3913 * LVDS and VGA support which is not supported with DC.
3914 *
3915 * Fall back to the non-DC driver here by default so as not to
3916 * cause regressions.
3917 */
3918 #if defined(CONFIG_DRM_AMD_DC_SI)
3919 return amdgpu_dc > 0;
3920 #else
3921 return false;
3922 #endif
3923 case CHIP_BONAIRE:
3924 case CHIP_KAVERI:
3925 case CHIP_KABINI:
3926 case CHIP_MULLINS:
3927 /*
3928 * We have systems in the wild with these ASICs that require
3929 * VGA support which is not supported with DC.
3930 *
3931 * Fall back to the non-DC driver here by default so as not to
3932 * cause regressions.
3933 */
3934 return amdgpu_dc > 0;
3935 default:
3936 return amdgpu_dc != 0;
3937 #else
3938 default:
3939 if (amdgpu_dc > 0)
3940 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3941 return false;
3942 #endif
3943 }
3944 }
3945
3946 /**
3947 * amdgpu_device_has_dc_support - check if dc is supported
3948 *
3949 * @adev: amdgpu_device pointer
3950 *
3951 * Returns true for supported, false for not supported
3952 */
3953 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3954 {
3955 if (adev->enable_virtual_display ||
3956 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3957 return false;
3958
3959 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3960 }
3961
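/**
 * amdgpu_device_xgmi_reset_func - work handler for resetting an XGMI device
 *
 * @__work: work_struct.
 *
 * Reset one device of an XGMI hive, either via BACO enter/exit or a full
 * ASIC reset, synchronized with the other hive members through a task
 * barrier.
 */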
3962 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3963 {
3964 struct amdgpu_device *adev =
3965 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3966 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3967
3968 /* It's a bug to not have a hive within this function */
3969 if (WARN_ON(!hive))
3970 return;
3971
3972 /*
3973 * Use task barrier to synchronize all xgmi reset works across the
3974 * hive. task_barrier_enter and task_barrier_exit will block
3975 * until all the threads running the xgmi reset works reach
3976 * those points. task_barrier_full will do both blocks.
3977 */
3978 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3979
3980 task_barrier_enter(&hive->tb);
3981 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3982
3983 if (adev->asic_reset_res)
3984 goto fail;
3985
3986 task_barrier_exit(&hive->tb);
3987 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3988
3989 if (adev->asic_reset_res)
3990 goto fail;
3991
3992 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
3993 } else {
3994
3995 task_barrier_full(&hive->tb);
3996 adev->asic_reset_res = amdgpu_asic_reset(adev);
3997 }
3998
3999 fail:
4000 if (adev->asic_reset_res)
4001 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
4002 adev->asic_reset_res, adev_to_drm(adev)->unique);
4003 amdgpu_put_xgmi_hive(hive);
4004 }
4005
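/**
 * amdgpu_device_get_job_timeout_settings - parse the lockup timeout parameter
 *
 * @adev: amdgpu_device pointer
 *
 * Parse the amdgpu_lockup_timeout module parameter and set the gfx,
 * compute, sdma and video job timeouts accordingly.
 * Returns 0 on success, negative error code on a malformed parameter.
 */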
4006 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
4007 {
4008 char *input = amdgpu_lockup_timeout;
4009 char *timeout_setting = NULL;
4010 int index = 0;
4011 long timeout;
4012 int ret = 0;
4013
4014 /*
4015 * By default the timeout for non-compute jobs is 10000 ms
4016 * and 60000 ms for compute jobs.
4017 * In SR-IOV or passthrough mode, the timeout for compute
4018 * jobs is 60000 ms by default.
4019 */
4020 adev->gfx_timeout = msecs_to_jiffies(10000);
4021 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4022 if (amdgpu_sriov_vf(adev))
4023 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
4024 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
4025 else
4026 adev->compute_timeout = msecs_to_jiffies(60000);
4027
4028 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4029 while ((timeout_setting = strsep(&input, ",")) &&
4030 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4031 ret = kstrtol(timeout_setting, 0, &timeout);
4032 if (ret)
4033 return ret;
4034
4035 if (timeout == 0) {
4036 index++;
4037 continue;
4038 } else if (timeout < 0) {
4039 timeout = MAX_SCHEDULE_TIMEOUT;
4040 dev_warn(adev->dev, "lockup timeout disabled");
4041 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4042 } else {
4043 timeout = msecs_to_jiffies(timeout);
4044 }
4045
4046 switch (index++) {
4047 case 0:
4048 adev->gfx_timeout = timeout;
4049 break;
4050 case 1:
4051 adev->compute_timeout = timeout;
4052 break;
4053 case 2:
4054 adev->sdma_timeout = timeout;
4055 break;
4056 case 3:
4057 adev->video_timeout = timeout;
4058 break;
4059 default:
4060 break;
4061 }
4062 }
4063 /*
4064 * There is only one value specified and
4065 * it should apply to all non-compute jobs.
4066 */
4067 if (index == 1) {
4068 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4069 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4070 adev->compute_timeout = adev->gfx_timeout;
4071 }
4072 }
4073
4074 return ret;
4075 }
4076
4077 /**
4078 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4079 *
4080 * @adev: amdgpu_device pointer
4081 *
4082 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode.
4083 */
4084 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4085 {
4086 struct iommu_domain *domain;
4087
4088 domain = iommu_get_domain_for_dev(adev->dev);
4089 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4090 adev->ram_is_direct_mapped = true;
4091 }
4092
4093 #if defined(CONFIG_HSA_AMD_P2P)
4094 /**
4095 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4096 *
4097 * @adev: amdgpu_device pointer
4098 *
4099 * Returns true if the IOMMU is remapping DMA (i.e. the BAR address is translated), false otherwise.
4100 */
4101 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4102 {
4103 struct iommu_domain *domain;
4104
4105 domain = iommu_get_domain_for_dev(adev->dev);
4106 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4107 domain->type == IOMMU_DOMAIN_DMA_FQ))
4108 return true;
4109
4110 return false;
4111 }
4112 #endif
4113
4114 static const struct attribute *amdgpu_dev_attributes[] = {
4115 &dev_attr_pcie_replay_count.attr,
4116 NULL
4117 };
4118
4119 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4120 {
4121 if (amdgpu_mcbp == 1)
4122 adev->gfx.mcbp = true;
4123 else if (amdgpu_mcbp == 0)
4124 adev->gfx.mcbp = false;
4125
4126 if (amdgpu_sriov_vf(adev))
4127 adev->gfx.mcbp = true;
4128
4129 if (adev->gfx.mcbp)
4130 DRM_INFO("MCBP is enabled\n");
4131 }
4132
4133 /**
4134 * amdgpu_device_init - initialize the driver
4135 *
4136 * @adev: amdgpu_device pointer
4137 * @flags: driver flags
4138 *
4139 * Initializes the driver info and hw (all asics).
4140 * Returns 0 for success or an error on failure.
4141 * Called at driver startup.
4142 */
4143 int amdgpu_device_init(struct amdgpu_device *adev,
4144 uint32_t flags)
4145 {
4146 struct drm_device *ddev = adev_to_drm(adev);
4147 struct pci_dev *pdev = adev->pdev;
4148 int r, i;
4149 bool px = false;
4150 u32 max_MBps;
4151 int tmp;
4152
4153 adev->shutdown = false;
4154 adev->flags = flags;
4155
4156 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4157 adev->asic_type = amdgpu_force_asic_type;
4158 else
4159 adev->asic_type = flags & AMD_ASIC_MASK;
4160
4161 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4162 if (amdgpu_emu_mode == 1)
4163 adev->usec_timeout *= 10;
4164 adev->gmc.gart_size = 512 * 1024 * 1024;
4165 adev->accel_working = false;
4166 adev->num_rings = 0;
4167 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4168 adev->mman.buffer_funcs = NULL;
4169 adev->mman.buffer_funcs_ring = NULL;
4170 adev->vm_manager.vm_pte_funcs = NULL;
4171 adev->vm_manager.vm_pte_num_scheds = 0;
4172 adev->gmc.gmc_funcs = NULL;
4173 adev->harvest_ip_mask = 0x0;
4174 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4175 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4176
4177 adev->smc_rreg = &amdgpu_invalid_rreg;
4178 adev->smc_wreg = &amdgpu_invalid_wreg;
4179 adev->pcie_rreg = &amdgpu_invalid_rreg;
4180 adev->pcie_wreg = &amdgpu_invalid_wreg;
4181 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4182 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4183 adev->pciep_rreg = &amdgpu_invalid_rreg;
4184 adev->pciep_wreg = &amdgpu_invalid_wreg;
4185 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4186 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4187 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4188 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4189 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4190 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4191 adev->didt_rreg = &amdgpu_invalid_rreg;
4192 adev->didt_wreg = &amdgpu_invalid_wreg;
4193 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4194 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4195 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4196 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4197
4198 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4199 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4200 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4201
4202 /* mutex initialization is all done here so we
4203 * can call these functions again without locking issues
4204 */
4205 mutex_init(&adev->firmware.mutex);
4206 mutex_init(&adev->pm.mutex);
4207 mutex_init(&adev->gfx.gpu_clock_mutex);
4208 mutex_init(&adev->srbm_mutex);
4209 mutex_init(&adev->gfx.pipe_reserve_mutex);
4210 mutex_init(&adev->gfx.gfx_off_mutex);
4211 mutex_init(&adev->gfx.partition_mutex);
4212 mutex_init(&adev->grbm_idx_mutex);
4213 mutex_init(&adev->mn_lock);
4214 mutex_init(&adev->virt.vf_errors.lock);
4215 mutex_init(&adev->virt.rlcg_reg_lock);
4216 hash_init(adev->mn_hash);
4217 mutex_init(&adev->psp.mutex);
4218 mutex_init(&adev->notifier_lock);
4219 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4220 mutex_init(&adev->benchmark_mutex);
4221 mutex_init(&adev->gfx.reset_sem_mutex);
4222 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4223 mutex_init(&adev->enforce_isolation_mutex);
4224 mutex_init(&adev->gfx.kfd_sch_mutex);
4225
4226 amdgpu_device_init_apu_flags(adev);
4227
4228 r = amdgpu_device_check_arguments(adev);
4229 if (r)
4230 return r;
4231
4232 spin_lock_init(&adev->mmio_idx_lock);
4233 spin_lock_init(&adev->smc_idx_lock);
4234 spin_lock_init(&adev->pcie_idx_lock);
4235 spin_lock_init(&adev->uvd_ctx_idx_lock);
4236 spin_lock_init(&adev->didt_idx_lock);
4237 spin_lock_init(&adev->gc_cac_idx_lock);
4238 spin_lock_init(&adev->se_cac_idx_lock);
4239 spin_lock_init(&adev->audio_endpt_idx_lock);
4240 spin_lock_init(&adev->mm_stats.lock);
4241 spin_lock_init(&adev->wb.lock);
4242
4243 INIT_LIST_HEAD(&adev->reset_list);
4244
4245 INIT_LIST_HEAD(&adev->ras_list);
4246
4247 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4248
4249 INIT_DELAYED_WORK(&adev->delayed_init_work,
4250 amdgpu_device_delayed_init_work_handler);
4251 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4252 amdgpu_device_delay_enable_gfx_off);
4253 /*
4254 * Initialize the enforce_isolation work structures for each XCP
4255 * partition. This work handler is responsible for enforcing shader
4256 * isolation on AMD GPUs. It counts the number of emitted fences for
4257 * each GFX and compute ring. If there are any fences, it schedules
4258 * the `enforce_isolation_work` to be run after a delay. If there are
4259 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4260 * runqueue.
4261 */
4262 for (i = 0; i < MAX_XCP; i++) {
4263 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4264 amdgpu_gfx_enforce_isolation_handler);
4265 adev->gfx.enforce_isolation[i].adev = adev;
4266 adev->gfx.enforce_isolation[i].xcp_id = i;
4267 }
4268
4269 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4270
4271 adev->gfx.gfx_off_req_count = 1;
4272 adev->gfx.gfx_off_residency = 0;
4273 adev->gfx.gfx_off_entrycount = 0;
4274 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4275
4276 atomic_set(&adev->throttling_logging_enabled, 1);
4277 /*
4278 * If throttling continues, logging will be performed every minute
4279 * to avoid log flooding. "-1" is subtracted since the thermal
4280 * throttling interrupt comes every second. Thus, the total logging
4281 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4282 * for the throttling interrupt) = 60 seconds.
4283 */
4284 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4285 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);
4286
4287 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4288 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);
4289
4290 /* Registers mapping */
4291 /* TODO: block userspace mapping of io register */
4292 if (adev->asic_type >= CHIP_BONAIRE) {
4293 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4294 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4295 } else {
4296 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4297 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4298 }
4299
4300 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4301 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4302
4303 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4304 if (!adev->rmmio)
4305 return -ENOMEM;
4306
4307 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4308 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4309
4310 /*
4311 * The reset domain needs to be present early, before any XGMI hive is
4312 * discovered and initialized, so that the reset semaphore and the
4313 * in_gpu_reset flag can be used early on during init and before calling RREG32.
4314 */
4315 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4316 if (!adev->reset_domain)
4317 return -ENOMEM;
4318
4319 /* detect hw virtualization here */
4320 amdgpu_detect_virtualization(adev);
4321
4322 amdgpu_device_get_pcie_info(adev);
4323
4324 r = amdgpu_device_get_job_timeout_settings(adev);
4325 if (r) {
4326 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4327 return r;
4328 }
4329
4330 amdgpu_device_set_mcbp(adev);
4331
4332 /*
4333 * Use the default init level, where all blocks are expected to be
4334 * initialized. At present, 'swinit' of the blocks must complete
4335 * before the need for a different init level can be detected.
4336 */
4337 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4338 /* early init functions */
4339 r = amdgpu_device_ip_early_init(adev);
4340 if (r)
4341 return r;
4342
4343 /* Get rid of things like offb */
4344 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4345 if (r)
4346 return r;
4347
4348 /* Enable TMZ based on IP_VERSION */
4349 amdgpu_gmc_tmz_set(adev);
4350
4351 if (amdgpu_sriov_vf(adev) &&
4352 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4353 /* VF MMIO access (except mailbox range) from CPU
4354 * will be blocked during sriov runtime
4355 */
4356 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4357
4358 amdgpu_gmc_noretry_set(adev);
4359 /* Need to get xgmi info early to decide the reset behavior*/
4360 if (adev->gmc.xgmi.supported) {
4361 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4362 if (r)
4363 return r;
4364 }
4365
4366 /* enable PCIE atomic ops */
4367 if (amdgpu_sriov_vf(adev)) {
4368 if (adev->virt.fw_reserve.p_pf2vf)
4369 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4370 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4371 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4372 /* APUs with gfx9 onwards don't rely on PCIe atomics; instead, an
4373 * internal path natively supports atomics, so set have_atomics_support to true.
4374 */
4375 } else if ((adev->flags & AMD_IS_APU) &&
4376 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4377 IP_VERSION(9, 0, 0))) {
4378 adev->have_atomics_support = true;
4379 } else {
4380 adev->have_atomics_support =
4381 !pci_enable_atomic_ops_to_root(adev->pdev,
4382 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4383 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4384 }
4385
4386 if (!adev->have_atomics_support)
4387 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
4388
4389 /* doorbell bar mapping and doorbell index init*/
4390 amdgpu_doorbell_init(adev);
4391
4392 if (amdgpu_emu_mode == 1) {
4393 /* post the asic on emulation mode */
4394 emu_soc_asic_init(adev);
4395 goto fence_driver_init;
4396 }
4397
4398 amdgpu_reset_init(adev);
4399
4400 /* detect if we are with an SRIOV vbios */
4401 if (adev->bios)
4402 amdgpu_device_detect_sriov_bios(adev);
4403
4404 /* check if we need to reset the asic
4405 * E.g., driver was not cleanly unloaded previously, etc.
4406 */
4407 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4408 if (adev->gmc.xgmi.num_physical_nodes) {
4409 dev_info(adev->dev, "Pending hive reset.\n");
4410 amdgpu_set_init_level(adev,
4411 AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
4412 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4413 !amdgpu_device_has_display_hardware(adev)) {
4414 r = psp_gpu_reset(adev);
4415 } else {
4416 tmp = amdgpu_reset_method;
4417 /* It should do a default reset when loading or reloading the driver,
4418 * regardless of the module parameter reset_method.
4419 */
4420 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4421 r = amdgpu_asic_reset(adev);
4422 amdgpu_reset_method = tmp;
4423 }
4424
4425 if (r) {
4426 dev_err(adev->dev, "asic reset on init failed\n");
4427 goto failed;
4428 }
4429 }
4430
4431 /* Post card if necessary */
4432 if (amdgpu_device_need_post(adev)) {
4433 if (!adev->bios) {
4434 dev_err(adev->dev, "no vBIOS found\n");
4435 r = -EINVAL;
4436 goto failed;
4437 }
4438 DRM_INFO("GPU posting now...\n");
4439 r = amdgpu_device_asic_init(adev);
4440 if (r) {
4441 dev_err(adev->dev, "gpu post error!\n");
4442 goto failed;
4443 }
4444 }
4445
4446 if (adev->bios) {
4447 if (adev->is_atom_fw) {
4448 /* Initialize clocks */
4449 r = amdgpu_atomfirmware_get_clock_info(adev);
4450 if (r) {
4451 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4452 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4453 goto failed;
4454 }
4455 } else {
4456 /* Initialize clocks */
4457 r = amdgpu_atombios_get_clock_info(adev);
4458 if (r) {
4459 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4460 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4461 goto failed;
4462 }
4463 /* init i2c buses */
4464 if (!amdgpu_device_has_dc_support(adev))
4465 amdgpu_atombios_i2c_init(adev);
4466 }
4467 }
4468
4469 fence_driver_init:
4470 /* Fence driver */
4471 r = amdgpu_fence_driver_sw_init(adev);
4472 if (r) {
4473 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4474 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4475 goto failed;
4476 }
4477
4478 /* init the mode config */
4479 drm_mode_config_init(adev_to_drm(adev));
4480
4481 r = amdgpu_device_ip_init(adev);
4482 if (r) {
4483 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4484 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4485 goto release_ras_con;
4486 }
4487
4488 amdgpu_fence_driver_hw_init(adev);
4489
4490 dev_info(adev->dev,
4491 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4492 adev->gfx.config.max_shader_engines,
4493 adev->gfx.config.max_sh_per_se,
4494 adev->gfx.config.max_cu_per_sh,
4495 adev->gfx.cu_info.number);
4496
4497 adev->accel_working = true;
4498
4499 amdgpu_vm_check_compute_bug(adev);
4500
4501 /* Initialize the buffer migration limit. */
4502 if (amdgpu_moverate >= 0)
4503 max_MBps = amdgpu_moverate;
4504 else
4505 max_MBps = 8; /* Allow 8 MB/s. */
4506 /* Get a log2 for easy divisions. */
4507 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4508
4509 /*
4510 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4511 * Otherwise the mgpu fan boost feature will be skipped because the
4512 * gpu instance count would be too low.
4513 */
4514 amdgpu_register_gpu_instance(adev);
4515
4516 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4517 * explicit gating rather than handling it automatically.
4518 */
4519 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4520 r = amdgpu_device_ip_late_init(adev);
4521 if (r) {
4522 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4523 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4524 goto release_ras_con;
4525 }
4526 /* must succeed. */
4527 amdgpu_ras_resume(adev);
4528 queue_delayed_work(system_wq, &adev->delayed_init_work,
4529 msecs_to_jiffies(AMDGPU_RESUME_MS));
4530 }
4531
4532 if (amdgpu_sriov_vf(adev)) {
4533 amdgpu_virt_release_full_gpu(adev, true);
4534 flush_delayed_work(&adev->delayed_init_work);
4535 }
4536
4537 /*
4538 * Register these sysfs interfaces after `late_init`, since some of the
4539 * operations performed in `late_init` might affect the creation of the
4540 * sysfs interfaces.
4541 */
4542 r = amdgpu_atombios_sysfs_init(adev);
4543 if (r)
4544 drm_err(&adev->ddev,
4545 "registering atombios sysfs failed (%d).\n", r);
4546
4547 r = amdgpu_pm_sysfs_init(adev);
4548 if (r)
4549 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4550
4551 r = amdgpu_ucode_sysfs_init(adev);
4552 if (r) {
4553 adev->ucode_sysfs_en = false;
4554 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4555 } else
4556 adev->ucode_sysfs_en = true;
4557
4558 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4559 if (r)
4560 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4561
4562 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4563 if (r)
4564 dev_err(adev->dev,
4565 "Could not create amdgpu board attributes\n");
4566
4567 amdgpu_fru_sysfs_init(adev);
4568 amdgpu_reg_state_sysfs_init(adev);
4569 amdgpu_xcp_cfg_sysfs_init(adev);
4570
4571 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
4572 r = amdgpu_pmu_init(adev);
4573 if (r)
4574 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
}
4575
4576 /* Have stored pci confspace at hand for restore in sudden PCI error */
4577 if (amdgpu_device_cache_pci_state(adev->pdev))
4578 pci_restore_state(pdev);
4579
4580 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4581 /* this will fail for cards that aren't VGA class devices, just
4582 * ignore it
4583 */
4584 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4585 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4586
4587 px = amdgpu_device_supports_px(ddev);
4588
4589 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4590 apple_gmux_detect(NULL, NULL)))
4591 vga_switcheroo_register_client(adev->pdev,
4592 &amdgpu_switcheroo_ops, px);
4593
4594 if (px)
4595 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4596
4597 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4598 amdgpu_xgmi_reset_on_init(adev);
4599
4600 amdgpu_device_check_iommu_direct_map(adev);
4601
4602 return 0;
4603
4604 release_ras_con:
4605 if (amdgpu_sriov_vf(adev))
4606 amdgpu_virt_release_full_gpu(adev, true);
4607
4608 /* failed in exclusive mode due to timeout */
4609 if (amdgpu_sriov_vf(adev) &&
4610 !amdgpu_sriov_runtime(adev) &&
4611 amdgpu_virt_mmio_blocked(adev) &&
4612 !amdgpu_virt_wait_reset(adev)) {
4613 dev_err(adev->dev, "VF exclusive mode timeout\n");
4614 /* Don't send request since VF is inactive. */
4615 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4616 adev->virt.ops = NULL;
4617 r = -EAGAIN;
4618 }
4619 amdgpu_release_ras_context(adev);
4620
4621 failed:
4622 amdgpu_vf_error_trans_all(adev);
4623
4624 return r;
4625 }
4626
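/**
 * amdgpu_device_unmap_mmio - unmap all device MMIO resources
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down all CPU mappings of the device: userspace mappings, doorbells,
 * registers and the VRAM aperture. Used when the DRM device has been
 * unplugged.
 */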
4627 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4628 {
4629
4630 /* Clear all CPU mappings pointing to this device */
4631 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4632
4633 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4634 amdgpu_doorbell_fini(adev);
4635
4636 iounmap(adev->rmmio);
4637 adev->rmmio = NULL;
4638 if (adev->mman.aper_base_kaddr)
4639 iounmap(adev->mman.aper_base_kaddr);
4640 adev->mman.aper_base_kaddr = NULL;
4641
4642 /* Memory manager related */
4643 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4644 arch_phys_wc_del(adev->gmc.vram_mtrr);
4645 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4646 }
4647 }
4648
4649 /**
4650 * amdgpu_device_fini_hw - tear down the driver
4651 *
4652 * @adev: amdgpu_device pointer
4653 *
4654 * Tear down the driver info (all asics).
4655 * Called at driver shutdown.
4656 */
4657 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4658 {
4659 dev_info(adev->dev, "amdgpu: finishing device.\n");
4660 flush_delayed_work(&adev->delayed_init_work);
4661
4662 if (adev->mman.initialized)
4663 drain_workqueue(adev->mman.bdev.wq);
4664 adev->shutdown = true;
4665
4666 /* make sure IB test finished before entering exclusive mode
4667 * to avoid preemption on IB test
4668 */
4669 if (amdgpu_sriov_vf(adev)) {
4670 amdgpu_virt_request_full_gpu(adev, false);
4671 amdgpu_virt_fini_data_exchange(adev);
4672 }
4673
4674 /* disable all interrupts */
4675 amdgpu_irq_disable_all(adev);
4676 if (adev->mode_info.mode_config_initialized) {
4677 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4678 drm_helper_force_disable_all(adev_to_drm(adev));
4679 else
4680 drm_atomic_helper_shutdown(adev_to_drm(adev));
4681 }
4682 amdgpu_fence_driver_hw_fini(adev);
4683
4684 if (adev->pm.sysfs_initialized)
4685 amdgpu_pm_sysfs_fini(adev);
4686 if (adev->ucode_sysfs_en)
4687 amdgpu_ucode_sysfs_fini(adev);
4688 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4689 amdgpu_fru_sysfs_fini(adev);
4690
4691 amdgpu_reg_state_sysfs_fini(adev);
4692 amdgpu_xcp_cfg_sysfs_fini(adev);
4693
4694 /* disable ras feature must before hw fini */
4695 amdgpu_ras_pre_fini(adev);
4696
4697 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4698
4699 amdgpu_device_ip_fini_early(adev);
4700
4701 amdgpu_irq_fini_hw(adev);
4702
4703 if (adev->mman.initialized)
4704 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4705
4706 amdgpu_gart_dummy_page_fini(adev);
4707
4708 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4709 amdgpu_device_unmap_mmio(adev);
4710
4711 }
4712
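/**
 * amdgpu_device_fini_sw - tear down the driver software state
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the software state of the driver (all asics): IP blocks, fence
 * driver, i2c buses, atombios data, remaining MMIO mappings and the reset
 * domain. Called at driver shutdown after amdgpu_device_fini_hw().
 */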
4713 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4714 {
4715 int idx;
4716 bool px;
4717
4718 amdgpu_device_ip_fini(adev);
4719 amdgpu_fence_driver_sw_fini(adev);
4720 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4721 adev->accel_working = false;
4722 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4723
4724 amdgpu_reset_fini(adev);
4725
4726 /* free i2c buses */
4727 if (!amdgpu_device_has_dc_support(adev))
4728 amdgpu_i2c_fini(adev);
4729
4730 if (amdgpu_emu_mode != 1)
4731 amdgpu_atombios_fini(adev);
4732
4733 kfree(adev->bios);
4734 adev->bios = NULL;
4735
4736 kfree(adev->fru_info);
4737 adev->fru_info = NULL;
4738
4739 px = amdgpu_device_supports_px(adev_to_drm(adev));
4740
4741 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4742 apple_gmux_detect(NULL, NULL)))
4743 vga_switcheroo_unregister_client(adev->pdev);
4744
4745 if (px)
4746 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4747
4748 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4749 vga_client_unregister(adev->pdev);
4750
4751 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4752
4753 iounmap(adev->rmmio);
4754 adev->rmmio = NULL;
4755 amdgpu_doorbell_fini(adev);
4756 drm_dev_exit(idx);
4757 }
4758
4759 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4760 amdgpu_pmu_fini(adev);
4761 if (adev->mman.discovery_bin)
4762 amdgpu_discovery_fini(adev);
4763
4764 amdgpu_reset_put_reset_domain(adev->reset_domain);
4765 adev->reset_domain = NULL;
4766
4767 kfree(adev->pci_state);
4768
4769 }
4770
4771 /**
4772 * amdgpu_device_evict_resources - evict device resources
4773 * @adev: amdgpu device object
4774 *
4775 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4776 * of the vram memory type. Mainly used for evicting device resources
4777 * at suspend time.
4778 *
4779 */
4780 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4781 {
4782 int ret;
4783
4784 /* No need to evict vram on APUs for suspend to ram or s2idle */
4785 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4786 return 0;
4787
4788 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4789 if (ret)
4790 DRM_WARN("evicting device resources failed\n");
4791 return ret;
4792 }
4793
4794 /*
4795 * Suspend & resume.
4796 */
4797 /**
4798 * amdgpu_device_prepare - prepare for device suspend
4799 *
4800 * @dev: drm dev pointer
4801 *
4802 * Prepare to put the hw in the suspend state (all asics).
4803 * Returns 0 for success or an error on failure.
4804 * Called at driver suspend.
4805 */
4806 int amdgpu_device_prepare(struct drm_device *dev)
4807 {
4808 struct amdgpu_device *adev = drm_to_adev(dev);
4809 int i, r;
4810
4811 amdgpu_choose_low_power_state(adev);
4812
4813 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4814 return 0;
4815
4816 /* Evict the majority of BOs before starting suspend sequence */
4817 r = amdgpu_device_evict_resources(adev);
4818 if (r)
4819 goto unprepare;
4820
4821 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4822
4823 for (i = 0; i < adev->num_ip_blocks; i++) {
4824 if (!adev->ip_blocks[i].status.valid)
4825 continue;
4826 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4827 continue;
4828 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
4829 if (r)
4830 goto unprepare;
4831 }
4832
4833 return 0;
4834
4835 unprepare:
4836 adev->in_s0ix = adev->in_s3 = false;
4837
4838 return r;
4839 }
4840
4841 /**
4842 * amdgpu_device_suspend - initiate device suspend
4843 *
4844 * @dev: drm dev pointer
4845 * @notify_clients: notify in-kernel DRM clients
4846 *
4847 * Puts the hw in the suspend state (all asics).
4848 * Returns 0 for success or an error on failure.
4849 * Called at driver suspend.
4850 */
4851 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
4852 {
4853 struct amdgpu_device *adev = drm_to_adev(dev);
4854 int r = 0;
4855
4856 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4857 return 0;
4858
4859 adev->in_suspend = true;
4860
4861 if (amdgpu_sriov_vf(adev)) {
4862 amdgpu_virt_fini_data_exchange(adev);
4863 r = amdgpu_virt_request_full_gpu(adev, false);
4864 if (r)
4865 return r;
4866 }
4867
4868 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4869 DRM_WARN("smart shift update failed\n");
4870
4871 if (notify_clients)
4872 drm_client_dev_suspend(adev_to_drm(adev), false);
4873
4874 cancel_delayed_work_sync(&adev->delayed_init_work);
4875
4876 amdgpu_ras_suspend(adev);
4877
4878 amdgpu_device_ip_suspend_phase1(adev);
4879
4880 if (!adev->in_s0ix)
4881 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4882
4883 r = amdgpu_device_evict_resources(adev);
4884 if (r)
4885 return r;
4886
4887 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4888
4889 amdgpu_fence_driver_hw_fini(adev);
4890
4891 amdgpu_device_ip_suspend_phase2(adev);
4892
4893 if (amdgpu_sriov_vf(adev))
4894 amdgpu_virt_release_full_gpu(adev, false);
4895
4896 r = amdgpu_dpm_notify_rlc_state(adev, false);
4897 if (r)
4898 return r;
4899
4900 return 0;
4901 }
4902
4903 /**
4904 * amdgpu_device_resume - initiate device resume
4905 *
4906 * @dev: drm dev pointer
4907 * @notify_clients: notify in-kernel DRM clients
4908 *
4909 * Bring the hw back to operating state (all asics).
4910 * Returns 0 for success or an error on failure.
4911 * Called at driver resume.
4912 */
4913 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
4914 {
4915 struct amdgpu_device *adev = drm_to_adev(dev);
4916 int r = 0;
4917
4918 if (amdgpu_sriov_vf(adev)) {
4919 r = amdgpu_virt_request_full_gpu(adev, true);
4920 if (r)
4921 return r;
4922 }
4923
4924 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4925 return 0;
4926
4927 if (adev->in_s0ix)
4928 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4929
4930 /* post card */
4931 if (amdgpu_device_need_post(adev)) {
4932 r = amdgpu_device_asic_init(adev);
4933 if (r)
4934 dev_err(adev->dev, "amdgpu asic init failed\n");
4935 }
4936
4937 r = amdgpu_device_ip_resume(adev);
4938
4939 if (r) {
4940 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4941 goto exit;
4942 }
4943
4944 if (!adev->in_s0ix) {
4945 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4946 if (r)
4947 goto exit;
4948 }
4949
4950 r = amdgpu_device_ip_late_init(adev);
4951 if (r)
4952 goto exit;
4953
4954 queue_delayed_work(system_wq, &adev->delayed_init_work,
4955 msecs_to_jiffies(AMDGPU_RESUME_MS));
4956 exit:
4957 if (amdgpu_sriov_vf(adev)) {
4958 amdgpu_virt_init_data_exchange(adev);
4959 amdgpu_virt_release_full_gpu(adev, true);
4960 }
4961
4962 if (r)
4963 return r;
4964
4965 /* Make sure IB tests flushed */
4966 flush_delayed_work(&adev->delayed_init_work);
4967
4968 if (notify_clients)
4969 drm_client_dev_resume(adev_to_drm(adev), false);
4970
4971 amdgpu_ras_resume(adev);
4972
4973 if (adev->mode_info.num_crtc) {
4974 /*
4975 * Most of the connector probing functions try to acquire runtime pm
4976 * refs to ensure that the GPU is powered on when connector polling is
4977 * performed. Since we're calling this from a runtime PM callback,
4978 * trying to acquire rpm refs will cause us to deadlock.
4979 *
4980 * Since we're guaranteed to be holding the rpm lock, it's safe to
4981 * temporarily disable the rpm helpers so this doesn't deadlock us.
4982 */
4983 #ifdef CONFIG_PM
4984 dev->dev->power.disable_depth++;
4985 #endif
4986 if (!adev->dc_enabled)
4987 drm_helper_hpd_irq_event(dev);
4988 else
4989 drm_kms_helper_hotplug_event(dev);
4990 #ifdef CONFIG_PM
4991 dev->dev->power.disable_depth--;
4992 #endif
4993 }
4994 adev->in_suspend = false;
4995
4996 if (adev->enable_mes)
4997 amdgpu_mes_self_test(adev);
4998
4999 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
5000 DRM_WARN("smart shift update failed\n");
5001
5002 return 0;
5003 }
5004
5005 /**
5006 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
5007 *
5008 * @adev: amdgpu_device pointer
5009 *
5010 * The list of all the hardware IPs that make up the asic is walked and
5011 * the check_soft_reset callbacks are run. check_soft_reset determines
5012 * if the asic is still hung or not.
5013 * Returns true if any of the IPs are still in a hung state, false if not.
5014 */
5015 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
5016 {
5017 int i;
5018 bool asic_hang = false;
5019
5020 if (amdgpu_sriov_vf(adev))
5021 return true;
5022
5023 if (amdgpu_asic_need_full_reset(adev))
5024 return true;
5025
5026 for (i = 0; i < adev->num_ip_blocks; i++) {
5027 if (!adev->ip_blocks[i].status.valid)
5028 continue;
5029 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
5030 adev->ip_blocks[i].status.hang =
5031 adev->ip_blocks[i].version->funcs->check_soft_reset(
5032 &adev->ip_blocks[i]);
5033 if (adev->ip_blocks[i].status.hang) {
5034 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
5035 asic_hang = true;
5036 }
5037 }
5038 return asic_hang;
5039 }
5040
5041 /**
5042 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5043 *
5044 * @adev: amdgpu_device pointer
5045 *
5046 * The list of all the hardware IPs that make up the asic is walked and the
5047 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5048 * handles any IP specific hardware or software state changes that are
5049 * necessary for a soft reset to succeed.
5050 * Returns 0 on success, negative error code on failure.
5051 */
5052 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5053 {
5054 int i, r = 0;
5055
5056 for (i = 0; i < adev->num_ip_blocks; i++) {
5057 if (!adev->ip_blocks[i].status.valid)
5058 continue;
5059 if (adev->ip_blocks[i].status.hang &&
5060 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5061 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
5062 if (r)
5063 return r;
5064 }
5065 }
5066
5067 return 0;
5068 }
5069
5070 /**
5071 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5072 *
5073 * @adev: amdgpu_device pointer
5074 *
5075 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5076 * reset is necessary to recover.
5077 * Returns true if a full asic reset is required, false if not.
5078 */
5079 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5080 {
5081 int i;
5082
5083 if (amdgpu_asic_need_full_reset(adev))
5084 return true;
5085
5086 for (i = 0; i < adev->num_ip_blocks; i++) {
5087 if (!adev->ip_blocks[i].status.valid)
5088 continue;
5089 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5090 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5091 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5092 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5093 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5094 if (adev->ip_blocks[i].status.hang) {
5095 dev_info(adev->dev, "Some block need full reset!\n");
5096 return true;
5097 }
5098 }
5099 }
5100 return false;
5101 }
5102
5103 /**
5104 * amdgpu_device_ip_soft_reset - do a soft reset
5105 *
5106 * @adev: amdgpu_device pointer
5107 *
5108 * The list of all the hardware IPs that make up the asic is walked and the
5109 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5110 * IP specific hardware or software state changes that are necessary to soft
5111 * reset the IP.
5112 * Returns 0 on success, negative error code on failure.
5113 */
5114 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5115 {
5116 int i, r = 0;
5117
5118 for (i = 0; i < adev->num_ip_blocks; i++) {
5119 if (!adev->ip_blocks[i].status.valid)
5120 continue;
5121 if (adev->ip_blocks[i].status.hang &&
5122 adev->ip_blocks[i].version->funcs->soft_reset) {
5123 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5124 if (r)
5125 return r;
5126 }
5127 }
5128
5129 return 0;
5130 }
5131
5132 /**
5133 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5134 *
5135 * @adev: amdgpu_device pointer
5136 *
5137 * The list of all the hardware IPs that make up the asic is walked and the
5138 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5139 * handles any IP specific hardware or software state changes that are
5140 * necessary after the IP has been soft reset.
5141 * Returns 0 on success, negative error code on failure.
5142 */
5143 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5144 {
5145 int i, r = 0;
5146
5147 for (i = 0; i < adev->num_ip_blocks; i++) {
5148 if (!adev->ip_blocks[i].status.valid)
5149 continue;
5150 if (adev->ip_blocks[i].status.hang &&
5151 adev->ip_blocks[i].version->funcs->post_soft_reset)
5152 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5153 if (r)
5154 return r;
5155 }
5156
5157 return 0;
5158 }
5159
5160 /**
5161 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5162 *
5163 * @adev: amdgpu_device pointer
5164 * @reset_context: amdgpu reset context pointer
5165 *
5166 * Do a VF FLR and reinitialize the ASIC.
5167 * Returns 0 on success, negative error code on failure.
5168 */
5169 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5170 struct amdgpu_reset_context *reset_context)
5171 {
5172 int r;
5173 struct amdgpu_hive_info *hive = NULL;
5174
5175 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5176 if (!amdgpu_ras_get_fed_status(adev))
5177 amdgpu_virt_ready_to_reset(adev);
5178 amdgpu_virt_wait_reset(adev);
5179 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5180 r = amdgpu_virt_request_full_gpu(adev, true);
5181 } else {
5182 r = amdgpu_virt_reset_gpu(adev);
5183 }
5184 if (r)
5185 return r;
5186
5187 amdgpu_ras_set_fed(adev, false);
5188 amdgpu_irq_gpu_reset_resume_helper(adev);
5189
5190 /* some SW cleanup the VF needs to do before recovery */
5191 amdgpu_virt_post_reset(adev);
5192
5193 /* Resume IP prior to SMC */
5194 r = amdgpu_device_ip_reinit_early_sriov(adev);
5195 if (r)
5196 return r;
5197
5198 amdgpu_virt_init_data_exchange(adev);
5199
5200 r = amdgpu_device_fw_loading(adev);
5201 if (r)
5202 return r;
5203
5204 /* now we are okay to resume SMC/CP/SDMA */
5205 r = amdgpu_device_ip_reinit_late_sriov(adev);
5206 if (r)
5207 return r;
5208
5209 hive = amdgpu_get_xgmi_hive(adev);
5210 /* Update PSP FW topology after reset */
5211 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5212 r = amdgpu_xgmi_update_topology(hive, adev);
5213 if (hive)
5214 amdgpu_put_xgmi_hive(hive);
5215 if (r)
5216 return r;
5217
5218 r = amdgpu_ib_ring_tests(adev);
5219 if (r)
5220 return r;
5221
5222 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5223 amdgpu_inc_vram_lost(adev);
5224
5225 /* need to be called during full access so we can't do it later like
5226 * bare-metal does.
5227 */
5228 amdgpu_amdkfd_post_reset(adev);
5229 amdgpu_virt_release_full_gpu(adev, true);
5230
5231 /* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so RAS needs to be resumed during reset */
5232 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5233 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5234 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5235 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5236 amdgpu_ras_resume(adev);
5237
5238 amdgpu_virt_ras_telemetry_post_reset(adev);
5239
5240 return 0;
5241 }
5242
5243 /**
5244 * amdgpu_device_has_job_running - check if there is any job in mirror list
5245 *
5246 * @adev: amdgpu_device pointer
5247 *
5248 * check if there is any job in mirror list
5249 */
5250 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5251 {
5252 int i;
5253 struct drm_sched_job *job;
5254
5255 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5256 struct amdgpu_ring *ring = adev->rings[i];
5257
5258 if (!amdgpu_ring_sched_ready(ring))
5259 continue;
5260
5261 spin_lock(&ring->sched.job_list_lock);
5262 job = list_first_entry_or_null(&ring->sched.pending_list,
5263 struct drm_sched_job, list);
5264 spin_unlock(&ring->sched.job_list_lock);
5265 if (job)
5266 return true;
5267 }
5268 return false;
5269 }
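
/*
 * Illustrative usage only (not taken from the driver): a caller that wants
 * to act only when every scheduler pending list is empty can poll this
 * helper, e.g.:
 *
 *	if (!amdgpu_device_has_job_running(adev)) {
 *		// no job left on any ring, safe to proceed with idling
 *	}
 */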
5270
5271 /**
5272 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5273 *
5274 * @adev: amdgpu_device pointer
5275 *
5276 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5277 * a hung GPU.
5278 */
5279 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5280 {
5281
5282 if (amdgpu_gpu_recovery == 0)
5283 goto disabled;
5284
5285 /* Skip soft reset check in fatal error mode */
5286 if (!amdgpu_ras_is_poison_mode_supported(adev))
5287 return true;
5288
5289 if (amdgpu_sriov_vf(adev))
5290 return true;
5291
5292 if (amdgpu_gpu_recovery == -1) {
5293 switch (adev->asic_type) {
5294 #ifdef CONFIG_DRM_AMDGPU_SI
5295 case CHIP_VERDE:
5296 case CHIP_TAHITI:
5297 case CHIP_PITCAIRN:
5298 case CHIP_OLAND:
5299 case CHIP_HAINAN:
5300 #endif
5301 #ifdef CONFIG_DRM_AMDGPU_CIK
5302 case CHIP_KAVERI:
5303 case CHIP_KABINI:
5304 case CHIP_MULLINS:
5305 #endif
5306 case CHIP_CARRIZO:
5307 case CHIP_STONEY:
5308 case CHIP_CYAN_SKILLFISH:
5309 goto disabled;
5310 default:
5311 break;
5312 }
5313 }
5314
5315 return true;
5316
5317 disabled:
5318 dev_info(adev->dev, "GPU recovery disabled.\n");
5319 return false;
5320 }
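
/*
 * Sketch only: callers normally gate recovery on this helper, roughly:
 *
 *	if (amdgpu_device_should_recover_gpu(adev))
 *		r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 *
 * where reset_context is a locally prepared struct amdgpu_reset_context
 * (see amdgpu_device_gpu_recover() below for the fields it consumes).
 */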
5321
5322 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5323 {
5324 u32 i;
5325 int ret = 0;
5326
5327 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5328
5329 dev_info(adev->dev, "GPU mode1 reset\n");
5330
5331 /* Cache the state before bus master disable. The saved config space
5332 * values are used in other cases like restore after mode-2 reset.
5333 */
5334 amdgpu_device_cache_pci_state(adev->pdev);
5335
5336 /* disable BM */
5337 pci_clear_master(adev->pdev);
5338
5339 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5340 dev_info(adev->dev, "GPU smu mode1 reset\n");
5341 ret = amdgpu_dpm_mode1_reset(adev);
5342 } else {
5343 dev_info(adev->dev, "GPU psp mode1 reset\n");
5344 ret = psp_gpu_reset(adev);
5345 }
5346
5347 if (ret)
5348 goto mode1_reset_failed;
5349
5350 amdgpu_device_load_pci_state(adev->pdev);
5351 ret = amdgpu_psp_wait_for_bootloader(adev);
5352 if (ret)
5353 goto mode1_reset_failed;
5354
5355 /* wait for asic to come out of reset */
5356 for (i = 0; i < adev->usec_timeout; i++) {
5357 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5358
5359 if (memsize != 0xffffffff)
5360 break;
5361 udelay(1);
5362 }
5363
5364 if (i >= adev->usec_timeout) {
5365 ret = -ETIMEDOUT;
5366 goto mode1_reset_failed;
5367 }
5368
5369 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5370
5371 return 0;
5372
5373 mode1_reset_failed:
5374 dev_err(adev->dev, "GPU mode1 reset failed\n");
5375 return ret;
5376 }
5377
5378 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5379 struct amdgpu_reset_context *reset_context)
5380 {
5381 int i, r = 0;
5382 struct amdgpu_job *job = NULL;
5383 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5384 bool need_full_reset =
5385 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5386
5387 if (reset_context->reset_req_dev == adev)
5388 job = reset_context->job;
5389
5390 if (amdgpu_sriov_vf(adev))
5391 amdgpu_virt_pre_reset(adev);
5392
5393 amdgpu_fence_driver_isr_toggle(adev, true);
5394
5395 /* block all schedulers and reset given job's ring */
5396 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5397 struct amdgpu_ring *ring = adev->rings[i];
5398
5399 if (!amdgpu_ring_sched_ready(ring))
5400 continue;
5401
5402 /* Clear the job fences from the fence driver to avoid force_completion
5403 * on them; leave NULL and VM flush fences in the fence driver.
5404 */
5405 amdgpu_fence_driver_clear_job_fences(ring);
5406
5407 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5408 amdgpu_fence_driver_force_completion(ring);
5409 }
5410
5411 amdgpu_fence_driver_isr_toggle(adev, false);
5412
5413 if (job && job->vm)
5414 drm_sched_increase_karma(&job->base);
5415
5416 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5417 /* If reset handler not implemented, continue; otherwise return */
5418 if (r == -EOPNOTSUPP)
5419 r = 0;
5420 else
5421 return r;
5422
5423 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5424 if (!amdgpu_sriov_vf(adev)) {
5425
5426 if (!need_full_reset)
5427 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5428
5429 if (!need_full_reset && amdgpu_gpu_recovery &&
5430 amdgpu_device_ip_check_soft_reset(adev)) {
5431 amdgpu_device_ip_pre_soft_reset(adev);
5432 r = amdgpu_device_ip_soft_reset(adev);
5433 amdgpu_device_ip_post_soft_reset(adev);
5434 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5435 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5436 need_full_reset = true;
5437 }
5438 }
5439
5440 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5441 dev_info(tmp_adev->dev, "Dumping IP State\n");
5442 /* Trigger ip dump before we reset the asic */
5443 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5444 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5445 tmp_adev->ip_blocks[i].version->funcs
5446 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
5447 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5448 }
5449
5450 if (need_full_reset)
5451 r = amdgpu_device_ip_suspend(adev);
5452 if (need_full_reset)
5453 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5454 else
5455 clear_bit(AMDGPU_NEED_FULL_RESET,
5456 &reset_context->flags);
5457 }
5458
5459 return r;
5460 }
5461
5462 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
5463 {
5464 struct list_head *device_list_handle;
5465 bool full_reset, vram_lost = false;
5466 struct amdgpu_device *tmp_adev;
5467 int r, init_level;
5468
5469 device_list_handle = reset_context->reset_device_list;
5470
5471 if (!device_list_handle)
5472 return -EINVAL;
5473
5474 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5475
5476 /*
5477 * If this is a reset on init, use the default init level, otherwise
5478 * keep the level as the reset-recovery level.
5479 */
5480 if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
5481 init_level = AMDGPU_INIT_LEVEL_DEFAULT;
5482 else
5483 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
5484
5485 r = 0;
5486 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5487 amdgpu_set_init_level(tmp_adev, init_level);
5488 if (full_reset) {
5489 /* post card */
5490 amdgpu_ras_set_fed(tmp_adev, false);
5491 r = amdgpu_device_asic_init(tmp_adev);
5492 if (r) {
5493 dev_warn(tmp_adev->dev, "asic atom init failed!");
5494 } else {
5495 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5496
5497 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5498 if (r)
5499 goto out;
5500
5501 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5502
5503 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5504 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5505
5506 if (vram_lost) {
5507 DRM_INFO("VRAM is lost due to GPU reset!\n");
5508 amdgpu_inc_vram_lost(tmp_adev);
5509 }
5510
5511 r = amdgpu_device_fw_loading(tmp_adev);
5512 if (r)
5513 return r;
5514
5515 r = amdgpu_xcp_restore_partition_mode(
5516 tmp_adev->xcp_mgr);
5517 if (r)
5518 goto out;
5519
5520 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5521 if (r)
5522 goto out;
5523
5524 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5525 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5526
5527 r = amdgpu_device_ip_resume_phase3(tmp_adev);
5528 if (r)
5529 goto out;
5530
5531 if (vram_lost)
5532 amdgpu_device_fill_reset_magic(tmp_adev);
5533
5534 /*
5535 * Add this ASIC back as tracked, as the reset already
5536 * completed successfully.
5537 */
5538 amdgpu_register_gpu_instance(tmp_adev);
5539
5540 if (!reset_context->hive &&
5541 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5542 amdgpu_xgmi_add_device(tmp_adev);
5543
5544 r = amdgpu_device_ip_late_init(tmp_adev);
5545 if (r)
5546 goto out;
5547
5548 drm_client_dev_resume(adev_to_drm(tmp_adev), false);
5549
5550 /*
5551 * The GPU enters a bad state once the number of
5552 * faulty pages reported by ECC reaches the
5553 * threshold, and RAS recovery is scheduled next.
5554 * So add a check here to break recovery if the
5555 * bad page threshold has indeed been exceeded,
5556 * and remind the user to retire this GPU or to
5557 * set a bigger bad_page_threshold value the next
5558 * time the driver is probed.
5559 */
5560 if (!amdgpu_ras_is_rma(tmp_adev)) {
5561 /* must succeed. */
5562 amdgpu_ras_resume(tmp_adev);
5563 } else {
5564 r = -EINVAL;
5565 goto out;
5566 }
5567
5568 /* Update PSP FW topology after reset */
5569 if (reset_context->hive &&
5570 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5571 r = amdgpu_xgmi_update_topology(
5572 reset_context->hive, tmp_adev);
5573 }
5574 }
5575
5576 out:
5577 if (!r) {
5578 /* IP init is complete now, set level as default */
5579 amdgpu_set_init_level(tmp_adev,
5580 AMDGPU_INIT_LEVEL_DEFAULT);
5581 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5582 r = amdgpu_ib_ring_tests(tmp_adev);
5583 if (r) {
5584 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5585 r = -EAGAIN;
5586 goto end;
5587 }
5588 }
5589
5590 if (r)
5591 tmp_adev->asic_reset_res = r;
5592 }
5593
5594 end:
5595 return r;
5596 }
5597
5598 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5599 struct amdgpu_reset_context *reset_context)
5600 {
5601 struct amdgpu_device *tmp_adev = NULL;
5602 bool need_full_reset, skip_hw_reset;
5603 int r = 0;
5604
5605 /* Try reset handler method first */
5606 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5607 reset_list);
5608
5609 reset_context->reset_device_list = device_list_handle;
5610 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5611 /* If reset handler not implemented, continue; otherwise return */
5612 if (r == -EOPNOTSUPP)
5613 r = 0;
5614 else
5615 return r;
5616
5617 /* Reset handler not implemented, use the default method */
5618 need_full_reset =
5619 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5620 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5621
5622 /*
5623 * ASIC reset has to be done on all XGMI hive nodes ASAP
5624 * to allow proper links negotiation in FW (within 1 sec)
5625 */
5626 if (!skip_hw_reset && need_full_reset) {
5627 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5628 /* For XGMI run all resets in parallel to speed up the process */
5629 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5630 if (!queue_work(system_unbound_wq,
5631 &tmp_adev->xgmi_reset_work))
5632 r = -EALREADY;
5633 } else
5634 r = amdgpu_asic_reset(tmp_adev);
5635
5636 if (r) {
5637 dev_err(tmp_adev->dev,
5638 "ASIC reset failed with error, %d for drm dev, %s",
5639 r, adev_to_drm(tmp_adev)->unique);
5640 goto out;
5641 }
5642 }
5643
5644 /* For XGMI wait for all resets to complete before proceed */
5645 if (!r) {
5646 list_for_each_entry(tmp_adev, device_list_handle,
5647 reset_list) {
5648 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5649 flush_work(&tmp_adev->xgmi_reset_work);
5650 r = tmp_adev->asic_reset_res;
5651 if (r)
5652 break;
5653 }
5654 }
5655 }
5656 }
5657
5658 if (!r && amdgpu_ras_intr_triggered()) {
5659 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5660 amdgpu_ras_reset_error_count(tmp_adev,
5661 AMDGPU_RAS_BLOCK__MMHUB);
5662 }
5663
5664 amdgpu_ras_intr_cleared();
5665 }
5666
5667 r = amdgpu_device_reinit_after_reset(reset_context);
5668 if (r == -EAGAIN)
5669 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5670 else
5671 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5672
5673 out:
5674 return r;
5675 }
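
/*
 * Minimal sketch of how a caller drives this default reset path for a
 * single device (no XGMI hive); this mirrors the setup done by
 * amdgpu_pci_slot_reset() later in this file:
 *
 *	INIT_LIST_HEAD(&device_list);
 *	list_add_tail(&adev->reset_list, &device_list);
 *	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *	r = amdgpu_do_asic_reset(&device_list, &reset_context);
 */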
5676
5677 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5678 {
5679
5680 switch (amdgpu_asic_reset_method(adev)) {
5681 case AMD_RESET_METHOD_MODE1:
5682 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5683 break;
5684 case AMD_RESET_METHOD_MODE2:
5685 adev->mp1_state = PP_MP1_STATE_RESET;
5686 break;
5687 default:
5688 adev->mp1_state = PP_MP1_STATE_NONE;
5689 break;
5690 }
5691 }
5692
5693 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5694 {
5695 amdgpu_vf_error_trans_all(adev);
5696 adev->mp1_state = PP_MP1_STATE_NONE;
5697 }
5698
5699 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5700 {
5701 struct pci_dev *p = NULL;
5702
5703 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5704 adev->pdev->bus->number, 1);
5705 if (p) {
5706 pm_runtime_enable(&(p->dev));
5707 pm_runtime_resume(&(p->dev));
5708 }
5709
5710 pci_dev_put(p);
5711 }
5712
5713 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5714 {
5715 enum amd_reset_method reset_method;
5716 struct pci_dev *p = NULL;
5717 u64 expires;
5718
5719 /*
5720 * For now, only BACO and mode1 reset are confirmed to suffer
5721 * from the audio issue if the audio device is not properly suspended.
5722 */
5723 reset_method = amdgpu_asic_reset_method(adev);
5724 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5725 (reset_method != AMD_RESET_METHOD_MODE1))
5726 return -EINVAL;
5727
5728 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5729 adev->pdev->bus->number, 1);
5730 if (!p)
5731 return -ENODEV;
5732
5733 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5734 if (!expires)
5735 /*
5736 * If we cannot get the audio device autosuspend delay,
5737 * a fixed 4s interval will be used. Since 3s is the audio
5738 * controller's default autosuspend delay, the 4s used
5739 * here is guaranteed to cover it.
5740 */
5741 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5742
5743 while (!pm_runtime_status_suspended(&(p->dev))) {
5744 if (!pm_runtime_suspend(&(p->dev)))
5745 break;
5746
5747 if (expires < ktime_get_mono_fast_ns()) {
5748 dev_warn(adev->dev, "failed to suspend display audio\n");
5749 pci_dev_put(p);
5750 /* TODO: abort the succeeding gpu reset? */
5751 return -ETIMEDOUT;
5752 }
5753 }
5754
5755 pm_runtime_disable(&(p->dev));
5756
5757 pci_dev_put(p);
5758 return 0;
5759 }
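
/*
 * Sketch of the intended pairing (as used by amdgpu_device_gpu_recover()
 * below): suspend the audio function before the reset and resume it
 * afterwards only if the suspend actually succeeded:
 *
 *	if (!amdgpu_device_suspend_display_audio(adev))
 *		audio_suspended = true;
 *	// ... perform the reset ...
 *	if (audio_suspended)
 *		amdgpu_device_resume_display_audio(adev);
 */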
5760
5761 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5762 {
5763 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5764
5765 #if defined(CONFIG_DEBUG_FS)
5766 if (!amdgpu_sriov_vf(adev))
5767 cancel_work(&adev->reset_work);
5768 #endif
5769
5770 if (adev->kfd.dev)
5771 cancel_work(&adev->kfd.reset_work);
5772
5773 if (amdgpu_sriov_vf(adev))
5774 cancel_work(&adev->virt.flr_work);
5775
5776 if (con && adev->ras_enabled)
5777 cancel_work(&con->recovery_work);
5778
5779 }
5780
5781 static int amdgpu_device_health_check(struct list_head *device_list_handle)
5782 {
5783 struct amdgpu_device *tmp_adev;
5784 int ret = 0;
5785 u32 status;
5786
5787 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5788 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5789 if (PCI_POSSIBLE_ERROR(status)) {
5790 dev_err(tmp_adev->dev, "device lost from bus!");
5791 ret = -ENODEV;
5792 }
5793 }
5794
5795 return ret;
5796 }
5797
5798 /**
5799 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5800 *
5801 * @adev: amdgpu_device pointer
5802 * @job: the job that triggered the hang
5803 * @reset_context: amdgpu reset context pointer
5804 *
5805 * Attempt to reset the GPU if it has hung (all ASICs).
5806 * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
5807 * Returns 0 for success or an error on failure.
5808 */
5809
5810 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5811 struct amdgpu_job *job,
5812 struct amdgpu_reset_context *reset_context)
5813 {
5814 struct list_head device_list, *device_list_handle = NULL;
5815 bool job_signaled = false;
5816 struct amdgpu_hive_info *hive = NULL;
5817 struct amdgpu_device *tmp_adev = NULL;
5818 int i, r = 0;
5819 bool need_emergency_restart = false;
5820 bool audio_suspended = false;
5821 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5822
5823 /*
5824 * Special case: RAS triggered and full reset isn't supported
5825 */
5826 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5827
5828 /*
5829 * Flush RAM to disk so that after reboot
5830 * the user can read the log and see why the system rebooted.
5831 */
5832 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5833 amdgpu_ras_get_context(adev)->reboot) {
5834 DRM_WARN("Emergency reboot.");
5835
5836 ksys_sync_helper();
5837 emergency_restart();
5838 }
5839
5840 dev_info(adev->dev, "GPU %s begin!\n",
5841 need_emergency_restart ? "jobs stop":"reset");
5842
5843 if (!amdgpu_sriov_vf(adev))
5844 hive = amdgpu_get_xgmi_hive(adev);
5845 if (hive)
5846 mutex_lock(&hive->hive_lock);
5847
5848 reset_context->job = job;
5849 reset_context->hive = hive;
5850 /*
5851 * Build list of devices to reset.
5852 * In case we are in XGMI hive mode, resort the device list
5853 * to put adev in the 1st position.
5854 */
5855 INIT_LIST_HEAD(&device_list);
5856 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5857 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5858 list_add_tail(&tmp_adev->reset_list, &device_list);
5859 if (adev->shutdown)
5860 tmp_adev->shutdown = true;
5861 }
5862 if (!list_is_first(&adev->reset_list, &device_list))
5863 list_rotate_to_front(&adev->reset_list, &device_list);
5864 device_list_handle = &device_list;
5865 } else {
5866 list_add_tail(&adev->reset_list, &device_list);
5867 device_list_handle = &device_list;
5868 }
5869
5870 if (!amdgpu_sriov_vf(adev)) {
5871 r = amdgpu_device_health_check(device_list_handle);
5872 if (r)
5873 goto end_reset;
5874 }
5875
5876 /* We need to lock reset domain only once both for XGMI and single device */
5877 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5878 reset_list);
5879 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5880
5881 /* block all schedulers and reset given job's ring */
5882 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5883
5884 amdgpu_device_set_mp1_state(tmp_adev);
5885
5886 /*
5887 * Try to put the audio codec into the suspend state
5888 * before the GPU reset starts.
5889 *
5890 * The power domain of the graphics device is shared
5891 * with the AZ power domain, so without this we may
5892 * change the audio hardware behind the audio driver's
5893 * back, which will trigger audio codec errors.
5894 */
5896 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5897 audio_suspended = true;
5898
5899 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5900
5901 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5902
5903 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5904
5905 /*
5906 * Mark these ASICs to be reset as untracked first,
5907 * and add them back after the reset completes.
5908 */
5909 amdgpu_unregister_gpu_instance(tmp_adev);
5910
5911 drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
5912
5913 /* disable ras on ALL IPs */
5914 if (!need_emergency_restart &&
5915 amdgpu_device_ip_need_full_reset(tmp_adev))
5916 amdgpu_ras_suspend(tmp_adev);
5917
5918 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5919 struct amdgpu_ring *ring = tmp_adev->rings[i];
5920
5921 if (!amdgpu_ring_sched_ready(ring))
5922 continue;
5923
5924 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5925
5926 if (need_emergency_restart)
5927 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5928 }
5929 atomic_inc(&tmp_adev->gpu_reset_counter);
5930 }
5931
5932 if (need_emergency_restart)
5933 goto skip_sched_resume;
5934
5935 /*
5936 * Must check guilty signal here since after this point all old
5937 * HW fences are force signaled.
5938 *
5939 * job->base holds a reference to parent fence
5940 */
5941 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5942 job_signaled = true;
5943 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5944 goto skip_hw_reset;
5945 }
5946
5947 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5948 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5949 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5950 /* TODO: Should we stop? */
5951 if (r) {
5952 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5953 r, adev_to_drm(tmp_adev)->unique);
5954 tmp_adev->asic_reset_res = r;
5955 }
5956 }
5957
5958 /* Actual ASIC resets if needed.*/
5959 /* Host driver will handle XGMI hive reset for SRIOV */
5960 if (amdgpu_sriov_vf(adev)) {
5961 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
5962 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
5963 amdgpu_ras_set_fed(adev, true);
5964 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5965 }
5966
5967 r = amdgpu_device_reset_sriov(adev, reset_context);
5968 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
5969 amdgpu_virt_release_full_gpu(adev, true);
5970 goto retry;
5971 }
5972 if (r)
5973 adev->asic_reset_res = r;
5974 } else {
5975 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5976 if (r && r == -EAGAIN)
5977 goto retry;
5978 }
5979
5980 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5981 /*
5982 * Drop any pending non scheduler resets queued before reset is done.
5983 * Any reset scheduled after this point would be valid. Scheduler resets
5984 * were already dropped during drm_sched_stop and no new ones can come
5985 * in before drm_sched_start.
5986 */
5987 amdgpu_device_stop_pending_resets(tmp_adev);
5988 }
5989
5990 skip_hw_reset:
5991
5992 /* Post ASIC reset for all devs. */
5993 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5994
5995 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5996 struct amdgpu_ring *ring = tmp_adev->rings[i];
5997
5998 if (!amdgpu_ring_sched_ready(ring))
5999 continue;
6000
6001 drm_sched_start(&ring->sched, 0);
6002 }
6003
6004 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6005 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6006
6007 if (tmp_adev->asic_reset_res)
6008 r = tmp_adev->asic_reset_res;
6009
6010 tmp_adev->asic_reset_res = 0;
6011
6012 if (r) {
6013 /* bad news, how do we tell it to userspace?
6014 * For a RAS error we should report the GPU's bad status
6015 * instead of a reset failure.
6016 */
6017 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6018 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6019 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6020 atomic_read(&tmp_adev->gpu_reset_counter));
6021 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6022 } else {
6023 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6024 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6025 DRM_WARN("smart shift update failed\n");
6026 }
6027 }
6028
6029 skip_sched_resume:
6030 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6031 /* unlock kfd: SRIOV would do it separately */
6032 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6033 amdgpu_amdkfd_post_reset(tmp_adev);
6034
6035 /* kfd_post_reset will do nothing if the kfd device is not initialized,
6036 * so bring up kfd here if it was not initialized before.
6037 */
6038 if (!adev->kfd.init_complete)
6039 amdgpu_amdkfd_device_init(adev);
6040
6041 if (audio_suspended)
6042 amdgpu_device_resume_display_audio(tmp_adev);
6043
6044 amdgpu_device_unset_mp1_state(tmp_adev);
6045
6046 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6047 }
6048
6049 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6050 reset_list);
6051 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6052
6053 end_reset:
6054 if (hive) {
6055 mutex_unlock(&hive->hive_lock);
6056 amdgpu_put_xgmi_hive(hive);
6057 }
6058
6059 if (r)
6060 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6061
6062 atomic_set(&adev->reset_domain->reset_res, r);
6063 return r;
6064 }
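
/*
 * Hedged example of how a caller might invoke recovery; this is modeled on
 * the reset context setup used by amdgpu_pci_slot_reset() later in this
 * file, and real callers (e.g. the job timeout handler) may set additional
 * fields:
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 */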
6065
6066 /**
6067 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
6068 *
6069 * @adev: amdgpu_device pointer
6070 * @speed: pointer to the speed of the link
6071 * @width: pointer to the width of the link
6072 *
6073 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6074 * first physical partner to an AMD dGPU.
6075 * This will exclude any virtual switches and links.
6076 */
6077 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6078 enum pci_bus_speed *speed,
6079 enum pcie_link_width *width)
6080 {
6081 struct pci_dev *parent = adev->pdev;
6082
6083 if (!speed || !width)
6084 return;
6085
6086 *speed = PCI_SPEED_UNKNOWN;
6087 *width = PCIE_LNK_WIDTH_UNKNOWN;
6088
6089 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6090 while ((parent = pci_upstream_bridge(parent))) {
6091 /* skip upstream/downstream switches internal to dGPU */
6092 if (parent->vendor == PCI_VENDOR_ID_ATI)
6093 continue;
6094 *speed = pcie_get_speed_cap(parent);
6095 *width = pcie_get_width_cap(parent);
6096 break;
6097 }
6098 } else {
6099 /* use the current speeds rather than max if switching is not supported */
6100 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6101 }
6102 }
6103
6104 /**
6105 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
6106 *
6107 * @adev: amdgpu_device pointer
6108 *
6109 * Fetches and stores in the driver the PCIE capabilities (gen speed
6110 * and lanes) of the slot the device is in. Handles APUs and
6111 * virtualized environments where PCIE config space may not be available.
6112 */
6113 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6114 {
6115 struct pci_dev *pdev;
6116 enum pci_bus_speed speed_cap, platform_speed_cap;
6117 enum pcie_link_width platform_link_width;
6118
6119 if (amdgpu_pcie_gen_cap)
6120 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6121
6122 if (amdgpu_pcie_lane_cap)
6123 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6124
6125 /* covers APUs as well */
6126 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6127 if (adev->pm.pcie_gen_mask == 0)
6128 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6129 if (adev->pm.pcie_mlw_mask == 0)
6130 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6131 return;
6132 }
6133
6134 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6135 return;
6136
6137 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6138 &platform_link_width);
6139
6140 if (adev->pm.pcie_gen_mask == 0) {
6141 /* asic caps */
6142 pdev = adev->pdev;
6143 speed_cap = pcie_get_speed_cap(pdev);
6144 if (speed_cap == PCI_SPEED_UNKNOWN) {
6145 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6146 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6147 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6148 } else {
6149 if (speed_cap == PCIE_SPEED_32_0GT)
6150 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6151 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6152 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6153 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6154 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6155 else if (speed_cap == PCIE_SPEED_16_0GT)
6156 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6157 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6158 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6159 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6160 else if (speed_cap == PCIE_SPEED_8_0GT)
6161 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6162 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6163 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6164 else if (speed_cap == PCIE_SPEED_5_0GT)
6165 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6166 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6167 else
6168 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6169 }
6170 /* platform caps */
6171 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6172 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6173 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6174 } else {
6175 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6176 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6177 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6178 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6179 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6180 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6181 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6182 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6183 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6184 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6185 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6186 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6187 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6188 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6189 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6190 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6191 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6192 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6193 else
6194 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6195
6196 }
6197 }
6198 if (adev->pm.pcie_mlw_mask == 0) {
6199 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6200 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6201 } else {
6202 switch (platform_link_width) {
6203 case PCIE_LNK_X32:
6204 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6205 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6206 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6207 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6208 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6209 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6210 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6211 break;
6212 case PCIE_LNK_X16:
6213 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6214 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6215 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6216 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6217 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6218 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6219 break;
6220 case PCIE_LNK_X12:
6221 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6222 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6223 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6224 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6225 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6226 break;
6227 case PCIE_LNK_X8:
6228 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6229 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6230 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6231 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6232 break;
6233 case PCIE_LNK_X4:
6234 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6235 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6236 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6237 break;
6238 case PCIE_LNK_X2:
6239 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6240 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6241 break;
6242 case PCIE_LNK_X1:
6243 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6244 break;
6245 default:
6246 break;
6247 }
6248 }
6249 }
6250 }
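
/*
 * Illustrative consumer of the masks filled in above: other code can test
 * the cached capability bits (CAIL_* defines from amd_pcie.h) instead of
 * re-reading PCIe config space, e.g.:
 *
 *	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
 *		; // the platform link supports at least Gen3
 */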
6251
6252 /**
6253 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6254 *
6255 * @adev: amdgpu_device pointer
6256 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6257 *
6258 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6259 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6260 * @peer_adev.
6261 */
6262 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6263 struct amdgpu_device *peer_adev)
6264 {
6265 #ifdef CONFIG_HSA_AMD_P2P
6266 bool p2p_access =
6267 !adev->gmc.xgmi.connected_to_cpu &&
6268 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6269 if (!p2p_access)
6270 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
6271 pci_name(peer_adev->pdev));
6272
6273 bool is_large_bar = adev->gmc.visible_vram_size &&
6274 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6275 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6276
6277 if (!p2p_addressable) {
6278 uint64_t address_mask = peer_adev->dev->dma_mask ?
6279 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6280 resource_size_t aper_limit =
6281 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6282
6283 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6284 aper_limit & address_mask);
6285 }
6286 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6287 #else
6288 return false;
6289 #endif
6290 }
6291
6292 int amdgpu_device_baco_enter(struct drm_device *dev)
6293 {
6294 struct amdgpu_device *adev = drm_to_adev(dev);
6295 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6296
6297 if (!amdgpu_device_supports_baco(dev))
6298 return -ENOTSUPP;
6299
6300 if (ras && adev->ras_enabled &&
6301 adev->nbio.funcs->enable_doorbell_interrupt)
6302 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6303
6304 return amdgpu_dpm_baco_enter(adev);
6305 }
6306
6307 int amdgpu_device_baco_exit(struct drm_device *dev)
6308 {
6309 struct amdgpu_device *adev = drm_to_adev(dev);
6310 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6311 int ret = 0;
6312
6313 if (!amdgpu_device_supports_baco(dev))
6314 return -ENOTSUPP;
6315
6316 ret = amdgpu_dpm_baco_exit(adev);
6317 if (ret)
6318 return ret;
6319
6320 if (ras && adev->ras_enabled &&
6321 adev->nbio.funcs->enable_doorbell_interrupt)
6322 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6323
6324 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6325 adev->nbio.funcs->clear_doorbell_interrupt)
6326 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6327
6328 return 0;
6329 }
6330
6331 /**
6332 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6333 * @pdev: PCI device struct
6334 * @state: PCI channel state
6335 *
6336 * Description: Called when a PCI error is detected.
6337 *
6338 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6339 */
6340 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6341 {
6342 struct drm_device *dev = pci_get_drvdata(pdev);
6343 struct amdgpu_device *adev = drm_to_adev(dev);
6344 int i;
6345
6346 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6347
6348 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6349 DRM_WARN("No support for XGMI hive yet...");
6350 return PCI_ERS_RESULT_DISCONNECT;
6351 }
6352
6353 adev->pci_channel_state = state;
6354
6355 switch (state) {
6356 case pci_channel_io_normal:
6357 return PCI_ERS_RESULT_CAN_RECOVER;
6358 /* Fatal error, prepare for slot reset */
6359 case pci_channel_io_frozen:
6360 /*
6361 * Locking adev->reset_domain->sem will prevent any external access
6362 * to GPU during PCI error recovery
6363 */
6364 amdgpu_device_lock_reset_domain(adev->reset_domain);
6365 amdgpu_device_set_mp1_state(adev);
6366
6367 /*
6368 * Block any work scheduling as we do for regular GPU reset
6369 * for the duration of the recovery
6370 */
6371 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6372 struct amdgpu_ring *ring = adev->rings[i];
6373
6374 if (!amdgpu_ring_sched_ready(ring))
6375 continue;
6376
6377 drm_sched_stop(&ring->sched, NULL);
6378 }
6379 atomic_inc(&adev->gpu_reset_counter);
6380 return PCI_ERS_RESULT_NEED_RESET;
6381 case pci_channel_io_perm_failure:
6382 /* Permanent error, prepare for device removal */
6383 return PCI_ERS_RESULT_DISCONNECT;
6384 }
6385
6386 return PCI_ERS_RESULT_NEED_RESET;
6387 }
6388
6389 /**
6390 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6391 * @pdev: pointer to PCI device
6392 */
6393 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6394 {
6395
6396 DRM_INFO("PCI error: mmio enabled callback!!\n");
6397
6398 /* TODO - dump whatever for debugging purposes */
6399
6400 /* This is called only if amdgpu_pci_error_detected returns
6401 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6402 * works, no need to reset slot.
6403 */
6404
6405 return PCI_ERS_RESULT_RECOVERED;
6406 }
6407
6408 /**
6409 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6410 * @pdev: PCI device struct
6411 *
6412 * Description: This routine is called by the pci error recovery
6413 * code after the PCI slot has been reset, just before we
6414 * should resume normal operations.
6415 */
6416 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6417 {
6418 struct drm_device *dev = pci_get_drvdata(pdev);
6419 struct amdgpu_device *adev = drm_to_adev(dev);
6420 int r, i;
6421 struct amdgpu_reset_context reset_context;
6422 u32 memsize;
6423 struct list_head device_list;
6424
6425 /* PCI error slot reset should be skipped during RAS recovery */
6426 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6427 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6428 amdgpu_ras_in_recovery(adev))
6429 return PCI_ERS_RESULT_RECOVERED;
6430
6431 DRM_INFO("PCI error: slot reset callback!!\n");
6432
6433 memset(&reset_context, 0, sizeof(reset_context));
6434
6435 INIT_LIST_HEAD(&device_list);
6436 list_add_tail(&adev->reset_list, &device_list);
6437
6438 /* wait for asic to come out of reset */
6439 msleep(500);
6440
6441 /* Restore PCI confspace */
6442 amdgpu_device_load_pci_state(pdev);
6443
6444 /* confirm ASIC came out of reset */
6445 for (i = 0; i < adev->usec_timeout; i++) {
6446 memsize = amdgpu_asic_get_config_memsize(adev);
6447
6448 if (memsize != 0xffffffff)
6449 break;
6450 udelay(1);
6451 }
6452 if (memsize == 0xffffffff) {
6453 r = -ETIME;
6454 goto out;
6455 }
6456
6457 reset_context.method = AMD_RESET_METHOD_NONE;
6458 reset_context.reset_req_dev = adev;
6459 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6460 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6461
6462 adev->no_hw_access = true;
6463 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6464 adev->no_hw_access = false;
6465 if (r)
6466 goto out;
6467
6468 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6469
6470 out:
6471 if (!r) {
6472 if (amdgpu_device_cache_pci_state(adev->pdev))
6473 pci_restore_state(adev->pdev);
6474
6475 DRM_INFO("PCIe error recovery succeeded\n");
6476 } else {
6477 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6478 amdgpu_device_unset_mp1_state(adev);
6479 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6480 }
6481
6482 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6483 }
6484
6485 /**
6486 * amdgpu_pci_resume() - resume normal ops after PCI reset
6487 * @pdev: pointer to PCI device
6488 *
6489 * Called when the error recovery driver tells us that it's
6490 * OK to resume normal operation.
6491 */
6492 void amdgpu_pci_resume(struct pci_dev *pdev)
6493 {
6494 struct drm_device *dev = pci_get_drvdata(pdev);
6495 struct amdgpu_device *adev = drm_to_adev(dev);
6496 int i;
6497
6498
6499 DRM_INFO("PCI error: resume callback!!\n");
6500
6501 /* Only continue execution for the case of pci_channel_io_frozen */
6502 if (adev->pci_channel_state != pci_channel_io_frozen)
6503 return;
6504
6505 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6506 struct amdgpu_ring *ring = adev->rings[i];
6507
6508 if (!amdgpu_ring_sched_ready(ring))
6509 continue;
6510
6511 drm_sched_start(&ring->sched, 0);
6512 }
6513
6514 amdgpu_device_unset_mp1_state(adev);
6515 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6516 }
6517
6518 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6519 {
6520 struct drm_device *dev = pci_get_drvdata(pdev);
6521 struct amdgpu_device *adev = drm_to_adev(dev);
6522 int r;
6523
6524 if (amdgpu_sriov_vf(adev))
6525 return false;
6526
6527 r = pci_save_state(pdev);
6528 if (!r) {
6529 kfree(adev->pci_state);
6530
6531 adev->pci_state = pci_store_saved_state(pdev);
6532
6533 if (!adev->pci_state) {
6534 DRM_ERROR("Failed to store PCI saved state");
6535 return false;
6536 }
6537 } else {
6538 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6539 return false;
6540 }
6541
6542 return true;
6543 }
6544
6545 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6546 {
6547 struct drm_device *dev = pci_get_drvdata(pdev);
6548 struct amdgpu_device *adev = drm_to_adev(dev);
6549 int r;
6550
6551 if (!adev->pci_state)
6552 return false;
6553
6554 r = pci_load_saved_state(pdev, adev->pci_state);
6555
6556 if (!r) {
6557 pci_restore_state(pdev);
6558 } else {
6559 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6560 return false;
6561 }
6562
6563 return true;
6564 }
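
/*
 * Sketch of the intended pairing (compare amdgpu_device_mode1_reset() and
 * amdgpu_pci_slot_reset() above): cache the PCI config space before a reset
 * and restore it afterwards:
 *
 *	amdgpu_device_cache_pci_state(adev->pdev);
 *	// ... reset the ASIC ...
 *	amdgpu_device_load_pci_state(adev->pdev);
 */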
6565
6566 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6567 struct amdgpu_ring *ring)
6568 {
6569 #ifdef CONFIG_X86_64
6570 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6571 return;
6572 #endif
6573 if (adev->gmc.xgmi.connected_to_cpu)
6574 return;
6575
6576 if (ring && ring->funcs->emit_hdp_flush)
6577 amdgpu_ring_emit_hdp_flush(ring);
6578 else
6579 amdgpu_asic_flush_hdp(adev, ring);
6580 }
6581
6582 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6583 struct amdgpu_ring *ring)
6584 {
6585 #ifdef CONFIG_X86_64
6586 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6587 return;
6588 #endif
6589 if (adev->gmc.xgmi.connected_to_cpu)
6590 return;
6591
6592 amdgpu_asic_invalidate_hdp(adev, ring);
6593 }
6594
6595 int amdgpu_in_reset(struct amdgpu_device *adev)
6596 {
6597 return atomic_read(&adev->reset_domain->in_gpu_reset);
6598 }
6599
6600 /**
6601 * amdgpu_device_halt() - bring hardware to some kind of halt state
6602 *
6603 * @adev: amdgpu_device pointer
6604 *
6605 * Bring hardware to some kind of halt state so that no one can touch it
6606 * any more. It helps to maintain the error context when an error occurs.
6607 * Compared to a simple hang, the system will stay stable, at least for
6608 * SSH access. Then it should be trivial to inspect the hardware state and
6609 * see what's going on. Implemented as follows:
6610 *
6611 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
6612 * clears all CPU mappings to the device, disallows remappings through page faults
6613 * 2. amdgpu_irq_disable_all() disables all interrupts
6614 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6615 * 4. set adev->no_hw_access to avoid potential crashes after step 5
6616 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6617 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6618 * flush any in flight DMA operations
6619 */
6620 void amdgpu_device_halt(struct amdgpu_device *adev)
6621 {
6622 struct pci_dev *pdev = adev->pdev;
6623 struct drm_device *ddev = adev_to_drm(adev);
6624
6625 amdgpu_xcp_dev_unplug(adev);
6626 drm_dev_unplug(ddev);
6627
6628 amdgpu_irq_disable_all(adev);
6629
6630 amdgpu_fence_driver_hw_fini(adev);
6631
6632 adev->no_hw_access = true;
6633
6634 amdgpu_device_unmap_mmio(adev);
6635
6636 pci_disable_device(pdev);
6637 pci_wait_for_pending_transaction(pdev);
6638 }
6639
6640 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6641 u32 reg)
6642 {
6643 unsigned long flags, address, data;
6644 u32 r;
6645
6646 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6647 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6648
6649 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6650 WREG32(address, reg * 4);
6651 (void)RREG32(address);
6652 r = RREG32(data);
6653 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6654 return r;
6655 }
6656
6657 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6658 u32 reg, u32 v)
6659 {
6660 unsigned long flags, address, data;
6661
6662 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6663 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6664
6665 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6666 WREG32(address, reg * 4);
6667 (void)RREG32(address);
6668 WREG32(data, v);
6669 (void)RREG32(data);
6670 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6671 }
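
/*
 * Example (illustrative only, SOME_BIT is a placeholder): a read-modify-write
 * of a PCIe port register using the two helpers above:
 *
 *	u32 tmp = amdgpu_device_pcie_port_rreg(adev, reg);
 *
 *	tmp |= SOME_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, reg, tmp);
 */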
6672
6673 /**
6674 * amdgpu_device_get_gang - return a reference to the current gang
6675 * @adev: amdgpu_device pointer
6676 *
6677 * Returns: A new reference to the current gang leader.
6678 */
6679 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
6680 {
6681 struct dma_fence *fence;
6682
6683 rcu_read_lock();
6684 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
6685 rcu_read_unlock();
6686 return fence;
6687 }
6688
6689 /**
6690 * amdgpu_device_switch_gang - switch to a new gang
6691 * @adev: amdgpu_device pointer
6692 * @gang: the gang to switch to
6693 *
6694 * Try to switch to a new gang.
6695 * Returns: NULL if we switched to the new gang or a reference to the current
6696 * gang leader.
6697 */
6698 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6699 struct dma_fence *gang)
6700 {
6701 struct dma_fence *old = NULL;
6702
6703 do {
6704 dma_fence_put(old);
6705 old = amdgpu_device_get_gang(adev);
6706 if (old == gang)
6707 break;
6708
6709 if (!dma_fence_is_signaled(old))
6710 return old;
6711
6712 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6713 old, gang) != old);
6714
6715 dma_fence_put(old);
6716 return NULL;
6717 }
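
/*
 * Hedged usage sketch: if this returns a non-NULL fence, the switch did not
 * happen yet; the submitter is expected to wait on (or add a dependency to)
 * the returned fence and retry, roughly:
 *
 *	fence = amdgpu_device_switch_gang(adev, gang_leader);
 *	if (fence) {
 *		dma_fence_wait(fence, false);
 *		dma_fence_put(fence);
 *	}
 */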
6718
6719 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6720 {
6721 switch (adev->asic_type) {
6722 #ifdef CONFIG_DRM_AMDGPU_SI
6723 case CHIP_HAINAN:
6724 #endif
6725 case CHIP_TOPAZ:
6726 /* chips with no display hardware */
6727 return false;
6728 #ifdef CONFIG_DRM_AMDGPU_SI
6729 case CHIP_TAHITI:
6730 case CHIP_PITCAIRN:
6731 case CHIP_VERDE:
6732 case CHIP_OLAND:
6733 #endif
6734 #ifdef CONFIG_DRM_AMDGPU_CIK
6735 case CHIP_BONAIRE:
6736 case CHIP_HAWAII:
6737 case CHIP_KAVERI:
6738 case CHIP_KABINI:
6739 case CHIP_MULLINS:
6740 #endif
6741 case CHIP_TONGA:
6742 case CHIP_FIJI:
6743 case CHIP_POLARIS10:
6744 case CHIP_POLARIS11:
6745 case CHIP_POLARIS12:
6746 case CHIP_VEGAM:
6747 case CHIP_CARRIZO:
6748 case CHIP_STONEY:
6749 /* chips with display hardware */
6750 return true;
6751 default:
6752 /* IP discovery */
6753 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
6754 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6755 return false;
6756 return true;
6757 }
6758 }
6759
6760 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6761 uint32_t inst, uint32_t reg_addr, char reg_name[],
6762 uint32_t expected_value, uint32_t mask)
6763 {
6764 uint32_t ret = 0;
6765 uint32_t old_ = 0;
6766 uint32_t tmp_ = RREG32(reg_addr);
6767 uint32_t loop = adev->usec_timeout;
6768
6769 while ((tmp_ & (mask)) != (expected_value)) {
6770 if (old_ != tmp_) {
6771 loop = adev->usec_timeout;
6772 old_ = tmp_;
6773 } else
6774 udelay(1);
6775 tmp_ = RREG32(reg_addr);
6776 loop--;
6777 if (!loop) {
6778 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6779 inst, reg_name, (uint32_t)expected_value,
6780 (uint32_t)(tmp_ & (mask)));
6781 ret = -ETIMEDOUT;
6782 break;
6783 }
6784 }
6785 return ret;
6786 }
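
/*
 * Usage sketch (reg_offset, EXPECTED and MASK are placeholders): poll a
 * register until the masked value matches, timing out after roughly
 * adev->usec_timeout iterations:
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, reg_offset, "MY_REG",
 *				       EXPECTED, MASK);
 *	if (r)
 *		dev_err(adev->dev, "MY_REG never reached the expected value\n");
 */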
6787
6788 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
6789 {
6790 ssize_t size = 0;
6791
6792 if (!ring || !ring->adev)
6793 return size;
6794
6795 if (amdgpu_device_should_recover_gpu(ring->adev))
6796 size |= AMDGPU_RESET_TYPE_FULL;
6797
6798 if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
6799 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
6800 size |= AMDGPU_RESET_TYPE_SOFT_RESET;
6801
6802 return size;
6803 }
6804
6805 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
6806 {
6807 ssize_t size = 0;
6808
6809 if (supported_reset == 0) {
6810 size += sysfs_emit_at(buf, size, "unsupported");
6811 size += sysfs_emit_at(buf, size, "\n");
6812 return size;
6814 }
6815
6816 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
6817 size += sysfs_emit_at(buf, size, "soft ");
6818
6819 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
6820 size += sysfs_emit_at(buf, size, "queue ");
6821
6822 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
6823 size += sysfs_emit_at(buf, size, "pipe ");
6824
6825 if (supported_reset & AMDGPU_RESET_TYPE_FULL)
6826 size += sysfs_emit_at(buf, size, "full ");
6827
6828 size += sysfs_emit_at(buf, size, "\n");
6829 return size;
6830 }
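
/*
 * Sketch of a sysfs show() callback built on this helper; the attribute name
 * and the supported_reset field below are illustrative, not necessarily the
 * ones used by the real IP blocks:
 *
 *	static ssize_t reset_mask_show(struct device *dev,
 *				       struct device_attribute *attr, char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *		return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *	}
 */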
6831