/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as the sum of the NAKs generated and the NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
						   struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);
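
/*
 * Example (illustrative only; the exact sysfs path depends on the card index
 * and the distribution's sysfs layout):
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 *   42
 */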

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);
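
/*
 * Illustrative sketch only: the reg_state binary attribute is read at a file
 * offset that selects which register-state block is dumped (XGMI, WAFL, PCIE,
 * USR, ...). A hypothetical userspace reader might do something like:
 *
 *   pread(fd, buf, len, AMDGPU_SYS_REG_STATE_PCIE);
 *
 * where fd refers to .../device/reg_state and the offset constants mirror the
 * enum used in the switch above.
 */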

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
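
/*
 * Example output (illustrative; the actual value depends on the package type
 * reported by the SMUIO block):
 *
 *   $ cat /sys/class/drm/card0/device/board_info
 *   type : oam
 */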

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
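
/*
 * Summary of the mapping above (illustrative, based on the switch on
 * amdgpu_runtime_pm): 0 disables runtime PM, 1 forces BACO, 2 forces BAMACO
 * (falling back to BACO when MACO is not supported), and -1/-2 pick
 * PX/BOCO/BACO automatically. The backing module parameter is typically set
 * as, e.g., "modprobe amdgpu runpm=1" (parameter name assumed from the stock
 * amdgpu module options).
 */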
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram via the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe to the device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
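
/*
 * Usage sketch (illustrative): read 256 bytes starting at VRAM offset 0x1000
 * into a kernel buffer, falling back to MM_INDEX/MM_DATA for any part that is
 * not CPU-visible through the aperture:
 *
 *   u8 data[256];
 *
 *   amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
 */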

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write a register either with direct/indirect mmio or with the RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Returns the device rev_id.
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
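
/*
 * The register array is consumed as {offset, and_mask, or_mask} triplets.
 * A hypothetical golden-register table might look like:
 *
 *   static const u32 example_golden_regs[] = {
 *           mmSOME_REG,  0xffffffff, 0x00000001,
 *           mmOTHER_REG, 0x0000ff00, 0x00002400,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *                                           ARRAY_SIZE(example_golden_regs));
 *
 * (The register names above are placeholders, not real offsets.)
 */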

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
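
/*
 * Typical writeback usage (illustrative sketch only): allocate a slot, let
 * the GPU write status into it, read it from the CPU, then release it.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u32 *cpu_ptr = &adev->wb.wb[wb];
 *           ... have a ring/fence write to adev->wb.gpu_addr + wb * 4 ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */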

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions 22.15 and above don't
		 * have this flaw, so we force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICs as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}
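
/*
 * Illustrative only: ASPM handling can be overridden via the module parameter
 * backing amdgpu_aspm, e.g. "modprobe amdgpu aspm=0" to force it off or
 * "aspm=1" to force it on; -1 (the default) defers to the checks above.
 * (Parameter name assumed from the stock amdgpu module options.)
 */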
1761
1762 /* if we get transitioned to only one device, take VGA back */
1763 /**
1764 * amdgpu_device_vga_set_decode - enable/disable vga decode
1765 *
1766 * @pdev: PCI device pointer
1767 * @state: enable/disable vga decode
1768 *
1769 * Enable/disable vga decode (all asics).
1770 * Returns VGA resource flags.
1771 */
1772 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1773 bool state)
1774 {
1775 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1776
1777 amdgpu_asic_set_vga_state(adev, state);
1778 if (state)
1779 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1780 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1781 else
1782 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1783 }
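
/*
 * Sketch of how this callback is expected to be wired up (assumes the
 * vgaarb API in which vga_client_register() takes only the pci_dev and the
 * set_decode callback; the actual registration happens later in this file
 * during device init):
 *
 *	vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
 */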
1784
1785 /**
1786 * amdgpu_device_check_block_size - validate the vm block size
1787 *
1788 * @adev: amdgpu_device pointer
1789 *
1790 * Validates the vm block size specified via module parameter.
1791 * The vm block size defines number of bits in page table versus page directory,
1792 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1793 * page table and the remaining bits are in the page directory.
1794 */
1795 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1796 {
1797 /* defines number of bits in page table versus page directory,
1798 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1799 * page table and the remaining bits are in the page directory
1800 */
1801 if (amdgpu_vm_block_size == -1)
1802 return;
1803
1804 if (amdgpu_vm_block_size < 9) {
1805 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1806 amdgpu_vm_block_size);
1807 amdgpu_vm_block_size = -1;
1808 }
1809 }
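
/*
 * Worked example for the block-size math above (illustration, not from the
 * original source): with 4KB pages there are 12 offset bits, so
 * amdgpu_vm_block_size=9 means a single page-table block spans
 * 2^(12 + 9) bytes = 2MB of GPU virtual address space, and the remaining
 * address bits are resolved through the page directory.
 */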
1810
1811 /**
1812 * amdgpu_device_check_vm_size - validate the vm size
1813 *
1814 * @adev: amdgpu_device pointer
1815 *
1816 * Validates the vm size in GB specified via module parameter.
1817 * The VM size is the size of the GPU virtual memory space in GB.
1818 */
1819 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1820 {
1821 /* no need to check the default value */
1822 if (amdgpu_vm_size == -1)
1823 return;
1824
1825 if (amdgpu_vm_size < 1) {
1826 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1827 amdgpu_vm_size);
1828 amdgpu_vm_size = -1;
1829 }
1830 }
1831
1832 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1833 {
1834 struct sysinfo si;
1835 bool is_os_64 = (sizeof(void *) == 8);
1836 uint64_t total_memory;
1837 uint64_t dram_size_seven_GB = 0x1B8000000;
1838 uint64_t dram_size_three_GB = 0xB8000000;
1839
1840 if (amdgpu_smu_memory_pool_size == 0)
1841 return;
1842
1843 if (!is_os_64) {
1844 DRM_WARN("Not 64-bit OS, feature not supported\n");
1845 goto def_value;
1846 }
1847 si_meminfo(&si);
1848 total_memory = (uint64_t)si.totalram * si.mem_unit;
1849
1850 if ((amdgpu_smu_memory_pool_size == 1) ||
1851 (amdgpu_smu_memory_pool_size == 2)) {
1852 if (total_memory < dram_size_three_GB)
1853 goto def_value1;
1854 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1855 (amdgpu_smu_memory_pool_size == 8)) {
1856 if (total_memory < dram_size_seven_GB)
1857 goto def_value1;
1858 } else {
1859 DRM_WARN("Smu memory pool size not supported\n");
1860 goto def_value;
1861 }
1862 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1863
1864 return;
1865
1866 def_value1:
1867 DRM_WARN("No enough system memory\n");
1868 def_value:
1869 adev->pm.smu_prv_buffer_size = 0;
1870 }
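
/*
 * Worked example for the pool-size math above (illustration, not from the
 * original source): amdgpu_smu_memory_pool_size is in units of 256MB, so
 * the shift by 28 turns 1 into 0x10000000 (256MB) and 8 into 0x80000000
 * (2GB). The 0xB8000000 and 0x1B8000000 thresholds correspond to the
 * roughly 3GB and 7GB of system DRAM named by the local variables.
 */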
1871
1872 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1873 {
1874 if (!(adev->flags & AMD_IS_APU) ||
1875 adev->asic_type < CHIP_RAVEN)
1876 return 0;
1877
1878 switch (adev->asic_type) {
1879 case CHIP_RAVEN:
1880 if (adev->pdev->device == 0x15dd)
1881 adev->apu_flags |= AMD_APU_IS_RAVEN;
1882 if (adev->pdev->device == 0x15d8)
1883 adev->apu_flags |= AMD_APU_IS_PICASSO;
1884 break;
1885 case CHIP_RENOIR:
1886 if ((adev->pdev->device == 0x1636) ||
1887 (adev->pdev->device == 0x164c))
1888 adev->apu_flags |= AMD_APU_IS_RENOIR;
1889 else
1890 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1891 break;
1892 case CHIP_VANGOGH:
1893 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1894 break;
1895 case CHIP_YELLOW_CARP:
1896 break;
1897 case CHIP_CYAN_SKILLFISH:
1898 if ((adev->pdev->device == 0x13FE) ||
1899 (adev->pdev->device == 0x143F))
1900 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1901 break;
1902 default:
1903 break;
1904 }
1905
1906 return 0;
1907 }
1908
1909 /**
1910 * amdgpu_device_check_arguments - validate module params
1911 *
1912 * @adev: amdgpu_device pointer
1913 *
1914 * Validates certain module parameters and updates
1915 * the associated values used by the driver (all asics).
1916 */
1917 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1918 {
1919 int i;
1920
1921 if (amdgpu_sched_jobs < 4) {
1922 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1923 amdgpu_sched_jobs);
1924 amdgpu_sched_jobs = 4;
1925 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1926 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1927 amdgpu_sched_jobs);
1928 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1929 }
1930
1931 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1932 /* gart size must be greater or equal to 32M */
1933 dev_warn(adev->dev, "gart size (%d) too small\n",
1934 amdgpu_gart_size);
1935 amdgpu_gart_size = -1;
1936 }
1937
1938 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1939 /* gtt size must be greater or equal to 32M */
1940 dev_warn(adev->dev, "gtt size (%d) too small\n",
1941 amdgpu_gtt_size);
1942 amdgpu_gtt_size = -1;
1943 }
1944
1945 /* valid range is between 4 and 9 inclusive */
1946 if (amdgpu_vm_fragment_size != -1 &&
1947 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1948 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1949 amdgpu_vm_fragment_size = -1;
1950 }
1951
1952 if (amdgpu_sched_hw_submission < 2) {
1953 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1954 amdgpu_sched_hw_submission);
1955 amdgpu_sched_hw_submission = 2;
1956 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1957 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1958 amdgpu_sched_hw_submission);
1959 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1960 }
1961
1962 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1963 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1964 amdgpu_reset_method = -1;
1965 }
1966
1967 amdgpu_device_check_smu_prv_buffer_size(adev);
1968
1969 amdgpu_device_check_vm_size(adev);
1970
1971 amdgpu_device_check_block_size(adev);
1972
1973 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1974
1975 for (i = 0; i < MAX_XCP; i++)
1976 adev->enforce_isolation[i] = !!enforce_isolation;
1977
1978 return 0;
1979 }
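
/*
 * Illustration of the rounding behaviour above (not from the original
 * source): loading the module with amdgpu.sched_jobs=6 results in
 * roundup_pow_of_two(6) == 8, while amdgpu.sched_jobs=2 is bumped to the
 * minimum of 4; amdgpu.sched_hw_submission follows the same pattern with a
 * minimum of 2.
 */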
1980
1981 /**
1982 * amdgpu_switcheroo_set_state - set switcheroo state
1983 *
1984 * @pdev: pci dev pointer
1985 * @state: vga_switcheroo state
1986 *
1987 * Callback for the switcheroo driver. Suspends or resumes
1988 * the asics before or after it is powered up using ACPI methods.
1989 */
1990 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1991 enum vga_switcheroo_state state)
1992 {
1993 struct drm_device *dev = pci_get_drvdata(pdev);
1994 int r;
1995
1996 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1997 return;
1998
1999 if (state == VGA_SWITCHEROO_ON) {
2000 pr_info("switched on\n");
2001 /* don't suspend or resume card normally */
2002 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2003
2004 pci_set_power_state(pdev, PCI_D0);
2005 amdgpu_device_load_pci_state(pdev);
2006 r = pci_enable_device(pdev);
2007 if (r)
2008 DRM_WARN("pci_enable_device failed (%d)\n", r);
2009 amdgpu_device_resume(dev, true);
2010
2011 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2012 } else {
2013 pr_info("switched off\n");
2014 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2015 amdgpu_device_prepare(dev);
2016 amdgpu_device_suspend(dev, true);
2017 amdgpu_device_cache_pci_state(pdev);
2018 /* Shut down the device */
2019 pci_disable_device(pdev);
2020 pci_set_power_state(pdev, PCI_D3cold);
2021 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2022 }
2023 }
2024
2025 /**
2026 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2027 *
2028 * @pdev: pci dev pointer
2029 *
2030 * Callback for the switcheroo driver. Check if the switcheroo
2031 * state can be changed.
2032 * Returns true if the state can be changed, false if not.
2033 */
2034 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2035 {
2036 struct drm_device *dev = pci_get_drvdata(pdev);
2037
2038 /*
2039 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2040 * locking inversion with the driver load path. And the access here is
2041 * completely racy anyway. So don't bother with locking for now.
2042 */
2043 return atomic_read(&dev->open_count) == 0;
2044 }
2045
2046 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2047 .set_gpu_state = amdgpu_switcheroo_set_state,
2048 .reprobe = NULL,
2049 .can_switch = amdgpu_switcheroo_can_switch,
2050 };
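
/*
 * Sketch of how these ops are hooked up (an assumption about the call
 * site, which lives elsewhere in this file; the vga_switcheroo client API
 * takes the ops plus a flag indicating driver power control):
 *
 *	vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops,
 *				       amdgpu_device_supports_px(adev_to_drm(adev)));
 */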
2051
2052 /**
2053 * amdgpu_device_ip_set_clockgating_state - set the CG state
2054 *
2055 * @dev: amdgpu_device pointer
2056 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2057 * @state: clockgating state (gate or ungate)
2058 *
2059 * Sets the requested clockgating state for all instances of
2060 * the hardware IP specified.
2061 * Returns the error code from the last instance.
2062 */
2063 int amdgpu_device_ip_set_clockgating_state(void *dev,
2064 enum amd_ip_block_type block_type,
2065 enum amd_clockgating_state state)
2066 {
2067 struct amdgpu_device *adev = dev;
2068 int i, r = 0;
2069
2070 for (i = 0; i < adev->num_ip_blocks; i++) {
2071 if (!adev->ip_blocks[i].status.valid)
2072 continue;
2073 if (adev->ip_blocks[i].version->type != block_type)
2074 continue;
2075 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2076 continue;
2077 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2078 (void *)adev, state);
2079 if (r)
2080 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2081 adev->ip_blocks[i].version->funcs->name, r);
2082 }
2083 return r;
2084 }
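
/*
 * Illustrative caller (a sketch, not from this file): clock gating for all
 * GFX instances can be requested with
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *
 * and undone by passing AMD_CG_STATE_UNGATE instead.
 */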
2085
2086 /**
2087 * amdgpu_device_ip_set_powergating_state - set the PG state
2088 *
2089 * @dev: amdgpu_device pointer
2090 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2091 * @state: powergating state (gate or ungate)
2092 *
2093 * Sets the requested powergating state for all instances of
2094 * the hardware IP specified.
2095 * Returns the error code from the last instance.
2096 */
2097 int amdgpu_device_ip_set_powergating_state(void *dev,
2098 enum amd_ip_block_type block_type,
2099 enum amd_powergating_state state)
2100 {
2101 struct amdgpu_device *adev = dev;
2102 int i, r = 0;
2103
2104 for (i = 0; i < adev->num_ip_blocks; i++) {
2105 if (!adev->ip_blocks[i].status.valid)
2106 continue;
2107 if (adev->ip_blocks[i].version->type != block_type)
2108 continue;
2109 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2110 continue;
2111 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2112 (void *)adev, state);
2113 if (r)
2114 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2115 adev->ip_blocks[i].version->funcs->name, r);
2116 }
2117 return r;
2118 }
2119
2120 /**
2121 * amdgpu_device_ip_get_clockgating_state - get the CG state
2122 *
2123 * @adev: amdgpu_device pointer
2124 * @flags: clockgating feature flags
2125 *
2126 * Walks the list of IPs on the device and updates the clockgating
2127 * flags for each IP.
2128 * Updates @flags with the feature flags for each hardware IP where
2129 * clockgating is enabled.
2130 */
2131 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2132 u64 *flags)
2133 {
2134 int i;
2135
2136 for (i = 0; i < adev->num_ip_blocks; i++) {
2137 if (!adev->ip_blocks[i].status.valid)
2138 continue;
2139 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2140 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
2141 }
2142 }
2143
2144 /**
2145 * amdgpu_device_ip_wait_for_idle - wait for idle
2146 *
2147 * @adev: amdgpu_device pointer
2148 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2149 *
2150 * Waits for the requested hardware IP to be idle.
2151 * Returns 0 for success or a negative error code on failure.
2152 */
2153 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2154 enum amd_ip_block_type block_type)
2155 {
2156 int i, r;
2157
2158 for (i = 0; i < adev->num_ip_blocks; i++) {
2159 if (!adev->ip_blocks[i].status.valid)
2160 continue;
2161 if (adev->ip_blocks[i].version->type == block_type) {
2162 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
2163 if (r)
2164 return r;
2165 break;
2166 }
2167 }
2168 return 0;
2169
2170 }
2171
2172 /**
2173 * amdgpu_device_ip_is_idle - is the hardware IP idle
2174 *
2175 * @adev: amdgpu_device pointer
2176 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2177 *
2178 * Check if the hardware IP is idle or not.
2179 * Returns true if the IP is idle, false if not.
2180 */
2181 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
2182 enum amd_ip_block_type block_type)
2183 {
2184 int i;
2185
2186 for (i = 0; i < adev->num_ip_blocks; i++) {
2187 if (!adev->ip_blocks[i].status.valid)
2188 continue;
2189 if (adev->ip_blocks[i].version->type == block_type)
2190 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
2191 }
2192 return true;
2193
2194 }
2195
2196 /**
2197 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2198 *
2199 * @adev: amdgpu_device pointer
2200 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2201 *
2202 * Returns a pointer to the hardware IP block structure
2203 * if it exists for the asic, otherwise NULL.
2204 */
2205 struct amdgpu_ip_block *
2206 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2207 enum amd_ip_block_type type)
2208 {
2209 int i;
2210
2211 for (i = 0; i < adev->num_ip_blocks; i++)
2212 if (adev->ip_blocks[i].version->type == type)
2213 return &adev->ip_blocks[i];
2214
2215 return NULL;
2216 }
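
/*
 * Illustrative usage (a sketch, not from this file): the PSP block of the
 * current ASIC, if present, can be looked up with
 *
 *	struct amdgpu_ip_block *psp_block =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP);
 *
 * where a NULL return simply means the ASIC has no such IP block.
 */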
2217
2218 /**
2219 * amdgpu_device_ip_block_version_cmp
2220 *
2221 * @adev: amdgpu_device pointer
2222 * @type: enum amd_ip_block_type
2223 * @major: major version
2224 * @minor: minor version
2225 *
2226 * return 0 if equal or greater
2227 * return 1 if smaller or the ip_block doesn't exist
2228 */
2229 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2230 enum amd_ip_block_type type,
2231 u32 major, u32 minor)
2232 {
2233 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2234
2235 if (ip_block && ((ip_block->version->major > major) ||
2236 ((ip_block->version->major == major) &&
2237 (ip_block->version->minor >= minor))))
2238 return 0;
2239
2240 return 1;
2241 }
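
/*
 * Illustrative usage (a sketch, not from this file): since 0 means "equal
 * or newer", a minimum-version check reads
 *
 *	bool smc_is_new_enough =
 *		!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *						    7, 1);
 *
 * smc_is_new_enough is a hypothetical local flag used only for this example.
 */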
2242
2243 /**
2244 * amdgpu_device_ip_block_add
2245 *
2246 * @adev: amdgpu_device pointer
2247 * @ip_block_version: pointer to the IP to add
2248 *
2249 * Adds the IP block driver information to the collection of IPs
2250 * on the asic.
2251 */
2252 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2253 const struct amdgpu_ip_block_version *ip_block_version)
2254 {
2255 if (!ip_block_version)
2256 return -EINVAL;
2257
2258 switch (ip_block_version->type) {
2259 case AMD_IP_BLOCK_TYPE_VCN:
2260 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2261 return 0;
2262 break;
2263 case AMD_IP_BLOCK_TYPE_JPEG:
2264 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2265 return 0;
2266 break;
2267 default:
2268 break;
2269 }
2270
2271 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2272 ip_block_version->funcs->name);
2273
2274 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2275
2276 return 0;
2277 }
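
/*
 * Illustrative usage (a sketch, not from this file): the per-ASIC setup
 * paths register their blocks in order with calls like
 *
 *	r = amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 *	if (r)
 *		return r;
 *
 * gmc_v8_0_ip_block is assumed here as an example of an exported
 * struct amdgpu_ip_block_version.
 */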
2278
2279 /**
2280 * amdgpu_device_enable_virtual_display - enable virtual display feature
2281 *
2282 * @adev: amdgpu_device pointer
2283 *
2284 * Enables the virtual display feature if the user has enabled it via
2285 * the module parameter virtual_display. This feature provides a virtual
2286 * display hardware on headless boards or in virtualized environments.
2287 * This function parses and validates the configuration string specified by
2288 * the user and configures the virtual display configuration (number of
2289 * virtual connectors, crtcs, etc.) specified.
2290 */
2291 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2292 {
2293 adev->enable_virtual_display = false;
2294
2295 if (amdgpu_virtual_display) {
2296 const char *pci_address_name = pci_name(adev->pdev);
2297 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2298
2299 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2300 pciaddstr_tmp = pciaddstr;
2301 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2302 pciaddname = strsep(&pciaddname_tmp, ",");
2303 if (!strcmp("all", pciaddname)
2304 || !strcmp(pci_address_name, pciaddname)) {
2305 long num_crtc;
2306 int res = -1;
2307
2308 adev->enable_virtual_display = true;
2309
2310 if (pciaddname_tmp)
2311 res = kstrtol(pciaddname_tmp, 10,
2312 &num_crtc);
2313
2314 if (!res) {
2315 if (num_crtc < 1)
2316 num_crtc = 1;
2317 if (num_crtc > 6)
2318 num_crtc = 6;
2319 adev->mode_info.num_crtc = num_crtc;
2320 } else {
2321 adev->mode_info.num_crtc = 1;
2322 }
2323 break;
2324 }
2325 }
2326
2327 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2328 amdgpu_virtual_display, pci_address_name,
2329 adev->enable_virtual_display, adev->mode_info.num_crtc);
2330
2331 kfree(pciaddstr);
2332 }
2333 }
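
/*
 * Note on the string format parsed above (illustration, not from the
 * original source): virtual_display is a semicolon-separated list of
 * "<pci address>[,<crtc count>]" entries, with "all" matching any device,
 * e.g.
 *
 *	modprobe amdgpu virtual_display=0000:03:00.0,2
 *	modprobe amdgpu virtual_display=all,1
 *
 * The crtc count is clamped to the range 1..6 and defaults to 1.
 */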
2334
2335 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2336 {
2337 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2338 adev->mode_info.num_crtc = 1;
2339 adev->enable_virtual_display = true;
2340 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2341 adev->enable_virtual_display, adev->mode_info.num_crtc);
2342 }
2343 }
2344
2345 /**
2346 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2347 *
2348 * @adev: amdgpu_device pointer
2349 *
2350 * Parses the asic configuration parameters specified in the gpu info
2351 * firmware and makes them available to the driver for use in configuring
2352 * the asic.
2353 * Returns 0 on success, -EINVAL on failure.
2354 */
2355 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2356 {
2357 const char *chip_name;
2358 int err;
2359 const struct gpu_info_firmware_header_v1_0 *hdr;
2360
2361 adev->firmware.gpu_info_fw = NULL;
2362
2363 if (adev->mman.discovery_bin)
2364 return 0;
2365
2366 switch (adev->asic_type) {
2367 default:
2368 return 0;
2369 case CHIP_VEGA10:
2370 chip_name = "vega10";
2371 break;
2372 case CHIP_VEGA12:
2373 chip_name = "vega12";
2374 break;
2375 case CHIP_RAVEN:
2376 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2377 chip_name = "raven2";
2378 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2379 chip_name = "picasso";
2380 else
2381 chip_name = "raven";
2382 break;
2383 case CHIP_ARCTURUS:
2384 chip_name = "arcturus";
2385 break;
2386 case CHIP_NAVI12:
2387 chip_name = "navi12";
2388 break;
2389 }
2390
2391 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2392 "amdgpu/%s_gpu_info.bin", chip_name);
2393 if (err) {
2394 dev_err(adev->dev,
2395 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2396 chip_name);
2397 goto out;
2398 }
2399
2400 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2401 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2402
2403 switch (hdr->version_major) {
2404 case 1:
2405 {
2406 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2407 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2408 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2409
2410 /*
2411 * Should be dropped when DAL no longer needs it.
2412 */
2413 if (adev->asic_type == CHIP_NAVI12)
2414 goto parse_soc_bounding_box;
2415
2416 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2417 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2418 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2419 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2420 adev->gfx.config.max_texture_channel_caches =
2421 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2422 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2423 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2424 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2425 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2426 adev->gfx.config.double_offchip_lds_buf =
2427 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2428 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2429 adev->gfx.cu_info.max_waves_per_simd =
2430 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2431 adev->gfx.cu_info.max_scratch_slots_per_cu =
2432 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2433 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2434 if (hdr->version_minor >= 1) {
2435 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2436 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2437 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2438 adev->gfx.config.num_sc_per_sh =
2439 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2440 adev->gfx.config.num_packer_per_sc =
2441 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2442 }
2443
2444 parse_soc_bounding_box:
2445 /*
2446 * soc bounding box info is not integrated in discovery table,
2447 * we always need to parse it from gpu info firmware if needed.
2448 */
2449 if (hdr->version_minor == 2) {
2450 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2451 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2452 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2453 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2454 }
2455 break;
2456 }
2457 default:
2458 dev_err(adev->dev,
2459 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2460 err = -EINVAL;
2461 goto out;
2462 }
2463 out:
2464 return err;
2465 }
2466
2467 /**
2468 * amdgpu_device_ip_early_init - run early init for hardware IPs
2469 *
2470 * @adev: amdgpu_device pointer
2471 *
2472 * Early initialization pass for hardware IPs. The hardware IPs that make
2473 * up each asic are discovered and each IP's early_init callback is run. This
2474 * is the first stage in initializing the asic.
2475 * Returns 0 on success, negative error code on failure.
2476 */
2477 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2478 {
2479 struct amdgpu_ip_block *ip_block;
2480 struct pci_dev *parent;
2481 int i, r;
2482 bool total;
2483
2484 amdgpu_device_enable_virtual_display(adev);
2485
2486 if (amdgpu_sriov_vf(adev)) {
2487 r = amdgpu_virt_request_full_gpu(adev, true);
2488 if (r)
2489 return r;
2490 }
2491
2492 switch (adev->asic_type) {
2493 #ifdef CONFIG_DRM_AMDGPU_SI
2494 case CHIP_VERDE:
2495 case CHIP_TAHITI:
2496 case CHIP_PITCAIRN:
2497 case CHIP_OLAND:
2498 case CHIP_HAINAN:
2499 adev->family = AMDGPU_FAMILY_SI;
2500 r = si_set_ip_blocks(adev);
2501 if (r)
2502 return r;
2503 break;
2504 #endif
2505 #ifdef CONFIG_DRM_AMDGPU_CIK
2506 case CHIP_BONAIRE:
2507 case CHIP_HAWAII:
2508 case CHIP_KAVERI:
2509 case CHIP_KABINI:
2510 case CHIP_MULLINS:
2511 if (adev->flags & AMD_IS_APU)
2512 adev->family = AMDGPU_FAMILY_KV;
2513 else
2514 adev->family = AMDGPU_FAMILY_CI;
2515
2516 r = cik_set_ip_blocks(adev);
2517 if (r)
2518 return r;
2519 break;
2520 #endif
2521 case CHIP_TOPAZ:
2522 case CHIP_TONGA:
2523 case CHIP_FIJI:
2524 case CHIP_POLARIS10:
2525 case CHIP_POLARIS11:
2526 case CHIP_POLARIS12:
2527 case CHIP_VEGAM:
2528 case CHIP_CARRIZO:
2529 case CHIP_STONEY:
2530 if (adev->flags & AMD_IS_APU)
2531 adev->family = AMDGPU_FAMILY_CZ;
2532 else
2533 adev->family = AMDGPU_FAMILY_VI;
2534
2535 r = vi_set_ip_blocks(adev);
2536 if (r)
2537 return r;
2538 break;
2539 default:
2540 r = amdgpu_discovery_set_ip_blocks(adev);
2541 if (r)
2542 return r;
2543 break;
2544 }
2545
2546 if (amdgpu_has_atpx() &&
2547 (amdgpu_is_atpx_hybrid() ||
2548 amdgpu_has_atpx_dgpu_power_cntl()) &&
2549 ((adev->flags & AMD_IS_APU) == 0) &&
2550 !dev_is_removable(&adev->pdev->dev))
2551 adev->flags |= AMD_IS_PX;
2552
2553 if (!(adev->flags & AMD_IS_APU)) {
2554 parent = pcie_find_root_port(adev->pdev);
2555 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2556 }
2557
2558
2559 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2560 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2561 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2562 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2563 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2564 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2565 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2566
2567 total = true;
2568 for (i = 0; i < adev->num_ip_blocks; i++) {
2569 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2570 DRM_WARN("disabled ip block: %d <%s>\n",
2571 i, adev->ip_blocks[i].version->funcs->name);
2572 adev->ip_blocks[i].status.valid = false;
2573 } else {
2574 if (adev->ip_blocks[i].version->funcs->early_init) {
2575 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2576 if (r == -ENOENT) {
2577 adev->ip_blocks[i].status.valid = false;
2578 } else if (r) {
2579 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2580 adev->ip_blocks[i].version->funcs->name, r);
2581 total = false;
2582 } else {
2583 adev->ip_blocks[i].status.valid = true;
2584 }
2585 } else {
2586 adev->ip_blocks[i].status.valid = true;
2587 }
2588 }
2589 /* get the vbios after the asic_funcs are set up */
2590 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2591 r = amdgpu_device_parse_gpu_info_fw(adev);
2592 if (r)
2593 return r;
2594
2595 /* Read BIOS */
2596 if (amdgpu_device_read_bios(adev)) {
2597 if (!amdgpu_get_bios(adev))
2598 return -EINVAL;
2599
2600 r = amdgpu_atombios_init(adev);
2601 if (r) {
2602 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2603 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2604 return r;
2605 }
2606 }
2607
2608 /* get pf2vf msg info at its earliest time */
2609 if (amdgpu_sriov_vf(adev))
2610 amdgpu_virt_init_data_exchange(adev);
2611
2612 }
2613 }
2614 if (!total)
2615 return -ENODEV;
2616
2617 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2618 if (ip_block->status.valid != false)
2619 amdgpu_amdkfd_device_probe(adev);
2620
2621 adev->cg_flags &= amdgpu_cg_mask;
2622 adev->pg_flags &= amdgpu_pg_mask;
2623
2624 return 0;
2625 }
2626
2627 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2628 {
2629 int i, r;
2630
2631 for (i = 0; i < adev->num_ip_blocks; i++) {
2632 if (!adev->ip_blocks[i].status.sw)
2633 continue;
2634 if (adev->ip_blocks[i].status.hw)
2635 continue;
2636 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2637 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2638 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2639 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2640 if (r) {
2641 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2642 adev->ip_blocks[i].version->funcs->name, r);
2643 return r;
2644 }
2645 adev->ip_blocks[i].status.hw = true;
2646 }
2647 }
2648
2649 return 0;
2650 }
2651
2652 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2653 {
2654 int i, r;
2655
2656 for (i = 0; i < adev->num_ip_blocks; i++) {
2657 if (!adev->ip_blocks[i].status.sw)
2658 continue;
2659 if (adev->ip_blocks[i].status.hw)
2660 continue;
2661 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2662 if (r) {
2663 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2664 adev->ip_blocks[i].version->funcs->name, r);
2665 return r;
2666 }
2667 adev->ip_blocks[i].status.hw = true;
2668 }
2669
2670 return 0;
2671 }
2672
2673 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2674 {
2675 int r = 0;
2676 int i;
2677 uint32_t smu_version;
2678
2679 if (adev->asic_type >= CHIP_VEGA10) {
2680 for (i = 0; i < adev->num_ip_blocks; i++) {
2681 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2682 continue;
2683
2684 if (!adev->ip_blocks[i].status.sw)
2685 continue;
2686
2687 /* no need to do the fw loading again if already done*/
2688 if (adev->ip_blocks[i].status.hw == true)
2689 break;
2690
2691 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2692 r = adev->ip_blocks[i].version->funcs->resume(adev);
2693 if (r) {
2694 DRM_ERROR("resume of IP block <%s> failed %d\n",
2695 adev->ip_blocks[i].version->funcs->name, r);
2696 return r;
2697 }
2698 } else {
2699 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2700 if (r) {
2701 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2702 adev->ip_blocks[i].version->funcs->name, r);
2703 return r;
2704 }
2705 }
2706
2707 adev->ip_blocks[i].status.hw = true;
2708 break;
2709 }
2710 }
2711
2712 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2713 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2714
2715 return r;
2716 }
2717
2718 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2719 {
2720 long timeout;
2721 int r, i;
2722
2723 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2724 struct amdgpu_ring *ring = adev->rings[i];
2725
2726 /* No need to setup the GPU scheduler for rings that don't need it */
2727 if (!ring || ring->no_scheduler)
2728 continue;
2729
2730 switch (ring->funcs->type) {
2731 case AMDGPU_RING_TYPE_GFX:
2732 timeout = adev->gfx_timeout;
2733 break;
2734 case AMDGPU_RING_TYPE_COMPUTE:
2735 timeout = adev->compute_timeout;
2736 break;
2737 case AMDGPU_RING_TYPE_SDMA:
2738 timeout = adev->sdma_timeout;
2739 break;
2740 default:
2741 timeout = adev->video_timeout;
2742 break;
2743 }
2744
2745 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2746 DRM_SCHED_PRIORITY_COUNT,
2747 ring->num_hw_submission, 0,
2748 timeout, adev->reset_domain->wq,
2749 ring->sched_score, ring->name,
2750 adev->dev);
2751 if (r) {
2752 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2753 ring->name);
2754 return r;
2755 }
2756 r = amdgpu_uvd_entity_init(adev, ring);
2757 if (r) {
2758 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2759 ring->name);
2760 return r;
2761 }
2762 r = amdgpu_vce_entity_init(adev, ring);
2763 if (r) {
2764 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2765 ring->name);
2766 return r;
2767 }
2768 }
2769
2770 amdgpu_xcp_update_partition_sched_list(adev);
2771
2772 return 0;
2773 }
2774
2775
2776 /**
2777 * amdgpu_device_ip_init - run init for hardware IPs
2778 *
2779 * @adev: amdgpu_device pointer
2780 *
2781 * Main initialization pass for hardware IPs. The list of all the hardware
2782 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2783 * are run. sw_init initializes the software state associated with each IP
2784 * and hw_init initializes the hardware associated with each IP.
2785 * Returns 0 on success, negative error code on failure.
2786 */
2787 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2788 {
2789 int i, r;
2790
2791 r = amdgpu_ras_init(adev);
2792 if (r)
2793 return r;
2794
2795 for (i = 0; i < adev->num_ip_blocks; i++) {
2796 if (!adev->ip_blocks[i].status.valid)
2797 continue;
2798 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2799 if (r) {
2800 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2801 adev->ip_blocks[i].version->funcs->name, r);
2802 goto init_failed;
2803 }
2804 adev->ip_blocks[i].status.sw = true;
2805
2806 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2807 /* need to do common hw init early so everything is set up for gmc */
2808 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2809 if (r) {
2810 DRM_ERROR("hw_init %d failed %d\n", i, r);
2811 goto init_failed;
2812 }
2813 adev->ip_blocks[i].status.hw = true;
2814 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2815 /* need to do gmc hw init early so we can allocate gpu mem */
2816 /* Try to reserve bad pages early */
2817 if (amdgpu_sriov_vf(adev))
2818 amdgpu_virt_exchange_data(adev);
2819
2820 r = amdgpu_device_mem_scratch_init(adev);
2821 if (r) {
2822 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2823 goto init_failed;
2824 }
2825 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2826 if (r) {
2827 DRM_ERROR("hw_init %d failed %d\n", i, r);
2828 goto init_failed;
2829 }
2830 r = amdgpu_device_wb_init(adev);
2831 if (r) {
2832 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2833 goto init_failed;
2834 }
2835 adev->ip_blocks[i].status.hw = true;
2836
2837 /* right after GMC hw init, we create CSA */
2838 if (adev->gfx.mcbp) {
2839 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2840 AMDGPU_GEM_DOMAIN_VRAM |
2841 AMDGPU_GEM_DOMAIN_GTT,
2842 AMDGPU_CSA_SIZE);
2843 if (r) {
2844 DRM_ERROR("allocate CSA failed %d\n", r);
2845 goto init_failed;
2846 }
2847 }
2848
2849 r = amdgpu_seq64_init(adev);
2850 if (r) {
2851 DRM_ERROR("allocate seq64 failed %d\n", r);
2852 goto init_failed;
2853 }
2854 }
2855 }
2856
2857 if (amdgpu_sriov_vf(adev))
2858 amdgpu_virt_init_data_exchange(adev);
2859
2860 r = amdgpu_ib_pool_init(adev);
2861 if (r) {
2862 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2863 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2864 goto init_failed;
2865 }
2866
2867 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2868 if (r)
2869 goto init_failed;
2870
2871 r = amdgpu_device_ip_hw_init_phase1(adev);
2872 if (r)
2873 goto init_failed;
2874
2875 r = amdgpu_device_fw_loading(adev);
2876 if (r)
2877 goto init_failed;
2878
2879 r = amdgpu_device_ip_hw_init_phase2(adev);
2880 if (r)
2881 goto init_failed;
2882
2883 /*
2884 * retired pages will be loaded from eeprom and reserved here,
2885 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2886 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2887 * functional for I2C communication, which is only true at this point.
2888 *
2889 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2890 * about failures caused by a bad gpu situation and stops the amdgpu
2891 * init process accordingly. For other failures, it still releases all
2892 * the resources and prints an error message, rather than returning a
2893 * negative value to the upper level.
2894 *
2895 * Note: theoretically, this should be called before all vram allocations
2896 * to protect retired pages from being abused.
2897 */
2898 r = amdgpu_ras_recovery_init(adev);
2899 if (r)
2900 goto init_failed;
2901
2902 /**
2903 * In case of XGMI grab extra reference for reset domain for this device
2904 */
2905 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2906 if (amdgpu_xgmi_add_device(adev) == 0) {
2907 if (!amdgpu_sriov_vf(adev)) {
2908 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2909
2910 if (WARN_ON(!hive)) {
2911 r = -ENOENT;
2912 goto init_failed;
2913 }
2914
2915 if (!hive->reset_domain ||
2916 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2917 r = -ENOENT;
2918 amdgpu_put_xgmi_hive(hive);
2919 goto init_failed;
2920 }
2921
2922 /* Drop the early temporary reset domain we created for device */
2923 amdgpu_reset_put_reset_domain(adev->reset_domain);
2924 adev->reset_domain = hive->reset_domain;
2925 amdgpu_put_xgmi_hive(hive);
2926 }
2927 }
2928 }
2929
2930 r = amdgpu_device_init_schedulers(adev);
2931 if (r)
2932 goto init_failed;
2933
2934 if (adev->mman.buffer_funcs_ring->sched.ready)
2935 amdgpu_ttm_set_buffer_funcs_status(adev, true);
2936
2937 /* Don't init kfd if whole hive need to be reset during init */
2938 if (!adev->gmc.xgmi.pending_reset) {
2939 kgd2kfd_init_zone_device(adev);
2940 amdgpu_amdkfd_device_init(adev);
2941 }
2942
2943 amdgpu_fru_get_product_info(adev);
2944
2945 init_failed:
2946
2947 return r;
2948 }
2949
2950 /**
2951 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2952 *
2953 * @adev: amdgpu_device pointer
2954 *
2955 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2956 * this function before a GPU reset. If the value is retained after a
2957 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2958 */
2959 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2960 {
2961 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2962 }
2963
2964 /**
2965 * amdgpu_device_check_vram_lost - check if vram is valid
2966 *
2967 * @adev: amdgpu_device pointer
2968 *
2969 * Checks the reset magic value written to the gart pointer in VRAM.
2970 * The driver calls this after a GPU reset to see if the contents of
2971 * VRAM are lost or not.
2972 * Returns true if vram is lost, false if not.
2973 */
2974 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2975 {
2976 if (memcmp(adev->gart.ptr, adev->reset_magic,
2977 AMDGPU_RESET_MAGIC_NUM))
2978 return true;
2979
2980 if (!amdgpu_in_reset(adev))
2981 return false;
2982
2983 /*
2984 * For all ASICs with baco/mode1 reset, the VRAM is
2985 * always assumed to be lost.
2986 */
2987 switch (amdgpu_asic_reset_method(adev)) {
2988 case AMD_RESET_METHOD_BACO:
2989 case AMD_RESET_METHOD_MODE1:
2990 return true;
2991 default:
2992 return false;
2993 }
2994 }
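
/*
 * Illustrative pairing of the two helpers above (a sketch, not from this
 * file): the magic is written once the device is up and compared again in
 * the reset path, roughly
 *
 *	amdgpu_device_fill_reset_magic(adev);	// at late init
 *	...
 *	vram_lost = amdgpu_device_check_vram_lost(adev);	// after reset
 *
 * vram_lost here is a hypothetical local used to decide whether buffer
 * contents need to be restored.
 */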
2995
2996 /**
2997 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2998 *
2999 * @adev: amdgpu_device pointer
3000 * @state: clockgating state (gate or ungate)
3001 *
3002 * The list of all the hardware IPs that make up the asic is walked and the
3003 * set_clockgating_state callbacks are run.
3004 * The late initialization pass enables clockgating for hardware IPs.
3005 * The fini or suspend pass disables clockgating for hardware IPs.
3006 * Returns 0 on success, negative error code on failure.
3007 */
3008
3009 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3010 enum amd_clockgating_state state)
3011 {
3012 int i, j, r;
3013
3014 if (amdgpu_emu_mode == 1)
3015 return 0;
3016
3017 for (j = 0; j < adev->num_ip_blocks; j++) {
3018 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3019 if (!adev->ip_blocks[i].status.late_initialized)
3020 continue;
3021 /* skip CG for GFX, SDMA on S0ix */
3022 if (adev->in_s0ix &&
3023 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3024 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3025 continue;
3026 /* skip CG for VCE/UVD, it's handled specially */
3027 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3028 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3029 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3030 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3031 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3032 /* enable clockgating to save power */
3033 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
3034 state);
3035 if (r) {
3036 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3037 adev->ip_blocks[i].version->funcs->name, r);
3038 return r;
3039 }
3040 }
3041 }
3042
3043 return 0;
3044 }
3045
3046 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3047 enum amd_powergating_state state)
3048 {
3049 int i, j, r;
3050
3051 if (amdgpu_emu_mode == 1)
3052 return 0;
3053
3054 for (j = 0; j < adev->num_ip_blocks; j++) {
3055 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3056 if (!adev->ip_blocks[i].status.late_initialized)
3057 continue;
3058 /* skip PG for GFX, SDMA on S0ix */
3059 if (adev->in_s0ix &&
3060 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3061 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3062 continue;
3063 /* skip PG for VCE/UVD, it's handled specially */
3064 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3065 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3066 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3067 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3068 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3069 /* enable powergating to save power */
3070 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
3071 state);
3072 if (r) {
3073 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3074 adev->ip_blocks[i].version->funcs->name, r);
3075 return r;
3076 }
3077 }
3078 }
3079 return 0;
3080 }
3081
3082 static int amdgpu_device_enable_mgpu_fan_boost(void)
3083 {
3084 struct amdgpu_gpu_instance *gpu_ins;
3085 struct amdgpu_device *adev;
3086 int i, ret = 0;
3087
3088 mutex_lock(&mgpu_info.mutex);
3089
3090 /*
3091 * MGPU fan boost feature should be enabled
3092 * only when there are two or more dGPUs in
3093 * the system
3094 */
3095 if (mgpu_info.num_dgpu < 2)
3096 goto out;
3097
3098 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3099 gpu_ins = &(mgpu_info.gpu_ins[i]);
3100 adev = gpu_ins->adev;
3101 if (!(adev->flags & AMD_IS_APU) &&
3102 !gpu_ins->mgpu_fan_enabled) {
3103 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3104 if (ret)
3105 break;
3106
3107 gpu_ins->mgpu_fan_enabled = 1;
3108 }
3109 }
3110
3111 out:
3112 mutex_unlock(&mgpu_info.mutex);
3113
3114 return ret;
3115 }
3116
3117 /**
3118 * amdgpu_device_ip_late_init - run late init for hardware IPs
3119 *
3120 * @adev: amdgpu_device pointer
3121 *
3122 * Late initialization pass for hardware IPs. The list of all the hardware
3123 * IPs that make up the asic is walked and the late_init callbacks are run.
3124 * late_init covers any special initialization that an IP requires
3125 * after all of the IPs have been initialized or something that needs to happen
3126 * late in the init process.
3127 * Returns 0 on success, negative error code on failure.
3128 */
3129 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3130 {
3131 struct amdgpu_gpu_instance *gpu_instance;
3132 int i = 0, r;
3133
3134 for (i = 0; i < adev->num_ip_blocks; i++) {
3135 if (!adev->ip_blocks[i].status.hw)
3136 continue;
3137 if (adev->ip_blocks[i].version->funcs->late_init) {
3138 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
3139 if (r) {
3140 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3141 adev->ip_blocks[i].version->funcs->name, r);
3142 return r;
3143 }
3144 }
3145 adev->ip_blocks[i].status.late_initialized = true;
3146 }
3147
3148 r = amdgpu_ras_late_init(adev);
3149 if (r) {
3150 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3151 return r;
3152 }
3153
3154 if (!amdgpu_in_reset(adev))
3155 amdgpu_ras_set_error_query_ready(adev, true);
3156
3157 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3158 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3159
3160 amdgpu_device_fill_reset_magic(adev);
3161
3162 r = amdgpu_device_enable_mgpu_fan_boost();
3163 if (r)
3164 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3165
3166 /* For passthrough configuration on arcturus and aldebaran, enable special handling of SBR */
3167 if (amdgpu_passthrough(adev) &&
3168 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3169 adev->asic_type == CHIP_ALDEBARAN))
3170 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3171
3172 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3173 mutex_lock(&mgpu_info.mutex);
3174
3175 /*
3176 * Reset device p-state to low as this was booted with high.
3177 *
3178 * This should be performed only after all devices from the same
3179 * hive get initialized.
3180 *
3181 * However, the number of devices in the hive is not known in advance,
3182 * as it is counted one by one during device initialization.
3183 *
3184 * So, we wait for all XGMI interlinked devices initialized.
3185 * This may bring some delays as those devices may come from
3186 * different hives. But that should be OK.
3187 */
3188 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3189 for (i = 0; i < mgpu_info.num_gpu; i++) {
3190 gpu_instance = &(mgpu_info.gpu_ins[i]);
3191 if (gpu_instance->adev->flags & AMD_IS_APU)
3192 continue;
3193
3194 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3195 AMDGPU_XGMI_PSTATE_MIN);
3196 if (r) {
3197 DRM_ERROR("pstate setting failed (%d).\n", r);
3198 break;
3199 }
3200 }
3201 }
3202
3203 mutex_unlock(&mgpu_info.mutex);
3204 }
3205
3206 return 0;
3207 }
3208
3209 /**
3210 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3211 *
3212 * @adev: amdgpu_device pointer
3213 *
3214 * For ASICs that need to disable the SMC first
3215 */
3216 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3217 {
3218 int i, r;
3219
3220 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3221 return;
3222
3223 for (i = 0; i < adev->num_ip_blocks; i++) {
3224 if (!adev->ip_blocks[i].status.hw)
3225 continue;
3226 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3227 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3228 /* XXX handle errors */
3229 if (r) {
3230 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3231 adev->ip_blocks[i].version->funcs->name, r);
3232 }
3233 adev->ip_blocks[i].status.hw = false;
3234 break;
3235 }
3236 }
3237 }
3238
3239 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3240 {
3241 int i, r;
3242
3243 for (i = 0; i < adev->num_ip_blocks; i++) {
3244 if (!adev->ip_blocks[i].version->funcs->early_fini)
3245 continue;
3246
3247 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
3248 if (r) {
3249 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3250 adev->ip_blocks[i].version->funcs->name, r);
3251 }
3252 }
3253
3254 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3255 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3256
3257 amdgpu_amdkfd_suspend(adev, false);
3258
3259 /* Workaround for ASICs that need to disable the SMC first */
3260 amdgpu_device_smu_fini_early(adev);
3261
3262 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3263 if (!adev->ip_blocks[i].status.hw)
3264 continue;
3265
3266 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3267 /* XXX handle errors */
3268 if (r) {
3269 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3270 adev->ip_blocks[i].version->funcs->name, r);
3271 }
3272
3273 adev->ip_blocks[i].status.hw = false;
3274 }
3275
3276 if (amdgpu_sriov_vf(adev)) {
3277 if (amdgpu_virt_release_full_gpu(adev, false))
3278 DRM_ERROR("failed to release exclusive mode on fini\n");
3279 }
3280
3281 return 0;
3282 }
3283
3284 /**
3285 * amdgpu_device_ip_fini - run fini for hardware IPs
3286 *
3287 * @adev: amdgpu_device pointer
3288 *
3289 * Main teardown pass for hardware IPs. The list of all the hardware
3290 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3291 * are run. hw_fini tears down the hardware associated with each IP
3292 * and sw_fini tears down any software state associated with each IP.
3293 * Returns 0 on success, negative error code on failure.
3294 */
3295 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3296 {
3297 int i, r;
3298
3299 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3300 amdgpu_virt_release_ras_err_handler_data(adev);
3301
3302 if (adev->gmc.xgmi.num_physical_nodes > 1)
3303 amdgpu_xgmi_remove_device(adev);
3304
3305 amdgpu_amdkfd_device_fini_sw(adev);
3306
3307 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3308 if (!adev->ip_blocks[i].status.sw)
3309 continue;
3310
3311 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3312 amdgpu_ucode_free_bo(adev);
3313 amdgpu_free_static_csa(&adev->virt.csa_obj);
3314 amdgpu_device_wb_fini(adev);
3315 amdgpu_device_mem_scratch_fini(adev);
3316 amdgpu_ib_pool_fini(adev);
3317 amdgpu_seq64_fini(adev);
3318 }
3319
3320 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
3321 /* XXX handle errors */
3322 if (r) {
3323 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3324 adev->ip_blocks[i].version->funcs->name, r);
3325 }
3326 adev->ip_blocks[i].status.sw = false;
3327 adev->ip_blocks[i].status.valid = false;
3328 }
3329
3330 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3331 if (!adev->ip_blocks[i].status.late_initialized)
3332 continue;
3333 if (adev->ip_blocks[i].version->funcs->late_fini)
3334 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
3335 adev->ip_blocks[i].status.late_initialized = false;
3336 }
3337
3338 amdgpu_ras_fini(adev);
3339
3340 return 0;
3341 }
3342
3343 /**
3344 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3345 *
3346 * @work: work_struct.
3347 */
3348 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3349 {
3350 struct amdgpu_device *adev =
3351 container_of(work, struct amdgpu_device, delayed_init_work.work);
3352 int r;
3353
3354 r = amdgpu_ib_ring_tests(adev);
3355 if (r)
3356 DRM_ERROR("ib ring test failed (%d).\n", r);
3357 }
3358
3359 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3360 {
3361 struct amdgpu_device *adev =
3362 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3363
3364 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3365 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3366
3367 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
3368 adev->gfx.gfx_off_state = true;
3369 }
3370
3371 /**
3372 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3373 *
3374 * @adev: amdgpu_device pointer
3375 *
3376 * Main suspend function for hardware IPs. The list of all the hardware
3377 * IPs that make up the asic is walked, clockgating is disabled and the
3378 * suspend callbacks are run. suspend puts the hardware and software state
3379 * in each IP into a state suitable for suspend.
3380 * Returns 0 on success, negative error code on failure.
3381 */
3382 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3383 {
3384 int i, r;
3385
3386 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3387 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3388
3389 /*
3390 * Per the PMFW team's suggestion, the driver needs to handle disabling
3391 * the gfxoff and df cstate features for the gpu reset (e.g. Mode1Reset)
3392 * scenario. Add the missing df cstate disablement here.
3393 */
3394 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3395 dev_warn(adev->dev, "Failed to disallow df cstate");
3396
3397 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3398 if (!adev->ip_blocks[i].status.valid)
3399 continue;
3400
3401 /* displays are handled separately */
3402 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3403 continue;
3404
3405 /* XXX handle errors */
3406 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3407 /* XXX handle errors */
3408 if (r) {
3409 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3410 adev->ip_blocks[i].version->funcs->name, r);
3411 return r;
3412 }
3413
3414 adev->ip_blocks[i].status.hw = false;
3415 }
3416
3417 return 0;
3418 }
3419
3420 /**
3421 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3422 *
3423 * @adev: amdgpu_device pointer
3424 *
3425 * Main suspend function for hardware IPs. The list of all the hardware
3426 * IPs that make up the asic is walked, clockgating is disabled and the
3427 * suspend callbacks are run. suspend puts the hardware and software state
3428 * in each IP into a state suitable for suspend.
3429 * Returns 0 on success, negative error code on failure.
3430 */
3431 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3432 {
3433 int i, r;
3434
3435 if (adev->in_s0ix)
3436 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3437
3438 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3439 if (!adev->ip_blocks[i].status.valid)
3440 continue;
3441 /* displays are handled in phase1 */
3442 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3443 continue;
3444 /* PSP lost connection when err_event_athub occurs */
3445 if (amdgpu_ras_intr_triggered() &&
3446 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3447 adev->ip_blocks[i].status.hw = false;
3448 continue;
3449 }
3450
3451 /* skip unnecessary suspend if we have not initialized them yet */
3452 if (adev->gmc.xgmi.pending_reset &&
3453 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3454 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3455 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3456 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3457 adev->ip_blocks[i].status.hw = false;
3458 continue;
3459 }
3460
3461 /* skip suspend of gfx/mes and psp for S0ix
3462 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3463 * like at runtime. PSP is also part of the always on hardware
3464 * so no need to suspend it.
3465 */
3466 if (adev->in_s0ix &&
3467 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3468 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3469 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3470 continue;
3471
3472 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3473 if (adev->in_s0ix &&
3474 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3475 IP_VERSION(5, 0, 0)) &&
3476 (adev->ip_blocks[i].version->type ==
3477 AMD_IP_BLOCK_TYPE_SDMA))
3478 continue;
3479
3480 /* During cold boot, PSP provides the IMU and RLC FW binaries to TOS.
3481 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3482 * from this location, and RLC Autoload is also automatically loaded
3483 * from here based on the PMFW -> PSP message during the re-init sequence.
3484 * Therefore, the psp suspend & resume should be skipped to avoid destroying
3485 * the TMR and reloading FWs again for IMU enabled APU ASICs.
3486 */
3487 if (amdgpu_in_reset(adev) &&
3488 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3489 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3490 continue;
3491
3492 /* XXX handle errors */
3493 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3494 /* XXX handle errors */
3495 if (r) {
3496 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3497 adev->ip_blocks[i].version->funcs->name, r);
3498 }
3499 adev->ip_blocks[i].status.hw = false;
3500 /* handle putting the SMC in the appropriate state */
3501 if (!amdgpu_sriov_vf(adev)) {
3502 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3503 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3504 if (r) {
3505 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3506 adev->mp1_state, r);
3507 return r;
3508 }
3509 }
3510 }
3511 }
3512
3513 return 0;
3514 }
3515
3516 /**
3517 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3518 *
3519 * @adev: amdgpu_device pointer
3520 *
3521 * Main suspend function for hardware IPs. The list of all the hardware
3522 * IPs that make up the asic is walked, clockgating is disabled and the
3523 * suspend callbacks are run. suspend puts the hardware and software state
3524 * in each IP into a state suitable for suspend.
3525 * Returns 0 on success, negative error code on failure.
3526 */
3527 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3528 {
3529 int r;
3530
3531 if (amdgpu_sriov_vf(adev)) {
3532 amdgpu_virt_fini_data_exchange(adev);
3533 amdgpu_virt_request_full_gpu(adev, false);
3534 }
3535
3536 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3537
3538 r = amdgpu_device_ip_suspend_phase1(adev);
3539 if (r)
3540 return r;
3541 r = amdgpu_device_ip_suspend_phase2(adev);
3542
3543 if (amdgpu_sriov_vf(adev))
3544 amdgpu_virt_release_full_gpu(adev, false);
3545
3546 return r;
3547 }
3548
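/**
 * amdgpu_device_ip_reinit_early_sriov - reinit early hardware IPs after a VF FLR
 *
 * @adev: amdgpu_device pointer
 *
 * Re-initializes the COMMON, GMC, PSP and IH blocks in that fixed order by
 * running their hw_init callbacks again. Used by the SR-IOV reset path to
 * bring these blocks back before firmware loading and before the remaining
 * IPs are re-initialized in amdgpu_device_ip_reinit_late_sriov().
 * Returns 0 on success, negative error code on failure.
 */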
3549 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3550 {
3551 int i, r;
3552
3553 static enum amd_ip_block_type ip_order[] = {
3554 AMD_IP_BLOCK_TYPE_COMMON,
3555 AMD_IP_BLOCK_TYPE_GMC,
3556 AMD_IP_BLOCK_TYPE_PSP,
3557 AMD_IP_BLOCK_TYPE_IH,
3558 };
3559
3560 for (i = 0; i < adev->num_ip_blocks; i++) {
3561 int j;
3562 struct amdgpu_ip_block *block;
3563
3564 block = &adev->ip_blocks[i];
3565 block->status.hw = false;
3566
3567 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3568
3569 if (block->version->type != ip_order[j] ||
3570 !block->status.valid)
3571 continue;
3572
3573 r = block->version->funcs->hw_init(adev);
3574 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3575 if (r)
3576 return r;
3577 block->status.hw = true;
3578 }
3579 }
3580
3581 return 0;
3582 }
3583
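/**
 * amdgpu_device_ip_reinit_late_sriov - reinit remaining hardware IPs after a VF FLR
 *
 * @adev: amdgpu_device pointer
 *
 * Re-initializes the remaining blocks (SMC, DCE, GFX, SDMA, MES and the
 * multimedia engines) in a fixed order for the SR-IOV reset path. The SMC
 * block uses its resume callback, all other blocks use hw_init.
 * Returns 0 on success, negative error code on failure.
 */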
3584 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3585 {
3586 int i, r;
3587
3588 static enum amd_ip_block_type ip_order[] = {
3589 AMD_IP_BLOCK_TYPE_SMC,
3590 AMD_IP_BLOCK_TYPE_DCE,
3591 AMD_IP_BLOCK_TYPE_GFX,
3592 AMD_IP_BLOCK_TYPE_SDMA,
3593 AMD_IP_BLOCK_TYPE_MES,
3594 AMD_IP_BLOCK_TYPE_UVD,
3595 AMD_IP_BLOCK_TYPE_VCE,
3596 AMD_IP_BLOCK_TYPE_VCN,
3597 AMD_IP_BLOCK_TYPE_JPEG
3598 };
3599
3600 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3601 int j;
3602 struct amdgpu_ip_block *block;
3603
3604 for (j = 0; j < adev->num_ip_blocks; j++) {
3605 block = &adev->ip_blocks[j];
3606
3607 if (block->version->type != ip_order[i] ||
3608 !block->status.valid ||
3609 block->status.hw)
3610 continue;
3611
3612 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3613 r = block->version->funcs->resume(adev);
3614 else
3615 r = block->version->funcs->hw_init(adev);
3616
3617 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3618 if (r)
3619 return r;
3620 block->status.hw = true;
3621 }
3622 }
3623
3624 return 0;
3625 }
3626
3627 /**
3628 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3629 *
3630 * @adev: amdgpu_device pointer
3631 *
3632 * First resume function for hardware IPs. The list of all the hardware
3633 * IPs that make up the asic is walked and the resume callbacks are run for
3634 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3635 * after a suspend and updates the software state as necessary. This
3636 * function is also used for restoring the GPU after a GPU reset.
3637 * Returns 0 on success, negative error code on failure.
3638 */
3639 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3640 {
3641 int i, r;
3642
3643 for (i = 0; i < adev->num_ip_blocks; i++) {
3644 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3645 continue;
3646 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3647 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3648 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3649 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3650
3651 r = adev->ip_blocks[i].version->funcs->resume(adev);
3652 if (r) {
3653 DRM_ERROR("resume of IP block <%s> failed %d\n",
3654 adev->ip_blocks[i].version->funcs->name, r);
3655 return r;
3656 }
3657 adev->ip_blocks[i].status.hw = true;
3658 }
3659 }
3660
3661 return 0;
3662 }
3663
3664 /**
3665 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3666 *
3667 * @adev: amdgpu_device pointer
3668 *
3669 * Second resume function for hardware IPs. The list of all the hardware
3670 * IPs that make up the asic is walked and the resume callbacks are run for
3671 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3672 * functional state after a suspend and updates the software state as
3673 * necessary. This function is also used for restoring the GPU after a GPU
3674 * reset.
3675 * Returns 0 on success, negative error code on failure.
3676 */
3677 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3678 {
3679 int i, r;
3680
3681 for (i = 0; i < adev->num_ip_blocks; i++) {
3682 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3683 continue;
3684 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3685 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3686 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3687 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3688 continue;
3689 r = adev->ip_blocks[i].version->funcs->resume(adev);
3690 if (r) {
3691 DRM_ERROR("resume of IP block <%s> failed %d\n",
3692 adev->ip_blocks[i].version->funcs->name, r);
3693 return r;
3694 }
3695 adev->ip_blocks[i].status.hw = true;
3696 }
3697
3698 return 0;
3699 }
3700
3701 /**
3702 * amdgpu_device_ip_resume - run resume for hardware IPs
3703 *
3704 * @adev: amdgpu_device pointer
3705 *
3706 * Main resume function for hardware IPs. The hardware IPs
3707 * are split into two resume functions because they are
3708 * also used in recovering from a GPU reset and some additional
3709 * steps need to be taken between them. In this case (S3/S4) they are
3710 * run sequentially.
3711 * Returns 0 on success, negative error code on failure.
3712 */
3713 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3714 {
3715 int r;
3716
3717 r = amdgpu_device_ip_resume_phase1(adev);
3718 if (r)
3719 return r;
3720
3721 r = amdgpu_device_fw_loading(adev);
3722 if (r)
3723 return r;
3724
3725 r = amdgpu_device_ip_resume_phase2(adev);
3726
3727 if (adev->mman.buffer_funcs_ring->sched.ready)
3728 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3729
3730 return r;
3731 }
3732
3733 /**
3734 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3735 *
3736 * @adev: amdgpu_device pointer
3737 *
3738 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3739 */
3740 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3741 {
3742 if (amdgpu_sriov_vf(adev)) {
3743 if (adev->is_atom_fw) {
3744 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3745 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3746 } else {
3747 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3748 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3749 }
3750
3751 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3752 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3753 }
3754 }
3755
3756 /**
3757 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3758 *
3759 * @asic_type: AMD asic type
3760 *
3761 * Check if there is DC (new modesetting infrastructure) support for an asic.
3762 * Returns true if DC has support, false if not.
3763 */
3764 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3765 {
3766 switch (asic_type) {
3767 #ifdef CONFIG_DRM_AMDGPU_SI
3768 case CHIP_HAINAN:
3769 #endif
3770 case CHIP_TOPAZ:
3771 /* chips with no display hardware */
3772 return false;
3773 #if defined(CONFIG_DRM_AMD_DC)
3774 case CHIP_TAHITI:
3775 case CHIP_PITCAIRN:
3776 case CHIP_VERDE:
3777 case CHIP_OLAND:
3778 /*
3779 * We have systems in the wild with these ASICs that require
3780 * LVDS and VGA support which is not supported with DC.
3781 *
3782 * Fallback to the non-DC driver here by default so as not to
3783 * cause regressions.
3784 */
3785 #if defined(CONFIG_DRM_AMD_DC_SI)
3786 return amdgpu_dc > 0;
3787 #else
3788 return false;
3789 #endif
3790 case CHIP_BONAIRE:
3791 case CHIP_KAVERI:
3792 case CHIP_KABINI:
3793 case CHIP_MULLINS:
3794 /*
3795 * We have systems in the wild with these ASICs that require
3796 * VGA support which is not supported with DC.
3797 *
3798 * Fallback to the non-DC driver here by default so as not to
3799 * cause regressions.
3800 */
3801 return amdgpu_dc > 0;
3802 default:
3803 return amdgpu_dc != 0;
3804 #else
3805 default:
3806 if (amdgpu_dc > 0)
3807 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3808 return false;
3809 #endif
3810 }
3811 }
3812
3813 /**
3814 * amdgpu_device_has_dc_support - check if dc is supported
3815 *
3816 * @adev: amdgpu_device pointer
3817 *
3818 * Returns true for supported, false for not supported
3819 */
3820 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3821 {
3822 if (adev->enable_virtual_display ||
3823 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3824 return false;
3825
3826 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3827 }
3828
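/**
 * amdgpu_device_xgmi_reset_func - per-device XGMI reset work handler
 *
 * @__work: work struct embedded in the amdgpu_device
 *
 * Executes the ASIC reset for one node of an XGMI hive. A task barrier is
 * used so that all nodes of the hive enter (and, for BACO, exit) the reset
 * closely together.
 */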
3829 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3830 {
3831 struct amdgpu_device *adev =
3832 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3833 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3834
3835 /* It's a bug to not have a hive within this function */
3836 if (WARN_ON(!hive))
3837 return;
3838
3839 /*
3840 * Use task barrier to synchronize all xgmi reset works across the
3841 * hive. task_barrier_enter and task_barrier_exit will block
3842 * until all the threads running the xgmi reset works reach
3843 * those points. task_barrier_full will do both blocks.
3844 */
3845 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3846
3847 task_barrier_enter(&hive->tb);
3848 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3849
3850 if (adev->asic_reset_res)
3851 goto fail;
3852
3853 task_barrier_exit(&hive->tb);
3854 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3855
3856 if (adev->asic_reset_res)
3857 goto fail;
3858
3859 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
3860 } else {
3861
3862 task_barrier_full(&hive->tb);
3863 adev->asic_reset_res = amdgpu_asic_reset(adev);
3864 }
3865
3866 fail:
3867 if (adev->asic_reset_res)
3868 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3869 adev->asic_reset_res, adev_to_drm(adev)->unique);
3870 amdgpu_put_xgmi_hive(hive);
3871 }
3872
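/**
 * amdgpu_device_get_job_timeout_settings - parse the lockup_timeout parameter
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the comma separated list of timeouts (in ms) from the lockup_timeout
 * module parameter in the order gfx, compute, sdma, video; e.g. a hypothetical
 * "amdgpu.lockup_timeout=10000,60000,10000,10000". A value of 0 keeps the
 * default, a negative value disables the timeout, and a single value applies
 * to all non-compute queues.
 * Returns 0 on success, negative error code on a malformed value.
 */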
3873 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3874 {
3875 char *input = amdgpu_lockup_timeout;
3876 char *timeout_setting = NULL;
3877 int index = 0;
3878 long timeout;
3879 int ret = 0;
3880
3881 /*
3882 * By default the timeout for non-compute jobs is 10000 ms
3883 * and 60000 ms for compute jobs.
3884 * In SR-IOV or passthrough mode, the timeout for compute
3885 * jobs is 60000 ms by default.
3886 */
3887 adev->gfx_timeout = msecs_to_jiffies(10000);
3888 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3889 if (amdgpu_sriov_vf(adev))
3890 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3891 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3892 else
3893 adev->compute_timeout = msecs_to_jiffies(60000);
3894
3895 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3896 while ((timeout_setting = strsep(&input, ",")) &&
3897 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3898 ret = kstrtol(timeout_setting, 0, &timeout);
3899 if (ret)
3900 return ret;
3901
3902 if (timeout == 0) {
3903 index++;
3904 continue;
3905 } else if (timeout < 0) {
3906 timeout = MAX_SCHEDULE_TIMEOUT;
3907 dev_warn(adev->dev, "lockup timeout disabled");
3908 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3909 } else {
3910 timeout = msecs_to_jiffies(timeout);
3911 }
3912
3913 switch (index++) {
3914 case 0:
3915 adev->gfx_timeout = timeout;
3916 break;
3917 case 1:
3918 adev->compute_timeout = timeout;
3919 break;
3920 case 2:
3921 adev->sdma_timeout = timeout;
3922 break;
3923 case 3:
3924 adev->video_timeout = timeout;
3925 break;
3926 default:
3927 break;
3928 }
3929 }
3930 /*
3931 * There is only one value specified and
3932 * it should apply to all non-compute jobs.
3933 */
3934 if (index == 1) {
3935 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3936 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3937 adev->compute_timeout = adev->gfx_timeout;
3938 }
3939 }
3940
3941 return ret;
3942 }
3943
3944 /**
3945 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3946 *
3947 * @adev: amdgpu_device pointer
3948 *
3949 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode.
3950 */
3951 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3952 {
3953 struct iommu_domain *domain;
3954
3955 domain = iommu_get_domain_for_dev(adev->dev);
3956 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3957 adev->ram_is_direct_mapped = true;
3958 }
3959
3960 #if defined(CONFIG_HSA_AMD_P2P)
3961 /**
3962 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
3963 *
3964 * @adev: amdgpu_device pointer
3965 *
3966 * Returns true if the IOMMU remaps the device BAR/DMA addresses, false otherwise.
3967 */
3968 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
3969 {
3970 struct iommu_domain *domain;
3971
3972 domain = iommu_get_domain_for_dev(adev->dev);
3973 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
3974 domain->type == IOMMU_DOMAIN_DMA_FQ))
3975 return true;
3976
3977 return false;
3978 }
3979 #endif
3980
3981 static const struct attribute *amdgpu_dev_attributes[] = {
3982 &dev_attr_pcie_replay_count.attr,
3983 NULL
3984 };
3985
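/**
 * amdgpu_device_set_mcbp - configure mid command buffer preemption
 *
 * @adev: amdgpu_device pointer
 *
 * Honors the amdgpu_mcbp module parameter (1 = force on, 0 = force off) and
 * unconditionally enables MCBP for SR-IOV VFs.
 */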
3986 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3987 {
3988 if (amdgpu_mcbp == 1)
3989 adev->gfx.mcbp = true;
3990 else if (amdgpu_mcbp == 0)
3991 adev->gfx.mcbp = false;
3992
3993 if (amdgpu_sriov_vf(adev))
3994 adev->gfx.mcbp = true;
3995
3996 if (adev->gfx.mcbp)
3997 DRM_INFO("MCBP is enabled\n");
3998 }
3999
4000 /**
4001 * amdgpu_device_init - initialize the driver
4002 *
4003 * @adev: amdgpu_device pointer
4004 * @flags: driver flags
4005 *
4006 * Initializes the driver info and hw (all asics).
4007 * Returns 0 for success or an error on failure.
4008 * Called at driver startup.
4009 */
4010 int amdgpu_device_init(struct amdgpu_device *adev,
4011 uint32_t flags)
4012 {
4013 struct drm_device *ddev = adev_to_drm(adev);
4014 struct pci_dev *pdev = adev->pdev;
4015 int r, i;
4016 bool px = false;
4017 u32 max_MBps;
4018 int tmp;
4019
4020 adev->shutdown = false;
4021 adev->flags = flags;
4022
4023 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4024 adev->asic_type = amdgpu_force_asic_type;
4025 else
4026 adev->asic_type = flags & AMD_ASIC_MASK;
4027
4028 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4029 if (amdgpu_emu_mode == 1)
4030 adev->usec_timeout *= 10;
4031 adev->gmc.gart_size = 512 * 1024 * 1024;
4032 adev->accel_working = false;
4033 adev->num_rings = 0;
4034 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4035 adev->mman.buffer_funcs = NULL;
4036 adev->mman.buffer_funcs_ring = NULL;
4037 adev->vm_manager.vm_pte_funcs = NULL;
4038 adev->vm_manager.vm_pte_num_scheds = 0;
4039 adev->gmc.gmc_funcs = NULL;
4040 adev->harvest_ip_mask = 0x0;
4041 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4042 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4043
4044 adev->smc_rreg = &amdgpu_invalid_rreg;
4045 adev->smc_wreg = &amdgpu_invalid_wreg;
4046 adev->pcie_rreg = &amdgpu_invalid_rreg;
4047 adev->pcie_wreg = &amdgpu_invalid_wreg;
4048 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4049 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4050 adev->pciep_rreg = &amdgpu_invalid_rreg;
4051 adev->pciep_wreg = &amdgpu_invalid_wreg;
4052 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4053 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4054 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4055 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4056 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4057 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4058 adev->didt_rreg = &amdgpu_invalid_rreg;
4059 adev->didt_wreg = &amdgpu_invalid_wreg;
4060 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4061 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4062 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4063 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4064
4065 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4066 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4067 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4068
4069 /* mutex initializations are all done here so we
4070 * can call these functions again later without locking issues
4071 */
4072 mutex_init(&adev->firmware.mutex);
4073 mutex_init(&adev->pm.mutex);
4074 mutex_init(&adev->gfx.gpu_clock_mutex);
4075 mutex_init(&adev->srbm_mutex);
4076 mutex_init(&adev->gfx.pipe_reserve_mutex);
4077 mutex_init(&adev->gfx.gfx_off_mutex);
4078 mutex_init(&adev->gfx.partition_mutex);
4079 mutex_init(&adev->grbm_idx_mutex);
4080 mutex_init(&adev->mn_lock);
4081 mutex_init(&adev->virt.vf_errors.lock);
4082 mutex_init(&adev->virt.rlcg_reg_lock);
4083 hash_init(adev->mn_hash);
4084 mutex_init(&adev->psp.mutex);
4085 mutex_init(&adev->notifier_lock);
4086 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4087 mutex_init(&adev->benchmark_mutex);
4088 mutex_init(&adev->gfx.reset_sem_mutex);
4089 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4090 mutex_init(&adev->enforce_isolation_mutex);
4091 mutex_init(&adev->gfx.kfd_sch_mutex);
4092
4093 amdgpu_device_init_apu_flags(adev);
4094
4095 r = amdgpu_device_check_arguments(adev);
4096 if (r)
4097 return r;
4098
4099 spin_lock_init(&adev->mmio_idx_lock);
4100 spin_lock_init(&adev->smc_idx_lock);
4101 spin_lock_init(&adev->pcie_idx_lock);
4102 spin_lock_init(&adev->uvd_ctx_idx_lock);
4103 spin_lock_init(&adev->didt_idx_lock);
4104 spin_lock_init(&adev->gc_cac_idx_lock);
4105 spin_lock_init(&adev->se_cac_idx_lock);
4106 spin_lock_init(&adev->audio_endpt_idx_lock);
4107 spin_lock_init(&adev->mm_stats.lock);
4108 spin_lock_init(&adev->wb.lock);
4109
4110 INIT_LIST_HEAD(&adev->reset_list);
4111
4112 INIT_LIST_HEAD(&adev->ras_list);
4113
4114 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4115
4116 INIT_DELAYED_WORK(&adev->delayed_init_work,
4117 amdgpu_device_delayed_init_work_handler);
4118 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4119 amdgpu_device_delay_enable_gfx_off);
4120 /*
4121 * Initialize the enforce_isolation work structures for each XCP
4122 * partition. This work handler is responsible for enforcing shader
4123 * isolation on AMD GPUs. It counts the number of emitted fences for
4124 * each GFX and compute ring. If there are any fences, it schedules
4125 * the `enforce_isolation_work` to be run after a delay. If there are
4126 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4127 * runqueue.
4128 */
4129 for (i = 0; i < MAX_XCP; i++) {
4130 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4131 amdgpu_gfx_enforce_isolation_handler);
4132 adev->gfx.enforce_isolation[i].adev = adev;
4133 adev->gfx.enforce_isolation[i].xcp_id = i;
4134 }
4135
4136 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4137
4138 adev->gfx.gfx_off_req_count = 1;
4139 adev->gfx.gfx_off_residency = 0;
4140 adev->gfx.gfx_off_entrycount = 0;
4141 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4142
4143 atomic_set(&adev->throttling_logging_enabled, 1);
4144 /*
4145 * If throttling continues, logging will be performed every minute
4146 * to avoid log flooding. "-1" is subtracted since the thermal
4147 * throttling interrupt comes every second. Thus, the total logging
4148 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4149 * for throttling interrupt) = 60 seconds.
4150 */
4151 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4152 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4153
4154 /* Registers mapping */
4155 /* TODO: block userspace mapping of io register */
4156 if (adev->asic_type >= CHIP_BONAIRE) {
4157 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4158 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4159 } else {
4160 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4161 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4162 }
4163
4164 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4165 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4166
4167 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4168 if (!adev->rmmio)
4169 return -ENOMEM;
4170
4171 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4172 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4173
4174 /*
4175 * The reset domain needs to be present early, before any XGMI hive is
4176 * discovered and initialized, so the reset sem and in_gpu_reset flag can
4177 * be used early on during init and before calling RREG32.
4178 */
4179 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4180 if (!adev->reset_domain)
4181 return -ENOMEM;
4182
4183 /* detect hw virtualization here */
4184 amdgpu_detect_virtualization(adev);
4185
4186 amdgpu_device_get_pcie_info(adev);
4187
4188 r = amdgpu_device_get_job_timeout_settings(adev);
4189 if (r) {
4190 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4191 return r;
4192 }
4193
4194 amdgpu_device_set_mcbp(adev);
4195
4196 /* early init functions */
4197 r = amdgpu_device_ip_early_init(adev);
4198 if (r)
4199 return r;
4200
4201 /* Get rid of things like offb */
4202 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
4203 if (r)
4204 return r;
4205
4206 /* Enable TMZ based on IP_VERSION */
4207 amdgpu_gmc_tmz_set(adev);
4208
4209 if (amdgpu_sriov_vf(adev) &&
4210 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4211 /* VF MMIO access (except mailbox range) from CPU
4212 * will be blocked during sriov runtime
4213 */
4214 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4215
4216 amdgpu_gmc_noretry_set(adev);
4217 /* Need to get xgmi info early to decide the reset behavior*/
4218 if (adev->gmc.xgmi.supported) {
4219 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4220 if (r)
4221 return r;
4222 }
4223
4224 /* enable PCIE atomic ops */
4225 if (amdgpu_sriov_vf(adev)) {
4226 if (adev->virt.fw_reserve.p_pf2vf)
4227 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4228 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4229 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4230 /* APUs with gfx9 onwards don't rely on PCIe atomics; the
4231 * internal path natively supports atomics, so set have_atomics_support to true.
4232 */
4233 } else if ((adev->flags & AMD_IS_APU) &&
4234 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4235 IP_VERSION(9, 0, 0))) {
4236 adev->have_atomics_support = true;
4237 } else {
4238 adev->have_atomics_support =
4239 !pci_enable_atomic_ops_to_root(adev->pdev,
4240 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4241 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4242 }
4243
4244 if (!adev->have_atomics_support)
4245 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
4246
4247 /* doorbell bar mapping and doorbell index init*/
4248 amdgpu_doorbell_init(adev);
4249
4250 if (amdgpu_emu_mode == 1) {
4251 /* post the asic on emulation mode */
4252 emu_soc_asic_init(adev);
4253 goto fence_driver_init;
4254 }
4255
4256 amdgpu_reset_init(adev);
4257
4258 /* detect if we are with an SRIOV vbios */
4259 if (adev->bios)
4260 amdgpu_device_detect_sriov_bios(adev);
4261
4262 /* check if we need to reset the asic
4263 * E.g., driver was not cleanly unloaded previously, etc.
4264 */
4265 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4266 if (adev->gmc.xgmi.num_physical_nodes) {
4267 dev_info(adev->dev, "Pending hive reset.\n");
4268 adev->gmc.xgmi.pending_reset = true;
4269 /* Only need to init necessary block for SMU to handle the reset */
4270 for (i = 0; i < adev->num_ip_blocks; i++) {
4271 if (!adev->ip_blocks[i].status.valid)
4272 continue;
4273 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
4274 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
4275 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
4276 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
4277 DRM_DEBUG("IP %s disabled for hw_init.\n",
4278 adev->ip_blocks[i].version->funcs->name);
4279 adev->ip_blocks[i].status.hw = true;
4280 }
4281 }
4282 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4283 !amdgpu_device_has_display_hardware(adev)) {
4284 r = psp_gpu_reset(adev);
4285 } else {
4286 tmp = amdgpu_reset_method;
4287 /* It should do a default reset when loading or reloading the driver,
4288 * regardless of the module parameter reset_method.
4289 */
4290 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4291 r = amdgpu_asic_reset(adev);
4292 amdgpu_reset_method = tmp;
4293 }
4294
4295 if (r) {
4296 dev_err(adev->dev, "asic reset on init failed\n");
4297 goto failed;
4298 }
4299 }
4300
4301 /* Post card if necessary */
4302 if (amdgpu_device_need_post(adev)) {
4303 if (!adev->bios) {
4304 dev_err(adev->dev, "no vBIOS found\n");
4305 r = -EINVAL;
4306 goto failed;
4307 }
4308 DRM_INFO("GPU posting now...\n");
4309 r = amdgpu_device_asic_init(adev);
4310 if (r) {
4311 dev_err(adev->dev, "gpu post error!\n");
4312 goto failed;
4313 }
4314 }
4315
4316 if (adev->bios) {
4317 if (adev->is_atom_fw) {
4318 /* Initialize clocks */
4319 r = amdgpu_atomfirmware_get_clock_info(adev);
4320 if (r) {
4321 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4322 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4323 goto failed;
4324 }
4325 } else {
4326 /* Initialize clocks */
4327 r = amdgpu_atombios_get_clock_info(adev);
4328 if (r) {
4329 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4330 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4331 goto failed;
4332 }
4333 /* init i2c buses */
4334 if (!amdgpu_device_has_dc_support(adev))
4335 amdgpu_atombios_i2c_init(adev);
4336 }
4337 }
4338
4339 fence_driver_init:
4340 /* Fence driver */
4341 r = amdgpu_fence_driver_sw_init(adev);
4342 if (r) {
4343 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4344 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4345 goto failed;
4346 }
4347
4348 /* init the mode config */
4349 drm_mode_config_init(adev_to_drm(adev));
4350
4351 r = amdgpu_device_ip_init(adev);
4352 if (r) {
4353 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4354 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4355 goto release_ras_con;
4356 }
4357
4358 amdgpu_fence_driver_hw_init(adev);
4359
4360 dev_info(adev->dev,
4361 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4362 adev->gfx.config.max_shader_engines,
4363 adev->gfx.config.max_sh_per_se,
4364 adev->gfx.config.max_cu_per_sh,
4365 adev->gfx.cu_info.number);
4366
4367 adev->accel_working = true;
4368
4369 amdgpu_vm_check_compute_bug(adev);
4370
4371 /* Initialize the buffer migration limit. */
4372 if (amdgpu_moverate >= 0)
4373 max_MBps = amdgpu_moverate;
4374 else
4375 max_MBps = 8; /* Allow 8 MB/s. */
4376 /* Get a log2 for easy divisions. */
4377 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4378
4379 /*
4380 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4381 * Otherwise the mgpu fan boost feature will be skipped because the
4382 * gpu instance count would be too low.
4383 */
4384 amdgpu_register_gpu_instance(adev);
4385
4386 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4387 * explicit gating rather than handling it automatically.
4388 */
4389 if (!adev->gmc.xgmi.pending_reset) {
4390 r = amdgpu_device_ip_late_init(adev);
4391 if (r) {
4392 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4393 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4394 goto release_ras_con;
4395 }
4396 /* must succeed. */
4397 amdgpu_ras_resume(adev);
4398 queue_delayed_work(system_wq, &adev->delayed_init_work,
4399 msecs_to_jiffies(AMDGPU_RESUME_MS));
4400 }
4401
4402 if (amdgpu_sriov_vf(adev)) {
4403 amdgpu_virt_release_full_gpu(adev, true);
4404 flush_delayed_work(&adev->delayed_init_work);
4405 }
4406
4407 /*
4408 * Place the sysfs registration after `late_init`, since some of the
4409 * operations performed in `late_init` might affect the creation of
4410 * the sysfs interfaces.
4411 */
4412 r = amdgpu_atombios_sysfs_init(adev);
4413 if (r)
4414 drm_err(&adev->ddev,
4415 "registering atombios sysfs failed (%d).\n", r);
4416
4417 r = amdgpu_pm_sysfs_init(adev);
4418 if (r)
4419 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4420
4421 r = amdgpu_ucode_sysfs_init(adev);
4422 if (r) {
4423 adev->ucode_sysfs_en = false;
4424 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4425 } else
4426 adev->ucode_sysfs_en = true;
4427
4428 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
4429 if (r)
4430 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4431
4432 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4433 if (r)
4434 dev_err(adev->dev,
4435 "Could not create amdgpu board attributes\n");
4436
4437 amdgpu_fru_sysfs_init(adev);
4438 amdgpu_reg_state_sysfs_init(adev);
4439
4440 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4441 r = amdgpu_pmu_init(adev);
4442 if (r)
4443 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4444
4445 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4446 if (amdgpu_device_cache_pci_state(adev->pdev))
4447 pci_restore_state(pdev);
4448
4449 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4450 /* this will fail for cards that aren't VGA class devices, just
4451 * ignore it
4452 */
4453 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4454 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4455
4456 px = amdgpu_device_supports_px(ddev);
4457
4458 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4459 apple_gmux_detect(NULL, NULL)))
4460 vga_switcheroo_register_client(adev->pdev,
4461 &amdgpu_switcheroo_ops, px);
4462
4463 if (px)
4464 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4465
4466 if (adev->gmc.xgmi.pending_reset)
4467 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
4468 msecs_to_jiffies(AMDGPU_RESUME_MS));
4469
4470 amdgpu_device_check_iommu_direct_map(adev);
4471
4472 return 0;
4473
4474 release_ras_con:
4475 if (amdgpu_sriov_vf(adev))
4476 amdgpu_virt_release_full_gpu(adev, true);
4477
4478 /* failed in exclusive mode due to timeout */
4479 if (amdgpu_sriov_vf(adev) &&
4480 !amdgpu_sriov_runtime(adev) &&
4481 amdgpu_virt_mmio_blocked(adev) &&
4482 !amdgpu_virt_wait_reset(adev)) {
4483 dev_err(adev->dev, "VF exclusive mode timeout\n");
4484 /* Don't send request since VF is inactive. */
4485 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4486 adev->virt.ops = NULL;
4487 r = -EAGAIN;
4488 }
4489 amdgpu_release_ras_context(adev);
4490
4491 failed:
4492 amdgpu_vf_error_trans_all(adev);
4493
4494 return r;
4495 }
4496
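/**
 * amdgpu_device_unmap_mmio - unmap all device MMIO ranges
 *
 * @adev: amdgpu_device pointer
 *
 * Invalidates all CPU mappings pointing to this device and unmaps the
 * doorbell, register and VRAM apertures. Called once the underlying DRM
 * device has been unplugged.
 */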
4497 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4498 {
4499
4500 /* Clear all CPU mappings pointing to this device */
4501 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4502
4503 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4504 amdgpu_doorbell_fini(adev);
4505
4506 iounmap(adev->rmmio);
4507 adev->rmmio = NULL;
4508 if (adev->mman.aper_base_kaddr)
4509 iounmap(adev->mman.aper_base_kaddr);
4510 adev->mman.aper_base_kaddr = NULL;
4511
4512 /* Memory manager related */
4513 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4514 arch_phys_wc_del(adev->gmc.vram_mtrr);
4515 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4516 }
4517 }
4518
4519 /**
4520 * amdgpu_device_fini_hw - tear down the driver
4521 *
4522 * @adev: amdgpu_device pointer
4523 *
4524 * Tear down the driver info (all asics).
4525 * Called at driver shutdown.
4526 */
4527 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4528 {
4529 dev_info(adev->dev, "amdgpu: finishing device.\n");
4530 flush_delayed_work(&adev->delayed_init_work);
4531
4532 if (adev->mman.initialized)
4533 drain_workqueue(adev->mman.bdev.wq);
4534 adev->shutdown = true;
4535
4536 /* make sure IB test finished before entering exclusive mode
4537 * to avoid preemption on IB test
4538 */
4539 if (amdgpu_sriov_vf(adev)) {
4540 amdgpu_virt_request_full_gpu(adev, false);
4541 amdgpu_virt_fini_data_exchange(adev);
4542 }
4543
4544 /* disable all interrupts */
4545 amdgpu_irq_disable_all(adev);
4546 if (adev->mode_info.mode_config_initialized) {
4547 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4548 drm_helper_force_disable_all(adev_to_drm(adev));
4549 else
4550 drm_atomic_helper_shutdown(adev_to_drm(adev));
4551 }
4552 amdgpu_fence_driver_hw_fini(adev);
4553
4554 if (adev->pm.sysfs_initialized)
4555 amdgpu_pm_sysfs_fini(adev);
4556 if (adev->ucode_sysfs_en)
4557 amdgpu_ucode_sysfs_fini(adev);
4558 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4559 amdgpu_fru_sysfs_fini(adev);
4560
4561 amdgpu_reg_state_sysfs_fini(adev);
4562
4563 /* disable ras feature must before hw fini */
4564 amdgpu_ras_pre_fini(adev);
4565
4566 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4567
4568 amdgpu_device_ip_fini_early(adev);
4569
4570 amdgpu_irq_fini_hw(adev);
4571
4572 if (adev->mman.initialized)
4573 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4574
4575 amdgpu_gart_dummy_page_fini(adev);
4576
4577 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4578 amdgpu_device_unmap_mmio(adev);
4579
4580 }
4581
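/**
 * amdgpu_device_fini_sw - tear down driver software state
 *
 * @adev: amdgpu_device pointer
 *
 * Software counterpart of amdgpu_device_fini_hw: finalizes the IP blocks,
 * releases firmware, BIOS and FRU data, unregisters the VGA and
 * vga_switcheroo clients and frees the reset domain.
 */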
4582 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4583 {
4584 int idx;
4585 bool px;
4586
4587 amdgpu_fence_driver_sw_fini(adev);
4588 amdgpu_device_ip_fini(adev);
4589 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4590 adev->accel_working = false;
4591 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4592
4593 amdgpu_reset_fini(adev);
4594
4595 /* free i2c buses */
4596 if (!amdgpu_device_has_dc_support(adev))
4597 amdgpu_i2c_fini(adev);
4598
4599 if (amdgpu_emu_mode != 1)
4600 amdgpu_atombios_fini(adev);
4601
4602 kfree(adev->bios);
4603 adev->bios = NULL;
4604
4605 kfree(adev->fru_info);
4606 adev->fru_info = NULL;
4607
4608 px = amdgpu_device_supports_px(adev_to_drm(adev));
4609
4610 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4611 apple_gmux_detect(NULL, NULL)))
4612 vga_switcheroo_unregister_client(adev->pdev);
4613
4614 if (px)
4615 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4616
4617 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4618 vga_client_unregister(adev->pdev);
4619
4620 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4621
4622 iounmap(adev->rmmio);
4623 adev->rmmio = NULL;
4624 amdgpu_doorbell_fini(adev);
4625 drm_dev_exit(idx);
4626 }
4627
4628 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4629 amdgpu_pmu_fini(adev);
4630 if (adev->mman.discovery_bin)
4631 amdgpu_discovery_fini(adev);
4632
4633 amdgpu_reset_put_reset_domain(adev->reset_domain);
4634 adev->reset_domain = NULL;
4635
4636 kfree(adev->pci_state);
4637
4638 }
4639
4640 /**
4641 * amdgpu_device_evict_resources - evict device resources
4642 * @adev: amdgpu device object
4643 *
4644 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4645 * of the vram memory type. Mainly used for evicting device resources
4646 * at suspend time.
4647 *
4648 */
4649 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4650 {
4651 int ret;
4652
4653 /* No need to evict vram on APUs for suspend to ram or s2idle */
4654 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4655 return 0;
4656
4657 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4658 if (ret)
4659 DRM_WARN("evicting device resources failed\n");
4660 return ret;
4661 }
4662
4663 /*
4664 * Suspend & resume.
4665 */
4666 /**
4667 * amdgpu_device_prepare - prepare for device suspend
4668 *
4669 * @dev: drm dev pointer
4670 *
4671 * Prepare to put the hw in the suspend state (all asics).
4672 * Returns 0 for success or an error on failure.
4673 * Called at driver suspend.
4674 */
4675 int amdgpu_device_prepare(struct drm_device *dev)
4676 {
4677 struct amdgpu_device *adev = drm_to_adev(dev);
4678 int i, r;
4679
4680 amdgpu_choose_low_power_state(adev);
4681
4682 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4683 return 0;
4684
4685 /* Evict the majority of BOs before starting suspend sequence */
4686 r = amdgpu_device_evict_resources(adev);
4687 if (r)
4688 goto unprepare;
4689
4690 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4691
4692 for (i = 0; i < adev->num_ip_blocks; i++) {
4693 if (!adev->ip_blocks[i].status.valid)
4694 continue;
4695 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4696 continue;
4697 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4698 if (r)
4699 goto unprepare;
4700 }
4701
4702 return 0;
4703
4704 unprepare:
4705 adev->in_s0ix = adev->in_s3 = false;
4706
4707 return r;
4708 }
4709
4710 /**
4711 * amdgpu_device_suspend - initiate device suspend
4712 *
4713 * @dev: drm dev pointer
4714 * @fbcon: notify the fbdev of suspend
4715 *
4716 * Puts the hw in the suspend state (all asics).
4717 * Returns 0 for success or an error on failure.
4718 * Called at driver suspend.
4719 */
4720 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4721 {
4722 struct amdgpu_device *adev = drm_to_adev(dev);
4723 int r = 0;
4724
4725 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4726 return 0;
4727
4728 adev->in_suspend = true;
4729
4730 if (amdgpu_sriov_vf(adev)) {
4731 amdgpu_virt_fini_data_exchange(adev);
4732 r = amdgpu_virt_request_full_gpu(adev, false);
4733 if (r)
4734 return r;
4735 }
4736
4737 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4738 DRM_WARN("smart shift update failed\n");
4739
4740 if (fbcon)
4741 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4742
4743 cancel_delayed_work_sync(&adev->delayed_init_work);
4744
4745 amdgpu_ras_suspend(adev);
4746
4747 amdgpu_device_ip_suspend_phase1(adev);
4748
4749 if (!adev->in_s0ix)
4750 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4751
4752 r = amdgpu_device_evict_resources(adev);
4753 if (r)
4754 return r;
4755
4756 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4757
4758 amdgpu_fence_driver_hw_fini(adev);
4759
4760 amdgpu_device_ip_suspend_phase2(adev);
4761
4762 if (amdgpu_sriov_vf(adev))
4763 amdgpu_virt_release_full_gpu(adev, false);
4764
4765 r = amdgpu_dpm_notify_rlc_state(adev, false);
4766 if (r)
4767 return r;
4768
4769 return 0;
4770 }
4771
4772 /**
4773 * amdgpu_device_resume - initiate device resume
4774 *
4775 * @dev: drm dev pointer
4776 * @fbcon: notify the fbdev of resume
4777 *
4778 * Bring the hw back to operating state (all asics).
4779 * Returns 0 for success or an error on failure.
4780 * Called at driver resume.
4781 */
4782 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4783 {
4784 struct amdgpu_device *adev = drm_to_adev(dev);
4785 int r = 0;
4786
4787 if (amdgpu_sriov_vf(adev)) {
4788 r = amdgpu_virt_request_full_gpu(adev, true);
4789 if (r)
4790 return r;
4791 }
4792
4793 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4794 return 0;
4795
4796 if (adev->in_s0ix)
4797 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4798
4799 /* post card */
4800 if (amdgpu_device_need_post(adev)) {
4801 r = amdgpu_device_asic_init(adev);
4802 if (r)
4803 dev_err(adev->dev, "amdgpu asic init failed\n");
4804 }
4805
4806 r = amdgpu_device_ip_resume(adev);
4807
4808 if (r) {
4809 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4810 goto exit;
4811 }
4812 amdgpu_fence_driver_hw_init(adev);
4813
4814 if (!adev->in_s0ix) {
4815 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4816 if (r)
4817 goto exit;
4818 }
4819
4820 r = amdgpu_device_ip_late_init(adev);
4821 if (r)
4822 goto exit;
4823
4824 queue_delayed_work(system_wq, &adev->delayed_init_work,
4825 msecs_to_jiffies(AMDGPU_RESUME_MS));
4826 exit:
4827 if (amdgpu_sriov_vf(adev)) {
4828 amdgpu_virt_init_data_exchange(adev);
4829 amdgpu_virt_release_full_gpu(adev, true);
4830 }
4831
4832 if (r)
4833 return r;
4834
4835 /* Make sure IB tests flushed */
4836 flush_delayed_work(&adev->delayed_init_work);
4837
4838 if (fbcon)
4839 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4840
4841 amdgpu_ras_resume(adev);
4842
4843 if (adev->mode_info.num_crtc) {
4844 /*
4845 * Most of the connector probing functions try to acquire runtime pm
4846 * refs to ensure that the GPU is powered on when connector polling is
4847 * performed. Since we're calling this from a runtime PM callback,
4848 * trying to acquire rpm refs will cause us to deadlock.
4849 *
4850 * Since we're guaranteed to be holding the rpm lock, it's safe to
4851 * temporarily disable the rpm helpers so this doesn't deadlock us.
4852 */
4853 #ifdef CONFIG_PM
4854 dev->dev->power.disable_depth++;
4855 #endif
4856 if (!adev->dc_enabled)
4857 drm_helper_hpd_irq_event(dev);
4858 else
4859 drm_kms_helper_hotplug_event(dev);
4860 #ifdef CONFIG_PM
4861 dev->dev->power.disable_depth--;
4862 #endif
4863 }
4864 adev->in_suspend = false;
4865
4866 if (adev->enable_mes)
4867 amdgpu_mes_self_test(adev);
4868
4869 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4870 DRM_WARN("smart shift update failed\n");
4871
4872 return 0;
4873 }
4874
4875 /**
4876 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4877 *
4878 * @adev: amdgpu_device pointer
4879 *
4880 * The list of all the hardware IPs that make up the asic is walked and
4881 * the check_soft_reset callbacks are run. check_soft_reset determines
4882 * if the asic is still hung or not.
4883 * Returns true if any of the IPs are still in a hung state, false if not.
4884 */
4885 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4886 {
4887 int i;
4888 bool asic_hang = false;
4889
4890 if (amdgpu_sriov_vf(adev))
4891 return true;
4892
4893 if (amdgpu_asic_need_full_reset(adev))
4894 return true;
4895
4896 for (i = 0; i < adev->num_ip_blocks; i++) {
4897 if (!adev->ip_blocks[i].status.valid)
4898 continue;
4899 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4900 adev->ip_blocks[i].status.hang =
4901 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4902 if (adev->ip_blocks[i].status.hang) {
4903 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4904 asic_hang = true;
4905 }
4906 }
4907 return asic_hang;
4908 }
4909
4910 /**
4911 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4912 *
4913 * @adev: amdgpu_device pointer
4914 *
4915 * The list of all the hardware IPs that make up the asic is walked and the
4916 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4917 * handles any IP specific hardware or software state changes that are
4918 * necessary for a soft reset to succeed.
4919 * Returns 0 on success, negative error code on failure.
4920 */
4921 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4922 {
4923 int i, r = 0;
4924
4925 for (i = 0; i < adev->num_ip_blocks; i++) {
4926 if (!adev->ip_blocks[i].status.valid)
4927 continue;
4928 if (adev->ip_blocks[i].status.hang &&
4929 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4930 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4931 if (r)
4932 return r;
4933 }
4934 }
4935
4936 return 0;
4937 }
4938
4939 /**
4940 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4941 *
4942 * @adev: amdgpu_device pointer
4943 *
4944 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4945 * reset is necessary to recover.
4946 * Returns true if a full asic reset is required, false if not.
4947 */
4948 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4949 {
4950 int i;
4951
4952 if (amdgpu_asic_need_full_reset(adev))
4953 return true;
4954
4955 for (i = 0; i < adev->num_ip_blocks; i++) {
4956 if (!adev->ip_blocks[i].status.valid)
4957 continue;
4958 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4959 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4960 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4961 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4962 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4963 if (adev->ip_blocks[i].status.hang) {
4964 dev_info(adev->dev, "Some block need full reset!\n");
4965 return true;
4966 }
4967 }
4968 }
4969 return false;
4970 }
4971
4972 /**
4973 * amdgpu_device_ip_soft_reset - do a soft reset
4974 *
4975 * @adev: amdgpu_device pointer
4976 *
4977 * The list of all the hardware IPs that make up the asic is walked and the
4978 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4979 * IP specific hardware or software state changes that are necessary to soft
4980 * reset the IP.
4981 * Returns 0 on success, negative error code on failure.
4982 */
4983 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4984 {
4985 int i, r = 0;
4986
4987 for (i = 0; i < adev->num_ip_blocks; i++) {
4988 if (!adev->ip_blocks[i].status.valid)
4989 continue;
4990 if (adev->ip_blocks[i].status.hang &&
4991 adev->ip_blocks[i].version->funcs->soft_reset) {
4992 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4993 if (r)
4994 return r;
4995 }
4996 }
4997
4998 return 0;
4999 }
5000
5001 /**
5002 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5003 *
5004 * @adev: amdgpu_device pointer
5005 *
5006 * The list of all the hardware IPs that make up the asic is walked and the
5007 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5008 * handles any IP specific hardware or software state changes that are
5009 * necessary after the IP has been soft reset.
5010 * Returns 0 on success, negative error code on failure.
5011 */
5012 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5013 {
5014 int i, r = 0;
5015
5016 for (i = 0; i < adev->num_ip_blocks; i++) {
5017 if (!adev->ip_blocks[i].status.valid)
5018 continue;
5019 if (adev->ip_blocks[i].status.hang &&
5020 adev->ip_blocks[i].version->funcs->post_soft_reset)
5021 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
5022 if (r)
5023 return r;
5024 }
5025
5026 return 0;
5027 }
5028
5029 /**
5030 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5031 *
5032 * @adev: amdgpu_device pointer
5033 * @reset_context: amdgpu reset context pointer
5034 *
5035 * Do a VF FLR and reinitialize the ASIC.
5036 * Returns 0 on success, a negative error code otherwise.
5037 */
5038 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5039 struct amdgpu_reset_context *reset_context)
5040 {
5041 int r;
5042 struct amdgpu_hive_info *hive = NULL;
5043
5044 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5045 if (!amdgpu_ras_get_fed_status(adev))
5046 amdgpu_virt_ready_to_reset(adev);
5047 amdgpu_virt_wait_reset(adev);
5048 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5049 r = amdgpu_virt_request_full_gpu(adev, true);
5050 } else {
5051 r = amdgpu_virt_reset_gpu(adev);
5052 }
5053 if (r)
5054 return r;
5055
5056 amdgpu_ras_set_fed(adev, false);
5057 amdgpu_irq_gpu_reset_resume_helper(adev);
5058
5059 /* some SW cleanup the VF needs to do before recovery */
5060 amdgpu_virt_post_reset(adev);
5061
5062 /* Resume IP prior to SMC */
5063 r = amdgpu_device_ip_reinit_early_sriov(adev);
5064 if (r)
5065 return r;
5066
5067 amdgpu_virt_init_data_exchange(adev);
5068
5069 r = amdgpu_device_fw_loading(adev);
5070 if (r)
5071 return r;
5072
5073 /* now we are okay to resume SMC/CP/SDMA */
5074 r = amdgpu_device_ip_reinit_late_sriov(adev);
5075 if (r)
5076 return r;
5077
5078 hive = amdgpu_get_xgmi_hive(adev);
5079 /* Update PSP FW topology after reset */
5080 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5081 r = amdgpu_xgmi_update_topology(hive, adev);
5082 if (hive)
5083 amdgpu_put_xgmi_hive(hive);
5084 if (r)
5085 return r;
5086
5087 r = amdgpu_ib_ring_tests(adev);
5088 if (r)
5089 return r;
5090
5091 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5092 amdgpu_inc_vram_lost(adev);
5093
5094 /* need to be called during full access so we can't do it later like
5095 * bare-metal does.
5096 */
5097 amdgpu_amdkfd_post_reset(adev);
5098 amdgpu_virt_release_full_gpu(adev, true);
5099
5100 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5101 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5102 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5103 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5104 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5105 amdgpu_ras_resume(adev);
5106 return 0;
5107 }
5108
5109 /**
5110 * amdgpu_device_has_job_running - check if there is any job in the pending list
5111 *
5112 * @adev: amdgpu_device pointer
5113 *
5114 * Check if there is any job in the pending list of any ring.
5115 */
5116 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5117 {
5118 int i;
5119 struct drm_sched_job *job;
5120
5121 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5122 struct amdgpu_ring *ring = adev->rings[i];
5123
5124 if (!amdgpu_ring_sched_ready(ring))
5125 continue;
5126
5127 spin_lock(&ring->sched.job_list_lock);
5128 job = list_first_entry_or_null(&ring->sched.pending_list,
5129 struct drm_sched_job, list);
5130 spin_unlock(&ring->sched.job_list_lock);
5131 if (job)
5132 return true;
5133 }
5134 return false;
5135 }
5136
5137 /**
5138 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5139 *
5140 * @adev: amdgpu_device pointer
5141 *
5142 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5143 * a hung GPU.
5144 */
5145 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5146 {
5147
5148 if (amdgpu_gpu_recovery == 0)
5149 goto disabled;
5150
5151 /* Skip soft reset check in fatal error mode */
5152 if (!amdgpu_ras_is_poison_mode_supported(adev))
5153 return true;
5154
5155 if (amdgpu_sriov_vf(adev))
5156 return true;
5157
5158 if (amdgpu_gpu_recovery == -1) {
5159 switch (adev->asic_type) {
5160 #ifdef CONFIG_DRM_AMDGPU_SI
5161 case CHIP_VERDE:
5162 case CHIP_TAHITI:
5163 case CHIP_PITCAIRN:
5164 case CHIP_OLAND:
5165 case CHIP_HAINAN:
5166 #endif
5167 #ifdef CONFIG_DRM_AMDGPU_CIK
5168 case CHIP_KAVERI:
5169 case CHIP_KABINI:
5170 case CHIP_MULLINS:
5171 #endif
5172 case CHIP_CARRIZO:
5173 case CHIP_STONEY:
5174 case CHIP_CYAN_SKILLFISH:
5175 goto disabled;
5176 default:
5177 break;
5178 }
5179 }
5180
5181 return true;
5182
5183 disabled:
5184 dev_info(adev->dev, "GPU recovery disabled.\n");
5185 return false;
5186 }
5187
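/**
 * amdgpu_device_mode1_reset - perform a mode1 (whole ASIC) reset
 *
 * @adev: amdgpu_device pointer
 *
 * Caches the PCI config space, disables bus mastering and triggers a mode1
 * reset through the SMU if supported, otherwise through the PSP. Afterwards
 * the PCI state is restored and the function waits for the ASIC to come back.
 * Returns 0 on success, negative error code on failure.
 */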
5188 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5189 {
5190 u32 i;
5191 int ret = 0;
5192
5193 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5194
5195 dev_info(adev->dev, "GPU mode1 reset\n");
5196
5197 /* Cache the state before bus master disable. The saved config space
5198 * values are used in other cases like restore after mode-2 reset.
5199 */
5200 amdgpu_device_cache_pci_state(adev->pdev);
5201
5202 /* disable BM */
5203 pci_clear_master(adev->pdev);
5204
5205 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5206 dev_info(adev->dev, "GPU smu mode1 reset\n");
5207 ret = amdgpu_dpm_mode1_reset(adev);
5208 } else {
5209 dev_info(adev->dev, "GPU psp mode1 reset\n");
5210 ret = psp_gpu_reset(adev);
5211 }
5212
5213 if (ret)
5214 goto mode1_reset_failed;
5215
5216 amdgpu_device_load_pci_state(adev->pdev);
5217 ret = amdgpu_psp_wait_for_bootloader(adev);
5218 if (ret)
5219 goto mode1_reset_failed;
5220
5221 /* wait for asic to come out of reset */
5222 for (i = 0; i < adev->usec_timeout; i++) {
5223 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5224
5225 if (memsize != 0xffffffff)
5226 break;
5227 udelay(1);
5228 }
5229
5230 if (i >= adev->usec_timeout) {
5231 ret = -ETIMEDOUT;
5232 goto mode1_reset_failed;
5233 }
5234
5235 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5236
5237 return 0;
5238
5239 mode1_reset_failed:
5240 dev_err(adev->dev, "GPU mode1 reset failed\n");
5241 return ret;
5242 }
5243
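/**
 * amdgpu_device_pre_asic_reset - prepare a device for ASIC reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Clears and force-completes the hardware fences of all rings, then lets any
 * chip specific reset handler prepare the hardware context. On bare metal a
 * soft reset of the hung IPs is attempted first; if that fails or a full
 * reset is needed, the IPs are suspended and AMDGPU_NEED_FULL_RESET is set
 * in the reset context flags.
 * Returns 0 on success, negative error code on failure.
 */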
5244 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5245 struct amdgpu_reset_context *reset_context)
5246 {
5247 int i, r = 0;
5248 struct amdgpu_job *job = NULL;
5249 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5250 bool need_full_reset =
5251 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5252
5253 if (reset_context->reset_req_dev == adev)
5254 job = reset_context->job;
5255
5256 if (amdgpu_sriov_vf(adev))
5257 amdgpu_virt_pre_reset(adev);
5258
5259 amdgpu_fence_driver_isr_toggle(adev, true);
5260
5261 /* block all schedulers and reset given job's ring */
5262 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5263 struct amdgpu_ring *ring = adev->rings[i];
5264
5265 if (!amdgpu_ring_sched_ready(ring))
5266 continue;
5267
5268 /* Clear job fences from the fence driver to avoid force_completion
5269 * leaving NULL and vm flush fences in the fence driver
5270 */
5271 amdgpu_fence_driver_clear_job_fences(ring);
5272
5273 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5274 amdgpu_fence_driver_force_completion(ring);
5275 }
5276
5277 amdgpu_fence_driver_isr_toggle(adev, false);
5278
5279 if (job && job->vm)
5280 drm_sched_increase_karma(&job->base);
5281
5282 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5283 /* If reset handler not implemented, continue; otherwise return */
5284 if (r == -EOPNOTSUPP)
5285 r = 0;
5286 else
5287 return r;
5288
5289 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5290 if (!amdgpu_sriov_vf(adev)) {
5291
5292 if (!need_full_reset)
5293 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5294
5295 if (!need_full_reset && amdgpu_gpu_recovery &&
5296 amdgpu_device_ip_check_soft_reset(adev)) {
5297 amdgpu_device_ip_pre_soft_reset(adev);
5298 r = amdgpu_device_ip_soft_reset(adev);
5299 amdgpu_device_ip_post_soft_reset(adev);
5300 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5301 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5302 need_full_reset = true;
5303 }
5304 }
5305
5306 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5307 dev_info(tmp_adev->dev, "Dumping IP State\n");
5308 /* Trigger ip dump before we reset the asic */
5309 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5310 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5311 tmp_adev->ip_blocks[i].version->funcs
5312 ->dump_ip_state((void *)tmp_adev);
5313 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5314 }
5315
5316 if (need_full_reset)
5317 r = amdgpu_device_ip_suspend(adev);
5318 if (need_full_reset)
5319 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5320 else
5321 clear_bit(AMDGPU_NEED_FULL_RESET,
5322 &reset_context->flags);
5323 }
5324
5325 return r;
5326 }
5327
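/**
 * amdgpu_do_asic_reset - reset and re-initialize a list of devices
 *
 * @device_list_handle: list of devices to reset (a single device or an
 *                      entire XGMI hive)
 * @reset_context: amdgpu reset context pointer
 *
 * Tries the dedicated reset handler first; if none is implemented, falls
 * back to the default flow: reset all ASICs (in parallel for XGMI hives),
 * re-post the cards, resume the IP blocks, reload firmware and run the IB
 * ring tests.
 * Returns 0 on success, -EAGAIN if the caller should retry with a full
 * reset, or another negative error code on failure.
 */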
5328 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5329 struct amdgpu_reset_context *reset_context)
5330 {
5331 struct amdgpu_device *tmp_adev = NULL;
5332 bool need_full_reset, skip_hw_reset, vram_lost = false;
5333 int r = 0;
5334
5335 /* Try reset handler method first */
5336 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5337 reset_list);
5338
5339 reset_context->reset_device_list = device_list_handle;
5340 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5341 /* If reset handler not implemented, continue; otherwise return */
5342 if (r == -EOPNOTSUPP)
5343 r = 0;
5344 else
5345 return r;
5346
5347 /* Reset handler not implemented, use the default method */
5348 need_full_reset =
5349 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5350 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5351
5352 /*
5353 * ASIC reset has to be done on all XGMI hive nodes ASAP
5354 * to allow proper link negotiation in FW (within 1 sec)
5355 */
5356 if (!skip_hw_reset && need_full_reset) {
5357 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5358 /* For XGMI run all resets in parallel to speed up the process */
5359 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5360 tmp_adev->gmc.xgmi.pending_reset = false;
5361 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5362 r = -EALREADY;
5363 } else
5364 r = amdgpu_asic_reset(tmp_adev);
5365
5366 if (r) {
5367 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5368 r, adev_to_drm(tmp_adev)->unique);
5369 goto out;
5370 }
5371 }
5372
5373 /* For XGMI wait for all resets to complete before proceeding */
5374 if (!r) {
5375 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5376 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5377 flush_work(&tmp_adev->xgmi_reset_work);
5378 r = tmp_adev->asic_reset_res;
5379 if (r)
5380 break;
5381 }
5382 }
5383 }
5384 }
5385
5386 if (!r && amdgpu_ras_intr_triggered()) {
5387 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5388 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
5389 }
5390
5391 amdgpu_ras_intr_cleared();
5392 }
5393
5394 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5395 if (need_full_reset) {
5396 /* post card */
5397 amdgpu_ras_set_fed(tmp_adev, false);
5398 r = amdgpu_device_asic_init(tmp_adev);
5399 if (r) {
5400 dev_warn(tmp_adev->dev, "asic atom init failed!");
5401 } else {
5402 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5403
5404 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5405 if (r)
5406 goto out;
5407
5408 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5409
5410 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5411 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5412
5413 if (vram_lost) {
5414 DRM_INFO("VRAM is lost due to GPU reset!\n");
5415 amdgpu_inc_vram_lost(tmp_adev);
5416 }
5417
5418 r = amdgpu_device_fw_loading(tmp_adev);
5419 if (r)
5420 return r;
5421
5422 r = amdgpu_xcp_restore_partition_mode(
5423 tmp_adev->xcp_mgr);
5424 if (r)
5425 goto out;
5426
5427 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5428 if (r)
5429 goto out;
5430
5431 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5432 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5433
5434 if (vram_lost)
5435 amdgpu_device_fill_reset_magic(tmp_adev);
5436
5437 /*
5438 * Add this ASIC back as tracked now that the reset
5439 * has completed successfully.
5440 */
5441 amdgpu_register_gpu_instance(tmp_adev);
5442
5443 if (!reset_context->hive &&
5444 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5445 amdgpu_xgmi_add_device(tmp_adev);
5446
5447 r = amdgpu_device_ip_late_init(tmp_adev);
5448 if (r)
5449 goto out;
5450
5451 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5452
5453 /*
5454 * The GPU enters a bad state once the number of
5455 * faulty pages flagged by ECC reaches the threshold,
5456 * and RAS recovery is scheduled next. So add a check
5457 * here to abort recovery if the bad page threshold
5458 * is indeed exceeded, and remind the user to either
5459 * retire this GPU or set a bigger bad_page_threshold
5460 * value so this is fixed the next time the driver
5461 * is probed.
5462 */
5463 if (!amdgpu_ras_is_rma(tmp_adev)) {
5464 /* must succeed. */
5465 amdgpu_ras_resume(tmp_adev);
5466 } else {
5467 r = -EINVAL;
5468 goto out;
5469 }
5470
5471 /* Update PSP FW topology after reset */
5472 if (reset_context->hive &&
5473 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5474 r = amdgpu_xgmi_update_topology(
5475 reset_context->hive, tmp_adev);
5476 }
5477 }
5478
5479 out:
5480 if (!r) {
5481 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5482 r = amdgpu_ib_ring_tests(tmp_adev);
5483 if (r) {
5484 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5485 need_full_reset = true;
5486 r = -EAGAIN;
5487 goto end;
5488 }
5489 }
5490
5491 if (r)
5492 tmp_adev->asic_reset_res = r;
5493 }
5494
5495 end:
5496 if (need_full_reset)
5497 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5498 else
5499 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5500 return r;
5501 }
5502
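/*
 * Map the chosen reset method onto the MP1 (SMU) state expected during the
 * reset: mode1 implies an MP1 shutdown, mode2 an MP1 reset, and anything
 * else leaves the MP1 state at none.
 */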
5503 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5504 {
5505
5506 switch (amdgpu_asic_reset_method(adev)) {
5507 case AMD_RESET_METHOD_MODE1:
5508 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5509 break;
5510 case AMD_RESET_METHOD_MODE2:
5511 adev->mp1_state = PP_MP1_STATE_RESET;
5512 break;
5513 default:
5514 adev->mp1_state = PP_MP1_STATE_NONE;
5515 break;
5516 }
5517 }
5518
5519 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5520 {
5521 amdgpu_vf_error_trans_all(adev);
5522 adev->mp1_state = PP_MP1_STATE_NONE;
5523 }
5524
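/*
 * Re-enable runtime PM on the GPU's HDA audio function (PCI function 1 on
 * the same bus as the GPU) and resume it, undoing the forced suspend done
 * before the reset.
 */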
5525 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5526 {
5527 struct pci_dev *p = NULL;
5528
5529 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5530 adev->pdev->bus->number, 1);
5531 if (p) {
5532 pm_runtime_enable(&(p->dev));
5533 pm_runtime_resume(&(p->dev));
5534 }
5535
5536 pci_dev_put(p);
5537 }
5538
5539 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5540 {
5541 enum amd_reset_method reset_method;
5542 struct pci_dev *p = NULL;
5543 u64 expires;
5544
5545 /*
5546 * For now, only BACO and mode1 reset are confirmed
5547 * to suffer the audio issue if not properly suspended.
5548 */
5549 reset_method = amdgpu_asic_reset_method(adev);
5550 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5551 (reset_method != AMD_RESET_METHOD_MODE1))
5552 return -EINVAL;
5553
5554 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5555 adev->pdev->bus->number, 1);
5556 if (!p)
5557 return -ENODEV;
5558
5559 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5560 if (!expires)
5561 /*
5562 * If we cannot get the audio device autosuspend delay,
5563 * a fixed 4s interval will be used. Since 3s is the
5564 * audio controller's default autosuspend delay setting,
5565 * the 4s used here is guaranteed to cover it.
5566 */
5567 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5568
5569 while (!pm_runtime_status_suspended(&(p->dev))) {
5570 if (!pm_runtime_suspend(&(p->dev)))
5571 break;
5572
5573 if (expires < ktime_get_mono_fast_ns()) {
5574 dev_warn(adev->dev, "failed to suspend display audio\n");
5575 pci_dev_put(p);
5576 /* TODO: abort the succeeding gpu reset? */
5577 return -ETIMEDOUT;
5578 }
5579 }
5580
5581 pm_runtime_disable(&(p->dev));
5582
5583 pci_dev_put(p);
5584 return 0;
5585 }
5586
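/*
 * Cancel reset work queued from other sources (debugfs, KFD, SR-IOV FLR,
 * RAS recovery) so that stale reset requests do not fire again once the
 * schedulers are restarted.
 */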
5587 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5588 {
5589 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5590
5591 #if defined(CONFIG_DEBUG_FS)
5592 if (!amdgpu_sriov_vf(adev))
5593 cancel_work(&adev->reset_work);
5594 #endif
5595
5596 if (adev->kfd.dev)
5597 cancel_work(&adev->kfd.reset_work);
5598
5599 if (amdgpu_sriov_vf(adev))
5600 cancel_work(&adev->virt.flr_work);
5601
5602 if (con && adev->ras_enabled)
5603 cancel_work(&con->recovery_work);
5604
5605 }
5606
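/*
 * Verify that every device in the reset list is still reachable on the
 * bus by reading a config space register; an all-ones value means the
 * device has dropped off the bus and recovery cannot proceed.
 */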
5607 static int amdgpu_device_health_check(struct list_head *device_list_handle)
5608 {
5609 struct amdgpu_device *tmp_adev;
5610 int ret = 0;
5611 u32 status;
5612
5613 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5614 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5615 if (PCI_POSSIBLE_ERROR(status)) {
5616 dev_err(tmp_adev->dev, "device lost from bus!");
5617 ret = -ENODEV;
5618 }
5619 }
5620
5621 return ret;
5622 }
5623
5624 /**
5625 * amdgpu_device_gpu_recover - reset the ASIC and recover the scheduler
5626 *
5627 * @adev: amdgpu_device pointer
5628 * @job: the job that triggered the hang
5629 * @reset_context: amdgpu reset context pointer
5630 *
5631 * Attempt to reset the GPU if it has hung (all ASICs).
5632 * Attempt a soft reset or full reset and reinitialize the ASIC.
5633 * Returns 0 for success or an error on failure.
5634 */
5635
5636 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5637 struct amdgpu_job *job,
5638 struct amdgpu_reset_context *reset_context)
5639 {
5640 struct list_head device_list, *device_list_handle = NULL;
5641 bool job_signaled = false;
5642 struct amdgpu_hive_info *hive = NULL;
5643 struct amdgpu_device *tmp_adev = NULL;
5644 int i, r = 0;
5645 bool need_emergency_restart = false;
5646 bool audio_suspended = false;
5647 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5648
5649 /*
5650 * Special case: RAS triggered and full reset isn't supported
5651 */
5652 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5653
5654 /*
5655 * Flush RAM to disk so that after reboot
5656 * the user can read the log and see why the system rebooted.
5657 */
5658 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5659 amdgpu_ras_get_context(adev)->reboot) {
5660 DRM_WARN("Emergency reboot.");
5661
5662 ksys_sync_helper();
5663 emergency_restart();
5664 }
5665
5666 dev_info(adev->dev, "GPU %s begin!\n",
5667 need_emergency_restart ? "jobs stop":"reset");
5668
5669 if (!amdgpu_sriov_vf(adev))
5670 hive = amdgpu_get_xgmi_hive(adev);
5671 if (hive)
5672 mutex_lock(&hive->hive_lock);
5673
5674 reset_context->job = job;
5675 reset_context->hive = hive;
5676 /*
5677 * Build list of devices to reset.
5678 * In case we are in XGMI hive mode, reorder the device list
5679 * to put adev in the first position.
5680 */
5681 INIT_LIST_HEAD(&device_list);
5682 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5683 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5684 list_add_tail(&tmp_adev->reset_list, &device_list);
5685 if (adev->shutdown)
5686 tmp_adev->shutdown = true;
5687 }
5688 if (!list_is_first(&adev->reset_list, &device_list))
5689 list_rotate_to_front(&adev->reset_list, &device_list);
5690 device_list_handle = &device_list;
5691 } else {
5692 list_add_tail(&adev->reset_list, &device_list);
5693 device_list_handle = &device_list;
5694 }
5695
5696 if (!amdgpu_sriov_vf(adev)) {
5697 r = amdgpu_device_health_check(device_list_handle);
5698 if (r)
5699 goto end_reset;
5700 }
5701
5702 /* We need to lock reset domain only once both for XGMI and single device */
5703 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5704 reset_list);
5705 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5706
5707 /* block all schedulers and reset given job's ring */
5708 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5709
5710 amdgpu_device_set_mp1_state(tmp_adev);
5711
5712 /*
5713 * Try to put the audio codec into suspend state
5714 * before the gpu reset starts.
5715 *
5716 * This is because the power domain of the graphics
5717 * device is shared with the AZ power domain. Without
5718 * this, we may change the audio hardware behind the
5719 * audio driver's back, which will trigger some audio
5720 * codec errors.
5721 */
5722 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5723 audio_suspended = true;
5724
5725 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5726
5727 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5728
5729 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5730
5731 /*
5732 * Mark these ASICs to be reset as untracked first,
5733 * and add them back after the reset has completed.
5734 */
5735 amdgpu_unregister_gpu_instance(tmp_adev);
5736
5737 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5738
5739 /* disable ras on ALL IPs */
5740 if (!need_emergency_restart &&
5741 amdgpu_device_ip_need_full_reset(tmp_adev))
5742 amdgpu_ras_suspend(tmp_adev);
5743
5744 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5745 struct amdgpu_ring *ring = tmp_adev->rings[i];
5746
5747 if (!amdgpu_ring_sched_ready(ring))
5748 continue;
5749
5750 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5751
5752 if (need_emergency_restart)
5753 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5754 }
5755 atomic_inc(&tmp_adev->gpu_reset_counter);
5756 }
5757
5758 if (need_emergency_restart)
5759 goto skip_sched_resume;
5760
5761 /*
5762 * Must check guilty signal here since after this point all old
5763 * HW fences are force signaled.
5764 *
5765 * job->base holds a reference to parent fence
5766 */
5767 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5768 job_signaled = true;
5769 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5770 goto skip_hw_reset;
5771 }
5772
5773 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5774 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5775 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5776 /* TODO: Should we stop? */
5777 if (r) {
5778 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5779 r, adev_to_drm(tmp_adev)->unique);
5780 tmp_adev->asic_reset_res = r;
5781 }
5782 }
5783
5784 /* Actual ASIC resets if needed. */
5785 /* Host driver will handle XGMI hive reset for SRIOV */
5786 if (amdgpu_sriov_vf(adev)) {
5787 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
5788 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
5789 amdgpu_ras_set_fed(adev, true);
5790 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5791 }
5792
5793 r = amdgpu_device_reset_sriov(adev, reset_context);
5794 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
5795 amdgpu_virt_release_full_gpu(adev, true);
5796 goto retry;
5797 }
5798 if (r)
5799 adev->asic_reset_res = r;
5800 } else {
5801 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5802 if (r && r == -EAGAIN)
5803 goto retry;
5804 }
5805
5806 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5807 /*
5808 * Drop any pending non scheduler resets queued before reset is done.
5809 * Any reset scheduled after this point would be valid. Scheduler resets
5810 * were already dropped during drm_sched_stop and no new ones can come
5811 * in before drm_sched_start.
5812 */
5813 amdgpu_device_stop_pending_resets(tmp_adev);
5814 }
5815
5816 skip_hw_reset:
5817
5818 /* Post ASIC reset for all devs. */
5819 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5820
5821 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5822 struct amdgpu_ring *ring = tmp_adev->rings[i];
5823
5824 if (!amdgpu_ring_sched_ready(ring))
5825 continue;
5826
5827 drm_sched_start(&ring->sched);
5828 }
5829
5830 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5831 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5832
5833 if (tmp_adev->asic_reset_res)
5834 r = tmp_adev->asic_reset_res;
5835
5836 tmp_adev->asic_reset_res = 0;
5837
5838 if (r) {
5839 /* bad news, how do we tell it to userspace?
5840 * for a ras error, we should report the GPU's bad status instead
5841 * of a reset failure
5842 */
5843 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
5844 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
5845 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
5846 atomic_read(&tmp_adev->gpu_reset_counter));
5847 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5848 } else {
5849 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5850 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5851 DRM_WARN("smart shift update failed\n");
5852 }
5853 }
5854
5855 skip_sched_resume:
5856 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5857 /* unlock kfd: SRIOV would do it separately */
5858 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5859 amdgpu_amdkfd_post_reset(tmp_adev);
5860
5861 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5862 * so bring up kfd here if it was not initialized before
5863 */
5864 if (!adev->kfd.init_complete)
5865 amdgpu_amdkfd_device_init(adev);
5866
5867 if (audio_suspended)
5868 amdgpu_device_resume_display_audio(tmp_adev);
5869
5870 amdgpu_device_unset_mp1_state(tmp_adev);
5871
5872 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5873 }
5874
5875 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5876 reset_list);
5877 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5878
5879 end_reset:
5880 if (hive) {
5881 mutex_unlock(&hive->hive_lock);
5882 amdgpu_put_xgmi_hive(hive);
5883 }
5884
5885 if (r)
5886 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5887
5888 atomic_set(&adev->reset_domain->reset_res, r);
5889 return r;
5890 }
5891
5892 /**
5893 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
5894 *
5895 * @adev: amdgpu_device pointer
5896 * @speed: pointer to the speed of the link
5897 * @width: pointer to the width of the link
5898 *
5899 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
5900 * first physical partner to an AMD dGPU.
5901 * This will exclude any virtual switches and links.
5902 */
5903 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
5904 enum pci_bus_speed *speed,
5905 enum pcie_link_width *width)
5906 {
5907 struct pci_dev *parent = adev->pdev;
5908
5909 if (!speed || !width)
5910 return;
5911
5912 *speed = PCI_SPEED_UNKNOWN;
5913 *width = PCIE_LNK_WIDTH_UNKNOWN;
5914
5915 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
5916 while ((parent = pci_upstream_bridge(parent))) {
5917 /* skip upstream/downstream switches internal to the dGPU */
5918 if (parent->vendor == PCI_VENDOR_ID_ATI)
5919 continue;
5920 *speed = pcie_get_speed_cap(parent);
5921 *width = pcie_get_width_cap(parent);
5922 break;
5923 }
5924 } else {
5925 /* use the current speeds rather than max if switching is not supported */
5926 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
5927 }
5928 }
5929
5930 /**
5931 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5932 *
5933 * @adev: amdgpu_device pointer
5934 *
5935 * Fetches and stores in the driver the PCIe capabilities (gen speed
5936 * and lanes) of the slot the device is in. Handles APUs and
5937 * virtualized environments where PCIe config space may not be available.
5938 */
5939 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5940 {
5941 struct pci_dev *pdev;
5942 enum pci_bus_speed speed_cap, platform_speed_cap;
5943 enum pcie_link_width platform_link_width;
5944
5945 if (amdgpu_pcie_gen_cap)
5946 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5947
5948 if (amdgpu_pcie_lane_cap)
5949 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5950
5951 /* covers APUs as well */
5952 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5953 if (adev->pm.pcie_gen_mask == 0)
5954 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5955 if (adev->pm.pcie_mlw_mask == 0)
5956 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5957 return;
5958 }
5959
5960 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5961 return;
5962
5963 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
5964 &platform_link_width);
5965
5966 if (adev->pm.pcie_gen_mask == 0) {
5967 /* asic caps */
5968 pdev = adev->pdev;
5969 speed_cap = pcie_get_speed_cap(pdev);
5970 if (speed_cap == PCI_SPEED_UNKNOWN) {
5971 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5972 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5973 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5974 } else {
5975 if (speed_cap == PCIE_SPEED_32_0GT)
5976 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5977 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5978 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5979 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5980 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5981 else if (speed_cap == PCIE_SPEED_16_0GT)
5982 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5983 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5984 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5985 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5986 else if (speed_cap == PCIE_SPEED_8_0GT)
5987 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5988 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5989 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5990 else if (speed_cap == PCIE_SPEED_5_0GT)
5991 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5992 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5993 else
5994 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5995 }
5996 /* platform caps */
5997 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5998 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5999 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6000 } else {
6001 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6002 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6003 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6004 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6005 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6006 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6007 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6008 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6009 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6010 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6011 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6012 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6013 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6014 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6015 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6016 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6017 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6018 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6019 else
6020 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6021
6022 }
6023 }
6024 if (adev->pm.pcie_mlw_mask == 0) {
6025 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6026 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6027 } else {
6028 switch (platform_link_width) {
6029 case PCIE_LNK_X32:
6030 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6031 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6032 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6033 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6034 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6035 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6036 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6037 break;
6038 case PCIE_LNK_X16:
6039 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6040 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6041 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6042 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6043 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6044 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6045 break;
6046 case PCIE_LNK_X12:
6047 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6048 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6049 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6050 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6051 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6052 break;
6053 case PCIE_LNK_X8:
6054 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6055 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6056 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6057 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6058 break;
6059 case PCIE_LNK_X4:
6060 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6061 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6062 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6063 break;
6064 case PCIE_LNK_X2:
6065 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6066 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6067 break;
6068 case PCIE_LNK_X1:
6069 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6070 break;
6071 default:
6072 break;
6073 }
6074 }
6075 }
6076 }
6077
6078 /**
6079 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6080 *
6081 * @adev: amdgpu_device pointer
6082 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6083 *
6084 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6085 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6086 * @peer_adev.
6087 */
6088 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6089 struct amdgpu_device *peer_adev)
6090 {
6091 #ifdef CONFIG_HSA_AMD_P2P
6092 bool p2p_access =
6093 !adev->gmc.xgmi.connected_to_cpu &&
6094 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6095
6096 bool is_large_bar = adev->gmc.visible_vram_size &&
6097 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6098 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6099
6100 if (!p2p_addressable) {
6101 uint64_t address_mask = peer_adev->dev->dma_mask ?
6102 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6103 resource_size_t aper_limit =
6104 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6105
6106 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6107 aper_limit & address_mask);
6108 }
6109 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6110 #else
6111 return false;
6112 #endif
6113 }
6114
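/*
 * BACO (Bus Active, Chip Off) entry/exit helpers used for runtime power
 * management. When RAS is enabled, the NBIO doorbell interrupt is disabled
 * across the BACO sequence and restored on exit.
 */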
6115 int amdgpu_device_baco_enter(struct drm_device *dev)
6116 {
6117 struct amdgpu_device *adev = drm_to_adev(dev);
6118 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6119
6120 if (!amdgpu_device_supports_baco(dev))
6121 return -ENOTSUPP;
6122
6123 if (ras && adev->ras_enabled &&
6124 adev->nbio.funcs->enable_doorbell_interrupt)
6125 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6126
6127 return amdgpu_dpm_baco_enter(adev);
6128 }
6129
6130 int amdgpu_device_baco_exit(struct drm_device *dev)
6131 {
6132 struct amdgpu_device *adev = drm_to_adev(dev);
6133 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6134 int ret = 0;
6135
6136 if (!amdgpu_device_supports_baco(dev))
6137 return -ENOTSUPP;
6138
6139 ret = amdgpu_dpm_baco_exit(adev);
6140 if (ret)
6141 return ret;
6142
6143 if (ras && adev->ras_enabled &&
6144 adev->nbio.funcs->enable_doorbell_interrupt)
6145 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6146
6147 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6148 adev->nbio.funcs->clear_doorbell_interrupt)
6149 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6150
6151 return 0;
6152 }
6153
6154 /**
6155 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6156 * @pdev: PCI device struct
6157 * @state: PCI channel state
6158 *
6159 * Description: Called when a PCI error is detected.
6160 *
6161 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6162 */
6163 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6164 {
6165 struct drm_device *dev = pci_get_drvdata(pdev);
6166 struct amdgpu_device *adev = drm_to_adev(dev);
6167 int i;
6168
6169 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6170
6171 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6172 DRM_WARN("No support for XGMI hive yet...");
6173 return PCI_ERS_RESULT_DISCONNECT;
6174 }
6175
6176 adev->pci_channel_state = state;
6177
6178 switch (state) {
6179 case pci_channel_io_normal:
6180 return PCI_ERS_RESULT_CAN_RECOVER;
6181 /* Fatal error, prepare for slot reset */
6182 case pci_channel_io_frozen:
6183 /*
6184 * Locking adev->reset_domain->sem will prevent any external access
6185 * to GPU during PCI error recovery
6186 */
6187 amdgpu_device_lock_reset_domain(adev->reset_domain);
6188 amdgpu_device_set_mp1_state(adev);
6189
6190 /*
6191 * Block any work scheduling as we do for regular GPU reset
6192 * for the duration of the recovery
6193 */
6194 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6195 struct amdgpu_ring *ring = adev->rings[i];
6196
6197 if (!amdgpu_ring_sched_ready(ring))
6198 continue;
6199
6200 drm_sched_stop(&ring->sched, NULL);
6201 }
6202 atomic_inc(&adev->gpu_reset_counter);
6203 return PCI_ERS_RESULT_NEED_RESET;
6204 case pci_channel_io_perm_failure:
6205 /* Permanent error, prepare for device removal */
6206 return PCI_ERS_RESULT_DISCONNECT;
6207 }
6208
6209 return PCI_ERS_RESULT_NEED_RESET;
6210 }
6211
6212 /**
6213 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6214 * @pdev: pointer to PCI device
6215 */
6216 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6217 {
6218
6219 DRM_INFO("PCI error: mmio enabled callback!!\n");
6220
6221 /* TODO - dump whatever for debugging purposes */
6222
6223 /* This is called only if amdgpu_pci_error_detected returns
6224 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6225 * works, no need to reset slot.
6226 */
6227
6228 return PCI_ERS_RESULT_RECOVERED;
6229 }
6230
6231 /**
6232 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6233 * @pdev: PCI device struct
6234 *
6235 * Description: This routine is called by the pci error recovery
6236 * code after the PCI slot has been reset, just before we
6237 * should resume normal operations.
6238 */
6239 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6240 {
6241 struct drm_device *dev = pci_get_drvdata(pdev);
6242 struct amdgpu_device *adev = drm_to_adev(dev);
6243 int r, i;
6244 struct amdgpu_reset_context reset_context;
6245 u32 memsize;
6246 struct list_head device_list;
6247
6248 /* PCI error slot reset should be skipped during RAS recovery */
6249 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6250 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6251 amdgpu_ras_in_recovery(adev))
6252 return PCI_ERS_RESULT_RECOVERED;
6253
6254 DRM_INFO("PCI error: slot reset callback!!\n");
6255
6256 memset(&reset_context, 0, sizeof(reset_context));
6257
6258 INIT_LIST_HEAD(&device_list);
6259 list_add_tail(&adev->reset_list, &device_list);
6260
6261 /* wait for asic to come out of reset */
6262 msleep(500);
6263
6264 /* Restore PCI confspace */
6265 amdgpu_device_load_pci_state(pdev);
6266
6267 /* confirm ASIC came out of reset */
6268 for (i = 0; i < adev->usec_timeout; i++) {
6269 memsize = amdgpu_asic_get_config_memsize(adev);
6270
6271 if (memsize != 0xffffffff)
6272 break;
6273 udelay(1);
6274 }
6275 if (memsize == 0xffffffff) {
6276 r = -ETIME;
6277 goto out;
6278 }
6279
6280 reset_context.method = AMD_RESET_METHOD_NONE;
6281 reset_context.reset_req_dev = adev;
6282 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6283 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6284
6285 adev->no_hw_access = true;
6286 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6287 adev->no_hw_access = false;
6288 if (r)
6289 goto out;
6290
6291 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6292
6293 out:
6294 if (!r) {
6295 if (amdgpu_device_cache_pci_state(adev->pdev))
6296 pci_restore_state(adev->pdev);
6297
6298 DRM_INFO("PCIe error recovery succeeded\n");
6299 } else {
6300 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6301 amdgpu_device_unset_mp1_state(adev);
6302 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6303 }
6304
6305 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6306 }
6307
6308 /**
6309 * amdgpu_pci_resume() - resume normal ops after PCI reset
6310 * @pdev: pointer to PCI device
6311 *
6312 * Called when the error recovery driver tells us that its
6313 * OK to resume normal operation.
6314 */
6315 void amdgpu_pci_resume(struct pci_dev *pdev)
6316 {
6317 struct drm_device *dev = pci_get_drvdata(pdev);
6318 struct amdgpu_device *adev = drm_to_adev(dev);
6319 int i;
6320
6321
6322 DRM_INFO("PCI error: resume callback!!\n");
6323
6324 /* Only continue execution for the case of pci_channel_io_frozen */
6325 if (adev->pci_channel_state != pci_channel_io_frozen)
6326 return;
6327
6328 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6329 struct amdgpu_ring *ring = adev->rings[i];
6330
6331 if (!amdgpu_ring_sched_ready(ring))
6332 continue;
6333
6334 drm_sched_start(&ring->sched);
6335 }
6336
6337 amdgpu_device_unset_mp1_state(adev);
6338 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6339 }
6340
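/*
 * Save/restore helpers for the GPU's PCI config space. The cached state is
 * captured before a reset (see amdgpu_device_mode1_reset()) and loaded back
 * afterwards so the device returns with a sane configuration.
 */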
6341 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6342 {
6343 struct drm_device *dev = pci_get_drvdata(pdev);
6344 struct amdgpu_device *adev = drm_to_adev(dev);
6345 int r;
6346
6347 r = pci_save_state(pdev);
6348 if (!r) {
6349 kfree(adev->pci_state);
6350
6351 adev->pci_state = pci_store_saved_state(pdev);
6352
6353 if (!adev->pci_state) {
6354 DRM_ERROR("Failed to store PCI saved state");
6355 return false;
6356 }
6357 } else {
6358 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6359 return false;
6360 }
6361
6362 return true;
6363 }
6364
6365 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6366 {
6367 struct drm_device *dev = pci_get_drvdata(pdev);
6368 struct amdgpu_device *adev = drm_to_adev(dev);
6369 int r;
6370
6371 if (!adev->pci_state)
6372 return false;
6373
6374 r = pci_load_saved_state(pdev, adev->pci_state);
6375
6376 if (!r) {
6377 pci_restore_state(pdev);
6378 } else {
6379 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6380 return false;
6381 }
6382
6383 return true;
6384 }
6385
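/*
 * HDP (Host Data Path) flush/invalidate helpers. The flush is skipped on
 * APUs (unless running as a passthrough guest) and on GPUs whose memory is
 * directly connected to the CPU; otherwise it is emitted on the given ring
 * when the ring supports it, or performed through the ASIC callback.
 */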
6386 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6387 struct amdgpu_ring *ring)
6388 {
6389 #ifdef CONFIG_X86_64
6390 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6391 return;
6392 #endif
6393 if (adev->gmc.xgmi.connected_to_cpu)
6394 return;
6395
6396 if (ring && ring->funcs->emit_hdp_flush)
6397 amdgpu_ring_emit_hdp_flush(ring);
6398 else
6399 amdgpu_asic_flush_hdp(adev, ring);
6400 }
6401
6402 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6403 struct amdgpu_ring *ring)
6404 {
6405 #ifdef CONFIG_X86_64
6406 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6407 return;
6408 #endif
6409 if (adev->gmc.xgmi.connected_to_cpu)
6410 return;
6411
6412 amdgpu_asic_invalidate_hdp(adev, ring);
6413 }
6414
6415 int amdgpu_in_reset(struct amdgpu_device *adev)
6416 {
6417 return atomic_read(&adev->reset_domain->in_gpu_reset);
6418 }
6419
6420 /**
6421 * amdgpu_device_halt() - bring hardware to some kind of halt state
6422 *
6423 * @adev: amdgpu_device pointer
6424 *
6425 * Bring hardware to some kind of halt state so that no one can touch it
6426 * any more. It helps to maintain the error context when an error has occurred.
6427 * Compared to a simple hang, the system will stay stable at least for SSH
6428 * access. Then it should be trivial to inspect the hardware state and
6429 * see what's going on. Implemented as follows:
6430 *
6431 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
6432 * clears all CPU mappings to the device, disallows remappings through page faults
6433 * 2. amdgpu_irq_disable_all() disables all interrupts
6434 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6435 * 4. set adev->no_hw_access to avoid potential crashes after step 5
6436 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6437 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6438 * flush any in-flight DMA operations
6439 */
6440 void amdgpu_device_halt(struct amdgpu_device *adev)
6441 {
6442 struct pci_dev *pdev = adev->pdev;
6443 struct drm_device *ddev = adev_to_drm(adev);
6444
6445 amdgpu_xcp_dev_unplug(adev);
6446 drm_dev_unplug(ddev);
6447
6448 amdgpu_irq_disable_all(adev);
6449
6450 amdgpu_fence_driver_hw_fini(adev);
6451
6452 adev->no_hw_access = true;
6453
6454 amdgpu_device_unmap_mmio(adev);
6455
6456 pci_disable_device(pdev);
6457 pci_wait_for_pending_transaction(pdev);
6458 }
6459
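/*
 * Indirect PCIe port register accessors: the register offset is written to
 * the NBIO index register and the value is then read or written through the
 * data register, with the index/data pair kept atomic under pcie_idx_lock.
 */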
6460 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6461 u32 reg)
6462 {
6463 unsigned long flags, address, data;
6464 u32 r;
6465
6466 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6467 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6468
6469 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6470 WREG32(address, reg * 4);
6471 (void)RREG32(address);
6472 r = RREG32(data);
6473 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6474 return r;
6475 }
6476
6477 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6478 u32 reg, u32 v)
6479 {
6480 unsigned long flags, address, data;
6481
6482 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6483 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6484
6485 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6486 WREG32(address, reg * 4);
6487 (void)RREG32(address);
6488 WREG32(data, v);
6489 (void)RREG32(data);
6490 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6491 }
6492
6493 /**
6494 * amdgpu_device_get_gang - return a reference to the current gang
6495 * @adev: amdgpu_device pointer
6496 *
6497 * Returns: A new reference to the current gang leader.
6498 */
6499 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
6500 {
6501 struct dma_fence *fence;
6502
6503 rcu_read_lock();
6504 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
6505 rcu_read_unlock();
6506 return fence;
6507 }
6508
6509 /**
6510 * amdgpu_device_switch_gang - switch to a new gang
6511 * @adev: amdgpu_device pointer
6512 * @gang: the gang to switch to
6513 *
6514 * Try to switch to a new gang.
6515 * Returns: NULL if we switched to the new gang or a reference to the current
6516 * gang leader.
6517 */
6518 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6519 struct dma_fence *gang)
6520 {
6521 struct dma_fence *old = NULL;
6522
6523 do {
6524 dma_fence_put(old);
6525 old = amdgpu_device_get_gang(adev);
6526 if (old == gang)
6527 break;
6528
6529 if (!dma_fence_is_signaled(old))
6530 return old;
6531
6532 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6533 old, gang) != old);
6534
6535 dma_fence_put(old);
6536 return NULL;
6537 }
6538
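/*
 * Report whether the ASIC has any display hardware at all, either from the
 * hardcoded lists of older chips below or, for IP-discovery based parts,
 * from the presence of a non-harvested DCE/DCN block.
 */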
6539 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6540 {
6541 switch (adev->asic_type) {
6542 #ifdef CONFIG_DRM_AMDGPU_SI
6543 case CHIP_HAINAN:
6544 #endif
6545 case CHIP_TOPAZ:
6546 /* chips with no display hardware */
6547 return false;
6548 #ifdef CONFIG_DRM_AMDGPU_SI
6549 case CHIP_TAHITI:
6550 case CHIP_PITCAIRN:
6551 case CHIP_VERDE:
6552 case CHIP_OLAND:
6553 #endif
6554 #ifdef CONFIG_DRM_AMDGPU_CIK
6555 case CHIP_BONAIRE:
6556 case CHIP_HAWAII:
6557 case CHIP_KAVERI:
6558 case CHIP_KABINI:
6559 case CHIP_MULLINS:
6560 #endif
6561 case CHIP_TONGA:
6562 case CHIP_FIJI:
6563 case CHIP_POLARIS10:
6564 case CHIP_POLARIS11:
6565 case CHIP_POLARIS12:
6566 case CHIP_VEGAM:
6567 case CHIP_CARRIZO:
6568 case CHIP_STONEY:
6569 /* chips with display hardware */
6570 return true;
6571 default:
6572 /* IP discovery */
6573 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
6574 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6575 return false;
6576 return true;
6577 }
6578 }
6579
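/*
 * Poll @reg_addr until (value & mask) == expected_value. The timeout is
 * restarted whenever the register value changes, so only a register that
 * is genuinely stuck produces the timeout warning.
 *
 * Illustrative (hypothetical) usage; real callers pass their own
 * IP-specific register, name and mask:
 *
 *    r = amdgpu_device_wait_on_rreg(adev, 0, mmFOO_STATUS, "FOO_STATUS",
 *                                   FOO_STATUS__READY_MASK,
 *                                   FOO_STATUS__READY_MASK);
 *    if (r)
 *            return r;
 */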
6580 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6581 uint32_t inst, uint32_t reg_addr, char reg_name[],
6582 uint32_t expected_value, uint32_t mask)
6583 {
6584 uint32_t ret = 0;
6585 uint32_t old_ = 0;
6586 uint32_t tmp_ = RREG32(reg_addr);
6587 uint32_t loop = adev->usec_timeout;
6588
6589 while ((tmp_ & (mask)) != (expected_value)) {
6590 if (old_ != tmp_) {
6591 loop = adev->usec_timeout;
6592 old_ = tmp_;
6593 } else
6594 udelay(1);
6595 tmp_ = RREG32(reg_addr);
6596 loop--;
6597 if (!loop) {
6598 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
6599 inst, reg_name, (uint32_t)expected_value,
6600 (uint32_t)(tmp_ & (mask)));
6601 ret = -ETIMEDOUT;
6602 break;
6603 }
6604 }
6605 return ret;
6606 }
6607