xref: /linux/drivers/gpu/drm/xe/xe_survivability_mode.c (revision 4f9786035f9e519db41375818e1d0b5f20da2f10)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2025 Intel Corporation
4  */
5 
6 #include "xe_survivability_mode.h"
7 #include "xe_survivability_mode_types.h"
8 
9 #include <linux/kobject.h>
10 #include <linux/pci.h>
11 #include <linux/sysfs.h>
12 
13 #include "xe_device.h"
14 #include "xe_gt.h"
15 #include "xe_heci_gsc.h"
16 #include "xe_mmio.h"
17 #include "xe_pcode_api.h"
18 #include "xe_vsec.h"
19 
20 #define MAX_SCRATCH_MMIO 8 /* entries in survivability->info, one per PCODE scratch reg */
21 
22 /**
23  * DOC: Xe Boot Survivability
24  *
25  * Boot Survivability is a software-based workflow for recovering a system in a failed boot state.
26  * Here system recoverability is concerned with recovering the firmware responsible for boot.
27  *
28  * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
29  * to be flashed through mei and collect telemetry. The driver's probe flow is modified
30  * such that it enters survivability mode when pcode initialization is incomplete and boot status
31  * denotes a failure. The driver then populates the survivability_mode PCI sysfs indicating
32  * survivability mode and provides additional information required for debug.
33  *
34  * The KMD exposes the following admin-only readable sysfs file in survivability mode:
35  *
36  * device/survivability_mode: The presence of this file indicates that the card is in survivability
37  *			      mode. Also, provides additional information on why the driver entered
38  *			      survivability mode.
39  *
40  *			      Capability Information - Provides boot status
41  *			      Postcode Information   - Provides information about the failure
42  *			      Overflow Information   - Provides history of previous failures
43  *			      Auxiliary Information  - Certain failures may have information in
44  *						       addition to postcode information
45  */
46 
47 static u32 aux_history_offset(u32 reg_value)
48 {
49 	return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);
50 }
51 
52 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
53 				   int id, char *name)
54 {
55 	strscpy(info[id].name, name, sizeof(info[id].name));
56 	info[id].reg = PCODE_SCRATCH(id).raw;
57 	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
58 }
59 
60 static void populate_survivability_info(struct xe_device *xe)
61 {
62 	struct xe_survivability *survivability = &xe->survivability;
63 	struct xe_survivability_info *info = survivability->info;
64 	struct xe_mmio *mmio;
65 	u32 id = 0, reg_value;
66 	char name[NAME_MAX];
67 	int index;
68 
69 	mmio = xe_root_tile_mmio(xe);
70 	set_survivability_info(mmio, info, id, "Capability Info");
71 	reg_value = info[id].value;
72 
73 	if (reg_value & HISTORY_TRACKING) {
74 		id++;
75 		set_survivability_info(mmio, info, id, "Postcode Info");
76 
77 		if (reg_value & OVERFLOW_SUPPORT) {
78 			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
79 			set_survivability_info(mmio, info, id, "Overflow Info");
80 		}
81 	}
82 
83 	if (reg_value & AUXINFO_SUPPORT) {
84 		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
85 
86 		for (index = 0; id && reg_value; index++, reg_value = info[id].value,
87 		     id = aux_history_offset(reg_value)) {
88 			snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
89 			set_survivability_info(mmio, info, id, name);
90 		}
91 	}
92 }
93 
94 static void log_survivability_info(struct pci_dev *pdev)
95 {
96 	struct xe_device *xe = pdev_to_xe_device(pdev);
97 	struct xe_survivability *survivability = &xe->survivability;
98 	struct xe_survivability_info *info = survivability->info;
99 	int id;
100 
101 	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
102 		 survivability->boot_status);
103 	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
104 		if (info[id].reg)
105 			dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
106 				 info[id].reg, info[id].value);
107 	}
108 }
109 
110 static ssize_t survivability_mode_show(struct device *dev,
111 				       struct device_attribute *attr, char *buff)
112 {
113 	struct pci_dev *pdev = to_pci_dev(dev);
114 	struct xe_device *xe = pdev_to_xe_device(pdev);
115 	struct xe_survivability *survivability = &xe->survivability;
116 	struct xe_survivability_info *info = survivability->info;
117 	int index = 0, count = 0;
118 
119 	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
120 		if (info[index].reg)
121 			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
122 					       info[index].reg, info[index].value);
123 	}
124 
125 	return count;
126 }
127 
128 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
129 
130 static void xe_survivability_mode_fini(void *arg)
131 {
132 	struct xe_device *xe = arg;
133 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
134 	struct device *dev = &pdev->dev;
135 
136 	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
137 }
138 
139 static int enable_survivability_mode(struct pci_dev *pdev)
140 {
141 	struct device *dev = &pdev->dev;
142 	struct xe_device *xe = pdev_to_xe_device(pdev);
143 	struct xe_survivability *survivability = &xe->survivability;
144 	int ret = 0;
145 
146 	/* create survivability mode sysfs */
147 	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
148 	if (ret) {
149 		dev_warn(dev, "Failed to create survivability sysfs files\n");
150 		return ret;
151 	}
152 
153 	ret = devm_add_action_or_reset(xe->drm.dev,
154 				       xe_survivability_mode_fini, xe);
155 	if (ret)
156 		return ret;
157 
158 	/* Make sure xe_heci_gsc_init() knows about survivability mode */
159 	survivability->mode = true;
160 
161 	ret = xe_heci_gsc_init(xe);
162 	if (ret) {
163 		/*
164 		 * But if it fails, device can't enter survivability
165 		 * so move it back for correct error handling
166 		 */
167 		survivability->mode = false;
168 		return ret;
169 	}
170 
171 	xe_vsec_init(xe);
172 
173 	dev_err(dev, "In Survivability Mode\n");
174 
175 	return 0;
176 }
177 
178 /**
179  * xe_survivability_mode_is_enabled - check if survivability mode is enabled
180  * @xe: xe device instance
181  *
182  * Returns true if in survivability mode, false otherwise
183  */
184 bool xe_survivability_mode_is_enabled(struct xe_device *xe)
185 {
186 	return xe->survivability.mode;
187 }
188 
189 /*
190  * survivability_mode_requested - check if it's possible to enable
191  * survivability mode and that was requested by firmware
192  *
193  * This function reads the boot status from Pcode.
194  *
195  * Return: true if platform support is available and boot status indicates
196  * failure, false otherwise.
197  */
198 static bool survivability_mode_requested(struct xe_device *xe)
199 {
200 	struct xe_survivability *survivability = &xe->survivability;
201 	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
202 	u32 data;
203 
204 	if (!IS_DGFX(xe) || xe->info.platform < XE_BATTLEMAGE || IS_SRIOV_VF(xe))
205 		return false;
206 
207 	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
208 	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
209 
210 	return survivability->boot_status == NON_CRITICAL_FAILURE ||
211 		survivability->boot_status == CRITICAL_FAILURE;
212 }
213 
214 /**
215  * xe_survivability_mode_enable - Initialize and enable the survivability mode
216  * @xe: xe device instance
217  *
218  * Initialize survivability information and enable survivability mode
219  *
220  * Return: 0 if survivability mode is enabled or not requested; negative error
221  * code otherwise.
222  */
223 int xe_survivability_mode_enable(struct xe_device *xe)
224 {
225 	struct xe_survivability *survivability = &xe->survivability;
226 	struct xe_survivability_info *info;
227 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
228 
229 	if (!survivability_mode_requested(xe))
230 		return 0;
231 
232 	survivability->size = MAX_SCRATCH_MMIO;
233 
234 	info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
235 			    GFP_KERNEL);
236 	if (!info)
237 		return -ENOMEM;
238 
239 	survivability->info = info;
240 
241 	populate_survivability_info(xe);
242 
243 	/* Only log debug information and exit if it is a critical failure */
244 	if (survivability->boot_status == CRITICAL_FAILURE) {
245 		log_survivability_info(pdev);
246 		return -ENXIO;
247 	}
248 
249 	return enable_survivability_mode(pdev);
250 }
251