xref: /linux/drivers/gpu/drm/xe/xe_survivability_mode.c (revision 54fd6bd42e7bd351802ff1d193a2e33e4bfb1836)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2025 Intel Corporation
4  */
5 
6 #include "xe_survivability_mode.h"
7 #include "xe_survivability_mode_types.h"
8 
9 #include <linux/kobject.h>
10 #include <linux/pci.h>
11 #include <linux/sysfs.h>
12 
13 #include "xe_configfs.h"
14 #include "xe_device.h"
15 #include "xe_gt.h"
16 #include "xe_heci_gsc.h"
17 #include "xe_i2c.h"
18 #include "xe_mmio.h"
19 #include "xe_pcode_api.h"
20 #include "xe_vsec.h"
21 
22 #define MAX_SCRATCH_MMIO 8
23 
24 /**
25  * DOC: Xe Boot Survivability
26  *
 * Boot Survivability is a software based workflow for recovering a system in a failed boot
 * state. Here system recoverability is concerned with recovering the firmware responsible for
 * boot.
29  *
30  * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
31  * to be flashed through mei and collect telemetry. The driver's probe flow is modified
32  * such that it enters survivability mode when pcode initialization is incomplete and boot status
33  * denotes a failure.
34  *
35  * Survivability mode can also be entered manually using the survivability mode attribute available
36  * through configfs which is beneficial in several usecases. It can be used to address scenarios
37  * where pcode does not detect failure or for validation purposes. It can also be used in
38  * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node.
39  *
 * Use the below command to enable survivability mode manually::
41  *
42  *	# echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
43  *
44  * Refer :ref:`xe_configfs` for more details on how to use configfs
45  *
46  * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
47  * debug information::
48  *
 *	/sys/bus/pci/devices/<device>/survivability_mode
50  *
 * Capability Information:
 *	Provides boot status
 * Postcode Information:
 *	Provides information about the failure
 * Overflow Information:
 *	Provides history of previous failures
 * Auxiliary Information:
 *	Certain failures may have information in addition to postcode information
59  */
60 
61 static u32 aux_history_offset(u32 reg_value)
62 {
63 	return REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);
64 }
65 
66 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
67 				   int id, char *name)
68 {
69 	strscpy(info[id].name, name, sizeof(info[id].name));
70 	info[id].reg = PCODE_SCRATCH(id).raw;
71 	info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
72 }
73 
74 static void populate_survivability_info(struct xe_device *xe)
75 {
76 	struct xe_survivability *survivability = &xe->survivability;
77 	struct xe_survivability_info *info = survivability->info;
78 	struct xe_mmio *mmio;
79 	u32 id = 0, reg_value;
80 	char name[NAME_MAX];
81 	int index;
82 
83 	mmio = xe_root_tile_mmio(xe);
84 	set_survivability_info(mmio, info, id, "Capability Info");
85 	reg_value = info[id].value;
86 
87 	if (reg_value & HISTORY_TRACKING) {
88 		id++;
89 		set_survivability_info(mmio, info, id, "Postcode Info");
90 
91 		if (reg_value & OVERFLOW_SUPPORT) {
92 			id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
93 			set_survivability_info(mmio, info, id, "Overflow Info");
94 		}
95 	}
96 
97 	if (reg_value & AUXINFO_SUPPORT) {
98 		id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
99 
100 		for (index = 0; id && reg_value; index++, reg_value = info[id].value,
101 		     id = aux_history_offset(reg_value)) {
102 			snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
103 			set_survivability_info(mmio, info, id, name);
104 		}
105 	}
106 }
107 
108 static void log_survivability_info(struct pci_dev *pdev)
109 {
110 	struct xe_device *xe = pdev_to_xe_device(pdev);
111 	struct xe_survivability *survivability = &xe->survivability;
112 	struct xe_survivability_info *info = survivability->info;
113 	int id;
114 
115 	dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
116 		 survivability->boot_status);
117 	for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
118 		if (info[id].reg)
119 			dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
120 				 info[id].reg, info[id].value);
121 	}
122 }
123 
124 static ssize_t survivability_mode_show(struct device *dev,
125 				       struct device_attribute *attr, char *buff)
126 {
127 	struct pci_dev *pdev = to_pci_dev(dev);
128 	struct xe_device *xe = pdev_to_xe_device(pdev);
129 	struct xe_survivability *survivability = &xe->survivability;
130 	struct xe_survivability_info *info = survivability->info;
131 	int index = 0, count = 0;
132 
133 	for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
134 		if (info[index].reg)
135 			count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
136 					       info[index].reg, info[index].value);
137 	}
138 
139 	return count;
140 }
141 
142 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
143 
144 static void xe_survivability_mode_fini(void *arg)
145 {
146 	struct xe_device *xe = arg;
147 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
148 	struct device *dev = &pdev->dev;
149 
150 	xe_configfs_clear_survivability_mode(pdev);
151 	sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
152 }
153 
154 static int enable_survivability_mode(struct pci_dev *pdev)
155 {
156 	struct device *dev = &pdev->dev;
157 	struct xe_device *xe = pdev_to_xe_device(pdev);
158 	struct xe_survivability *survivability = &xe->survivability;
159 	int ret = 0;
160 
161 	/* create survivability mode sysfs */
162 	ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
163 	if (ret) {
164 		dev_warn(dev, "Failed to create survivability sysfs files\n");
165 		return ret;
166 	}
167 
168 	ret = devm_add_action_or_reset(xe->drm.dev,
169 				       xe_survivability_mode_fini, xe);
170 	if (ret)
171 		return ret;
172 
173 	/* Make sure xe_heci_gsc_init() knows about survivability mode */
174 	survivability->mode = true;
175 
176 	ret = xe_heci_gsc_init(xe);
177 	if (ret)
178 		goto err;
179 
180 	xe_vsec_init(xe);
181 
182 	ret = xe_i2c_probe(xe);
183 	if (ret)
184 		goto err;
185 
186 	dev_err(dev, "In Survivability Mode\n");
187 
188 	return 0;
189 
190 err:
191 	survivability->mode = false;
192 	return ret;
193 }
194 
195 /**
196  * xe_survivability_mode_is_enabled - check if survivability mode is enabled
197  * @xe: xe device instance
198  *
199  * Returns true if in survivability mode, false otherwise
200  */
201 bool xe_survivability_mode_is_enabled(struct xe_device *xe)
202 {
203 	return xe->survivability.mode;
204 }
205 
206 /**
207  * xe_survivability_mode_is_requested - check if it's possible to enable survivability
208  *					mode that was requested by firmware or userspace
209  * @xe: xe device instance
210  *
211  * This function reads configfs and  boot status from Pcode.
212  *
213  * Return: true if platform support is available and boot status indicates
214  * failure or if survivability mode is requested, false otherwise.
215  */
216 bool xe_survivability_mode_is_requested(struct xe_device *xe)
217 {
218 	struct xe_survivability *survivability = &xe->survivability;
219 	struct xe_mmio *mmio = xe_root_tile_mmio(xe);
220 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
221 	u32 data;
222 	bool survivability_mode;
223 
224 	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
225 		return false;
226 
227 	survivability_mode = xe_configfs_get_survivability_mode(pdev);
228 
229 	if (xe->info.platform < XE_BATTLEMAGE) {
230 		if (survivability_mode) {
231 			dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n");
232 			xe_configfs_clear_survivability_mode(pdev);
233 		}
234 		return false;
235 	}
236 
237 	/* Enable survivability mode if set via configfs */
238 	if (survivability_mode)
239 		return true;
240 
241 	data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
242 	survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
243 
244 	return survivability->boot_status == NON_CRITICAL_FAILURE ||
245 		survivability->boot_status == CRITICAL_FAILURE;
246 }
247 
248 /**
249  * xe_survivability_mode_enable - Initialize and enable the survivability mode
250  * @xe: xe device instance
251  *
252  * Initialize survivability information and enable survivability mode
253  *
254  * Return: 0 if survivability mode is enabled or not requested; negative error
255  * code otherwise.
256  */
257 int xe_survivability_mode_enable(struct xe_device *xe)
258 {
259 	struct xe_survivability *survivability = &xe->survivability;
260 	struct xe_survivability_info *info;
261 	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
262 
263 	if (!xe_survivability_mode_is_requested(xe))
264 		return 0;
265 
266 	survivability->size = MAX_SCRATCH_MMIO;
267 
268 	info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
269 			    GFP_KERNEL);
270 	if (!info)
271 		return -ENOMEM;
272 
273 	survivability->info = info;
274 
275 	populate_survivability_info(xe);
276 
277 	/* Only log debug information and exit if it is a critical failure */
278 	if (survivability->boot_status == CRITICAL_FAILURE) {
279 		log_survivability_info(pdev);
280 		return -ENXIO;
281 	}
282 
283 	return enable_survivability_mode(pdev);
284 }
285