1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2025 Intel Corporation
4 */
5
6 #include "xe_survivability_mode.h"
7 #include "xe_survivability_mode_types.h"
8
9 #include <linux/kobject.h>
10 #include <linux/pci.h>
11 #include <linux/sysfs.h>
12
13 #include "xe_configfs.h"
14 #include "xe_device.h"
15 #include "xe_gt.h"
16 #include "xe_heci_gsc.h"
17 #include "xe_i2c.h"
18 #include "xe_mmio.h"
19 #include "xe_pcode_api.h"
20 #include "xe_vsec.h"
21
22 #define MAX_SCRATCH_MMIO 8
23
24 /**
25 * DOC: Xe Boot Survivability
26 *
 * Boot Survivability is a software based workflow for recovering a system in a failed boot state.
28 * Here system recoverability is concerned with recovering the firmware responsible for boot.
29 *
30 * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
31 * to be flashed through mei and collect telemetry. The driver's probe flow is modified
32 * such that it enters survivability mode when pcode initialization is incomplete and boot status
33 * denotes a failure.
34 *
35 * Survivability mode can also be entered manually using the survivability mode attribute available
 * through configfs which is beneficial in several use cases. It can be used to address scenarios
37 * where pcode does not detect failure or for validation purposes. It can also be used in
38 * In-Field-Repair (IFR) to repair a single card without impacting the other cards in a node.
39 *
 * Use the below command to enable survivability mode manually::
41 *
42 * # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
43 *
44 * Refer :ref:`xe_configfs` for more details on how to use configfs
45 *
46 * Survivability mode is indicated by the below admin-only readable sysfs which provides additional
47 * debug information::
48 *
 *	/sys/bus/pci/devices/<device>/survivability_mode
50 *
51 * Capability Information:
52 * Provides boot status
53 * Postcode Information:
54 * Provides information about the failure
55 * Overflow Information
56 * Provides history of previous failures
57 * Auxiliary Information
58 * Certain failures may have information in addition to postcode information
59 */
60
/*
 * Extract the AUXINFO_HISTORY_OFFSET field from a raw auxiliary info register
 * value. populate_survivability_info() uses the result as the index of the
 * next auxiliary scratch register to read.
 */
static u32 aux_history_offset(u32 reg_value)
{
	u32 next_id = REG_FIELD_GET(AUXINFO_HISTORY_OFFSET, reg_value);

	return next_id;
}
65
set_survivability_info(struct xe_mmio * mmio,struct xe_survivability_info * info,int id,char * name)66 static void set_survivability_info(struct xe_mmio *mmio, struct xe_survivability_info *info,
67 int id, char *name)
68 {
69 strscpy(info[id].name, name, sizeof(info[id].name));
70 info[id].reg = PCODE_SCRATCH(id).raw;
71 info[id].value = xe_mmio_read32(mmio, PCODE_SCRATCH(id));
72 }
73
populate_survivability_info(struct xe_device * xe)74 static void populate_survivability_info(struct xe_device *xe)
75 {
76 struct xe_survivability *survivability = &xe->survivability;
77 struct xe_survivability_info *info = survivability->info;
78 struct xe_mmio *mmio;
79 u32 id = 0, reg_value;
80 char name[NAME_MAX];
81 int index;
82
83 mmio = xe_root_tile_mmio(xe);
84 set_survivability_info(mmio, info, id, "Capability Info");
85 reg_value = info[id].value;
86
87 if (reg_value & HISTORY_TRACKING) {
88 id++;
89 set_survivability_info(mmio, info, id, "Postcode Info");
90
91 if (reg_value & OVERFLOW_SUPPORT) {
92 id = REG_FIELD_GET(OVERFLOW_REG_OFFSET, reg_value);
93 set_survivability_info(mmio, info, id, "Overflow Info");
94 }
95 }
96
97 if (reg_value & AUXINFO_SUPPORT) {
98 id = REG_FIELD_GET(AUXINFO_REG_OFFSET, reg_value);
99
100 for (index = 0; id && reg_value; index++, reg_value = info[id].value,
101 id = aux_history_offset(reg_value)) {
102 snprintf(name, NAME_MAX, "Auxiliary Info %d", index);
103 set_survivability_info(mmio, info, id, name);
104 }
105 }
106 }
107
log_survivability_info(struct pci_dev * pdev)108 static void log_survivability_info(struct pci_dev *pdev)
109 {
110 struct xe_device *xe = pdev_to_xe_device(pdev);
111 struct xe_survivability *survivability = &xe->survivability;
112 struct xe_survivability_info *info = survivability->info;
113 int id;
114
115 dev_info(&pdev->dev, "Survivability Boot Status : Critical Failure (%d)\n",
116 survivability->boot_status);
117 for (id = 0; id < MAX_SCRATCH_MMIO; id++) {
118 if (info[id].reg)
119 dev_info(&pdev->dev, "%s: 0x%x - 0x%x\n", info[id].name,
120 info[id].reg, info[id].value);
121 }
122 }
123
survivability_mode_show(struct device * dev,struct device_attribute * attr,char * buff)124 static ssize_t survivability_mode_show(struct device *dev,
125 struct device_attribute *attr, char *buff)
126 {
127 struct pci_dev *pdev = to_pci_dev(dev);
128 struct xe_device *xe = pdev_to_xe_device(pdev);
129 struct xe_survivability *survivability = &xe->survivability;
130 struct xe_survivability_info *info = survivability->info;
131 int index = 0, count = 0;
132
133 for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
134 if (info[index].reg)
135 count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
136 info[index].reg, info[index].value);
137 }
138
139 return count;
140 }
141
142 static DEVICE_ATTR_ADMIN_RO(survivability_mode);
143
xe_survivability_mode_fini(void * arg)144 static void xe_survivability_mode_fini(void *arg)
145 {
146 struct xe_device *xe = arg;
147 struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
148 struct device *dev = &pdev->dev;
149
150 xe_configfs_clear_survivability_mode(pdev);
151 sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
152 }
153
enable_survivability_mode(struct pci_dev * pdev)154 static int enable_survivability_mode(struct pci_dev *pdev)
155 {
156 struct device *dev = &pdev->dev;
157 struct xe_device *xe = pdev_to_xe_device(pdev);
158 struct xe_survivability *survivability = &xe->survivability;
159 int ret = 0;
160
161 /* create survivability mode sysfs */
162 ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
163 if (ret) {
164 dev_warn(dev, "Failed to create survivability sysfs files\n");
165 return ret;
166 }
167
168 ret = devm_add_action_or_reset(xe->drm.dev,
169 xe_survivability_mode_fini, xe);
170 if (ret)
171 return ret;
172
173 /* Make sure xe_heci_gsc_init() knows about survivability mode */
174 survivability->mode = true;
175
176 ret = xe_heci_gsc_init(xe);
177 if (ret)
178 goto err;
179
180 xe_vsec_init(xe);
181
182 ret = xe_i2c_probe(xe);
183 if (ret)
184 goto err;
185
186 dev_err(dev, "In Survivability Mode\n");
187
188 return 0;
189
190 err:
191 survivability->mode = false;
192 return ret;
193 }
194
195 /**
196 * xe_survivability_mode_is_enabled - check if survivability mode is enabled
197 * @xe: xe device instance
198 *
199 * Returns true if in survivability mode, false otherwise
200 */
xe_survivability_mode_is_enabled(struct xe_device * xe)201 bool xe_survivability_mode_is_enabled(struct xe_device *xe)
202 {
203 return xe->survivability.mode;
204 }
205
206 /**
207 * xe_survivability_mode_is_requested - check if it's possible to enable survivability
208 * mode that was requested by firmware or userspace
209 * @xe: xe device instance
210 *
211 * This function reads configfs and boot status from Pcode.
212 *
213 * Return: true if platform support is available and boot status indicates
214 * failure or if survivability mode is requested, false otherwise.
215 */
xe_survivability_mode_is_requested(struct xe_device * xe)216 bool xe_survivability_mode_is_requested(struct xe_device *xe)
217 {
218 struct xe_survivability *survivability = &xe->survivability;
219 struct xe_mmio *mmio = xe_root_tile_mmio(xe);
220 struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
221 u32 data;
222 bool survivability_mode;
223
224 if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
225 return false;
226
227 survivability_mode = xe_configfs_get_survivability_mode(pdev);
228
229 if (xe->info.platform < XE_BATTLEMAGE) {
230 if (survivability_mode) {
231 dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n");
232 xe_configfs_clear_survivability_mode(pdev);
233 }
234 return false;
235 }
236
237 /* Enable survivability mode if set via configfs */
238 if (survivability_mode)
239 return true;
240
241 data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
242 survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
243
244 return survivability->boot_status == NON_CRITICAL_FAILURE ||
245 survivability->boot_status == CRITICAL_FAILURE;
246 }
247
248 /**
249 * xe_survivability_mode_enable - Initialize and enable the survivability mode
250 * @xe: xe device instance
251 *
252 * Initialize survivability information and enable survivability mode
253 *
254 * Return: 0 if survivability mode is enabled or not requested; negative error
255 * code otherwise.
256 */
xe_survivability_mode_enable(struct xe_device * xe)257 int xe_survivability_mode_enable(struct xe_device *xe)
258 {
259 struct xe_survivability *survivability = &xe->survivability;
260 struct xe_survivability_info *info;
261 struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
262
263 if (!xe_survivability_mode_is_requested(xe))
264 return 0;
265
266 survivability->size = MAX_SCRATCH_MMIO;
267
268 info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
269 GFP_KERNEL);
270 if (!info)
271 return -ENOMEM;
272
273 survivability->info = info;
274
275 populate_survivability_info(xe);
276
277 /* Only log debug information and exit if it is a critical failure */
278 if (survivability->boot_status == CRITICAL_FAILURE) {
279 log_survivability_info(pdev);
280 return -ENXIO;
281 }
282
283 return enable_survivability_mode(pdev);
284 }
285