xref: /linux/drivers/gpu/drm/i915/i915_gpu_error.h (revision 8ab2e96d8ff188006f1e3346a56443cd07fe1858)
1 /*
2  * SPDX-License-Identifier: MIT
3  *
4  * Copyright © 2008-2018 Intel Corporation
5  */
6 
7 #ifndef _I915_GPU_ERROR_H_
8 #define _I915_GPU_ERROR_H_
9 
10 #include <linux/atomic.h>
11 #include <linux/kref.h>
12 #include <linux/ktime.h>
13 #include <linux/sched.h>
14 
15 #include <drm/drm_mm.h>
16 
17 #include "gt/intel_engine.h"
18 #include "gt/uc/intel_uc_fw.h"
19 
20 #include "intel_device_info.h"
21 
22 #include "i915_gem.h"
23 #include "i915_gem_gtt.h"
24 #include "i915_params.h"
25 #include "i915_scheduler.h"
26 
27 struct drm_i915_private;
28 struct i915_vma_compress;
29 struct intel_engine_capture_vma;
30 struct intel_overlay_error_state;
31 struct intel_display_error_state;
32 
33 struct i915_vma_coredump {
34 	struct i915_vma_coredump *next;
35 
36 	char name[20];
37 
38 	u64 gtt_offset;
39 	u64 gtt_size;
40 	u32 gtt_page_sizes;
41 
42 	int num_pages;
43 	int page_count;
44 	int unused;
45 	u32 *pages[0];
46 };
47 
48 struct i915_request_coredump {
49 	unsigned long flags;
50 	pid_t pid;
51 	u32 context;
52 	u32 seqno;
53 	u32 start;
54 	u32 head;
55 	u32 tail;
56 	struct i915_sched_attr sched_attr;
57 };
58 
59 struct intel_engine_coredump {
60 	const struct intel_engine_cs *engine;
61 
62 	bool simulated;
63 	u32 reset_count;
64 
65 	/* position of active request inside the ring */
66 	u32 rq_head, rq_post, rq_tail;
67 
68 	/* Register state */
69 	u32 ccid;
70 	u32 start;
71 	u32 tail;
72 	u32 head;
73 	u32 ctl;
74 	u32 mode;
75 	u32 hws;
76 	u32 ipeir;
77 	u32 ipehr;
78 	u32 esr;
79 	u32 bbstate;
80 	u32 instpm;
81 	u32 instps;
82 	u64 bbaddr;
83 	u64 acthd;
84 	u32 fault_reg;
85 	u64 faddr;
86 	u32 rc_psmi; /* sleep state */
87 	struct intel_instdone instdone;
88 
89 	struct i915_gem_context_coredump {
90 		char comm[TASK_COMM_LEN];
91 
92 		u64 total_runtime;
93 		u32 avg_runtime;
94 
95 		pid_t pid;
96 		int active;
97 		int guilty;
98 		struct i915_sched_attr sched_attr;
99 	} context;
100 
101 	struct i915_vma_coredump *vma;
102 
103 	struct i915_request_coredump execlist[EXECLIST_MAX_PORTS];
104 	unsigned int num_ports;
105 
106 	struct {
107 		u32 gfx_mode;
108 		union {
109 			u64 pdp[4];
110 			u32 pp_dir_base;
111 		};
112 	} vm_info;
113 
114 	struct intel_engine_coredump *next;
115 };
116 
117 struct intel_gt_coredump {
118 	const struct intel_gt *_gt;
119 	bool awake;
120 	bool simulated;
121 
122 	/* Generic register state */
123 	u32 eir;
124 	u32 pgtbl_er;
125 	u32 ier;
126 	u32 gtier[6], ngtier;
127 	u32 derrmr;
128 	u32 forcewake;
129 	u32 error; /* gen6+ */
130 	u32 err_int; /* gen7 */
131 	u32 fault_data0; /* gen8, gen9 */
132 	u32 fault_data1; /* gen8, gen9 */
133 	u32 done_reg;
134 	u32 gac_eco;
135 	u32 gam_ecochk;
136 	u32 gab_ctl;
137 	u32 gfx_mode;
138 	u32 gtt_cache;
139 	u32 aux_err; /* gen12 */
140 	u32 sfc_done[GEN12_SFC_DONE_MAX]; /* gen12 */
141 	u32 gam_done; /* gen12 */
142 
143 	u32 nfence;
144 	u64 fence[I915_MAX_NUM_FENCES];
145 
146 	struct intel_engine_coredump *engine;
147 
148 	struct intel_uc_coredump {
149 		struct intel_uc_fw guc_fw;
150 		struct intel_uc_fw huc_fw;
151 		struct i915_vma_coredump *guc_log;
152 	} *uc;
153 
154 	struct intel_gt_coredump *next;
155 };
156 
157 struct i915_gpu_coredump {
158 	struct kref ref;
159 	ktime_t time;
160 	ktime_t boottime;
161 	ktime_t uptime;
162 	unsigned long capture;
163 
164 	struct drm_i915_private *i915;
165 
166 	struct intel_gt_coredump *gt;
167 
168 	char error_msg[128];
169 	bool simulated;
170 	bool wakelock;
171 	bool suspended;
172 	int iommu;
173 	u32 reset_count;
174 	u32 suspend_count;
175 
176 	struct intel_device_info device_info;
177 	struct intel_runtime_info runtime_info;
178 	struct intel_driver_caps driver_caps;
179 	struct i915_params params;
180 
181 	struct intel_overlay_error_state *overlay;
182 	struct intel_display_error_state *display;
183 
184 	struct scatterlist *sgl, *fit;
185 };
186 
187 struct i915_gpu_error {
188 	/* For reset and error_state handling. */
189 	spinlock_t lock;
190 	/* Protected by the above dev->gpu_error.lock. */
191 	struct i915_gpu_coredump *first_error;
192 
193 	atomic_t pending_fb_pin;
194 
195 	/** Number of times the device has been reset (global) */
196 	atomic_t reset_count;
197 
198 	/** Number of times an engine has been reset */
199 	atomic_t reset_engine_count[I915_NUM_ENGINES];
200 };
201 
202 struct drm_i915_error_state_buf {
203 	struct drm_i915_private *i915;
204 	struct scatterlist *sgl, *cur, *end;
205 
206 	char *buf;
207 	size_t bytes;
208 	size_t size;
209 	loff_t iter;
210 
211 	int err;
212 };
213 
214 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
215 
216 __printf(2, 3)
217 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
218 
219 struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
220 void i915_capture_error_state(struct drm_i915_private *i915);
221 
222 struct i915_gpu_coredump *
223 i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);
224 
225 struct intel_gt_coredump *
226 intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp);
227 
228 struct intel_engine_coredump *
229 intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp);
230 
231 struct intel_engine_capture_vma *
232 intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
233 				  struct i915_request *rq,
234 				  gfp_t gfp);
235 
236 void intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
237 				   struct intel_engine_capture_vma *capture,
238 				   struct i915_vma_compress *compress);
239 
240 struct i915_vma_compress *
241 i915_vma_capture_prepare(struct intel_gt_coredump *gt);
242 
243 void i915_vma_capture_finish(struct intel_gt_coredump *gt,
244 			     struct i915_vma_compress *compress);
245 
246 void i915_error_state_store(struct i915_gpu_coredump *error);
247 
248 static inline struct i915_gpu_coredump *
249 i915_gpu_coredump_get(struct i915_gpu_coredump *gpu)
250 {
251 	kref_get(&gpu->ref);
252 	return gpu;
253 }
254 
255 ssize_t
256 i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
257 				 char *buf, loff_t offset, size_t count);
258 
259 void __i915_gpu_coredump_free(struct kref *kref);
260 static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
261 {
262 	if (gpu)
263 		kref_put(&gpu->ref, __i915_gpu_coredump_free);
264 }
265 
/* Access to and control of the stored error state (declarations only). */
struct i915_gpu_coredump *
i915_first_error_state(struct drm_i915_private *i915);

void i915_reset_error_state(struct drm_i915_private *i915);

void i915_disable_error_state(struct drm_i915_private *i915, int err);
269 
270 #else
271 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void
i915_capture_error_state(struct drm_i915_private *i915)
{
}
275 
276 static inline struct i915_gpu_coredump *
277 i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
278 {
279 	return NULL;
280 }
281 
282 static inline struct intel_gt_coredump *
283 intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp)
284 {
285 	return NULL;
286 }
287 
288 static inline struct intel_engine_coredump *
289 intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp)
290 {
291 	return NULL;
292 }
293 
294 static inline struct intel_engine_capture_vma *
295 intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
296 				  struct i915_request *rq,
297 				  gfp_t gfp)
298 {
299 	return NULL;
300 }
301 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void intel_engine_coredump_add_vma(
	struct intel_engine_coredump *ee,
	struct intel_engine_capture_vma *capture,
	struct i915_vma_compress *compress)
{
}
308 
/*
 * Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: no compression
 * state is set up and NULL is always returned.
 */
static inline struct i915_vma_compress *i915_vma_capture_prepare(
	struct intel_gt_coredump *gt)
{
	return NULL;
}
314 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void i915_vma_capture_finish(struct intel_gt_coredump *gt,
					   struct i915_vma_compress *compress)
{
}
320 
/*
 * Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: @error is not
 * stored anywhere.
 */
static inline void i915_error_state_store(struct i915_gpu_coredump *error)
{
}
325 
/*
 * Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: there is no
 * reference to drop, so this does nothing.
 */
static inline void
i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
{
}
329 
330 static inline struct i915_gpu_coredump *
331 i915_first_error_state(struct drm_i915_private *i915)
332 {
333 	return ERR_PTR(-ENODEV);
334 }
335 
/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void
i915_reset_error_state(struct drm_i915_private *i915)
{
}
339 
/*
 * Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: @err is ignored
 * and nothing is changed.
 */
static inline void
i915_disable_error_state(struct drm_i915_private *i915, int err)
{
}
344 
345 #endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */
346 
347 #endif /* _I915_GPU_ERROR_H_ */
348