xref: /linux/drivers/gpu/drm/i915/i915_gpu_error.h (revision e53b20598f394e37951d6355f1c88ae01165b53f)
1 /*
2  * SPDX-License-Identifier: MIT
3  *
4  * Copyright © 2008-2018 Intel Corporation
5  */
6 
7 #ifndef _I915_GPU_ERROR_H_
8 #define _I915_GPU_ERROR_H_
9 
10 #include <linux/atomic.h>
11 #include <linux/kref.h>
12 #include <linux/ktime.h>
13 #include <linux/sched.h>
14 
15 #include <drm/drm_mm.h>
16 
17 #include "gt/intel_engine.h"
18 #include "gt/intel_gt_types.h"
19 #include "gt/uc/intel_uc_fw.h"
20 
21 #include "intel_device_info.h"
22 
23 #include "i915_gem.h"
24 #include "i915_gem_gtt.h"
25 #include "i915_params.h"
26 #include "i915_scheduler.h"
27 
/*
 * Forward declarations. Within this header these types are only used through
 * pointers (struct members, parameters and return values), so their full
 * definitions are not needed here.
 */
struct drm_i915_private;
struct i915_vma_compress;
struct intel_engine_capture_vma;
struct intel_overlay_error_state;
32 
/*
 * Record describing one captured VMA. Records can be chained through @next;
 * struct intel_engine_coredump (@vma) and struct intel_uc_coredump (@guc_log)
 * hold pointers to this type.
 */
struct i915_vma_coredump {
	/* Link to another record of the same type. */
	struct i915_vma_coredump *next;

	/* Fixed-size label for this record. */
	char name[20];

	/* NOTE(review): presumably the GTT placement of the object - confirm. */
	u64 gtt_offset;
	u64 gtt_size;
	u32 gtt_page_sizes;

	/* NOTE(review): no use of this member is visible in this header. */
	int unused;
	/* NOTE(review): presumably the list of captured pages - confirm. */
	struct list_head page_list;
};
45 
/*
 * Snapshot of a single request. struct intel_engine_coredump stores an array
 * of these in its @execlist member.
 */
struct i915_request_coredump {
	unsigned long flags;
	pid_t pid;
	u32 context;
	u32 seqno;
	/* NOTE(review): presumably ring offsets of the request - confirm. */
	u32 head;
	u32 tail;
	struct i915_sched_attr sched_attr;
};
55 
/* Only referenced through a pointer below, so a forward declaration suffices. */
struct __guc_capture_parsed_output;

/*
 * Captured state for one engine. Entries can be chained through @next;
 * struct intel_gt_coredump holds a pointer to this type in its @engine member.
 */
struct intel_engine_coredump {
	const struct intel_engine_cs *engine;

	bool hung;
	bool simulated;
	u32 reset_count;

	/* position of active request inside the ring */
	u32 rq_head, rq_post, rq_tail;

	/* Register state */
	u32 ccid;
	u32 start;
	u32 tail;
	u32 head;
	u32 ctl;
	u32 mode;
	u32 hws;
	u32 ipeir;
	u32 ipehr;
	u32 esr;
	u32 bbstate;
	u32 instpm;
	u32 instps;
	u64 bbaddr;
	u64 acthd;
	u32 fault_reg;
	u64 faddr;
	u32 rc_psmi; /* sleep state */
	struct intel_instdone instdone;

	/* GuC matched capture-lists info */
	struct intel_guc_state_capture *capture;
	struct __guc_capture_parsed_output *guc_capture_node;

	/* Per-context details, embedded by value as @context. */
	struct i915_gem_context_coredump {
		char comm[TASK_COMM_LEN];

		u64 total_runtime;
		u64 avg_runtime;

		pid_t pid;
		int active;
		int guilty;
		struct i915_sched_attr sched_attr;
	} context;

	/* Pointer to VMA record(s); that type chains through its own @next. */
	struct i915_vma_coredump *vma;

	/*
	 * Request snapshots, with room for EXECLIST_MAX_PORTS entries.
	 * NOTE(review): @num_ports presumably counts the valid entries - confirm.
	 */
	struct i915_request_coredump execlist[EXECLIST_MAX_PORTS];
	unsigned int num_ports;

	/*
	 * The anonymous union makes @pdp and @pp_dir_base share storage, so
	 * only one of the two is meaningful for a given capture.
	 */
	struct {
		u32 gfx_mode;
		union {
			u64 pdp[4];
			u32 pp_dir_base;
		};
	} vm_info;

	/* Link to another engine entry of the same type. */
	struct intel_engine_coredump *next;
};
120 
/*
 * Captured state for one GT. Entries can be chained through @next;
 * struct i915_gpu_coredump holds a pointer to this type in its @gt member.
 */
struct intel_gt_coredump {
	const struct intel_gt *_gt;
	bool awake;
	bool simulated;

	/* Held by value, not by pointer. */
	struct intel_gt_info info;

	/* Generic register state */
	u32 eir;
	u32 pgtbl_er;
	u32 ier;
	/* NOTE(review): @ngtier presumably counts the valid @gtier entries. */
	u32 gtier[6], ngtier;
	u32 forcewake;
	u32 error; /* gen6+ */
	u32 err_int; /* gen7 */
	u32 fault_data0; /* gen8, gen9 */
	u32 fault_data1; /* gen8, gen9 */
	u32 done_reg;
	u32 gac_eco;
	u32 gam_ecochk;
	u32 gab_ctl;
	u32 gfx_mode;
	u32 gtt_cache;
	u32 aux_err; /* gen12 */
	u32 gam_done; /* gen12 */

	/* Display related */
	u32 derrmr;
	u32 sfc_done[I915_MAX_SFC]; /* gen12 */

	/* NOTE(review): @nfence presumably counts the valid @fence entries. */
	u32 nfence;
	u64 fence[I915_MAX_NUM_FENCES];

	/* Pointer to engine entry(ies); that type chains through its own @next. */
	struct intel_engine_coredump *engine;

	/* uC (GuC/HuC) details, referenced through the pointer @uc. */
	struct intel_uc_coredump {
		struct intel_uc_fw guc_fw;
		struct intel_uc_fw huc_fw;
		struct i915_vma_coredump *guc_log;
		bool is_guc_capture;
	} *uc;

	/* Link to another GT entry of the same type. */
	struct intel_gt_coredump *next;
};
165 
/*
 * Top-level, reference-counted error capture object. References are taken
 * with i915_gpu_coredump_get() and dropped with i915_gpu_coredump_put(),
 * both of which operate on @ref.
 */
struct i915_gpu_coredump {
	struct kref ref;
	/* NOTE(review): presumably timestamps taken at capture time - confirm. */
	ktime_t time;
	ktime_t boottime;
	ktime_t uptime;
	unsigned long capture;

	struct drm_i915_private *i915;

	/* Pointer to GT entry(ies); that type chains through its own @next. */
	struct intel_gt_coredump *gt;

	char error_msg[128];
	bool simulated;
	bool wakelock;
	bool suspended;
	int iommu;
	u32 reset_count;
	u32 suspend_count;

	/* The following four structures are stored by value. */
	struct intel_device_info device_info;
	struct intel_runtime_info runtime_info;
	struct intel_driver_caps driver_caps;
	struct i915_params params;

	struct intel_overlay_error_state *overlay;

	/* NOTE(review): presumably the formatted output buffer - confirm. */
	struct scatterlist *sgl, *fit;
};
194 
/*
 * Per-device error and reset bookkeeping. The counters are atomic_t values;
 * i915_reset_count() and i915_reset_engine_count() read them.
 */
struct i915_gpu_error {
	/* For reset and error_state handling. */
	spinlock_t lock;
	/* Protected by the above dev->gpu_error.lock. */
	struct i915_gpu_coredump *first_error;

	atomic_t pending_fb_pin;

	/** Number of times the device has been reset (global) */
	atomic_t reset_count;

	/** Number of times an engine has been reset */
	atomic_t reset_engine_count[I915_NUM_ENGINES];
};
209 
/*
 * Output buffer state handed to i915_error_printf() and
 * intel_gpu_error_print_vma(), which take a pointer to this type.
 */
struct drm_i915_error_state_buf {
	struct drm_i915_private *i915;
	struct scatterlist *sgl, *cur, *end;

	/*
	 * NOTE(review): presumably the current text buffer, its fill level and
	 * its capacity - confirm against i915_gpu_error.c.
	 */
	char *buf;
	size_t bytes;
	size_t size;
	loff_t iter;

	/* NOTE(review): presumably a sticky error code - confirm. */
	int err;
};
221 
222 static inline u32 i915_reset_count(struct i915_gpu_error *error)
223 {
224 	return atomic_read(&error->reset_count);
225 }
226 
227 static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
228 					  const struct intel_engine_cs *engine)
229 {
230 	return atomic_read(&error->reset_engine_count[engine->uabi_class]);
231 }
232 
/*
 * Core dump flag bits.
 * NOTE(review): presumably the values accepted by the u32 dump_flags
 * parameter of the capture/alloc functions declared in this header - confirm
 * against the callers.
 */
#define CORE_DUMP_FLAG_NONE           0x0
#define CORE_DUMP_FLAG_IS_GUC_CAPTURE BIT(0)
235 
236 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
237 
/*
 * Output helpers, implemented outside this header.
 *
 * i915_error_printf() takes a printf-style format in @f followed by its
 * arguments; the __printf(2, 3) annotation lets the compiler check them.
 */
__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
/* NOTE(review): presumably prints @vma for @engine into @m - confirm. */
void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
			       const struct intel_engine_cs *engine,
			       const struct i915_vma_coredump *vma);
/* NOTE(review): presumably returns the batch VMA record of @ee - confirm. */
struct i915_vma_coredump *
intel_gpu_error_find_batch(const struct intel_engine_coredump *ee);
245 
/*
 * Capture entry points and allocators, implemented outside this header.
 * The ones listed after i915_gpu_coredump() have inline no-op/NULL stubs in
 * the #else branch below for builds without CONFIG_DRM_I915_CAPTURE_ERROR;
 * i915_gpu_coredump() itself has no stub.
 */
struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
					    intel_engine_mask_t engine_mask, u32 dump_flags);
void i915_capture_error_state(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask, u32 dump_flags);

/* Allocators; @gfp carries the allocation flags. */
struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);

struct intel_gt_coredump *
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags);

struct intel_engine_coredump *
intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags);
259 
/*
 * Request/VMA capture steps, implemented outside this header.
 * NOTE(review): the return value of intel_engine_coredump_add_request() has
 * the same type as the @capture argument of intel_engine_coredump_add_vma(),
 * and i915_vma_capture_prepare() returns the type that add_vma and
 * i915_vma_capture_finish() accept as @compress; presumably they are meant
 * to be used together in that order - confirm against the callers.
 */
struct intel_engine_capture_vma *
intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
				  struct i915_request *rq,
				  gfp_t gfp);

void intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
				   struct intel_engine_capture_vma *capture,
				   struct i915_vma_compress *compress);

struct i915_vma_compress *
i915_vma_capture_prepare(struct intel_gt_coredump *gt);

void i915_vma_capture_finish(struct intel_gt_coredump *gt,
			     struct i915_vma_compress *compress);

/* NOTE(review): presumably publishes @error as the stored error state. */
void i915_error_state_store(struct i915_gpu_coredump *error);
276 
277 static inline struct i915_gpu_coredump *
278 i915_gpu_coredump_get(struct i915_gpu_coredump *gpu)
279 {
280 	kref_get(&gpu->ref);
281 	return gpu;
282 }
283 
/*
 * Implemented outside this header.
 * NOTE(review): presumably copies up to @count bytes of the formatted dump,
 * starting at @offset, into @buf and returns the byte count or a negative
 * error - confirm against the implementation.
 */
ssize_t
i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
				 char *buf, loff_t offset, size_t count);
287 
288 void __i915_gpu_coredump_free(struct kref *kref);
289 static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
290 {
291 	if (gpu)
292 		kref_put(&gpu->ref, __i915_gpu_coredump_free);
293 }
294 
/*
 * Access to the stored error state, implemented outside this header.
 * Inline stubs with the same signatures exist in the #else branch below for
 * builds without CONFIG_DRM_I915_CAPTURE_ERROR.
 */
struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915);
void i915_reset_error_state(struct drm_i915_private *i915);
void i915_disable_error_state(struct drm_i915_private *i915, int err);
298 
299 #else
300 
/*
 * Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing.
 * The __printf(2, 3) annotation is kept so the format arguments are still
 * checked by the compiler.
 */
__printf(2, 3)
static inline void
i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
}

/* Stub for builds without CONFIG_DRM_I915_CAPTURE_ERROR: does nothing. */
static inline void
i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
{
}
311 
/*
 * Allocator stubs for builds without CONFIG_DRM_I915_CAPTURE_ERROR.
 * Each one ignores its arguments and returns NULL.
 */
static inline struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
{
	return NULL;
}

static inline struct intel_gt_coredump *
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags)
{
	return NULL;
}

static inline struct intel_engine_coredump *
intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags)
{
	return NULL;
}
329 
/*
 * Capture-step stubs for builds without CONFIG_DRM_I915_CAPTURE_ERROR.
 * The pointer-returning ones return NULL; the void ones do nothing.
 */
static inline struct intel_engine_capture_vma *
intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
				  struct i915_request *rq,
				  gfp_t gfp)
{
	return NULL;
}

static inline void
intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
			      struct intel_engine_capture_vma *capture,
			      struct i915_vma_compress *compress)
{
}

static inline struct i915_vma_compress *
i915_vma_capture_prepare(struct intel_gt_coredump *gt)
{
	return NULL;
}

static inline void
i915_vma_capture_finish(struct intel_gt_coredump *gt,
			struct i915_vma_compress *compress)
{
}
356 
/*
 * Error-state stubs for builds without CONFIG_DRM_I915_CAPTURE_ERROR.
 * All of them ignore their arguments.
 */
static inline void
i915_error_state_store(struct i915_gpu_coredump *error)
{
}

/* Stub: no reference is dropped here. */
static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
{
}

/* Stub: always returns ERR_PTR(-ENODEV), never NULL or a valid pointer. */
static inline struct i915_gpu_coredump *
i915_first_error_state(struct drm_i915_private *i915)
{
	return ERR_PTR(-ENODEV);
}

static inline void i915_reset_error_state(struct drm_i915_private *i915)
{
}

static inline void i915_disable_error_state(struct drm_i915_private *i915,
					    int err)
{
}
380 
381 #endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */
382 
383 #endif /* _I915_GPU_ERROR_H_ */
384