1 /* 2 * Copyright 2021 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 */ 23 24 #include <linux/devcoredump.h> 25 #include <generated/utsrelease.h> 26 27 #include "amdgpu_reset.h" 28 #include "aldebaran.h" 29 #include "sienna_cichlid.h" 30 #include "smu_v13_0_10.h" 31 32 int amdgpu_reset_init(struct amdgpu_device *adev) 33 { 34 int ret = 0; 35 36 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 37 case IP_VERSION(13, 0, 2): 38 case IP_VERSION(13, 0, 6): 39 ret = aldebaran_reset_init(adev); 40 break; 41 case IP_VERSION(11, 0, 7): 42 ret = sienna_cichlid_reset_init(adev); 43 break; 44 case IP_VERSION(13, 0, 10): 45 ret = smu_v13_0_10_reset_init(adev); 46 break; 47 default: 48 break; 49 } 50 51 return ret; 52 } 53 54 int amdgpu_reset_fini(struct amdgpu_device *adev) 55 { 56 int ret = 0; 57 58 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 59 case IP_VERSION(13, 0, 2): 60 case IP_VERSION(13, 0, 6): 61 ret = aldebaran_reset_fini(adev); 62 break; 63 case IP_VERSION(11, 0, 7): 64 ret = sienna_cichlid_reset_fini(adev); 65 break; 66 case IP_VERSION(13, 0, 10): 67 ret = smu_v13_0_10_reset_fini(adev); 68 break; 69 default: 70 break; 71 } 72 73 return ret; 74 } 75 76 int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev, 77 struct amdgpu_reset_context *reset_context) 78 { 79 struct amdgpu_reset_handler *reset_handler = NULL; 80 81 if (adev->reset_cntl && adev->reset_cntl->get_reset_handler) 82 reset_handler = adev->reset_cntl->get_reset_handler( 83 adev->reset_cntl, reset_context); 84 if (!reset_handler) 85 return -EOPNOTSUPP; 86 87 return reset_handler->prepare_hwcontext(adev->reset_cntl, 88 reset_context); 89 } 90 91 int amdgpu_reset_perform_reset(struct amdgpu_device *adev, 92 struct amdgpu_reset_context *reset_context) 93 { 94 int ret; 95 struct amdgpu_reset_handler *reset_handler = NULL; 96 97 if (adev->reset_cntl) 98 reset_handler = adev->reset_cntl->get_reset_handler( 99 adev->reset_cntl, reset_context); 100 if (!reset_handler) 101 return -EOPNOTSUPP; 102 103 ret = reset_handler->perform_reset(adev->reset_cntl, reset_context); 104 if (ret) 105 return ret; 106 107 return reset_handler->restore_hwcontext(adev->reset_cntl, 108 reset_context); 109 } 110 111 112 void amdgpu_reset_destroy_reset_domain(struct kref *ref) 113 { 114 struct amdgpu_reset_domain *reset_domain = container_of(ref, 115 struct amdgpu_reset_domain, 116 refcount); 117 if (reset_domain->wq) 118 destroy_workqueue(reset_domain->wq); 119 120 kvfree(reset_domain); 121 } 122 123 struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type, 124 char *wq_name) 125 { 126 struct amdgpu_reset_domain *reset_domain; 127 128 reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain), GFP_KERNEL); 129 if (!reset_domain) { 130 DRM_ERROR("Failed to allocate amdgpu_reset_domain!"); 131 return NULL; 132 } 133 134 reset_domain->type = type; 135 kref_init(&reset_domain->refcount); 136 137 reset_domain->wq = create_singlethread_workqueue(wq_name); 138 if (!reset_domain->wq) { 139 DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!"); 140 amdgpu_reset_put_reset_domain(reset_domain); 141 return NULL; 142 143 } 144 145 atomic_set(&reset_domain->in_gpu_reset, 0); 146 atomic_set(&reset_domain->reset_res, 0); 147 init_rwsem(&reset_domain->sem); 148 149 return reset_domain; 150 } 151 152 void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain) 153 { 154 atomic_set(&reset_domain->in_gpu_reset, 1); 155 down_write(&reset_domain->sem); 156 } 157 158 159 void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain) 160 { 161 atomic_set(&reset_domain->in_gpu_reset, 0); 162 up_write(&reset_domain->sem); 163 } 164 165 #ifndef CONFIG_DEV_COREDUMP 166 void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, 167 struct amdgpu_reset_context *reset_context) 168 { 169 } 170 #else 171 static ssize_t 172 amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, 173 void *data, size_t datalen) 174 { 175 struct drm_printer p; 176 struct amdgpu_coredump_info *coredump = data; 177 struct drm_print_iterator iter; 178 int i; 179 180 iter.data = buffer; 181 iter.offset = 0; 182 iter.start = offset; 183 iter.remain = count; 184 185 p = drm_coredump_printer(&iter); 186 187 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 188 drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n"); 189 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 190 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 191 drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, 192 coredump->reset_time.tv_nsec); 193 194 if (coredump->reset_task_info.pid) 195 drm_printf(&p, "process_name: %s PID: %d\n", 196 coredump->reset_task_info.process_name, 197 coredump->reset_task_info.pid); 198 199 if (coredump->ring) { 200 drm_printf(&p, "\nRing timed out details\n"); 201 drm_printf(&p, "IP Type: %d Ring Name: %s\n", 202 coredump->ring->funcs->type, 203 coredump->ring->name); 204 } 205 206 if (coredump->adev) { 207 struct amdgpu_vm_fault_info *fault_info = 208 &coredump->adev->vm_manager.fault_info; 209 210 drm_printf(&p, "\n[%s] Page fault observed\n", 211 fault_info->vmhub ? "mmhub" : "gfxhub"); 212 drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", 213 fault_info->addr); 214 drm_printf(&p, "Protection fault status register: 0x%x\n\n", 215 fault_info->status); 216 } 217 218 drm_printf(&p, "Ring buffer information\n"); 219 for (int i = 0; i < coredump->adev->num_rings; i++) { 220 int j = 0; 221 struct amdgpu_ring *ring = coredump->adev->rings[i]; 222 223 drm_printf(&p, "ring name: %s\n", ring->name); 224 drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n", 225 amdgpu_ring_get_rptr(ring), 226 amdgpu_ring_get_wptr(ring), 227 ring->buf_mask); 228 drm_printf(&p, "Ring size in dwords: %d\n", 229 ring->ring_size / 4); 230 drm_printf(&p, "Ring contents\n"); 231 drm_printf(&p, "Offset \t Value\n"); 232 233 while (j < ring->ring_size) { 234 drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j/4]); 235 j += 4; 236 } 237 } 238 239 if (coredump->reset_vram_lost) 240 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 241 if (coredump->adev->reset_info.num_regs) { 242 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 243 244 for (i = 0; i < coredump->adev->reset_info.num_regs; i++) 245 drm_printf(&p, "0x%08x: 0x%08x\n", 246 coredump->adev->reset_info.reset_dump_reg_list[i], 247 coredump->adev->reset_info.reset_dump_reg_value[i]); 248 } 249 250 return count - iter.remain; 251 } 252 253 static void amdgpu_devcoredump_free(void *data) 254 { 255 kfree(data); 256 } 257 258 void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, 259 struct amdgpu_reset_context *reset_context) 260 { 261 struct amdgpu_coredump_info *coredump; 262 struct drm_device *dev = adev_to_drm(adev); 263 struct amdgpu_job *job = reset_context->job; 264 struct drm_sched_job *s_job; 265 266 coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); 267 268 if (!coredump) { 269 DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__); 270 return; 271 } 272 273 coredump->reset_vram_lost = vram_lost; 274 275 if (reset_context->job && reset_context->job->vm) { 276 struct amdgpu_task_info *ti; 277 struct amdgpu_vm *vm = reset_context->job->vm; 278 279 ti = amdgpu_vm_get_task_info_vm(vm); 280 if (ti) { 281 coredump->reset_task_info = *ti; 282 amdgpu_vm_put_task_info(ti); 283 } 284 } 285 286 if (job) { 287 s_job = &job->base; 288 coredump->ring = to_amdgpu_ring(s_job->sched); 289 } 290 291 coredump->adev = adev; 292 293 ktime_get_ts64(&coredump->reset_time); 294 295 dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, 296 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 297 } 298 #endif 299