1 /* 2 * Copyright 2021 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 */ 23 24 #include <linux/devcoredump.h> 25 #include <generated/utsrelease.h> 26 27 #include "amdgpu_reset.h" 28 #include "aldebaran.h" 29 #include "sienna_cichlid.h" 30 #include "smu_v13_0_10.h" 31 32 int amdgpu_reset_init(struct amdgpu_device *adev) 33 { 34 int ret = 0; 35 36 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 37 case IP_VERSION(13, 0, 2): 38 case IP_VERSION(13, 0, 6): 39 ret = aldebaran_reset_init(adev); 40 break; 41 case IP_VERSION(11, 0, 7): 42 ret = sienna_cichlid_reset_init(adev); 43 break; 44 case IP_VERSION(13, 0, 10): 45 ret = smu_v13_0_10_reset_init(adev); 46 break; 47 default: 48 break; 49 } 50 51 return ret; 52 } 53 54 int amdgpu_reset_fini(struct amdgpu_device *adev) 55 { 56 int ret = 0; 57 58 switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { 59 case IP_VERSION(13, 0, 2): 60 case IP_VERSION(13, 0, 6): 61 ret = aldebaran_reset_fini(adev); 62 break; 63 case IP_VERSION(11, 0, 7): 64 ret = sienna_cichlid_reset_fini(adev); 65 break; 66 case IP_VERSION(13, 0, 10): 67 ret = smu_v13_0_10_reset_fini(adev); 68 break; 69 default: 70 break; 71 } 72 73 return ret; 74 } 75 76 int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev, 77 struct amdgpu_reset_context *reset_context) 78 { 79 struct amdgpu_reset_handler *reset_handler = NULL; 80 81 if (adev->reset_cntl && adev->reset_cntl->get_reset_handler) 82 reset_handler = adev->reset_cntl->get_reset_handler( 83 adev->reset_cntl, reset_context); 84 if (!reset_handler) 85 return -EOPNOTSUPP; 86 87 return reset_handler->prepare_hwcontext(adev->reset_cntl, 88 reset_context); 89 } 90 91 int amdgpu_reset_perform_reset(struct amdgpu_device *adev, 92 struct amdgpu_reset_context *reset_context) 93 { 94 int ret; 95 struct amdgpu_reset_handler *reset_handler = NULL; 96 97 if (adev->reset_cntl) 98 reset_handler = adev->reset_cntl->get_reset_handler( 99 adev->reset_cntl, reset_context); 100 if (!reset_handler) 101 return -EOPNOTSUPP; 102 103 ret = reset_handler->perform_reset(adev->reset_cntl, reset_context); 104 if (ret) 105 return ret; 106 107 return reset_handler->restore_hwcontext(adev->reset_cntl, 108 reset_context); 109 } 110 111 112 void amdgpu_reset_destroy_reset_domain(struct kref *ref) 113 { 114 struct amdgpu_reset_domain *reset_domain = container_of(ref, 115 struct amdgpu_reset_domain, 116 refcount); 117 if (reset_domain->wq) 118 destroy_workqueue(reset_domain->wq); 119 120 kvfree(reset_domain); 121 } 122 123 struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type, 124 char *wq_name) 125 { 126 struct amdgpu_reset_domain *reset_domain; 127 128 reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain), GFP_KERNEL); 129 if (!reset_domain) { 130 DRM_ERROR("Failed to allocate amdgpu_reset_domain!"); 131 return NULL; 132 } 133 134 reset_domain->type = type; 135 kref_init(&reset_domain->refcount); 136 137 reset_domain->wq = create_singlethread_workqueue(wq_name); 138 if (!reset_domain->wq) { 139 DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!"); 140 amdgpu_reset_put_reset_domain(reset_domain); 141 return NULL; 142 143 } 144 145 atomic_set(&reset_domain->in_gpu_reset, 0); 146 atomic_set(&reset_domain->reset_res, 0); 147 init_rwsem(&reset_domain->sem); 148 149 return reset_domain; 150 } 151 152 void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain) 153 { 154 atomic_set(&reset_domain->in_gpu_reset, 1); 155 down_write(&reset_domain->sem); 156 } 157 158 159 void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain) 160 { 161 atomic_set(&reset_domain->in_gpu_reset, 0); 162 up_write(&reset_domain->sem); 163 } 164 165 #ifndef CONFIG_DEV_COREDUMP 166 void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, 167 struct amdgpu_reset_context *reset_context) 168 { 169 } 170 #else 171 static ssize_t 172 amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, 173 void *data, size_t datalen) 174 { 175 struct drm_printer p; 176 struct amdgpu_coredump_info *coredump = data; 177 struct drm_print_iterator iter; 178 int i; 179 180 iter.data = buffer; 181 iter.offset = 0; 182 iter.start = offset; 183 iter.remain = count; 184 185 p = drm_coredump_printer(&iter); 186 187 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 188 drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n"); 189 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 190 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 191 drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, 192 coredump->reset_time.tv_nsec); 193 194 if (coredump->reset_task_info.pid) 195 drm_printf(&p, "process_name: %s PID: %d\n", 196 coredump->reset_task_info.process_name, 197 coredump->reset_task_info.pid); 198 199 if (coredump->ring) { 200 drm_printf(&p, "\nRing timed out details\n"); 201 drm_printf(&p, "IP Type: %d Ring Name: %s\n", 202 coredump->ring->funcs->type, 203 coredump->ring->name); 204 } 205 206 if (coredump->reset_vram_lost) 207 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 208 if (coredump->adev->reset_info.num_regs) { 209 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 210 211 for (i = 0; i < coredump->adev->reset_info.num_regs; i++) 212 drm_printf(&p, "0x%08x: 0x%08x\n", 213 coredump->adev->reset_info.reset_dump_reg_list[i], 214 coredump->adev->reset_info.reset_dump_reg_value[i]); 215 } 216 217 return count - iter.remain; 218 } 219 220 static void amdgpu_devcoredump_free(void *data) 221 { 222 kfree(data); 223 } 224 225 void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, 226 struct amdgpu_reset_context *reset_context) 227 { 228 struct amdgpu_coredump_info *coredump; 229 struct drm_device *dev = adev_to_drm(adev); 230 struct amdgpu_job *job = reset_context->job; 231 struct drm_sched_job *s_job; 232 233 coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); 234 235 if (!coredump) { 236 DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__); 237 return; 238 } 239 240 coredump->reset_vram_lost = vram_lost; 241 242 if (reset_context->job && reset_context->job->vm) { 243 struct amdgpu_task_info *ti; 244 struct amdgpu_vm *vm = reset_context->job->vm; 245 246 ti = amdgpu_vm_get_task_info_vm(vm); 247 if (ti) { 248 coredump->reset_task_info = *ti; 249 amdgpu_vm_put_task_info(ti); 250 } 251 } 252 253 if (job) { 254 s_job = &job->base; 255 coredump->ring = to_amdgpu_ring(s_job->sched); 256 } 257 258 coredump->adev = adev; 259 260 ktime_get_ts64(&coredump->reset_time); 261 262 dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, 263 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 264 } 265 #endif 266