1 /* 2 * Copyright 2019 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 */ 23 24 #include "amdgpu.h" 25 #include "umc_v6_7.h" 26 27 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, 28 struct ras_err_data *err_data, uint64_t err_addr, 29 uint32_t ch_inst, uint32_t umc_inst) 30 { 31 switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { 32 case IP_VERSION(6, 7, 0): 33 umc_v6_7_convert_error_address(adev, 34 err_data, err_addr, ch_inst, umc_inst); 35 break; 36 default: 37 dev_warn(adev->dev, 38 "UMC address to Physical address translation is not supported\n"); 39 return AMDGPU_RAS_FAIL; 40 } 41 42 return AMDGPU_RAS_SUCCESS; 43 } 44 45 int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, 46 uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) 47 { 48 struct ras_err_data err_data; 49 int ret; 50 51 ret = amdgpu_ras_error_data_init(&err_data); 52 if (ret) 53 return ret; 54 55 err_data.err_addr = 56 kcalloc(adev->umc.max_ras_err_cnt_per_query, 57 sizeof(struct eeprom_table_record), GFP_KERNEL); 58 if (!err_data.err_addr) { 59 dev_warn(adev->dev, 60 "Failed to alloc memory for umc error record in MCA notifier!\n"); 61 ret = AMDGPU_RAS_FAIL; 62 goto out_fini_err_data; 63 } 64 65 /* 66 * Translate UMC channel address to Physical address 67 */ 68 ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr, 69 ch_inst, umc_inst); 70 if (ret) 71 goto out_free_err_addr; 72 73 if (amdgpu_bad_page_threshold != 0) { 74 amdgpu_ras_add_bad_pages(adev, err_data.err_addr, 75 err_data.err_addr_cnt); 76 amdgpu_ras_save_bad_pages(adev, NULL); 77 } 78 79 out_free_err_addr: 80 kfree(err_data.err_addr); 81 82 out_fini_err_data: 83 amdgpu_ras_error_data_fini(&err_data); 84 85 return ret; 86 } 87 88 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, 89 void *ras_error_status, 90 struct amdgpu_iv_entry *entry, 91 bool reset) 92 { 93 struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; 94 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 95 int ret = 0; 96 97 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); 98 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc)); 99 if (ret == -EOPNOTSUPP) { 100 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && 101 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) 102 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status); 103 104 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && 105 adev->umc.ras->ras_block.hw_ops->query_ras_error_address && 106 adev->umc.max_ras_err_cnt_per_query) { 107 err_data->err_addr = 108 kcalloc(adev->umc.max_ras_err_cnt_per_query, 109 sizeof(struct eeprom_table_record), GFP_KERNEL); 110 111 /* still call query_ras_error_address to clear error status 112 * even NOMEM error is encountered 113 */ 114 if(!err_data->err_addr) 115 dev_warn(adev->dev, "Failed to alloc memory for " 116 "umc error address record!\n"); 117 118 /* umc query_ras_error_address is also responsible for clearing 119 * error status 120 */ 121 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status); 122 } 123 } else if (!ret) { 124 if (adev->umc.ras && 125 adev->umc.ras->ecc_info_query_ras_error_count) 126 adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status); 127 128 if (adev->umc.ras && 129 adev->umc.ras->ecc_info_query_ras_error_address && 130 adev->umc.max_ras_err_cnt_per_query) { 131 err_data->err_addr = 132 kcalloc(adev->umc.max_ras_err_cnt_per_query, 133 sizeof(struct eeprom_table_record), GFP_KERNEL); 134 135 /* still call query_ras_error_address to clear error status 136 * even NOMEM error is encountered 137 */ 138 if(!err_data->err_addr) 139 dev_warn(adev->dev, "Failed to alloc memory for " 140 "umc error address record!\n"); 141 142 /* umc query_ras_error_address is also responsible for clearing 143 * error status 144 */ 145 adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status); 146 } 147 } 148 149 /* only uncorrectable error needs gpu reset */ 150 if (err_data->ue_count) { 151 dev_info(adev->dev, "%ld uncorrectable hardware errors " 152 "detected in UMC block\n", 153 err_data->ue_count); 154 155 if ((amdgpu_bad_page_threshold != 0) && 156 err_data->err_addr_cnt) { 157 amdgpu_ras_add_bad_pages(adev, err_data->err_addr, 158 err_data->err_addr_cnt); 159 amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count)); 160 161 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); 162 163 if (con->update_channel_flag == true) { 164 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); 165 con->update_channel_flag = false; 166 } 167 } 168 169 if (reset) { 170 /* use mode-2 reset for poison consumption */ 171 if (!entry) 172 con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; 173 amdgpu_ras_reset_gpu(adev); 174 } 175 } 176 177 kfree(err_data->err_addr); 178 return AMDGPU_RAS_SUCCESS; 179 } 180 181 int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) 182 { 183 int ret = AMDGPU_RAS_SUCCESS; 184 185 if (adev->gmc.xgmi.connected_to_cpu || 186 adev->gmc.is_app_apu) { 187 if (reset) { 188 /* MCA poison handler is only responsible for GPU reset, 189 * let MCA notifier do page retirement. 190 */ 191 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); 192 amdgpu_ras_reset_gpu(adev); 193 } 194 return ret; 195 } 196 197 if (!amdgpu_sriov_vf(adev)) { 198 struct ras_err_data err_data; 199 struct ras_common_if head = { 200 .block = AMDGPU_RAS_BLOCK__UMC, 201 }; 202 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); 203 204 ret = amdgpu_ras_error_data_init(&err_data); 205 if (ret) 206 return ret; 207 208 ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset); 209 210 if (ret == AMDGPU_RAS_SUCCESS && obj) { 211 obj->err_data.ue_count += err_data.ue_count; 212 obj->err_data.ce_count += err_data.ce_count; 213 } 214 215 amdgpu_ras_error_data_fini(&err_data); 216 } else { 217 if (adev->virt.ops && adev->virt.ops->ras_poison_handler) 218 adev->virt.ops->ras_poison_handler(adev); 219 else 220 dev_warn(adev->dev, 221 "No ras_poison_handler interface in SRIOV!\n"); 222 } 223 224 return ret; 225 } 226 227 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, 228 void *ras_error_status, 229 struct amdgpu_iv_entry *entry) 230 { 231 return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true); 232 } 233 234 int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) 235 { 236 int err; 237 struct amdgpu_umc_ras *ras; 238 239 if (!adev->umc.ras) 240 return 0; 241 242 ras = adev->umc.ras; 243 244 err = amdgpu_ras_register_ras_block(adev, &ras->ras_block); 245 if (err) { 246 dev_err(adev->dev, "Failed to register umc ras block!\n"); 247 return err; 248 } 249 250 strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc"); 251 ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC; 252 ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; 253 adev->umc.ras_if = &ras->ras_block.ras_comm; 254 255 if (!ras->ras_block.ras_late_init) 256 ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init; 257 258 if (!ras->ras_block.ras_cb) 259 ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb; 260 261 return 0; 262 } 263 264 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) 265 { 266 int r; 267 268 r = amdgpu_ras_block_late_init(adev, ras_block); 269 if (r) 270 return r; 271 272 if (amdgpu_ras_is_supported(adev, ras_block->block)) { 273 r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); 274 if (r) 275 goto late_fini; 276 } 277 278 /* ras init of specific umc version */ 279 if (adev->umc.ras && 280 adev->umc.ras->err_cnt_init) 281 adev->umc.ras->err_cnt_init(adev); 282 283 return 0; 284 285 late_fini: 286 amdgpu_ras_block_late_fini(adev, ras_block); 287 return r; 288 } 289 290 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, 291 struct amdgpu_irq_src *source, 292 struct amdgpu_iv_entry *entry) 293 { 294 struct ras_common_if *ras_if = adev->umc.ras_if; 295 struct ras_dispatch_if ih_data = { 296 .entry = entry, 297 }; 298 299 if (!ras_if) 300 return 0; 301 302 ih_data.head = *ras_if; 303 304 amdgpu_ras_interrupt_dispatch(adev, &ih_data); 305 return 0; 306 } 307 308 void amdgpu_umc_fill_error_record(struct ras_err_data *err_data, 309 uint64_t err_addr, 310 uint64_t retired_page, 311 uint32_t channel_index, 312 uint32_t umc_inst) 313 { 314 struct eeprom_table_record *err_rec = 315 &err_data->err_addr[err_data->err_addr_cnt]; 316 317 err_rec->address = err_addr; 318 /* page frame address is saved */ 319 err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT; 320 err_rec->ts = (uint64_t)ktime_get_real_seconds(); 321 err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; 322 err_rec->cu = 0; 323 err_rec->mem_channel = channel_index; 324 err_rec->mcumc_id = umc_inst; 325 326 err_data->err_addr_cnt++; 327 } 328 329 int amdgpu_umc_loop_channels(struct amdgpu_device *adev, 330 umc_func func, void *data) 331 { 332 uint32_t node_inst = 0; 333 uint32_t umc_inst = 0; 334 uint32_t ch_inst = 0; 335 int ret = 0; 336 337 if (adev->umc.node_inst_num) { 338 LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { 339 ret = func(adev, node_inst, umc_inst, ch_inst, data); 340 if (ret) { 341 dev_err(adev->dev, "Node %d umc %d ch %d func returns %d\n", 342 node_inst, umc_inst, ch_inst, ret); 343 return ret; 344 } 345 } 346 } else { 347 LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) { 348 ret = func(adev, 0, umc_inst, ch_inst, data); 349 if (ret) { 350 dev_err(adev->dev, "Umc %d ch %d func returns %d\n", 351 umc_inst, ch_inst, ret); 352 return ret; 353 } 354 } 355 } 356 357 return 0; 358 } 359