1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a copy 6 * of this software and associated documentation files (the "Software"), to deal 7 * in the Software without restriction, including without limitation the rights 8 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 * copies of the Software, and to permit persons to whom the Software is 10 * furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 * THE SOFTWARE. 22 */ 23 24 #include "amdgpu.h" 25 #include "amdgpu_reset.h" 26 #include "amdgpu_xgmi.h" 27 #include "ras_sys.h" 28 #include "amdgpu_ras_mgr.h" 29 #include "amdgpu_ras_process.h" 30 31 #define RAS_MGR_RETIRE_PAGE_INTERVAL 100 32 #define RAS_EVENT_PROCESS_TIMEOUT 1200 33 34 static void ras_process_retire_page_dwork(struct work_struct *work) 35 { 36 struct amdgpu_ras_mgr *ras_mgr = 37 container_of(work, struct amdgpu_ras_mgr, retire_page_dwork.work); 38 struct amdgpu_device *adev = ras_mgr->adev; 39 int ret; 40 41 if (amdgpu_ras_is_rma(adev)) 42 return; 43 44 /* If gpu reset is ongoing, delay retiring the bad pages */ 45 if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { 46 schedule_delayed_work(&ras_mgr->retire_page_dwork, 47 msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL * 3)); 48 return; 49 } 50 51 ret = ras_umc_handle_bad_pages(ras_mgr->ras_core, NULL); 52 if (!ret) 53 schedule_delayed_work(&ras_mgr->retire_page_dwork, 54 msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL)); 55 } 56 57 int amdgpu_ras_process_init(struct amdgpu_device *adev) 58 { 59 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 60 61 ras_mgr->is_paused = false; 62 init_completion(&ras_mgr->ras_event_done); 63 64 INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork); 65 66 return 0; 67 } 68 69 int amdgpu_ras_process_fini(struct amdgpu_device *adev) 70 { 71 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 72 73 ras_mgr->is_paused = false; 74 /* Save all cached bad pages to eeprom */ 75 flush_delayed_work(&ras_mgr->retire_page_dwork); 76 cancel_delayed_work_sync(&ras_mgr->retire_page_dwork); 77 return 0; 78 } 79 80 int amdgpu_ras_process_handle_umc_interrupt(struct amdgpu_device *adev, void *data) 81 { 82 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 83 84 if (!ras_mgr->ras_core) 85 return -EINVAL; 86 87 return ras_process_add_interrupt_req(ras_mgr->ras_core, NULL, true); 88 } 89 90 int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev, void *data) 91 { 92 amdgpu_ras_set_fed(adev, true); 93 return amdgpu_ras_mgr_reset_gpu(adev, AMDGPU_RAS_GPU_RESET_MODE1_RESET); 94 } 95 96 int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, void *data) 97 { 98 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 99 struct ras_ih_info *ih_info = (struct ras_ih_info *)data; 100 struct ras_event_req req; 101 uint64_t seqno; 102 103 if (!ih_info) 104 return -EINVAL; 105 106 memset(&req, 0, sizeof(req)); 107 req.block = ih_info->block; 108 req.data = ih_info->data; 109 req.pasid = ih_info->pasid; 110 req.pasid_fn = ih_info->pasid_fn; 111 req.reset = ih_info->reset; 112 113 seqno = ras_core_get_seqno(ras_mgr->ras_core, 114 RAS_SEQNO_TYPE_POISON_CONSUMPTION, false); 115 116 /* When the ACA register cannot be read from FW, the poison 117 * consumption seqno in the fifo will not pop up, so it is 118 * necessary to check whether the seqno is the previous seqno. 119 */ 120 if (seqno == ras_mgr->last_poison_consumption_seqno) { 121 /* Pop and discard the previous seqno */ 122 ras_core_get_seqno(ras_mgr->ras_core, 123 RAS_SEQNO_TYPE_POISON_CONSUMPTION, true); 124 seqno = ras_core_get_seqno(ras_mgr->ras_core, 125 RAS_SEQNO_TYPE_POISON_CONSUMPTION, false); 126 } 127 ras_mgr->last_poison_consumption_seqno = seqno; 128 req.seqno = seqno; 129 130 return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false); 131 } 132 133 int amdgpu_ras_process_begin(struct amdgpu_device *adev) 134 { 135 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 136 137 if (ras_mgr->is_paused) 138 return -EAGAIN; 139 140 reinit_completion(&ras_mgr->ras_event_done); 141 return 0; 142 } 143 144 int amdgpu_ras_process_end(struct amdgpu_device *adev) 145 { 146 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 147 148 complete(&ras_mgr->ras_event_done); 149 return 0; 150 } 151 152 int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev) 153 { 154 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 155 long rc; 156 157 if (!ras_mgr || !ras_mgr->ras_core) 158 return -EINVAL; 159 160 if (!ras_mgr->ras_core->is_initialized) 161 return -EPERM; 162 163 ras_mgr->is_paused = true; 164 165 /* Wait for RAS event processing to complete */ 166 rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done, 167 msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT)); 168 if (rc <= 0) 169 RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n", 170 rc ? "interrupted" : "timeout"); 171 172 flush_delayed_work(&ras_mgr->retire_page_dwork); 173 return 0; 174 } 175 176 int amdgpu_ras_process_post_reset(struct amdgpu_device *adev) 177 { 178 struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); 179 180 if (!ras_mgr || !ras_mgr->ras_core) 181 return -EINVAL; 182 183 if (!ras_mgr->ras_core->is_initialized) 184 return -EPERM; 185 186 ras_mgr->is_paused = false; 187 188 schedule_delayed_work(&ras_mgr->retire_page_dwork, 0); 189 return 0; 190 } 191