xref: /linux/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c (revision 24f171c7e145f43b9f187578e89b0982ce87e54c)
// SPDX-License-Identifier: MIT
/*
 * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "amdgpu.h"
#include "amdgpu_reset.h"
#include "amdgpu_xgmi.h"
#include "ras_sys.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_ras_process.h"

/* Interval, in ms, between runs of the bad-page retirement work */
#define RAS_MGR_RETIRE_PAGE_INTERVAL  100
/* Maximum time, in ms, to wait for in-flight RAS event processing */
#define RAS_EVENT_PROCESS_TIMEOUT  1200

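/*
 * Self-rearming delayed work that retires bad pages in the background.
 * It stops once the device has been marked RMA, backs off to three
 * intervals while a GPU reset or RAS recovery is in flight, and
 * otherwise re-arms itself for as long as ras_umc_handle_bad_pages()
 * keeps succeeding.
 */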
static void ras_process_retire_page_dwork(struct work_struct *work)
{
	struct amdgpu_ras_mgr *ras_mgr =
		container_of(work, struct amdgpu_ras_mgr, retire_page_dwork.work);
	struct amdgpu_device *adev = ras_mgr->adev;
	int ret;

	if (amdgpu_ras_is_rma(adev))
		return;

	/* If a GPU reset is ongoing, delay retiring the bad pages */
	if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
		schedule_delayed_work(&ras_mgr->retire_page_dwork,
			msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL * 3));
		return;
	}

	ret = ras_umc_handle_bad_pages(ras_mgr->ras_core, NULL);
	if (!ret)
		schedule_delayed_work(&ras_mgr->retire_page_dwork,
			msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL));
}

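/*
 * Set up the per-device RAS processing state: clear the pause flag,
 * prepare the completion used to fence GPU reset against in-flight
 * RAS events, and initialize the bad-page retirement delayed work.
 */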
int amdgpu_ras_process_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	ras_mgr->is_paused = false;
	init_completion(&ras_mgr->ras_event_done);

	INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork);

	return 0;
}

int amdgpu_ras_process_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	ras_mgr->is_paused = false;
	/* Save all cached bad pages to EEPROM */
	flush_delayed_work(&ras_mgr->retire_page_dwork);
	cancel_delayed_work_sync(&ras_mgr->retire_page_dwork);
	return 0;
}

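/*
 * Handler for UMC (memory controller) interrupts: forwards the
 * interrupt to the RAS core without any per-event payload.
 */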
int amdgpu_ras_process_handle_umc_interrupt(struct amdgpu_device *adev, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!ras_mgr->ras_core)
		return -EINVAL;

	return ras_process_add_interrupt_req(ras_mgr->ras_core, NULL, true);
}

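/*
 * Handler for unexpected interrupts: marks the device as having
 * encountered a fatal error and requests a mode-1 GPU reset.
 */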
int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev, void *data)
{
	amdgpu_ras_set_fed(adev, true);
	return amdgpu_ras_mgr_reset_gpu(adev, AMDGPU_RAS_GPU_RESET_MODE1_RESET);
}

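/*
 * Handler for poison consumption interrupts: packages the IH data into
 * a ras_event_req, tags it with the current poison consumption seqno,
 * and queues it to the RAS core. A stale seqno left over from a
 * previous event (see the comment below) is popped and discarded first.
 */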
int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, void *data)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	struct ras_ih_info *ih_info = (struct ras_ih_info *)data;
	struct ras_event_req req;
	uint64_t seqno;

	if (!ih_info)
		return -EINVAL;

	memset(&req, 0, sizeof(req));
	req.block = ih_info->block;
	req.data = ih_info->data;
	req.pasid = ih_info->pasid;
	req.pasid_fn = ih_info->pasid_fn;
	req.reset = ih_info->reset;

	seqno = ras_core_get_seqno(ras_mgr->ras_core,
				RAS_SEQNO_TYPE_POISON_CONSUMPTION, false);

	/* When the ACA registers cannot be read from the FW, the poison
	 * consumption seqno is never popped from the FIFO, so check
	 * whether the seqno read here is still the one from the
	 * previous event.
	 */
	if (seqno == ras_mgr->last_poison_consumption_seqno) {
		/* Pop and discard the stale seqno */
		ras_core_get_seqno(ras_mgr->ras_core,
				RAS_SEQNO_TYPE_POISON_CONSUMPTION, true);
		seqno = ras_core_get_seqno(ras_mgr->ras_core,
					RAS_SEQNO_TYPE_POISON_CONSUMPTION, false);
	}
	ras_mgr->last_poison_consumption_seqno = seqno;
	req.seqno = seqno;

	return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false);
}

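/*
 * amdgpu_ras_process_begin()/amdgpu_ras_process_end() bracket the
 * handling of a single RAS event. begin() refuses new events while
 * processing is paused for a reset and re-arms the completion that
 * end() signals, which is what amdgpu_ras_process_pre_reset() below
 * waits on.
 */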
int amdgpu_ras_process_begin(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (ras_mgr->is_paused)
		return -EAGAIN;

	reinit_completion(&ras_mgr->ras_event_done);
	return 0;
}

int amdgpu_ras_process_end(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	complete(&ras_mgr->ras_event_done);
	return 0;
}

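/*
 * Pause RAS event processing before a GPU reset: set the pause flag,
 * wait (up to RAS_EVENT_PROCESS_TIMEOUT ms) for any in-flight event
 * to signal completion, then flush the bad-page retirement work so
 * cached bad pages reach the EEPROM before the reset.
 */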
int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
	long rc;

	if (!ras_mgr || !ras_mgr->ras_core)
		return -EINVAL;

	if (!ras_mgr->ras_core->is_initialized)
		return -EPERM;

	ras_mgr->is_paused = true;

	/* Wait for RAS event processing to complete */
	rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done,
			msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT));
	if (rc <= 0)
		RAS_DEV_WARN(adev, "Wait for RAS event processing to complete %s\n",
			 rc ? "was interrupted" : "timed out");

	flush_delayed_work(&ras_mgr->retire_page_dwork);
	return 0;
}

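/*
 * Resume RAS event processing after a GPU reset and kick the bad-page
 * retirement work immediately.
 */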
int amdgpu_ras_process_post_reset(struct amdgpu_device *adev)
{
	struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);

	if (!ras_mgr || !ras_mgr->ras_core)
		return -EINVAL;

	if (!ras_mgr->ras_core->is_initialized)
		return -EPERM;

	ras_mgr->is_paused = false;

	schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
	return 0;
}
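
/*
 * A minimal sketch of the caller flow implied by the pause/completion
 * logic above (assumed from this file alone, not from a documented
 * contract):
 *
 *	amdgpu_ras_process_init(adev);
 *
 *	// Per RAS event, on the event-processing path:
 *	if (!amdgpu_ras_process_begin(adev)) {
 *		... handle the queued ras_event_req ...
 *		amdgpu_ras_process_end(adev);
 *	}
 *
 *	// Around a GPU reset:
 *	amdgpu_ras_process_pre_reset(adev);	// pause + fence in-flight events
 *	... perform the reset ...
 *	amdgpu_ras_process_post_reset(adev);	// resume + kick page retirement
 *
 *	amdgpu_ras_process_fini(adev);
 */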