xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h (revision 6dfafbd0299a60bfb5d5e277fdf100037c7ded07)
1 /*
2  * Copyright 2019 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 #ifndef _AMDGPU_RAS_EEPROM_H
25 #define _AMDGPU_RAS_EEPROM_H
26 
27 #include <linux/i2c.h>
28 
/*
 * RAS EEPROM table format version identifiers.
 * NOTE(review): presumably compared against the `version` field of
 * struct amdgpu_ras_eeprom_table_header -- confirm in amdgpu_ras_eeprom.c.
 */
#define RAS_TABLE_VER_V1           0x00010000
#define RAS_TABLE_VER_V2_1         0x00021000
#define RAS_TABLE_VER_V3           0x00030000

/* Forward declaration: this header only passes the device around by pointer. */
struct amdgpu_device;
34 
/*
 * GPU health states.
 * Only the values 0 and 2 are defined here; 1 is left unassigned.
 * NOTE(review): presumably the value kept in
 * amdgpu_ras_eeprom_table_ras_info.rma_status -- confirm against the users.
 */
enum amdgpu_ras_gpu_health_status {
	GPU_HEALTH_USABLE = 0,
	GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
};
39 
/*
 * Error classification used by the err_type member of
 * struct eeprom_table_record.
 */
enum amdgpu_ras_eeprom_err_type {
	AMDGPU_RAS_EEPROM_ERR_NA = 0,
	AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,
	AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE,
	/* Sentinel: equals the number of enumerators above (3); keep last. */
	AMDGPU_RAS_EEPROM_ERR_COUNT,
};
46 
/*
 * Table header: five consecutive 32-bit words, 20 bytes in total.
 * NOTE(review): the per-field remarks below are inferred from the field
 * names only -- confirm against amdgpu_ras_eeprom.c.
 */
struct amdgpu_ras_eeprom_table_header {
	uint32_t header;		/* presumably a signature word */
	uint32_t version;		/* presumably one of RAS_TABLE_VER_* */
	uint32_t first_rec_offset;	/* presumably offset of the first record */
	uint32_t tbl_size;		/* presumably the table size */
	uint32_t checksum;
} __packed;
54 
55 struct amdgpu_ras_eeprom_table_ras_info {
56 	u8  rma_status;
57 	u8  health_percent;
58 	u16 ecc_page_threshold;
59 	u32 padding[64 - 1];
60 } __packed;
61 
62 struct amdgpu_ras_eeprom_control {
63 	struct amdgpu_ras_eeprom_table_header tbl_hdr;
64 
65 	struct amdgpu_ras_eeprom_table_ras_info tbl_rai;
66 
67 	/* Base I2C EEPPROM 19-bit memory address,
68 	 * where the table is located. For more information,
69 	 * see top of amdgpu_eeprom.c.
70 	 */
71 	u32 i2c_address;
72 
73 	/* The byte offset off of @i2c_address
74 	 * where the table header is found,
75 	 * and where the records start--always
76 	 * right after the header.
77 	 */
78 	u32 ras_header_offset;
79 	u32 ras_info_offset;
80 	u32 ras_record_offset;
81 
82 	/* Number of records in the table.
83 	 */
84 	u32 ras_num_recs;
85 	u32 ras_num_recs_old;
86 
87 	/* the bad page number is ras_num_recs or
88 	 * ras_num_recs * umc.retire_unit
89 	 */
90 	u32 ras_num_bad_pages;
91 
92 	/* Number of records store mca address */
93 	u32 ras_num_mca_recs;
94 
95 	/* Number of records store physical address */
96 	u32 ras_num_pa_recs;
97 
98 	/* First record index to read, 0-based.
99 	 * Range is [0, num_recs-1]. This is
100 	 * an absolute index, starting right after
101 	 * the table header.
102 	 */
103 	u32 ras_fri;
104 
105 	/* Maximum possible number of records
106 	 * we could store, i.e. the maximum capacity
107 	 * of the table.
108 	 */
109 	u32 ras_max_record_count;
110 
111 	/* Protect table access via this mutex.
112 	 */
113 	struct mutex ras_tbl_mutex;
114 
115 	/* Record channel info which occurred bad pages
116 	 */
117 	u32 bad_channel_bitmap;
118 
119 	bool is_eeprom_valid;
120 };
121 
122 /*
123  * Represents single table record. Packed to be easily serialized into byte
124  * stream.
125  */
126 struct eeprom_table_record {
127 
128 	union {
129 		uint64_t address;
130 		uint64_t offset;
131 	};
132 
133 	uint64_t retired_page;
134 	uint64_t ts;
135 
136 	enum amdgpu_ras_eeprom_err_type err_type;
137 
138 	union {
139 		unsigned char bank;
140 		unsigned char cu;
141 	};
142 
143 	unsigned char mem_channel;
144 	unsigned char mcumc_id;
145 } __packed;
146 
147 int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
148 
149 int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
150 
151 bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev);
152 
153 int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
154 			   struct eeprom_table_record *records, const u32 num);
155 
156 int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
157 			     struct eeprom_table_record *records, const u32 num);
158 
159 uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control);
160 
161 void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
162 
163 int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control);
164 
165 void amdgpu_ras_eeprom_check_and_recover(struct amdgpu_device *adev);
166 
/*
 * amdgpu_ras_smu_* interface. Each accessor takes the device and passes
 * its result back through the trailing pointer argument; the int return
 * value is a status.
 * NOTE(review): presumably 0 on success / negative errno, and presumably
 * these route table access through the SMU instead of direct I2C --
 * confirm in amdgpu_ras_eeprom.c.
 */
bool amdgpu_ras_smu_eeprom_supported(struct amdgpu_device *adev);

int amdgpu_ras_smu_get_table_version(struct amdgpu_device *adev,
				     uint32_t *table_version);

int amdgpu_ras_smu_get_badpage_count(struct amdgpu_device *adev,
				     uint32_t *count, uint32_t timeout);

int amdgpu_ras_smu_get_badpage_mca_addr(struct amdgpu_device *adev,
					uint16_t index, uint64_t *mca_addr);

int amdgpu_ras_smu_set_timestamp(struct amdgpu_device *adev,
				 uint64_t timestamp);

int amdgpu_ras_smu_get_timestamp(struct amdgpu_device *adev,
				 uint16_t index, uint64_t *timestamp);

int amdgpu_ras_smu_get_badpage_ipid(struct amdgpu_device *adev,
				    uint16_t index, uint64_t *ipid);

int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
				   uint32_t *result);
189 
190 int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
191 			struct eeprom_table_record *record, u32 rec_idx,
192 			const u32 num);
193 
194 int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control);
195 
/*
 * file_operations tables for the debugfs "size" and "table" entries of the
 * RAS EEPROM; declared here and defined in another translation unit.
 */
extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
198 
199 #endif // _AMDGPU_RAS_EEPROM_H
200