1 /* SPDX-License-Identifier: MIT */ 2 /* 3 * Copyright 2025 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 * 23 */ 24 25 #ifndef __RAS_EEPROM_H__ 26 #define __RAS_EEPROM_H__ 27 #include "ras_sys.h" 28 29 #define RAS_TABLE_VER_V1 0x00010000 30 #define RAS_TABLE_VER_V2_1 0x00021000 31 #define RAS_TABLE_VER_V3 0x00030000 32 33 #define NONSTOP_OVER_THRESHOLD -2 34 #define WARN_NONSTOP_OVER_THRESHOLD -1 35 #define DISABLE_RETIRE_PAGE 0 36 37 /* 38 * Bad address pfn : eeprom_umc_record.retired_row_pfn[39:0], 39 * nps mode: eeprom_umc_record.retired_row_pfn[47:40] 40 */ 41 #define EEPROM_RECORD_UMC_ADDR_MASK 0xFFFFFFFFFFULL 42 #define EEPROM_RECORD_UMC_NPS_MASK 0xFF0000000000ULL 43 #define EEPROM_RECORD_UMC_NPS_SHIFT 40 44 45 #define EEPROM_RECORD_UMC_NPS_MODE(RECORD) \ 46 (((RECORD)->retired_row_pfn & EEPROM_RECORD_UMC_NPS_MASK) >> \ 47 EEPROM_RECORD_UMC_NPS_SHIFT) 48 49 #define EEPROM_RECORD_UMC_ADDR_PFN(RECORD) \ 50 ((RECORD)->retired_row_pfn & EEPROM_RECORD_UMC_ADDR_MASK) 51 52 #define EEPROM_RECORD_SETUP_UMC_ADDR_AND_NPS(RECORD, ADDR, NPS) \ 53 do { \ 54 uint64_t tmp = (NPS); \ 55 tmp = ((tmp << EEPROM_RECORD_UMC_NPS_SHIFT) & EEPROM_RECORD_UMC_NPS_MASK); \ 56 tmp |= (ADDR) & EEPROM_RECORD_UMC_ADDR_MASK; \ 57 (RECORD)->retired_row_pfn = tmp; \ 58 } while (0) 59 60 enum ras_gpu_health_status { 61 RAS_GPU_HEALTH_NONE = 0, 62 RAS_GPU_HEALTH_USABLE = 1, 63 RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2, 64 RAS_GPU_IN_BAD_STATUS = 3, 65 }; 66 67 enum ras_eeprom_err_type { 68 RAS_EEPROM_ERR_NA, 69 RAS_EEPROM_ERR_RECOVERABLE, 70 RAS_EEPROM_ERR_NON_RECOVERABLE, 71 RAS_EEPROM_ERR_COUNT, 72 }; 73 74 struct ras_eeprom_table_header { 75 uint32_t header; 76 uint32_t version; 77 uint32_t first_rec_offset; 78 uint32_t tbl_size; 79 uint32_t checksum; 80 } __packed; 81 82 struct ras_eeprom_table_ras_info { 83 u8 rma_status; 84 u8 health_percent; 85 u16 ecc_page_threshold; 86 u32 padding[64 - 1]; 87 } __packed; 88 89 struct ras_eeprom_control { 90 struct ras_eeprom_table_header tbl_hdr; 91 struct ras_eeprom_table_ras_info tbl_rai; 92 93 /* record threshold */ 94 int record_threshold_config; 95 uint32_t record_threshold_count; 96 bool update_channel_flag; 97 98 const struct ras_eeprom_sys_func *sys_func; 99 void *i2c_adapter; 100 u32 i2c_port; 101 u16 max_read_len; 102 u16 max_write_len; 103 104 /* Base I2C EEPPROM 19-bit memory address, 105 * where the table is located. For more information, 106 * see top of amdgpu_eeprom.c. 107 */ 108 u32 i2c_address; 109 110 /* The byte offset off of @i2c_address 111 * where the table header is found, 112 * and where the records start--always 113 * right after the header. 114 */ 115 u32 ras_header_offset; 116 u32 ras_info_offset; 117 u32 ras_record_offset; 118 119 /* Number of records in the table. 120 */ 121 u32 ras_num_recs; 122 123 /* First record index to read, 0-based. 124 * Range is [0, num_recs-1]. This is 125 * an absolute index, starting right after 126 * the table header. 127 */ 128 u32 ras_fri; 129 130 /* Maximum possible number of records 131 * we could store, i.e. the maximum capacity 132 * of the table. 133 */ 134 u32 ras_max_record_count; 135 136 /* Protect table access via this mutex. 137 */ 138 struct mutex ras_tbl_mutex; 139 140 /* Record channel info which occurred bad pages 141 */ 142 u32 bad_channel_bitmap; 143 }; 144 145 /* 146 * Represents single table record. Packed to be easily serialized into byte 147 * stream. 148 */ 149 struct eeprom_umc_record { 150 151 union { 152 uint64_t address; 153 uint64_t offset; 154 }; 155 156 uint64_t retired_row_pfn; 157 uint64_t ts; 158 159 enum ras_eeprom_err_type err_type; 160 161 union { 162 unsigned char bank; 163 unsigned char cu; 164 }; 165 166 unsigned char mem_channel; 167 unsigned char mcumc_id; 168 169 /* The following variables will not be saved to eeprom. 170 */ 171 uint64_t cur_nps_retired_row_pfn; 172 uint32_t cur_nps_bank; 173 uint32_t cur_nps; 174 }; 175 176 struct ras_core_context; 177 int ras_eeprom_hw_init(struct ras_core_context *ras_core); 178 int ras_eeprom_hw_fini(struct ras_core_context *ras_core); 179 180 int ras_eeprom_reset_table(struct ras_core_context *ras_core); 181 182 bool ras_eeprom_check_safety_watermark(struct ras_core_context *ras_core); 183 184 int ras_eeprom_read(struct ras_core_context *ras_core, 185 struct eeprom_umc_record *records, const u32 num); 186 187 int ras_eeprom_append(struct ras_core_context *ras_core, 188 struct eeprom_umc_record *records, const u32 num); 189 190 uint32_t ras_eeprom_max_record_count(struct ras_core_context *ras_core); 191 uint32_t ras_eeprom_get_record_count(struct ras_core_context *ras_core); 192 void ras_eeprom_sync_info(struct ras_core_context *ras_core); 193 194 int ras_eeprom_check_storage_status(struct ras_core_context *ras_core); 195 enum ras_gpu_health_status 196 ras_eeprom_check_gpu_status(struct ras_core_context *ras_core); 197 #endif 198