1 /* 2 * Copyright 2018 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 
21 * 22 * 23 */ 24 #include <linux/list.h> 25 #include "amdgpu.h" 26 #include "amdgpu_xgmi.h" 27 #include "amdgpu_ras.h" 28 #include "soc15.h" 29 #include "df/df_3_6_offset.h" 30 #include "xgmi/xgmi_4_0_0_smn.h" 31 #include "xgmi/xgmi_4_0_0_sh_mask.h" 32 #include "xgmi/xgmi_6_1_0_sh_mask.h" 33 #include "wafl/wafl2_4_0_0_smn.h" 34 #include "wafl/wafl2_4_0_0_sh_mask.h" 35 36 #include "amdgpu_reset.h" 37 38 #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c 39 #define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK 0x11a00218 40 #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 41 #define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218 42 43 #define XGMI_STATE_DISABLE 0xD1 44 #define XGMI_STATE_LS0 0x81 45 46 static DEFINE_MUTEX(xgmi_mutex); 47 48 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4 49 50 static LIST_HEAD(xgmi_hive_list); 51 52 static const int xgmi_pcs_err_status_reg_vg20[] = { 53 smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, 54 smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, 55 }; 56 57 static const int wafl_pcs_err_status_reg_vg20[] = { 58 smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, 59 smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, 60 }; 61 62 static const int xgmi_pcs_err_status_reg_arct[] = { 63 smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, 64 smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, 65 smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000, 66 smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000, 67 smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000, 68 smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000, 69 }; 70 71 /* same as vg20*/ 72 static const int wafl_pcs_err_status_reg_arct[] = { 73 smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, 74 smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, 75 }; 76 77 static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { 78 smnPCS_XGMI3X16_PCS_ERROR_STATUS, 79 smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000, 80 smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000, 81 smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000, 82 
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000, 83 smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000, 84 smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000, 85 smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000 86 }; 87 88 static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] = { 89 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK, 90 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000, 91 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x200000, 92 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x300000, 93 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x400000, 94 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x500000, 95 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x600000, 96 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x700000 97 }; 98 99 static const int walf_pcs_err_status_reg_aldebaran[] = { 100 smnPCS_GOPX1_PCS_ERROR_STATUS, 101 smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000 102 }; 103 104 static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = { 105 smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK, 106 smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000 107 }; 108 109 static const int xgmi3x16_pcs_err_status_reg_v6_4[] = { 110 smnPCS_XGMI3X16_PCS_ERROR_STATUS, 111 smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000 112 }; 113 114 static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = { 115 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK, 116 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000 117 }; 118 119 static const u64 xgmi_v6_4_0_mca_base_array[] = { 120 0x11a09200, 121 0x11b09200, 122 }; 123 124 static const char *xgmi_v6_4_0_ras_error_code_ext[32] = { 125 [0x00] = "XGMI PCS DataLossErr", 126 [0x01] = "XGMI PCS TrainingErr", 127 [0x02] = "XGMI PCS FlowCtrlAckErr", 128 [0x03] = "XGMI PCS RxFifoUnderflowErr", 129 [0x04] = "XGMI PCS RxFifoOverflowErr", 130 [0x05] = "XGMI PCS CRCErr", 131 [0x06] = "XGMI PCS BERExceededErr", 132 [0x07] = "XGMI PCS TxMetaDataErr", 133 [0x08] = "XGMI PCS ReplayBufParityErr", 134 [0x09] = "XGMI PCS 
DataParityErr", 135 [0x0a] = "XGMI PCS ReplayFifoOverflowErr", 136 [0x0b] = "XGMI PCS ReplayFifoUnderflowErr", 137 [0x0c] = "XGMI PCS ElasticFifoOverflowErr", 138 [0x0d] = "XGMI PCS DeskewErr", 139 [0x0e] = "XGMI PCS FlowCtrlCRCErr", 140 [0x0f] = "XGMI PCS DataStartupLimitErr", 141 [0x10] = "XGMI PCS FCInitTimeoutErr", 142 [0x11] = "XGMI PCS RecoveryTimeoutErr", 143 [0x12] = "XGMI PCS ReadySerialTimeoutErr", 144 [0x13] = "XGMI PCS ReadySerialAttemptErr", 145 [0x14] = "XGMI PCS RecoveryAttemptErr", 146 [0x15] = "XGMI PCS RecoveryRelockAttemptErr", 147 [0x16] = "XGMI PCS ReplayAttemptErr", 148 [0x17] = "XGMI PCS SyncHdrErr", 149 [0x18] = "XGMI PCS TxReplayTimeoutErr", 150 [0x19] = "XGMI PCS RxReplayTimeoutErr", 151 [0x1a] = "XGMI PCS LinkSubTxTimeoutErr", 152 [0x1b] = "XGMI PCS LinkSubRxTimeoutErr", 153 [0x1c] = "XGMI PCS RxCMDPktErr", 154 }; 155 156 static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { 157 {"XGMI PCS DataLossErr", 158 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)}, 159 {"XGMI PCS TrainingErr", 160 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)}, 161 {"XGMI PCS CRCErr", 162 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)}, 163 {"XGMI PCS BERExceededErr", 164 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)}, 165 {"XGMI PCS TxMetaDataErr", 166 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)}, 167 {"XGMI PCS ReplayBufParityErr", 168 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)}, 169 {"XGMI PCS DataParityErr", 170 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)}, 171 {"XGMI PCS ReplayFifoOverflowErr", 172 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, 173 {"XGMI PCS ReplayFifoUnderflowErr", 174 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, 175 {"XGMI PCS ElasticFifoOverflowErr", 176 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, 
177 {"XGMI PCS DeskewErr", 178 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)}, 179 {"XGMI PCS DataStartupLimitErr", 180 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)}, 181 {"XGMI PCS FCInitTimeoutErr", 182 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)}, 183 {"XGMI PCS RecoveryTimeoutErr", 184 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, 185 {"XGMI PCS ReadySerialTimeoutErr", 186 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, 187 {"XGMI PCS ReadySerialAttemptErr", 188 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, 189 {"XGMI PCS RecoveryAttemptErr", 190 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)}, 191 {"XGMI PCS RecoveryRelockAttemptErr", 192 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, 193 }; 194 195 static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { 196 {"WAFL PCS DataLossErr", 197 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)}, 198 {"WAFL PCS TrainingErr", 199 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)}, 200 {"WAFL PCS CRCErr", 201 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)}, 202 {"WAFL PCS BERExceededErr", 203 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)}, 204 {"WAFL PCS TxMetaDataErr", 205 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)}, 206 {"WAFL PCS ReplayBufParityErr", 207 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)}, 208 {"WAFL PCS DataParityErr", 209 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)}, 210 {"WAFL PCS ReplayFifoOverflowErr", 211 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, 212 {"WAFL PCS ReplayFifoUnderflowErr", 213 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, 214 {"WAFL 
PCS ElasticFifoOverflowErr", 215 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, 216 {"WAFL PCS DeskewErr", 217 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)}, 218 {"WAFL PCS DataStartupLimitErr", 219 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)}, 220 {"WAFL PCS FCInitTimeoutErr", 221 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)}, 222 {"WAFL PCS RecoveryTimeoutErr", 223 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, 224 {"WAFL PCS ReadySerialTimeoutErr", 225 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, 226 {"WAFL PCS ReadySerialAttemptErr", 227 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, 228 {"WAFL PCS RecoveryAttemptErr", 229 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)}, 230 {"WAFL PCS RecoveryRelockAttemptErr", 231 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, 232 }; 233 234 static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = { 235 {"XGMI3X16 PCS DataLossErr", 236 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataLossErr)}, 237 {"XGMI3X16 PCS TrainingErr", 238 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TrainingErr)}, 239 {"XGMI3X16 PCS FlowCtrlAckErr", 240 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlAckErr)}, 241 {"XGMI3X16 PCS RxFifoUnderflowErr", 242 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoUnderflowErr)}, 243 {"XGMI3X16 PCS RxFifoOverflowErr", 244 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoOverflowErr)}, 245 {"XGMI3X16 PCS CRCErr", 246 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, CRCErr)}, 247 {"XGMI3X16 PCS BERExceededErr", 248 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, BERExceededErr)}, 249 {"XGMI3X16 PCS TxVcidDataErr", 250 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxVcidDataErr)}, 251 {"XGMI3X16 PCS 
ReplayBufParityErr", 252 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayBufParityErr)}, 253 {"XGMI3X16 PCS DataParityErr", 254 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataParityErr)}, 255 {"XGMI3X16 PCS ReplayFifoOverflowErr", 256 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, 257 {"XGMI3X16 PCS ReplayFifoUnderflowErr", 258 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, 259 {"XGMI3X16 PCS ElasticFifoOverflowErr", 260 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, 261 {"XGMI3X16 PCS DeskewErr", 262 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DeskewErr)}, 263 {"XGMI3X16 PCS FlowCtrlCRCErr", 264 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlCRCErr)}, 265 {"XGMI3X16 PCS DataStartupLimitErr", 266 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataStartupLimitErr)}, 267 {"XGMI3X16 PCS FCInitTimeoutErr", 268 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FCInitTimeoutErr)}, 269 {"XGMI3X16 PCS RecoveryTimeoutErr", 270 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, 271 {"XGMI3X16 PCS ReadySerialTimeoutErr", 272 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, 273 {"XGMI3X16 PCS ReadySerialAttemptErr", 274 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, 275 {"XGMI3X16 PCS RecoveryAttemptErr", 276 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryAttemptErr)}, 277 {"XGMI3X16 PCS RecoveryRelockAttemptErr", 278 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, 279 {"XGMI3X16 PCS ReplayAttemptErr", 280 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayAttemptErr)}, 281 {"XGMI3X16 PCS SyncHdrErr", 282 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, SyncHdrErr)}, 283 {"XGMI3X16 PCS TxReplayTimeoutErr", 284 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxReplayTimeoutErr)}, 285 {"XGMI3X16 PCS RxReplayTimeoutErr", 286 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, 
RxReplayTimeoutErr)}, 287 {"XGMI3X16 PCS LinkSubTxTimeoutErr", 288 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubTxTimeoutErr)}, 289 {"XGMI3X16 PCS LinkSubRxTimeoutErr", 290 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubRxTimeoutErr)}, 291 {"XGMI3X16 PCS RxCMDPktErr", 292 SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)}, 293 }; 294 295 int amdgpu_xgmi_get_ext_link(struct amdgpu_device *adev, int link_num) 296 { 297 int link_map_6_4_x[8] = { 0, 3, 1, 2, 7, 6, 4, 5 }; 298 299 if (adev->gmc.xgmi.num_physical_nodes <= 1) 300 return -EINVAL; 301 302 switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { 303 case IP_VERSION(6, 4, 0): 304 case IP_VERSION(6, 4, 1): 305 if (link_num < ARRAY_SIZE(link_map_6_4_x)) 306 return link_map_6_4_x[link_num]; 307 break; 308 default: 309 return -EINVAL; 310 } 311 312 return -EINVAL; 313 } 314 315 static u32 xgmi_v6_4_get_link_status(struct amdgpu_device *adev, int global_link_num) 316 { 317 const u32 smn_xgmi_6_4_pcs_state_hist1[2] = { 0x11a00070, 0x11b00070 }; 318 const u32 smn_xgmi_6_4_1_pcs_state_hist1[2] = { 0x12100070, 319 0x11b00070 }; 320 u32 i, n; 321 u64 addr; 322 323 switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { 324 case IP_VERSION(6, 4, 0): 325 n = ARRAY_SIZE(smn_xgmi_6_4_pcs_state_hist1); 326 addr = smn_xgmi_6_4_pcs_state_hist1[global_link_num % n]; 327 break; 328 case IP_VERSION(6, 4, 1): 329 n = ARRAY_SIZE(smn_xgmi_6_4_1_pcs_state_hist1); 330 addr = smn_xgmi_6_4_1_pcs_state_hist1[global_link_num % n]; 331 break; 332 default: 333 return U32_MAX; 334 } 335 336 i = global_link_num / n; 337 338 if (!(adev->aid_mask & BIT(i))) 339 return U32_MAX; 340 341 addr += adev->asic_funcs->encode_ext_smn_addressing(i); 342 343 return RREG32_PCIE_EXT(addr); 344 } 345 346 int amdgpu_get_xgmi_link_status(struct amdgpu_device *adev, int global_link_num) 347 { 348 u32 xgmi_state_reg_val; 349 350 if (adev->gmc.xgmi.num_physical_nodes <= 1) 351 return -EINVAL; 352 353 switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { 
354 case IP_VERSION(6, 4, 0): 355 case IP_VERSION(6, 4, 1): 356 xgmi_state_reg_val = xgmi_v6_4_get_link_status(adev, global_link_num); 357 break; 358 default: 359 return -EOPNOTSUPP; 360 } 361 362 if ((xgmi_state_reg_val & 0xFF) == XGMI_STATE_DISABLE) 363 return -ENOLINK; 364 365 if ((xgmi_state_reg_val & 0xFF) == XGMI_STATE_LS0) 366 return AMDGPU_XGMI_LINK_ACTIVE; 367 368 return AMDGPU_XGMI_LINK_INACTIVE; 369 } 370 371 /** 372 * DOC: AMDGPU XGMI Support 373 * 374 * XGMI is a high speed interconnect that joins multiple GPU cards 375 * into a homogeneous memory space that is organized by a collective 376 * hive ID and individual node IDs, both of which are 64-bit numbers. 377 * 378 * The file xgmi_device_id contains the unique per GPU device ID and 379 * is stored in the /sys/class/drm/card${cardno}/device/ directory. 380 * 381 * Inside the device directory a sub-directory 'xgmi_hive_info' is 382 * created which contains the hive ID and the list of nodes. 383 * 384 * The hive ID is stored in: 385 * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id 386 * 387 * The node information is stored in numbered directories: 388 * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id 389 * 390 * Each device has their own xgmi_hive_info direction with a mirror 391 * set of node sub-directories. 392 * 393 * The XGMI memory space is built by contiguously adding the power of 394 * two padded VRAM space from each node to each other. 
395 * 396 */ 397 398 static struct attribute amdgpu_xgmi_hive_id = { 399 .name = "xgmi_hive_id", 400 .mode = S_IRUGO 401 }; 402 403 static struct attribute *amdgpu_xgmi_hive_attrs[] = { 404 &amdgpu_xgmi_hive_id, 405 NULL 406 }; 407 ATTRIBUTE_GROUPS(amdgpu_xgmi_hive); 408 409 static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj, 410 struct attribute *attr, char *buf) 411 { 412 struct amdgpu_hive_info *hive = container_of( 413 kobj, struct amdgpu_hive_info, kobj); 414 415 if (attr == &amdgpu_xgmi_hive_id) 416 return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); 417 418 return 0; 419 } 420 421 static void amdgpu_xgmi_hive_release(struct kobject *kobj) 422 { 423 struct amdgpu_hive_info *hive = container_of( 424 kobj, struct amdgpu_hive_info, kobj); 425 426 amdgpu_reset_put_reset_domain(hive->reset_domain); 427 hive->reset_domain = NULL; 428 429 mutex_destroy(&hive->hive_lock); 430 kfree(hive); 431 } 432 433 static const struct sysfs_ops amdgpu_xgmi_hive_ops = { 434 .show = amdgpu_xgmi_show_attrs, 435 }; 436 437 static const struct kobj_type amdgpu_xgmi_hive_type = { 438 .release = amdgpu_xgmi_hive_release, 439 .sysfs_ops = &amdgpu_xgmi_hive_ops, 440 .default_groups = amdgpu_xgmi_hive_groups, 441 }; 442 443 static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, 444 struct device_attribute *attr, 445 char *buf) 446 { 447 struct drm_device *ddev = dev_get_drvdata(dev); 448 struct amdgpu_device *adev = drm_to_adev(ddev); 449 450 return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id); 451 452 } 453 454 static ssize_t amdgpu_xgmi_show_physical_id(struct device *dev, 455 struct device_attribute *attr, 456 char *buf) 457 { 458 struct drm_device *ddev = dev_get_drvdata(dev); 459 struct amdgpu_device *adev = drm_to_adev(ddev); 460 461 return sysfs_emit(buf, "%u\n", adev->gmc.xgmi.physical_node_id); 462 463 } 464 465 static ssize_t amdgpu_xgmi_show_num_hops(struct device *dev, 466 struct device_attribute *attr, 467 char *buf) 468 { 469 struct drm_device *ddev 
= dev_get_drvdata(dev); 470 struct amdgpu_device *adev = drm_to_adev(ddev); 471 struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; 472 int i; 473 474 for (i = 0; i < top->num_nodes; i++) 475 sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_hops); 476 477 return sysfs_emit(buf, "%s\n", buf); 478 } 479 480 static ssize_t amdgpu_xgmi_show_num_links(struct device *dev, 481 struct device_attribute *attr, 482 char *buf) 483 { 484 struct drm_device *ddev = dev_get_drvdata(dev); 485 struct amdgpu_device *adev = drm_to_adev(ddev); 486 struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; 487 int i; 488 489 for (i = 0; i < top->num_nodes; i++) 490 sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_links); 491 492 return sysfs_emit(buf, "%s\n", buf); 493 } 494 495 static ssize_t amdgpu_xgmi_show_connected_port_num(struct device *dev, 496 struct device_attribute *attr, 497 char *buf) 498 { 499 struct drm_device *ddev = dev_get_drvdata(dev); 500 struct amdgpu_device *adev = drm_to_adev(ddev); 501 struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; 502 int i, j, size = 0; 503 int current_node; 504 /* 505 * get the node id in the sysfs for the current socket and show 506 * it in the port num info output in the sysfs for easy reading. 507 * it is NOT the one retrieved from xgmi ta. 
508 */ 509 for (i = 0; i < top->num_nodes; i++) { 510 if (top->nodes[i].node_id == adev->gmc.xgmi.node_id) { 511 current_node = i; 512 break; 513 } 514 } 515 516 if (i == top->num_nodes) 517 return -EINVAL; 518 519 for (i = 0; i < top->num_nodes; i++) { 520 for (j = 0; j < top->nodes[i].num_links; j++) 521 /* node id in sysfs starts from 1 rather than 0 so +1 here */ 522 size += sysfs_emit_at(buf, size, "%02x:%02x -> %02x:%02x\n", current_node + 1, 523 top->nodes[i].port_num[j].src_xgmi_port_num, i + 1, 524 top->nodes[i].port_num[j].dst_xgmi_port_num); 525 } 526 527 return size; 528 } 529 530 #define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801) 531 static ssize_t amdgpu_xgmi_show_error(struct device *dev, 532 struct device_attribute *attr, 533 char *buf) 534 { 535 struct drm_device *ddev = dev_get_drvdata(dev); 536 struct amdgpu_device *adev = drm_to_adev(ddev); 537 uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; 538 uint64_t fica_out; 539 unsigned int error_count = 0; 540 541 ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200); 542 ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208); 543 544 if ((!adev->df.funcs) || 545 (!adev->df.funcs->get_fica) || 546 (!adev->df.funcs->set_fica)) 547 return -EINVAL; 548 549 fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in); 550 if (fica_out != 0x1f) 551 pr_err("xGMI error counters not enabled!\n"); 552 553 fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in); 554 555 if ((fica_out & 0xffff) == 2) 556 error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63); 557 558 adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0); 559 560 return sysfs_emit(buf, "%u\n", error_count); 561 } 562 563 564 static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); 565 static DEVICE_ATTR(xgmi_physical_id, 0444, amdgpu_xgmi_show_physical_id, NULL); 566 static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL); 567 static DEVICE_ATTR(xgmi_num_hops, S_IRUGO, amdgpu_xgmi_show_num_hops, NULL); 568 
static DEVICE_ATTR(xgmi_num_links, S_IRUGO, amdgpu_xgmi_show_num_links, NULL); 569 static DEVICE_ATTR(xgmi_port_num, S_IRUGO, amdgpu_xgmi_show_connected_port_num, NULL); 570 571 static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, 572 struct amdgpu_hive_info *hive) 573 { 574 int ret = 0; 575 char node[10] = { 0 }; 576 577 /* Create xgmi device id file */ 578 ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id); 579 if (ret) { 580 dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n"); 581 return ret; 582 } 583 584 ret = device_create_file(adev->dev, &dev_attr_xgmi_physical_id); 585 if (ret) { 586 dev_err(adev->dev, "XGMI: Failed to create device file xgmi_physical_id\n"); 587 return ret; 588 } 589 590 /* Create xgmi error file */ 591 ret = device_create_file(adev->dev, &dev_attr_xgmi_error); 592 if (ret) 593 pr_err("failed to create xgmi_error\n"); 594 595 /* Create xgmi num hops file */ 596 ret = device_create_file(adev->dev, &dev_attr_xgmi_num_hops); 597 if (ret) 598 pr_err("failed to create xgmi_num_hops\n"); 599 600 /* Create xgmi num links file */ 601 ret = device_create_file(adev->dev, &dev_attr_xgmi_num_links); 602 if (ret) 603 pr_err("failed to create xgmi_num_links\n"); 604 605 /* Create xgmi port num file if supported */ 606 if (adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) { 607 ret = device_create_file(adev->dev, &dev_attr_xgmi_port_num); 608 if (ret) 609 dev_err(adev->dev, "failed to create xgmi_port_num\n"); 610 } 611 612 /* Create sysfs link to hive info folder on the first device */ 613 if (hive->kobj.parent != (&adev->dev->kobj)) { 614 ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj, 615 "xgmi_hive_info"); 616 if (ret) { 617 dev_err(adev->dev, "XGMI: Failed to create link to hive info"); 618 goto remove_file; 619 } 620 } 621 622 sprintf(node, "node%d", atomic_read(&hive->number_devices)); 623 /* Create sysfs link form the hive folder to yourself */ 624 ret = 
sysfs_create_link(&hive->kobj, &adev->dev->kobj, node); 625 if (ret) { 626 dev_err(adev->dev, "XGMI: Failed to create link from hive info"); 627 goto remove_link; 628 } 629 630 goto success; 631 632 633 remove_link: 634 sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique); 635 636 remove_file: 637 device_remove_file(adev->dev, &dev_attr_xgmi_device_id); 638 device_remove_file(adev->dev, &dev_attr_xgmi_physical_id); 639 device_remove_file(adev->dev, &dev_attr_xgmi_error); 640 device_remove_file(adev->dev, &dev_attr_xgmi_num_hops); 641 device_remove_file(adev->dev, &dev_attr_xgmi_num_links); 642 if (adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) 643 device_remove_file(adev->dev, &dev_attr_xgmi_port_num); 644 645 success: 646 return ret; 647 } 648 649 static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, 650 struct amdgpu_hive_info *hive) 651 { 652 char node[10]; 653 memset(node, 0, sizeof(node)); 654 655 device_remove_file(adev->dev, &dev_attr_xgmi_device_id); 656 device_remove_file(adev->dev, &dev_attr_xgmi_physical_id); 657 device_remove_file(adev->dev, &dev_attr_xgmi_error); 658 device_remove_file(adev->dev, &dev_attr_xgmi_num_hops); 659 device_remove_file(adev->dev, &dev_attr_xgmi_num_links); 660 if (adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) 661 device_remove_file(adev->dev, &dev_attr_xgmi_port_num); 662 663 if (hive->kobj.parent != (&adev->dev->kobj)) 664 sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info"); 665 666 sprintf(node, "node%d", atomic_read(&hive->number_devices)); 667 sysfs_remove_link(&hive->kobj, node); 668 669 } 670 671 672 673 struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) 674 { 675 struct amdgpu_hive_info *hive = NULL; 676 int ret; 677 678 if (!adev->gmc.xgmi.hive_id) 679 return NULL; 680 681 if (adev->hive) { 682 kobject_get(&adev->hive->kobj); 683 return adev->hive; 684 } 685 686 mutex_lock(&xgmi_mutex); 687 688 list_for_each_entry(hive, 
&xgmi_hive_list, node) { 689 if (hive->hive_id == adev->gmc.xgmi.hive_id) 690 goto pro_end; 691 } 692 693 hive = kzalloc_obj(*hive); 694 if (!hive) { 695 dev_err(adev->dev, "XGMI: allocation failed\n"); 696 ret = -ENOMEM; 697 hive = NULL; 698 goto pro_end; 699 } 700 701 /* initialize new hive if not exist */ 702 ret = kobject_init_and_add(&hive->kobj, 703 &amdgpu_xgmi_hive_type, 704 &adev->dev->kobj, 705 "%s", "xgmi_hive_info"); 706 if (ret) { 707 dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n"); 708 kobject_put(&hive->kobj); 709 hive = NULL; 710 goto pro_end; 711 } 712 713 /** 714 * Only init hive->reset_domain for none SRIOV configuration. For SRIOV, 715 * Host driver decide how to reset the GPU either through FLR or chain reset. 716 * Guest side will get individual notifications from the host for the FLR 717 * if necessary. 718 */ 719 if (!amdgpu_sriov_vf(adev)) { 720 /** 721 * Avoid recreating reset domain when hive is reconstructed for the case 722 * of reset the devices in the XGMI hive during probe for passthrough GPU 723 * See https://www.spinics.net/lists/amd-gfx/msg58836.html 724 */ 725 if (adev->reset_domain->type != XGMI_HIVE) { 726 hive->reset_domain = 727 amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive"); 728 if (!hive->reset_domain) { 729 dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n"); 730 ret = -ENOMEM; 731 kobject_put(&hive->kobj); 732 hive = NULL; 733 goto pro_end; 734 } 735 } else { 736 amdgpu_reset_get_reset_domain(adev->reset_domain); 737 hive->reset_domain = adev->reset_domain; 738 } 739 } 740 741 hive->hive_id = adev->gmc.xgmi.hive_id; 742 INIT_LIST_HEAD(&hive->device_list); 743 INIT_LIST_HEAD(&hive->node); 744 mutex_init(&hive->hive_lock); 745 atomic_set(&hive->number_devices, 0); 746 task_barrier_init(&hive->tb); 747 hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; 748 hive->hi_req_gpu = NULL; 749 atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE); 750 751 /* 
	 * hive pstate on boot is high in vega20 so we have to go to low
	 * pstate on after boot.
	 */
	hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
	list_add_tail(&hive->node, &xgmi_hive_list);

pro_end:
	/* hand the caller a reference; dropped via amdgpu_put_xgmi_hive() */
	if (hive)
		kobject_get(&hive->kobj);
	mutex_unlock(&xgmi_mutex);
	return hive;
}

/* Drop a hive reference obtained from amdgpu_get_xgmi_hive() */
void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
{
	if (hive)
		kobject_put(&hive->kobj);
}

/*
 * Request an XGMI pstate change for the hive this device belongs to.
 *
 * NOTE: currently short-circuited (returns 0 after the early "fw bug"
 * return below); the remaining logic is intentionally kept dead until
 * the firmware issue is resolved.
 */
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive;
	struct amdgpu_device *request_adev;
	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
	bool init_low;

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive)
		return 0;

	request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
	init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
	amdgpu_put_xgmi_hive(hive);
	/* fw bug so temporarily disable pstate switching */
	return 0;

	/* --- everything below is currently unreachable (see above) --- */
	if (!hive || adev->asic_type != CHIP_VEGA20)
		return 0;

	mutex_lock(&hive->hive_lock);

	if (is_hi_req)
		hive->hi_req_count++;
	else
		hive->hi_req_count--;

	/*
	 * Vega20 only needs single peer to request pstate high for the hive to
	 * go high but all peers must request pstate low for the hive to go low
	 */
	if (hive->pstate == pstate ||
	    (!is_hi_req && hive->hi_req_count && !init_low))
		goto out;

	dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
	if (ret) {
		dev_err(request_adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			request_adev->gmc.xgmi.node_id,
			request_adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	if (init_low)
		hive->pstate = hive->hi_req_count ?
					hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
	else {
		hive->pstate = pstate;
		hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
							adev : NULL;
	}
out:
	mutex_unlock(&hive->hive_lock);
	return ret;
}

/*
 * Push this device's current topology info to its PSP firmware.
 * No-op for SRIOV guests (the host owns topology there).
 */
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret;

	if (amdgpu_sriov_vf(adev))
		return 0;

	/* Each psp need to set the latest topology */
	ret = psp_xgmi_set_topology_info(&adev->psp,
					 atomic_read(&hive->number_devices),
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev,
			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}


/*
 * NOTE psp_xgmi_node_info.num_hops layout is as follows:
 * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved)
 * num_hops[5:3] = reserved
 * num_hops[2:0] = number of hops
 */
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
			       struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	uint8_t num_hops_mask = 0x7;	/* keep only num_hops[2:0], see above */
	int i;

	if (!adev->gmc.xgmi.supported)
		return 0;

	for (i = 0 ; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_hops & num_hops_mask;

	dev_err(adev->dev, "Failed to get xgmi hops count for peer %d.\n",
		peer_adev->gmc.xgmi.physical_node_id);

	return 0;
}

/*
 * Compute min/max XGMI bandwidth for this device, either per-link
 * (AMDGPU_XGMI_BW_MODE_PER_LINK) or towards a specific peer
 * (AMDGPU_XGMI_BW_MODE_PER_PEER). Results are in the unit selected by
 * @bw_unit (MB/s when AMDGPU_XGMI_BW_UNIT_MBYTES, else raw Gbps-derived).
 * Returns 0 on success, -EINVAL on bad arguments, -ENODATA without XGMI.
 */
int amdgpu_xgmi_get_bandwidth(struct amdgpu_device *adev, struct amdgpu_device *peer_adev,
			      enum amdgpu_xgmi_bw_mode bw_mode, enum amdgpu_xgmi_bw_unit bw_unit,
			      uint32_t *min_bw, uint32_t *max_bw)
{
	bool peer_mode = bw_mode == AMDGPU_XGMI_BW_MODE_PER_PEER;
	int unit_scale = bw_unit == AMDGPU_XGMI_BW_UNIT_MBYTES ? 1000 : 1;
	int num_lanes = adev->gmc.xgmi.max_width;
	int speed = adev->gmc.xgmi.max_speed;
	int num_links = !peer_mode ? 1 : -1;	/* -1 = look up from topology */

	if (!(min_bw && max_bw))
		return -EINVAL;

	*min_bw = 0;
	*max_bw = 0;

	if (!adev->gmc.xgmi.supported)
		return -ENODATA;

	if (peer_mode && !peer_adev)
		return -EINVAL;

	if (peer_mode) {
		struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
		int i;

		for (i = 0 ; i < top->num_nodes; ++i) {
			if (top->nodes[i].node_id != peer_adev->gmc.xgmi.node_id)
				continue;

			num_links = top->nodes[i].num_links;
			break;
		}
	}

	if (num_links == -1) {
		dev_err(adev->dev, "Failed to get number of xgmi links for peer %d.\n",
			peer_adev->gmc.xgmi.physical_node_id);
	} else if (num_links) {
		int per_link_bw = (speed * num_lanes * unit_scale)/BITS_PER_BYTE;

		*min_bw = per_link_bw;
		*max_bw = num_links * per_link_bw;
	}

	return 0;
}

/*
 * Whether memory sharing with @peer_adev is enabled. Always true bare
 * metal; for SRIOV it is looked up from the topology info.
 */
bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
					struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	int i;

	/* Sharing should always be enabled for non-SRIOV. */
	if (!amdgpu_sriov_vf(adev))
		return true;

	for (i = 0 ; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return !!top->nodes[i].is_sharing_enabled;

	return false;
}

/*
 * Devices that support extended data require the entire hive to initialize with
 * the shared memory buffer flag set.
 *
 * Hive locks and conditions apply - see amdgpu_xgmi_add_device
 */
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
							bool set_extended_data)
{
	struct amdgpu_device *tmp_adev;
	int ret;

	/* Re-initialize every PSP XGMI session in the hive with the requested
	 * extended-data mode; abort on the first failure. */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
		if (ret) {
			dev_err(tmp_adev->dev,
				"XGMI: Failed to initialize xgmi session for data partition %i\n",
				set_extended_data);
			return ret;
		}

	}

	return 0;
}

/**
 * amdgpu_xgmi_add_device - register @adev with its XGMI hive
 *
 * Obtains hive/node ids from PSP (or synthesizes fixed ids when no PSP IP
 * block exists), joins the hive's device list and task barrier, then pushes
 * and re-reads topology for every member. On success the hive reference is
 * kept in adev->hive; on failure the reference is dropped.
 *
 * Returns 0 on success or a negative error code.
 */
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi *entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_initialize(&adev->psp, false, true);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to initialize xgmi session\n");
			return ret;
		}

		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		/* No PSP IP block: fall back to fixed synthetic ids derived
		 * from the physical node id. */
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}
	mutex_lock(&hive->hive_lock);

	top_info = &adev->psp.xgmi_context.top_info;

	/* Join the hive and rebuild this device's node-id table from the
	 * (now updated) hive member list. */
	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	atomic_set(&hive->number_devices, count);

	task_barrier_add_task(&hive->tb);

	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update node list for other device in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit_unlock;
		}

		if (amdgpu_sriov_vf(adev) &&
		    adev->psp.xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) {
			/* only get topology for VF being init if it can support full duplex */
			ret = psp_xgmi_get_topology_info(&adev->psp, count,
							 &adev->psp.xgmi_context.top_info, false);
			if (ret) {
				dev_err(adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					adev->gmc.xgmi.node_id,
					adev->gmc.xgmi.hive_id, ret);
				/* To do: continue with some node failed or disable the whole hive*/
				goto exit_unlock;
			}
		} else {
			/* get latest topology info for each device from psp */
			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
				ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
								 &tmp_adev->psp.xgmi_context.top_info, false);
				if (ret) {
					dev_err(tmp_adev->dev,
						"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
						tmp_adev->gmc.xgmi.node_id,
						tmp_adev->gmc.xgmi.hive_id, ret);
					/* To do : continue with some node failed or disable the whole hive */
					goto exit_unlock;
				}
			}
		}

		/* get topology again for hives that support extended
		 * data */
		if (adev->psp.xgmi_context.supports_extended_data) {

			/* initialize the hive to get extended data. */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
			if (ret)
				goto exit_unlock;

			/* get the extended data. */
			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
				ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
								 &tmp_adev->psp.xgmi_context.top_info, true);
				if (ret) {
					dev_err(tmp_adev->dev,
						"XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
						tmp_adev->gmc.xgmi.node_id,
						tmp_adev->gmc.xgmi.hive_id, ret);
					goto exit_unlock;
				}
			}

			/* initialize the hive to get non-extended data for the next round. */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
			if (ret)
				goto exit_unlock;

		}
	}

	if (!ret)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret) {
		/* Keep the hive reference taken by amdgpu_get_xgmi_hive(). */
		adev->hive = hive;
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	} else {
		amdgpu_put_xgmi_hive(hive);
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);
	}

	return ret;
}

/**
 * amdgpu_xgmi_remove_device - detach @adev from its hive
 *
 * Undoes amdgpu_xgmi_add_device(): leaves the task barrier, removes sysfs
 * entries, unlinks from the device list and drops the hive reference held in
 * adev->hive. The last device out also unlinks the hive from the global list
 * and drops the list's reference.
 *
 * Returns 0, or -EINVAL when XGMI is unsupported or no hive is attached.
 */
int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = adev->hive;

	if (!adev->gmc.xgmi.supported)
		return -EINVAL;

	if (!hive)
		return -EINVAL;

	mutex_lock(&hive->hive_lock);
	task_barrier_rem_task(&hive->tb);
	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
	if (hive->hi_req_gpu == adev)
		hive->hi_req_gpu = NULL;
	list_del(&adev->gmc.xgmi.head);
	mutex_unlock(&hive->hive_lock);

	amdgpu_put_xgmi_hive(hive);
	adev->hive = NULL;

	/* NOTE(review): hive is dereferenced after the put above; this relies
	 * on the global hive list still holding a reference until the last
	 * device runs the branch below - verify the refcounting contract. */
	if (atomic_dec_return(&hive->number_devices) == 0) {
		/* Remove the hive from global hive list */
		mutex_lock(&xgmi_mutex);
		list_del(&hive->node);
		mutex_unlock(&xgmi_mutex);

		amdgpu_put_xgmi_hive(hive);
	}

	return 0;
}

/*
 * ACA bank parser for XGMI v6.4.x: decode one error bank, log the decoded
 * extended-error string when known, and record the error count in the cache.
 * UE counts are kept only for ext codes 0/1/9; CE counts only for ext code 6.
 */
static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank,
				       enum aca_smu_type type, void *data)
{
	struct amdgpu_device *adev = handle->adev;
	struct aca_bank_info info;
	const char *error_str;
	u64 status, count;
	int ret, ext_error_code;

	ret = aca_bank_info_decode(bank, &info);
	if (ret)
		return ret;

	status = bank->regs[ACA_REG_IDX_STATUS];
	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status);

	error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
		xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
	if (error_str)
		dev_info(adev->dev, "%s detected\n", error_str);

	count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]);

	switch (type) {
	case ACA_SMU_TYPE_UE:
		/* Only ext error codes 0, 1 and 9 are counted as UEs. */
		if (ext_error_code != 0 && ext_error_code != 1 && ext_error_code != 9)
			count = 0ULL;

		bank->aca_err_type = ACA_ERROR_TYPE_UE;
		ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count);
		break;
	case ACA_SMU_TYPE_CE:
		/* Only ext error code 6 is counted as a CE. */
		count = ext_error_code == 6 ? count : 0ULL;
		bank->aca_err_type = ACA_ERROR_TYPE_CE;
		ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, count);
		break;
	default:
		return -EINVAL;
	}

	return ret;
}

static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = {
	.aca_bank_parser = xgmi_v6_4_0_aca_bank_parser,
};

static const struct aca_info xgmi_v6_4_0_aca_info = {
	.hwip = ACA_HWIP_TYPE_PCS_XGMI,
	.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK,
	.bank_ops = &xgmi_v6_4_0_aca_bank_ops,
};

/*
 * RAS late-init for the XGMI/WAFL block: clear stale error counts, run the
 * common late init, and on XGMI v6.4.x additionally bind the ACA handler
 * (unbinding common state again if that fails).
 */
static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
{
	int r;

	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);

	r = amdgpu_ras_block_late_init(adev, ras_block);
	if (r)
		return r;

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
	case IP_VERSION(6, 4, 1):
		r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL,
					&xgmi_v6_4_0_aca_info, NULL);
		if (r)
			goto late_fini;
		break;
	default:
		break;
	}

	return 0;

late_fini:
	amdgpu_ras_block_late_fini(adev, ras_block);

	return r;
}

/* Translate @addr into this node's segment of the hive-wide address space. */
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr)
{
	struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
	return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
}

/* Clear a PCS error-status register (write-ones-then-zero sequence). */
static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
	WREG32_PCIE(pcs_status_reg, 0);
}

/* Clear all per-ASIC XGMI/WAFL PCS error-status registers (pre-ACA path). */
static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
{
	uint32_t i;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_arct[i]);
		break;
	case CHIP_VEGA20:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_vg20[i]);
		break;
	case CHIP_ALDEBARAN:
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
			pcs_clear_status(adev,
					 xgmi3x16_pcs_err_status_reg_aldebaran[i]);
		for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
			pcs_clear_status(adev,
					 walf_pcs_err_status_reg_aldebaran[i]);
		break;
	default:
		break;
	}

	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
	case IP_VERSION(6, 4, 1):
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++)
			pcs_clear_status(adev,
					 xgmi3x16_pcs_err_status_reg_v6_4[i]);
		break;
	default:
		break;
	}
}

/* Clear one MCA status register for an XGMI instance (v6.4.x). */
static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst, u64 mca_base)
{
	WREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS, 0ULL);
}

/* Clear every MCA base's status register for one XGMI instance. */
static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
		__xgmi_v6_4_0_reset_error_count(adev, xgmi_inst, xgmi_v6_4_0_mca_base_array[i]);
}

/* Clear error counts on every AID instance (v6.4.x). */
static void xgmi_v6_4_0_reset_ras_error_count(struct amdgpu_device *adev)
{
	int i;

	for_each_inst(i, adev->aid_mask)
		xgmi_v6_4_0_reset_error_count(adev, i);
}

/* Dispatch error-count reset to the v6.4 MCA path or the legacy PCS path. */
static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
	case IP_VERSION(6, 4, 0):
	case IP_VERSION(6, 4, 1):
		xgmi_v6_4_0_reset_ras_error_count(adev);
		break;
	default:
		amdgpu_xgmi_legacy_reset_ras_error_count(adev);
		break;
	}
1323 } 1324 1325 static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, 1326 uint32_t value, 1327 uint32_t mask_value, 1328 uint32_t *ue_count, 1329 uint32_t *ce_count, 1330 bool is_xgmi_pcs, 1331 bool check_mask) 1332 { 1333 int i; 1334 int ue_cnt = 0; 1335 const struct amdgpu_pcs_ras_field *pcs_ras_fields = NULL; 1336 uint32_t field_array_size = 0; 1337 1338 if (is_xgmi_pcs) { 1339 if (amdgpu_ip_version(adev, XGMI_HWIP, 0) == 1340 IP_VERSION(6, 1, 0) || 1341 amdgpu_ip_version(adev, XGMI_HWIP, 0) == 1342 IP_VERSION(6, 4, 0) || 1343 amdgpu_ip_version(adev, XGMI_HWIP, 0) == 1344 IP_VERSION(6, 4, 1)) { 1345 pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0]; 1346 field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields); 1347 } else { 1348 pcs_ras_fields = &xgmi_pcs_ras_fields[0]; 1349 field_array_size = ARRAY_SIZE(xgmi_pcs_ras_fields); 1350 } 1351 } else { 1352 pcs_ras_fields = &wafl_pcs_ras_fields[0]; 1353 field_array_size = ARRAY_SIZE(wafl_pcs_ras_fields); 1354 } 1355 1356 if (check_mask) 1357 value = value & ~mask_value; 1358 1359 /* query xgmi/walf pcs error status, 1360 * only ue is supported */ 1361 for (i = 0; value && i < field_array_size; i++) { 1362 ue_cnt = (value & 1363 pcs_ras_fields[i].pcs_err_mask) >> 1364 pcs_ras_fields[i].pcs_err_shift; 1365 if (ue_cnt) { 1366 dev_info(adev->dev, "%s detected\n", 1367 pcs_ras_fields[i].err_name); 1368 *ue_count += ue_cnt; 1369 } 1370 1371 /* reset bit value if the bit is checked */ 1372 value &= ~(pcs_ras_fields[i].pcs_err_mask); 1373 } 1374 1375 return 0; 1376 } 1377 1378 static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev, 1379 void *ras_error_status) 1380 { 1381 struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; 1382 int i, supported = 1; 1383 uint32_t data, mask_data = 0; 1384 uint32_t ue_cnt = 0, ce_cnt = 0; 1385 1386 if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) 1387 return ; 1388 1389 err_data->ue_count = 0; 1390 err_data->ce_count 
= 0; 1391 1392 switch (adev->asic_type) { 1393 case CHIP_ARCTURUS: 1394 /* check xgmi pcs error */ 1395 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) { 1396 data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]); 1397 if (data) 1398 amdgpu_xgmi_query_pcs_error_status(adev, data, 1399 mask_data, &ue_cnt, &ce_cnt, true, false); 1400 } 1401 /* check wafl pcs error */ 1402 for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) { 1403 data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]); 1404 if (data) 1405 amdgpu_xgmi_query_pcs_error_status(adev, data, 1406 mask_data, &ue_cnt, &ce_cnt, false, false); 1407 } 1408 break; 1409 case CHIP_VEGA20: 1410 /* check xgmi pcs error */ 1411 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) { 1412 data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]); 1413 if (data) 1414 amdgpu_xgmi_query_pcs_error_status(adev, data, 1415 mask_data, &ue_cnt, &ce_cnt, true, false); 1416 } 1417 /* check wafl pcs error */ 1418 for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) { 1419 data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]); 1420 if (data) 1421 amdgpu_xgmi_query_pcs_error_status(adev, data, 1422 mask_data, &ue_cnt, &ce_cnt, false, false); 1423 } 1424 break; 1425 case CHIP_ALDEBARAN: 1426 /* check xgmi3x16 pcs error */ 1427 for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { 1428 data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); 1429 mask_data = 1430 RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[i]); 1431 if (data) 1432 amdgpu_xgmi_query_pcs_error_status(adev, data, 1433 mask_data, &ue_cnt, &ce_cnt, true, true); 1434 } 1435 /* check wafl pcs error */ 1436 for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) { 1437 data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]); 1438 mask_data = 1439 RREG32_PCIE(walf_pcs_err_noncorrectable_mask_reg_aldebaran[i]); 1440 if (data) 1441 amdgpu_xgmi_query_pcs_error_status(adev, data, 1442 
mask_data, &ue_cnt, &ce_cnt, false, true); 1443 } 1444 break; 1445 default: 1446 supported = 0; 1447 break; 1448 } 1449 1450 switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { 1451 case IP_VERSION(6, 4, 0): 1452 case IP_VERSION(6, 4, 1): 1453 /* check xgmi3x16 pcs error */ 1454 for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) { 1455 data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]); 1456 mask_data = 1457 RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[i]); 1458 if (data) 1459 amdgpu_xgmi_query_pcs_error_status(adev, data, 1460 mask_data, &ue_cnt, &ce_cnt, true, true); 1461 } 1462 break; 1463 default: 1464 if (!supported) 1465 dev_warn(adev->dev, "XGMI RAS error query not supported"); 1466 break; 1467 } 1468 1469 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL); 1470 1471 err_data->ue_count += ue_cnt; 1472 err_data->ce_count += ce_cnt; 1473 } 1474 1475 static enum aca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status) 1476 { 1477 const char *error_str; 1478 int ext_error_code; 1479 1480 ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status); 1481 1482 error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ? 
1483 xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL; 1484 if (error_str) 1485 dev_info(adev->dev, "%s detected\n", error_str); 1486 1487 switch (ext_error_code) { 1488 case 0: 1489 return ACA_ERROR_TYPE_UE; 1490 case 6: 1491 return ACA_ERROR_TYPE_CE; 1492 default: 1493 return -EINVAL; 1494 } 1495 1496 return -EINVAL; 1497 } 1498 1499 static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info, 1500 u64 mca_base, struct ras_err_data *err_data) 1501 { 1502 int xgmi_inst = mcm_info->die_id; 1503 u64 status = 0; 1504 1505 status = RREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS); 1506 if (!ACA_REG__STATUS__VAL(status)) 1507 return; 1508 1509 switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) { 1510 case ACA_ERROR_TYPE_UE: 1511 amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL); 1512 break; 1513 case ACA_ERROR_TYPE_CE: 1514 amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL); 1515 break; 1516 default: 1517 break; 1518 } 1519 1520 WREG64_MCA(xgmi_inst, mca_base, ACA_REG_IDX_STATUS, 0ULL); 1521 } 1522 1523 static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data) 1524 { 1525 struct amdgpu_smuio_mcm_config_info mcm_info = { 1526 .socket_id = adev->smuio.funcs->get_socket_id(adev), 1527 .die_id = xgmi_inst, 1528 }; 1529 int i; 1530 1531 for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++) 1532 __xgmi_v6_4_0_query_error_count(adev, &mcm_info, xgmi_v6_4_0_mca_base_array[i], err_data); 1533 } 1534 1535 static void xgmi_v6_4_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status) 1536 { 1537 struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; 1538 int i; 1539 1540 for_each_inst(i, adev->aid_mask) 1541 xgmi_v6_4_0_query_error_count(adev, i, err_data); 1542 } 1543 1544 static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, 1545 void *ras_error_status) 1546 { 
1547 switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) { 1548 case IP_VERSION(6, 4, 0): 1549 case IP_VERSION(6, 4, 1): 1550 xgmi_v6_4_0_query_ras_error_count(adev, ras_error_status); 1551 break; 1552 default: 1553 amdgpu_xgmi_legacy_query_ras_error_count(adev, ras_error_status); 1554 break; 1555 } 1556 } 1557 1558 /* Trigger XGMI/WAFL error */ 1559 static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, 1560 void *inject_if, uint32_t instance_mask) 1561 { 1562 int ret1, ret2; 1563 struct ta_ras_trigger_error_input *block_info = 1564 (struct ta_ras_trigger_error_input *)inject_if; 1565 1566 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 1567 dev_warn(adev->dev, "Failed to disallow df cstate"); 1568 1569 ret1 = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DISALLOW); 1570 if (ret1 && ret1 != -EOPNOTSUPP) 1571 dev_warn(adev->dev, "Failed to disallow XGMI power down"); 1572 1573 ret2 = psp_ras_trigger_error(&adev->psp, block_info, instance_mask); 1574 1575 if (amdgpu_ras_intr_triggered()) 1576 return ret2; 1577 1578 ret1 = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DEFAULT); 1579 if (ret1 && ret1 != -EOPNOTSUPP) 1580 dev_warn(adev->dev, "Failed to allow XGMI power down"); 1581 1582 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) 1583 dev_warn(adev->dev, "Failed to allow df cstate"); 1584 1585 return ret2; 1586 } 1587 1588 struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { 1589 .query_ras_error_count = amdgpu_xgmi_query_ras_error_count, 1590 .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count, 1591 .ras_error_inject = amdgpu_ras_error_inject_xgmi, 1592 }; 1593 1594 struct amdgpu_xgmi_ras xgmi_ras = { 1595 .ras_block = { 1596 .hw_ops = &xgmi_ras_hw_ops, 1597 .ras_late_init = amdgpu_xgmi_ras_late_init, 1598 }, 1599 }; 1600 1601 int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev) 1602 { 1603 int err; 1604 struct amdgpu_xgmi_ras *ras; 1605 1606 if (!adev->gmc.xgmi.ras) 1607 return 0; 1608 1609 ras = 
adev->gmc.xgmi.ras; 1610 err = amdgpu_ras_register_ras_block(adev, &ras->ras_block); 1611 if (err) { 1612 dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras block!\n"); 1613 return err; 1614 } 1615 1616 strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl"); 1617 ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL; 1618 ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; 1619 adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm; 1620 1621 return 0; 1622 } 1623 1624 static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work) 1625 { 1626 struct amdgpu_hive_info *hive = 1627 container_of(work, struct amdgpu_hive_info, reset_on_init_work); 1628 struct amdgpu_reset_context reset_context; 1629 struct amdgpu_device *tmp_adev; 1630 struct list_head device_list; 1631 int r; 1632 1633 mutex_lock(&hive->hive_lock); 1634 1635 INIT_LIST_HEAD(&device_list); 1636 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 1637 list_add_tail(&tmp_adev->reset_list, &device_list); 1638 1639 tmp_adev = list_first_entry(&device_list, struct amdgpu_device, 1640 reset_list); 1641 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 1642 1643 reset_context.method = AMD_RESET_METHOD_ON_INIT; 1644 reset_context.reset_req_dev = tmp_adev; 1645 reset_context.hive = hive; 1646 reset_context.reset_device_list = &device_list; 1647 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 1648 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 1649 1650 amdgpu_reset_do_xgmi_reset_on_init(&reset_context); 1651 mutex_unlock(&hive->hive_lock); 1652 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 1653 1654 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 1655 r = amdgpu_ras_init_badpage_info(tmp_adev); 1656 if (r && r != -EHWPOISON) 1657 dev_err(tmp_adev->dev, 1658 "error during bad page data initialization"); 1659 } 1660 } 1661 1662 static void amdgpu_xgmi_schedule_reset_on_init(struct amdgpu_hive_info *hive) 1663 { 1664 
INIT_WORK(&hive->reset_on_init_work, amdgpu_xgmi_reset_on_init_work); 1665 amdgpu_reset_domain_schedule(hive->reset_domain, 1666 &hive->reset_on_init_work); 1667 } 1668 1669 int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev) 1670 { 1671 struct amdgpu_hive_info *hive; 1672 bool reset_scheduled; 1673 int num_devs; 1674 1675 hive = amdgpu_get_xgmi_hive(adev); 1676 if (!hive) 1677 return -EINVAL; 1678 1679 mutex_lock(&hive->hive_lock); 1680 num_devs = atomic_read(&hive->number_devices); 1681 reset_scheduled = false; 1682 if (num_devs == adev->gmc.xgmi.num_physical_nodes) { 1683 amdgpu_xgmi_schedule_reset_on_init(hive); 1684 reset_scheduled = true; 1685 } 1686 1687 mutex_unlock(&hive->hive_lock); 1688 amdgpu_put_xgmi_hive(hive); 1689 1690 if (reset_scheduled) 1691 flush_work(&hive->reset_on_init_work); 1692 1693 return 0; 1694 } 1695 1696 int amdgpu_xgmi_request_nps_change(struct amdgpu_device *adev, 1697 struct amdgpu_hive_info *hive, 1698 int req_nps_mode) 1699 { 1700 struct amdgpu_device *tmp_adev; 1701 int cur_nps_mode, r; 1702 1703 /* This is expected to be called only during unload of driver. The 1704 * request needs to be placed only once for all devices in the hive. If 1705 * one of them fail, revert the request for previous successful devices. 1706 * After placing the request, make hive mode as UNKNOWN so that other 1707 * devices don't request anymore. 
	 */
	mutex_lock(&hive->hive_lock);
	/* A hive already back at UNKNOWN means the request was handled (or
	 * never armed); treat re-entry as a benign no-op. */
	if (atomic_read(&hive->requested_nps_mode) ==
	    UNKNOWN_MEMORY_PARTITION_MODE) {
		dev_dbg(adev->dev, "Unexpected entry for hive NPS change");
		mutex_unlock(&hive->hive_lock);
		return 0;
	}
	/* Note: gmc_funcs are taken from @adev but applied to each hive
	 * member; hive members are assumed to share the same gmc_funcs. */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		r = adev->gmc.gmc_funcs->request_mem_partition_mode(
			tmp_adev, req_nps_mode);
		if (r)
			break;
	}
	if (r) {
		/* Request back current mode if one of the requests failed */
		cur_nps_mode =
			adev->gmc.gmc_funcs->query_mem_partition_mode(tmp_adev);
		list_for_each_entry_continue_reverse(
			tmp_adev, &hive->device_list, gmc.xgmi.head)
			adev->gmc.gmc_funcs->request_mem_partition_mode(
				tmp_adev, cur_nps_mode);
	}
	/* Set to UNKNOWN so that other devices don't request anymore */
	atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);
	mutex_unlock(&hive->hive_lock);

	return r;
}

/*
 * Two devices share a hive when P2P over XGMI is enabled, they are distinct,
 * and both report the same non-zero hive id.
 */
bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
			   struct amdgpu_device *bo_adev)
{
	return (amdgpu_use_xgmi_p2p && adev != bo_adev &&
		adev->gmc.xgmi.hive_id &&
		adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
}

/* Set default XGMI link speed/width for the GC IP generation. */
void amdgpu_xgmi_early_init(struct amdgpu_device *adev)
{
	if (!adev->gmc.xgmi.supported)
		return;

	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
	case IP_VERSION(9, 4, 0):
	case IP_VERSION(9, 4, 1):
	case IP_VERSION(9, 4, 2):
		/* 25 GT/s */
		adev->gmc.xgmi.max_speed = 25;
		adev->gmc.xgmi.max_width = 16;
		break;
	case IP_VERSION(9, 4, 3):
	case IP_VERSION(9, 4, 4):
	case IP_VERSION(9, 5, 0):
		/* 32 GT/s */
		adev->gmc.xgmi.max_speed = 32;
		adev->gmc.xgmi.max_width = 16;
		break;
	default:
		break;
	}
}

/* Override the XGMI link speed/width defaults set by early init.
 * NOTE(review): "amgpu_" (missing 'd') looks like a typo in this exported
 * symbol; renaming would break callers, so it is left as-is. */
void amgpu_xgmi_set_max_speed_width(struct amdgpu_device *adev,
				    uint16_t max_speed, uint8_t max_width)
{
	adev->gmc.xgmi.max_speed = max_speed;
	adev->gmc.xgmi.max_width = max_width;
}