1 /* 2 * Copyright 2016 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 */ 23 24 #include <linux/module.h> 25 26 #ifdef CONFIG_X86 27 #include <asm/hypervisor.h> 28 #endif 29 30 #include <drm/drm_drv.h> 31 #include <xen/xen.h> 32 33 #include "amdgpu.h" 34 #include "amdgpu_ras.h" 35 #include "amdgpu_reset.h" 36 #include "amdgpu_dpm.h" 37 #include "vi.h" 38 #include "soc15.h" 39 #include "nv.h" 40 41 #define POPULATE_UCODE_INFO(vf2pf_info, ucode, ver) \ 42 do { \ 43 vf2pf_info->ucode_info[ucode].id = ucode; \ 44 vf2pf_info->ucode_info[ucode].version = ver; \ 45 } while (0) 46 47 #define mmRCC_CONFIG_MEMSIZE 0xde3 48 49 const char *amdgpu_virt_dynamic_crit_table_name[] = { 50 "IP DISCOVERY", 51 "VBIOS IMG", 52 "RAS TELEMETRY", 53 "DATA EXCHANGE", 54 "BAD PAGE INFO", 55 "INIT HEADER", 56 "LAST", 57 }; 58 59 bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev) 60 { 61 /* By now all MMIO pages except mailbox are blocked */ 62 /* if blocking is enabled in hypervisor. Choose the */ 63 /* SCRATCH_REG0 to test. */ 64 return RREG32_NO_KIQ(0xc040) == 0xffffffff; 65 } 66 67 void amdgpu_virt_init_setting(struct amdgpu_device *adev) 68 { 69 struct drm_device *ddev = adev_to_drm(adev); 70 71 /* enable virtual display */ 72 if (adev->asic_type != CHIP_ALDEBARAN && 73 adev->asic_type != CHIP_ARCTURUS && 74 ((adev->pdev->class >> 8) != PCI_CLASS_ACCELERATOR_PROCESSING)) { 75 if (adev->mode_info.num_crtc == 0) 76 adev->mode_info.num_crtc = 1; 77 adev->enable_virtual_display = true; 78 } 79 ddev->driver_features &= ~DRIVER_ATOMIC; 80 adev->cg_flags = 0; 81 adev->pg_flags = 0; 82 83 /* Reduce kcq number to 2 to reduce latency */ 84 if (amdgpu_num_kcq == -1) 85 amdgpu_num_kcq = 2; 86 } 87 88 /** 89 * amdgpu_virt_request_full_gpu() - request full gpu access 90 * @adev: amdgpu device. 91 * @init: is driver init time. 92 * When start to init/fini driver, first need to request full gpu access. 93 * Return: Zero if request success, otherwise will return error. 94 */ 95 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init) 96 { 97 struct amdgpu_virt *virt = &adev->virt; 98 int r; 99 100 if (virt->ops && virt->ops->req_full_gpu) { 101 r = virt->ops->req_full_gpu(adev, init); 102 if (r) { 103 adev->no_hw_access = true; 104 return r; 105 } 106 107 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 108 } 109 110 return 0; 111 } 112 113 /** 114 * amdgpu_virt_release_full_gpu() - release full gpu access 115 * @adev: amdgpu device. 116 * @init: is driver init time. 117 * When finishing driver init/fini, need to release full gpu access. 118 * Return: Zero if release success, otherwise will returen error. 119 */ 120 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init) 121 { 122 struct amdgpu_virt *virt = &adev->virt; 123 int r; 124 125 if (virt->ops && virt->ops->rel_full_gpu) { 126 r = virt->ops->rel_full_gpu(adev, init); 127 if (r) 128 return r; 129 130 adev->virt.caps |= AMDGPU_SRIOV_CAPS_RUNTIME; 131 } 132 return 0; 133 } 134 135 /** 136 * amdgpu_virt_reset_gpu() - reset gpu 137 * @adev: amdgpu device. 138 * Send reset command to GPU hypervisor to reset GPU that VM is using 139 * Return: Zero if reset success, otherwise will return error. 140 */ 141 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev) 142 { 143 struct amdgpu_virt *virt = &adev->virt; 144 int r; 145 146 if (virt->ops && virt->ops->reset_gpu) { 147 r = virt->ops->reset_gpu(adev); 148 if (r) 149 return r; 150 151 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 152 } 153 154 return 0; 155 } 156 157 void amdgpu_virt_request_init_data(struct amdgpu_device *adev) 158 { 159 struct amdgpu_virt *virt = &adev->virt; 160 161 if (virt->ops && virt->ops->req_init_data) 162 virt->ops->req_init_data(adev); 163 164 if (adev->virt.req_init_data_ver > 0) 165 dev_info(adev->dev, "host supports REQ_INIT_DATA handshake of critical_region_version %d\n", 166 adev->virt.req_init_data_ver); 167 else 168 dev_warn(adev->dev, "host doesn't support REQ_INIT_DATA handshake\n"); 169 } 170 171 /** 172 * amdgpu_virt_ready_to_reset() - send ready to reset to host 173 * @adev: amdgpu device. 174 * Send ready to reset message to GPU hypervisor to signal we have stopped GPU 175 * activity and is ready for host FLR 176 */ 177 void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev) 178 { 179 struct amdgpu_virt *virt = &adev->virt; 180 181 if (virt->ops && virt->ops->reset_gpu) 182 virt->ops->ready_to_reset(adev); 183 } 184 185 /** 186 * amdgpu_virt_wait_reset() - wait for reset gpu completed 187 * @adev: amdgpu device. 188 * Wait for GPU reset completed. 189 * Return: Zero if reset success, otherwise will return error. 190 */ 191 int amdgpu_virt_wait_reset(struct amdgpu_device *adev) 192 { 193 struct amdgpu_virt *virt = &adev->virt; 194 195 if (!virt->ops || !virt->ops->wait_reset) 196 return -EINVAL; 197 198 return virt->ops->wait_reset(adev); 199 } 200 201 /** 202 * amdgpu_virt_alloc_mm_table() - alloc memory for mm table 203 * @adev: amdgpu device. 204 * MM table is used by UVD and VCE for its initialization 205 * Return: Zero if allocate success. 206 */ 207 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev) 208 { 209 int r; 210 211 if (!amdgpu_sriov_vf(adev) || adev->virt.mm_table.gpu_addr) 212 return 0; 213 214 r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, 215 AMDGPU_GEM_DOMAIN_VRAM | 216 AMDGPU_GEM_DOMAIN_GTT, 217 &adev->virt.mm_table.bo, 218 &adev->virt.mm_table.gpu_addr, 219 (void *)&adev->virt.mm_table.cpu_addr); 220 if (r) { 221 dev_err(adev->dev, "failed to alloc mm table and error = %d.\n", r); 222 return r; 223 } 224 225 memset((void *)adev->virt.mm_table.cpu_addr, 0, PAGE_SIZE); 226 dev_info(adev->dev, "MM table gpu addr = 0x%llx, cpu addr = %p.\n", 227 adev->virt.mm_table.gpu_addr, 228 adev->virt.mm_table.cpu_addr); 229 return 0; 230 } 231 232 /** 233 * amdgpu_virt_free_mm_table() - free mm table memory 234 * @adev: amdgpu device. 235 * Free MM table memory 236 */ 237 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev) 238 { 239 if (!amdgpu_sriov_vf(adev) || !adev->virt.mm_table.gpu_addr) 240 return; 241 242 amdgpu_bo_free_kernel(&adev->virt.mm_table.bo, 243 &adev->virt.mm_table.gpu_addr, 244 (void *)&adev->virt.mm_table.cpu_addr); 245 adev->virt.mm_table.gpu_addr = 0; 246 } 247 248 /** 249 * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt 250 * @adev: amdgpu device. 251 * Check whether host sent RAS error message 252 * Return: true if found, otherwise false 253 */ 254 bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev) 255 { 256 struct amdgpu_virt *virt = &adev->virt; 257 258 if (!virt->ops || !virt->ops->rcvd_ras_intr) 259 return false; 260 261 return virt->ops->rcvd_ras_intr(adev); 262 } 263 264 265 unsigned int amd_sriov_msg_checksum(void *obj, 266 unsigned long obj_size, 267 unsigned int key, 268 unsigned int checksum) 269 { 270 unsigned int ret = key; 271 unsigned long i = 0; 272 unsigned char *pos; 273 274 pos = (char *)obj; 275 /* calculate checksum */ 276 for (i = 0; i < obj_size; ++i) 277 ret += *(pos + i); 278 /* minus the checksum itself */ 279 pos = (char *)&checksum; 280 for (i = 0; i < sizeof(checksum); ++i) 281 ret -= *(pos + i); 282 return ret; 283 } 284 285 static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev) 286 { 287 struct amdgpu_virt *virt = &adev->virt; 288 struct amdgpu_virt_ras_err_handler_data **data = &virt->virt_eh_data; 289 /* GPU will be marked bad on host if bp count more then 10, 290 * so alloc 512 is enough. 291 */ 292 unsigned int align_space = 512; 293 void *bps = NULL; 294 struct amdgpu_bo **bps_bo = NULL; 295 296 *data = kmalloc(sizeof(struct amdgpu_virt_ras_err_handler_data), GFP_KERNEL); 297 if (!*data) 298 goto data_failure; 299 300 bps = kmalloc_array(align_space, sizeof(*(*data)->bps), GFP_KERNEL); 301 if (!bps) 302 goto bps_failure; 303 304 bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL); 305 if (!bps_bo) 306 goto bps_bo_failure; 307 308 (*data)->bps = bps; 309 (*data)->bps_bo = bps_bo; 310 (*data)->count = 0; 311 (*data)->last_reserved = 0; 312 313 virt->ras_init_done = true; 314 315 return 0; 316 317 bps_bo_failure: 318 kfree(bps); 319 bps_failure: 320 kfree(*data); 321 data_failure: 322 return -ENOMEM; 323 } 324 325 static void amdgpu_virt_ras_release_bp(struct amdgpu_device *adev) 326 { 327 struct amdgpu_virt *virt = &adev->virt; 328 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 329 struct amdgpu_bo *bo; 330 int i; 331 332 if (!data) 333 return; 334 335 for (i = data->last_reserved - 1; i >= 0; i--) { 336 bo = data->bps_bo[i]; 337 if (bo) { 338 amdgpu_bo_free_kernel(&bo, NULL, NULL); 339 data->bps_bo[i] = bo; 340 } 341 data->last_reserved = i; 342 } 343 } 344 345 void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev) 346 { 347 struct amdgpu_virt *virt = &adev->virt; 348 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 349 350 virt->ras_init_done = false; 351 352 if (!data) 353 return; 354 355 amdgpu_virt_ras_release_bp(adev); 356 357 kfree(data->bps); 358 kfree(data->bps_bo); 359 kfree(data); 360 virt->virt_eh_data = NULL; 361 } 362 363 static void amdgpu_virt_ras_add_bps(struct amdgpu_device *adev, 364 struct eeprom_table_record *bps, int pages) 365 { 366 struct amdgpu_virt *virt = &adev->virt; 367 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 368 369 if (!data) 370 return; 371 372 memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps)); 373 data->count += pages; 374 } 375 376 static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev) 377 { 378 struct amdgpu_virt *virt = &adev->virt; 379 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 380 struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; 381 struct ttm_resource_manager *man = &mgr->manager; 382 struct amdgpu_bo *bo = NULL; 383 uint64_t bp; 384 int i; 385 386 if (!data) 387 return; 388 389 for (i = data->last_reserved; i < data->count; i++) { 390 bp = data->bps[i].retired_page; 391 392 /* There are two cases of reserve error should be ignored: 393 * 1) a ras bad page has been allocated (used by someone); 394 * 2) a ras bad page has been reserved (duplicate error injection 395 * for one page); 396 */ 397 if (ttm_resource_manager_used(man)) { 398 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, 399 bp << AMDGPU_GPU_PAGE_SHIFT, 400 AMDGPU_GPU_PAGE_SIZE); 401 data->bps_bo[i] = NULL; 402 } else { 403 if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT, 404 AMDGPU_GPU_PAGE_SIZE, 405 &bo, NULL)) 406 dev_dbg(adev->dev, 407 "RAS WARN: reserve vram for retired page %llx fail\n", 408 bp); 409 data->bps_bo[i] = bo; 410 } 411 data->last_reserved = i + 1; 412 bo = NULL; 413 } 414 } 415 416 static bool amdgpu_virt_ras_check_bad_page(struct amdgpu_device *adev, 417 uint64_t retired_page) 418 { 419 struct amdgpu_virt *virt = &adev->virt; 420 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 421 int i; 422 423 if (!data) 424 return true; 425 426 for (i = 0; i < data->count; i++) 427 if (retired_page == data->bps[i].retired_page) 428 return true; 429 430 return false; 431 } 432 433 static void amdgpu_virt_add_bad_page(struct amdgpu_device *adev, 434 uint64_t bp_block_offset, uint32_t bp_block_size) 435 { 436 struct eeprom_table_record bp; 437 uint64_t retired_page; 438 uint32_t bp_idx, bp_cnt; 439 void *vram_usage_va = NULL; 440 441 if (adev->mman.fw_vram_usage_va) 442 vram_usage_va = adev->mman.fw_vram_usage_va; 443 else 444 vram_usage_va = adev->mman.drv_vram_usage_va; 445 446 memset(&bp, 0, sizeof(bp)); 447 448 if (bp_block_size) { 449 bp_cnt = bp_block_size / sizeof(uint64_t); 450 for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) { 451 retired_page = *(uint64_t *)(vram_usage_va + 452 bp_block_offset + bp_idx * sizeof(uint64_t)); 453 bp.retired_page = retired_page; 454 455 if (amdgpu_virt_ras_check_bad_page(adev, retired_page)) 456 continue; 457 458 amdgpu_virt_ras_add_bps(adev, &bp, 1); 459 460 amdgpu_virt_ras_reserve_bps(adev); 461 } 462 } 463 } 464 465 static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) 466 { 467 struct amd_sriov_msg_pf2vf_info_header *pf2vf_info = adev->virt.fw_reserve.p_pf2vf; 468 uint32_t checksum; 469 uint32_t checkval; 470 471 uint32_t i; 472 uint32_t tmp; 473 474 if (adev->virt.fw_reserve.p_pf2vf == NULL) 475 return -EINVAL; 476 477 if (pf2vf_info->size > 1024) { 478 dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size); 479 return -EINVAL; 480 } 481 482 switch (pf2vf_info->version) { 483 case 1: 484 checksum = ((struct amdgim_pf2vf_info_v1 *)pf2vf_info)->checksum; 485 checkval = amd_sriov_msg_checksum( 486 adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, 487 adev->virt.fw_reserve.checksum_key, checksum); 488 if (checksum != checkval) { 489 dev_err(adev->dev, 490 "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", 491 checksum, checkval); 492 return -EINVAL; 493 } 494 495 adev->virt.gim_feature = 496 ((struct amdgim_pf2vf_info_v1 *)pf2vf_info)->feature_flags; 497 break; 498 case 2: 499 /* TODO: missing key, need to add it later */ 500 checksum = ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->checksum; 501 checkval = amd_sriov_msg_checksum( 502 adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, 503 0, checksum); 504 if (checksum != checkval) { 505 dev_err(adev->dev, 506 "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", 507 checksum, checkval); 508 return -EINVAL; 509 } 510 511 adev->virt.vf2pf_update_interval_ms = 512 ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->vf2pf_update_interval_ms; 513 adev->virt.gim_feature = 514 ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->feature_flags.all; 515 adev->virt.reg_access = 516 ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->reg_access_flags.all; 517 518 adev->virt.decode_max_dimension_pixels = 0; 519 adev->virt.decode_max_frame_pixels = 0; 520 adev->virt.encode_max_dimension_pixels = 0; 521 adev->virt.encode_max_frame_pixels = 0; 522 adev->virt.is_mm_bw_enabled = false; 523 for (i = 0; i < AMD_SRIOV_MSG_RESERVE_VCN_INST; i++) { 524 tmp = ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->mm_bw_management[i].decode_max_dimension_pixels; 525 adev->virt.decode_max_dimension_pixels = max(tmp, adev->virt.decode_max_dimension_pixels); 526 527 tmp = ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->mm_bw_management[i].decode_max_frame_pixels; 528 adev->virt.decode_max_frame_pixels = max(tmp, adev->virt.decode_max_frame_pixels); 529 530 tmp = ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->mm_bw_management[i].encode_max_dimension_pixels; 531 adev->virt.encode_max_dimension_pixels = max(tmp, adev->virt.encode_max_dimension_pixels); 532 533 tmp = ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->mm_bw_management[i].encode_max_frame_pixels; 534 adev->virt.encode_max_frame_pixels = max(tmp, adev->virt.encode_max_frame_pixels); 535 } 536 if ((adev->virt.decode_max_dimension_pixels > 0) || (adev->virt.encode_max_dimension_pixels > 0)) 537 adev->virt.is_mm_bw_enabled = true; 538 539 adev->unique_id = 540 ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid; 541 adev->virt.ras_en_caps.all = ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->ras_en_caps.all; 542 adev->virt.ras_telemetry_en_caps.all = 543 ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->ras_telemetry_en_caps.all; 544 break; 545 default: 546 dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version); 547 return -EINVAL; 548 } 549 550 /* correct too large or too little interval value */ 551 if (adev->virt.vf2pf_update_interval_ms < 200 || adev->virt.vf2pf_update_interval_ms > 10000) 552 adev->virt.vf2pf_update_interval_ms = 2000; 553 554 return 0; 555 } 556 557 static void amdgpu_virt_populate_vf2pf_ucode_info(struct amdgpu_device *adev) 558 { 559 struct amd_sriov_msg_vf2pf_info *vf2pf_info; 560 vf2pf_info = (struct amd_sriov_msg_vf2pf_info *) adev->virt.fw_reserve.p_vf2pf; 561 562 if (adev->virt.fw_reserve.p_vf2pf == NULL) 563 return; 564 565 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_VCE, adev->vce.fw_version); 566 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_UVD, adev->uvd.fw_version); 567 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_MC, adev->gmc.fw_version); 568 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_ME, adev->gfx.me_fw_version); 569 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_PFP, adev->gfx.pfp_fw_version); 570 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_CE, adev->gfx.ce_fw_version); 571 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC, adev->gfx.rlc_fw_version); 572 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC_SRLC, adev->gfx.rlc_srlc_fw_version); 573 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC_SRLG, adev->gfx.rlc_srlg_fw_version); 574 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC_SRLS, adev->gfx.rlc_srls_fw_version); 575 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_MEC, adev->gfx.mec_fw_version); 576 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_MEC2, adev->gfx.mec2_fw_version); 577 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SOS, adev->psp.sos.fw_version); 578 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_ASD, 579 adev->psp.asd_context.bin_desc.fw_version); 580 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_TA_RAS, 581 adev->psp.ras_context.context.bin_desc.fw_version); 582 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_TA_XGMI, 583 adev->psp.xgmi_context.context.bin_desc.fw_version); 584 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SMC, adev->pm.fw_version); 585 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SDMA, adev->sdma.instance[0].fw_version); 586 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SDMA2, adev->sdma.instance[1].fw_version); 587 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_VCN, adev->vcn.fw_version); 588 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_DMCU, adev->dm.dmcu_fw_version); 589 } 590 591 static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev) 592 { 593 struct amd_sriov_msg_vf2pf_info *vf2pf_info; 594 595 vf2pf_info = (struct amd_sriov_msg_vf2pf_info *) adev->virt.fw_reserve.p_vf2pf; 596 597 if (adev->virt.fw_reserve.p_vf2pf == NULL) 598 return -EINVAL; 599 600 memset(vf2pf_info, 0, sizeof(struct amd_sriov_msg_vf2pf_info)); 601 602 vf2pf_info->header.size = sizeof(struct amd_sriov_msg_vf2pf_info); 603 vf2pf_info->header.version = AMD_SRIOV_MSG_FW_VRAM_VF2PF_VER; 604 605 #ifdef MODULE 606 if (THIS_MODULE->version != NULL) 607 strcpy(vf2pf_info->driver_version, THIS_MODULE->version); 608 else 609 #endif 610 strcpy(vf2pf_info->driver_version, "N/A"); 611 612 vf2pf_info->pf2vf_version_required = 0; // no requirement, guest understands all 613 vf2pf_info->driver_cert = 0; 614 vf2pf_info->os_info.all = 0; 615 616 vf2pf_info->fb_usage = ttm_resource_manager_used(&adev->mman.vram_mgr.manager) ? 617 ttm_resource_manager_usage(&adev->mman.vram_mgr.manager) >> 20 : 0; 618 vf2pf_info->fb_vis_usage = 619 amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr) >> 20; 620 vf2pf_info->fb_size = adev->gmc.real_vram_size >> 20; 621 vf2pf_info->fb_vis_size = adev->gmc.visible_vram_size >> 20; 622 623 amdgpu_virt_populate_vf2pf_ucode_info(adev); 624 625 /* TODO: read dynamic info */ 626 vf2pf_info->gfx_usage = 0; 627 vf2pf_info->compute_usage = 0; 628 vf2pf_info->encode_usage = 0; 629 vf2pf_info->decode_usage = 0; 630 631 vf2pf_info->dummy_page_addr = (uint64_t)adev->dummy_page_addr; 632 if (amdgpu_sriov_is_mes_info_enable(adev)) { 633 vf2pf_info->mes_info_addr = 634 (uint64_t)(adev->mes.resource_1_gpu_addr[0] + AMDGPU_GPU_PAGE_SIZE); 635 vf2pf_info->mes_info_size = 636 adev->mes.resource_1[0]->tbo.base.size - AMDGPU_GPU_PAGE_SIZE; 637 } 638 vf2pf_info->checksum = 639 amd_sriov_msg_checksum( 640 vf2pf_info, sizeof(*vf2pf_info), 0, 0); 641 642 return 0; 643 } 644 645 static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) 646 { 647 struct amdgpu_device *adev = container_of(work, struct amdgpu_device, virt.vf2pf_work.work); 648 int ret; 649 650 ret = amdgpu_virt_read_pf2vf_data(adev); 651 if (ret) { 652 adev->virt.vf2pf_update_retry_cnt++; 653 654 if ((amdgpu_virt_rcvd_ras_interrupt(adev) || 655 adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && 656 amdgpu_sriov_runtime(adev)) { 657 658 amdgpu_ras_set_fed(adev, true); 659 if (amdgpu_reset_domain_schedule(adev->reset_domain, 660 &adev->kfd.reset_work)) 661 return; 662 else 663 dev_err(adev->dev, "Failed to queue work! at %s", __func__); 664 } 665 666 goto out; 667 } 668 669 adev->virt.vf2pf_update_retry_cnt = 0; 670 amdgpu_virt_write_vf2pf_data(adev); 671 672 out: 673 schedule_delayed_work(&(adev->virt.vf2pf_work), adev->virt.vf2pf_update_interval_ms); 674 } 675 676 static int amdgpu_virt_read_exchange_data_from_mem(struct amdgpu_device *adev, uint32_t *pfvf_data) 677 { 678 uint32_t dataexchange_offset = 679 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset; 680 uint32_t dataexchange_size = 681 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb << 10; 682 uint64_t pos = 0; 683 684 dev_info(adev->dev, 685 "Got data exchange info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n", 686 dataexchange_offset, dataexchange_size); 687 688 if (!IS_ALIGNED(dataexchange_offset, 4) || !IS_ALIGNED(dataexchange_size, 4)) { 689 dev_err(adev->dev, "Data exchange data not aligned to 4 bytes\n"); 690 return -EINVAL; 691 } 692 693 pos = (uint64_t)dataexchange_offset; 694 amdgpu_device_vram_access(adev, pos, pfvf_data, 695 dataexchange_size, false); 696 697 return 0; 698 } 699 700 void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev) 701 { 702 if (adev->virt.vf2pf_update_interval_ms != 0) { 703 dev_info(adev->dev, "clean up the vf2pf work item\n"); 704 cancel_delayed_work_sync(&adev->virt.vf2pf_work); 705 adev->virt.vf2pf_update_interval_ms = 0; 706 } 707 } 708 709 void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev) 710 { 711 uint32_t *pfvf_data = NULL; 712 713 adev->virt.fw_reserve.p_pf2vf = NULL; 714 adev->virt.fw_reserve.p_vf2pf = NULL; 715 adev->virt.vf2pf_update_interval_ms = 0; 716 adev->virt.vf2pf_update_retry_cnt = 0; 717 718 if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) { 719 dev_warn(adev->dev, "Currently fw_vram and drv_vram should not have values at the same time!"); 720 } else if (adev->mman.fw_vram_usage_va || adev->mman.drv_vram_usage_va) { 721 /* go through this logic in ip_init and reset to init workqueue*/ 722 amdgpu_virt_exchange_data(adev); 723 724 INIT_DELAYED_WORK(&adev->virt.vf2pf_work, amdgpu_virt_update_vf2pf_work_item); 725 schedule_delayed_work(&(adev->virt.vf2pf_work), msecs_to_jiffies(adev->virt.vf2pf_update_interval_ms)); 726 } else if (adev->bios != NULL) { 727 /* got through this logic in early init stage to get necessary flags, e.g. rlcg_acc related*/ 728 if (adev->virt.req_init_data_ver == GPU_CRIT_REGION_V2) { 729 pfvf_data = 730 kzalloc(adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb << 10, 731 GFP_KERNEL); 732 if (!pfvf_data) { 733 dev_err(adev->dev, "Failed to allocate memory for pfvf_data\n"); 734 return; 735 } 736 737 if (amdgpu_virt_read_exchange_data_from_mem(adev, pfvf_data)) 738 goto free_pfvf_data; 739 740 adev->virt.fw_reserve.p_pf2vf = 741 (struct amd_sriov_msg_pf2vf_info_header *)pfvf_data; 742 743 amdgpu_virt_read_pf2vf_data(adev); 744 745 free_pfvf_data: 746 kfree(pfvf_data); 747 pfvf_data = NULL; 748 adev->virt.fw_reserve.p_pf2vf = NULL; 749 } else { 750 adev->virt.fw_reserve.p_pf2vf = 751 (struct amd_sriov_msg_pf2vf_info_header *) 752 (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10)); 753 754 amdgpu_virt_read_pf2vf_data(adev); 755 } 756 } 757 } 758 759 760 void amdgpu_virt_exchange_data(struct amdgpu_device *adev) 761 { 762 uint64_t bp_block_offset = 0; 763 uint32_t bp_block_size = 0; 764 struct amd_sriov_msg_pf2vf_info *pf2vf_v2 = NULL; 765 766 if (adev->mman.fw_vram_usage_va || adev->mman.drv_vram_usage_va) { 767 if (adev->mman.fw_vram_usage_va) { 768 if (adev->virt.req_init_data_ver == GPU_CRIT_REGION_V2) { 769 adev->virt.fw_reserve.p_pf2vf = 770 (struct amd_sriov_msg_pf2vf_info_header *) 771 (adev->mman.fw_vram_usage_va + 772 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset); 773 adev->virt.fw_reserve.p_vf2pf = 774 (struct amd_sriov_msg_vf2pf_info_header *) 775 (adev->mman.fw_vram_usage_va + 776 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset + 777 (AMD_SRIOV_MSG_SIZE_KB << 10)); 778 adev->virt.fw_reserve.ras_telemetry = 779 (adev->mman.fw_vram_usage_va + 780 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset); 781 } else { 782 adev->virt.fw_reserve.p_pf2vf = 783 (struct amd_sriov_msg_pf2vf_info_header *) 784 (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10)); 785 adev->virt.fw_reserve.p_vf2pf = 786 (struct amd_sriov_msg_vf2pf_info_header *) 787 (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10)); 788 adev->virt.fw_reserve.ras_telemetry = 789 (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10)); 790 } 791 } else if (adev->mman.drv_vram_usage_va) { 792 adev->virt.fw_reserve.p_pf2vf = 793 (struct amd_sriov_msg_pf2vf_info_header *) 794 (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10)); 795 adev->virt.fw_reserve.p_vf2pf = 796 (struct amd_sriov_msg_vf2pf_info_header *) 797 (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10)); 798 adev->virt.fw_reserve.ras_telemetry = 799 (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10)); 800 } 801 802 amdgpu_virt_read_pf2vf_data(adev); 803 amdgpu_virt_write_vf2pf_data(adev); 804 805 /* bad page handling for version 2 */ 806 if (adev->virt.fw_reserve.p_pf2vf->version == 2) { 807 pf2vf_v2 = (struct amd_sriov_msg_pf2vf_info *)adev->virt.fw_reserve.p_pf2vf; 808 809 bp_block_offset = ((uint64_t)pf2vf_v2->bp_block_offset_low & 0xFFFFFFFF) | 810 ((((uint64_t)pf2vf_v2->bp_block_offset_high) << 32) & 0xFFFFFFFF00000000); 811 bp_block_size = pf2vf_v2->bp_block_size; 812 813 if (bp_block_size && !adev->virt.ras_init_done) 814 amdgpu_virt_init_ras_err_handler_data(adev); 815 816 if (adev->virt.ras_init_done) 817 amdgpu_virt_add_bad_page(adev, bp_block_offset, bp_block_size); 818 } 819 } 820 } 821 822 static u32 amdgpu_virt_init_detect_asic(struct amdgpu_device *adev) 823 { 824 uint32_t reg; 825 826 switch (adev->asic_type) { 827 case CHIP_TONGA: 828 case CHIP_FIJI: 829 reg = RREG32(mmBIF_IOV_FUNC_IDENTIFIER); 830 break; 831 case CHIP_VEGA10: 832 case CHIP_VEGA20: 833 case CHIP_NAVI10: 834 case CHIP_NAVI12: 835 case CHIP_SIENNA_CICHLID: 836 case CHIP_ARCTURUS: 837 case CHIP_ALDEBARAN: 838 case CHIP_IP_DISCOVERY: 839 reg = RREG32(mmRCC_IOV_FUNC_IDENTIFIER); 840 break; 841 default: /* other chip doesn't support SRIOV */ 842 reg = 0; 843 break; 844 } 845 846 if (reg & 1) 847 adev->virt.caps |= AMDGPU_SRIOV_CAPS_IS_VF; 848 849 if (reg & 0x80000000) 850 adev->virt.caps |= AMDGPU_SRIOV_CAPS_ENABLE_IOV; 851 852 if (!reg) { 853 /* passthrough mode exclus sriov mod */ 854 if (is_virtual_machine() && !xen_initial_domain()) 855 adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE; 856 } 857 858 return reg; 859 } 860 861 static bool amdgpu_virt_init_req_data(struct amdgpu_device *adev, u32 reg) 862 { 863 bool is_sriov = false; 864 865 /* we have the ability to check now */ 866 if (amdgpu_sriov_vf(adev)) { 867 is_sriov = true; 868 869 switch (adev->asic_type) { 870 case CHIP_TONGA: 871 case CHIP_FIJI: 872 vi_set_virt_ops(adev); 873 break; 874 case CHIP_VEGA10: 875 soc15_set_virt_ops(adev); 876 #ifdef CONFIG_X86 877 /* not send GPU_INIT_DATA with MS_HYPERV*/ 878 if (!hypervisor_is_type(X86_HYPER_MS_HYPERV)) 879 #endif 880 /* send a dummy GPU_INIT_DATA request to host on vega10 */ 881 amdgpu_virt_request_init_data(adev); 882 break; 883 case CHIP_VEGA20: 884 case CHIP_ARCTURUS: 885 case CHIP_ALDEBARAN: 886 soc15_set_virt_ops(adev); 887 break; 888 case CHIP_NAVI10: 889 case CHIP_NAVI12: 890 case CHIP_SIENNA_CICHLID: 891 case CHIP_IP_DISCOVERY: 892 nv_set_virt_ops(adev); 893 /* try send GPU_INIT_DATA request to host */ 894 amdgpu_virt_request_init_data(adev); 895 break; 896 default: /* other chip doesn't support SRIOV */ 897 is_sriov = false; 898 dev_err(adev->dev, "Unknown asic type: %d!\n", adev->asic_type); 899 break; 900 } 901 } 902 903 return is_sriov; 904 } 905 906 static void amdgpu_virt_init_ras(struct amdgpu_device *adev) 907 { 908 ratelimit_state_init(&adev->virt.ras.ras_error_cnt_rs, 5 * HZ, 1); 909 ratelimit_state_init(&adev->virt.ras.ras_cper_dump_rs, 5 * HZ, 1); 910 ratelimit_state_init(&adev->virt.ras.ras_chk_criti_rs, 5 * HZ, 1); 911 912 ratelimit_set_flags(&adev->virt.ras.ras_error_cnt_rs, 913 RATELIMIT_MSG_ON_RELEASE); 914 ratelimit_set_flags(&adev->virt.ras.ras_cper_dump_rs, 915 RATELIMIT_MSG_ON_RELEASE); 916 ratelimit_set_flags(&adev->virt.ras.ras_chk_criti_rs, 917 RATELIMIT_MSG_ON_RELEASE); 918 919 mutex_init(&adev->virt.ras.ras_telemetry_mutex); 920 mutex_init(&adev->virt.access_req_mutex); 921 922 adev->virt.ras.cper_rptr = 0; 923 } 924 925 static uint8_t amdgpu_virt_crit_region_calc_checksum(uint8_t *buf_start, uint8_t *buf_end) 926 { 927 uint32_t sum = 0; 928 929 if (buf_start >= buf_end) 930 return 0; 931 932 for (; buf_start < buf_end; buf_start++) 933 sum += buf_start[0]; 934 935 return 0xffffffff - sum; 936 } 937 938 int amdgpu_virt_init_critical_region(struct amdgpu_device *adev) 939 { 940 struct amd_sriov_msg_init_data_header *init_data_hdr = NULL; 941 u64 init_hdr_offset = adev->virt.init_data_header.offset; 942 u64 init_hdr_size = (u64)adev->virt.init_data_header.size_kb << 10; /* KB → bytes */ 943 u64 vram_size; 944 u64 end; 945 int r = 0; 946 uint8_t checksum = 0; 947 948 /* Skip below init if critical region version != v2 */ 949 if (adev->virt.req_init_data_ver != GPU_CRIT_REGION_V2) 950 return 0; 951 952 if (init_hdr_offset < 0) { 953 dev_err(adev->dev, "Invalid init header offset\n"); 954 return -EINVAL; 955 } 956 957 vram_size = RREG32(mmRCC_CONFIG_MEMSIZE); 958 if (!vram_size || vram_size == U32_MAX) 959 return -EINVAL; 960 vram_size <<= 20; 961 962 if (check_add_overflow(init_hdr_offset, init_hdr_size, &end) || end > vram_size) { 963 dev_err(adev->dev, "init_data_header exceeds VRAM size, exiting\n"); 964 return -EINVAL; 965 } 966 967 /* Allocate for init_data_hdr */ 968 init_data_hdr = kzalloc(sizeof(struct amd_sriov_msg_init_data_header), GFP_KERNEL); 969 if (!init_data_hdr) 970 return -ENOMEM; 971 972 amdgpu_device_vram_access(adev, (uint64_t)init_hdr_offset, (uint32_t *)init_data_hdr, 973 sizeof(struct amd_sriov_msg_init_data_header), false); 974 975 /* Table validation */ 976 if (strncmp(init_data_hdr->signature, 977 AMDGPU_SRIOV_CRIT_DATA_SIGNATURE, 978 AMDGPU_SRIOV_CRIT_DATA_SIG_LEN) != 0) { 979 dev_err(adev->dev, "Invalid init data signature: %.4s\n", 980 init_data_hdr->signature); 981 r = -EINVAL; 982 goto out; 983 } 984 985 checksum = amdgpu_virt_crit_region_calc_checksum( 986 (uint8_t *)&init_data_hdr->initdata_offset, 987 (uint8_t *)init_data_hdr + 988 sizeof(struct amd_sriov_msg_init_data_header)); 989 if (checksum != init_data_hdr->checksum) { 990 dev_err(adev->dev, "Found unmatching checksum from calculation 0x%x and init_data 0x%x\n", 991 checksum, init_data_hdr->checksum); 992 r = -EINVAL; 993 goto out; 994 } 995 996 memset(&adev->virt.crit_regn, 0, sizeof(adev->virt.crit_regn)); 997 memset(adev->virt.crit_regn_tbl, 0, sizeof(adev->virt.crit_regn_tbl)); 998 999 adev->virt.crit_regn.offset = init_data_hdr->initdata_offset; 1000 adev->virt.crit_regn.size_kb = init_data_hdr->initdata_size_in_kb; 1001 1002 /* Validation and initialization for each table entry */ 1003 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_IPD_TABLE_ID)) { 1004 if (!init_data_hdr->ip_discovery_size_in_kb || 1005 init_data_hdr->ip_discovery_size_in_kb > DISCOVERY_TMR_SIZE) { 1006 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1007 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_IPD_TABLE_ID], 1008 init_data_hdr->ip_discovery_size_in_kb); 1009 r = -EINVAL; 1010 goto out; 1011 } 1012 1013 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].offset = 1014 init_data_hdr->ip_discovery_offset; 1015 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb = 1016 init_data_hdr->ip_discovery_size_in_kb; 1017 } 1018 1019 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID)) { 1020 if (!init_data_hdr->vbios_img_size_in_kb) { 1021 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1022 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID], 1023 init_data_hdr->vbios_img_size_in_kb); 1024 r = -EINVAL; 1025 goto out; 1026 } 1027 1028 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].offset = 1029 init_data_hdr->vbios_img_offset; 1030 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].size_kb = 1031 init_data_hdr->vbios_img_size_in_kb; 1032 } 1033 1034 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID)) { 1035 if (!init_data_hdr->ras_tele_info_size_in_kb) { 1036 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1037 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID], 1038 init_data_hdr->ras_tele_info_size_in_kb); 1039 r = -EINVAL; 1040 goto out; 1041 } 1042 1043 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset = 1044 init_data_hdr->ras_tele_info_offset; 1045 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].size_kb = 1046 init_data_hdr->ras_tele_info_size_in_kb; 1047 } 1048 1049 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID)) { 1050 if (!init_data_hdr->dataexchange_size_in_kb) { 1051 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1052 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID], 1053 init_data_hdr->dataexchange_size_in_kb); 1054 r = -EINVAL; 1055 goto out; 1056 } 1057 1058 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset = 1059 init_data_hdr->dataexchange_offset; 1060 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb = 1061 init_data_hdr->dataexchange_size_in_kb; 1062 } 1063 1064 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID)) { 1065 if (!init_data_hdr->bad_page_size_in_kb) { 1066 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1067 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID], 1068 init_data_hdr->bad_page_size_in_kb); 1069 r = -EINVAL; 1070 goto out; 1071 } 1072 1073 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].offset = 1074 init_data_hdr->bad_page_info_offset; 1075 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].size_kb = 1076 init_data_hdr->bad_page_size_in_kb; 1077 } 1078 1079 /* Validation for critical region info */ 1080 if (adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb > DISCOVERY_TMR_SIZE) { 1081 dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n", 1082 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb); 1083 r = -EINVAL; 1084 goto out; 1085 } 1086 1087 /* reserved memory starts from crit region base offset with the size of 5MB */ 1088 adev->mman.fw_vram_usage_start_offset = adev->virt.crit_regn.offset; 1089 adev->mman.fw_vram_usage_size = adev->virt.crit_regn.size_kb << 10; 1090 dev_info(adev->dev, 1091 "critical region v%d requested to reserve memory start at %08llx with %llu KB.\n", 1092 init_data_hdr->version, 1093 adev->mman.fw_vram_usage_start_offset, 1094 adev->mman.fw_vram_usage_size >> 10); 1095 1096 adev->virt.is_dynamic_crit_regn_enabled = true; 1097 1098 out: 1099 kfree(init_data_hdr); 1100 init_data_hdr = NULL; 1101 1102 return r; 1103 } 1104 1105 int amdgpu_virt_get_dynamic_data_info(struct amdgpu_device *adev, 1106 int data_id, uint8_t *binary, u32 *size) 1107 { 1108 uint32_t data_offset = 0; 1109 uint32_t data_size = 0; 1110 enum amd_sriov_msg_table_id_enum data_table_id = data_id; 1111 1112 if (data_table_id >= AMD_SRIOV_MSG_MAX_TABLE_ID) 1113 return -EINVAL; 1114 1115 data_offset = adev->virt.crit_regn_tbl[data_table_id].offset; 1116 data_size = adev->virt.crit_regn_tbl[data_table_id].size_kb << 10; 1117 1118 /* Validate on input params */ 1119 if (!binary || !size || *size < (uint64_t)data_size) 1120 return -EINVAL; 1121 1122 /* Proceed to copy the dynamic content */ 1123 amdgpu_device_vram_access(adev, 1124 (uint64_t)data_offset, (uint32_t *)binary, data_size, false); 1125 *size = (uint64_t)data_size; 1126 1127 dev_dbg(adev->dev, 1128 "Got %s info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n", 1129 amdgpu_virt_dynamic_crit_table_name[data_id], data_offset, data_size); 1130 1131 return 0; 1132 } 1133 1134 void amdgpu_virt_init(struct amdgpu_device *adev) 1135 { 1136 bool is_sriov = false; 1137 uint32_t reg = amdgpu_virt_init_detect_asic(adev); 1138 1139 is_sriov = amdgpu_virt_init_req_data(adev, reg); 1140 1141 if (is_sriov) 1142 amdgpu_virt_init_ras(adev); 1143 } 1144 1145 static bool amdgpu_virt_access_debugfs_is_mmio(struct amdgpu_device *adev) 1146 { 1147 return amdgpu_sriov_is_debug(adev) ? true : false; 1148 } 1149 1150 static bool amdgpu_virt_access_debugfs_is_kiq(struct amdgpu_device *adev) 1151 { 1152 return amdgpu_sriov_is_normal(adev) ? true : false; 1153 } 1154 1155 int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev) 1156 { 1157 if (!amdgpu_sriov_vf(adev) || 1158 amdgpu_virt_access_debugfs_is_kiq(adev)) 1159 return 0; 1160 1161 if (amdgpu_virt_access_debugfs_is_mmio(adev)) 1162 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 1163 else 1164 return -EPERM; 1165 1166 return 0; 1167 } 1168 1169 void amdgpu_virt_disable_access_debugfs(struct amdgpu_device *adev) 1170 { 1171 if (amdgpu_sriov_vf(adev)) 1172 adev->virt.caps |= AMDGPU_SRIOV_CAPS_RUNTIME; 1173 } 1174 1175 enum amdgpu_sriov_vf_mode amdgpu_virt_get_sriov_vf_mode(struct amdgpu_device *adev) 1176 { 1177 enum amdgpu_sriov_vf_mode mode; 1178 1179 if (amdgpu_sriov_vf(adev)) { 1180 if (amdgpu_sriov_is_pp_one_vf(adev)) 1181 mode = SRIOV_VF_MODE_ONE_VF; 1182 else 1183 mode = SRIOV_VF_MODE_MULTI_VF; 1184 } else { 1185 mode = SRIOV_VF_MODE_BARE_METAL; 1186 } 1187 1188 return mode; 1189 } 1190 1191 void amdgpu_virt_pre_reset(struct amdgpu_device *adev) 1192 { 1193 /* stop the data exchange thread */ 1194 amdgpu_virt_fini_data_exchange(adev); 1195 amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_FLR); 1196 } 1197 1198 void amdgpu_virt_post_reset(struct amdgpu_device *adev) 1199 { 1200 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) { 1201 /* force set to GFXOFF state after reset, 1202 * to avoid some invalid operation before GC enable 1203 */ 1204 adev->gfx.is_poweron = false; 1205 } 1206 1207 adev->mes.ring[0].sched.ready = false; 1208 } 1209 1210 bool amdgpu_virt_fw_load_skip_check(struct amdgpu_device *adev, uint32_t ucode_id) 1211 { 1212 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 1213 case IP_VERSION(13, 0, 0): 1214 /* no vf autoload, white list */ 1215 if (ucode_id == AMDGPU_UCODE_ID_VCN1 || 1216 ucode_id == AMDGPU_UCODE_ID_VCN) 1217 return false; 1218 else 1219 return true; 1220 case IP_VERSION(11, 0, 9): 1221 case IP_VERSION(11, 0, 7): 1222 /* black list for CHIP_NAVI12 and CHIP_SIENNA_CICHLID */ 1223 if (ucode_id == AMDGPU_UCODE_ID_RLC_G 1224 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL 1225 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM 1226 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM 1227 || ucode_id == AMDGPU_UCODE_ID_SMC) 1228 return true; 1229 else 1230 return false; 1231 case IP_VERSION(13, 0, 10): 1232 /* white list */ 1233 if (ucode_id == AMDGPU_UCODE_ID_CAP 1234 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_PFP 1235 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_ME 1236 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC 1237 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_PFP_P0_STACK 1238 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_PFP_P1_STACK 1239 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_ME_P0_STACK 1240 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_ME_P1_STACK 1241 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P0_STACK 1242 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P1_STACK 1243 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P2_STACK 1244 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P3_STACK 1245 || ucode_id == AMDGPU_UCODE_ID_CP_MES 1246 || ucode_id == AMDGPU_UCODE_ID_CP_MES_DATA 1247 || ucode_id == AMDGPU_UCODE_ID_CP_MES1 1248 || ucode_id == AMDGPU_UCODE_ID_CP_MES1_DATA 1249 || ucode_id == AMDGPU_UCODE_ID_VCN1 1250 || ucode_id == AMDGPU_UCODE_ID_VCN) 1251 return false; 1252 else 1253 return true; 1254 default: 1255 /* lagacy black list */ 1256 if (ucode_id == AMDGPU_UCODE_ID_SDMA0 1257 || ucode_id == AMDGPU_UCODE_ID_SDMA1 1258 || ucode_id == AMDGPU_UCODE_ID_SDMA2 1259 || ucode_id == AMDGPU_UCODE_ID_SDMA3 1260 || ucode_id == AMDGPU_UCODE_ID_SDMA4 1261 || ucode_id == AMDGPU_UCODE_ID_SDMA5 1262 || ucode_id == AMDGPU_UCODE_ID_SDMA6 1263 || ucode_id == AMDGPU_UCODE_ID_SDMA7 1264 || ucode_id == AMDGPU_UCODE_ID_RLC_G 1265 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL 1266 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM 1267 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM 1268 || ucode_id == AMDGPU_UCODE_ID_SMC) 1269 return true; 1270 else 1271 return false; 1272 } 1273 } 1274 1275 void amdgpu_virt_update_sriov_video_codec(struct amdgpu_device *adev, 1276 struct amdgpu_video_codec_info *encode, uint32_t encode_array_size, 1277 struct amdgpu_video_codec_info *decode, uint32_t decode_array_size) 1278 { 1279 uint32_t i; 1280 1281 if (!adev->virt.is_mm_bw_enabled) 1282 return; 1283 1284 if (encode) { 1285 for (i = 0; i < encode_array_size; i++) { 1286 encode[i].max_width = adev->virt.encode_max_dimension_pixels; 1287 encode[i].max_pixels_per_frame = adev->virt.encode_max_frame_pixels; 1288 if (encode[i].max_width > 0) 1289 encode[i].max_height = encode[i].max_pixels_per_frame / encode[i].max_width; 1290 else 1291 encode[i].max_height = 0; 1292 } 1293 } 1294 1295 if (decode) { 1296 for (i = 0; i < decode_array_size; i++) { 1297 decode[i].max_width = adev->virt.decode_max_dimension_pixels; 1298 decode[i].max_pixels_per_frame = adev->virt.decode_max_frame_pixels; 1299 if (decode[i].max_width > 0) 1300 decode[i].max_height = decode[i].max_pixels_per_frame / decode[i].max_width; 1301 else 1302 decode[i].max_height = 0; 1303 } 1304 } 1305 } 1306 1307 bool amdgpu_virt_get_rlcg_reg_access_flag(struct amdgpu_device *adev, 1308 u32 acc_flags, u32 hwip, 1309 bool write, u32 *rlcg_flag) 1310 { 1311 bool ret = false; 1312 1313 switch (hwip) { 1314 case GC_HWIP: 1315 if (amdgpu_sriov_reg_indirect_gc(adev)) { 1316 *rlcg_flag = 1317 write ? AMDGPU_RLCG_GC_WRITE : AMDGPU_RLCG_GC_READ; 1318 ret = true; 1319 /* only in new version, AMDGPU_REGS_NO_KIQ and 1320 * AMDGPU_REGS_RLC are enabled simultaneously */ 1321 } else if ((acc_flags & AMDGPU_REGS_RLC) && 1322 !(acc_flags & AMDGPU_REGS_NO_KIQ) && write) { 1323 *rlcg_flag = AMDGPU_RLCG_GC_WRITE_LEGACY; 1324 ret = true; 1325 } 1326 break; 1327 case MMHUB_HWIP: 1328 if (amdgpu_sriov_reg_indirect_mmhub(adev) && 1329 (acc_flags & AMDGPU_REGS_RLC) && write) { 1330 *rlcg_flag = AMDGPU_RLCG_MMHUB_WRITE; 1331 ret = true; 1332 } 1333 break; 1334 default: 1335 break; 1336 } 1337 return ret; 1338 } 1339 1340 u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 flag, u32 xcc_id) 1341 { 1342 struct amdgpu_rlcg_reg_access_ctrl *reg_access_ctrl; 1343 uint32_t timeout = 50000; 1344 uint32_t i, tmp; 1345 uint32_t ret = 0; 1346 void *scratch_reg0; 1347 void *scratch_reg1; 1348 void *scratch_reg2; 1349 void *scratch_reg3; 1350 void *spare_int; 1351 unsigned long flags; 1352 1353 if (!adev->gfx.rlc.rlcg_reg_access_supported) { 1354 dev_err(adev->dev, 1355 "indirect registers access through rlcg is not available\n"); 1356 return 0; 1357 } 1358 1359 if (adev->gfx.xcc_mask && (((1 << xcc_id) & adev->gfx.xcc_mask) == 0)) { 1360 dev_err(adev->dev, "invalid xcc\n"); 1361 return 0; 1362 } 1363 1364 if (amdgpu_device_skip_hw_access(adev)) 1365 return 0; 1366 1367 reg_access_ctrl = &adev->gfx.rlc.reg_access_ctrl[xcc_id]; 1368 scratch_reg0 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg0; 1369 scratch_reg1 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg1; 1370 scratch_reg2 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg2; 1371 scratch_reg3 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg3; 1372 1373 spin_lock_irqsave(&adev->virt.rlcg_reg_lock, flags); 1374 1375 if (reg_access_ctrl->spare_int) 1376 spare_int = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->spare_int; 1377 1378 if (offset == reg_access_ctrl->grbm_cntl) { 1379 /* if the target reg offset is grbm_cntl, write to scratch_reg2 */ 1380 writel(v, scratch_reg2); 1381 if (flag == AMDGPU_RLCG_GC_WRITE_LEGACY) 1382 writel(v, ((void __iomem *)adev->rmmio) + (offset * 4)); 1383 } else if (offset == reg_access_ctrl->grbm_idx) { 1384 /* if the target reg offset is grbm_idx, write to scratch_reg3 */ 1385 writel(v, scratch_reg3); 1386 if (flag == AMDGPU_RLCG_GC_WRITE_LEGACY) 1387 writel(v, ((void __iomem *)adev->rmmio) + (offset * 4)); 1388 } else { 1389 /* 1390 * SCRATCH_REG0 = read/write value 1391 * SCRATCH_REG1[30:28] = command 1392 * SCRATCH_REG1[19:0] = address in dword 1393 * SCRATCH_REG1[27:24] = Error reporting 1394 */ 1395 writel(v, scratch_reg0); 1396 writel((offset | flag), scratch_reg1); 1397 if (reg_access_ctrl->spare_int) 1398 writel(1, spare_int); 1399 1400 for (i = 0; i < timeout; i++) { 1401 tmp = readl(scratch_reg1); 1402 if (!(tmp & AMDGPU_RLCG_SCRATCH1_ADDRESS_MASK)) 1403 break; 1404 udelay(10); 1405 } 1406 1407 tmp = readl(scratch_reg1); 1408 if (i >= timeout || (tmp & AMDGPU_RLCG_SCRATCH1_ERROR_MASK) != 0) { 1409 if (amdgpu_sriov_rlcg_error_report_enabled(adev)) { 1410 if (tmp & AMDGPU_RLCG_VFGATE_DISABLED) { 1411 dev_err(adev->dev, 1412 "vfgate is disabled, rlcg failed to program reg: 0x%05x\n", offset); 1413 } else if (tmp & AMDGPU_RLCG_WRONG_OPERATION_TYPE) { 1414 dev_err(adev->dev, 1415 "wrong operation type, rlcg failed to program reg: 0x%05x\n", offset); 1416 } else if (tmp & AMDGPU_RLCG_REG_NOT_IN_RANGE) { 1417 dev_err(adev->dev, 1418 "register is not in range, rlcg failed to program reg: 0x%05x\n", offset); 1419 } else { 1420 dev_err(adev->dev, 1421 "unknown error type, rlcg failed to program reg: 0x%05x\n", offset); 1422 } 1423 } else { 1424 dev_err(adev->dev, 1425 "timeout: rlcg faled to program reg: 0x%05x\n", offset); 1426 } 1427 } 1428 } 1429 1430 ret = readl(scratch_reg0); 1431 1432 spin_unlock_irqrestore(&adev->virt.rlcg_reg_lock, flags); 1433 1434 return ret; 1435 } 1436 1437 void amdgpu_sriov_wreg(struct amdgpu_device *adev, 1438 u32 offset, u32 value, 1439 u32 acc_flags, u32 hwip, u32 xcc_id) 1440 { 1441 u32 rlcg_flag; 1442 1443 if (amdgpu_device_skip_hw_access(adev)) 1444 return; 1445 1446 if (!amdgpu_sriov_runtime(adev) && 1447 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, true, &rlcg_flag)) { 1448 amdgpu_virt_rlcg_reg_rw(adev, offset, value, rlcg_flag, xcc_id); 1449 return; 1450 } 1451 1452 if (acc_flags & AMDGPU_REGS_NO_KIQ) 1453 WREG32_NO_KIQ(offset, value); 1454 else 1455 WREG32(offset, value); 1456 } 1457 1458 u32 amdgpu_sriov_rreg(struct amdgpu_device *adev, 1459 u32 offset, u32 acc_flags, u32 hwip, u32 xcc_id) 1460 { 1461 u32 rlcg_flag; 1462 1463 if (amdgpu_device_skip_hw_access(adev)) 1464 return 0; 1465 1466 if (!amdgpu_sriov_runtime(adev) && 1467 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, false, &rlcg_flag)) 1468 return amdgpu_virt_rlcg_reg_rw(adev, offset, 0, rlcg_flag, xcc_id); 1469 1470 if (acc_flags & AMDGPU_REGS_NO_KIQ) 1471 return RREG32_NO_KIQ(offset); 1472 else 1473 return RREG32(offset); 1474 } 1475 1476 bool amdgpu_sriov_xnack_support(struct amdgpu_device *adev) 1477 { 1478 bool xnack_mode = true; 1479 1480 if (amdgpu_sriov_vf(adev) && 1481 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2)) 1482 xnack_mode = false; 1483 1484 return xnack_mode; 1485 } 1486 1487 bool amdgpu_virt_get_ras_capability(struct amdgpu_device *adev) 1488 { 1489 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1490 1491 if (!amdgpu_sriov_ras_caps_en(adev)) 1492 return false; 1493 1494 if (adev->virt.ras_en_caps.bits.block_umc) 1495 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__UMC); 1496 if (adev->virt.ras_en_caps.bits.block_sdma) 1497 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SDMA); 1498 if (adev->virt.ras_en_caps.bits.block_gfx) 1499 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__GFX); 1500 if (adev->virt.ras_en_caps.bits.block_mmhub) 1501 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MMHUB); 1502 if (adev->virt.ras_en_caps.bits.block_athub) 1503 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__ATHUB); 1504 if (adev->virt.ras_en_caps.bits.block_pcie_bif) 1505 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__PCIE_BIF); 1506 if (adev->virt.ras_en_caps.bits.block_hdp) 1507 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__HDP); 1508 if (adev->virt.ras_en_caps.bits.block_xgmi_wafl) 1509 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__XGMI_WAFL); 1510 if (adev->virt.ras_en_caps.bits.block_df) 1511 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__DF); 1512 if (adev->virt.ras_en_caps.bits.block_smn) 1513 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SMN); 1514 if (adev->virt.ras_en_caps.bits.block_sem) 1515 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SEM); 1516 if (adev->virt.ras_en_caps.bits.block_mp0) 1517 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MP0); 1518 if (adev->virt.ras_en_caps.bits.block_mp1) 1519 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MP1); 1520 if (adev->virt.ras_en_caps.bits.block_fuse) 1521 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__FUSE); 1522 if (adev->virt.ras_en_caps.bits.block_mca) 1523 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MCA); 1524 if (adev->virt.ras_en_caps.bits.block_vcn) 1525 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__VCN); 1526 if (adev->virt.ras_en_caps.bits.block_jpeg) 1527 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__JPEG); 1528 if (adev->virt.ras_en_caps.bits.block_ih) 1529 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__IH); 1530 if (adev->virt.ras_en_caps.bits.block_mpio) 1531 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MPIO); 1532 1533 if (adev->virt.ras_en_caps.bits.poison_propogation_mode) 1534 con->poison_supported = true; /* Poison is handled by host */ 1535 1536 return true; 1537 } 1538 1539 static inline enum amd_sriov_ras_telemetry_gpu_block 1540 amdgpu_ras_block_to_sriov(struct amdgpu_device *adev, enum amdgpu_ras_block block) { 1541 switch (block) { 1542 case AMDGPU_RAS_BLOCK__UMC: 1543 return RAS_TELEMETRY_GPU_BLOCK_UMC; 1544 case AMDGPU_RAS_BLOCK__SDMA: 1545 return RAS_TELEMETRY_GPU_BLOCK_SDMA; 1546 case AMDGPU_RAS_BLOCK__GFX: 1547 return RAS_TELEMETRY_GPU_BLOCK_GFX; 1548 case AMDGPU_RAS_BLOCK__MMHUB: 1549 return RAS_TELEMETRY_GPU_BLOCK_MMHUB; 1550 case AMDGPU_RAS_BLOCK__ATHUB: 1551 return RAS_TELEMETRY_GPU_BLOCK_ATHUB; 1552 case AMDGPU_RAS_BLOCK__PCIE_BIF: 1553 return RAS_TELEMETRY_GPU_BLOCK_PCIE_BIF; 1554 case AMDGPU_RAS_BLOCK__HDP: 1555 return RAS_TELEMETRY_GPU_BLOCK_HDP; 1556 case AMDGPU_RAS_BLOCK__XGMI_WAFL: 1557 return RAS_TELEMETRY_GPU_BLOCK_XGMI_WAFL; 1558 case AMDGPU_RAS_BLOCK__DF: 1559 return RAS_TELEMETRY_GPU_BLOCK_DF; 1560 case AMDGPU_RAS_BLOCK__SMN: 1561 return RAS_TELEMETRY_GPU_BLOCK_SMN; 1562 case AMDGPU_RAS_BLOCK__SEM: 1563 return RAS_TELEMETRY_GPU_BLOCK_SEM; 1564 case AMDGPU_RAS_BLOCK__MP0: 1565 return RAS_TELEMETRY_GPU_BLOCK_MP0; 1566 case AMDGPU_RAS_BLOCK__MP1: 1567 return RAS_TELEMETRY_GPU_BLOCK_MP1; 1568 case AMDGPU_RAS_BLOCK__FUSE: 1569 return RAS_TELEMETRY_GPU_BLOCK_FUSE; 1570 case AMDGPU_RAS_BLOCK__MCA: 1571 return RAS_TELEMETRY_GPU_BLOCK_MCA; 1572 case AMDGPU_RAS_BLOCK__VCN: 1573 return RAS_TELEMETRY_GPU_BLOCK_VCN; 1574 case AMDGPU_RAS_BLOCK__JPEG: 1575 return RAS_TELEMETRY_GPU_BLOCK_JPEG; 1576 case AMDGPU_RAS_BLOCK__IH: 1577 return RAS_TELEMETRY_GPU_BLOCK_IH; 1578 case AMDGPU_RAS_BLOCK__MPIO: 1579 return RAS_TELEMETRY_GPU_BLOCK_MPIO; 1580 default: 1581 dev_warn(adev->dev, "Unsupported SRIOV RAS telemetry block 0x%x\n", 1582 block); 1583 return RAS_TELEMETRY_GPU_BLOCK_COUNT; 1584 } 1585 } 1586 1587 static int amdgpu_virt_cache_host_error_counts(struct amdgpu_device *adev, 1588 struct amdsriov_ras_telemetry *host_telemetry) 1589 { 1590 struct amd_sriov_ras_telemetry_error_count *tmp = NULL; 1591 uint32_t checksum, used_size; 1592 1593 checksum = host_telemetry->header.checksum; 1594 used_size = host_telemetry->header.used_size; 1595 1596 if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10)) 1597 return 0; 1598 1599 tmp = kmemdup(&host_telemetry->body.error_count, used_size, GFP_KERNEL); 1600 if (!tmp) 1601 return -ENOMEM; 1602 1603 if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0)) 1604 goto out; 1605 1606 memcpy(&adev->virt.count_cache, tmp, 1607 min(used_size, sizeof(adev->virt.count_cache))); 1608 out: 1609 kfree(tmp); 1610 1611 return 0; 1612 } 1613 1614 static int amdgpu_virt_req_ras_err_count_internal(struct amdgpu_device *adev, bool force_update) 1615 { 1616 struct amdgpu_virt *virt = &adev->virt; 1617 1618 if (!virt->ops || !virt->ops->req_ras_err_count) 1619 return -EOPNOTSUPP; 1620 1621 /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host 1622 * will ignore incoming guest messages. Ratelimit the guest messages to 1623 * prevent guest self DOS. 1624 */ 1625 if (__ratelimit(&virt->ras.ras_error_cnt_rs) || force_update) { 1626 mutex_lock(&virt->ras.ras_telemetry_mutex); 1627 if (!virt->ops->req_ras_err_count(adev)) 1628 amdgpu_virt_cache_host_error_counts(adev, 1629 virt->fw_reserve.ras_telemetry); 1630 mutex_unlock(&virt->ras.ras_telemetry_mutex); 1631 } 1632 1633 return 0; 1634 } 1635 1636 /* Bypass ACA interface and query ECC counts directly from host */ 1637 int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block, 1638 struct ras_err_data *err_data) 1639 { 1640 enum amd_sriov_ras_telemetry_gpu_block sriov_block; 1641 1642 sriov_block = amdgpu_ras_block_to_sriov(adev, block); 1643 1644 if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT || 1645 !amdgpu_sriov_ras_telemetry_block_en(adev, sriov_block)) 1646 return -EOPNOTSUPP; 1647 1648 /* Host Access may be lost during reset, just return last cached data. */ 1649 if (down_read_trylock(&adev->reset_domain->sem)) { 1650 amdgpu_virt_req_ras_err_count_internal(adev, false); 1651 up_read(&adev->reset_domain->sem); 1652 } 1653 1654 err_data->ue_count = adev->virt.count_cache.block[sriov_block].ue_count; 1655 err_data->ce_count = adev->virt.count_cache.block[sriov_block].ce_count; 1656 err_data->de_count = adev->virt.count_cache.block[sriov_block].de_count; 1657 1658 return 0; 1659 } 1660 1661 static int 1662 amdgpu_virt_write_cpers_to_ring(struct amdgpu_device *adev, 1663 struct amdsriov_ras_telemetry *host_telemetry, 1664 u32 *more) 1665 { 1666 struct amd_sriov_ras_cper_dump *cper_dump = NULL; 1667 struct cper_hdr *entry = NULL; 1668 struct amdgpu_ring *ring = &adev->cper.ring_buf; 1669 uint32_t checksum, used_size, i; 1670 int ret = 0; 1671 1672 checksum = host_telemetry->header.checksum; 1673 used_size = host_telemetry->header.used_size; 1674 1675 if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10)) 1676 return -EINVAL; 1677 1678 cper_dump = kmemdup(&host_telemetry->body.cper_dump, used_size, GFP_KERNEL); 1679 if (!cper_dump) 1680 return -ENOMEM; 1681 1682 if (checksum != amd_sriov_msg_checksum(cper_dump, used_size, 0, 0)) { 1683 ret = -EINVAL; 1684 goto out; 1685 } 1686 1687 *more = cper_dump->more; 1688 1689 if (cper_dump->wptr < adev->virt.ras.cper_rptr) { 1690 dev_warn( 1691 adev->dev, 1692 "guest specified rptr that was too high! guest rptr: 0x%llx, host rptr: 0x%llx\n", 1693 adev->virt.ras.cper_rptr, cper_dump->wptr); 1694 1695 adev->virt.ras.cper_rptr = cper_dump->wptr; 1696 goto out; 1697 } 1698 1699 entry = (struct cper_hdr *)&cper_dump->buf[0]; 1700 1701 for (i = 0; i < cper_dump->count; i++) { 1702 amdgpu_cper_ring_write(ring, entry, entry->record_length); 1703 entry = (struct cper_hdr *)((char *)entry + 1704 entry->record_length); 1705 } 1706 1707 if (cper_dump->overflow_count) 1708 dev_warn(adev->dev, 1709 "host reported CPER overflow of 0x%llx entries!\n", 1710 cper_dump->overflow_count); 1711 1712 adev->virt.ras.cper_rptr = cper_dump->wptr; 1713 out: 1714 kfree(cper_dump); 1715 1716 return ret; 1717 } 1718 1719 static int amdgpu_virt_req_ras_cper_dump_internal(struct amdgpu_device *adev) 1720 { 1721 struct amdgpu_virt *virt = &adev->virt; 1722 int ret = 0; 1723 uint32_t more = 0; 1724 1725 if (!virt->ops || !virt->ops->req_ras_cper_dump) 1726 return -EOPNOTSUPP; 1727 1728 do { 1729 if (!virt->ops->req_ras_cper_dump(adev, virt->ras.cper_rptr)) 1730 ret = amdgpu_virt_write_cpers_to_ring( 1731 adev, virt->fw_reserve.ras_telemetry, &more); 1732 else 1733 ret = 0; 1734 } while (more && !ret); 1735 1736 return ret; 1737 } 1738 1739 int amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update) 1740 { 1741 struct amdgpu_virt *virt = &adev->virt; 1742 int ret = 0; 1743 1744 if (!amdgpu_sriov_ras_cper_en(adev)) 1745 return -EOPNOTSUPP; 1746 1747 if ((__ratelimit(&virt->ras.ras_cper_dump_rs) || force_update) && 1748 down_read_trylock(&adev->reset_domain->sem)) { 1749 mutex_lock(&virt->ras.ras_telemetry_mutex); 1750 ret = amdgpu_virt_req_ras_cper_dump_internal(adev); 1751 mutex_unlock(&virt->ras.ras_telemetry_mutex); 1752 up_read(&adev->reset_domain->sem); 1753 } 1754 1755 return ret; 1756 } 1757 1758 int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev) 1759 { 1760 unsigned long ue_count, ce_count; 1761 1762 if (amdgpu_sriov_ras_telemetry_en(adev)) { 1763 amdgpu_virt_req_ras_err_count_internal(adev, true); 1764 amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL); 1765 } 1766 1767 return 0; 1768 } 1769 1770 bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev, 1771 enum amdgpu_ras_block block) 1772 { 1773 enum amd_sriov_ras_telemetry_gpu_block sriov_block; 1774 1775 sriov_block = amdgpu_ras_block_to_sriov(adev, block); 1776 1777 if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT || 1778 !amdgpu_sriov_ras_telemetry_block_en(adev, sriov_block)) 1779 return false; 1780 1781 return true; 1782 } 1783 1784 /* 1785 * amdgpu_virt_request_bad_pages() - request bad pages 1786 * @adev: amdgpu device. 1787 * Send command to GPU hypervisor to write new bad pages into the shared PF2VF region 1788 */ 1789 void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev) 1790 { 1791 struct amdgpu_virt *virt = &adev->virt; 1792 1793 if (virt->ops && virt->ops->req_bad_pages) 1794 virt->ops->req_bad_pages(adev); 1795 } 1796 1797 static int amdgpu_virt_cache_chk_criti_hit(struct amdgpu_device *adev, 1798 struct amdsriov_ras_telemetry *host_telemetry, 1799 bool *hit) 1800 { 1801 struct amd_sriov_ras_chk_criti *tmp = NULL; 1802 uint32_t checksum, used_size; 1803 1804 checksum = host_telemetry->header.checksum; 1805 used_size = host_telemetry->header.used_size; 1806 1807 if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10)) 1808 return 0; 1809 1810 tmp = kmemdup(&host_telemetry->body.chk_criti, used_size, GFP_KERNEL); 1811 if (!tmp) 1812 return -ENOMEM; 1813 1814 if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0)) 1815 goto out; 1816 1817 if (hit) 1818 *hit = tmp->hit ? true : false; 1819 1820 out: 1821 kfree(tmp); 1822 1823 return 0; 1824 } 1825 1826 int amdgpu_virt_check_vf_critical_region(struct amdgpu_device *adev, u64 addr, bool *hit) 1827 { 1828 struct amdgpu_virt *virt = &adev->virt; 1829 int r = -EPERM; 1830 1831 if (!virt->ops || !virt->ops->req_ras_chk_criti) 1832 return -EOPNOTSUPP; 1833 1834 /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host 1835 * will ignore incoming guest messages. Ratelimit the guest messages to 1836 * prevent guest self DOS. 1837 */ 1838 if (__ratelimit(&virt->ras.ras_chk_criti_rs)) { 1839 mutex_lock(&virt->ras.ras_telemetry_mutex); 1840 if (!virt->ops->req_ras_chk_criti(adev, addr)) 1841 r = amdgpu_virt_cache_chk_criti_hit( 1842 adev, virt->fw_reserve.ras_telemetry, hit); 1843 mutex_unlock(&virt->ras.ras_telemetry_mutex); 1844 } 1845 1846 return r; 1847 } 1848