1 /* 2 * Copyright 2016 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 * 22 */ 23 24 #include <linux/module.h> 25 26 #ifdef CONFIG_X86 27 #include <asm/hypervisor.h> 28 #endif 29 30 #include <drm/drm_drv.h> 31 #include <xen/xen.h> 32 33 #include "amdgpu.h" 34 #include "amdgpu_ras.h" 35 #include "amdgpu_reset.h" 36 #include "amdgpu_dpm.h" 37 #include "vi.h" 38 #include "soc15.h" 39 #include "nv.h" 40 #include "amdgpu_virt_ras_cmd.h" 41 42 #define POPULATE_UCODE_INFO(vf2pf_info, ucode, ver) \ 43 do { \ 44 vf2pf_info->ucode_info[ucode].id = ucode; \ 45 vf2pf_info->ucode_info[ucode].version = ver; \ 46 } while (0) 47 48 #define mmRCC_CONFIG_MEMSIZE 0xde3 49 50 const char *amdgpu_virt_dynamic_crit_table_name[] = { 51 "IP DISCOVERY", 52 "VBIOS IMG", 53 "RAS TELEMETRY", 54 "DATA EXCHANGE", 55 "BAD PAGE INFO", 56 "INIT HEADER", 57 "LAST", 58 }; 59 60 bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev) 61 { 62 /* By now all MMIO pages except mailbox are blocked */ 63 /* if blocking is enabled in hypervisor. Choose the */ 64 /* SCRATCH_REG0 to test. */ 65 return RREG32_NO_KIQ(0xc040) == 0xffffffff; 66 } 67 68 void amdgpu_virt_init_setting(struct amdgpu_device *adev) 69 { 70 struct drm_device *ddev = adev_to_drm(adev); 71 72 /* enable virtual display */ 73 if (adev->asic_type != CHIP_ALDEBARAN && 74 adev->asic_type != CHIP_ARCTURUS && 75 ((adev->pdev->class >> 8) != PCI_CLASS_ACCELERATOR_PROCESSING)) { 76 if (adev->mode_info.num_crtc == 0) 77 adev->mode_info.num_crtc = 1; 78 adev->enable_virtual_display = true; 79 } 80 ddev->driver_features &= ~DRIVER_ATOMIC; 81 adev->cg_flags = 0; 82 adev->pg_flags = 0; 83 84 /* Reduce kcq number to 2 to reduce latency */ 85 if (amdgpu_num_kcq == -1) 86 amdgpu_num_kcq = 2; 87 } 88 89 /** 90 * amdgpu_virt_request_full_gpu() - request full gpu access 91 * @adev: amdgpu device. 92 * @init: is driver init time. 93 * When start to init/fini driver, first need to request full gpu access. 94 * Return: Zero if request success, otherwise will return error. 95 */ 96 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init) 97 { 98 struct amdgpu_virt *virt = &adev->virt; 99 int r; 100 101 if (virt->ops && virt->ops->req_full_gpu) { 102 r = virt->ops->req_full_gpu(adev, init); 103 if (r) { 104 adev->no_hw_access = true; 105 return r; 106 } 107 108 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 109 } 110 111 return 0; 112 } 113 114 /** 115 * amdgpu_virt_release_full_gpu() - release full gpu access 116 * @adev: amdgpu device. 117 * @init: is driver init time. 118 * When finishing driver init/fini, need to release full gpu access. 119 * Return: Zero if release success, otherwise will returen error. 120 */ 121 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init) 122 { 123 struct amdgpu_virt *virt = &adev->virt; 124 int r; 125 126 if (virt->ops && virt->ops->rel_full_gpu) { 127 r = virt->ops->rel_full_gpu(adev, init); 128 if (r) 129 return r; 130 131 adev->virt.caps |= AMDGPU_SRIOV_CAPS_RUNTIME; 132 } 133 return 0; 134 } 135 136 /** 137 * amdgpu_virt_reset_gpu() - reset gpu 138 * @adev: amdgpu device. 139 * Send reset command to GPU hypervisor to reset GPU that VM is using 140 * Return: Zero if reset success, otherwise will return error. 141 */ 142 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev) 143 { 144 struct amdgpu_virt *virt = &adev->virt; 145 int r; 146 147 if (virt->ops && virt->ops->reset_gpu) { 148 r = virt->ops->reset_gpu(adev); 149 if (r) 150 return r; 151 152 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 153 } 154 155 return 0; 156 } 157 158 void amdgpu_virt_request_init_data(struct amdgpu_device *adev) 159 { 160 struct amdgpu_virt *virt = &adev->virt; 161 162 if (virt->ops && virt->ops->req_init_data) 163 virt->ops->req_init_data(adev); 164 165 if (adev->virt.req_init_data_ver > 0) 166 dev_info(adev->dev, "host supports REQ_INIT_DATA handshake of critical_region_version %d\n", 167 adev->virt.req_init_data_ver); 168 else 169 dev_warn(adev->dev, "host doesn't support REQ_INIT_DATA handshake\n"); 170 } 171 172 /** 173 * amdgpu_virt_ready_to_reset() - send ready to reset to host 174 * @adev: amdgpu device. 175 * Send ready to reset message to GPU hypervisor to signal we have stopped GPU 176 * activity and is ready for host FLR 177 */ 178 void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev) 179 { 180 struct amdgpu_virt *virt = &adev->virt; 181 182 if (virt->ops && virt->ops->reset_gpu) 183 virt->ops->ready_to_reset(adev); 184 } 185 186 /** 187 * amdgpu_virt_wait_reset() - wait for reset gpu completed 188 * @adev: amdgpu device. 189 * Wait for GPU reset completed. 190 * Return: Zero if reset success, otherwise will return error. 191 */ 192 int amdgpu_virt_wait_reset(struct amdgpu_device *adev) 193 { 194 struct amdgpu_virt *virt = &adev->virt; 195 196 if (!virt->ops || !virt->ops->wait_reset) 197 return -EINVAL; 198 199 return virt->ops->wait_reset(adev); 200 } 201 202 /** 203 * amdgpu_virt_alloc_mm_table() - alloc memory for mm table 204 * @adev: amdgpu device. 205 * MM table is used by UVD and VCE for its initialization 206 * Return: Zero if allocate success. 207 */ 208 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev) 209 { 210 int r; 211 212 if (!amdgpu_sriov_vf(adev) || adev->virt.mm_table.gpu_addr) 213 return 0; 214 215 r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, 216 AMDGPU_GEM_DOMAIN_VRAM | 217 AMDGPU_GEM_DOMAIN_GTT, 218 &adev->virt.mm_table.bo, 219 &adev->virt.mm_table.gpu_addr, 220 (void *)&adev->virt.mm_table.cpu_addr); 221 if (r) { 222 dev_err(adev->dev, "failed to alloc mm table and error = %d.\n", r); 223 return r; 224 } 225 226 memset((void *)adev->virt.mm_table.cpu_addr, 0, PAGE_SIZE); 227 dev_info(adev->dev, "MM table gpu addr = 0x%llx, cpu addr = %p.\n", 228 adev->virt.mm_table.gpu_addr, 229 adev->virt.mm_table.cpu_addr); 230 return 0; 231 } 232 233 /** 234 * amdgpu_virt_free_mm_table() - free mm table memory 235 * @adev: amdgpu device. 236 * Free MM table memory 237 */ 238 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev) 239 { 240 if (!amdgpu_sriov_vf(adev) || !adev->virt.mm_table.gpu_addr) 241 return; 242 243 amdgpu_bo_free_kernel(&adev->virt.mm_table.bo, 244 &adev->virt.mm_table.gpu_addr, 245 (void *)&adev->virt.mm_table.cpu_addr); 246 adev->virt.mm_table.gpu_addr = 0; 247 } 248 249 /** 250 * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt 251 * @adev: amdgpu device. 252 * Check whether host sent RAS error message 253 * Return: true if found, otherwise false 254 */ 255 bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev) 256 { 257 struct amdgpu_virt *virt = &adev->virt; 258 259 if (!virt->ops || !virt->ops->rcvd_ras_intr) 260 return false; 261 262 return virt->ops->rcvd_ras_intr(adev); 263 } 264 265 266 unsigned int amd_sriov_msg_checksum(void *obj, 267 unsigned long obj_size, 268 unsigned int key, 269 unsigned int checksum) 270 { 271 unsigned int ret = key; 272 unsigned long i = 0; 273 unsigned char *pos; 274 275 pos = (char *)obj; 276 /* calculate checksum */ 277 for (i = 0; i < obj_size; ++i) 278 ret += *(pos + i); 279 /* minus the checksum itself */ 280 pos = (char *)&checksum; 281 for (i = 0; i < sizeof(checksum); ++i) 282 ret -= *(pos + i); 283 return ret; 284 } 285 286 static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev) 287 { 288 struct amdgpu_virt *virt = &adev->virt; 289 struct amdgpu_virt_ras_err_handler_data **data = &virt->virt_eh_data; 290 /* GPU will be marked bad on host if bp count more then 10, 291 * so alloc 512 is enough. 292 */ 293 unsigned int align_space = 512; 294 void *bps = NULL; 295 struct amdgpu_bo **bps_bo = NULL; 296 297 *data = kmalloc_obj(struct amdgpu_virt_ras_err_handler_data); 298 if (!*data) 299 goto data_failure; 300 301 bps = kmalloc_objs(*(*data)->bps, align_space); 302 if (!bps) 303 goto bps_failure; 304 305 bps_bo = kmalloc_objs(*(*data)->bps_bo, align_space); 306 if (!bps_bo) 307 goto bps_bo_failure; 308 309 (*data)->bps = bps; 310 (*data)->bps_bo = bps_bo; 311 (*data)->count = 0; 312 (*data)->last_reserved = 0; 313 314 virt->ras_init_done = true; 315 316 return 0; 317 318 bps_bo_failure: 319 kfree(bps); 320 bps_failure: 321 kfree(*data); 322 data_failure: 323 return -ENOMEM; 324 } 325 326 static void amdgpu_virt_ras_release_bp(struct amdgpu_device *adev) 327 { 328 struct amdgpu_virt *virt = &adev->virt; 329 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 330 struct amdgpu_bo *bo; 331 int i; 332 333 if (!data) 334 return; 335 336 for (i = data->last_reserved - 1; i >= 0; i--) { 337 bo = data->bps_bo[i]; 338 if (bo) { 339 amdgpu_bo_free_kernel(&bo, NULL, NULL); 340 data->bps_bo[i] = bo; 341 } 342 data->last_reserved = i; 343 } 344 } 345 346 void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev) 347 { 348 struct amdgpu_virt *virt = &adev->virt; 349 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 350 351 virt->ras_init_done = false; 352 353 if (!data) 354 return; 355 356 amdgpu_virt_ras_release_bp(adev); 357 358 kfree(data->bps); 359 kfree(data->bps_bo); 360 kfree(data); 361 virt->virt_eh_data = NULL; 362 } 363 364 static void amdgpu_virt_ras_add_bps(struct amdgpu_device *adev, 365 struct eeprom_table_record *bps, int pages) 366 { 367 struct amdgpu_virt *virt = &adev->virt; 368 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 369 370 if (!data) 371 return; 372 373 memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps)); 374 data->count += pages; 375 } 376 377 static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev) 378 { 379 struct amdgpu_virt *virt = &adev->virt; 380 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 381 struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; 382 struct ttm_resource_manager *man = &mgr->manager; 383 struct amdgpu_bo *bo = NULL; 384 uint64_t bp; 385 int i; 386 387 if (!data) 388 return; 389 390 for (i = data->last_reserved; i < data->count; i++) { 391 bp = data->bps[i].retired_page; 392 393 /* There are two cases of reserve error should be ignored: 394 * 1) a ras bad page has been allocated (used by someone); 395 * 2) a ras bad page has been reserved (duplicate error injection 396 * for one page); 397 */ 398 if (ttm_resource_manager_used(man)) { 399 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, 400 bp << AMDGPU_GPU_PAGE_SHIFT, 401 AMDGPU_GPU_PAGE_SIZE); 402 data->bps_bo[i] = NULL; 403 } else { 404 if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT, 405 AMDGPU_GPU_PAGE_SIZE, 406 &bo, NULL)) 407 dev_dbg(adev->dev, 408 "RAS WARN: reserve vram for retired page %llx fail\n", 409 bp); 410 data->bps_bo[i] = bo; 411 } 412 data->last_reserved = i + 1; 413 bo = NULL; 414 } 415 } 416 417 static bool amdgpu_virt_ras_check_bad_page(struct amdgpu_device *adev, 418 uint64_t retired_page) 419 { 420 struct amdgpu_virt *virt = &adev->virt; 421 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 422 int i; 423 424 if (!data) 425 return true; 426 427 for (i = 0; i < data->count; i++) 428 if (retired_page == data->bps[i].retired_page) 429 return true; 430 431 return false; 432 } 433 434 static void amdgpu_virt_add_bad_page(struct amdgpu_device *adev, 435 uint64_t bp_block_offset, uint32_t bp_block_size) 436 { 437 struct eeprom_table_record bp; 438 uint64_t retired_page; 439 uint32_t bp_idx, bp_cnt; 440 void *fw_va = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].cpu_ptr; 441 void *drv_va = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].cpu_ptr; 442 void *vram_usage_va = fw_va ? fw_va : drv_va; 443 444 memset(&bp, 0, sizeof(bp)); 445 446 if (bp_block_size) { 447 bp_cnt = bp_block_size / sizeof(uint64_t); 448 for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) { 449 retired_page = *(uint64_t *)(vram_usage_va + 450 bp_block_offset + bp_idx * sizeof(uint64_t)); 451 bp.retired_page = retired_page; 452 453 if (amdgpu_virt_ras_check_bad_page(adev, retired_page)) 454 continue; 455 456 amdgpu_virt_ras_add_bps(adev, &bp, 1); 457 458 amdgpu_virt_ras_reserve_bps(adev); 459 } 460 } 461 } 462 463 static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) 464 { 465 struct amd_sriov_msg_pf2vf_info_header *pf2vf_info = adev->virt.fw_reserve.p_pf2vf; 466 struct amdgim_pf2vf_info_v1 *pf2vf_v1; 467 struct amd_sriov_msg_pf2vf_info *pf2vf; 468 469 uint32_t checksum; 470 uint32_t checkval; 471 472 uint32_t i; 473 uint32_t tmp; 474 475 if (adev->virt.fw_reserve.p_pf2vf == NULL) 476 return -EINVAL; 477 478 if (pf2vf_info->size > 1024) { 479 dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size); 480 return -EINVAL; 481 } 482 483 switch (pf2vf_info->version) { 484 case 1: 485 pf2vf_v1 = (struct amdgim_pf2vf_info_v1 *)pf2vf_info; 486 checksum = pf2vf_v1->checksum; 487 checkval = amd_sriov_msg_checksum( 488 adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, 489 adev->virt.fw_reserve.checksum_key, checksum); 490 if (checksum != checkval) { 491 dev_err(adev->dev, 492 "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", 493 checksum, checkval); 494 return -EINVAL; 495 } 496 497 adev->virt.gim_feature = pf2vf_v1->feature_flags; 498 break; 499 case 2: 500 /* TODO: missing key, need to add it later */ 501 pf2vf = (struct amd_sriov_msg_pf2vf_info *)pf2vf_info; 502 checksum = pf2vf->checksum; 503 checkval = amd_sriov_msg_checksum( 504 adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, 505 0, checksum); 506 if (checksum != checkval) { 507 dev_err(adev->dev, 508 "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", 509 checksum, checkval); 510 return -EINVAL; 511 } 512 513 adev->virt.vf2pf_update_interval_ms = 514 pf2vf->vf2pf_update_interval_ms; 515 adev->virt.gim_feature = pf2vf->feature_flags.all; 516 adev->virt.reg_access = pf2vf->reg_access_flags.all; 517 518 adev->virt.decode_max_dimension_pixels = 0; 519 adev->virt.decode_max_frame_pixels = 0; 520 adev->virt.encode_max_dimension_pixels = 0; 521 adev->virt.encode_max_frame_pixels = 0; 522 adev->virt.is_mm_bw_enabled = false; 523 for (i = 0; i < AMD_SRIOV_MSG_RESERVE_VCN_INST; i++) { 524 tmp = pf2vf->mm_bw_management[i].decode_max_dimension_pixels; 525 adev->virt.decode_max_dimension_pixels = max(tmp, adev->virt.decode_max_dimension_pixels); 526 527 tmp = pf2vf->mm_bw_management[i].decode_max_frame_pixels; 528 adev->virt.decode_max_frame_pixels = max(tmp, adev->virt.decode_max_frame_pixels); 529 530 tmp = pf2vf->mm_bw_management[i].encode_max_dimension_pixels; 531 adev->virt.encode_max_dimension_pixels = max(tmp, adev->virt.encode_max_dimension_pixels); 532 533 tmp = pf2vf->mm_bw_management[i].encode_max_frame_pixels; 534 adev->virt.encode_max_frame_pixels = max(tmp, adev->virt.encode_max_frame_pixels); 535 } 536 if ((adev->virt.decode_max_dimension_pixels > 0) || (adev->virt.encode_max_dimension_pixels > 0)) 537 adev->virt.is_mm_bw_enabled = true; 538 539 adev->unique_id = pf2vf->uuid; 540 541 adev->unitid = 0; 542 if (amdgpu_sriov_is_unitid_support(adev)) 543 adev->unitid = pf2vf->unitid; 544 545 adev->virt.ras_en_caps.all = pf2vf->ras_en_caps.all; 546 adev->virt.ras_telemetry_en_caps.all = 547 pf2vf->ras_telemetry_en_caps.all; 548 break; 549 default: 550 dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version); 551 return -EINVAL; 552 } 553 554 /* correct too large or too little interval value */ 555 if (adev->virt.vf2pf_update_interval_ms < 200 || adev->virt.vf2pf_update_interval_ms > 10000) 556 adev->virt.vf2pf_update_interval_ms = 2000; 557 558 return 0; 559 } 560 561 static void amdgpu_virt_populate_vf2pf_ucode_info(struct amdgpu_device *adev) 562 { 563 struct amd_sriov_msg_vf2pf_info *vf2pf_info; 564 vf2pf_info = (struct amd_sriov_msg_vf2pf_info *) adev->virt.fw_reserve.p_vf2pf; 565 566 if (adev->virt.fw_reserve.p_vf2pf == NULL) 567 return; 568 569 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_VCE, adev->vce.fw_version); 570 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_UVD, adev->uvd.fw_version); 571 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_MC, adev->gmc.fw_version); 572 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_ME, adev->gfx.me_fw_version); 573 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_PFP, adev->gfx.pfp_fw_version); 574 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_CE, adev->gfx.ce_fw_version); 575 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC, adev->gfx.rlc_fw_version); 576 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC_SRLC, adev->gfx.rlc_srlc_fw_version); 577 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC_SRLG, adev->gfx.rlc_srlg_fw_version); 578 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC_SRLS, adev->gfx.rlc_srls_fw_version); 579 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_MEC, adev->gfx.mec_fw_version); 580 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_MEC2, adev->gfx.mec2_fw_version); 581 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SOS, adev->psp.sos.fw_version); 582 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_ASD, 583 adev->psp.asd_context.bin_desc.fw_version); 584 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_TA_RAS, 585 adev->psp.ras_context.context.bin_desc.fw_version); 586 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_TA_XGMI, 587 adev->psp.xgmi_context.context.bin_desc.fw_version); 588 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SMC, adev->pm.fw_version); 589 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SDMA, adev->sdma.instance[0].fw_version); 590 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SDMA2, adev->sdma.instance[1].fw_version); 591 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_VCN, adev->vcn.fw_version); 592 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_DMCU, adev->dm.dmcu_fw_version); 593 } 594 595 static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev) 596 { 597 struct amd_sriov_msg_vf2pf_info *vf2pf_info; 598 599 vf2pf_info = (struct amd_sriov_msg_vf2pf_info *) adev->virt.fw_reserve.p_vf2pf; 600 601 if (adev->virt.fw_reserve.p_vf2pf == NULL) 602 return -EINVAL; 603 604 memset(vf2pf_info, 0, sizeof(struct amd_sriov_msg_vf2pf_info)); 605 606 vf2pf_info->header.size = sizeof(struct amd_sriov_msg_vf2pf_info); 607 vf2pf_info->header.version = AMD_SRIOV_MSG_FW_VRAM_VF2PF_VER; 608 609 #ifdef MODULE 610 if (THIS_MODULE->version != NULL) 611 strscpy(vf2pf_info->driver_version, THIS_MODULE->version); 612 else 613 #endif 614 strscpy(vf2pf_info->driver_version, "N/A"); 615 616 vf2pf_info->pf2vf_version_required = 0; // no requirement, guest understands all 617 vf2pf_info->driver_cert = 0; 618 vf2pf_info->os_info.all = 0; 619 620 vf2pf_info->fb_usage = ttm_resource_manager_used(&adev->mman.vram_mgr.manager) ? 621 ttm_resource_manager_usage(&adev->mman.vram_mgr.manager) >> 20 : 0; 622 vf2pf_info->fb_vis_usage = 623 amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr) >> 20; 624 vf2pf_info->fb_size = adev->gmc.real_vram_size >> 20; 625 vf2pf_info->fb_vis_size = adev->gmc.visible_vram_size >> 20; 626 627 amdgpu_virt_populate_vf2pf_ucode_info(adev); 628 629 /* TODO: read dynamic info */ 630 vf2pf_info->gfx_usage = 0; 631 vf2pf_info->compute_usage = 0; 632 vf2pf_info->encode_usage = 0; 633 vf2pf_info->decode_usage = 0; 634 635 vf2pf_info->dummy_page_addr = (uint64_t)adev->dummy_page_addr; 636 if (amdgpu_sriov_is_mes_info_enable(adev)) { 637 vf2pf_info->mes_info_addr = 638 (uint64_t)(adev->mes.resource_1_gpu_addr[0] + AMDGPU_GPU_PAGE_SIZE); 639 vf2pf_info->mes_info_size = 640 adev->mes.resource_1[0]->tbo.base.size - AMDGPU_GPU_PAGE_SIZE; 641 } 642 vf2pf_info->checksum = 643 amd_sriov_msg_checksum( 644 vf2pf_info, sizeof(*vf2pf_info), 0, 0); 645 646 return 0; 647 } 648 649 static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) 650 { 651 struct amdgpu_device *adev = container_of(work, struct amdgpu_device, virt.vf2pf_work.work); 652 int ret; 653 654 ret = amdgpu_virt_read_pf2vf_data(adev); 655 if (ret) { 656 adev->virt.vf2pf_update_retry_cnt++; 657 658 if ((amdgpu_virt_rcvd_ras_interrupt(adev) || 659 adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && 660 amdgpu_sriov_runtime(adev)) { 661 662 amdgpu_ras_set_fed(adev, true); 663 if (amdgpu_reset_domain_schedule(adev->reset_domain, 664 &adev->kfd.reset_work)) 665 return; 666 else 667 dev_err(adev->dev, "Failed to queue work! at %s", __func__); 668 } 669 670 goto out; 671 } 672 673 adev->virt.vf2pf_update_retry_cnt = 0; 674 amdgpu_virt_write_vf2pf_data(adev); 675 676 out: 677 schedule_delayed_work(&(adev->virt.vf2pf_work), adev->virt.vf2pf_update_interval_ms); 678 } 679 680 static int amdgpu_virt_read_exchange_data_from_mem(struct amdgpu_device *adev, uint32_t *pfvf_data) 681 { 682 uint32_t dataexchange_offset = 683 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset; 684 uint32_t dataexchange_size = 685 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb << 10; 686 uint64_t pos = 0; 687 688 dev_info(adev->dev, 689 "Got data exchange info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n", 690 dataexchange_offset, dataexchange_size); 691 692 if (!IS_ALIGNED(dataexchange_offset, 4) || !IS_ALIGNED(dataexchange_size, 4)) { 693 dev_err(adev->dev, "Data exchange data not aligned to 4 bytes\n"); 694 return -EINVAL; 695 } 696 697 pos = (uint64_t)dataexchange_offset; 698 amdgpu_device_vram_access(adev, pos, pfvf_data, 699 dataexchange_size, false); 700 701 return 0; 702 } 703 704 void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev) 705 { 706 if (adev->virt.vf2pf_update_interval_ms != 0) { 707 dev_info(adev->dev, "clean up the vf2pf work item\n"); 708 cancel_delayed_work_sync(&adev->virt.vf2pf_work); 709 adev->virt.vf2pf_update_interval_ms = 0; 710 } 711 } 712 713 void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev) 714 { 715 uint32_t *pfvf_data = NULL; 716 void *fw_va = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].cpu_ptr; 717 void *drv_va = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].cpu_ptr; 718 719 adev->virt.fw_reserve.p_pf2vf = NULL; 720 adev->virt.fw_reserve.p_vf2pf = NULL; 721 adev->virt.vf2pf_update_interval_ms = 0; 722 adev->virt.vf2pf_update_retry_cnt = 0; 723 724 if (fw_va && drv_va) { 725 dev_warn(adev->dev, "Currently fw_vram and drv_vram should not have values at the same time!"); 726 } else if (fw_va || drv_va) { 727 /* go through this logic in ip_init and reset to init workqueue*/ 728 amdgpu_virt_exchange_data(adev); 729 730 INIT_DELAYED_WORK(&adev->virt.vf2pf_work, amdgpu_virt_update_vf2pf_work_item); 731 schedule_delayed_work(&(adev->virt.vf2pf_work), msecs_to_jiffies(adev->virt.vf2pf_update_interval_ms)); 732 } else if (adev->bios != NULL) { 733 /* got through this logic in early init stage to get necessary flags, e.g. rlcg_acc related*/ 734 if (adev->virt.req_init_data_ver == GPU_CRIT_REGION_V2) { 735 pfvf_data = 736 kzalloc(adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb << 10, 737 GFP_KERNEL); 738 if (!pfvf_data) { 739 dev_err(adev->dev, "Failed to allocate memory for pfvf_data\n"); 740 return; 741 } 742 743 if (amdgpu_virt_read_exchange_data_from_mem(adev, pfvf_data)) 744 goto free_pfvf_data; 745 746 adev->virt.fw_reserve.p_pf2vf = 747 (struct amd_sriov_msg_pf2vf_info_header *)pfvf_data; 748 749 amdgpu_virt_read_pf2vf_data(adev); 750 751 free_pfvf_data: 752 kfree(pfvf_data); 753 pfvf_data = NULL; 754 adev->virt.fw_reserve.p_pf2vf = NULL; 755 } else { 756 adev->virt.fw_reserve.p_pf2vf = 757 (struct amd_sriov_msg_pf2vf_info_header *) 758 (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10)); 759 760 amdgpu_virt_read_pf2vf_data(adev); 761 } 762 } 763 } 764 765 766 void amdgpu_virt_exchange_data(struct amdgpu_device *adev) 767 { 768 uint64_t bp_block_offset = 0; 769 uint32_t bp_block_size = 0; 770 struct amd_sriov_msg_pf2vf_info *pf2vf_v2 = NULL; 771 void *fw_va = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].cpu_ptr; 772 void *drv_va = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].cpu_ptr; 773 774 if (fw_va || drv_va) { 775 if (fw_va) { 776 if (adev->virt.req_init_data_ver == GPU_CRIT_REGION_V2) { 777 adev->virt.fw_reserve.p_pf2vf = 778 (struct amd_sriov_msg_pf2vf_info_header *) 779 (fw_va + 780 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset); 781 adev->virt.fw_reserve.p_vf2pf = 782 (struct amd_sriov_msg_vf2pf_info_header *) 783 (fw_va + 784 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset + 785 (AMD_SRIOV_MSG_SIZE_KB << 10)); 786 adev->virt.fw_reserve.ras_telemetry = 787 (fw_va + 788 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset); 789 } else { 790 adev->virt.fw_reserve.p_pf2vf = 791 (struct amd_sriov_msg_pf2vf_info_header *) 792 (fw_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10)); 793 adev->virt.fw_reserve.p_vf2pf = 794 (struct amd_sriov_msg_vf2pf_info_header *) 795 (fw_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10)); 796 adev->virt.fw_reserve.ras_telemetry = 797 (fw_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10)); 798 } 799 } else if (drv_va) { 800 adev->virt.fw_reserve.p_pf2vf = 801 (struct amd_sriov_msg_pf2vf_info_header *) 802 (drv_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10)); 803 adev->virt.fw_reserve.p_vf2pf = 804 (struct amd_sriov_msg_vf2pf_info_header *) 805 (drv_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10)); 806 adev->virt.fw_reserve.ras_telemetry = 807 (drv_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10)); 808 } 809 810 amdgpu_virt_read_pf2vf_data(adev); 811 amdgpu_virt_write_vf2pf_data(adev); 812 813 /* bad page handling for version 2 */ 814 if (adev->virt.fw_reserve.p_pf2vf->version == 2) { 815 pf2vf_v2 = (struct amd_sriov_msg_pf2vf_info *)adev->virt.fw_reserve.p_pf2vf; 816 817 bp_block_offset = ((uint64_t)pf2vf_v2->bp_block_offset_low & 0xFFFFFFFF) | 818 ((((uint64_t)pf2vf_v2->bp_block_offset_high) << 32) & 0xFFFFFFFF00000000); 819 bp_block_size = pf2vf_v2->bp_block_size; 820 821 if (bp_block_size && !adev->virt.ras_init_done) 822 amdgpu_virt_init_ras_err_handler_data(adev); 823 824 if (adev->virt.ras_init_done) 825 amdgpu_virt_add_bad_page(adev, bp_block_offset, bp_block_size); 826 } 827 } 828 } 829 830 static u32 amdgpu_virt_init_detect_asic(struct amdgpu_device *adev) 831 { 832 uint32_t reg; 833 834 switch (adev->asic_type) { 835 case CHIP_TONGA: 836 case CHIP_FIJI: 837 reg = RREG32(mmBIF_IOV_FUNC_IDENTIFIER); 838 break; 839 case CHIP_VEGA10: 840 case CHIP_VEGA20: 841 case CHIP_NAVI10: 842 case CHIP_NAVI12: 843 case CHIP_SIENNA_CICHLID: 844 case CHIP_ARCTURUS: 845 case CHIP_ALDEBARAN: 846 case CHIP_IP_DISCOVERY: 847 reg = RREG32(mmRCC_IOV_FUNC_IDENTIFIER); 848 break; 849 default: /* other chip doesn't support SRIOV */ 850 reg = 0; 851 break; 852 } 853 854 if (reg & 1) 855 adev->virt.caps |= AMDGPU_SRIOV_CAPS_IS_VF; 856 857 if (reg & 0x80000000) 858 adev->virt.caps |= AMDGPU_SRIOV_CAPS_ENABLE_IOV; 859 860 if (!reg) { 861 /* passthrough mode exclus sriov mod */ 862 if (is_virtual_machine() && !xen_initial_domain()) 863 adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE; 864 } 865 866 return reg; 867 } 868 869 static bool amdgpu_virt_init_req_data(struct amdgpu_device *adev, u32 reg) 870 { 871 bool is_sriov = false; 872 873 /* we have the ability to check now */ 874 if (amdgpu_sriov_vf(adev)) { 875 is_sriov = true; 876 877 switch (adev->asic_type) { 878 case CHIP_TONGA: 879 case CHIP_FIJI: 880 vi_set_virt_ops(adev); 881 break; 882 case CHIP_VEGA10: 883 soc15_set_virt_ops(adev); 884 #ifdef CONFIG_X86 885 /* not send GPU_INIT_DATA with MS_HYPERV*/ 886 if (!hypervisor_is_type(X86_HYPER_MS_HYPERV)) 887 #endif 888 /* send a dummy GPU_INIT_DATA request to host on vega10 */ 889 amdgpu_virt_request_init_data(adev); 890 break; 891 case CHIP_VEGA20: 892 case CHIP_ARCTURUS: 893 case CHIP_ALDEBARAN: 894 soc15_set_virt_ops(adev); 895 break; 896 case CHIP_NAVI10: 897 case CHIP_NAVI12: 898 case CHIP_SIENNA_CICHLID: 899 case CHIP_IP_DISCOVERY: 900 nv_set_virt_ops(adev); 901 /* try send GPU_INIT_DATA request to host */ 902 amdgpu_virt_request_init_data(adev); 903 break; 904 default: /* other chip doesn't support SRIOV */ 905 is_sriov = false; 906 dev_err(adev->dev, "Unknown asic type: %d!\n", adev->asic_type); 907 break; 908 } 909 } 910 911 return is_sriov; 912 } 913 914 static void amdgpu_virt_init_ras(struct amdgpu_device *adev) 915 { 916 ratelimit_state_init(&adev->virt.ras.ras_error_cnt_rs, 5 * HZ, 1); 917 ratelimit_state_init(&adev->virt.ras.ras_cper_dump_rs, 5 * HZ, 1); 918 ratelimit_state_init(&adev->virt.ras.ras_chk_criti_rs, 5 * HZ, 1); 919 920 ratelimit_set_flags(&adev->virt.ras.ras_error_cnt_rs, 921 RATELIMIT_MSG_ON_RELEASE); 922 ratelimit_set_flags(&adev->virt.ras.ras_cper_dump_rs, 923 RATELIMIT_MSG_ON_RELEASE); 924 ratelimit_set_flags(&adev->virt.ras.ras_chk_criti_rs, 925 RATELIMIT_MSG_ON_RELEASE); 926 927 mutex_init(&adev->virt.ras.ras_telemetry_mutex); 928 mutex_init(&adev->virt.access_req_mutex); 929 930 adev->virt.ras.cper_rptr = 0; 931 } 932 933 static uint8_t amdgpu_virt_crit_region_calc_checksum(uint8_t *buf_start, uint8_t *buf_end) 934 { 935 uint32_t sum = 0; 936 937 if (buf_start >= buf_end) 938 return 0; 939 940 for (; buf_start < buf_end; buf_start++) 941 sum += buf_start[0]; 942 943 return 0xffffffff - sum; 944 } 945 946 int amdgpu_virt_init_critical_region(struct amdgpu_device *adev) 947 { 948 struct amd_sriov_msg_init_data_header *init_data_hdr = NULL; 949 u64 init_hdr_offset = adev->virt.init_data_header.offset; 950 u64 init_hdr_size = (u64)adev->virt.init_data_header.size_kb << 10; /* KB → bytes */ 951 u64 vram_size; 952 u64 end; 953 int r = 0; 954 uint8_t checksum = 0; 955 956 /* Skip below init if critical region version != v2 */ 957 if (adev->virt.req_init_data_ver != GPU_CRIT_REGION_V2) 958 return 0; 959 960 vram_size = RREG32(mmRCC_CONFIG_MEMSIZE); 961 if (!vram_size || vram_size == U32_MAX) 962 return -EINVAL; 963 vram_size <<= 20; 964 965 if (check_add_overflow(init_hdr_offset, init_hdr_size, &end) || end > vram_size) { 966 dev_err(adev->dev, "init_data_header exceeds VRAM size, exiting\n"); 967 return -EINVAL; 968 } 969 970 /* Allocate for init_data_hdr */ 971 init_data_hdr = kzalloc_obj(struct amd_sriov_msg_init_data_header); 972 if (!init_data_hdr) 973 return -ENOMEM; 974 975 amdgpu_device_vram_access(adev, (uint64_t)init_hdr_offset, (uint32_t *)init_data_hdr, 976 sizeof(struct amd_sriov_msg_init_data_header), false); 977 978 /* Table validation */ 979 if (strncmp(init_data_hdr->signature, 980 AMDGPU_SRIOV_CRIT_DATA_SIGNATURE, 981 AMDGPU_SRIOV_CRIT_DATA_SIG_LEN) != 0) { 982 dev_err(adev->dev, "Invalid init data signature: %.4s\n", 983 init_data_hdr->signature); 984 r = -EINVAL; 985 goto out; 986 } 987 988 checksum = amdgpu_virt_crit_region_calc_checksum( 989 (uint8_t *)&init_data_hdr->initdata_offset, 990 (uint8_t *)init_data_hdr + 991 sizeof(struct amd_sriov_msg_init_data_header)); 992 if (checksum != init_data_hdr->checksum) { 993 dev_err(adev->dev, "Found unmatching checksum from calculation 0x%x and init_data 0x%x\n", 994 checksum, init_data_hdr->checksum); 995 r = -EINVAL; 996 goto out; 997 } 998 999 memset(&adev->virt.crit_regn, 0, sizeof(adev->virt.crit_regn)); 1000 memset(adev->virt.crit_regn_tbl, 0, sizeof(adev->virt.crit_regn_tbl)); 1001 1002 adev->virt.crit_regn.offset = init_data_hdr->initdata_offset; 1003 adev->virt.crit_regn.size_kb = init_data_hdr->initdata_size_in_kb; 1004 1005 /* Validation and initialization for each table entry */ 1006 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_IPD_TABLE_ID)) { 1007 if (!init_data_hdr->ip_discovery_size_in_kb || 1008 init_data_hdr->ip_discovery_size_in_kb > DISCOVERY_TMR_SIZE) { 1009 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1010 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_IPD_TABLE_ID], 1011 init_data_hdr->ip_discovery_size_in_kb); 1012 r = -EINVAL; 1013 goto out; 1014 } 1015 1016 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].offset = 1017 init_data_hdr->ip_discovery_offset; 1018 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb = 1019 init_data_hdr->ip_discovery_size_in_kb; 1020 } 1021 1022 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID)) { 1023 if (!init_data_hdr->vbios_img_size_in_kb) { 1024 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1025 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID], 1026 init_data_hdr->vbios_img_size_in_kb); 1027 r = -EINVAL; 1028 goto out; 1029 } 1030 1031 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].offset = 1032 init_data_hdr->vbios_img_offset; 1033 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].size_kb = 1034 init_data_hdr->vbios_img_size_in_kb; 1035 } 1036 1037 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID)) { 1038 if (!init_data_hdr->ras_tele_info_size_in_kb) { 1039 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1040 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID], 1041 init_data_hdr->ras_tele_info_size_in_kb); 1042 r = -EINVAL; 1043 goto out; 1044 } 1045 1046 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset = 1047 init_data_hdr->ras_tele_info_offset; 1048 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].size_kb = 1049 init_data_hdr->ras_tele_info_size_in_kb; 1050 } 1051 1052 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID)) { 1053 if (!init_data_hdr->dataexchange_size_in_kb) { 1054 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1055 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID], 1056 init_data_hdr->dataexchange_size_in_kb); 1057 r = -EINVAL; 1058 goto out; 1059 } 1060 1061 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset = 1062 init_data_hdr->dataexchange_offset; 1063 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb = 1064 init_data_hdr->dataexchange_size_in_kb; 1065 } 1066 1067 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID)) { 1068 if (!init_data_hdr->bad_page_size_in_kb) { 1069 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1070 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID], 1071 init_data_hdr->bad_page_size_in_kb); 1072 r = -EINVAL; 1073 goto out; 1074 } 1075 1076 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].offset = 1077 init_data_hdr->bad_page_info_offset; 1078 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].size_kb = 1079 init_data_hdr->bad_page_size_in_kb; 1080 } 1081 1082 /* Validation for critical region info */ 1083 if (adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb > DISCOVERY_TMR_SIZE) { 1084 dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n", 1085 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb); 1086 r = -EINVAL; 1087 goto out; 1088 } 1089 1090 /* reserved memory starts from crit region base offset with the size of 5MB */ 1091 amdgpu_ttm_init_vram_resv(adev, AMDGPU_RESV_FW_VRAM_USAGE, 1092 adev->virt.crit_regn.offset, 1093 adev->virt.crit_regn.size_kb << 10, true); 1094 dev_info(adev->dev, 1095 "critical region v%d requested to reserve memory start at %08llx with %llu KB.\n", 1096 init_data_hdr->version, 1097 adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].offset, 1098 adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].size >> 10); 1099 1100 adev->virt.is_dynamic_crit_regn_enabled = true; 1101 1102 out: 1103 kfree(init_data_hdr); 1104 init_data_hdr = NULL; 1105 1106 return r; 1107 } 1108 1109 int amdgpu_virt_get_dynamic_data_info(struct amdgpu_device *adev, 1110 int data_id, uint8_t *binary, u32 *size) 1111 { 1112 uint32_t data_offset = 0; 1113 uint32_t data_size = 0; 1114 enum amd_sriov_msg_table_id_enum data_table_id = data_id; 1115 1116 if (data_table_id >= AMD_SRIOV_MSG_MAX_TABLE_ID) 1117 return -EINVAL; 1118 1119 data_offset = adev->virt.crit_regn_tbl[data_table_id].offset; 1120 data_size = adev->virt.crit_regn_tbl[data_table_id].size_kb << 10; 1121 1122 /* Validate on input params */ 1123 if (!binary || !size || *size < (uint64_t)data_size) 1124 return -EINVAL; 1125 1126 /* Proceed to copy the dynamic content */ 1127 amdgpu_device_vram_access(adev, 1128 (uint64_t)data_offset, (uint32_t *)binary, data_size, false); 1129 *size = (uint64_t)data_size; 1130 1131 dev_dbg(adev->dev, 1132 "Got %s info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n", 1133 amdgpu_virt_dynamic_crit_table_name[data_id], data_offset, data_size); 1134 1135 return 0; 1136 } 1137 1138 void amdgpu_virt_init(struct amdgpu_device *adev) 1139 { 1140 bool is_sriov = false; 1141 uint32_t reg = amdgpu_virt_init_detect_asic(adev); 1142 1143 is_sriov = amdgpu_virt_init_req_data(adev, reg); 1144 1145 if (is_sriov) 1146 amdgpu_virt_init_ras(adev); 1147 } 1148 1149 static bool amdgpu_virt_access_debugfs_is_mmio(struct amdgpu_device *adev) 1150 { 1151 return amdgpu_sriov_is_debug(adev) ? true : false; 1152 } 1153 1154 static bool amdgpu_virt_access_debugfs_is_kiq(struct amdgpu_device *adev) 1155 { 1156 return amdgpu_sriov_is_normal(adev) ? true : false; 1157 } 1158 1159 int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev) 1160 { 1161 if (!amdgpu_sriov_vf(adev) || 1162 amdgpu_virt_access_debugfs_is_kiq(adev)) 1163 return 0; 1164 1165 if (amdgpu_virt_access_debugfs_is_mmio(adev)) 1166 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 1167 else 1168 return -EPERM; 1169 1170 return 0; 1171 } 1172 1173 void amdgpu_virt_disable_access_debugfs(struct amdgpu_device *adev) 1174 { 1175 if (amdgpu_sriov_vf(adev)) 1176 adev->virt.caps |= AMDGPU_SRIOV_CAPS_RUNTIME; 1177 } 1178 1179 enum amdgpu_sriov_vf_mode amdgpu_virt_get_sriov_vf_mode(struct amdgpu_device *adev) 1180 { 1181 enum amdgpu_sriov_vf_mode mode; 1182 1183 if (amdgpu_sriov_vf(adev)) { 1184 if (amdgpu_sriov_is_pp_one_vf(adev)) 1185 mode = SRIOV_VF_MODE_ONE_VF; 1186 else 1187 mode = SRIOV_VF_MODE_MULTI_VF; 1188 } else { 1189 mode = SRIOV_VF_MODE_BARE_METAL; 1190 } 1191 1192 return mode; 1193 } 1194 1195 void amdgpu_virt_pre_reset(struct amdgpu_device *adev) 1196 { 1197 /* stop the data exchange thread */ 1198 amdgpu_virt_fini_data_exchange(adev); 1199 amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_FLR); 1200 } 1201 1202 void amdgpu_virt_post_reset(struct amdgpu_device *adev) 1203 { 1204 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) { 1205 /* force set to GFXOFF state after reset, 1206 * to avoid some invalid operation before GC enable 1207 */ 1208 adev->gfx.is_poweron = false; 1209 } 1210 1211 adev->mes.ring[0].sched.ready = false; 1212 } 1213 1214 bool amdgpu_virt_fw_load_skip_check(struct amdgpu_device *adev, uint32_t ucode_id) 1215 { 1216 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 1217 case IP_VERSION(13, 0, 0): 1218 /* no vf autoload, white list */ 1219 if (ucode_id == AMDGPU_UCODE_ID_VCN1 || 1220 ucode_id == AMDGPU_UCODE_ID_VCN) 1221 return false; 1222 else 1223 return true; 1224 case IP_VERSION(11, 0, 9): 1225 case IP_VERSION(11, 0, 7): 1226 /* black list for CHIP_NAVI12 and CHIP_SIENNA_CICHLID */ 1227 if (ucode_id == AMDGPU_UCODE_ID_RLC_G 1228 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL 1229 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM 1230 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM 1231 || ucode_id == AMDGPU_UCODE_ID_SMC) 1232 return true; 1233 else 1234 return false; 1235 case IP_VERSION(13, 0, 10): 1236 /* white list */ 1237 if (ucode_id == AMDGPU_UCODE_ID_CAP 1238 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_PFP 1239 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_ME 1240 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC 1241 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_PFP_P0_STACK 1242 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_PFP_P1_STACK 1243 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_ME_P0_STACK 1244 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_ME_P1_STACK 1245 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P0_STACK 1246 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P1_STACK 1247 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P2_STACK 1248 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P3_STACK 1249 || ucode_id == AMDGPU_UCODE_ID_CP_MES 1250 || ucode_id == AMDGPU_UCODE_ID_CP_MES_DATA 1251 || ucode_id == AMDGPU_UCODE_ID_CP_MES1 1252 || ucode_id == AMDGPU_UCODE_ID_CP_MES1_DATA 1253 || ucode_id == AMDGPU_UCODE_ID_VCN1 1254 || ucode_id == AMDGPU_UCODE_ID_VCN) 1255 return false; 1256 else 1257 return true; 1258 default: 1259 /* lagacy black list */ 1260 if (ucode_id == AMDGPU_UCODE_ID_SDMA0 1261 || ucode_id == AMDGPU_UCODE_ID_SDMA1 1262 || ucode_id == AMDGPU_UCODE_ID_SDMA2 1263 || ucode_id == AMDGPU_UCODE_ID_SDMA3 1264 || ucode_id == AMDGPU_UCODE_ID_SDMA4 1265 || ucode_id == AMDGPU_UCODE_ID_SDMA5 1266 || ucode_id == AMDGPU_UCODE_ID_SDMA6 1267 || ucode_id == AMDGPU_UCODE_ID_SDMA7 1268 || ucode_id == AMDGPU_UCODE_ID_SDMA_RS64 1269 || ucode_id == AMDGPU_UCODE_ID_RLC_G 1270 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL 1271 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM 1272 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM 1273 || ucode_id == AMDGPU_UCODE_ID_SMC) 1274 return true; 1275 else 1276 return false; 1277 } 1278 } 1279 1280 void amdgpu_virt_update_sriov_video_codec(struct amdgpu_device *adev, 1281 struct amdgpu_video_codec_info *encode, uint32_t encode_array_size, 1282 struct amdgpu_video_codec_info *decode, uint32_t decode_array_size) 1283 { 1284 uint32_t i; 1285 1286 if (!adev->virt.is_mm_bw_enabled) 1287 return; 1288 1289 if (encode) { 1290 for (i = 0; i < encode_array_size; i++) { 1291 encode[i].max_width = adev->virt.encode_max_dimension_pixels; 1292 encode[i].max_pixels_per_frame = adev->virt.encode_max_frame_pixels; 1293 if (encode[i].max_width > 0) 1294 encode[i].max_height = encode[i].max_pixels_per_frame / encode[i].max_width; 1295 else 1296 encode[i].max_height = 0; 1297 } 1298 } 1299 1300 if (decode) { 1301 for (i = 0; i < decode_array_size; i++) { 1302 decode[i].max_width = adev->virt.decode_max_dimension_pixels; 1303 decode[i].max_pixels_per_frame = adev->virt.decode_max_frame_pixels; 1304 if (decode[i].max_width > 0) 1305 decode[i].max_height = decode[i].max_pixels_per_frame / decode[i].max_width; 1306 else 1307 decode[i].max_height = 0; 1308 } 1309 } 1310 } 1311 1312 bool amdgpu_virt_get_rlcg_reg_access_flag(struct amdgpu_device *adev, 1313 u32 acc_flags, u32 hwip, 1314 bool write, u32 *rlcg_flag) 1315 { 1316 bool ret = false; 1317 1318 switch (hwip) { 1319 case GC_HWIP: 1320 if (amdgpu_sriov_reg_indirect_gc(adev)) { 1321 *rlcg_flag = 1322 write ? AMDGPU_RLCG_GC_WRITE : AMDGPU_RLCG_GC_READ; 1323 ret = true; 1324 /* only in new version, AMDGPU_REGS_NO_KIQ and 1325 * AMDGPU_REGS_RLC are enabled simultaneously */ 1326 } else if ((acc_flags & AMDGPU_REGS_RLC) && 1327 !(acc_flags & AMDGPU_REGS_NO_KIQ) && write) { 1328 *rlcg_flag = AMDGPU_RLCG_GC_WRITE_LEGACY; 1329 ret = true; 1330 } 1331 break; 1332 case MMHUB_HWIP: 1333 if (amdgpu_sriov_reg_indirect_mmhub(adev) && 1334 (acc_flags & AMDGPU_REGS_RLC) && write) { 1335 *rlcg_flag = AMDGPU_RLCG_MMHUB_WRITE; 1336 ret = true; 1337 } 1338 break; 1339 default: 1340 break; 1341 } 1342 return ret; 1343 } 1344 1345 static u32 amdgpu_virt_rlcg_vfi_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 flag, u32 xcc_id) 1346 { 1347 uint32_t timeout = 100; 1348 uint32_t i; 1349 1350 struct amdgpu_rlcg_reg_access_ctrl *reg_access_ctrl; 1351 void *vfi_cmd; 1352 void *vfi_stat; 1353 void *vfi_addr; 1354 void *vfi_data; 1355 void *vfi_grbm_cntl; 1356 void *vfi_grbm_idx; 1357 uint32_t cmd; 1358 uint32_t stat; 1359 uint32_t addr = offset; 1360 uint32_t data; 1361 uint32_t grbm_cntl_data; 1362 uint32_t grbm_idx_data; 1363 1364 unsigned long flags; 1365 bool is_err = true; 1366 1367 if (!adev->gfx.rlc.rlcg_reg_access_supported) { 1368 dev_err(adev->dev, "VFi interface is not available\n"); 1369 return 0; 1370 } 1371 1372 if (adev->gfx.xcc_mask && (((1 << xcc_id) & adev->gfx.xcc_mask) == 0)) { 1373 dev_err(adev->dev, "VFi invalid XCC, xcc_id=0x%x\n", xcc_id); 1374 return 0; 1375 } 1376 1377 if (amdgpu_device_skip_hw_access(adev)) 1378 return 0; 1379 1380 reg_access_ctrl = &adev->gfx.rlc.reg_access_ctrl[xcc_id]; 1381 vfi_cmd = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_cmd; 1382 vfi_stat = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_stat; 1383 vfi_addr = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_addr; 1384 vfi_data = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_data; 1385 vfi_grbm_cntl = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_grbm_cntl; 1386 vfi_grbm_idx = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_grbm_idx; 1387 grbm_cntl_data = reg_access_ctrl->vfi_grbm_cntl_data; 1388 grbm_idx_data = reg_access_ctrl->vfi_grbm_idx_data; 1389 1390 if (flag == AMDGPU_RLCG_GC_WRITE) { 1391 data = v; 1392 cmd = AMDGPU_RLCG_VFI_CMD__WR; 1393 1394 // the GRBM_GFX_CNTL and GRBM_GFX_INDEX are protected by mutex outside this call 1395 if (addr == reg_access_ctrl->grbm_cntl) { 1396 reg_access_ctrl->vfi_grbm_cntl_data = data; 1397 return 0; 1398 } else if (addr == reg_access_ctrl->grbm_idx) { 1399 reg_access_ctrl->vfi_grbm_idx_data = data; 1400 return 0; 1401 } 1402 1403 } else if (flag == AMDGPU_RLCG_GC_READ) { 1404 data = 0; 1405 cmd = AMDGPU_RLCG_VFI_CMD__RD; 1406 1407 // the GRBM_GFX_CNTL and GRBM_GFX_INDEX are protected by mutex outside this call 1408 if (addr == reg_access_ctrl->grbm_cntl) 1409 return grbm_cntl_data; 1410 else if (addr == reg_access_ctrl->grbm_idx) 1411 return grbm_idx_data; 1412 1413 } else { 1414 dev_err(adev->dev, "VFi invalid access, flag=0x%x\n", flag); 1415 return 0; 1416 } 1417 1418 spin_lock_irqsave(&adev->virt.rlcg_reg_lock, flags); 1419 1420 writel(addr, vfi_addr); 1421 writel(data, vfi_data); 1422 writel(grbm_cntl_data, vfi_grbm_cntl); 1423 writel(grbm_idx_data, vfi_grbm_idx); 1424 1425 writel(AMDGPU_RLCG_VFI_STAT__BUSY, vfi_stat); 1426 writel(cmd, vfi_cmd); 1427 1428 for (i = 0; i < timeout; i++) { 1429 stat = readl(vfi_stat); 1430 if (stat != AMDGPU_RLCG_VFI_STAT__BUSY) 1431 break; 1432 udelay(10); 1433 } 1434 1435 switch (stat) { 1436 case AMDGPU_RLCG_VFI_STAT__DONE: 1437 is_err = false; 1438 if (cmd == AMDGPU_RLCG_VFI_CMD__RD) 1439 data = readl(vfi_data); 1440 break; 1441 case AMDGPU_RLCG_VFI_STAT__BUSY: 1442 dev_err(adev->dev, "VFi access timeout\n"); 1443 break; 1444 case AMDGPU_RLCG_VFI_STAT__INV_CMD: 1445 dev_err(adev->dev, "VFi invalid command\n"); 1446 break; 1447 case AMDGPU_RLCG_VFI_STAT__INV_ADDR: 1448 dev_err(adev->dev, "VFi invalid address\n"); 1449 break; 1450 case AMDGPU_RLCG_VFI_STAT__ERR: 1451 dev_err(adev->dev, "VFi unknown error\n"); 1452 break; 1453 default: 1454 dev_err(adev->dev, "VFi unknown status code\n"); 1455 break; 1456 } 1457 1458 spin_unlock_irqrestore(&adev->virt.rlcg_reg_lock, flags); 1459 1460 if (is_err) 1461 dev_err(adev->dev, "VFi: [grbm_cntl=0x%x grbm_idx=0x%x] addr=0x%x (byte addr 0x%x), data=0x%x, cmd=0x%x\n", 1462 grbm_cntl_data, grbm_idx_data, 1463 addr, addr * 4, data, cmd); 1464 else 1465 dev_dbg(adev->dev, "VFi: [grbm_cntl=0x%x grbm_idx=0x%x] addr=0x%x (byte addr 0x%x), data=0x%x, cmd=0x%x\n", 1466 grbm_cntl_data, grbm_idx_data, 1467 addr, addr * 4, data, cmd); 1468 1469 return data; 1470 } 1471 1472 u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 flag, u32 xcc_id) 1473 { 1474 struct amdgpu_rlcg_reg_access_ctrl *reg_access_ctrl; 1475 uint32_t timeout = 50000; 1476 uint32_t i, tmp; 1477 uint32_t ret = 0; 1478 void *scratch_reg0; 1479 void *scratch_reg1; 1480 void *scratch_reg2; 1481 void *scratch_reg3; 1482 void *spare_int; 1483 unsigned long flags; 1484 1485 if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 1, 0)) 1486 return amdgpu_virt_rlcg_vfi_reg_rw(adev, offset, v, flag, xcc_id); 1487 1488 if (!adev->gfx.rlc.rlcg_reg_access_supported) { 1489 dev_err(adev->dev, 1490 "indirect registers access through rlcg is not available\n"); 1491 return 0; 1492 } 1493 1494 if (adev->gfx.xcc_mask && (((1 << xcc_id) & adev->gfx.xcc_mask) == 0)) { 1495 dev_err(adev->dev, "invalid xcc\n"); 1496 return 0; 1497 } 1498 1499 if (amdgpu_device_skip_hw_access(adev)) 1500 return 0; 1501 1502 reg_access_ctrl = &adev->gfx.rlc.reg_access_ctrl[xcc_id]; 1503 scratch_reg0 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg0; 1504 scratch_reg1 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg1; 1505 scratch_reg2 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg2; 1506 scratch_reg3 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg3; 1507 1508 spin_lock_irqsave(&adev->virt.rlcg_reg_lock, flags); 1509 1510 if (reg_access_ctrl->spare_int) 1511 spare_int = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->spare_int; 1512 1513 if (offset == reg_access_ctrl->grbm_cntl) { 1514 /* if the target reg offset is grbm_cntl, write to scratch_reg2 */ 1515 writel(v, scratch_reg2); 1516 if (flag == AMDGPU_RLCG_GC_WRITE_LEGACY) 1517 writel(v, ((void __iomem *)adev->rmmio) + (offset * 4)); 1518 } else if (offset == reg_access_ctrl->grbm_idx) { 1519 /* if the target reg offset is grbm_idx, write to scratch_reg3 */ 1520 writel(v, scratch_reg3); 1521 if (flag == AMDGPU_RLCG_GC_WRITE_LEGACY) 1522 writel(v, ((void __iomem *)adev->rmmio) + (offset * 4)); 1523 } else { 1524 /* 1525 * SCRATCH_REG0 = read/write value 1526 * SCRATCH_REG1[30:28] = command 1527 * SCRATCH_REG1[19:0] = address in dword 1528 * SCRATCH_REG1[27:24] = Error reporting 1529 */ 1530 writel(v, scratch_reg0); 1531 writel((offset | flag), scratch_reg1); 1532 if (reg_access_ctrl->spare_int) 1533 writel(1, spare_int); 1534 1535 for (i = 0; i < timeout; i++) { 1536 tmp = readl(scratch_reg1); 1537 if (!(tmp & AMDGPU_RLCG_SCRATCH1_ADDRESS_MASK)) 1538 break; 1539 udelay(10); 1540 } 1541 1542 tmp = readl(scratch_reg1); 1543 if (i >= timeout || (tmp & AMDGPU_RLCG_SCRATCH1_ERROR_MASK) != 0) { 1544 if (amdgpu_sriov_rlcg_error_report_enabled(adev)) { 1545 if (tmp & AMDGPU_RLCG_VFGATE_DISABLED) { 1546 dev_err(adev->dev, 1547 "vfgate is disabled, rlcg failed to program reg: 0x%05x\n", offset); 1548 } else if (tmp & AMDGPU_RLCG_WRONG_OPERATION_TYPE) { 1549 dev_err(adev->dev, 1550 "wrong operation type, rlcg failed to program reg: 0x%05x\n", offset); 1551 } else if (tmp & AMDGPU_RLCG_REG_NOT_IN_RANGE) { 1552 dev_err(adev->dev, 1553 "register is not in range, rlcg failed to program reg: 0x%05x\n", offset); 1554 } else { 1555 dev_err(adev->dev, 1556 "unknown error type, rlcg failed to program reg: 0x%05x\n", offset); 1557 } 1558 } else { 1559 dev_err(adev->dev, 1560 "timeout: rlcg faled to program reg: 0x%05x\n", offset); 1561 } 1562 } 1563 } 1564 1565 ret = readl(scratch_reg0); 1566 1567 spin_unlock_irqrestore(&adev->virt.rlcg_reg_lock, flags); 1568 1569 return ret; 1570 } 1571 1572 void amdgpu_sriov_wreg(struct amdgpu_device *adev, 1573 u32 offset, u32 value, 1574 u32 acc_flags, u32 hwip, u32 xcc_id) 1575 { 1576 u32 rlcg_flag; 1577 1578 if (amdgpu_device_skip_hw_access(adev)) 1579 return; 1580 1581 if (!amdgpu_sriov_runtime(adev) && 1582 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, true, &rlcg_flag)) { 1583 amdgpu_virt_rlcg_reg_rw(adev, offset, value, rlcg_flag, xcc_id); 1584 return; 1585 } 1586 1587 if (acc_flags & AMDGPU_REGS_NO_KIQ) 1588 WREG32_NO_KIQ(offset, value); 1589 else 1590 WREG32(offset, value); 1591 } 1592 1593 u32 amdgpu_sriov_rreg(struct amdgpu_device *adev, 1594 u32 offset, u32 acc_flags, u32 hwip, u32 xcc_id) 1595 { 1596 u32 rlcg_flag; 1597 1598 if (amdgpu_device_skip_hw_access(adev)) 1599 return 0; 1600 1601 if (!amdgpu_sriov_runtime(adev) && 1602 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, false, &rlcg_flag)) 1603 return amdgpu_virt_rlcg_reg_rw(adev, offset, 0, rlcg_flag, xcc_id); 1604 1605 if (acc_flags & AMDGPU_REGS_NO_KIQ) 1606 return RREG32_NO_KIQ(offset); 1607 else 1608 return RREG32(offset); 1609 } 1610 1611 bool amdgpu_sriov_xnack_support(struct amdgpu_device *adev) 1612 { 1613 bool xnack_mode = true; 1614 1615 if (amdgpu_sriov_vf(adev) && 1616 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2)) 1617 xnack_mode = false; 1618 1619 return xnack_mode; 1620 } 1621 1622 bool amdgpu_virt_get_ras_capability(struct amdgpu_device *adev) 1623 { 1624 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1625 1626 if (!amdgpu_sriov_ras_caps_en(adev)) 1627 return false; 1628 1629 if (adev->virt.ras_en_caps.bits.block_umc) 1630 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__UMC); 1631 if (adev->virt.ras_en_caps.bits.block_sdma) 1632 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SDMA); 1633 if (adev->virt.ras_en_caps.bits.block_gfx) 1634 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__GFX); 1635 if (adev->virt.ras_en_caps.bits.block_mmhub) 1636 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MMHUB); 1637 if (adev->virt.ras_en_caps.bits.block_athub) 1638 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__ATHUB); 1639 if (adev->virt.ras_en_caps.bits.block_pcie_bif) 1640 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__PCIE_BIF); 1641 if (adev->virt.ras_en_caps.bits.block_hdp) 1642 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__HDP); 1643 if (adev->virt.ras_en_caps.bits.block_xgmi_wafl) 1644 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__XGMI_WAFL); 1645 if (adev->virt.ras_en_caps.bits.block_df) 1646 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__DF); 1647 if (adev->virt.ras_en_caps.bits.block_smn) 1648 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SMN); 1649 if (adev->virt.ras_en_caps.bits.block_sem) 1650 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SEM); 1651 if (adev->virt.ras_en_caps.bits.block_mp0) 1652 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MP0); 1653 if (adev->virt.ras_en_caps.bits.block_mp1) 1654 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MP1); 1655 if (adev->virt.ras_en_caps.bits.block_fuse) 1656 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__FUSE); 1657 if (adev->virt.ras_en_caps.bits.block_mca) 1658 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MCA); 1659 if (adev->virt.ras_en_caps.bits.block_vcn) 1660 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__VCN); 1661 if (adev->virt.ras_en_caps.bits.block_jpeg) 1662 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__JPEG); 1663 if (adev->virt.ras_en_caps.bits.block_ih) 1664 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__IH); 1665 if (adev->virt.ras_en_caps.bits.block_mpio) 1666 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MPIO); 1667 1668 if (adev->virt.ras_en_caps.bits.poison_propogation_mode) 1669 con->poison_supported = true; /* Poison is handled by host */ 1670 1671 if (adev->virt.ras_en_caps.bits.uniras_supported) 1672 amdgpu_virt_ras_set_remote_uniras(adev, true); 1673 1674 return true; 1675 } 1676 1677 static inline enum amd_sriov_ras_telemetry_gpu_block 1678 amdgpu_ras_block_to_sriov(struct amdgpu_device *adev, enum amdgpu_ras_block block) { 1679 switch (block) { 1680 case AMDGPU_RAS_BLOCK__UMC: 1681 return RAS_TELEMETRY_GPU_BLOCK_UMC; 1682 case AMDGPU_RAS_BLOCK__SDMA: 1683 return RAS_TELEMETRY_GPU_BLOCK_SDMA; 1684 case AMDGPU_RAS_BLOCK__GFX: 1685 return RAS_TELEMETRY_GPU_BLOCK_GFX; 1686 case AMDGPU_RAS_BLOCK__MMHUB: 1687 return RAS_TELEMETRY_GPU_BLOCK_MMHUB; 1688 case AMDGPU_RAS_BLOCK__ATHUB: 1689 return RAS_TELEMETRY_GPU_BLOCK_ATHUB; 1690 case AMDGPU_RAS_BLOCK__PCIE_BIF: 1691 return RAS_TELEMETRY_GPU_BLOCK_PCIE_BIF; 1692 case AMDGPU_RAS_BLOCK__HDP: 1693 return RAS_TELEMETRY_GPU_BLOCK_HDP; 1694 case AMDGPU_RAS_BLOCK__XGMI_WAFL: 1695 return RAS_TELEMETRY_GPU_BLOCK_XGMI_WAFL; 1696 case AMDGPU_RAS_BLOCK__DF: 1697 return RAS_TELEMETRY_GPU_BLOCK_DF; 1698 case AMDGPU_RAS_BLOCK__SMN: 1699 return RAS_TELEMETRY_GPU_BLOCK_SMN; 1700 case AMDGPU_RAS_BLOCK__SEM: 1701 return RAS_TELEMETRY_GPU_BLOCK_SEM; 1702 case AMDGPU_RAS_BLOCK__MP0: 1703 return RAS_TELEMETRY_GPU_BLOCK_MP0; 1704 case AMDGPU_RAS_BLOCK__MP1: 1705 return RAS_TELEMETRY_GPU_BLOCK_MP1; 1706 case AMDGPU_RAS_BLOCK__FUSE: 1707 return RAS_TELEMETRY_GPU_BLOCK_FUSE; 1708 case AMDGPU_RAS_BLOCK__MCA: 1709 return RAS_TELEMETRY_GPU_BLOCK_MCA; 1710 case AMDGPU_RAS_BLOCK__VCN: 1711 return RAS_TELEMETRY_GPU_BLOCK_VCN; 1712 case AMDGPU_RAS_BLOCK__JPEG: 1713 return RAS_TELEMETRY_GPU_BLOCK_JPEG; 1714 case AMDGPU_RAS_BLOCK__IH: 1715 return RAS_TELEMETRY_GPU_BLOCK_IH; 1716 case AMDGPU_RAS_BLOCK__MPIO: 1717 return RAS_TELEMETRY_GPU_BLOCK_MPIO; 1718 default: 1719 dev_warn(adev->dev, "Unsupported SRIOV RAS telemetry block 0x%x\n", 1720 block); 1721 return RAS_TELEMETRY_GPU_BLOCK_COUNT; 1722 } 1723 } 1724 1725 static int amdgpu_virt_cache_host_error_counts(struct amdgpu_device *adev, 1726 struct amdsriov_ras_telemetry *host_telemetry) 1727 { 1728 struct amd_sriov_ras_telemetry_error_count *tmp = NULL; 1729 uint32_t checksum, used_size; 1730 1731 checksum = host_telemetry->header.checksum; 1732 used_size = host_telemetry->header.used_size; 1733 1734 if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10)) 1735 return 0; 1736 1737 tmp = kmemdup(&host_telemetry->body.error_count, used_size, GFP_KERNEL); 1738 if (!tmp) 1739 return -ENOMEM; 1740 1741 if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0)) 1742 goto out; 1743 1744 memcpy(&adev->virt.count_cache, tmp, 1745 min(used_size, sizeof(adev->virt.count_cache))); 1746 out: 1747 kfree(tmp); 1748 1749 return 0; 1750 } 1751 1752 static int amdgpu_virt_req_ras_err_count_internal(struct amdgpu_device *adev, bool force_update) 1753 { 1754 struct amdgpu_virt *virt = &adev->virt; 1755 1756 if (!virt->ops || !virt->ops->req_ras_err_count) 1757 return -EOPNOTSUPP; 1758 1759 /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host 1760 * will ignore incoming guest messages. Ratelimit the guest messages to 1761 * prevent guest self DOS. 1762 */ 1763 if (__ratelimit(&virt->ras.ras_error_cnt_rs) || force_update) { 1764 mutex_lock(&virt->ras.ras_telemetry_mutex); 1765 if (!virt->ops->req_ras_err_count(adev)) 1766 amdgpu_virt_cache_host_error_counts(adev, 1767 virt->fw_reserve.ras_telemetry); 1768 mutex_unlock(&virt->ras.ras_telemetry_mutex); 1769 } 1770 1771 return 0; 1772 } 1773 1774 /* Bypass ACA interface and query ECC counts directly from host */ 1775 int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block, 1776 struct ras_err_data *err_data) 1777 { 1778 enum amd_sriov_ras_telemetry_gpu_block sriov_block; 1779 1780 sriov_block = amdgpu_ras_block_to_sriov(adev, block); 1781 1782 if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT || 1783 !amdgpu_sriov_ras_telemetry_block_en(adev, sriov_block)) 1784 return -EOPNOTSUPP; 1785 1786 /* Host Access may be lost during reset, just return last cached data. */ 1787 if (down_read_trylock(&adev->reset_domain->sem)) { 1788 amdgpu_virt_req_ras_err_count_internal(adev, false); 1789 up_read(&adev->reset_domain->sem); 1790 } 1791 1792 err_data->ue_count = adev->virt.count_cache.block[sriov_block].ue_count; 1793 err_data->ce_count = adev->virt.count_cache.block[sriov_block].ce_count; 1794 err_data->de_count = adev->virt.count_cache.block[sriov_block].de_count; 1795 1796 return 0; 1797 } 1798 1799 static int 1800 amdgpu_virt_write_cpers_to_ring(struct amdgpu_device *adev, 1801 struct amdsriov_ras_telemetry *host_telemetry, 1802 u32 *more) 1803 { 1804 struct amd_sriov_ras_cper_dump *cper_dump = NULL; 1805 struct cper_hdr *entry = NULL; 1806 struct amdgpu_ring *ring = &adev->cper.ring_buf; 1807 uint32_t checksum, used_size; 1808 u64 remaining, cnt, i; 1809 int ret = 0; 1810 1811 checksum = host_telemetry->header.checksum; 1812 used_size = host_telemetry->header.used_size; 1813 1814 if (used_size < offsetof(struct amd_sriov_ras_cper_dump, buf) || 1815 used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10)) 1816 return -EINVAL; 1817 1818 cper_dump = kmemdup(&host_telemetry->body.cper_dump, used_size, GFP_KERNEL); 1819 if (!cper_dump) 1820 return -ENOMEM; 1821 1822 if (checksum != amd_sriov_msg_checksum(cper_dump, used_size, 0, 0)) { 1823 ret = -EINVAL; 1824 goto out; 1825 } 1826 1827 *more = cper_dump->more; 1828 1829 if (cper_dump->wptr < adev->virt.ras.cper_rptr) { 1830 dev_warn( 1831 adev->dev, 1832 "guest specified rptr that was too high! guest rptr: 0x%llx, host rptr: 0x%llx\n", 1833 adev->virt.ras.cper_rptr, cper_dump->wptr); 1834 1835 adev->virt.ras.cper_rptr = cper_dump->wptr; 1836 goto out; 1837 } 1838 1839 entry = (struct cper_hdr *)&cper_dump->buf[0]; 1840 remaining = (u64)used_size - offsetof(struct amd_sriov_ras_cper_dump, buf); 1841 cnt = min_t(u64, cper_dump->count, CPER_MAX_ALLOWED_COUNT); 1842 1843 for (i = 0; i < cnt; i++) { 1844 if (entry->record_length < sizeof(struct cper_hdr) || 1845 entry->record_length > remaining) { 1846 ret = -EINVAL; 1847 goto out; 1848 } 1849 1850 amdgpu_cper_ring_write(ring, entry, entry->record_length); 1851 remaining -= entry->record_length; 1852 entry = (struct cper_hdr *)((char *)entry + entry->record_length); 1853 } 1854 1855 if (cper_dump->overflow_count) 1856 dev_warn(adev->dev, 1857 "host reported CPER overflow of 0x%llx entries!\n", 1858 cper_dump->overflow_count); 1859 1860 adev->virt.ras.cper_rptr = cper_dump->wptr; 1861 out: 1862 kfree(cper_dump); 1863 1864 return ret; 1865 } 1866 1867 static int amdgpu_virt_req_ras_cper_dump_internal(struct amdgpu_device *adev) 1868 { 1869 struct amdgpu_virt *virt = &adev->virt; 1870 int ret = 0; 1871 uint32_t more = 0; 1872 1873 if (!virt->ops || !virt->ops->req_ras_cper_dump) 1874 return -EOPNOTSUPP; 1875 1876 do { 1877 if (!virt->ops->req_ras_cper_dump(adev, virt->ras.cper_rptr)) 1878 ret = amdgpu_virt_write_cpers_to_ring( 1879 adev, virt->fw_reserve.ras_telemetry, &more); 1880 else 1881 ret = 0; 1882 } while (more && !ret); 1883 1884 return ret; 1885 } 1886 1887 int amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update) 1888 { 1889 struct amdgpu_virt *virt = &adev->virt; 1890 int ret = 0; 1891 1892 if (!amdgpu_sriov_ras_cper_en(adev)) 1893 return -EOPNOTSUPP; 1894 1895 if ((__ratelimit(&virt->ras.ras_cper_dump_rs) || force_update) && 1896 down_read_trylock(&adev->reset_domain->sem)) { 1897 mutex_lock(&virt->ras.ras_telemetry_mutex); 1898 ret = amdgpu_virt_req_ras_cper_dump_internal(adev); 1899 mutex_unlock(&virt->ras.ras_telemetry_mutex); 1900 up_read(&adev->reset_domain->sem); 1901 } 1902 1903 return ret; 1904 } 1905 1906 int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev) 1907 { 1908 unsigned long ue_count, ce_count; 1909 1910 if (amdgpu_sriov_ras_telemetry_en(adev)) { 1911 amdgpu_virt_req_ras_err_count_internal(adev, true); 1912 amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL); 1913 } 1914 1915 return 0; 1916 } 1917 1918 bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev, 1919 enum amdgpu_ras_block block) 1920 { 1921 enum amd_sriov_ras_telemetry_gpu_block sriov_block; 1922 1923 sriov_block = amdgpu_ras_block_to_sriov(adev, block); 1924 1925 if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT || 1926 !amdgpu_sriov_ras_telemetry_block_en(adev, sriov_block)) 1927 return false; 1928 1929 return true; 1930 } 1931 1932 /* 1933 * amdgpu_virt_request_bad_pages() - request bad pages 1934 * @adev: amdgpu device. 1935 * Send command to GPU hypervisor to write new bad pages into the shared PF2VF region 1936 */ 1937 void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev) 1938 { 1939 struct amdgpu_virt *virt = &adev->virt; 1940 1941 if (virt->ops && virt->ops->req_bad_pages) 1942 virt->ops->req_bad_pages(adev); 1943 } 1944 1945 static int amdgpu_virt_cache_chk_criti_hit(struct amdgpu_device *adev, 1946 struct amdsriov_ras_telemetry *host_telemetry, 1947 bool *hit) 1948 { 1949 struct amd_sriov_ras_chk_criti *tmp = NULL; 1950 uint32_t checksum, used_size; 1951 1952 checksum = host_telemetry->header.checksum; 1953 used_size = host_telemetry->header.used_size; 1954 1955 if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10)) 1956 return 0; 1957 1958 tmp = kmemdup(&host_telemetry->body.chk_criti, used_size, GFP_KERNEL); 1959 if (!tmp) 1960 return -ENOMEM; 1961 1962 if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0)) 1963 goto out; 1964 1965 if (hit) 1966 *hit = tmp->hit ? true : false; 1967 1968 out: 1969 kfree(tmp); 1970 1971 return 0; 1972 } 1973 1974 int amdgpu_virt_check_vf_critical_region(struct amdgpu_device *adev, u64 addr, bool *hit) 1975 { 1976 struct amdgpu_virt *virt = &adev->virt; 1977 int r = -EPERM; 1978 1979 if (!virt->ops || !virt->ops->req_ras_chk_criti) 1980 return -EOPNOTSUPP; 1981 1982 /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host 1983 * will ignore incoming guest messages. Ratelimit the guest messages to 1984 * prevent guest self DOS. 1985 */ 1986 if (__ratelimit(&virt->ras.ras_chk_criti_rs)) { 1987 mutex_lock(&virt->ras.ras_telemetry_mutex); 1988 if (!virt->ops->req_ras_chk_criti(adev, addr)) 1989 r = amdgpu_virt_cache_chk_criti_hit( 1990 adev, virt->fw_reserve.ras_telemetry, hit); 1991 mutex_unlock(&virt->ras.ras_telemetry_mutex); 1992 } 1993 1994 return r; 1995 } 1996 1997 static int req_remote_ras_cmd(struct amdgpu_device *adev, 1998 u32 param1, u32 param2, u32 param3) 1999 { 2000 struct amdgpu_virt *virt = &adev->virt; 2001 2002 if (virt->ops && virt->ops->req_remote_ras_cmd) 2003 return virt->ops->req_remote_ras_cmd(adev, param1, param2, param3); 2004 return -ENOENT; 2005 } 2006 2007 int amdgpu_virt_send_remote_ras_cmd(struct amdgpu_device *adev, 2008 uint64_t buf, uint32_t buf_len) 2009 { 2010 uint64_t gpa = buf; 2011 int ret = -EIO; 2012 2013 if (down_read_trylock(&adev->reset_domain->sem)) { 2014 ret = req_remote_ras_cmd(adev, 2015 lower_32_bits(gpa), upper_32_bits(gpa), buf_len); 2016 up_read(&adev->reset_domain->sem); 2017 } 2018 2019 return ret; 2020 } 2021