/*
 * Copyright 2016 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
21 * 22 */ 23 24 #include <linux/module.h> 25 26 #ifdef CONFIG_X86 27 #include <asm/hypervisor.h> 28 #endif 29 30 #include <drm/drm_drv.h> 31 #include <xen/xen.h> 32 33 #include "amdgpu.h" 34 #include "amdgpu_ras.h" 35 #include "amdgpu_reset.h" 36 #include "amdgpu_dpm.h" 37 #include "vi.h" 38 #include "soc15.h" 39 #include "nv.h" 40 #include "amdgpu_virt_ras_cmd.h" 41 42 #define POPULATE_UCODE_INFO(vf2pf_info, ucode, ver) \ 43 do { \ 44 vf2pf_info->ucode_info[ucode].id = ucode; \ 45 vf2pf_info->ucode_info[ucode].version = ver; \ 46 } while (0) 47 48 #define mmRCC_CONFIG_MEMSIZE 0xde3 49 50 const char *amdgpu_virt_dynamic_crit_table_name[] = { 51 "IP DISCOVERY", 52 "VBIOS IMG", 53 "RAS TELEMETRY", 54 "DATA EXCHANGE", 55 "BAD PAGE INFO", 56 "INIT HEADER", 57 "LAST", 58 }; 59 60 bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev) 61 { 62 /* By now all MMIO pages except mailbox are blocked */ 63 /* if blocking is enabled in hypervisor. Choose the */ 64 /* SCRATCH_REG0 to test. */ 65 return RREG32_NO_KIQ(0xc040) == 0xffffffff; 66 } 67 68 void amdgpu_virt_init_setting(struct amdgpu_device *adev) 69 { 70 struct drm_device *ddev = adev_to_drm(adev); 71 72 /* enable virtual display */ 73 if (adev->asic_type != CHIP_ALDEBARAN && 74 adev->asic_type != CHIP_ARCTURUS && 75 ((adev->pdev->class >> 8) != PCI_CLASS_ACCELERATOR_PROCESSING)) { 76 if (adev->mode_info.num_crtc == 0) 77 adev->mode_info.num_crtc = 1; 78 adev->enable_virtual_display = true; 79 } 80 ddev->driver_features &= ~DRIVER_ATOMIC; 81 adev->cg_flags = 0; 82 adev->pg_flags = 0; 83 84 /* Reduce kcq number to 2 to reduce latency */ 85 if (amdgpu_num_kcq == -1) 86 amdgpu_num_kcq = 2; 87 } 88 89 /** 90 * amdgpu_virt_request_full_gpu() - request full gpu access 91 * @adev: amdgpu device. 92 * @init: is driver init time. 93 * When start to init/fini driver, first need to request full gpu access. 94 * Return: Zero if request success, otherwise will return error. 
95 */ 96 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init) 97 { 98 struct amdgpu_virt *virt = &adev->virt; 99 int r; 100 101 if (virt->ops && virt->ops->req_full_gpu) { 102 r = virt->ops->req_full_gpu(adev, init); 103 if (r) { 104 adev->no_hw_access = true; 105 return r; 106 } 107 108 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 109 } 110 111 return 0; 112 } 113 114 /** 115 * amdgpu_virt_release_full_gpu() - release full gpu access 116 * @adev: amdgpu device. 117 * @init: is driver init time. 118 * When finishing driver init/fini, need to release full gpu access. 119 * Return: Zero if release success, otherwise will returen error. 120 */ 121 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init) 122 { 123 struct amdgpu_virt *virt = &adev->virt; 124 int r; 125 126 if (virt->ops && virt->ops->rel_full_gpu) { 127 r = virt->ops->rel_full_gpu(adev, init); 128 if (r) 129 return r; 130 131 adev->virt.caps |= AMDGPU_SRIOV_CAPS_RUNTIME; 132 } 133 return 0; 134 } 135 136 /** 137 * amdgpu_virt_reset_gpu() - reset gpu 138 * @adev: amdgpu device. 139 * Send reset command to GPU hypervisor to reset GPU that VM is using 140 * Return: Zero if reset success, otherwise will return error. 
141 */ 142 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev) 143 { 144 struct amdgpu_virt *virt = &adev->virt; 145 int r; 146 147 if (virt->ops && virt->ops->reset_gpu) { 148 r = virt->ops->reset_gpu(adev); 149 if (r) 150 return r; 151 152 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 153 } 154 155 return 0; 156 } 157 158 void amdgpu_virt_request_init_data(struct amdgpu_device *adev) 159 { 160 struct amdgpu_virt *virt = &adev->virt; 161 162 if (virt->ops && virt->ops->req_init_data) 163 virt->ops->req_init_data(adev); 164 165 if (adev->virt.req_init_data_ver > 0) 166 dev_info(adev->dev, "host supports REQ_INIT_DATA handshake of critical_region_version %d\n", 167 adev->virt.req_init_data_ver); 168 else 169 dev_warn(adev->dev, "host doesn't support REQ_INIT_DATA handshake\n"); 170 } 171 172 /** 173 * amdgpu_virt_ready_to_reset() - send ready to reset to host 174 * @adev: amdgpu device. 175 * Send ready to reset message to GPU hypervisor to signal we have stopped GPU 176 * activity and is ready for host FLR 177 */ 178 void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev) 179 { 180 struct amdgpu_virt *virt = &adev->virt; 181 182 if (virt->ops && virt->ops->reset_gpu) 183 virt->ops->ready_to_reset(adev); 184 } 185 186 /** 187 * amdgpu_virt_wait_reset() - wait for reset gpu completed 188 * @adev: amdgpu device. 189 * Wait for GPU reset completed. 190 * Return: Zero if reset success, otherwise will return error. 191 */ 192 int amdgpu_virt_wait_reset(struct amdgpu_device *adev) 193 { 194 struct amdgpu_virt *virt = &adev->virt; 195 196 if (!virt->ops || !virt->ops->wait_reset) 197 return -EINVAL; 198 199 return virt->ops->wait_reset(adev); 200 } 201 202 /** 203 * amdgpu_virt_alloc_mm_table() - alloc memory for mm table 204 * @adev: amdgpu device. 205 * MM table is used by UVD and VCE for its initialization 206 * Return: Zero if allocate success. 
207 */ 208 int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev) 209 { 210 int r; 211 212 if (!amdgpu_sriov_vf(adev) || adev->virt.mm_table.gpu_addr) 213 return 0; 214 215 r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, 216 AMDGPU_GEM_DOMAIN_VRAM | 217 AMDGPU_GEM_DOMAIN_GTT, 218 &adev->virt.mm_table.bo, 219 &adev->virt.mm_table.gpu_addr, 220 (void *)&adev->virt.mm_table.cpu_addr); 221 if (r) { 222 dev_err(adev->dev, "failed to alloc mm table and error = %d.\n", r); 223 return r; 224 } 225 226 memset((void *)adev->virt.mm_table.cpu_addr, 0, PAGE_SIZE); 227 dev_info(adev->dev, "MM table gpu addr = 0x%llx, cpu addr = %p.\n", 228 adev->virt.mm_table.gpu_addr, 229 adev->virt.mm_table.cpu_addr); 230 return 0; 231 } 232 233 /** 234 * amdgpu_virt_free_mm_table() - free mm table memory 235 * @adev: amdgpu device. 236 * Free MM table memory 237 */ 238 void amdgpu_virt_free_mm_table(struct amdgpu_device *adev) 239 { 240 if (!amdgpu_sriov_vf(adev) || !adev->virt.mm_table.gpu_addr) 241 return; 242 243 amdgpu_bo_free_kernel(&adev->virt.mm_table.bo, 244 &adev->virt.mm_table.gpu_addr, 245 (void *)&adev->virt.mm_table.cpu_addr); 246 adev->virt.mm_table.gpu_addr = 0; 247 } 248 249 /** 250 * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt 251 * @adev: amdgpu device. 
252 * Check whether host sent RAS error message 253 * Return: true if found, otherwise false 254 */ 255 bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev) 256 { 257 struct amdgpu_virt *virt = &adev->virt; 258 259 if (!virt->ops || !virt->ops->rcvd_ras_intr) 260 return false; 261 262 return virt->ops->rcvd_ras_intr(adev); 263 } 264 265 266 unsigned int amd_sriov_msg_checksum(void *obj, 267 unsigned long obj_size, 268 unsigned int key, 269 unsigned int checksum) 270 { 271 unsigned int ret = key; 272 unsigned long i = 0; 273 unsigned char *pos; 274 275 pos = (char *)obj; 276 /* calculate checksum */ 277 for (i = 0; i < obj_size; ++i) 278 ret += *(pos + i); 279 /* minus the checksum itself */ 280 pos = (char *)&checksum; 281 for (i = 0; i < sizeof(checksum); ++i) 282 ret -= *(pos + i); 283 return ret; 284 } 285 286 #define AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY 512 287 /* Max bad page slots allowed for SRIOV*/ 288 #define AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_MAX_CAPACITY 10665U 289 290 /** 291 * amdgpu_virt_ras_realloc_eh_data_space - alloc/realloc VF bad-page @data->bps and @data->bps_bo 292 * @adev: amdgpu device 293 * @data: VF RAS error-handler data 294 * @pages: minimum number of new slots to add beyond @data->capacity 295 * 296 * Return: 0 on success, %-ENOMEM on failure. 
297 */ 298 static int amdgpu_virt_ras_realloc_eh_data_space(struct amdgpu_device *adev, 299 struct amdgpu_virt_ras_err_handler_data *data, 300 int pages) 301 { 302 struct eeprom_table_record *new_bps; 303 struct amdgpu_bo **new_bo; 304 unsigned int old_space; 305 unsigned int new_space; 306 unsigned int align_space; 307 308 old_space = (unsigned int)data->capacity; 309 new_space = old_space + max_t(unsigned int, (unsigned int)pages, 310 (unsigned int)AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY); 311 if (new_space < old_space || new_space > AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_MAX_CAPACITY) 312 return -ENOMEM; 313 314 align_space = ALIGN(new_space, AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY); 315 if (align_space > AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_MAX_CAPACITY) 316 return -ENOMEM; 317 318 new_bps = kmalloc_array(align_space, sizeof(*data->bps), GFP_KERNEL); 319 new_bo = kcalloc(align_space, sizeof(*data->bps_bo), GFP_KERNEL); 320 if (!new_bps || !new_bo) { 321 kfree(new_bps); 322 kfree(new_bo); 323 dev_warn_ratelimited(adev->dev, 324 "RAS WARN: failed to grow bad page table to %u slots\n", 325 align_space); 326 return -ENOMEM; 327 } 328 329 memcpy(new_bps, data->bps, data->count * sizeof(*data->bps)); 330 memcpy(new_bo, data->bps_bo, data->count * sizeof(*data->bps_bo)); 331 332 kfree(data->bps); 333 kfree(data->bps_bo); 334 data->bps = new_bps; 335 data->bps_bo = new_bo; 336 data->capacity = (int)align_space; 337 338 return 0; 339 } 340 341 static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev) 342 { 343 struct amdgpu_virt *virt = &adev->virt; 344 struct amdgpu_virt_ras_err_handler_data **data = &virt->virt_eh_data; 345 unsigned int align_space = AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY; 346 void *bps = NULL; 347 struct amdgpu_bo **bps_bo = NULL; 348 349 *data = kmalloc_obj(struct amdgpu_virt_ras_err_handler_data); 350 if (!*data) 351 goto data_failure; 352 353 bps = kmalloc_objs(*(*data)->bps, align_space); 354 if (!bps) 355 goto bps_failure; 
356 357 bps_bo = kcalloc(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL); 358 if (!bps_bo) 359 goto bps_bo_failure; 360 361 (*data)->bps = bps; 362 (*data)->bps_bo = bps_bo; 363 (*data)->capacity = align_space; 364 (*data)->count = 0; 365 (*data)->last_reserved = 0; 366 367 virt->ras_init_done = true; 368 369 return 0; 370 371 bps_bo_failure: 372 kfree(bps); 373 bps_failure: 374 kfree(*data); 375 data_failure: 376 return -ENOMEM; 377 } 378 379 static void amdgpu_virt_ras_release_bp(struct amdgpu_device *adev) 380 { 381 struct amdgpu_virt *virt = &adev->virt; 382 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 383 struct amdgpu_bo *bo; 384 int i; 385 386 if (!data) 387 return; 388 389 for (i = data->last_reserved - 1; i >= 0; i--) { 390 bo = data->bps_bo[i]; 391 if (bo) { 392 amdgpu_bo_free_kernel(&bo, NULL, NULL); 393 data->bps_bo[i] = bo; 394 } 395 data->last_reserved = i; 396 } 397 } 398 399 void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev) 400 { 401 struct amdgpu_virt *virt = &adev->virt; 402 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 403 404 virt->ras_init_done = false; 405 406 if (!data) 407 return; 408 409 amdgpu_virt_ras_release_bp(adev); 410 411 kfree(data->bps); 412 kfree(data->bps_bo); 413 kfree(data); 414 virt->virt_eh_data = NULL; 415 } 416 417 static bool amdgpu_virt_ras_add_bps(struct amdgpu_device *adev, 418 const struct eeprom_table_record *bps, int pages) 419 { 420 struct amdgpu_virt *virt = &adev->virt; 421 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 422 int need; 423 424 if (!data || pages <= 0) 425 return false; 426 427 if (pages > AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_MAX_CAPACITY - data->count) { 428 dev_warn_ratelimited(adev->dev, 429 "RAS WARN: bad page table at capacity (count=%d pages=%d max=%u)\n", 430 data->count, pages, 431 AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_MAX_CAPACITY); 432 return false; 433 } 434 435 need = data->count + pages; 436 if (need > 
data->capacity && 437 amdgpu_virt_ras_realloc_eh_data_space(adev, data, need - data->capacity)) 438 return false; 439 440 memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps)); 441 data->count += pages; 442 443 return true; 444 } 445 446 static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev) 447 { 448 struct amdgpu_virt *virt = &adev->virt; 449 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 450 struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; 451 struct ttm_resource_manager *man = &mgr->manager; 452 struct amdgpu_bo *bo = NULL; 453 uint64_t bp; 454 int i; 455 456 if (!data) 457 return; 458 459 for (i = data->last_reserved; i < data->count; i++) { 460 bp = data->bps[i].retired_page; 461 462 /* There are two cases of reserve error should be ignored: 463 * 1) a ras bad page has been allocated (used by someone); 464 * 2) a ras bad page has been reserved (duplicate error injection 465 * for one page); 466 */ 467 if (ttm_resource_manager_used(man)) { 468 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, 469 bp << AMDGPU_GPU_PAGE_SHIFT, 470 AMDGPU_GPU_PAGE_SIZE); 471 data->bps_bo[i] = NULL; 472 } else { 473 if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT, 474 AMDGPU_GPU_PAGE_SIZE, 475 &bo, NULL)) 476 dev_dbg(adev->dev, 477 "RAS WARN: reserve vram for retired page %llx fail\n", 478 bp); 479 data->bps_bo[i] = bo; 480 } 481 data->last_reserved = i + 1; 482 bo = NULL; 483 } 484 } 485 486 static bool amdgpu_virt_ras_check_bad_page(struct amdgpu_device *adev, 487 uint64_t retired_page) 488 { 489 struct amdgpu_virt *virt = &adev->virt; 490 struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; 491 int i; 492 493 if (!data) 494 return true; 495 496 for (i = 0; i < data->count; i++) 497 if (retired_page == data->bps[i].retired_page) 498 return true; 499 500 return false; 501 } 502 503 static void amdgpu_virt_add_bad_page(struct amdgpu_device *adev, 504 uint64_t bp_block_offset, uint32_t bp_block_size) 
505 { 506 struct eeprom_table_record bp; 507 uint64_t retired_page; 508 uint32_t bp_idx, bp_cnt; 509 void *fw_va = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].cpu_ptr; 510 void *drv_va = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].cpu_ptr; 511 void *vram_usage_va = fw_va ? fw_va : drv_va; 512 513 memset(&bp, 0, sizeof(bp)); 514 515 if (!bp_block_size) 516 return; 517 518 bp_cnt = bp_block_size / sizeof(uint64_t); 519 for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) { 520 retired_page = *(uint64_t *)(vram_usage_va + 521 bp_block_offset + bp_idx * sizeof(uint64_t)); 522 bp.retired_page = retired_page; 523 524 if (amdgpu_virt_ras_check_bad_page(adev, retired_page)) 525 continue; 526 527 if (!amdgpu_virt_ras_add_bps(adev, &bp, 1)) 528 break; 529 530 amdgpu_virt_ras_reserve_bps(adev); 531 } 532 } 533 534 static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) 535 { 536 struct amd_sriov_msg_pf2vf_info_header *pf2vf_info = adev->virt.fw_reserve.p_pf2vf; 537 struct amdgim_pf2vf_info_v1 *pf2vf_v1; 538 struct amd_sriov_msg_pf2vf_info *pf2vf; 539 540 uint32_t checksum; 541 uint32_t checkval; 542 543 uint32_t i; 544 uint32_t tmp; 545 546 if (adev->virt.fw_reserve.p_pf2vf == NULL) 547 return -EINVAL; 548 549 if (pf2vf_info->size > 1024) { 550 dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size); 551 return -EINVAL; 552 } 553 554 switch (pf2vf_info->version) { 555 case 1: 556 pf2vf_v1 = (struct amdgim_pf2vf_info_v1 *)pf2vf_info; 557 checksum = pf2vf_v1->checksum; 558 checkval = amd_sriov_msg_checksum( 559 adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, 560 adev->virt.fw_reserve.checksum_key, checksum); 561 if (checksum != checkval) { 562 dev_err(adev->dev, 563 "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", 564 checksum, checkval); 565 return -EINVAL; 566 } 567 568 adev->virt.gim_feature = pf2vf_v1->feature_flags; 569 break; 570 case 2: 571 /* TODO: missing key, need to add it later */ 572 pf2vf = (struct 
amd_sriov_msg_pf2vf_info *)pf2vf_info; 573 checksum = pf2vf->checksum; 574 checkval = amd_sriov_msg_checksum( 575 adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, 576 0, checksum); 577 if (checksum != checkval) { 578 dev_err(adev->dev, 579 "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", 580 checksum, checkval); 581 return -EINVAL; 582 } 583 584 adev->virt.vf2pf_update_interval_ms = 585 pf2vf->vf2pf_update_interval_ms; 586 adev->virt.gim_feature = pf2vf->feature_flags.all; 587 adev->virt.reg_access = pf2vf->reg_access_flags.all; 588 589 adev->virt.decode_max_dimension_pixels = 0; 590 adev->virt.decode_max_frame_pixels = 0; 591 adev->virt.encode_max_dimension_pixels = 0; 592 adev->virt.encode_max_frame_pixels = 0; 593 adev->virt.is_mm_bw_enabled = false; 594 for (i = 0; i < AMD_SRIOV_MSG_RESERVE_VCN_INST; i++) { 595 tmp = pf2vf->mm_bw_management[i].decode_max_dimension_pixels; 596 adev->virt.decode_max_dimension_pixels = max(tmp, adev->virt.decode_max_dimension_pixels); 597 598 tmp = pf2vf->mm_bw_management[i].decode_max_frame_pixels; 599 adev->virt.decode_max_frame_pixels = max(tmp, adev->virt.decode_max_frame_pixels); 600 601 tmp = pf2vf->mm_bw_management[i].encode_max_dimension_pixels; 602 adev->virt.encode_max_dimension_pixels = max(tmp, adev->virt.encode_max_dimension_pixels); 603 604 tmp = pf2vf->mm_bw_management[i].encode_max_frame_pixels; 605 adev->virt.encode_max_frame_pixels = max(tmp, adev->virt.encode_max_frame_pixels); 606 } 607 if ((adev->virt.decode_max_dimension_pixels > 0) || (adev->virt.encode_max_dimension_pixels > 0)) 608 adev->virt.is_mm_bw_enabled = true; 609 610 adev->unique_id = pf2vf->uuid; 611 612 adev->unitid = 0; 613 if (amdgpu_sriov_is_unitid_support(adev)) 614 adev->unitid = pf2vf->unitid; 615 616 adev->virt.ras_en_caps.all = pf2vf->ras_en_caps.all; 617 adev->virt.ras_telemetry_en_caps.all = 618 pf2vf->ras_telemetry_en_caps.all; 619 break; 620 default: 621 dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", 
pf2vf_info->version); 622 return -EINVAL; 623 } 624 625 /* correct too large or too little interval value */ 626 if (adev->virt.vf2pf_update_interval_ms < 200 || adev->virt.vf2pf_update_interval_ms > 10000) 627 adev->virt.vf2pf_update_interval_ms = 2000; 628 629 return 0; 630 } 631 632 static void amdgpu_virt_populate_vf2pf_ucode_info(struct amdgpu_device *adev) 633 { 634 struct amd_sriov_msg_vf2pf_info *vf2pf_info; 635 vf2pf_info = (struct amd_sriov_msg_vf2pf_info *) adev->virt.fw_reserve.p_vf2pf; 636 637 if (adev->virt.fw_reserve.p_vf2pf == NULL) 638 return; 639 640 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_VCE, adev->vce.fw_version); 641 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_UVD, adev->uvd.fw_version); 642 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_MC, adev->gmc.fw_version); 643 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_ME, adev->gfx.me_fw_version); 644 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_PFP, adev->gfx.pfp_fw_version); 645 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_CE, adev->gfx.ce_fw_version); 646 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC, adev->gfx.rlc_fw_version); 647 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC_SRLC, adev->gfx.rlc_srlc_fw_version); 648 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC_SRLG, adev->gfx.rlc_srlg_fw_version); 649 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_RLC_SRLS, adev->gfx.rlc_srls_fw_version); 650 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_MEC, adev->gfx.mec_fw_version); 651 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_MEC2, adev->gfx.mec2_fw_version); 652 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SOS, adev->psp.sos.fw_version); 653 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_ASD, 654 adev->psp.asd_context.bin_desc.fw_version); 655 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_TA_RAS, 656 adev->psp.ras_context.context.bin_desc.fw_version); 657 POPULATE_UCODE_INFO(vf2pf_info, 
AMD_SRIOV_UCODE_ID_TA_XGMI, 658 adev->psp.xgmi_context.context.bin_desc.fw_version); 659 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SMC, adev->pm.fw_version); 660 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SDMA, adev->sdma.instance[0].fw_version); 661 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_SDMA2, adev->sdma.instance[1].fw_version); 662 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_VCN, adev->vcn.fw_version); 663 POPULATE_UCODE_INFO(vf2pf_info, AMD_SRIOV_UCODE_ID_DMCU, adev->dm.dmcu_fw_version); 664 } 665 666 static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev) 667 { 668 struct amd_sriov_msg_vf2pf_info *vf2pf_info; 669 670 vf2pf_info = (struct amd_sriov_msg_vf2pf_info *) adev->virt.fw_reserve.p_vf2pf; 671 672 if (adev->virt.fw_reserve.p_vf2pf == NULL) 673 return -EINVAL; 674 675 memset(vf2pf_info, 0, sizeof(struct amd_sriov_msg_vf2pf_info)); 676 677 vf2pf_info->header.size = sizeof(struct amd_sriov_msg_vf2pf_info); 678 vf2pf_info->header.version = AMD_SRIOV_MSG_FW_VRAM_VF2PF_VER; 679 680 #ifdef MODULE 681 if (THIS_MODULE->version != NULL) 682 strscpy(vf2pf_info->driver_version, THIS_MODULE->version); 683 else 684 #endif 685 strscpy(vf2pf_info->driver_version, "N/A"); 686 687 vf2pf_info->pf2vf_version_required = 0; // no requirement, guest understands all 688 vf2pf_info->driver_cert = 0; 689 vf2pf_info->os_info.all = 0; 690 691 vf2pf_info->fb_usage = ttm_resource_manager_used(&adev->mman.vram_mgr.manager) ? 
692 ttm_resource_manager_usage(&adev->mman.vram_mgr.manager) >> 20 : 0; 693 vf2pf_info->fb_vis_usage = 694 amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr) >> 20; 695 vf2pf_info->fb_size = adev->gmc.real_vram_size >> 20; 696 vf2pf_info->fb_vis_size = adev->gmc.visible_vram_size >> 20; 697 698 amdgpu_virt_populate_vf2pf_ucode_info(adev); 699 700 /* TODO: read dynamic info */ 701 vf2pf_info->gfx_usage = 0; 702 vf2pf_info->compute_usage = 0; 703 vf2pf_info->encode_usage = 0; 704 vf2pf_info->decode_usage = 0; 705 706 vf2pf_info->dummy_page_addr = (uint64_t)adev->dummy_page_addr; 707 if (amdgpu_sriov_is_mes_info_enable(adev)) { 708 vf2pf_info->mes_info_addr = 709 (uint64_t)(adev->mes.resource_1_gpu_addr[0] + AMDGPU_GPU_PAGE_SIZE); 710 vf2pf_info->mes_info_size = 711 adev->mes.resource_1[0]->tbo.base.size - AMDGPU_GPU_PAGE_SIZE; 712 } 713 vf2pf_info->checksum = 714 amd_sriov_msg_checksum( 715 vf2pf_info, sizeof(*vf2pf_info), 0, 0); 716 717 return 0; 718 } 719 720 static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) 721 { 722 struct amdgpu_device *adev = container_of(work, struct amdgpu_device, virt.vf2pf_work.work); 723 int ret; 724 725 ret = amdgpu_virt_read_pf2vf_data(adev); 726 if (ret) { 727 adev->virt.vf2pf_update_retry_cnt++; 728 729 if ((amdgpu_virt_rcvd_ras_interrupt(adev) || 730 adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && 731 amdgpu_sriov_runtime(adev)) { 732 733 amdgpu_ras_set_fed(adev, true); 734 if (amdgpu_reset_domain_schedule(adev->reset_domain, 735 &adev->kfd.reset_work)) 736 return; 737 else 738 dev_err(adev->dev, "Failed to queue work! 
at %s", __func__); 739 } 740 741 goto out; 742 } 743 744 adev->virt.vf2pf_update_retry_cnt = 0; 745 amdgpu_virt_write_vf2pf_data(adev); 746 747 out: 748 schedule_delayed_work(&(adev->virt.vf2pf_work), adev->virt.vf2pf_update_interval_ms); 749 } 750 751 static int amdgpu_virt_read_exchange_data_from_mem(struct amdgpu_device *adev, uint32_t *pfvf_data) 752 { 753 uint32_t dataexchange_offset = 754 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset; 755 uint32_t dataexchange_size = 756 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb << 10; 757 uint64_t pos = 0; 758 759 dev_info(adev->dev, 760 "Got data exchange info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n", 761 dataexchange_offset, dataexchange_size); 762 763 if (!IS_ALIGNED(dataexchange_offset, 4) || !IS_ALIGNED(dataexchange_size, 4)) { 764 dev_err(adev->dev, "Data exchange data not aligned to 4 bytes\n"); 765 return -EINVAL; 766 } 767 768 pos = (uint64_t)dataexchange_offset; 769 amdgpu_device_vram_access(adev, pos, pfvf_data, 770 dataexchange_size, false); 771 772 return 0; 773 } 774 775 void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev) 776 { 777 if (adev->virt.vf2pf_update_interval_ms != 0) { 778 dev_info(adev->dev, "clean up the vf2pf work item\n"); 779 cancel_delayed_work_sync(&adev->virt.vf2pf_work); 780 adev->virt.vf2pf_update_interval_ms = 0; 781 } 782 } 783 784 void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev) 785 { 786 uint32_t *pfvf_data = NULL; 787 void *fw_va = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].cpu_ptr; 788 void *drv_va = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].cpu_ptr; 789 790 adev->virt.fw_reserve.p_pf2vf = NULL; 791 adev->virt.fw_reserve.p_vf2pf = NULL; 792 adev->virt.vf2pf_update_interval_ms = 0; 793 adev->virt.vf2pf_update_retry_cnt = 0; 794 795 if (fw_va && drv_va) { 796 dev_warn(adev->dev, "Currently fw_vram and drv_vram should not have values at the same time!"); 797 } 
else if (fw_va || drv_va) { 798 /* go through this logic in ip_init and reset to init workqueue*/ 799 amdgpu_virt_exchange_data(adev); 800 801 INIT_DELAYED_WORK(&adev->virt.vf2pf_work, amdgpu_virt_update_vf2pf_work_item); 802 schedule_delayed_work(&(adev->virt.vf2pf_work), msecs_to_jiffies(adev->virt.vf2pf_update_interval_ms)); 803 } else if (adev->bios != NULL) { 804 /* got through this logic in early init stage to get necessary flags, e.g. rlcg_acc related*/ 805 if (adev->virt.req_init_data_ver == GPU_CRIT_REGION_V2) { 806 pfvf_data = 807 kzalloc(adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb << 10, 808 GFP_KERNEL); 809 if (!pfvf_data) { 810 dev_err(adev->dev, "Failed to allocate memory for pfvf_data\n"); 811 return; 812 } 813 814 if (amdgpu_virt_read_exchange_data_from_mem(adev, pfvf_data)) 815 goto free_pfvf_data; 816 817 adev->virt.fw_reserve.p_pf2vf = 818 (struct amd_sriov_msg_pf2vf_info_header *)pfvf_data; 819 820 amdgpu_virt_read_pf2vf_data(adev); 821 822 free_pfvf_data: 823 kfree(pfvf_data); 824 pfvf_data = NULL; 825 adev->virt.fw_reserve.p_pf2vf = NULL; 826 } else { 827 adev->virt.fw_reserve.p_pf2vf = 828 (struct amd_sriov_msg_pf2vf_info_header *) 829 (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10)); 830 831 amdgpu_virt_read_pf2vf_data(adev); 832 } 833 } 834 } 835 836 837 void amdgpu_virt_exchange_data(struct amdgpu_device *adev) 838 { 839 uint64_t bp_block_offset = 0; 840 uint32_t bp_block_size = 0; 841 struct amd_sriov_msg_pf2vf_info *pf2vf_v2 = NULL; 842 void *fw_va = adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].cpu_ptr; 843 void *drv_va = adev->mman.resv_region[AMDGPU_RESV_DRV_VRAM_USAGE].cpu_ptr; 844 845 if (fw_va || drv_va) { 846 if (fw_va) { 847 if (adev->virt.req_init_data_ver == GPU_CRIT_REGION_V2) { 848 adev->virt.fw_reserve.p_pf2vf = 849 (struct amd_sriov_msg_pf2vf_info_header *) 850 (fw_va + 851 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset); 852 adev->virt.fw_reserve.p_vf2pf = 853 
(struct amd_sriov_msg_vf2pf_info_header *) 854 (fw_va + 855 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset + 856 (AMD_SRIOV_MSG_SIZE_KB << 10)); 857 adev->virt.fw_reserve.ras_telemetry = 858 (fw_va + 859 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset); 860 } else { 861 adev->virt.fw_reserve.p_pf2vf = 862 (struct amd_sriov_msg_pf2vf_info_header *) 863 (fw_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10)); 864 adev->virt.fw_reserve.p_vf2pf = 865 (struct amd_sriov_msg_vf2pf_info_header *) 866 (fw_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10)); 867 adev->virt.fw_reserve.ras_telemetry = 868 (fw_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10)); 869 } 870 } else if (drv_va) { 871 adev->virt.fw_reserve.p_pf2vf = 872 (struct amd_sriov_msg_pf2vf_info_header *) 873 (drv_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10)); 874 adev->virt.fw_reserve.p_vf2pf = 875 (struct amd_sriov_msg_vf2pf_info_header *) 876 (drv_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10)); 877 adev->virt.fw_reserve.ras_telemetry = 878 (drv_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10)); 879 } 880 881 amdgpu_virt_read_pf2vf_data(adev); 882 amdgpu_virt_write_vf2pf_data(adev); 883 884 /* bad page handling for version 2 */ 885 if (adev->virt.fw_reserve.p_pf2vf->version == 2) { 886 pf2vf_v2 = (struct amd_sriov_msg_pf2vf_info *)adev->virt.fw_reserve.p_pf2vf; 887 888 bp_block_offset = ((uint64_t)pf2vf_v2->bp_block_offset_low & 0xFFFFFFFF) | 889 ((((uint64_t)pf2vf_v2->bp_block_offset_high) << 32) & 0xFFFFFFFF00000000); 890 bp_block_size = pf2vf_v2->bp_block_size; 891 892 if (bp_block_size && !adev->virt.ras_init_done) 893 amdgpu_virt_init_ras_err_handler_data(adev); 894 895 if (adev->virt.ras_init_done) 896 amdgpu_virt_add_bad_page(adev, bp_block_offset, bp_block_size); 897 } 898 } 899 } 900 901 static u32 amdgpu_virt_init_detect_asic(struct amdgpu_device *adev) 902 { 903 uint32_t reg; 904 905 switch (adev->asic_type) { 906 case CHIP_TONGA: 907 case CHIP_FIJI: 908 
reg = RREG32(mmBIF_IOV_FUNC_IDENTIFIER); 909 break; 910 case CHIP_VEGA10: 911 case CHIP_VEGA20: 912 case CHIP_NAVI10: 913 case CHIP_NAVI12: 914 case CHIP_SIENNA_CICHLID: 915 case CHIP_ARCTURUS: 916 case CHIP_ALDEBARAN: 917 case CHIP_IP_DISCOVERY: 918 reg = RREG32(mmRCC_IOV_FUNC_IDENTIFIER); 919 break; 920 default: /* other chip doesn't support SRIOV */ 921 reg = 0; 922 break; 923 } 924 925 if (reg & 1) 926 adev->virt.caps |= AMDGPU_SRIOV_CAPS_IS_VF; 927 928 if (reg & 0x80000000) 929 adev->virt.caps |= AMDGPU_SRIOV_CAPS_ENABLE_IOV; 930 931 if (!reg) { 932 /* passthrough mode exclus sriov mod */ 933 if (is_virtual_machine() && !xen_initial_domain()) 934 adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE; 935 } 936 937 return reg; 938 } 939 940 static bool amdgpu_virt_init_req_data(struct amdgpu_device *adev, u32 reg) 941 { 942 bool is_sriov = false; 943 944 /* we have the ability to check now */ 945 if (amdgpu_sriov_vf(adev)) { 946 is_sriov = true; 947 948 switch (adev->asic_type) { 949 case CHIP_TONGA: 950 case CHIP_FIJI: 951 vi_set_virt_ops(adev); 952 break; 953 case CHIP_VEGA10: 954 soc15_set_virt_ops(adev); 955 #ifdef CONFIG_X86 956 /* not send GPU_INIT_DATA with MS_HYPERV*/ 957 if (!hypervisor_is_type(X86_HYPER_MS_HYPERV)) 958 #endif 959 /* send a dummy GPU_INIT_DATA request to host on vega10 */ 960 amdgpu_virt_request_init_data(adev); 961 break; 962 case CHIP_VEGA20: 963 case CHIP_ARCTURUS: 964 case CHIP_ALDEBARAN: 965 soc15_set_virt_ops(adev); 966 break; 967 case CHIP_NAVI10: 968 case CHIP_NAVI12: 969 case CHIP_SIENNA_CICHLID: 970 case CHIP_IP_DISCOVERY: 971 nv_set_virt_ops(adev); 972 /* try send GPU_INIT_DATA request to host */ 973 amdgpu_virt_request_init_data(adev); 974 break; 975 default: /* other chip doesn't support SRIOV */ 976 is_sriov = false; 977 dev_err(adev->dev, "Unknown asic type: %d!\n", adev->asic_type); 978 break; 979 } 980 } 981 982 return is_sriov; 983 } 984 985 static void amdgpu_virt_init_ras(struct amdgpu_device *adev) 986 { 987 
ratelimit_state_init(&adev->virt.ras.ras_error_cnt_rs, 5 * HZ, 1); 988 ratelimit_state_init(&adev->virt.ras.ras_cper_dump_rs, 5 * HZ, 1); 989 ratelimit_state_init(&adev->virt.ras.ras_chk_criti_rs, 5 * HZ, 1); 990 991 ratelimit_set_flags(&adev->virt.ras.ras_error_cnt_rs, 992 RATELIMIT_MSG_ON_RELEASE); 993 ratelimit_set_flags(&adev->virt.ras.ras_cper_dump_rs, 994 RATELIMIT_MSG_ON_RELEASE); 995 ratelimit_set_flags(&adev->virt.ras.ras_chk_criti_rs, 996 RATELIMIT_MSG_ON_RELEASE); 997 998 mutex_init(&adev->virt.ras.ras_telemetry_mutex); 999 mutex_init(&adev->virt.access_req_mutex); 1000 1001 adev->virt.ras.cper_rptr = 0; 1002 } 1003 1004 static uint8_t amdgpu_virt_crit_region_calc_checksum(uint8_t *buf_start, uint8_t *buf_end) 1005 { 1006 uint32_t sum = 0; 1007 1008 if (buf_start >= buf_end) 1009 return 0; 1010 1011 for (; buf_start < buf_end; buf_start++) 1012 sum += buf_start[0]; 1013 1014 return 0xffffffff - sum; 1015 } 1016 1017 int amdgpu_virt_init_critical_region(struct amdgpu_device *adev) 1018 { 1019 struct amd_sriov_msg_init_data_header *init_data_hdr = NULL; 1020 u64 init_hdr_offset = adev->virt.init_data_header.offset; 1021 u64 init_hdr_size = (u64)adev->virt.init_data_header.size_kb << 10; /* KB → bytes */ 1022 u64 vram_size; 1023 u64 end; 1024 int r = 0; 1025 uint8_t checksum = 0; 1026 1027 /* Skip below init if critical region version != v2 */ 1028 if (adev->virt.req_init_data_ver != GPU_CRIT_REGION_V2) 1029 return 0; 1030 1031 vram_size = RREG32(mmRCC_CONFIG_MEMSIZE); 1032 if (!vram_size || vram_size == U32_MAX) 1033 return -EINVAL; 1034 vram_size <<= 20; 1035 1036 if (check_add_overflow(init_hdr_offset, init_hdr_size, &end) || end > vram_size) { 1037 dev_err(adev->dev, "init_data_header exceeds VRAM size, exiting\n"); 1038 return -EINVAL; 1039 } 1040 1041 /* Allocate for init_data_hdr */ 1042 init_data_hdr = kzalloc_obj(struct amd_sriov_msg_init_data_header); 1043 if (!init_data_hdr) 1044 return -ENOMEM; 1045 1046 amdgpu_device_vram_access(adev, 
(uint64_t)init_hdr_offset, (uint32_t *)init_data_hdr, 1047 sizeof(struct amd_sriov_msg_init_data_header), false); 1048 1049 /* Table validation */ 1050 if (strncmp(init_data_hdr->signature, 1051 AMDGPU_SRIOV_CRIT_DATA_SIGNATURE, 1052 AMDGPU_SRIOV_CRIT_DATA_SIG_LEN) != 0) { 1053 dev_err(adev->dev, "Invalid init data signature: %.4s\n", 1054 init_data_hdr->signature); 1055 r = -EINVAL; 1056 goto out; 1057 } 1058 1059 checksum = amdgpu_virt_crit_region_calc_checksum( 1060 (uint8_t *)&init_data_hdr->initdata_offset, 1061 (uint8_t *)init_data_hdr + 1062 sizeof(struct amd_sriov_msg_init_data_header)); 1063 if (checksum != init_data_hdr->checksum) { 1064 dev_err(adev->dev, "Found unmatching checksum from calculation 0x%x and init_data 0x%x\n", 1065 checksum, init_data_hdr->checksum); 1066 r = -EINVAL; 1067 goto out; 1068 } 1069 1070 memset(&adev->virt.crit_regn, 0, sizeof(adev->virt.crit_regn)); 1071 memset(adev->virt.crit_regn_tbl, 0, sizeof(adev->virt.crit_regn_tbl)); 1072 1073 adev->virt.crit_regn.offset = init_data_hdr->initdata_offset; 1074 adev->virt.crit_regn.size_kb = init_data_hdr->initdata_size_in_kb; 1075 1076 /* Validation and initialization for each table entry */ 1077 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_IPD_TABLE_ID)) { 1078 if (!init_data_hdr->ip_discovery_size_in_kb || 1079 init_data_hdr->ip_discovery_size_in_kb > DISCOVERY_TMR_SIZE) { 1080 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1081 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_IPD_TABLE_ID], 1082 init_data_hdr->ip_discovery_size_in_kb); 1083 r = -EINVAL; 1084 goto out; 1085 } 1086 1087 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].offset = 1088 init_data_hdr->ip_discovery_offset; 1089 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb = 1090 init_data_hdr->ip_discovery_size_in_kb; 1091 } 1092 1093 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID)) { 1094 if (!init_data_hdr->vbios_img_size_in_kb) { 1095 
dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1096 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID], 1097 init_data_hdr->vbios_img_size_in_kb); 1098 r = -EINVAL; 1099 goto out; 1100 } 1101 1102 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].offset = 1103 init_data_hdr->vbios_img_offset; 1104 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].size_kb = 1105 init_data_hdr->vbios_img_size_in_kb; 1106 } 1107 1108 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID)) { 1109 if (!init_data_hdr->ras_tele_info_size_in_kb) { 1110 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1111 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID], 1112 init_data_hdr->ras_tele_info_size_in_kb); 1113 r = -EINVAL; 1114 goto out; 1115 } 1116 1117 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset = 1118 init_data_hdr->ras_tele_info_offset; 1119 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].size_kb = 1120 init_data_hdr->ras_tele_info_size_in_kb; 1121 } 1122 1123 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID)) { 1124 if (!init_data_hdr->dataexchange_size_in_kb) { 1125 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1126 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID], 1127 init_data_hdr->dataexchange_size_in_kb); 1128 r = -EINVAL; 1129 goto out; 1130 } 1131 1132 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset = 1133 init_data_hdr->dataexchange_offset; 1134 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb = 1135 init_data_hdr->dataexchange_size_in_kb; 1136 } 1137 1138 if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID)) { 1139 if (!init_data_hdr->bad_page_size_in_kb) { 1140 dev_err(adev->dev, "Invalid %s size: 0x%x\n", 1141 amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID], 1142 
init_data_hdr->bad_page_size_in_kb); 1143 r = -EINVAL; 1144 goto out; 1145 } 1146 1147 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].offset = 1148 init_data_hdr->bad_page_info_offset; 1149 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].size_kb = 1150 init_data_hdr->bad_page_size_in_kb; 1151 } 1152 1153 /* Validation for critical region info */ 1154 if (adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb > DISCOVERY_TMR_SIZE) { 1155 dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n", 1156 adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb); 1157 r = -EINVAL; 1158 goto out; 1159 } 1160 1161 /* reserved memory starts from crit region base offset with the size of 5MB */ 1162 amdgpu_ttm_init_vram_resv(adev, AMDGPU_RESV_FW_VRAM_USAGE, 1163 adev->virt.crit_regn.offset, 1164 adev->virt.crit_regn.size_kb << 10, true); 1165 dev_info(adev->dev, 1166 "critical region v%d requested to reserve memory start at %08llx with %llu KB.\n", 1167 init_data_hdr->version, 1168 adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].offset, 1169 adev->mman.resv_region[AMDGPU_RESV_FW_VRAM_USAGE].size >> 10); 1170 1171 adev->virt.is_dynamic_crit_regn_enabled = true; 1172 1173 out: 1174 kfree(init_data_hdr); 1175 init_data_hdr = NULL; 1176 1177 return r; 1178 } 1179 1180 int amdgpu_virt_get_dynamic_data_info(struct amdgpu_device *adev, 1181 int data_id, uint8_t *binary, u32 *size) 1182 { 1183 uint32_t data_offset = 0; 1184 uint32_t data_size = 0; 1185 enum amd_sriov_msg_table_id_enum data_table_id = data_id; 1186 1187 if (data_table_id >= AMD_SRIOV_MSG_MAX_TABLE_ID) 1188 return -EINVAL; 1189 1190 data_offset = adev->virt.crit_regn_tbl[data_table_id].offset; 1191 data_size = adev->virt.crit_regn_tbl[data_table_id].size_kb << 10; 1192 1193 /* Validate on input params */ 1194 if (!binary || !size || *size < (uint64_t)data_size) 1195 return -EINVAL; 1196 1197 /* Proceed to copy the dynamic content */ 1198 amdgpu_device_vram_access(adev, 1199 
(uint64_t)data_offset, (uint32_t *)binary, data_size, false); 1200 *size = (uint64_t)data_size; 1201 1202 dev_dbg(adev->dev, 1203 "Got %s info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n", 1204 amdgpu_virt_dynamic_crit_table_name[data_id], data_offset, data_size); 1205 1206 return 0; 1207 } 1208 1209 void amdgpu_virt_init(struct amdgpu_device *adev) 1210 { 1211 bool is_sriov = false; 1212 uint32_t reg = amdgpu_virt_init_detect_asic(adev); 1213 1214 is_sriov = amdgpu_virt_init_req_data(adev, reg); 1215 1216 if (is_sriov) 1217 amdgpu_virt_init_ras(adev); 1218 } 1219 1220 static bool amdgpu_virt_access_debugfs_is_mmio(struct amdgpu_device *adev) 1221 { 1222 return amdgpu_sriov_is_debug(adev) ? true : false; 1223 } 1224 1225 static bool amdgpu_virt_access_debugfs_is_kiq(struct amdgpu_device *adev) 1226 { 1227 return amdgpu_sriov_is_normal(adev) ? true : false; 1228 } 1229 1230 int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev) 1231 { 1232 if (!amdgpu_sriov_vf(adev) || 1233 amdgpu_virt_access_debugfs_is_kiq(adev)) 1234 return 0; 1235 1236 if (amdgpu_virt_access_debugfs_is_mmio(adev)) 1237 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 1238 else 1239 return -EPERM; 1240 1241 return 0; 1242 } 1243 1244 void amdgpu_virt_disable_access_debugfs(struct amdgpu_device *adev) 1245 { 1246 if (amdgpu_sriov_vf(adev)) 1247 adev->virt.caps |= AMDGPU_SRIOV_CAPS_RUNTIME; 1248 } 1249 1250 enum amdgpu_sriov_vf_mode amdgpu_virt_get_sriov_vf_mode(struct amdgpu_device *adev) 1251 { 1252 enum amdgpu_sriov_vf_mode mode; 1253 1254 if (amdgpu_sriov_vf(adev)) { 1255 if (amdgpu_sriov_is_pp_one_vf(adev)) 1256 mode = SRIOV_VF_MODE_ONE_VF; 1257 else 1258 mode = SRIOV_VF_MODE_MULTI_VF; 1259 } else { 1260 mode = SRIOV_VF_MODE_BARE_METAL; 1261 } 1262 1263 return mode; 1264 } 1265 1266 void amdgpu_virt_pre_reset(struct amdgpu_device *adev) 1267 { 1268 /* stop the data exchange thread */ 1269 amdgpu_virt_fini_data_exchange(adev); 1270 
amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_FLR); 1271 } 1272 1273 void amdgpu_virt_post_reset(struct amdgpu_device *adev) 1274 { 1275 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) { 1276 /* force set to GFXOFF state after reset, 1277 * to avoid some invalid operation before GC enable 1278 */ 1279 adev->gfx.is_poweron = false; 1280 } 1281 1282 adev->mes.ring[0].sched.ready = false; 1283 } 1284 1285 bool amdgpu_virt_fw_load_skip_check(struct amdgpu_device *adev, uint32_t ucode_id) 1286 { 1287 switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { 1288 case IP_VERSION(13, 0, 0): 1289 /* no vf autoload, white list */ 1290 if (ucode_id == AMDGPU_UCODE_ID_VCN1 || 1291 ucode_id == AMDGPU_UCODE_ID_VCN) 1292 return false; 1293 else 1294 return true; 1295 case IP_VERSION(11, 0, 9): 1296 case IP_VERSION(11, 0, 7): 1297 /* black list for CHIP_NAVI12 and CHIP_SIENNA_CICHLID */ 1298 if (ucode_id == AMDGPU_UCODE_ID_RLC_G 1299 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL 1300 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM 1301 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM 1302 || ucode_id == AMDGPU_UCODE_ID_SMC) 1303 return true; 1304 else 1305 return false; 1306 case IP_VERSION(13, 0, 10): 1307 /* white list */ 1308 if (ucode_id == AMDGPU_UCODE_ID_CAP 1309 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_PFP 1310 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_ME 1311 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC 1312 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_PFP_P0_STACK 1313 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_PFP_P1_STACK 1314 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_ME_P0_STACK 1315 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_ME_P1_STACK 1316 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P0_STACK 1317 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P1_STACK 1318 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P2_STACK 1319 || ucode_id == AMDGPU_UCODE_ID_CP_RS64_MEC_P3_STACK 1320 || ucode_id == AMDGPU_UCODE_ID_CP_MES 1321 || ucode_id == AMDGPU_UCODE_ID_CP_MES_DATA 1322 || 
ucode_id == AMDGPU_UCODE_ID_CP_MES1 1323 || ucode_id == AMDGPU_UCODE_ID_CP_MES1_DATA 1324 || ucode_id == AMDGPU_UCODE_ID_VCN1 1325 || ucode_id == AMDGPU_UCODE_ID_VCN) 1326 return false; 1327 else 1328 return true; 1329 default: 1330 /* lagacy black list */ 1331 if (ucode_id == AMDGPU_UCODE_ID_SDMA0 1332 || ucode_id == AMDGPU_UCODE_ID_SDMA1 1333 || ucode_id == AMDGPU_UCODE_ID_SDMA2 1334 || ucode_id == AMDGPU_UCODE_ID_SDMA3 1335 || ucode_id == AMDGPU_UCODE_ID_SDMA4 1336 || ucode_id == AMDGPU_UCODE_ID_SDMA5 1337 || ucode_id == AMDGPU_UCODE_ID_SDMA6 1338 || ucode_id == AMDGPU_UCODE_ID_SDMA7 1339 || ucode_id == AMDGPU_UCODE_ID_SDMA_RS64 1340 || ucode_id == AMDGPU_UCODE_ID_RLC_G 1341 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL 1342 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM 1343 || ucode_id == AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM 1344 || ucode_id == AMDGPU_UCODE_ID_SMC) 1345 return true; 1346 else 1347 return false; 1348 } 1349 } 1350 1351 void amdgpu_virt_update_sriov_video_codec(struct amdgpu_device *adev, 1352 struct amdgpu_video_codec_info *encode, uint32_t encode_array_size, 1353 struct amdgpu_video_codec_info *decode, uint32_t decode_array_size) 1354 { 1355 uint32_t i; 1356 1357 if (!adev->virt.is_mm_bw_enabled) 1358 return; 1359 1360 if (encode) { 1361 for (i = 0; i < encode_array_size; i++) { 1362 encode[i].max_width = adev->virt.encode_max_dimension_pixels; 1363 encode[i].max_pixels_per_frame = adev->virt.encode_max_frame_pixels; 1364 if (encode[i].max_width > 0) 1365 encode[i].max_height = encode[i].max_pixels_per_frame / encode[i].max_width; 1366 else 1367 encode[i].max_height = 0; 1368 } 1369 } 1370 1371 if (decode) { 1372 for (i = 0; i < decode_array_size; i++) { 1373 decode[i].max_width = adev->virt.decode_max_dimension_pixels; 1374 decode[i].max_pixels_per_frame = adev->virt.decode_max_frame_pixels; 1375 if (decode[i].max_width > 0) 1376 decode[i].max_height = decode[i].max_pixels_per_frame / decode[i].max_width; 1377 else 1378 
decode[i].max_height = 0; 1379 } 1380 } 1381 } 1382 1383 bool amdgpu_virt_get_rlcg_reg_access_flag(struct amdgpu_device *adev, 1384 u32 acc_flags, u32 hwip, 1385 bool write, u32 *rlcg_flag) 1386 { 1387 bool ret = false; 1388 1389 switch (hwip) { 1390 case GC_HWIP: 1391 if (amdgpu_sriov_reg_indirect_gc(adev)) { 1392 *rlcg_flag = 1393 write ? AMDGPU_RLCG_GC_WRITE : AMDGPU_RLCG_GC_READ; 1394 ret = true; 1395 /* only in new version, AMDGPU_REGS_NO_KIQ and 1396 * AMDGPU_REGS_RLC are enabled simultaneously */ 1397 } else if ((acc_flags & AMDGPU_REGS_RLC) && 1398 !(acc_flags & AMDGPU_REGS_NO_KIQ) && write) { 1399 *rlcg_flag = AMDGPU_RLCG_GC_WRITE_LEGACY; 1400 ret = true; 1401 } 1402 break; 1403 case MMHUB_HWIP: 1404 if (amdgpu_sriov_reg_indirect_mmhub(adev) && 1405 (acc_flags & AMDGPU_REGS_RLC) && write) { 1406 *rlcg_flag = AMDGPU_RLCG_MMHUB_WRITE; 1407 ret = true; 1408 } 1409 break; 1410 default: 1411 break; 1412 } 1413 return ret; 1414 } 1415 1416 static u32 amdgpu_virt_rlcg_vfi_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 flag, u32 xcc_id) 1417 { 1418 uint32_t timeout = 100; 1419 uint32_t i; 1420 1421 struct amdgpu_rlcg_reg_access_ctrl *reg_access_ctrl; 1422 void *vfi_cmd; 1423 void *vfi_stat; 1424 void *vfi_addr; 1425 void *vfi_data; 1426 void *vfi_grbm_cntl; 1427 void *vfi_grbm_idx; 1428 uint32_t cmd; 1429 uint32_t stat; 1430 uint32_t addr = offset; 1431 uint32_t data; 1432 uint32_t grbm_cntl_data; 1433 uint32_t grbm_idx_data; 1434 1435 unsigned long flags; 1436 bool is_err = true; 1437 1438 if (!adev->gfx.rlc.rlcg_reg_access_supported) { 1439 dev_err(adev->dev, "VFi interface is not available\n"); 1440 return 0; 1441 } 1442 1443 if (adev->gfx.xcc_mask && (((1 << xcc_id) & adev->gfx.xcc_mask) == 0)) { 1444 dev_err(adev->dev, "VFi invalid XCC, xcc_id=0x%x\n", xcc_id); 1445 return 0; 1446 } 1447 1448 if (amdgpu_device_skip_hw_access(adev)) 1449 return 0; 1450 1451 reg_access_ctrl = &adev->gfx.rlc.reg_access_ctrl[xcc_id]; 1452 vfi_cmd = (void __iomem 
*)adev->rmmio + 4 * reg_access_ctrl->vfi_cmd; 1453 vfi_stat = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_stat; 1454 vfi_addr = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_addr; 1455 vfi_data = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_data; 1456 vfi_grbm_cntl = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_grbm_cntl; 1457 vfi_grbm_idx = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->vfi_grbm_idx; 1458 grbm_cntl_data = reg_access_ctrl->vfi_grbm_cntl_data; 1459 grbm_idx_data = reg_access_ctrl->vfi_grbm_idx_data; 1460 1461 if (flag == AMDGPU_RLCG_GC_WRITE) { 1462 data = v; 1463 cmd = AMDGPU_RLCG_VFI_CMD__WR; 1464 1465 // the GRBM_GFX_CNTL and GRBM_GFX_INDEX are protected by mutex outside this call 1466 if (addr == reg_access_ctrl->grbm_cntl) { 1467 reg_access_ctrl->vfi_grbm_cntl_data = data; 1468 return 0; 1469 } else if (addr == reg_access_ctrl->grbm_idx) { 1470 reg_access_ctrl->vfi_grbm_idx_data = data; 1471 return 0; 1472 } 1473 1474 } else if (flag == AMDGPU_RLCG_GC_READ) { 1475 data = 0; 1476 cmd = AMDGPU_RLCG_VFI_CMD__RD; 1477 1478 // the GRBM_GFX_CNTL and GRBM_GFX_INDEX are protected by mutex outside this call 1479 if (addr == reg_access_ctrl->grbm_cntl) 1480 return grbm_cntl_data; 1481 else if (addr == reg_access_ctrl->grbm_idx) 1482 return grbm_idx_data; 1483 1484 } else { 1485 dev_err(adev->dev, "VFi invalid access, flag=0x%x\n", flag); 1486 return 0; 1487 } 1488 1489 spin_lock_irqsave(&adev->virt.rlcg_reg_lock, flags); 1490 1491 writel(addr, vfi_addr); 1492 writel(data, vfi_data); 1493 writel(grbm_cntl_data, vfi_grbm_cntl); 1494 writel(grbm_idx_data, vfi_grbm_idx); 1495 1496 writel(AMDGPU_RLCG_VFI_STAT__BUSY, vfi_stat); 1497 writel(cmd, vfi_cmd); 1498 1499 for (i = 0; i < timeout; i++) { 1500 stat = readl(vfi_stat); 1501 if (stat != AMDGPU_RLCG_VFI_STAT__BUSY) 1502 break; 1503 udelay(10); 1504 } 1505 1506 switch (stat) { 1507 case AMDGPU_RLCG_VFI_STAT__DONE: 1508 is_err = false; 1509 if (cmd == 
AMDGPU_RLCG_VFI_CMD__RD) 1510 data = readl(vfi_data); 1511 break; 1512 case AMDGPU_RLCG_VFI_STAT__BUSY: 1513 dev_err(adev->dev, "VFi access timeout\n"); 1514 break; 1515 case AMDGPU_RLCG_VFI_STAT__INV_CMD: 1516 dev_err(adev->dev, "VFi invalid command\n"); 1517 break; 1518 case AMDGPU_RLCG_VFI_STAT__INV_ADDR: 1519 dev_err(adev->dev, "VFi invalid address\n"); 1520 break; 1521 case AMDGPU_RLCG_VFI_STAT__ERR: 1522 dev_err(adev->dev, "VFi unknown error\n"); 1523 break; 1524 default: 1525 dev_err(adev->dev, "VFi unknown status code\n"); 1526 break; 1527 } 1528 1529 spin_unlock_irqrestore(&adev->virt.rlcg_reg_lock, flags); 1530 1531 if (is_err) 1532 dev_err(adev->dev, "VFi: [grbm_cntl=0x%x grbm_idx=0x%x] addr=0x%x (byte addr 0x%x), data=0x%x, cmd=0x%x\n", 1533 grbm_cntl_data, grbm_idx_data, 1534 addr, addr * 4, data, cmd); 1535 else 1536 dev_dbg(adev->dev, "VFi: [grbm_cntl=0x%x grbm_idx=0x%x] addr=0x%x (byte addr 0x%x), data=0x%x, cmd=0x%x\n", 1537 grbm_cntl_data, grbm_idx_data, 1538 addr, addr * 4, data, cmd); 1539 1540 return data; 1541 } 1542 1543 u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 flag, u32 xcc_id) 1544 { 1545 struct amdgpu_rlcg_reg_access_ctrl *reg_access_ctrl; 1546 uint32_t timeout = 50000; 1547 uint32_t i, tmp; 1548 uint32_t ret = 0; 1549 void *scratch_reg0; 1550 void *scratch_reg1; 1551 void *scratch_reg2; 1552 void *scratch_reg3; 1553 void *spare_int; 1554 unsigned long flags; 1555 1556 if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 1, 0)) 1557 return amdgpu_virt_rlcg_vfi_reg_rw(adev, offset, v, flag, xcc_id); 1558 1559 if (!adev->gfx.rlc.rlcg_reg_access_supported) { 1560 dev_err(adev->dev, 1561 "indirect registers access through rlcg is not available\n"); 1562 return 0; 1563 } 1564 1565 if (adev->gfx.xcc_mask && (((1 << xcc_id) & adev->gfx.xcc_mask) == 0)) { 1566 dev_err(adev->dev, "invalid xcc\n"); 1567 return 0; 1568 } 1569 1570 if (amdgpu_device_skip_hw_access(adev)) 1571 return 0; 1572 1573 
reg_access_ctrl = &adev->gfx.rlc.reg_access_ctrl[xcc_id]; 1574 scratch_reg0 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg0; 1575 scratch_reg1 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg1; 1576 scratch_reg2 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg2; 1577 scratch_reg3 = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->scratch_reg3; 1578 1579 spin_lock_irqsave(&adev->virt.rlcg_reg_lock, flags); 1580 1581 if (reg_access_ctrl->spare_int) 1582 spare_int = (void __iomem *)adev->rmmio + 4 * reg_access_ctrl->spare_int; 1583 1584 if (offset == reg_access_ctrl->grbm_cntl) { 1585 /* if the target reg offset is grbm_cntl, write to scratch_reg2 */ 1586 writel(v, scratch_reg2); 1587 if (flag == AMDGPU_RLCG_GC_WRITE_LEGACY) 1588 writel(v, ((void __iomem *)adev->rmmio) + (offset * 4)); 1589 } else if (offset == reg_access_ctrl->grbm_idx) { 1590 /* if the target reg offset is grbm_idx, write to scratch_reg3 */ 1591 writel(v, scratch_reg3); 1592 if (flag == AMDGPU_RLCG_GC_WRITE_LEGACY) 1593 writel(v, ((void __iomem *)adev->rmmio) + (offset * 4)); 1594 } else { 1595 /* 1596 * SCRATCH_REG0 = read/write value 1597 * SCRATCH_REG1[30:28] = command 1598 * SCRATCH_REG1[19:0] = address in dword 1599 * SCRATCH_REG1[27:24] = Error reporting 1600 */ 1601 writel(v, scratch_reg0); 1602 writel((offset | flag), scratch_reg1); 1603 if (reg_access_ctrl->spare_int) 1604 writel(1, spare_int); 1605 1606 for (i = 0; i < timeout; i++) { 1607 tmp = readl(scratch_reg1); 1608 if (!(tmp & AMDGPU_RLCG_SCRATCH1_ADDRESS_MASK)) 1609 break; 1610 udelay(10); 1611 } 1612 1613 tmp = readl(scratch_reg1); 1614 if (i >= timeout || (tmp & AMDGPU_RLCG_SCRATCH1_ERROR_MASK) != 0) { 1615 if (amdgpu_sriov_rlcg_error_report_enabled(adev)) { 1616 if (tmp & AMDGPU_RLCG_VFGATE_DISABLED) { 1617 dev_err(adev->dev, 1618 "vfgate is disabled, rlcg failed to program reg: 0x%05x\n", offset); 1619 } else if (tmp & AMDGPU_RLCG_WRONG_OPERATION_TYPE) { 1620 dev_err(adev->dev, 
1621 "wrong operation type, rlcg failed to program reg: 0x%05x\n", offset); 1622 } else if (tmp & AMDGPU_RLCG_REG_NOT_IN_RANGE) { 1623 dev_err(adev->dev, 1624 "register is not in range, rlcg failed to program reg: 0x%05x\n", offset); 1625 } else { 1626 dev_err(adev->dev, 1627 "unknown error type, rlcg failed to program reg: 0x%05x\n", offset); 1628 } 1629 } else { 1630 dev_err(adev->dev, 1631 "timeout: rlcg faled to program reg: 0x%05x\n", offset); 1632 } 1633 } 1634 } 1635 1636 ret = readl(scratch_reg0); 1637 1638 spin_unlock_irqrestore(&adev->virt.rlcg_reg_lock, flags); 1639 1640 return ret; 1641 } 1642 1643 void amdgpu_sriov_wreg(struct amdgpu_device *adev, 1644 u32 offset, u32 value, 1645 u32 acc_flags, u32 hwip, u32 xcc_id) 1646 { 1647 u32 rlcg_flag; 1648 1649 if (amdgpu_device_skip_hw_access(adev)) 1650 return; 1651 1652 if (!amdgpu_sriov_runtime(adev) && 1653 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, true, &rlcg_flag)) { 1654 amdgpu_virt_rlcg_reg_rw(adev, offset, value, rlcg_flag, xcc_id); 1655 return; 1656 } 1657 1658 if (acc_flags & AMDGPU_REGS_NO_KIQ) 1659 WREG32_NO_KIQ(offset, value); 1660 else 1661 WREG32(offset, value); 1662 } 1663 1664 u32 amdgpu_sriov_rreg(struct amdgpu_device *adev, 1665 u32 offset, u32 acc_flags, u32 hwip, u32 xcc_id) 1666 { 1667 u32 rlcg_flag; 1668 1669 if (amdgpu_device_skip_hw_access(adev)) 1670 return 0; 1671 1672 if (!amdgpu_sriov_runtime(adev) && 1673 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, false, &rlcg_flag)) 1674 return amdgpu_virt_rlcg_reg_rw(adev, offset, 0, rlcg_flag, xcc_id); 1675 1676 if (acc_flags & AMDGPU_REGS_NO_KIQ) 1677 return RREG32_NO_KIQ(offset); 1678 else 1679 return RREG32(offset); 1680 } 1681 1682 bool amdgpu_sriov_xnack_support(struct amdgpu_device *adev) 1683 { 1684 bool xnack_mode = true; 1685 1686 if (amdgpu_sriov_vf(adev) && 1687 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2)) 1688 xnack_mode = false; 1689 1690 return xnack_mode; 1691 } 1692 1693 bool 
amdgpu_virt_get_ras_capability(struct amdgpu_device *adev) 1694 { 1695 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 1696 1697 if (!amdgpu_sriov_ras_caps_en(adev)) 1698 return false; 1699 1700 if (adev->virt.ras_en_caps.bits.block_umc) 1701 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__UMC); 1702 if (adev->virt.ras_en_caps.bits.block_sdma) 1703 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SDMA); 1704 if (adev->virt.ras_en_caps.bits.block_gfx) 1705 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__GFX); 1706 if (adev->virt.ras_en_caps.bits.block_mmhub) 1707 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MMHUB); 1708 if (adev->virt.ras_en_caps.bits.block_athub) 1709 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__ATHUB); 1710 if (adev->virt.ras_en_caps.bits.block_pcie_bif) 1711 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__PCIE_BIF); 1712 if (adev->virt.ras_en_caps.bits.block_hdp) 1713 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__HDP); 1714 if (adev->virt.ras_en_caps.bits.block_xgmi_wafl) 1715 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__XGMI_WAFL); 1716 if (adev->virt.ras_en_caps.bits.block_df) 1717 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__DF); 1718 if (adev->virt.ras_en_caps.bits.block_smn) 1719 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SMN); 1720 if (adev->virt.ras_en_caps.bits.block_sem) 1721 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__SEM); 1722 if (adev->virt.ras_en_caps.bits.block_mp0) 1723 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MP0); 1724 if (adev->virt.ras_en_caps.bits.block_mp1) 1725 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MP1); 1726 if (adev->virt.ras_en_caps.bits.block_fuse) 1727 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__FUSE); 1728 if (adev->virt.ras_en_caps.bits.block_mca) 1729 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MCA); 1730 if (adev->virt.ras_en_caps.bits.block_vcn) 1731 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__VCN); 1732 if (adev->virt.ras_en_caps.bits.block_jpeg) 1733 adev->ras_hw_enabled |= 
BIT(AMDGPU_RAS_BLOCK__JPEG); 1734 if (adev->virt.ras_en_caps.bits.block_ih) 1735 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__IH); 1736 if (adev->virt.ras_en_caps.bits.block_mpio) 1737 adev->ras_hw_enabled |= BIT(AMDGPU_RAS_BLOCK__MPIO); 1738 1739 if (adev->virt.ras_en_caps.bits.poison_propogation_mode) 1740 con->poison_supported = true; /* Poison is handled by host */ 1741 1742 if (adev->virt.ras_en_caps.bits.uniras_supported) 1743 amdgpu_virt_ras_set_remote_uniras(adev, true); 1744 1745 return true; 1746 } 1747 1748 static inline enum amd_sriov_ras_telemetry_gpu_block 1749 amdgpu_ras_block_to_sriov(struct amdgpu_device *adev, enum amdgpu_ras_block block) { 1750 switch (block) { 1751 case AMDGPU_RAS_BLOCK__UMC: 1752 return RAS_TELEMETRY_GPU_BLOCK_UMC; 1753 case AMDGPU_RAS_BLOCK__SDMA: 1754 return RAS_TELEMETRY_GPU_BLOCK_SDMA; 1755 case AMDGPU_RAS_BLOCK__GFX: 1756 return RAS_TELEMETRY_GPU_BLOCK_GFX; 1757 case AMDGPU_RAS_BLOCK__MMHUB: 1758 return RAS_TELEMETRY_GPU_BLOCK_MMHUB; 1759 case AMDGPU_RAS_BLOCK__ATHUB: 1760 return RAS_TELEMETRY_GPU_BLOCK_ATHUB; 1761 case AMDGPU_RAS_BLOCK__PCIE_BIF: 1762 return RAS_TELEMETRY_GPU_BLOCK_PCIE_BIF; 1763 case AMDGPU_RAS_BLOCK__HDP: 1764 return RAS_TELEMETRY_GPU_BLOCK_HDP; 1765 case AMDGPU_RAS_BLOCK__XGMI_WAFL: 1766 return RAS_TELEMETRY_GPU_BLOCK_XGMI_WAFL; 1767 case AMDGPU_RAS_BLOCK__DF: 1768 return RAS_TELEMETRY_GPU_BLOCK_DF; 1769 case AMDGPU_RAS_BLOCK__SMN: 1770 return RAS_TELEMETRY_GPU_BLOCK_SMN; 1771 case AMDGPU_RAS_BLOCK__SEM: 1772 return RAS_TELEMETRY_GPU_BLOCK_SEM; 1773 case AMDGPU_RAS_BLOCK__MP0: 1774 return RAS_TELEMETRY_GPU_BLOCK_MP0; 1775 case AMDGPU_RAS_BLOCK__MP1: 1776 return RAS_TELEMETRY_GPU_BLOCK_MP1; 1777 case AMDGPU_RAS_BLOCK__FUSE: 1778 return RAS_TELEMETRY_GPU_BLOCK_FUSE; 1779 case AMDGPU_RAS_BLOCK__MCA: 1780 return RAS_TELEMETRY_GPU_BLOCK_MCA; 1781 case AMDGPU_RAS_BLOCK__VCN: 1782 return RAS_TELEMETRY_GPU_BLOCK_VCN; 1783 case AMDGPU_RAS_BLOCK__JPEG: 1784 return RAS_TELEMETRY_GPU_BLOCK_JPEG; 1785 case 
AMDGPU_RAS_BLOCK__IH: 1786 return RAS_TELEMETRY_GPU_BLOCK_IH; 1787 case AMDGPU_RAS_BLOCK__MPIO: 1788 return RAS_TELEMETRY_GPU_BLOCK_MPIO; 1789 default: 1790 dev_warn(adev->dev, "Unsupported SRIOV RAS telemetry block 0x%x\n", 1791 block); 1792 return RAS_TELEMETRY_GPU_BLOCK_COUNT; 1793 } 1794 } 1795 1796 static int amdgpu_virt_cache_host_error_counts(struct amdgpu_device *adev, 1797 struct amdsriov_ras_telemetry *host_telemetry) 1798 { 1799 struct amd_sriov_ras_telemetry_error_count *tmp = NULL; 1800 uint32_t checksum, used_size; 1801 1802 checksum = host_telemetry->header.checksum; 1803 used_size = host_telemetry->header.used_size; 1804 1805 if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10)) 1806 return 0; 1807 1808 tmp = kmemdup(&host_telemetry->body.error_count, used_size, GFP_KERNEL); 1809 if (!tmp) 1810 return -ENOMEM; 1811 1812 if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0)) 1813 goto out; 1814 1815 memcpy(&adev->virt.count_cache, tmp, 1816 min(used_size, sizeof(adev->virt.count_cache))); 1817 out: 1818 kfree(tmp); 1819 1820 return 0; 1821 } 1822 1823 static int amdgpu_virt_req_ras_err_count_internal(struct amdgpu_device *adev, bool force_update) 1824 { 1825 struct amdgpu_virt *virt = &adev->virt; 1826 1827 if (!virt->ops || !virt->ops->req_ras_err_count) 1828 return -EOPNOTSUPP; 1829 1830 /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host 1831 * will ignore incoming guest messages. Ratelimit the guest messages to 1832 * prevent guest self DOS. 
1833 */ 1834 if (__ratelimit(&virt->ras.ras_error_cnt_rs) || force_update) { 1835 mutex_lock(&virt->ras.ras_telemetry_mutex); 1836 if (!virt->ops->req_ras_err_count(adev)) 1837 amdgpu_virt_cache_host_error_counts(adev, 1838 virt->fw_reserve.ras_telemetry); 1839 mutex_unlock(&virt->ras.ras_telemetry_mutex); 1840 } 1841 1842 return 0; 1843 } 1844 1845 /* Bypass ACA interface and query ECC counts directly from host */ 1846 int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block, 1847 struct ras_err_data *err_data) 1848 { 1849 enum amd_sriov_ras_telemetry_gpu_block sriov_block; 1850 1851 sriov_block = amdgpu_ras_block_to_sriov(adev, block); 1852 1853 if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT || 1854 !amdgpu_sriov_ras_telemetry_block_en(adev, sriov_block)) 1855 return -EOPNOTSUPP; 1856 1857 /* Host Access may be lost during reset, just return last cached data. */ 1858 if (down_read_trylock(&adev->reset_domain->sem)) { 1859 amdgpu_virt_req_ras_err_count_internal(adev, false); 1860 up_read(&adev->reset_domain->sem); 1861 } 1862 1863 err_data->ue_count = adev->virt.count_cache.block[sriov_block].ue_count; 1864 err_data->ce_count = adev->virt.count_cache.block[sriov_block].ce_count; 1865 err_data->de_count = adev->virt.count_cache.block[sriov_block].de_count; 1866 1867 return 0; 1868 } 1869 1870 static int 1871 amdgpu_virt_write_cpers_to_ring(struct amdgpu_device *adev, 1872 struct amdsriov_ras_telemetry *host_telemetry, 1873 u32 *more) 1874 { 1875 struct amd_sriov_ras_cper_dump *cper_dump = NULL; 1876 struct cper_hdr *entry = NULL; 1877 struct amdgpu_ring *ring = &adev->cper.ring_buf; 1878 uint32_t checksum, used_size; 1879 u64 remaining, cnt, i; 1880 int ret = 0; 1881 1882 checksum = host_telemetry->header.checksum; 1883 used_size = host_telemetry->header.used_size; 1884 1885 if (used_size < offsetof(struct amd_sriov_ras_cper_dump, buf) || 1886 used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10)) 1887 return -EINVAL; 1888 
1889 cper_dump = kmemdup(&host_telemetry->body.cper_dump, used_size, GFP_KERNEL); 1890 if (!cper_dump) 1891 return -ENOMEM; 1892 1893 if (checksum != amd_sriov_msg_checksum(cper_dump, used_size, 0, 0)) { 1894 ret = -EINVAL; 1895 goto out; 1896 } 1897 1898 *more = cper_dump->more; 1899 1900 if (cper_dump->wptr < adev->virt.ras.cper_rptr) { 1901 dev_warn( 1902 adev->dev, 1903 "guest specified rptr that was too high! guest rptr: 0x%llx, host rptr: 0x%llx\n", 1904 adev->virt.ras.cper_rptr, cper_dump->wptr); 1905 1906 adev->virt.ras.cper_rptr = cper_dump->wptr; 1907 goto out; 1908 } 1909 1910 entry = (struct cper_hdr *)&cper_dump->buf[0]; 1911 remaining = (u64)used_size - offsetof(struct amd_sriov_ras_cper_dump, buf); 1912 cnt = min_t(u64, cper_dump->count, CPER_MAX_ALLOWED_COUNT); 1913 1914 for (i = 0; i < cnt; i++) { 1915 if (entry->record_length < sizeof(struct cper_hdr) || 1916 entry->record_length > remaining) { 1917 ret = -EINVAL; 1918 goto out; 1919 } 1920 1921 amdgpu_cper_ring_write(ring, entry, entry->record_length); 1922 remaining -= entry->record_length; 1923 entry = (struct cper_hdr *)((char *)entry + entry->record_length); 1924 } 1925 1926 if (cper_dump->overflow_count) 1927 dev_warn(adev->dev, 1928 "host reported CPER overflow of 0x%llx entries!\n", 1929 cper_dump->overflow_count); 1930 1931 adev->virt.ras.cper_rptr = cper_dump->wptr; 1932 out: 1933 kfree(cper_dump); 1934 1935 return ret; 1936 } 1937 1938 static int amdgpu_virt_req_ras_cper_dump_internal(struct amdgpu_device *adev) 1939 { 1940 struct amdgpu_virt *virt = &adev->virt; 1941 int ret = 0; 1942 uint32_t more = 0; 1943 1944 if (!virt->ops || !virt->ops->req_ras_cper_dump) 1945 return -EOPNOTSUPP; 1946 1947 do { 1948 if (!virt->ops->req_ras_cper_dump(adev, virt->ras.cper_rptr)) 1949 ret = amdgpu_virt_write_cpers_to_ring( 1950 adev, virt->fw_reserve.ras_telemetry, &more); 1951 else 1952 ret = 0; 1953 } while (more && !ret); 1954 1955 return ret; 1956 } 1957 1958 int 
amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update) 1959 { 1960 struct amdgpu_virt *virt = &adev->virt; 1961 int ret = 0; 1962 1963 if (!amdgpu_sriov_ras_cper_en(adev)) 1964 return -EOPNOTSUPP; 1965 1966 if ((__ratelimit(&virt->ras.ras_cper_dump_rs) || force_update) && 1967 down_read_trylock(&adev->reset_domain->sem)) { 1968 mutex_lock(&virt->ras.ras_telemetry_mutex); 1969 ret = amdgpu_virt_req_ras_cper_dump_internal(adev); 1970 mutex_unlock(&virt->ras.ras_telemetry_mutex); 1971 up_read(&adev->reset_domain->sem); 1972 } 1973 1974 return ret; 1975 } 1976 1977 int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev) 1978 { 1979 unsigned long ue_count, ce_count; 1980 1981 if (amdgpu_sriov_ras_telemetry_en(adev)) { 1982 amdgpu_virt_req_ras_err_count_internal(adev, true); 1983 amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL); 1984 } 1985 1986 return 0; 1987 } 1988 1989 bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev, 1990 enum amdgpu_ras_block block) 1991 { 1992 enum amd_sriov_ras_telemetry_gpu_block sriov_block; 1993 1994 sriov_block = amdgpu_ras_block_to_sriov(adev, block); 1995 1996 if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT || 1997 !amdgpu_sriov_ras_telemetry_block_en(adev, sriov_block)) 1998 return false; 1999 2000 return true; 2001 } 2002 2003 /* 2004 * amdgpu_virt_request_bad_pages() - request bad pages 2005 * @adev: amdgpu device. 
2006 * Send command to GPU hypervisor to write new bad pages into the shared PF2VF region 2007 */ 2008 void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev) 2009 { 2010 struct amdgpu_virt *virt = &adev->virt; 2011 2012 if (virt->ops && virt->ops->req_bad_pages) 2013 virt->ops->req_bad_pages(adev); 2014 } 2015 2016 static int amdgpu_virt_cache_chk_criti_hit(struct amdgpu_device *adev, 2017 struct amdsriov_ras_telemetry *host_telemetry, 2018 bool *hit) 2019 { 2020 struct amd_sriov_ras_chk_criti *tmp = NULL; 2021 uint32_t checksum, used_size; 2022 2023 checksum = host_telemetry->header.checksum; 2024 used_size = host_telemetry->header.used_size; 2025 2026 if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10)) 2027 return 0; 2028 2029 tmp = kmemdup(&host_telemetry->body.chk_criti, used_size, GFP_KERNEL); 2030 if (!tmp) 2031 return -ENOMEM; 2032 2033 if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0)) 2034 goto out; 2035 2036 if (hit) 2037 *hit = tmp->hit ? true : false; 2038 2039 out: 2040 kfree(tmp); 2041 2042 return 0; 2043 } 2044 2045 int amdgpu_virt_check_vf_critical_region(struct amdgpu_device *adev, u64 addr, bool *hit) 2046 { 2047 struct amdgpu_virt *virt = &adev->virt; 2048 int r = -EPERM; 2049 2050 if (!virt->ops || !virt->ops->req_ras_chk_criti) 2051 return -EOPNOTSUPP; 2052 2053 /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host 2054 * will ignore incoming guest messages. Ratelimit the guest messages to 2055 * prevent guest self DOS. 
2056 */ 2057 if (__ratelimit(&virt->ras.ras_chk_criti_rs)) { 2058 mutex_lock(&virt->ras.ras_telemetry_mutex); 2059 if (!virt->ops->req_ras_chk_criti(adev, addr)) 2060 r = amdgpu_virt_cache_chk_criti_hit( 2061 adev, virt->fw_reserve.ras_telemetry, hit); 2062 mutex_unlock(&virt->ras.ras_telemetry_mutex); 2063 } 2064 2065 return r; 2066 } 2067 2068 static int req_remote_ras_cmd(struct amdgpu_device *adev, 2069 u32 param1, u32 param2, u32 param3) 2070 { 2071 struct amdgpu_virt *virt = &adev->virt; 2072 2073 if (virt->ops && virt->ops->req_remote_ras_cmd) 2074 return virt->ops->req_remote_ras_cmd(adev, param1, param2, param3); 2075 return -ENOENT; 2076 } 2077 2078 int amdgpu_virt_send_remote_ras_cmd(struct amdgpu_device *adev, 2079 uint64_t buf, uint32_t buf_len) 2080 { 2081 uint64_t gpa = buf; 2082 int ret = -EIO; 2083 2084 if (down_read_trylock(&adev->reset_domain->sem)) { 2085 ret = req_remote_ras_cmd(adev, 2086 lower_32_bits(gpa), upper_32_bits(gpa), buf_len); 2087 up_read(&adev->reset_domain->sem); 2088 } 2089 2090 return ret; 2091 } 2092