1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2008-2015 Intel Corporation 4 */ 5 6 #include <linux/highmem.h> 7 8 #include <drm/drm_print.h> 9 #include <drm/intel/intel_gmd_misc_regs.h> 10 11 #include "display/intel_display.h" 12 #include "i915_drv.h" 13 #include "i915_reg.h" 14 #include "i915_scatterlist.h" 15 #include "i915_pvinfo.h" 16 #include "i915_vgpu.h" 17 #include "intel_gt_regs.h" 18 #include "intel_mchbar_regs.h" 19 20 /** 21 * DOC: fence register handling 22 * 23 * Important to avoid confusions: "fences" in the i915 driver are not execution 24 * fences used to track command completion but hardware detiler objects which 25 * wrap a given range of the global GTT. Each platform has only a fairly limited 26 * set of these objects. 27 * 28 * Fences are used to detile GTT memory mappings. They're also connected to the 29 * hardware frontbuffer render tracking and hence interact with frontbuffer 30 * compression. Furthermore on older platforms fences are required for tiled 31 * objects used by the display engine. They can also be used by the render 32 * engine - they're required for blitter commands and are optional for render 33 * commands. But on gen4+ both display (with the exception of fbc) and rendering 34 * have their own tiling state bits and don't need fences. 35 * 36 * Also note that fences only support X and Y tiling and hence can't be used for 37 * the fancier new tiling formats like W, Ys and Yf. 38 * 39 * Finally note that because fences are such a restricted resource they're 40 * dynamically associated with objects. Furthermore fence state is committed to 41 * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must 42 * explicitly call i915_gem_object_get_fence() to synchronize fencing status 43 * for cpu access. Also note that some code wants an unfenced view, for those 44 * cases the fence can be removed forcefully with i915_gem_object_put_fence(). 45 * 46 * Internally these functions will synchronize with userspace access by removing 47 * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed. 48 */ 49 50 #define pipelined 0 51 52 static struct drm_i915_private *fence_to_i915(struct i915_fence_reg *fence) 53 { 54 return fence->ggtt->vm.i915; 55 } 56 57 static struct intel_uncore *fence_to_uncore(struct i915_fence_reg *fence) 58 { 59 return fence->ggtt->vm.gt->uncore; 60 } 61 62 static void i965_write_fence_reg(struct i915_fence_reg *fence) 63 { 64 i915_reg_t fence_reg_lo, fence_reg_hi; 65 int fence_pitch_shift; 66 u64 val; 67 68 if (GRAPHICS_VER(fence_to_i915(fence)) >= 6) { 69 fence_reg_lo = FENCE_REG_GEN6_LO(fence->id); 70 fence_reg_hi = FENCE_REG_GEN6_HI(fence->id); 71 fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT; 72 73 } else { 74 fence_reg_lo = FENCE_REG_965_LO(fence->id); 75 fence_reg_hi = FENCE_REG_965_HI(fence->id); 76 fence_pitch_shift = I965_FENCE_PITCH_SHIFT; 77 } 78 79 val = 0; 80 if (fence->tiling) { 81 unsigned int stride = fence->stride; 82 83 GEM_BUG_ON(!IS_ALIGNED(stride, 128)); 84 85 val = fence->start + fence->size - I965_FENCE_PAGE; 86 val <<= 32; 87 val |= fence->start; 88 val |= (u64)((stride / 128) - 1) << fence_pitch_shift; 89 if (fence->tiling == I915_TILING_Y) 90 val |= BIT(I965_FENCE_TILING_Y_SHIFT); 91 val |= I965_FENCE_REG_VALID; 92 } 93 94 if (!pipelined) { 95 struct intel_uncore *uncore = fence_to_uncore(fence); 96 97 /* 98 * To w/a incoherency with non-atomic 64-bit register updates, 99 * we split the 64-bit update into two 32-bit writes. In order 100 * for a partial fence not to be evaluated between writes, we 101 * precede the update with write to turn off the fence register, 102 * and only enable the fence as the last step. 103 * 104 * For extra levels of paranoia, we make sure each step lands 105 * before applying the next step. 106 */ 107 intel_uncore_write_fw(uncore, fence_reg_lo, 0); 108 intel_uncore_posting_read_fw(uncore, fence_reg_lo); 109 110 intel_uncore_write_fw(uncore, fence_reg_hi, upper_32_bits(val)); 111 intel_uncore_write_fw(uncore, fence_reg_lo, lower_32_bits(val)); 112 intel_uncore_posting_read_fw(uncore, fence_reg_lo); 113 } 114 } 115 116 static void i915_write_fence_reg(struct i915_fence_reg *fence) 117 { 118 u32 val; 119 120 val = 0; 121 if (fence->tiling) { 122 unsigned int stride = fence->stride; 123 unsigned int tiling = fence->tiling; 124 bool is_y_tiled = tiling == I915_TILING_Y; 125 126 if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence_to_i915(fence))) 127 stride /= 128; 128 else 129 stride /= 512; 130 GEM_BUG_ON(!is_power_of_2(stride)); 131 132 val = fence->start; 133 if (is_y_tiled) 134 val |= BIT(I830_FENCE_TILING_Y_SHIFT); 135 val |= I915_FENCE_SIZE_BITS(fence->size); 136 val |= ilog2(stride) << I830_FENCE_PITCH_SHIFT; 137 138 val |= I830_FENCE_REG_VALID; 139 } 140 141 if (!pipelined) { 142 struct intel_uncore *uncore = fence_to_uncore(fence); 143 i915_reg_t reg = FENCE_REG(fence->id); 144 145 intel_uncore_write_fw(uncore, reg, val); 146 intel_uncore_posting_read_fw(uncore, reg); 147 } 148 } 149 150 static void i830_write_fence_reg(struct i915_fence_reg *fence) 151 { 152 u32 val; 153 154 val = 0; 155 if (fence->tiling) { 156 unsigned int stride = fence->stride; 157 158 val = fence->start; 159 if (fence->tiling == I915_TILING_Y) 160 val |= BIT(I830_FENCE_TILING_Y_SHIFT); 161 val |= I830_FENCE_SIZE_BITS(fence->size); 162 val |= ilog2(stride / 128) << I830_FENCE_PITCH_SHIFT; 163 val |= I830_FENCE_REG_VALID; 164 } 165 166 if (!pipelined) { 167 struct intel_uncore *uncore = fence_to_uncore(fence); 168 i915_reg_t reg = FENCE_REG(fence->id); 169 170 intel_uncore_write_fw(uncore, reg, val); 171 intel_uncore_posting_read_fw(uncore, reg); 172 } 173 } 174 175 static void fence_write(struct i915_fence_reg *fence) 176 { 177 struct drm_i915_private *i915 = fence_to_i915(fence); 178 179 /* 180 * Previous access through the fence register is marshalled by 181 * the mb() inside the fault handlers (i915_gem_release_mmaps) 182 * and explicitly managed for internal users. 183 */ 184 185 if (GRAPHICS_VER(i915) == 2) 186 i830_write_fence_reg(fence); 187 else if (GRAPHICS_VER(i915) == 3) 188 i915_write_fence_reg(fence); 189 else 190 i965_write_fence_reg(fence); 191 192 /* 193 * Access through the fenced region afterwards is 194 * ordered by the posting reads whilst writing the registers. 195 */ 196 } 197 198 static bool gpu_uses_fence_registers(struct i915_fence_reg *fence) 199 { 200 return GRAPHICS_VER(fence_to_i915(fence)) < 4; 201 } 202 203 static int fence_update(struct i915_fence_reg *fence, 204 struct i915_vma *vma) 205 { 206 struct i915_ggtt *ggtt = fence->ggtt; 207 struct intel_uncore *uncore = fence_to_uncore(fence); 208 intel_wakeref_t wakeref; 209 struct i915_vma *old; 210 int ret; 211 212 fence->tiling = 0; 213 if (vma) { 214 GEM_BUG_ON(!i915_gem_object_get_stride(vma->obj) || 215 !i915_gem_object_get_tiling(vma->obj)); 216 217 if (!i915_vma_is_map_and_fenceable(vma)) 218 return -EINVAL; 219 220 if (gpu_uses_fence_registers(fence)) { 221 /* implicit 'unfenced' GPU blits */ 222 ret = i915_vma_sync(vma); 223 if (ret) 224 return ret; 225 } 226 227 GEM_BUG_ON(vma->fence_size > i915_vma_size(vma)); 228 fence->start = i915_ggtt_offset(vma); 229 fence->size = vma->fence_size; 230 fence->stride = i915_gem_object_get_stride(vma->obj); 231 fence->tiling = i915_gem_object_get_tiling(vma->obj); 232 } 233 WRITE_ONCE(fence->dirty, false); 234 235 old = xchg(&fence->vma, NULL); 236 if (old) { 237 /* XXX Ideally we would move the waiting to outside the mutex */ 238 ret = i915_active_wait(&fence->active); 239 if (ret) { 240 fence->vma = old; 241 return ret; 242 } 243 244 i915_vma_flush_writes(old); 245 246 /* 247 * Ensure that all userspace CPU access is completed before 248 * stealing the fence. 249 */ 250 if (old != vma) { 251 GEM_BUG_ON(old->fence != fence); 252 i915_vma_revoke_mmap(old); 253 old->fence = NULL; 254 } 255 256 list_move(&fence->link, &ggtt->fence_list); 257 } 258 259 /* 260 * We only need to update the register itself if the device is awake. 261 * If the device is currently powered down, we will defer the write 262 * to the runtime resume, see intel_ggtt_restore_fences(). 263 * 264 * This only works for removing the fence register, on acquisition 265 * the caller must hold the rpm wakeref. The fence register must 266 * be cleared before we can use any other fences to ensure that 267 * the new fences do not overlap the elided clears, confusing HW. 268 */ 269 wakeref = intel_runtime_pm_get_if_in_use(uncore->rpm); 270 if (!wakeref) { 271 GEM_BUG_ON(vma); 272 return 0; 273 } 274 275 WRITE_ONCE(fence->vma, vma); 276 fence_write(fence); 277 278 if (vma) { 279 vma->fence = fence; 280 list_move_tail(&fence->link, &ggtt->fence_list); 281 } 282 283 intel_runtime_pm_put(uncore->rpm, wakeref); 284 return 0; 285 } 286 287 /** 288 * i915_vma_revoke_fence - force-remove fence for a VMA 289 * @vma: vma to map linearly (not through a fence reg) 290 * 291 * This function force-removes any fence from the given object, which is useful 292 * if the kernel wants to do untiled GTT access. 293 */ 294 void i915_vma_revoke_fence(struct i915_vma *vma) 295 { 296 struct i915_fence_reg *fence = vma->fence; 297 intel_wakeref_t wakeref; 298 299 lockdep_assert_held(&vma->vm->mutex); 300 if (!fence) 301 return; 302 303 GEM_BUG_ON(fence->vma != vma); 304 i915_active_wait(&fence->active); 305 GEM_BUG_ON(!i915_active_is_idle(&fence->active)); 306 GEM_BUG_ON(atomic_read(&fence->pin_count)); 307 308 fence->tiling = 0; 309 WRITE_ONCE(fence->vma, NULL); 310 vma->fence = NULL; 311 312 /* 313 * Skip the write to HW if and only if the device is currently 314 * suspended. 315 * 316 * If the driver does not currently hold a wakeref (if_in_use == 0), 317 * the device may currently be runtime suspended, or it may be woken 318 * up before the suspend takes place. If the device is not suspended 319 * (powered down) and we skip clearing the fence register, the HW is 320 * left in an undefined state where we may end up with multiple 321 * registers overlapping. 322 */ 323 with_intel_runtime_pm_if_active(fence_to_uncore(fence)->rpm, wakeref) 324 fence_write(fence); 325 } 326 327 static bool fence_is_active(const struct i915_fence_reg *fence) 328 { 329 return fence->vma && i915_vma_is_active(fence->vma); 330 } 331 332 static struct i915_fence_reg *fence_find(struct i915_ggtt *ggtt) 333 { 334 struct intel_display *display = ggtt->vm.i915->display; 335 struct i915_fence_reg *active = NULL; 336 struct i915_fence_reg *fence, *fn; 337 338 list_for_each_entry_safe(fence, fn, &ggtt->fence_list, link) { 339 GEM_BUG_ON(fence->vma && fence->vma->fence != fence); 340 341 if (fence == active) /* now seen this fence twice */ 342 active = ERR_PTR(-EAGAIN); 343 344 /* Prefer idle fences so we do not have to wait on the GPU */ 345 if (active != ERR_PTR(-EAGAIN) && fence_is_active(fence)) { 346 if (!active) 347 active = fence; 348 349 list_move_tail(&fence->link, &ggtt->fence_list); 350 continue; 351 } 352 353 if (atomic_read(&fence->pin_count)) 354 continue; 355 356 return fence; 357 } 358 359 /* Wait for completion of pending flips which consume fences */ 360 if (intel_has_pending_fb_unpin(display)) 361 return ERR_PTR(-EAGAIN); 362 363 return ERR_PTR(-ENOBUFS); 364 } 365 366 int __i915_vma_pin_fence(struct i915_vma *vma) 367 { 368 struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm); 369 struct i915_fence_reg *fence; 370 struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL; 371 int err; 372 373 lockdep_assert_held(&vma->vm->mutex); 374 375 /* Just update our place in the LRU if our fence is getting reused. */ 376 if (vma->fence) { 377 fence = vma->fence; 378 GEM_BUG_ON(fence->vma != vma); 379 atomic_inc(&fence->pin_count); 380 if (!fence->dirty) { 381 list_move_tail(&fence->link, &ggtt->fence_list); 382 return 0; 383 } 384 } else if (set) { 385 fence = fence_find(ggtt); 386 if (IS_ERR(fence)) 387 return PTR_ERR(fence); 388 389 GEM_BUG_ON(atomic_read(&fence->pin_count)); 390 atomic_inc(&fence->pin_count); 391 } else { 392 return 0; 393 } 394 395 err = fence_update(fence, set); 396 if (err) 397 goto out_unpin; 398 399 GEM_BUG_ON(fence->vma != set); 400 GEM_BUG_ON(vma->fence != (set ? fence : NULL)); 401 402 if (set) 403 return 0; 404 405 out_unpin: 406 atomic_dec(&fence->pin_count); 407 return err; 408 } 409 410 /** 411 * i915_vma_pin_fence - set up fencing for a vma 412 * @vma: vma to map through a fence reg 413 * 414 * When mapping objects through the GTT, userspace wants to be able to write 415 * to them without having to worry about swizzling if the object is tiled. 416 * This function walks the fence regs looking for a free one for @obj, 417 * stealing one if it can't find any. 418 * 419 * It then sets up the reg based on the object's properties: address, pitch 420 * and tiling format. 421 * 422 * For an untiled surface, this removes any existing fence. 423 * 424 * Returns: 425 * 0 on success, negative error code on failure. 426 */ 427 int i915_vma_pin_fence(struct i915_vma *vma) 428 { 429 int err; 430 431 if (!vma->fence && !i915_gem_object_is_tiled(vma->obj)) 432 return 0; 433 434 /* 435 * Note that we revoke fences on runtime suspend. Therefore the user 436 * must keep the device awake whilst using the fence. 437 */ 438 assert_rpm_wakelock_held(vma->vm->gt->uncore->rpm); 439 GEM_BUG_ON(!i915_vma_is_ggtt(vma)); 440 441 err = mutex_lock_interruptible(&vma->vm->mutex); 442 if (err) 443 return err; 444 445 err = __i915_vma_pin_fence(vma); 446 mutex_unlock(&vma->vm->mutex); 447 448 return err; 449 } 450 451 /** 452 * i915_reserve_fence - Reserve a fence for vGPU 453 * @ggtt: Global GTT 454 * 455 * This function walks the fence regs looking for a free one and remove 456 * it from the fence_list. It is used to reserve fence for vGPU to use. 457 */ 458 struct i915_fence_reg *i915_reserve_fence(struct i915_ggtt *ggtt) 459 { 460 struct i915_fence_reg *fence; 461 int count; 462 int ret; 463 464 lockdep_assert_held(&ggtt->vm.mutex); 465 466 /* Keep at least one fence available for the display engine. */ 467 count = 0; 468 list_for_each_entry(fence, &ggtt->fence_list, link) 469 count += !atomic_read(&fence->pin_count); 470 if (count <= 1) 471 return ERR_PTR(-ENOSPC); 472 473 fence = fence_find(ggtt); 474 if (IS_ERR(fence)) 475 return fence; 476 477 if (fence->vma) { 478 /* Force-remove fence from VMA */ 479 ret = fence_update(fence, NULL); 480 if (ret) 481 return ERR_PTR(ret); 482 } 483 484 list_del(&fence->link); 485 486 return fence; 487 } 488 489 /** 490 * i915_unreserve_fence - Reclaim a reserved fence 491 * @fence: the fence reg 492 * 493 * This function add a reserved fence register from vGPU to the fence_list. 494 */ 495 void i915_unreserve_fence(struct i915_fence_reg *fence) 496 { 497 struct i915_ggtt *ggtt = fence->ggtt; 498 499 lockdep_assert_held(&ggtt->vm.mutex); 500 501 list_add(&fence->link, &ggtt->fence_list); 502 } 503 504 /** 505 * intel_ggtt_restore_fences - restore fence state 506 * @ggtt: Global GTT 507 * 508 * Restore the hw fence state to match the software tracking again, to be called 509 * after a gpu reset and on resume. Note that on runtime suspend we only cancel 510 * the fences, to be reacquired by the user later. 511 */ 512 void intel_ggtt_restore_fences(struct i915_ggtt *ggtt) 513 { 514 int i; 515 516 for (i = 0; i < ggtt->num_fences; i++) 517 fence_write(&ggtt->fence_regs[i]); 518 } 519 520 /** 521 * DOC: tiling swizzling details 522 * 523 * The idea behind tiling is to increase cache hit rates by rearranging 524 * pixel data so that a group of pixel accesses are in the same cacheline. 525 * Performance improvement from doing this on the back/depth buffer are on 526 * the order of 30%. 527 * 528 * Intel architectures make this somewhat more complicated, though, by 529 * adjustments made to addressing of data when the memory is in interleaved 530 * mode (matched pairs of DIMMS) to improve memory bandwidth. 531 * For interleaved memory, the CPU sends every sequential 64 bytes 532 * to an alternate memory channel so it can get the bandwidth from both. 533 * 534 * The GPU also rearranges its accesses for increased bandwidth to interleaved 535 * memory, and it matches what the CPU does for non-tiled. However, when tiled 536 * it does it a little differently, since one walks addresses not just in the 537 * X direction but also Y. So, along with alternating channels when bit 538 * 6 of the address flips, it also alternates when other bits flip -- Bits 9 539 * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines) 540 * are common to both the 915 and 965-class hardware. 541 * 542 * The CPU also sometimes XORs in higher bits as well, to improve 543 * bandwidth doing strided access like we do so frequently in graphics. This 544 * is called "Channel XOR Randomization" in the MCH documentation. The result 545 * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address 546 * decode. 547 * 548 * All of this bit 6 XORing has an effect on our memory management, 549 * as we need to make sure that the 3d driver can correctly address object 550 * contents. 551 * 552 * If we don't have interleaved memory, all tiling is safe and no swizzling is 553 * required. 554 * 555 * When bit 17 is XORed in, we simply refuse to tile at all. Bit 556 * 17 is not just a page offset, so as we page an object out and back in, 557 * individual pages in it will have different bit 17 addresses, resulting in 558 * each 64 bytes being swapped with its neighbor! 559 * 560 * Otherwise, if interleaved, we have to tell the 3d driver what the address 561 * swizzling it needs to do is, since it's writing with the CPU to the pages 562 * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the 563 * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling 564 * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order 565 * to match what the GPU expects. 566 */ 567 568 /** 569 * detect_bit_6_swizzle - detect bit 6 swizzling pattern 570 * @ggtt: Global GGTT 571 * 572 * Detects bit 6 swizzling of address lookup between IGD access and CPU 573 * access through main memory. 574 */ 575 static void detect_bit_6_swizzle(struct i915_ggtt *ggtt) 576 { 577 struct intel_uncore *uncore = ggtt->vm.gt->uncore; 578 struct drm_i915_private *i915 = ggtt->vm.i915; 579 u32 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN; 580 u32 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN; 581 582 if (GRAPHICS_VER(i915) >= 8 || IS_VALLEYVIEW(i915)) { 583 /* 584 * On BDW+, swizzling is not used. We leave the CPU memory 585 * controller in charge of optimizing memory accesses without 586 * the extra address manipulation GPU side. 587 * 588 * VLV and CHV don't have GPU swizzling. 589 */ 590 swizzle_x = I915_BIT_6_SWIZZLE_NONE; 591 swizzle_y = I915_BIT_6_SWIZZLE_NONE; 592 } else if (GRAPHICS_VER(i915) >= 6) { 593 if (i915->preserve_bios_swizzle) { 594 if (intel_uncore_read(uncore, DISP_ARB_CTL) & 595 DISP_TILE_SURFACE_SWIZZLING) { 596 swizzle_x = I915_BIT_6_SWIZZLE_9_10; 597 swizzle_y = I915_BIT_6_SWIZZLE_9; 598 } else { 599 swizzle_x = I915_BIT_6_SWIZZLE_NONE; 600 swizzle_y = I915_BIT_6_SWIZZLE_NONE; 601 } 602 } else { 603 u32 dimm_c0, dimm_c1; 604 605 dimm_c0 = intel_uncore_read(uncore, MAD_DIMM_C0); 606 dimm_c1 = intel_uncore_read(uncore, MAD_DIMM_C1); 607 dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK; 608 dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK; 609 /* 610 * Enable swizzling when the channels are populated 611 * with identically sized dimms. We don't need to check 612 * the 3rd channel because no cpu with gpu attached 613 * ships in that configuration. Also, swizzling only 614 * makes sense for 2 channels anyway. 615 */ 616 if (dimm_c0 == dimm_c1) { 617 swizzle_x = I915_BIT_6_SWIZZLE_9_10; 618 swizzle_y = I915_BIT_6_SWIZZLE_9; 619 } else { 620 swizzle_x = I915_BIT_6_SWIZZLE_NONE; 621 swizzle_y = I915_BIT_6_SWIZZLE_NONE; 622 } 623 } 624 } else if (GRAPHICS_VER(i915) == 5) { 625 /* 626 * On Ironlake whatever DRAM config, GPU always do 627 * same swizzling setup. 628 */ 629 swizzle_x = I915_BIT_6_SWIZZLE_9_10; 630 swizzle_y = I915_BIT_6_SWIZZLE_9; 631 } else if (GRAPHICS_VER(i915) == 2) { 632 /* 633 * As far as we know, the 865 doesn't have these bit 6 634 * swizzling issues. 635 */ 636 swizzle_x = I915_BIT_6_SWIZZLE_NONE; 637 swizzle_y = I915_BIT_6_SWIZZLE_NONE; 638 } else if (IS_G45(i915) || IS_I965G(i915) || IS_G33(i915)) { 639 /* 640 * The 965, G33, and newer, have a very flexible memory 641 * configuration. It will enable dual-channel mode 642 * (interleaving) on as much memory as it can, and the GPU 643 * will additionally sometimes enable different bit 6 644 * swizzling for tiled objects from the CPU. 645 * 646 * Here's what I found on the G965: 647 * slot fill memory size swizzling 648 * 0A 0B 1A 1B 1-ch 2-ch 649 * 512 0 0 0 512 0 O 650 * 512 0 512 0 16 1008 X 651 * 512 0 0 512 16 1008 X 652 * 0 512 0 512 16 1008 X 653 * 1024 1024 1024 0 2048 1024 O 654 * 655 * We could probably detect this based on either the DRB 656 * matching, which was the case for the swizzling required in 657 * the table above, or from the 1-ch value being less than 658 * the minimum size of a rank. 659 * 660 * Reports indicate that the swizzling actually 661 * varies depending upon page placement inside the 662 * channels, i.e. we see swizzled pages where the 663 * banks of memory are paired and unswizzled on the 664 * uneven portion, so leave that as unknown. 665 */ 666 if (intel_uncore_read16(uncore, C0DRB3_BW) == 667 intel_uncore_read16(uncore, C1DRB3_BW)) { 668 swizzle_x = I915_BIT_6_SWIZZLE_9_10; 669 swizzle_y = I915_BIT_6_SWIZZLE_9; 670 } 671 } else { 672 u32 dcc = intel_uncore_read(uncore, DCC); 673 674 /* 675 * On 9xx chipsets, channel interleave by the CPU is 676 * determined by DCC. For single-channel, neither the CPU 677 * nor the GPU do swizzling. For dual channel interleaved, 678 * the GPU's interleave is bit 9 and 10 for X tiled, and bit 679 * 9 for Y tiled. The CPU's interleave is independent, and 680 * can be based on either bit 11 (haven't seen this yet) or 681 * bit 17 (common). 682 */ 683 switch (dcc & DCC_ADDRESSING_MODE_MASK) { 684 case DCC_ADDRESSING_MODE_SINGLE_CHANNEL: 685 case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC: 686 swizzle_x = I915_BIT_6_SWIZZLE_NONE; 687 swizzle_y = I915_BIT_6_SWIZZLE_NONE; 688 break; 689 case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED: 690 if (dcc & DCC_CHANNEL_XOR_DISABLE) { 691 /* 692 * This is the base swizzling by the GPU for 693 * tiled buffers. 694 */ 695 swizzle_x = I915_BIT_6_SWIZZLE_9_10; 696 swizzle_y = I915_BIT_6_SWIZZLE_9; 697 } else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) { 698 /* Bit 11 swizzling by the CPU in addition. */ 699 swizzle_x = I915_BIT_6_SWIZZLE_9_10_11; 700 swizzle_y = I915_BIT_6_SWIZZLE_9_11; 701 } else { 702 /* Bit 17 swizzling by the CPU in addition. */ 703 swizzle_x = I915_BIT_6_SWIZZLE_9_10_17; 704 swizzle_y = I915_BIT_6_SWIZZLE_9_17; 705 } 706 break; 707 } 708 709 /* check for L-shaped memory aka modified enhanced addressing */ 710 if (GRAPHICS_VER(i915) == 4 && 711 !(intel_uncore_read(uncore, DCC2) & DCC2_MODIFIED_ENHANCED_DISABLE)) { 712 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN; 713 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN; 714 } 715 716 if (dcc == 0xffffffff) { 717 drm_err(&i915->drm, "Couldn't read from MCHBAR. " 718 "Disabling tiling.\n"); 719 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN; 720 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN; 721 } 722 } 723 724 if (swizzle_x == I915_BIT_6_SWIZZLE_UNKNOWN || 725 swizzle_y == I915_BIT_6_SWIZZLE_UNKNOWN) { 726 /* 727 * Userspace likes to explode if it sees unknown swizzling, 728 * so lie. We will finish the lie when reporting through 729 * the get-tiling-ioctl by reporting the physical swizzle 730 * mode as unknown instead. 731 * 732 * As we don't strictly know what the swizzling is, it may be 733 * bit17 dependent, and so we need to also prevent the pages 734 * from being moved. 735 */ 736 i915->gem_quirks |= GEM_QUIRK_PIN_SWIZZLED_PAGES; 737 swizzle_x = I915_BIT_6_SWIZZLE_NONE; 738 swizzle_y = I915_BIT_6_SWIZZLE_NONE; 739 } 740 741 to_gt(i915)->ggtt->bit_6_swizzle_x = swizzle_x; 742 to_gt(i915)->ggtt->bit_6_swizzle_y = swizzle_y; 743 } 744 745 /* 746 * Swap every 64 bytes of this page around, to account for it having a new 747 * bit 17 of its physical address and therefore being interpreted differently 748 * by the GPU. 749 */ 750 static void swizzle_page(struct page *page) 751 { 752 char temp[64]; 753 char *vaddr; 754 int i; 755 756 vaddr = kmap_local_page(page); 757 758 for (i = 0; i < PAGE_SIZE; i += 128) { 759 memcpy(temp, &vaddr[i], 64); 760 memcpy(&vaddr[i], &vaddr[i + 64], 64); 761 memcpy(&vaddr[i + 64], temp, 64); 762 } 763 764 kunmap_local(vaddr); 765 } 766 767 /** 768 * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling 769 * @obj: i915 GEM buffer object 770 * @pages: the scattergather list of physical pages 771 * 772 * This function fixes up the swizzling in case any page frame number for this 773 * object has changed in bit 17 since that state has been saved with 774 * i915_gem_object_save_bit_17_swizzle(). 775 * 776 * This is called when pinning backing storage again, since the kernel is free 777 * to move unpinned backing storage around (either by directly moving pages or 778 * by swapping them out and back in again). 779 */ 780 void 781 i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj, 782 struct sg_table *pages) 783 { 784 struct sgt_iter sgt_iter; 785 struct page *page; 786 int i; 787 788 if (obj->bit_17 == NULL) 789 return; 790 791 i = 0; 792 for_each_sgt_page(page, sgt_iter, pages) { 793 char new_bit_17 = page_to_phys(page) >> 17; 794 795 if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) { 796 swizzle_page(page); 797 set_page_dirty(page); 798 } 799 800 i++; 801 } 802 } 803 804 /** 805 * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling 806 * @obj: i915 GEM buffer object 807 * @pages: the scattergather list of physical pages 808 * 809 * This function saves the bit 17 of each page frame number so that swizzling 810 * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must 811 * be called before the backing storage can be unpinned. 812 */ 813 void 814 i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj, 815 struct sg_table *pages) 816 { 817 const unsigned int page_count = obj->base.size >> PAGE_SHIFT; 818 struct sgt_iter sgt_iter; 819 struct page *page; 820 int i; 821 822 if (obj->bit_17 == NULL) { 823 obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL); 824 if (obj->bit_17 == NULL) { 825 drm_err(obj->base.dev, 826 "Failed to allocate memory for bit 17 record\n"); 827 return; 828 } 829 } 830 831 i = 0; 832 833 for_each_sgt_page(page, sgt_iter, pages) { 834 if (page_to_phys(page) & (1 << 17)) 835 __set_bit(i, obj->bit_17); 836 else 837 __clear_bit(i, obj->bit_17); 838 i++; 839 } 840 } 841 842 void intel_ggtt_init_fences(struct i915_ggtt *ggtt) 843 { 844 struct drm_i915_private *i915 = ggtt->vm.i915; 845 struct intel_uncore *uncore = ggtt->vm.gt->uncore; 846 int num_fences; 847 int i; 848 849 INIT_LIST_HEAD(&ggtt->fence_list); 850 INIT_LIST_HEAD(&ggtt->userfault_list); 851 852 detect_bit_6_swizzle(ggtt); 853 854 if (!i915_ggtt_has_aperture(ggtt)) 855 num_fences = 0; 856 else if (GRAPHICS_VER(i915) >= 7 && 857 !(IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))) 858 num_fences = 32; 859 else if (GRAPHICS_VER(i915) >= 4 || 860 IS_I945G(i915) || IS_I945GM(i915) || 861 IS_G33(i915) || IS_PINEVIEW(i915)) 862 num_fences = 16; 863 else 864 num_fences = 8; 865 866 if (intel_vgpu_active(i915)) 867 num_fences = intel_uncore_read(uncore, 868 vgtif_reg(avail_rs.fence_num)); 869 ggtt->fence_regs = kzalloc_objs(*ggtt->fence_regs, num_fences); 870 if (!ggtt->fence_regs) 871 num_fences = 0; 872 873 /* Initialize fence registers to zero */ 874 for (i = 0; i < num_fences; i++) { 875 struct i915_fence_reg *fence = &ggtt->fence_regs[i]; 876 877 i915_active_init(&fence->active, NULL, NULL, 0); 878 fence->ggtt = ggtt; 879 fence->id = i; 880 list_add_tail(&fence->link, &ggtt->fence_list); 881 } 882 ggtt->num_fences = num_fences; 883 884 intel_ggtt_restore_fences(ggtt); 885 } 886 887 void intel_ggtt_fini_fences(struct i915_ggtt *ggtt) 888 { 889 int i; 890 891 for (i = 0; i < ggtt->num_fences; i++) { 892 struct i915_fence_reg *fence = &ggtt->fence_regs[i]; 893 894 i915_active_fini(&fence->active); 895 } 896 897 kfree(ggtt->fence_regs); 898 } 899 900 void intel_gt_init_swizzling(struct intel_gt *gt) 901 { 902 struct drm_i915_private *i915 = gt->i915; 903 struct intel_uncore *uncore = gt->uncore; 904 905 if (GRAPHICS_VER(i915) < 5 || 906 to_gt(i915)->ggtt->bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE) 907 return; 908 909 intel_uncore_rmw(uncore, DISP_ARB_CTL, 0, DISP_TILE_SURFACE_SWIZZLING); 910 911 if (GRAPHICS_VER(i915) == 5) 912 return; 913 914 intel_uncore_rmw(uncore, TILECTL, 0, TILECTL_SWZCTL); 915 916 if (GRAPHICS_VER(i915) == 6) 917 intel_uncore_write(uncore, 918 ARB_MODE, 919 REG_MASKED_FIELD_ENABLE(ARB_MODE_SWIZZLE_SNB)); 920 else if (GRAPHICS_VER(i915) == 7) 921 intel_uncore_write(uncore, 922 ARB_MODE, 923 REG_MASKED_FIELD_ENABLE(ARB_MODE_SWIZZLE_IVB)); 924 else if (GRAPHICS_VER(i915) == 8) 925 intel_uncore_write(uncore, 926 GAMTARBMODE, 927 REG_MASKED_FIELD_ENABLE(ARB_MODE_SWIZZLE_BDW)); 928 else 929 MISSING_CASE(GRAPHICS_VER(i915)); 930 } 931