Branch data Line data Source code
1 : : /*
2 : : * Copyright © 2008-2015 Intel Corporation
3 : : *
4 : : * Permission is hereby granted, free of charge, to any person obtaining a
5 : : * copy of this software and associated documentation files (the "Software"),
6 : : * to deal in the Software without restriction, including without limitation
7 : : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 : : * and/or sell copies of the Software, and to permit persons to whom the
9 : : * Software is furnished to do so, subject to the following conditions:
10 : : *
11 : : * The above copyright notice and this permission notice (including the next
12 : : * paragraph) shall be included in all copies or substantial portions of the
13 : : * Software.
14 : : *
15 : : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 : : * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 : : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 : : * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 : : * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 : : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 : : * IN THE SOFTWARE.
22 : : */
23 : :
24 : : #include <drm/i915_drm.h>
25 : :
26 : : #include "i915_drv.h"
27 : : #include "i915_scatterlist.h"
28 : : #include "i915_vgpu.h"
29 : :
30 : : /**
31 : : * DOC: fence register handling
32 : : *
33 : : * Important to avoid confusion: "fences" in the i915 driver are not execution
34 : : * fences used to track command completion but hardware detiler objects which
35 : : * wrap a given range of the global GTT. Each platform has only a fairly limited
36 : : * set of these objects.
37 : : *
38 : : * Fences are used to detile GTT memory mappings. They're also connected to the
39 : : * hardware frontbuffer render tracking and hence interact with frontbuffer
40 : : * compression. Furthermore on older platforms fences are required for tiled
41 : : * objects used by the display engine. They can also be used by the render
42 : : * engine - they're required for blitter commands and are optional for render
43 : : * commands. But on gen4+ both display (with the exception of fbc) and rendering
44 : : * have their own tiling state bits and don't need fences.
45 : : *
46 : : * Also note that fences only support X and Y tiling and hence can't be used for
47 : : * the fancier new tiling formats like W, Ys and Yf.
48 : : *
49 : : * Finally note that because fences are such a restricted resource they're
50 : : * dynamically associated with objects. Furthermore fence state is committed to
51 : : * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must
52 : : * explicitly call i915_vma_pin_fence() to synchronize fencing status
53 : : * for CPU access. Also note that some code wants an unfenced view, for those
54 : : * cases the fence can be removed forcefully with i915_vma_revoke_fence().
55 : : *
56 : : * Internally these functions will synchronize with userspace access by removing
57 : : * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed.
58 : : */
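
A minimal usage sketch (illustrative only, not part of this file): how a caller
with a GGTT-pinned vma and a held runtime-pm wakeref is expected to drive the
API documented above. The helper name example_fenced_access() is hypothetical;
i915_vma_pin_fence() is defined below in this file and i915_vma_unpin_fence()
is assumed to be available from i915_vma.h.

	static int example_fenced_access(struct i915_vma *vma)
	{
		int err;

		/* Acquire (or reuse) a fence register for this tiled vma. */
		err = i915_vma_pin_fence(vma);
		if (err)
			return err;

		/*
		 * ... detiled CPU access through the GTT aperture goes here;
		 * the fence stays bound for as long as it is pinned ...
		 */

		/* Drop the pin so the fence can later be reclaimed or stolen. */
		i915_vma_unpin_fence(vma);

		return 0;
	}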
59 : :
60 : : #define pipelined 0
61 : :
62 : 0 : static struct drm_i915_private *fence_to_i915(struct i915_fence_reg *fence)
63 : : {
64 [ # # # # ]: 0 : return fence->ggtt->vm.i915;
65 : : }
66 : :
67 : 0 : static struct intel_uncore *fence_to_uncore(struct i915_fence_reg *fence)
68 : : {
69 : 0 : return fence->ggtt->vm.gt->uncore;
70 : : }
71 : :
72 : 0 : static void i965_write_fence_reg(struct i915_fence_reg *fence,
73 : : struct i915_vma *vma)
74 : : {
75 : 0 : i915_reg_t fence_reg_lo, fence_reg_hi;
76 : 0 : int fence_pitch_shift;
77 : 0 : u64 val;
78 : :
79 [ # # ]: 0 : if (INTEL_GEN(fence_to_i915(fence)) >= 6) {
80 : 0 : fence_reg_lo = FENCE_REG_GEN6_LO(fence->id);
81 : 0 : fence_reg_hi = FENCE_REG_GEN6_HI(fence->id);
82 : 0 : fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT;
83 : :
84 : : } else {
85 : 0 : fence_reg_lo = FENCE_REG_965_LO(fence->id);
86 : 0 : fence_reg_hi = FENCE_REG_965_HI(fence->id);
87 : 0 : fence_pitch_shift = I965_FENCE_PITCH_SHIFT;
88 : : }
89 : :
90 : 0 : val = 0;
91 [ # # ]: 0 : if (vma) {
92 [ # # ]: 0 : unsigned int stride = i915_gem_object_get_stride(vma->obj);
93 : :
94 : 0 : GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
95 : 0 : GEM_BUG_ON(!IS_ALIGNED(vma->node.start, I965_FENCE_PAGE));
96 : 0 : GEM_BUG_ON(!IS_ALIGNED(vma->fence_size, I965_FENCE_PAGE));
97 : 0 : GEM_BUG_ON(!IS_ALIGNED(stride, 128));
98 : :
99 : 0 : val = (vma->node.start + vma->fence_size - I965_FENCE_PAGE) << 32;
100 : 0 : val |= vma->node.start;
101 : 0 : val |= (u64)((stride / 128) - 1) << fence_pitch_shift;
102 [ # # ]: 0 : if (i915_gem_object_get_tiling(vma->obj) == I915_TILING_Y)
103 : 0 : val |= BIT(I965_FENCE_TILING_Y_SHIFT);
104 : 0 : val |= I965_FENCE_REG_VALID;
105 : : }
106 : :
107 : 0 : if (!pipelined) {
108 : 0 : struct intel_uncore *uncore = fence_to_uncore(fence);
109 : :
110 : : /*
111 : : * To w/a incoherency with non-atomic 64-bit register updates,
112 : : * we split the 64-bit update into two 32-bit writes. In order
113 : : * for a partial fence not to be evaluated between writes, we
114 : : * precede the update with a write to turn off the fence register,
115 : : * and only enable the fence as the last step.
116 : : *
117 : : * For extra levels of paranoia, we make sure each step lands
118 : : * before applying the next step.
119 : : */
120 : 0 : intel_uncore_write_fw(uncore, fence_reg_lo, 0);
121 : 0 : intel_uncore_posting_read_fw(uncore, fence_reg_lo);
122 : :
123 : 0 : intel_uncore_write_fw(uncore, fence_reg_hi, upper_32_bits(val));
124 : 0 : intel_uncore_write_fw(uncore, fence_reg_lo, lower_32_bits(val));
125 : 0 : intel_uncore_posting_read_fw(uncore, fence_reg_lo);
126 : : }
127 : 0 : }
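
A hedged worked example of the value assembled above (numbers are illustrative,
not taken from the driver), following the gen6+ branch:

	/*
	 * For a Y-tiled vma at GGTT offset 0x00100000 with fence_size
	 * 0x00200000 and a 4096-byte stride, the code above builds:
	 *
	 *   upper 32 bits: 0x00100000 + 0x00200000 - 0x1000 = 0x002ff000
	 *                  (start of the last fenced page)
	 *   lower 32 bits: 0x00100000 (start of the first fenced page)
	 *   pitch field:   (4096 / 128) - 1 = 31, shifted by fence_pitch_shift
	 *   plus the Y-tiling bit and I965_FENCE_REG_VALID.
	 *
	 * i.e. the register describes the fenced range by its first and last
	 * page together with the pitch in 128-byte units.
	 */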
128 : :
129 : 0 : static void i915_write_fence_reg(struct i915_fence_reg *fence,
130 : : struct i915_vma *vma)
131 : : {
132 : 0 : u32 val;
133 : :
134 : 0 : val = 0;
135 [ # # ]: 0 : if (vma) {
136 [ # # ]: 0 : unsigned int tiling = i915_gem_object_get_tiling(vma->obj);
137 : 0 : bool is_y_tiled = tiling == I915_TILING_Y;
138 [ # # ]: 0 : unsigned int stride = i915_gem_object_get_stride(vma->obj);
139 : :
140 : 0 : GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
141 : 0 : GEM_BUG_ON(vma->node.start & ~I915_FENCE_START_MASK);
142 : 0 : GEM_BUG_ON(!is_power_of_2(vma->fence_size));
143 : 0 : GEM_BUG_ON(!IS_ALIGNED(vma->node.start, vma->fence_size));
144 : :
145 [ # # # # ]: 0 : if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence_to_i915(fence)))
146 : 0 : stride /= 128;
147 : : else
148 : 0 : stride /= 512;
149 : 0 : GEM_BUG_ON(!is_power_of_2(stride));
150 : :
151 : 0 : val = vma->node.start;
152 [ # # ]: 0 : if (is_y_tiled)
153 : 0 : val |= BIT(I830_FENCE_TILING_Y_SHIFT);
154 [ # # ]: 0 : val |= I915_FENCE_SIZE_BITS(vma->fence_size);
155 [ # # # # ]: 0 : val |= ilog2(stride) << I830_FENCE_PITCH_SHIFT;
156 : :
157 : 0 : val |= I830_FENCE_REG_VALID;
158 : : }
159 : :
160 : 0 : if (!pipelined) {
161 : 0 : struct intel_uncore *uncore = fence_to_uncore(fence);
162 : 0 : i915_reg_t reg = FENCE_REG(fence->id);
163 : :
164 : 0 : intel_uncore_write_fw(uncore, reg, val);
165 : 0 : intel_uncore_posting_read_fw(uncore, reg);
166 : : }
167 : 0 : }
168 : :
169 : 0 : static void i830_write_fence_reg(struct i915_fence_reg *fence,
170 : : struct i915_vma *vma)
171 : : {
172 : 0 : u32 val;
173 : :
174 : 0 : val = 0;
175 [ # # ]: 0 : if (vma) {
176 [ # # ]: 0 : unsigned int stride = i915_gem_object_get_stride(vma->obj);
177 : :
178 : 0 : GEM_BUG_ON(!i915_vma_is_map_and_fenceable(vma));
179 : 0 : GEM_BUG_ON(vma->node.start & ~I830_FENCE_START_MASK);
180 : 0 : GEM_BUG_ON(!is_power_of_2(vma->fence_size));
181 : 0 : GEM_BUG_ON(!is_power_of_2(stride / 128));
182 : 0 : GEM_BUG_ON(!IS_ALIGNED(vma->node.start, vma->fence_size));
183 : :
184 : 0 : val = vma->node.start;
185 [ # # ]: 0 : if (i915_gem_object_get_tiling(vma->obj) == I915_TILING_Y)
186 : 0 : val |= BIT(I830_FENCE_TILING_Y_SHIFT);
187 [ # # ]: 0 : val |= I830_FENCE_SIZE_BITS(vma->fence_size);
188 [ # # # # ]: 0 : val |= ilog2(stride / 128) << I830_FENCE_PITCH_SHIFT;
189 : 0 : val |= I830_FENCE_REG_VALID;
190 : : }
191 : :
192 : 0 : if (!pipelined) {
193 : 0 : struct intel_uncore *uncore = fence_to_uncore(fence);
194 : 0 : i915_reg_t reg = FENCE_REG(fence->id);
195 : :
196 : 0 : intel_uncore_write_fw(uncore, reg, val);
197 : 0 : intel_uncore_posting_read_fw(uncore, reg);
198 : : }
199 : 0 : }
200 : :
201 : 0 : static void fence_write(struct i915_fence_reg *fence,
202 : : struct i915_vma *vma)
203 : : {
204 : 0 : struct drm_i915_private *i915 = fence_to_i915(fence);
205 : :
206 : : /*
207 : : * Previous access through the fence register is marshalled by
208 : : * the mb() inside the fault handlers (i915_gem_release_mmaps)
209 : : * and explicitly managed for internal users.
210 : : */
211 : :
212 [ # # ]: 0 : if (IS_GEN(i915, 2))
213 : 0 : i830_write_fence_reg(fence, vma);
214 [ # # ]: 0 : else if (IS_GEN(i915, 3))
215 : 0 : i915_write_fence_reg(fence, vma);
216 : : else
217 : 0 : i965_write_fence_reg(fence, vma);
218 : :
219 : : /*
220 : : * Access through the fenced region afterwards is
221 : : * ordered by the posting reads whilst writing the registers.
222 : : */
223 : :
224 : 0 : fence->dirty = false;
225 : 0 : }
226 : :
227 : 0 : static int fence_update(struct i915_fence_reg *fence,
228 : : struct i915_vma *vma)
229 : : {
230 : 0 : struct i915_ggtt *ggtt = fence->ggtt;
231 : 0 : struct intel_uncore *uncore = fence_to_uncore(fence);
232 : 0 : intel_wakeref_t wakeref;
233 : 0 : struct i915_vma *old;
234 : 0 : int ret;
235 : :
236 [ # # ]: 0 : if (vma) {
237 [ # # ]: 0 : if (!i915_vma_is_map_and_fenceable(vma))
238 : : return -EINVAL;
239 : :
240 [ # # # # ]: 0 : if (WARN(!i915_gem_object_get_stride(vma->obj) ||
241 : : !i915_gem_object_get_tiling(vma->obj),
242 : : "bogus fence setup with stride: 0x%x, tiling mode: %i\n",
243 : : i915_gem_object_get_stride(vma->obj),
244 : : i915_gem_object_get_tiling(vma->obj)))
245 : : return -EINVAL;
246 : :
247 : 0 : ret = i915_vma_sync(vma);
248 [ # # ]: 0 : if (ret)
249 : : return ret;
250 : : }
251 : :
252 : 0 : old = xchg(&fence->vma, NULL);
253 [ # # ]: 0 : if (old) {
254 : : /* XXX Ideally we would move the waiting to outside the mutex */
255 : 0 : ret = i915_vma_sync(old);
256 [ # # ]: 0 : if (ret) {
257 : 0 : fence->vma = old;
258 : 0 : return ret;
259 : : }
260 : :
261 : 0 : i915_vma_flush_writes(old);
262 : :
263 : : /*
264 : : * Ensure that all userspace CPU access is completed before
265 : : * stealing the fence.
266 : : */
267 [ # # ]: 0 : if (old != vma) {
268 : 0 : GEM_BUG_ON(old->fence != fence);
269 : 0 : i915_vma_revoke_mmap(old);
270 : 0 : old->fence = NULL;
271 : : }
272 : :
273 : 0 : list_move(&fence->link, &ggtt->fence_list);
274 : : }
275 : :
276 : : /*
277 : : * We only need to update the register itself if the device is awake.
278 : : * If the device is currently powered down, we will defer the write
279 : : * to the runtime resume, see i915_gem_restore_fences().
280 : : *
281 : : * This only works for removing the fence register; on acquisition
282 : : * the caller must hold the rpm wakeref. The fence register must
283 : : * be cleared before we can use any other fences to ensure that
284 : : * the new fences do not overlap the elided clears, confusing HW.
285 : : */
286 : 0 : wakeref = intel_runtime_pm_get_if_in_use(uncore->rpm);
287 [ # # ]: 0 : if (!wakeref) {
288 : : GEM_BUG_ON(vma);
289 : : return 0;
290 : : }
291 : :
292 : 0 : WRITE_ONCE(fence->vma, vma);
293 : 0 : fence_write(fence, vma);
294 : :
295 [ # # ]: 0 : if (vma) {
296 : 0 : vma->fence = fence;
297 : 0 : list_move_tail(&fence->link, &ggtt->fence_list);
298 : : }
299 : :
300 : 0 : intel_runtime_pm_put(uncore->rpm, wakeref);
301 : 0 : return 0;
302 : : }
303 : :
304 : : /**
305 : : * i915_vma_revoke_fence - force-remove fence for a VMA
306 : : * @vma: vma to map linearly (not through a fence reg)
307 : : *
308 : : * This function force-removes any fence from the given object, which is useful
309 : : * if the kernel wants to do untiled GTT access.
310 : : *
311 : : * Returns:
312 : : *
313 : : * 0 on success, negative error code on failure.
314 : : */
315 : 0 : int i915_vma_revoke_fence(struct i915_vma *vma)
316 : : {
317 : 0 : struct i915_fence_reg *fence = vma->fence;
318 : :
319 : 0 : lockdep_assert_held(&vma->vm->mutex);
320 [ # # ]: 0 : if (!fence)
321 : : return 0;
322 : :
323 [ # # ]: 0 : if (atomic_read(&fence->pin_count))
324 : : return -EBUSY;
325 : :
326 : 0 : return fence_update(fence, NULL);
327 : : }
328 : :
329 : 0 : static struct i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
330 : : {
331 : 0 : struct i915_fence_reg *fence;
332 : :
333 [ # # ]: 0 : list_for_each_entry(fence, &ggtt->fence_list, link) {
334 : 0 : GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
335 : :
336 [ # # ]: 0 : if (atomic_read(&fence->pin_count))
337 : 0 : continue;
338 : :
339 : : return fence;
340 : : }
341 : :
342 : : /* Wait for completion of pending flips which consume fences */
343 [ # # ]: 0 : if (intel_has_pending_fb_unpin(ggtt->vm.i915))
344 : 0 : return ERR_PTR(-EAGAIN);
345 : :
346 : : return ERR_PTR(-EDEADLK);
347 : : }
348 : :
349 : 0 : int __i915_vma_pin_fence(struct i915_vma *vma)
350 : : {
351 [ # # ]: 0 : struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
352 : 0 : struct i915_fence_reg *fence;
353 [ # # ]: 0 : struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
354 : 0 : int err;
355 : :
356 : 0 : lockdep_assert_held(&vma->vm->mutex);
357 : :
358 : : /* Just update our place in the LRU if our fence is getting reused. */
359 [ # # ]: 0 : if (vma->fence) {
360 : 0 : fence = vma->fence;
361 : 0 : GEM_BUG_ON(fence->vma != vma);
362 : 0 : atomic_inc(&fence->pin_count);
363 [ # # ]: 0 : if (!fence->dirty) {
364 : 0 : list_move_tail(&fence->link, &ggtt->fence_list);
365 : 0 : return 0;
366 : : }
367 [ # # ]: 0 : } else if (set) {
368 : 0 : fence = fence_find(ggtt);
369 [ # # ]: 0 : if (IS_ERR(fence))
370 : 0 : return PTR_ERR(fence);
371 : :
372 : 0 : GEM_BUG_ON(atomic_read(&fence->pin_count));
373 : 0 : atomic_inc(&fence->pin_count);
374 : : } else {
375 : : return 0;
376 : : }
377 : :
378 : 0 : err = fence_update(fence, set);
379 [ # # ]: 0 : if (err)
380 : 0 : goto out_unpin;
381 : :
382 : 0 : GEM_BUG_ON(fence->vma != set);
383 : 0 : GEM_BUG_ON(vma->fence != (set ? fence : NULL));
384 : :
385 [ # # ]: 0 : if (set)
386 : : return 0;
387 : :
388 : 0 : out_unpin:
389 : 0 : atomic_dec(&fence->pin_count);
390 : 0 : return err;
391 : : }
392 : :
393 : : /**
394 : : * i915_vma_pin_fence - set up fencing for a vma
395 : : * @vma: vma to map through a fence reg
396 : : *
397 : : * When mapping objects through the GTT, userspace wants to be able to write
398 : : * to them without having to worry about swizzling if the object is tiled.
399 : : * This function walks the fence regs looking for a free one for @vma,
400 : : * stealing one if it can't find any.
401 : : *
402 : : * It then sets up the reg based on the object's properties: address, pitch
403 : : * and tiling format.
404 : : *
405 : : * For an untiled surface, this removes any existing fence.
406 : : *
407 : : * Returns:
408 : : *
409 : : * 0 on success, negative error code on failure.
410 : : */
411 : 0 : int i915_vma_pin_fence(struct i915_vma *vma)
412 : : {
413 : 0 : int err;
414 : :
415 [ # # # # ]: 0 : if (!vma->fence && !i915_gem_object_is_tiled(vma->obj))
416 : : return 0;
417 : :
418 : : /*
419 : : * Note that we revoke fences on runtime suspend. Therefore the user
420 : : * must keep the device awake whilst using the fence.
421 : : */
422 : 0 : assert_rpm_wakelock_held(vma->vm->gt->uncore->rpm);
423 : 0 : GEM_BUG_ON(!i915_vma_is_pinned(vma));
424 : 0 : GEM_BUG_ON(!i915_vma_is_ggtt(vma));
425 : :
426 : 0 : err = mutex_lock_interruptible(&vma->vm->mutex);
427 [ # # ]: 0 : if (err)
428 : : return err;
429 : :
430 : 0 : err = __i915_vma_pin_fence(vma);
431 : 0 : mutex_unlock(&vma->vm->mutex);
432 : :
433 : 0 : return err;
434 : : }
435 : :
436 : : /**
437 : : * i915_reserve_fence - Reserve a fence for vGPU
438 : : * @ggtt: Global GTT
439 : : *
440 : : * This function walks the fence regs looking for a free one and removes
441 : : * it from the fence_list. It is used to reserve a fence for vGPU to use.
442 : : */
443 : 0 : struct i915_fence_reg *i915_reserve_fence(struct i915_ggtt *ggtt)
444 : : {
445 : 0 : struct i915_fence_reg *fence;
446 : 0 : int count;
447 : 0 : int ret;
448 : :
449 : 0 : lockdep_assert_held(&ggtt->vm.mutex);
450 : :
451 : : /* Keep at least one fence available for the display engine. */
452 : 0 : count = 0;
453 [ # # ]: 0 : list_for_each_entry(fence, &ggtt->fence_list, link)
454 : 0 : count += !atomic_read(&fence->pin_count);
455 [ # # ]: 0 : if (count <= 1)
456 : : return ERR_PTR(-ENOSPC);
457 : :
458 : 0 : fence = fence_find(ggtt);
459 [ # # ]: 0 : if (IS_ERR(fence))
460 : : return fence;
461 : :
462 [ # # ]: 0 : if (fence->vma) {
463 : : /* Force-remove fence from VMA */
464 : 0 : ret = fence_update(fence, NULL);
465 [ # # ]: 0 : if (ret)
466 : 0 : return ERR_PTR(ret);
467 : : }
468 : :
469 : 0 : list_del(&fence->link);
470 : :
471 : 0 : return fence;
472 : : }
473 : :
474 : : /**
475 : : * i915_unreserve_fence - Reclaim a reserved fence
476 : : * @fence: the fence reg
477 : : *
478 : : * This function adds a fence register previously reserved for vGPU back to the fence_list.
479 : : */
480 : 0 : void i915_unreserve_fence(struct i915_fence_reg *fence)
481 : : {
482 : 0 : struct i915_ggtt *ggtt = fence->ggtt;
483 : :
484 : 0 : lockdep_assert_held(&ggtt->vm.mutex);
485 : :
486 : 0 : list_add(&fence->link, &ggtt->fence_list);
487 : 0 : }
488 : :
489 : : /**
490 : : * i915_gem_restore_fences - restore fence state
491 : : * @ggtt: Global GTT
492 : : *
493 : : * Restore the hw fence state to match the software tracking again, to be called
494 : : * after a gpu reset and on resume. Note that on runtime suspend we only cancel
495 : : * the fences, to be reacquired by the user later.
496 : : */
497 : 0 : void i915_gem_restore_fences(struct i915_ggtt *ggtt)
498 : : {
499 : 0 : int i;
500 : :
501 : 0 : rcu_read_lock(); /* keep obj alive as we dereference */
502 [ # # ]: 0 : for (i = 0; i < ggtt->num_fences; i++) {
503 : 0 : struct i915_fence_reg *reg = &ggtt->fence_regs[i];
504 [ # # ]: 0 : struct i915_vma *vma = READ_ONCE(reg->vma);
505 : :
506 : 0 : GEM_BUG_ON(vma && vma->fence != reg);
507 : :
508 : : /*
509 : : * Commit delayed tiling changes if we have an object still
510 : : * attached to the fence, otherwise just clear the fence.
511 : : */
512 [ # # # # ]: 0 : if (vma && !i915_gem_object_is_tiled(vma->obj))
513 : 0 : vma = NULL;
514 : :
515 : 0 : fence_write(reg, vma);
516 : : }
517 : 0 : rcu_read_unlock();
518 : 0 : }
519 : :
520 : : /**
521 : : * DOC: tiling swizzling details
522 : : *
523 : : * The idea behind tiling is to increase cache hit rates by rearranging
524 : : * pixel data so that a group of pixel accesses are in the same cacheline.
525 : : * Performance improvements from doing this on the back/depth buffer are on
526 : : * the order of 30%.
527 : : *
528 : : * Intel architectures make this somewhat more complicated, though, by
529 : : * adjustments made to addressing of data when the memory is in interleaved
530 : : * mode (matched pairs of DIMMS) to improve memory bandwidth.
531 : : * For interleaved memory, the CPU sends every sequential 64 bytes
532 : : * to an alternate memory channel so it can get the bandwidth from both.
533 : : *
534 : : * The GPU also rearranges its accesses for increased bandwidth to interleaved
535 : : * memory, and it matches what the CPU does for non-tiled. However, when tiled
536 : : * it does it a little differently, since one walks addresses not just in the
537 : : * X direction but also Y. So, along with alternating channels when bit
538 : : * 6 of the address flips, it also alternates when other bits flip -- Bits 9
539 : : * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines)
540 : : * are common to both the 915 and 965-class hardware.
541 : : *
542 : : * The CPU also sometimes XORs in higher bits as well, to improve
543 : : * bandwidth doing strided access like we do so frequently in graphics. This
544 : : * is called "Channel XOR Randomization" in the MCH documentation. The result
545 : : * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address
546 : : * decode.
547 : : *
548 : : * All of this bit 6 XORing has an effect on our memory management,
549 : : * as we need to make sure that the 3d driver can correctly address object
550 : : * contents.
551 : : *
552 : : * If we don't have interleaved memory, all tiling is safe and no swizzling is
553 : : * required.
554 : : *
555 : : * When bit 17 is XORed in, we simply refuse to tile at all. Bit
556 : : * 17 is not just a page offset, so as we page an object out and back in,
557 : : * individual pages in it will have different bit 17 addresses, resulting in
558 : : * each 64 bytes being swapped with its neighbor!
559 : : *
560 : : * Otherwise, if interleaved, we have to tell the 3d driver what address
561 : : * swizzling it needs to do, since it's writing with the CPU to the pages
562 : : * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the
563 : : * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling
564 : : * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order
565 : : * to match what the GPU expects.
566 : : */
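
A minimal sketch (illustrative only) of the CPU-side address fixup implied by
the modes above: when the GPU XORs bits 9/10/11 into bit 6, CPU code touching
the tiled pages must apply the same XOR so both agree on the layout. The helper
name example_swizzle_bit_6() is hypothetical; the I915_BIT_6_SWIZZLE_* values
come from the i915 uapi header pulled in by the include at the top of this file.

	static u32 example_swizzle_bit_6(u32 offset, u32 swizzle_mode)
	{
		u32 bit6 = 0;

		switch (swizzle_mode) {
		case I915_BIT_6_SWIZZLE_9:
			bit6 = (offset >> 9) & 1;
			break;
		case I915_BIT_6_SWIZZLE_9_10:
			bit6 = ((offset >> 9) ^ (offset >> 10)) & 1;
			break;
		case I915_BIT_6_SWIZZLE_9_11:
			bit6 = ((offset >> 9) ^ (offset >> 11)) & 1;
			break;
		case I915_BIT_6_SWIZZLE_9_10_11:
			bit6 = ((offset >> 9) ^ (offset >> 10) ^ (offset >> 11)) & 1;
			break;
		default:
			/* NONE/UNKNOWN and the bit17 variants need no, or different, handling. */
			break;
		}

		/* Flip bit 6 of the CPU offset when the XOR of the higher bits is set. */
		return offset ^ (bit6 << 6);
	}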
567 : :
568 : : /**
569 : : * detect_bit_6_swizzle - detect bit 6 swizzling pattern
570 : : * @ggtt: Global GGTT
571 : : *
572 : : * Detects bit 6 swizzling of address lookup between IGD access and CPU
573 : : * access through main memory.
574 : : */
575 : : static void detect_bit_6_swizzle(struct i915_ggtt *ggtt)
576 : : {
577 : : struct intel_uncore *uncore = ggtt->vm.gt->uncore;
578 : : struct drm_i915_private *i915 = ggtt->vm.i915;
579 : : u32 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
580 : : u32 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
581 : :
582 : : if (INTEL_GEN(i915) >= 8 || IS_VALLEYVIEW(i915)) {
583 : : /*
584 : : * On BDW+, swizzling is not used. We leave the CPU memory
585 : : * controller in charge of optimizing memory accesses without
586 : : * the extra address manipulation GPU side.
587 : : *
588 : : * VLV and CHV don't have GPU swizzling.
589 : : */
590 : : swizzle_x = I915_BIT_6_SWIZZLE_NONE;
591 : : swizzle_y = I915_BIT_6_SWIZZLE_NONE;
592 : : } else if (INTEL_GEN(i915) >= 6) {
593 : : if (i915->preserve_bios_swizzle) {
594 : : if (intel_uncore_read(uncore, DISP_ARB_CTL) &
595 : : DISP_TILE_SURFACE_SWIZZLING) {
596 : : swizzle_x = I915_BIT_6_SWIZZLE_9_10;
597 : : swizzle_y = I915_BIT_6_SWIZZLE_9;
598 : : } else {
599 : : swizzle_x = I915_BIT_6_SWIZZLE_NONE;
600 : : swizzle_y = I915_BIT_6_SWIZZLE_NONE;
601 : : }
602 : : } else {
603 : : u32 dimm_c0, dimm_c1;
604 : : dimm_c0 = intel_uncore_read(uncore, MAD_DIMM_C0);
605 : : dimm_c1 = intel_uncore_read(uncore, MAD_DIMM_C1);
606 : : dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
607 : : dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK;
608 : : /*
609 : : * Enable swizzling when the channels are populated
610 : : * with identically sized dimms. We don't need to check
611 : : * the 3rd channel because no cpu with gpu attached
612 : : * ships in that configuration. Also, swizzling only
613 : : * makes sense for 2 channels anyway.
614 : : */
615 : : if (dimm_c0 == dimm_c1) {
616 : : swizzle_x = I915_BIT_6_SWIZZLE_9_10;
617 : : swizzle_y = I915_BIT_6_SWIZZLE_9;
618 : : } else {
619 : : swizzle_x = I915_BIT_6_SWIZZLE_NONE;
620 : : swizzle_y = I915_BIT_6_SWIZZLE_NONE;
621 : : }
622 : : }
623 : : } else if (IS_GEN(i915, 5)) {
624 : : /*
625 : : * On Ironlake, the GPU always uses the same swizzling
626 : : * setup regardless of the DRAM configuration.
627 : : */
628 : : swizzle_x = I915_BIT_6_SWIZZLE_9_10;
629 : : swizzle_y = I915_BIT_6_SWIZZLE_9;
630 : : } else if (IS_GEN(i915, 2)) {
631 : : /*
632 : : * As far as we know, the 865 doesn't have these bit 6
633 : : * swizzling issues.
634 : : */
635 : : swizzle_x = I915_BIT_6_SWIZZLE_NONE;
636 : : swizzle_y = I915_BIT_6_SWIZZLE_NONE;
637 : : } else if (IS_G45(i915) || IS_I965G(i915) || IS_G33(i915)) {
638 : : /*
639 : : * The 965, G33, and newer, have a very flexible memory
640 : : * configuration. It will enable dual-channel mode
641 : : * (interleaving) on as much memory as it can, and the GPU
642 : : * will additionally sometimes enable different bit 6
643 : : * swizzling for tiled objects from the CPU.
644 : : *
645 : : * Here's what I found on the G965:
646 : : * slot fill memory size swizzling
647 : : * 0A 0B 1A 1B 1-ch 2-ch
648 : : * 512 0 0 0 512 0 O
649 : : * 512 0 512 0 16 1008 X
650 : : * 512 0 0 512 16 1008 X
651 : : * 0 512 0 512 16 1008 X
652 : : * 1024 1024 1024 0 2048 1024 O
653 : : *
654 : : * We could probably detect this based on either the DRB
655 : : * matching, which was the case for the swizzling required in
656 : : * the table above, or from the 1-ch value being less than
657 : : * the minimum size of a rank.
658 : : *
659 : : * Reports indicate that the swizzling actually
660 : : * varies depending upon page placement inside the
661 : : * channels, i.e. we see swizzled pages where the
662 : : * banks of memory are paired and unswizzled on the
663 : : * uneven portion, so leave that as unknown.
664 : : */
665 : : if (intel_uncore_read(uncore, C0DRB3) ==
666 : : intel_uncore_read(uncore, C1DRB3)) {
667 : : swizzle_x = I915_BIT_6_SWIZZLE_9_10;
668 : : swizzle_y = I915_BIT_6_SWIZZLE_9;
669 : : }
670 : : } else {
671 : : u32 dcc = intel_uncore_read(uncore, DCC);
672 : :
673 : : /*
674 : : * On 9xx chipsets, channel interleave by the CPU is
675 : : * determined by DCC. For single-channel, neither the CPU
676 : : * nor the GPU do swizzling. For dual channel interleaved,
677 : : * the GPU's interleave is bit 9 and 10 for X tiled, and bit
678 : : * 9 for Y tiled. The CPU's interleave is independent, and
679 : : * can be based on either bit 11 (haven't seen this yet) or
680 : : * bit 17 (common).
681 : : */
682 : : switch (dcc & DCC_ADDRESSING_MODE_MASK) {
683 : : case DCC_ADDRESSING_MODE_SINGLE_CHANNEL:
684 : : case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC:
685 : : swizzle_x = I915_BIT_6_SWIZZLE_NONE;
686 : : swizzle_y = I915_BIT_6_SWIZZLE_NONE;
687 : : break;
688 : : case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED:
689 : : if (dcc & DCC_CHANNEL_XOR_DISABLE) {
690 : : /*
691 : : * This is the base swizzling by the GPU for
692 : : * tiled buffers.
693 : : */
694 : : swizzle_x = I915_BIT_6_SWIZZLE_9_10;
695 : : swizzle_y = I915_BIT_6_SWIZZLE_9;
696 : : } else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) {
697 : : /* Bit 11 swizzling by the CPU in addition. */
698 : : swizzle_x = I915_BIT_6_SWIZZLE_9_10_11;
699 : : swizzle_y = I915_BIT_6_SWIZZLE_9_11;
700 : : } else {
701 : : /* Bit 17 swizzling by the CPU in addition. */
702 : : swizzle_x = I915_BIT_6_SWIZZLE_9_10_17;
703 : : swizzle_y = I915_BIT_6_SWIZZLE_9_17;
704 : : }
705 : : break;
706 : : }
707 : :
708 : : /* check for L-shaped memory aka modified enhanced addressing */
709 : : if (IS_GEN(i915, 4) &&
710 : : !(intel_uncore_read(uncore, DCC2) & DCC2_MODIFIED_ENHANCED_DISABLE)) {
711 : : swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
712 : : swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
713 : : }
714 : :
715 : : if (dcc == 0xffffffff) {
716 : : DRM_ERROR("Couldn't read from MCHBAR. "
717 : : "Disabling tiling.\n");
718 : : swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN;
719 : : swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN;
720 : : }
721 : : }
722 : :
723 : : if (swizzle_x == I915_BIT_6_SWIZZLE_UNKNOWN ||
724 : : swizzle_y == I915_BIT_6_SWIZZLE_UNKNOWN) {
725 : : /*
726 : : * Userspace likes to explode if it sees unknown swizzling,
727 : : * so lie. We will finish the lie when reporting through
728 : : * the get-tiling-ioctl by reporting the physical swizzle
729 : : * mode as unknown instead.
730 : : *
731 : : * As we don't strictly know what the swizzling is, it may be
732 : : * bit17 dependent, and so we need to also prevent the pages
733 : : * from being moved.
734 : : */
735 : : i915->quirks |= QUIRK_PIN_SWIZZLED_PAGES;
736 : : swizzle_x = I915_BIT_6_SWIZZLE_NONE;
737 : : swizzle_y = I915_BIT_6_SWIZZLE_NONE;
738 : : }
739 : :
740 : : i915->ggtt.bit_6_swizzle_x = swizzle_x;
741 : : i915->ggtt.bit_6_swizzle_y = swizzle_y;
742 : : }
743 : :
744 : : /*
745 : : * Swap every 64 bytes of this page around, to account for it having a new
746 : : * bit 17 of its physical address and therefore being interpreted differently
747 : : * by the GPU.
748 : : */
749 : 0 : static void i915_gem_swizzle_page(struct page *page)
750 : : {
751 : 0 : char temp[64];
752 : 0 : char *vaddr;
753 : 0 : int i;
754 : :
755 : 0 : vaddr = kmap(page);
756 : :
757 [ # # ]: 0 : for (i = 0; i < PAGE_SIZE; i += 128) {
758 : 0 : memcpy(temp, &vaddr[i], 64);
759 : 0 : memcpy(&vaddr[i], &vaddr[i + 64], 64);
760 : 0 : memcpy(&vaddr[i + 64], temp, 64);
761 : : }
762 : :
763 : 0 : kunmap(page);
764 : 0 : }
765 : :
766 : : /**
767 : : * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling
768 : : * @obj: i915 GEM buffer object
769 : : * @pages: the scattergather list of physical pages
770 : : *
771 : : * This function fixes up the swizzling in case any page frame number for this
772 : : * object has changed in bit 17 since that state has been saved with
773 : : * i915_gem_object_save_bit_17_swizzle().
774 : : *
775 : : * This is called when pinning backing storage again, since the kernel is free
776 : : * to move unpinned backing storage around (either by directly moving pages or
777 : : * by swapping them out and back in again).
778 : : */
779 : : void
780 : 0 : i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj,
781 : : struct sg_table *pages)
782 : : {
783 : 0 : struct sgt_iter sgt_iter;
784 : 0 : struct page *page;
785 : 0 : int i;
786 : :
787 [ # # ]: 0 : if (obj->bit_17 == NULL)
788 : : return;
789 : :
790 : 0 : i = 0;
791 [ # # # # ]: 0 : for_each_sgt_page(page, sgt_iter, pages) {
792 : 0 : char new_bit_17 = page_to_phys(page) >> 17;
793 [ # # ]: 0 : if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) {
794 : 0 : i915_gem_swizzle_page(page);
795 : 0 : set_page_dirty(page);
796 : : }
797 [ # # ]: 0 : i++;
798 : : }
799 : : }
800 : :
801 : : /**
802 : : * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling
803 : : * @obj: i915 GEM buffer object
804 : : * @pages: the scattergather list of physical pages
805 : : *
806 : : * This function saves the bit 17 of each page frame number so that swizzling
807 : : * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must
808 : : * be called before the backing storage can be unpinned.
809 : : */
810 : : void
811 : 0 : i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj,
812 : : struct sg_table *pages)
813 : : {
814 : 0 : const unsigned int page_count = obj->base.size >> PAGE_SHIFT;
815 : 0 : struct sgt_iter sgt_iter;
816 : 0 : struct page *page;
817 : 0 : int i;
818 : :
819 [ # # ]: 0 : if (obj->bit_17 == NULL) {
820 : 0 : obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL);
821 [ # # ]: 0 : if (obj->bit_17 == NULL) {
822 : 0 : DRM_ERROR("Failed to allocate memory for bit 17 "
823 : : "record\n");
824 : 0 : return;
825 : : }
826 : : }
827 : :
828 : 0 : i = 0;
829 : :
830 [ # # # # ]: 0 : for_each_sgt_page(page, sgt_iter, pages) {
831 [ # # ]: 0 : if (page_to_phys(page) & (1 << 17))
832 : 0 : __set_bit(i, obj->bit_17);
833 : : else
834 : 0 : __clear_bit(i, obj->bit_17);
835 [ # # ]: 0 : i++;
836 : : }
837 : : }
838 : :
839 : 0 : void i915_ggtt_init_fences(struct i915_ggtt *ggtt)
840 : : {
841 : 0 : struct drm_i915_private *i915 = ggtt->vm.i915;
842 : 0 : struct intel_uncore *uncore = ggtt->vm.gt->uncore;
843 : 0 : int num_fences;
844 : 0 : int i;
845 : :
846 : 0 : INIT_LIST_HEAD(&ggtt->fence_list);
847 : 0 : INIT_LIST_HEAD(&ggtt->userfault_list);
848 : 0 : intel_wakeref_auto_init(&ggtt->userfault_wakeref, uncore->rpm);
849 : :
850 : 0 : detect_bit_6_swizzle(ggtt);
851 : :
852 [ # # ]: 0 : if (!i915_ggtt_has_aperture(ggtt))
853 : : num_fences = 0;
854 [ # # # # ]: 0 : else if (INTEL_GEN(i915) >= 7 &&
855 [ # # ]: 0 : !(IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915)))
856 : : num_fences = 32;
857 [ # # # # ]: 0 : else if (INTEL_GEN(i915) >= 4 ||
858 [ # # # # ]: 0 : IS_I945G(i915) || IS_I945GM(i915) ||
859 [ # # ]: 0 : IS_G33(i915) || IS_PINEVIEW(i915))
860 : : num_fences = 16;
861 : : else
862 : : num_fences = 8;
863 : :
864 [ # # ]: 0 : if (intel_vgpu_active(i915))
865 : 0 : num_fences = intel_uncore_read(uncore,
866 : : vgtif_reg(avail_rs.fence_num));
867 : :
868 : : /* Initialize fence registers to zero */
869 [ # # ]: 0 : for (i = 0; i < num_fences; i++) {
870 : 0 : struct i915_fence_reg *fence = &ggtt->fence_regs[i];
871 : :
872 : 0 : fence->ggtt = ggtt;
873 : 0 : fence->id = i;
874 : 0 : list_add_tail(&fence->link, &ggtt->fence_list);
875 : : }
876 : 0 : ggtt->num_fences = num_fences;
877 : :
878 : 0 : i915_gem_restore_fences(ggtt);
879 : 0 : }
880 : :
881 : 0 : void intel_gt_init_swizzling(struct intel_gt *gt)
882 : : {
883 : 0 : struct drm_i915_private *i915 = gt->i915;
884 : 0 : struct intel_uncore *uncore = gt->uncore;
885 : :
886 [ # # ]: 0 : if (INTEL_GEN(i915) < 5 ||
887 [ # # ]: 0 : i915->ggtt.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
888 : 0 : return;
889 : :
890 : 0 : intel_uncore_rmw(uncore, DISP_ARB_CTL, 0, DISP_TILE_SURFACE_SWIZZLING);
891 : :
892 [ # # ]: 0 : if (IS_GEN(i915, 5))
893 : : return;
894 : :
895 : 0 : intel_uncore_rmw(uncore, TILECTL, 0, TILECTL_SWZCTL);
896 : :
897 [ # # ]: 0 : if (IS_GEN(i915, 6))
898 : 0 : intel_uncore_write(uncore,
899 : : ARB_MODE,
900 : 0 : _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB));
901 [ # # ]: 0 : else if (IS_GEN(i915, 7))
902 : 0 : intel_uncore_write(uncore,
903 : : ARB_MODE,
904 : 0 : _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB));
905 [ # # ]: 0 : else if (IS_GEN(i915, 8))
906 : 0 : intel_uncore_write(uncore,
907 : : GAMTARBMODE,
908 : 0 : _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW));
909 : : else
910 : 0 : MISSING_CASE(INTEL_GEN(i915));
911 : : }